# Load and Inspect Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
from datetime import datetime
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TensorBoard
import os
import tensorflow as tf

In [2]:
training_path = os.path.join("data", "train.csv")
test_path = os.path.join("data", "example_test.csv")

training_data = pd.read_csv(training_path)
test_data = pd.read_csv(test_path)

In [3]:
training_data.head()

Unnamed: 0,date,weight,resp_1,resp_2,resp_3,resp_4,resp,feature_0,feature_1,feature_2,...,feature_121,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,ts_id
0,0,0.0,0.009916,0.014079,0.008773,0.00139,0.00627,1,-1.872746,-2.191242,...,,1.168391,8.313583,1.782433,14.018213,2.653056,12.600292,2.301488,11.445807,0
1,0,16.673515,-0.002828,-0.003226,-0.007319,-0.011114,-0.009792,-1,-1.349537,-1.704709,...,,-1.17885,1.777472,-0.915458,2.831612,-1.41701,2.297459,-1.304614,1.898684,1
2,0,0.0,0.025134,0.027607,0.033406,0.03438,0.02397,-1,0.81278,-0.256156,...,,6.115747,9.667908,5.542871,11.671595,7.281757,10.060014,6.638248,9.427299,2
3,0,0.0,-0.00473,-0.003273,-0.000461,-0.000476,-0.0032,-1,1.174378,0.34464,...,,2.838853,0.499251,3.033732,1.513488,4.397532,1.266037,3.856384,1.013469,3
4,0,0.138531,0.001252,0.002165,-0.001215,-0.006219,-0.002604,1,-3.172026,-3.093182,...,,0.34485,4.101145,0.614252,6.623456,0.800129,5.233243,0.362636,3.926633,4


In [4]:
test_data.tail()

Unnamed: 0,date,weight,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_121,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,ts_id
15214,2,0.0,1,-1.21324,-1.557117,0.530793,0.382429,0.316811,0.240976,0.741902,...,0.200094,1.655182,2.551488,0.525934,1.242721,1.977483,2.563083,1.857149,2.424928,15214
15215,2,0.0,1,-0.413328,-0.642504,0.429951,0.333967,-0.728263,-0.637617,0.204294,...,2.07096,6.393191,22.159397,-0.101824,3.804838,1.78015,7.504901,4.702145,15.37613,15215
15216,2,0.0,1,-1.378947,-1.702976,0.548763,0.396754,0.328203,0.249898,0.784458,...,0.200728,1.740141,2.685696,0.527251,1.245219,1.981606,2.567519,1.876328,2.450874,15216
15217,2,0.0,1,-0.324708,-1.089962,-0.8739,-0.544143,-1.265208,-0.844335,2.302628,...,0.300244,5.243907,11.789678,0.310616,2.660067,3.052869,6.39939,5.396259,10.972647,15217
15218,2,0.0,1,-1.652183,-1.857245,0.451629,0.352413,-0.716532,-0.626735,0.268371,...,2.073743,6.423787,22.254007,-0.09853,3.816995,1.79347,7.534632,4.730957,15.457871,15218


In [5]:
training_data.columns

Index(['date', 'weight', 'resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp',
       'feature_0', 'feature_1', 'feature_2',
       ...
       'feature_121', 'feature_122', 'feature_123', 'feature_124',
       'feature_125', 'feature_126', 'feature_127', 'feature_128',
       'feature_129', 'ts_id'],
      dtype='object', length=138)

Recurrent Neural Networks

In [6]:
training_data.shape

(2390491, 138)

In [7]:
# count nan values for a feature
feature_X = np.array(training_data['feature_2'])
sum(np.isnan(feature_X))

0

# Format and Preprocess Data

In [8]:
training_subset = training_data.iloc[:, np.r_[0, -1, 8:10, 6]]

In [9]:
training_subset.head()

Unnamed: 0,date,ts_id,feature_1,feature_2,resp
0,0,0,-1.872746,-2.191242,0.00627
1,0,1,-1.349537,-1.704709,-0.009792
2,0,2,0.81278,-0.256156,0.02397
3,0,3,1.174378,0.34464,-0.0032
4,0,4,-3.172026,-3.093182,-0.002604


In [10]:
# train / val / test split
train = training_subset.iloc[:1_800_000, :].values
val = training_subset.iloc[1_800_000:, :].values
test = training_subset.iloc[2_000_000:, :].values

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [12]:
train_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
train_imputer.fit_transform(train)

array([[ 0.00000000e+00,  0.00000000e+00, -1.87274634e+00,
        -2.19124240e+00,  6.27036224e-03],
       [ 0.00000000e+00,  1.00000000e+00, -1.34953705e+00,
        -1.70470899e+00, -9.79168235e-03],
       [ 0.00000000e+00,  2.00000000e+00,  8.12780428e-01,
        -2.56155843e-01,  2.39701263e-02],
       ...,
       [ 3.83000000e+02,  1.79999700e+06,  1.01015137e+00,
         1.48995455e+00,  1.95213579e-02],
       [ 3.83000000e+02,  1.79999800e+06, -4.45742371e-01,
        -4.81690252e-03, -8.31328922e-04],
       [ 3.83000000e+02,  1.79999900e+06, -1.33349299e+00,
        -1.94146229e+00,  3.20697425e-02]])

In [13]:
val_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
val_imputer.fit_transform(val)

array([[ 3.83000000e+02,  1.80000000e+06, -1.40838645e+00,
        -4.62556637e-01,  1.10915059e-03],
       [ 3.83000000e+02,  1.80000100e+06, -5.79351885e-01,
         3.74608206e-01, -7.44823587e-03],
       [ 3.83000000e+02,  1.80000200e+06,  1.03305402e-01,
         2.55344523e-01, -1.74704001e-02],
       ...,
       [ 4.99000000e+02,  2.39048800e+06, -6.22475214e-01,
        -9.63682487e-01,  1.65907701e-02],
       [ 4.99000000e+02,  2.39048900e+06, -1.46375663e+00,
        -1.10722789e+00, -2.00369054e-03],
       [ 4.99000000e+02,  2.39049000e+06, -1.81718419e+00,
        -1.13157699e+00, -1.90462146e-03]])

In [14]:
test_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
test_imputer.fit_transform(test)

array([[ 4.27000000e+02,  2.00000000e+06, -1.38956149e+00,
        -1.08551625e+00, -2.64024510e-03],
       [ 4.27000000e+02,  2.00000100e+06, -2.05935251e+00,
        -1.72132752e+00,  2.07447851e-03],
       [ 4.27000000e+02,  2.00000200e+06,  1.62477203e+00,
        -8.73924370e-02,  9.44587982e-02],
       ...,
       [ 4.99000000e+02,  2.39048800e+06, -6.22475214e-01,
        -9.63682487e-01,  1.65907701e-02],
       [ 4.99000000e+02,  2.39048900e+06, -1.46375663e+00,
        -1.10722789e+00, -2.00369054e-03],
       [ 4.99000000e+02,  2.39049000e+06, -1.81718419e+00,
        -1.13157699e+00, -1.90462146e-03]])

In [15]:
train_scaler = StandardScaler()
train_scaler.fit_transform(train)

array([[-1.56596337, -1.73204985, -0.888991  , -1.02947972,  0.21103872],
       [-1.56596337, -1.73204792, -0.68511968, -0.83224238, -0.37541714],
       [-1.56596337, -1.732046  ,  0.15743899, -0.24500877,  0.85729085],
       ...,
       [ 1.69890765,  1.732046  ,  0.23434564,  0.46285253,  0.69485783],
       [ 1.69890765,  1.73204792, -0.33295121, -0.14311767, -0.04825756],
       [ 1.69890765,  1.73204985, -0.67886802, -0.92822056,  1.15302328]])

In [16]:
val_scaler = StandardScaler()
val_scaler.fit_transform(val)

array([[-1.81147648, -1.73204787, -0.67939952, -0.33840878,  0.03728328],
       [-1.81147648, -1.73204201, -0.35257276, -0.00476153, -0.29821577],
       [-1.81147648, -1.73203614, -0.08345169, -0.05229339, -0.69114259],
       ...,
       [ 1.66402982,  1.73203614, -0.36957308, -0.53812961,  0.64425233],
       [ 1.66402982,  1.73204201, -0.70122787, -0.59533881, -0.0847581 ],
       [ 1.66402982,  1.73204787, -0.84055811, -0.605043  , -0.08087402]])

In [17]:
test_scaler = StandardScaler()
test_scaler.fit_transform(test)

array([[-1.74365824, -1.73204637, -0.66457239, -0.58639798, -0.114324  ],
       [-1.74365824, -1.7320375 , -0.92578288, -0.83685231,  0.06843576],
       [-1.74365824, -1.73202863,  0.51098165, -0.19322403,  3.64958663],
       ...,
       [ 1.70946908,  1.73202863, -0.36541791, -0.53840608,  0.6311398 ],
       [ 1.70946908,  1.7320375 , -0.69350761, -0.59495048, -0.08964884],
       [ 1.70946908,  1.73204637, -0.83134013, -0.60454191, -0.08580857]])

In [28]:
def format_data(raw_data, past_window):
    X = []
    y = []

    for i in range(past_window, len(raw_data) - 1):
        X.append(raw_data[i - past_window:i, 0:raw_data.shape[1] - 1])
        y.append([raw_data[i + 1, -1]])
  
    return np.array(X), np.array(y)

In [29]:
X_train, y_train = format_data(train, 60)

In [30]:
X_val, y_val = format_data(val, 60)

In [31]:
X_test, y_test = format_data(test,  60)

In [49]:
X_train.shape

(1799939, 60, 4)

# Build and Train Model

In [33]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.optimizers import Adam

In [34]:
model = Sequential()

model.add(LSTM(units=64, return_sequences=True, input_shape=(60, X_train.shape[2])))
model.add(LSTM(units=10, return_sequences=False))
model.add(Dropout(0.25))
model.add(Dense(units=1))

In [35]:
model.compile(optimizer=Adam(learning_rate=0.01), loss='mean_squared_error')

In [36]:
history = model.fit(X_train, y_train, shuffle=True, epochs=2, validation_data=(X_val, y_val), batch_size=512)

Epoch 1/2
Epoch 2/2


In [35]:
y_test

array([ 0.03383681, -0.00331175,  0.00127515, ...,  0.01659077,
       -0.00200369, -0.00190462])

In [37]:
model.evaluate(X_test, y_test)



0.0006657449412159622

In [45]:
pred = model.predict(X_test)

KeyboardInterrupt: 

In [44]:
pred

array([[[-0.00027925],
        [-0.00036546],
        [-0.00038058],
        [-0.00038547],
        [-0.00038717],
        [-0.00038776],
        [-0.00038797],
        [-0.00038804],
        [-0.00038807],
        [-0.00038807],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0.00038808],
        [-0

In [32]:
y_test[0:1]

array([0.03383681])

# Useful Resources

[1] https://towardsdatascience.com/a-quick-start-of-time-series-forecasting-with-a-practical-example-using-fb-prophet-31c4447a2274  
[2] https://towardsdatascience.com/a-quick-deep-learning-recipe-time-series-forecasting-with-keras-in-python-f759923ba64  
[3] https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/  
[4] https://pythonprogramming.net/cryptocurrency-recurrent-neural-network-deep-learning-python-tensorflow-keras/  


# Unsorted Notes

In [25]:
# draw sample for easier training
training_sample = training_data.sample(10000, axis=0)
subset = training_data.loc[:, ['date', 'feature_1', 'feature_2', 'resp', 'weight']]
subset_tensor = tf.convert_to_tensor(subset)
subset_tensor.shape
subset_np = subset.values
subset

### Detailed Problem Formulation

We are dealing with a time-series problem slightly different from usual time-series forecasting problems. As opposed to the usual case, we have only one value to predict from a multivariate feature set. 

| f0  | f1  | f2  | f3  | l  |
|-----|-----|-----|-----|----|
| f10 | f11 | f12 | f13 | l1 |
| f20 | f21 | f22 | f23 | l2 |
| f30 | f31 | f32 | f33 | l3 |
| f40 | f41 | f42 | f43 | y  |

This might be an example window of hypothetical input data where y is the value to be predicted. Obviously, the data cannot be streamed into the network like this. We need to find another way of formatting the input data.

Possible models that could be applied are RNNs or LSTMs.