In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import roll_time_series, make_forecasting_frame
from tsfresh.utilities.dataframe_functions import impute


from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Input CSV paths, resolved relative to the notebook's working directory.
train_file = 'Train.csv'
test_file = 'Test.csv'

In [3]:
# Load both CSVs: `df` from train_file, `tdf` from test_file.
df = pd.read_csv(train_file)
tdf = pd.read_csv(test_file)

In [4]:
# Stack the two frames into one, tagging the origin of every row in a
# `train` column (1 = came from train_file, 0 = came from test_file).
df = pd.concat(
    [df.assign(train=1), tdf.assign(train=0)],
    ignore_index=True,
)
# The separate test frame is no longer needed once it is part of `df`.
del tdf

In [5]:
# Restrict the analysis to a single series: the rows whose stock id is 0.
df = df.loc[df['stock'].eq(0)]


In [6]:
# Preview the first rows of the combined frame after the stock == 0 filter.
df.head()

Unnamed: 0,ID,stock,Date,Open,High,Low,Close,holiday,unpredictability_score,train
0,id_0,0,2017-01-03,82.9961,82.7396,82.9144,82.8101,1,7,1
1,id_1,0,2017-01-04,83.1312,83.1669,83.3779,82.969,0,7,1
2,id_2,0,2017-01-05,82.6622,82.7634,82.8984,82.8578,0,7,1
3,id_3,0,2017-01-06,83.0279,82.795,82.8425,82.7385,0,7,1
4,id_4,0,2017-01-09,82.3761,82.0828,82.1473,81.8641,0,7,1


In [13]:
# Cut the series into rolling windows, one per (stock, last date); the window
# identifier is written to a new `id` column.
# max_timeshift and min_timeshift bound the window length and are
# hyperparameters of this approach.
df_rolled = roll_time_series(
    df,
    column_id='stock',
    column_sort='Date',
    min_timeshift=5,
    max_timeshift=31,
)

Rolling: 100%|██████████| 20/20 [00:01<00:00, 14.74it/s]


In [21]:
# Every rolled window that contains the 2017-02-15 row (one output row per window).
df_rolled.loc[df_rolled['Date'].eq('2017-02-15')]

Unnamed: 0,ID,stock,Date,Open,High,Low,Close,holiday,unpredictability_score,train,id
480,id_30,0,2017-02-15,83.322,84.4489,83.3619,84.4636,0,7,1,"(0, 2017-02-15)"
511,id_30,0,2017-02-15,83.322,84.4489,83.3619,84.4636,0,7,1,"(0, 2017-02-16)"
542,id_30,0,2017-02-15,83.322,84.4489,83.3619,84.4636,0,7,1,"(0, 2017-02-17)"
573,id_30,0,2017-02-15,83.322,84.4489,83.3619,84.4636,0,7,1,"(0, 2017-02-21)"
604,id_30,0,2017-02-15,83.322,84.4489,83.3619,84.4636,0,7,1,"(0, 2017-02-22)"
635,id_30,0,2017-02-15,83.322,84.4489,83.3619,84.4636,0,7,1,"(0, 2017-02-23)"
666,id_30,0,2017-02-15,83.322,84.4489,83.3619,84.4636,0,7,1,"(0, 2017-02-24)"
697,id_30,0,2017-02-15,83.322,84.4489,83.3619,84.4636,0,7,1,"(0, 2017-02-27)"
1944,id_30,0,2017-02-15,83.322,84.4489,83.3619,84.4636,0,7,1,"(0, 2017-02-28)"
1975,id_30,0,2017-02-15,83.322,84.4489,83.3619,84.4636,0,7,1,"(0, 2017-03-01)"


In [15]:
# First rolled rows that originate from the test file (train flag 0).
df_rolled.loc[df_rolled['train'].eq(0)].head()

Unnamed: 0,ID,stock,Date,Open,High,Low,Close,holiday,unpredictability_score,train,id
22336,id_713,0,2019-11-01,,,,,0,7,0,"(0, 2019-11-01)"
22367,id_713,0,2019-11-01,,,,,0,7,0,"(0, 2019-11-04)"
22368,id_714,0,2019-11-04,,,,,0,7,0,"(0, 2019-11-04)"
22398,id_713,0,2019-11-01,,,,,0,7,0,"(0, 2019-11-05)"
22399,id_714,0,2019-11-04,,,,,0,7,0,"(0, 2019-11-05)"


In [None]:
# Number of source rows next to the number of distinct rolled windows.
df.shape[0], df_rolled['id'].nunique()

In [None]:
# Smallest and largest number of rows found in any rolled window.
# The aggregations are named by string: passing the NumPy callables
# (np.min / np.max) to .agg is deprecated in recent pandas releases.
df_rolled.groupby("id").size().agg(["min", "max"])


In [None]:
# Compute tsfresh features, one row per rolled window.
# roll_time_series stored each window's identifier in the `id` column, so that
# column must be the grouping key. Grouping by `stock` would merge every
# window into a single series and would not produce the (stock, last date)
# row labels that the later `x[1]` index mapping relies on.
# The price columns, the row ID, the train flag and the original `stock` id
# are dropped so they are not treated as time-series values; features are
# therefore built only from the columns that remain.
X = extract_features(
    df_rolled.drop(columns=["Open", "High", "Low", "Close", "ID", "train", "stock"]),
    column_id="id",
    column_sort="Date",
    show_warnings=False,
    n_jobs=7,
    impute_function=impute,
)

In [None]:
# Show the feature matrix returned by extract_features.
X

In [None]:
# Dimensions of the feature matrix as (rows, columns).
X.shape

In [None]:
# Names of the generated feature columns.
X.columns

In [None]:
# Row labels of the feature matrix; the following cell takes element [1] of each label.
X.index

In [None]:

# Row labels are expected to be rolled-window ids of the form
# (stock, last date), e.g. (0, '2017-02-15'); keep only the date part.
# The guard makes the cell safe to re-run: a second pass over an already
# flattened index would take character [1] of each date string and silently
# corrupt the index.
if X.index.name != "last_date":
    X = X.set_index(X.index.map(lambda window_id: window_id[1]), drop=True)
    X.index.name = "last_date"
X

In [None]:
# Target series: the Open column labelled by Date, in ascending Date order.
y = df.set_index("Date")["Open"].sort_index()

In [None]:
# Show the target series.
y

In [None]:
# Keep only the dates that have both a known target and a feature row.
# Rows that came from the test file carry no Open value; left in, those NaNs
# end up in the evaluation slice and make the error metric fail.
y = y.dropna()
y = y[y.index.isin(X.index)]
# Reorder X to y's (date-sorted) order so the positional train/test split
# that follows keeps every feature row paired with its own target.
X = X.loc[y.index]

In [None]:
# Shapes of features and target after the filtering above; the row counts are
# expected to agree when every date label is unique.
X.shape, y.shape

In [None]:
# Positional hold-out split: rows before position 570 are used for fitting,
# the remaining rows for evaluation. X and y are cut at the same position.
split_at = 570
X_train, y_train = X.iloc[:split_at], y.iloc[:split_at]
X_test, y_test = X.iloc[split_at:], y.iloc[split_at:]
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Training portion of the feature matrix (rows before position 570).
X_train

In [None]:
# Held-out portion of the feature matrix (rows from position 570 onward).
X_test

In [None]:
# Sanity check: print the name of every training feature that still contains
# missing values (nothing is printed when there are none).
null_counts = X_train.isnull().sum()
for col in null_counts[null_counts != 0].index:
    print(col)

In [None]:
# Let tsfresh's select_features choose which feature columns to keep.
# Only the training split is passed in, so the held-out rows do not influence
# the choice.
X_train_selected = select_features(X_train, y_train)

In [None]:
# Training features that remain after selection.
X_train_selected

In [None]:
# Fit a scikit-learn LinearRegression on the selected training features.
# NOTE(review): the variable is called `ada`, but the estimator is
# LinearRegression, not an AdaBoost model.
ada = LinearRegression().fit(X_train_selected, y_train)
ada

In [None]:
# Give the held-out rows the same columns, in the same order, as the frame the
# model was fitted on.
X_test_selected = X_test.loc[:, X_train_selected.columns]

# Wrap the raw prediction array in a Series labelled like the held-out rows.
y_pred = pd.Series(data=ada.predict(X_test_selected), index=X_test_selected.index)

In [None]:
# Plot the actual Open series and the predictions on one shared date axis.
# The date labels of both series are plain strings here (the CSVs are read
# without date parsing), and pandas draws string-indexed series by position,
# so the prediction line would start at the left edge of the chart instead of
# over its own dates. Converting both indexes to datetimes places every point
# at its calendar date.
fig, ax = plt.subplots(figsize=(15, 6))

actual = pd.Series(y.to_numpy(), index=pd.to_datetime(y.index))
predicted = pd.Series(y_pred.to_numpy(), index=pd.to_datetime(y_pred.index))

actual.plot(ax=ax, label="actual Open")
predicted.plot(ax=ax, marker=".", label="predicted Open")
ax.set(title="Open price: actual vs. predicted", xlabel="Date", ylabel="Open")
ax.legend();

In [None]:
# Held-out features restricted to the columns selected on the training split.
X_test_selected

In [None]:
# Predicted values, labelled like the rows of X_test_selected.
y_pred

In [None]:
# Targets of the held-out rows, for comparison with y_pred.
y_test

In [None]:
# Root-mean-squared error of the predictions on the held-out rows.
# The root is taken explicitly: the `squared=False` flag of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form works on old and new releases alike.
# The two sequences are compared position by position.
rmse = float(np.sqrt(mean_squared_error(y_test.tolist(), y_pred.tolist())))
print(rmse)