## Return prediction by LSTM
This notebook is part of the implementation I carried out for my university coursework (DSS: Data Science School). Because the real data is confidential, this file is read-only.

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 130)
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
import matplotlib.pyplot as plt
plt.style.use("default")
%matplotlib inline

In [None]:
# Load the three CSV inputs used by the rest of the notebook.
df_test = pd.read_csv('path/to/filtered_vol_merged_df_test.csv')  # test set, not normalized yet, 124 columns
df_train_all = pd.read_csv('path/to/vol_merged_df_cleaned.csv')  # training set, not normalized yet, 124 columns
df_train_VIF_norm = pd.read_csv('path/to/VIF_norm_vol_merged_df_cleaned.csv')  # training set, normalized then VIF-filtered, 57 columns

In [None]:
# Display the normalized, VIF-filtered training frame.
df_train_VIF_norm

In [None]:
# List the columns of the VIF-filtered frame; they are reused below to select the explanatory variables.
df_train_VIF_norm.columns

In [None]:
# Display the raw (not yet normalized) training frame.
df_train_all

In [None]:
# Display the raw (not yet normalized) test frame.
df_test

In [None]:
# Build a single frame holding the training rows followed by the test rows.

# The columns of the VIF-filtered frame define which columns are kept.
variables = df_train_VIF_norm.columns
non_feature_columns = ['flag_up', 'flag_down', 'Dates']


def _to_feature_frame(raw_df):
    """Keep the selected columns of raw_df, drop the flag/date columns and index the rows by date."""
    features = raw_df.filter(variables).drop(non_feature_columns, axis=1)
    features.index = pd.to_datetime(raw_df['Dates'])
    return features


df_train_filtered = _to_feature_frame(df_train_all)
df_test_filtered = _to_feature_frame(df_test)

# Stack along the row axis: train first, then test.
df_all_data = pd.concat([df_train_filtered, df_test_filtered], axis=0)

In [None]:
# Display the concatenated train + test frame.
df_all_data

In [None]:
# Min-max scale every column to the (0, 1) range of the TRAINING rows.
#
# The scalers are fitted on the training rows only and then applied to all
# rows, so that no statistic of the test period leaks into preprocessing.
# Test values outside the training range therefore map outside [0, 1].

# df_all_data is the train frame followed by the test frame, so the first
# len(df_train_filtered) rows are exactly the training rows.
n_train_rows = len(df_train_filtered)

target_frame = df_all_data[['ESIndex']]
variable_frame = df_all_data.drop(['ESIndex'], axis=1)

# scale objective variable (kept in its own scaler so predictions can be inverted later)
scaler1 = MinMaxScaler(feature_range=(0, 1))
scaler1.fit(target_frame.iloc[:n_train_rows])
scaled_target = scaler1.transform(target_frame)

# scale explanatory variables
scaler2 = MinMaxScaler(feature_range=(0, 1))
scaler2.fit(variable_frame.iloc[:n_train_rows])
scaled_variables = scaler2.transform(variable_frame)

# concat scaled data: column 0 is the objective variable, the rest are the explanatory variables
scaled_data = np.concatenate([scaled_target, scaled_variables], axis=1)
print(scaled_data)
print(scaled_data.shape)

In [None]:
# Number of rows that came from the training frame.
training_data_len = df_train_filtered.shape[0]
print(training_data_len)

In [None]:
# Cut the training rows into (input window, label) samples.

# Number of past rows fed to the model, and number of future rows it predicts.
window_size = 200
forecast_horizon = 20

train_data = scaled_data[:int(training_data_len), :]

# For a split position `end`:
#   input = rows [end - window_size, end) of every column
#   label = rows [end, end + forecast_horizon) of column 0 (the objective variable)
split_positions = range(window_size, len(train_data) - forecast_horizon + 1)
if len(split_positions) == 0:
    raise ValueError(
        f'need at least {window_size + forecast_horizon} training rows, got {len(train_data)}'
    )

x_train = np.array([train_data[end - window_size:end, :] for end in split_positions])
y_train = np.array([train_data[end:end + forecast_horizon, 0] for end in split_positions])

print(f'x_train.shape: {x_train.shape}')
print(f'y_train.shape: {y_train.shape}')

In [None]:
# Stacked LSTM regressor: four LSTM layers, each followed by dropout, and a
# dense output layer with one unit per predicted step.
#
# Every LSTM that feeds another LSTM must return the full sequence
# (return_sequences=True); only the last one collapses it to a single vector.
model = Sequential()
model.add(LSTM(units=50, return_sequences=True,
               input_shape=(x_train.shape[1], x_train.shape[2])))  # input shape = (window_size, number of variables)
model.add(Dropout(0.2))
model.add(LSTM(units=50, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=50, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=50))
model.add(Dropout(0.2))
model.add(Dense(units=y_train.shape[1]))  # one output per label step

model.compile(optimizer='adam', loss='mean_squared_error')

# Cast to float32 before fitting.
x_train = np.asarray(x_train).astype('float32')
y_train = np.asarray(y_train).astype('float32')
history = model.fit(x_train, y_train, batch_size=32, epochs=20)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# preprocess test data
#
# One input window is built per test date, made of the window_size rows that
# strictly precede that date. This mirrors the training samples, where the
# input is rows [i - window_size, i) and the first predicted row is i, so the
# first predicted step of each window lines up with y_test at the same
# position. The first test date therefore needs the last window_size rows of
# the training period.
first_needed_row = training_data_len - window_size
if first_needed_row < 0:
    # A negative start would silently slice from the end of the array.
    raise ValueError(
        f'window_size ({window_size}) exceeds the number of training rows ({training_data_len})'
    )
test_data = scaled_data[first_needed_row:, :]

print(f'len(test_data): {len(test_data)}')

y_test = scaled_data[training_data_len:, 0]  # test GT (objective variable, still scaled)
x_test = [test_data[end - window_size:end, :] for end in range(window_size, len(test_data))]

print(f'len(x_test): {len(x_test)}')

# change to numpy array: (number of test dates, window_size, number of variables)
x_test = np.array(x_test)

print(f'x_test.shape: {x_test.shape}')

In [None]:
# Run the model on the test windows (cast to float32 first).
x_test = np.array(x_test, dtype='float32')
scaled_predictions = model.predict(x_test)

# Map predictions and ground truth back to the original scale with the
# objective-variable scaler. y_test is passed as a single row, so it stays 2-D.
predictions = scaler1.inverse_transform(scaled_predictions)
y_test = scaler1.inverse_transform(y_test[np.newaxis, :])

In [None]:
# Plot the first step of every multi-step prediction against the actual values.

first_step_pred = predictions[:, 0]

# Use the last len(first_step_pred) dates of the full frame as the x-axis
# instead of a hard-coded row count, so the cell works for any test length.
n_test_rows = len(first_step_pred)
test_index = df_all_data.index[len(df_all_data) - n_test_rows:]

df_plot = pd.DataFrame(
    {
        'Return_val': y_test[0],         # return validation (y_test holds a single row)
        'Return_pred': first_step_pred,  # return prediction
    },
    index=test_index,
)
df_plot.plot(figsize=(20, 6))