# Predicting Stock Returns [PyTorch - LSTM]

## 0. One-Off Data Processing [Temp]

In [1]:
from datetime import datetime as dt
import pandas as pd
import numpy as np
import os
from functools import partial
from tqdm import tqdm

path = path = r'C:\YZC\NUS\Semester 1\DSA5105_Principles of Machine Learning\Principles of ML_Project'
os.chdir(path)
tqdm = partial(tqdm,position=0,leave=True)
print('done')

done


In [2]:
sp500 = pd.read_csv('processed_df_v1.csv')
sp500 = sp500.drop(['Year_x','Year_y','grouper1'],axis=1)
sp500['Date'] = pd.to_datetime(sp500['Date'])
sp500['Target'] = sp500.groupby('Stock')['Target'].shift(-1)
sp500['Target_Return'] = sp500.groupby('Stock')['Return'].shift(-1)
sp500['Target_Close'] = sp500.groupby('Stock')['Close'].shift(-1)

In [3]:
inf_cols = ['Stochastic_5','Stochastic_15','RS_5','RS_15']
null_mean_cols = ['SMA_Volume_ratio','Stochastic_5','Stochastic_15','Stochastic_%D_5','Stochastic_%D_15','Stochastic_Ratio',
'+DM_5','-DM_5','+DM_15','-DM_15','RS_5','RS_15','RSI_5','RSI_ratio']
null_adjfill_cols = ['Return','Target','Target_Return','Target_Close']

def impute(df,inf_cols,null_mean_cols,null_adjfill_cols,groupby_col):
    for c in inf_cols:
        df[c] = df[c].replace([-np.inf,np.inf],np.nan)
    for c in null_mean_cols:
        result = df.groupby(groupby_col)[c].apply(lambda x:x.fillna(x.mean()))
        df[c] = result.droplevel(0)
    for c in null_adjfill_cols:
        df[c] = df.groupby(groupby_col)[c].ffill().bfill()
    return df

def NA_test(df,cols):
    for c in cols:
        if df[c].isnull().values.any():
            print('null',c)
        if np.isinf(df[c].values).any():
            print('inf',c)
    print('test done')

sp500_1 = impute(sp500,inf_cols,null_mean_cols,null_adjfill_cols,'Stock')
NA_test(sp500_1,[c for c in sp500_1.columns if c not in ['Date','Stock','Year']])
print('done')

test done
done


In [4]:
sp500_1.to_csv('processed_df_v2.csv',index=False)

## 1. Data Pre-Processing
#### (Start from here)

In [2]:
from datetime import datetime as dt
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import time
import os
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import warnings
from functools import partial

path = path = r'C:\YZC\NUS\Semester 1\DSA5105_Principles of Machine Learning\Principles of ML_Project'
os.chdir(path)
# tqdm = partial(tqdm,position=0,leave=True)
# tqdm._instances.clear()
# while len(tqdm._instances) > 0:
#     tqdm._instances.pop().close()
print('done')

done


In [3]:
# Essentials
sp500 = pd.read_csv('processed_df_v2.csv')
sp500_var = sp500.copy().drop(['Date','Return','Stock','Target','Target_Return','Target_Close'],axis=1)
sp500.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Stock,Return,...,United States_CA,United States_CP_end,United States_CP_avg,United States_Gov_NetDebt,United States_GDP,United States_UR,Fed_Rate,Target,Target_Return,Target_Close
0,2000-01-03,1.1534,1.206028,1.023149,1.201642,7226398,0.0,0.0,ATVI,-0.030293,...,-3.921,3.427,3.367,-0.537,4.077,3.967,5.45,0.0,-0.030293,1.165241
1,2000-01-04,1.175329,1.187609,1.04157,1.165241,4262390,0.0,0.0,ATVI,-0.030293,...,-3.921,3.427,3.367,-0.537,4.077,3.967,5.45,1.0,0.003765,1.169628
2,2000-01-05,1.153401,1.196818,1.151208,1.169628,3389998,0.0,0.0,ATVI,0.003765,...,-3.921,3.427,3.367,-0.537,4.077,3.967,5.45,0.0,-0.019498,1.146823
3,2000-01-06,1.162173,1.169628,1.137612,1.146823,2429998,0.0,0.0,ATVI,-0.019498,...,-3.921,3.427,3.367,-0.537,4.077,3.967,5.45,1.0,0.027915,1.178837
4,2000-01-07,1.162172,1.187609,1.133228,1.178837,15549590,0.0,0.0,ATVI,0.027915,...,-3.921,3.427,3.367,-0.537,4.077,3.967,5.45,1.0,0.050223,1.238042


In [4]:
# Sampling
sampled_stocks = ['AAPL','CCI', 'USB', 'ADI', 'PNW', 'QCOM']
sp500_sampled = sp500.loc[sp500['Stock'].isin(sampled_stocks)].reset_index(drop=True)

In [5]:
class DataFrameSelector(BaseEstimator,TransformerMixin):
    def __init__(self,attribute_names):
        self.attribute_names = attribute_names
    def fit(self,X):
        return self
    def transform(self,X):
        return X[self.attribute_names].to_numpy()

class ArrayTransformer(BaseEstimator,TransformerMixin): 
    # Restructure into: samples x timesteps x features
    def __init__(self):
        return None
    def fit(self,X):
        return self
    def transform(self,X):
        X = X.toarray()
        X1_cols = list(range(30))
        X1_cols.extend(sampled_stocks)    
        X1_df = pd.DataFrame(X,columns=X1_cols)
        X1 = []
        for s in sampled_stocks:
            X1.append(X1_df.loc[X1_df[s]==1].to_numpy())
        X1 = np.asarray(X1)
        return X1

In [7]:
# Pipeline for X
cat_vars = ['Stock']
num_vars = list(set(sp500_var.columns) - set(cat_vars))

num_pipeline = Pipeline([
    ('selector',DataFrameSelector(num_vars)),
    ('scaler',StandardScaler()),
    ('pca',PCA(n_components=30)),
])
cat_pipeline = Pipeline([
    ('selector',DataFrameSelector(cat_vars)),
    ('ohe',OneHotEncoder(categories=[sampled_stocks])), #Note: categories should be a list of list(s)
])
concat_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline',num_pipeline),
    ('cat_pipeline',cat_pipeline),
])
final_pipeline = Pipeline([
    ('combined_pipeline',concat_pipeline),
    ('to_array',ArrayTransformer()),
])
X1 = final_pipeline.fit_transform(sp500_sampled)

# Transform Y:
y1 = []
for s in sampled_stocks:
    y1.append(sp500_sampled.loc[sp500_sampled['Stock']==s,'Target'].to_numpy())
y1 = np.asarray(y1)
y1 = y1.astype(int)

print('done')

done


In [32]:
# # Verifying order of labels: ['AAPL','CCI', 'USB', 'ADI', 'PNW', 'QCOM']
# idx = {}
# for s in sampled_stocks:
#     idx[s] = sp500_sampled.index[sp500_sampled['Stock']==s].tolist()[0]
# print(idx)
# for i in idx:
#     print(i,X1[idx[i],30:]) # Correct

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
print(y1.shape,X1.shape)

cuda
(6, 5995) (6, 5995, 36)


In [9]:
class LSTM_Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(input_size=36, hidden_size=50,num_layers=1,batch_first=True)
        self.linear = nn.Linear(50,1)
        self.sigmoid = nn.Sigmoid()
    def forward(self,x):
        x, _ = self.lstm(x)
        x = self.linear(x)
        x = self.sigmoid(x)
        return x

In [10]:
def time_series_split(X,split_pct):
    time_series = X.shape[1]
    train_size = int(time_series*split_pct)
    test_size = time_series - train_size
    X_train, X_test = X[:,:train_size],X[:,train_size:time_series]
    return X_train, X_test

def convert_tensor(X,unsqueeze=False):
    X = torch.from_numpy(X)
    X = X.type(torch.FloatTensor)
    if unsqueeze==True:
        X = torch.unsqueeze(X,dim=-1)
    return X

split_pct = 0.8
X_train, X_test = time_series_split(X1,split_pct)
y_train, y_test = time_series_split(y1,split_pct)
X_train, X_test = convert_tensor(X_train), convert_tensor(X_test)
y_train, y_test = convert_tensor(y_train,unsqueeze=True), convert_tensor(y_test,unsqueeze=True)
print(X_train.shape,y_train.shape,X_test.shape,y_test.shape)

torch.Size([6, 4796, 36]) torch.Size([6, 4796, 1]) torch.Size([6, 1199, 36]) torch.Size([6, 1199, 1])


In [11]:
model = LSTM_Model()
optimizer = optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss()
loader = data.DataLoader(data.TensorDataset(X_train,y_train))

In [12]:
n_epochs = 100
results = pd.DataFrame(columns=['epoch','accuracy','precision','recall'])
for epoch in tqdm(range(n_epochs)):
    model.train()
    for X_batch, y_batch in loader:
        y_pred = model(X_batch)
        loss = loss_fn(y_pred,y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        model.eval()
    with torch.no_grad():
        y_pred = model(X_test)
        y_pred = torch.squeeze(y_pred,dim=-1)
        y_test1 = torch.squeeze(y_test,dim=-1)
        y_pred, y_test1 = y_pred.numpy(), y_test1.numpy()
        y_pred = (y_pred>0.5).astype(float).flatten()
        y_test1 = y_test1.flatten()
        # print(y_pred,y_test1)
        accuracy = metrics.accuracy_score(y_test1,y_pred)
        precision = metrics.precision_score(y_test1,y_pred)
        recall = metrics.recall_score(y_test1,y_pred)
        tqdm.write(f"Epoch{epoch}: accuracy={accuracy}, precision={precision},recall={recall}") 
        results.loc[len(results)] = [epoch,accuracy,precision,recall]
results.to_csv('pytorch_lstm_results_v0.csv',index=False)    

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch0: accuracy=0.5137614678899083, precision=0.520895031260283,recall=0.843816631130064
Epoch1: accuracy=0.5157075340561579, precision=0.5217673814165043,recall=0.8560767590618337
Epoch2: accuracy=0.5151515151515151, precision=0.5216962524654832,recall=0.8459488272921108
Epoch3: accuracy=0.517931609674729, precision=0.5239541160593792,recall=0.8278251599147122
Epoch4: accuracy=0.513900472616069, precision=0.5223566543924251,recall=0.7939765458422174
Epoch5: accuracy=0.5127884348067834, precision=0.522264287001983,recall=0.7721215351812367
Epoch6: accuracy=0.5116763969974979, precision=0.5219871205151794,recall=0.7561300639658849
Epoch7: accuracy=0.509869335557409, precision=0.5210742260350616,recall=0.7446695095948828
Epoch8: accuracy=0.5108423686405338, precision=0.5219108519842016,recall=0.7396055437100213
Epoch9: accuracy=0.5095913261050876, precision=0.5211960635881908,recall=0.7340085287846482
Epoch10: accuracy=0.5093133166527662, precision=0.5211509146341463,recall=0.7289445628

In [16]:
# Cross validation
# Hyperparam Tuning