# 1 - Data Preprocessing

## 1.1. Download Dataset

In [78]:
# Code to download file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

id = '1kv1w7tpK_ax4Yid2bUIr2GP1mhHK2PM0'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('price-split-adjusted.csv')

import pandas as pd
data = pd.read_csv("price-split-adjusted.csv")

print(data.head())

         date symbol        open       close         low        high  \
0  2016-01-05   WLTW  123.430000  125.839996  122.309998  126.250000   
1  2016-01-06   WLTW  125.239998  119.980003  119.940002  125.540001   
2  2016-01-07   WLTW  116.379997  114.949997  114.930000  119.739998   
3  2016-01-08   WLTW  115.480003  116.620003  113.500000  117.440002   
4  2016-01-11   WLTW  117.010002  114.970001  114.089996  117.330002   

      volume  
0  2163600.0  
1  2386400.0  
2  2489500.0  
3  2006300.0  
4  1408600.0  


## 1.2. Preprocess data

In [79]:
import numpy as np
import time
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold

stocks = ["AAPL", "AMZN", "FB", "NFLX", "GOOGL"]

def preprocess_data(all_data):
  pd.set_option('display.max_columns', None)

  sequence_length = 7
  stocks_data = {}

  all_data_grouped_by_symbol = all_data.groupby('symbol')

  for stock in stocks:
    stocks_data[stock] = dict()
    data         = all_data_grouped_by_symbol.get_group(stock).drop(columns=['symbol', 'date'])
    # data['date'] = data['date'].apply(lambda x: time.mktime(datetime.datetime.strptime(x, "%Y-%m-%d").timetuple()))

    normalizer = MinMaxScaler()
    data['close']   = normalizer.fit_transform(data.close.values.reshape(-1,1))
    data['high']    = normalizer.fit_transform(data.high.values.reshape(-1,1))
    data['low']     = normalizer.fit_transform(data.low.values.reshape(-1,1))
    data['open']    = normalizer.fit_transform(data.open.values.reshape(-1,1))
    data['volume']  = normalizer.fit_transform(data.open.values.reshape(-1,1))
    # data['date']    = normalizer.fit_transform(data.open.values.reshape(-1,1))

    data = data.values.tolist()
    
    train_data = []
    label_data = []

    for i in range(0, len(data) - (sequence_length + 1)):
      sequence = data[i:(i+sequence_length)]
      target   = data[i+sequence_length+1]
      train_data.append(sequence)
      label_data.append(target[2])

    train_data = np.array(train_data)
    label_data = np.array(label_data)

    # 80% data for training
    stock_train_data   = train_data[0 : round(len(train_data) * 0.8)]
    stock_train_labels = label_data[0 : round(len(train_data) * 0.8)]
    # 20% remaining data for test
    stock_test_data    = train_data[round(len(train_data) * 0.8) + 1 : -1]
    stock_test_labels  = label_data[round(len(train_data) * 0.8) + 1 : -1]

    #splitting training data in 10 folds (training and validation sets)
    kfolds = KFold(n_splits=10)
    kfolds.get_n_splits(stock_train_data)

    stocks_data[stock]['train_data']    = stock_train_data
    stocks_data[stock]['train_labels']  = stock_train_labels
    stocks_data[stock]['test_data']     = stock_test_data
    stocks_data[stock]['test_labels']   = stock_test_labels
    stocks_data[stock]['split_indexes'] = kfolds

    # print(stocks_data[stock]['train_data'].shape)
    # print(stocks_data[stock]['train_labels'].shape)
    # print(stocks_data[stock]['test_data'].shape)
    # print(stocks_data[stock]['test_labels'].shape)
    # print(stocks_data[stock]['split_indexes'])
  return stocks_data



# 2 - Model Implementation

In [80]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import accuracy_score

# --------------------------------------------------------------------------------------------------
# Sentiment Analysis model
class StockPredictor(nn.Module):
    def __init__(self, inputs, n_hidden, n_output_nodes):
        super(StockPredictor, self).__init__()
        self.lstm = nn.LSTM(inputs, n_hidden, bidirectional=True, num_layers=2, batch_first=True)
        self.linear1 = nn.Linear(n_hidden*2, 4)
        self.linear2 = nn.Linear(4, n_output_nodes)

    def forward(self, x):
        x, (hidden_state, cell_state) = self.lstm(x)
        hidden_out = torch.cat((hidden_state[0,:,:],hidden_state[1,:,:]),1)
        x = self.linear1(hidden_out)
        x = self.linear2(x)
        return x
# --------------------------------------------------------------------------------------------------


In [81]:
import timeit

def train_models(inputs, n_hidden, n_output_nodes, learning_rate, epochs, stocks_data):
  stock_models = {}

  for stock in stocks:
    start = timeit.default_timer()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model  = StockPredictor(inputs, n_hidden, n_output_nodes).to(device)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    for epoch in range(0, epochs):
      training_loss_overall   = 0
      validation_loss_overall = 0

      training_accuracy_overall   = 0
      validation_accuracy_overall = 0  
      
      for train_indexes, validation_indexes in stocks_data[stock]["split_indexes"].split(stocks_data[stock]["train_data"]):
        model.train()

        train_input        = stocks_data[stock]["train_data"][train_indexes] 
        train_target       = stocks_data[stock]["train_labels"][train_indexes]
        train_input_torch  = torch.from_numpy(np.array(train_input)).float().to(device)
        train_target_torch = torch.from_numpy(np.array(train_target)).float().to(device)

        # forward + backward + optimize
        outputs = model(train_input_torch)
        loss    = criterion(outputs.view(-1), train_target_torch)
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        model.eval()

        train_outputs = model(train_input_torch)
        train_loss    = criterion(train_outputs.view(-1), train_target_torch)

        validation_input        = stocks_data[stock]["train_data"][validation_indexes]
        validation_target       = stocks_data[stock]["train_labels"][validation_indexes]
        validation_input_torch  = torch.from_numpy(np.array(validation_input)).float().to(device)
        validation_target_torch = torch.from_numpy(np.array(validation_target)).float().to(device)
        
        validation_outputs = model(validation_input_torch)
        validation_loss    = criterion(validation_outputs.view(-1), validation_target_torch)

        train_loss       = train_loss.item()
        train_acc        = np.sum(np.isclose(train_outputs.view(-1).detach().cpu().numpy(), train_target_torch.cpu().numpy(), atol=0.02) == [True]*len(train_target_torch.cpu().numpy()))/len(train_target_torch.cpu().numpy())
        validation_loss  = validation_loss.item() 
        validation_acc   = np.sum(np.isclose(validation_outputs.view(-1).detach().cpu().numpy(), validation_target_torch.cpu().numpy(), atol=0.02) == [True]*len(validation_target_torch.cpu().numpy()))/len(validation_target_torch.cpu().numpy())

        training_loss_overall       += train_loss
        validation_loss_overall     += validation_loss
        training_accuracy_overall   += train_acc
        validation_accuracy_overall += validation_acc

      if (((epoch % round(epochs * 0.2)) == (round(epochs*0.2)-1)) or epoch == 0):
        print('Stock: %s, Epoch: %d, TrainLoss: %.6f, TrainAcc: %.6f, Val.Loss: %.6f, ValAcc: %.6f' %(stock, epoch + 1, training_loss_overall/10, training_accuracy_overall/10, validation_loss_overall/10, validation_accuracy_overall/10))
      stop = timeit.default_timer()
    stock_models[stock] = model      
    print('------------------------ Time: %f  --------------' %(stop-start))
  return stock_models

In [82]:
def evaluate_models(stock_models, stocks_data):
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  criterion = nn.MSELoss()

  for stock in stock_models:
    start = timeit.default_timer()
    test_input  = stocks_data[stock]["test_data"]
    test_target = stocks_data[stock]["test_labels"]
    test_input_torch  = torch.from_numpy(np.array(test_input)).float().to(device)
    test_target_torch = torch.from_numpy(np.array(test_target)).float().to(device)
    
    model = stock_models[stock]
    model.eval()    
    test_outputs  = model(test_input_torch)
    test_loss     = criterion(test_outputs.view(-1), test_target_torch)

    test_loss  = test_loss.item()
    test_acc   = np.sum(np.isclose(test_outputs.view(-1).detach().cpu().numpy(), test_target_torch.cpu().numpy(), atol=0.02) == [True]*len(test_target_torch.cpu().numpy()))/len(test_target_torch.cpu().numpy())

    end = timeit.default_timer()

    print('Stock: %s, TestLoss: %.6f, TestAcc: %.6f, Time: %6f' %(stock, test_loss, test_acc, (end-start)))

## Best Model


In [83]:
processed_data = preprocess_data(all_data)

start = timeit.default_timer()
my_models      = train_models(inputs=5, n_hidden=10, n_output_nodes=1, learning_rate=0.01, epochs=200, stocks_data=processed_data)
evaluate_models(my_models, processed_data)
stop = timeit.default_timer()

print('Overall Running Time: ', stop - start)  


Stock: AAPL, Epoch: 1, TrainLoss: 0.150896, TrainAcc: 0.044735, Val.Loss: 0.050860, ValAcc: 0.007857
Stock: AAPL, Epoch: 40, TrainLoss: 0.000575, TrainAcc: 0.638072, Val.Loss: 0.000580, ValAcc: 0.638921
Stock: AAPL, Epoch: 80, TrainLoss: 0.000338, TrainAcc: 0.777853, Val.Loss: 0.000342, ValAcc: 0.775902
Stock: AAPL, Epoch: 120, TrainLoss: 0.000298, TrainAcc: 0.813492, Val.Loss: 0.000306, ValAcc: 0.815137
Stock: AAPL, Epoch: 160, TrainLoss: 0.000288, TrainAcc: 0.821966, Val.Loss: 0.000297, ValAcc: 0.822249
Stock: AAPL, Epoch: 200, TrainLoss: 0.000284, TrainAcc: 0.826004, Val.Loss: 0.000292, ValAcc: 0.829367
------------------------ Time: 25.295379  --------------
Stock: AMZN, Epoch: 1, TrainLoss: 0.009507, TrainAcc: 0.154187, Val.Loss: 0.012937, ValAcc: 0.090000
Stock: AMZN, Epoch: 40, TrainLoss: 0.000114, TrainAcc: 0.952957, Val.Loss: 0.000123, ValAcc: 0.949296
Stock: AMZN, Epoch: 80, TrainLoss: 0.000107, TrainAcc: 0.958976, Val.Loss: 0.000115, ValAcc: 0.954296
Stock: AMZN, Epoch: 120,

Hyper Parameter Testing


In [49]:
processed_data = preprocess_data(all_data)
n_hidden       = 5
n_output_nodes = 1
learning_rate  = 0.01
epochs         = 100

my_models      = train_models(5, n_hidden, n_output_nodes, learning_rate, epochs, processed_data)
evaluate_models(my_models, processed_data)

Stock: AAPL, Epoch: 1, TrainLoss: 0.057437, TrainAcc: 0.063437, Val.Loss: 0.068635, ValAcc: 0.066429
Stock: AAPL, Epoch: 20, TrainLoss: 0.000415, TrainAcc: 0.731127, Val.Loss: 0.000419, ValAcc: 0.737381
Stock: AAPL, Epoch: 40, TrainLoss: 0.000340, TrainAcc: 0.779041, Val.Loss: 0.000342, ValAcc: 0.771586
Stock: AAPL, Epoch: 60, TrainLoss: 0.000310, TrainAcc: 0.800898, Val.Loss: 0.000314, ValAcc: 0.800831
Stock: AAPL, Epoch: 80, TrainLoss: 0.000295, TrainAcc: 0.819035, Val.Loss: 0.000299, ValAcc: 0.819402
Stock: AAPL, Epoch: 100, TrainLoss: 0.000295, TrainAcc: 0.817135, Val.Loss: 0.000302, ValAcc: 0.817259
------------------------
Stock: AMZN, Epoch: 1, TrainLoss: 0.033808, TrainAcc: 0.063908, Val.Loss: 0.003271, ValAcc: 0.207634
Stock: AMZN, Epoch: 20, TrainLoss: 0.000183, TrainAcc: 0.903459, Val.Loss: 0.000187, ValAcc: 0.894407
Stock: AMZN, Epoch: 40, TrainLoss: 0.000150, TrainAcc: 0.923734, Val.Loss: 0.000151, ValAcc: 0.921494
Stock: AMZN, Epoch: 60, TrainLoss: 0.000127, TrainAcc: 0.9

In [50]:
processed_data = preprocess_data(all_data)
n_hidden       = 10
n_output_nodes = 1
learning_rate  = 0.01
epochs         = 200

my_models      = train_models(5, n_hidden, n_output_nodes, learning_rate, epochs, processed_data)
evaluate_models(my_models, processed_data)

Stock: AAPL, Epoch: 1, TrainLoss: 0.272317, TrainAcc: 0.011956, Val.Loss: 0.138030, ValAcc: 0.025000
Stock: AAPL, Epoch: 40, TrainLoss: 0.000560, TrainAcc: 0.650345, Val.Loss: 0.000564, ValAcc: 0.653156
Stock: AAPL, Epoch: 80, TrainLoss: 0.000417, TrainAcc: 0.725029, Val.Loss: 0.000419, ValAcc: 0.725198
Stock: AAPL, Epoch: 120, TrainLoss: 0.000328, TrainAcc: 0.788309, Val.Loss: 0.000335, ValAcc: 0.777270
Stock: AAPL, Epoch: 160, TrainLoss: 0.000299, TrainAcc: 0.811669, Val.Loss: 0.000302, ValAcc: 0.810826
Stock: AAPL, Epoch: 200, TrainLoss: 0.000295, TrainAcc: 0.817134, Val.Loss: 0.000303, ValAcc: 0.815111
------------------------
Stock: AMZN, Epoch: 1, TrainLoss: 0.065837, TrainAcc: 0.060570, Val.Loss: 0.026562, ValAcc: 0.061429
Stock: AMZN, Epoch: 40, TrainLoss: 0.000187, TrainAcc: 0.900212, Val.Loss: 0.000189, ValAcc: 0.897234
Stock: AMZN, Epoch: 80, TrainLoss: 0.000119, TrainAcc: 0.950502, Val.Loss: 0.000122, ValAcc: 0.947888
Stock: AMZN, Epoch: 120, TrainLoss: 0.000111, TrainAcc: 

In [51]:
processed_data = preprocess_data(all_data)
n_hidden       = 10
n_output_nodes = 1
learning_rate  = 0.01
epochs         = 300

my_models      = train_models(5, n_hidden, n_output_nodes, learning_rate, epochs, processed_data)
evaluate_models(my_models, processed_data)

Stock: AAPL, Epoch: 1, TrainLoss: 0.016795, TrainAcc: 0.143885, Val.Loss: 0.029048, ValAcc: 0.141429
Stock: AAPL, Epoch: 60, TrainLoss: 0.000292, TrainAcc: 0.818244, Val.Loss: 0.000303, ValAcc: 0.822254
Stock: AAPL, Epoch: 120, TrainLoss: 0.000277, TrainAcc: 0.832182, Val.Loss: 0.000284, ValAcc: 0.825805
Stock: AAPL, Epoch: 180, TrainLoss: 0.000272, TrainAcc: 0.834954, Val.Loss: 0.000279, ValAcc: 0.832229
Stock: AAPL, Epoch: 240, TrainLoss: 0.000269, TrainAcc: 0.836221, Val.Loss: 0.000276, ValAcc: 0.835800
Stock: AAPL, Epoch: 300, TrainLoss: 0.000268, TrainAcc: 0.836537, Val.Loss: 0.000273, ValAcc: 0.840086
------------------------
Stock: AMZN, Epoch: 1, TrainLoss: 0.010805, TrainAcc: 0.153797, Val.Loss: 0.013498, ValAcc: 0.062857
Stock: AMZN, Epoch: 60, TrainLoss: 0.000117, TrainAcc: 0.951374, Val.Loss: 0.000119, ValAcc: 0.947882
Stock: AMZN, Epoch: 120, TrainLoss: 0.000105, TrainAcc: 0.958342, Val.Loss: 0.000107, ValAcc: 0.957867
Stock: AMZN, Epoch: 180, TrainLoss: 0.000101, TrainAcc

In [52]:
processed_data = preprocess_data(all_data)
n_hidden       = 15
n_output_nodes = 1
learning_rate  = 0.01
epochs         = 400

my_models      = train_models(5, n_hidden, n_output_nodes, learning_rate, epochs, processed_data)
evaluate_models(my_models, processed_data)

Stock: AAPL, Epoch: 1, TrainLoss: 0.056454, TrainAcc: 0.061139, Val.Loss: 0.025785, ValAcc: 0.029124
Stock: AAPL, Epoch: 80, TrainLoss: 0.000316, TrainAcc: 0.801929, Val.Loss: 0.000324, ValAcc: 0.802285
Stock: AAPL, Epoch: 160, TrainLoss: 0.000288, TrainAcc: 0.820461, Val.Loss: 0.000296, ValAcc: 0.822234
Stock: AAPL, Epoch: 240, TrainLoss: 0.000281, TrainAcc: 0.825767, Val.Loss: 0.000289, ValAcc: 0.830076
Stock: AAPL, Epoch: 320, TrainLoss: 0.000278, TrainAcc: 0.828221, Val.Loss: 0.000286, ValAcc: 0.831494
Stock: AAPL, Epoch: 400, TrainLoss: 0.000275, TrainAcc: 0.831390, Val.Loss: 0.000282, ValAcc: 0.835785
------------------------
Stock: AMZN, Epoch: 1, TrainLoss: 0.011771, TrainAcc: 0.133600, Val.Loss: 0.015055, ValAcc: 0.034286
Stock: AMZN, Epoch: 80, TrainLoss: 0.000107, TrainAcc: 0.957788, Val.Loss: 0.000113, ValAcc: 0.954296
Stock: AMZN, Epoch: 160, TrainLoss: 0.000100, TrainAcc: 0.964519, Val.Loss: 0.000106, ValAcc: 0.961439
Stock: AMZN, Epoch: 240, TrainLoss: 0.000097, TrainAcc

In [53]:
processed_data = preprocess_data(all_data)
n_hidden       = 20
n_output_nodes = 1
learning_rate  = 0.01
epochs         = 1000

my_models      = train_models(5, n_hidden, n_output_nodes, learning_rate, epochs, processed_data)
evaluate_models(my_models, processed_data)

Stock: AAPL, Epoch: 1, TrainLoss: 0.013490, TrainAcc: 0.195665, Val.Loss: 0.026480, ValAcc: 0.169286
Stock: AAPL, Epoch: 200, TrainLoss: 0.000275, TrainAcc: 0.828142, Val.Loss: 0.000282, ValAcc: 0.831515
Stock: AAPL, Epoch: 400, TrainLoss: 0.000268, TrainAcc: 0.835904, Val.Loss: 0.000276, ValAcc: 0.835805
Stock: AAPL, Epoch: 600, TrainLoss: 0.000266, TrainAcc: 0.837092, Val.Loss: 0.000276, ValAcc: 0.836525
Stock: AAPL, Epoch: 800, TrainLoss: 0.000252, TrainAcc: 0.845329, Val.Loss: 0.000264, ValAcc: 0.842229
Stock: AAPL, Epoch: 1000, TrainLoss: 0.000237, TrainAcc: 0.854673, Val.Loss: 0.000243, ValAcc: 0.853647
------------------------
Stock: AMZN, Epoch: 1, TrainLoss: 0.025336, TrainAcc: 0.097008, Val.Loss: 0.013060, ValAcc: 0.125947
Stock: AMZN, Epoch: 200, TrainLoss: 0.000098, TrainAcc: 0.964599, Val.Loss: 0.000105, ValAcc: 0.963582
Stock: AMZN, Epoch: 400, TrainLoss: 0.000097, TrainAcc: 0.965628, Val.Loss: 0.000105, ValAcc: 0.962872
Stock: AMZN, Epoch: 600, TrainLoss: 0.000091, Train

In [54]:
processed_data = preprocess_data(all_data)
n_hidden       = 5
n_output_nodes = 1
learning_rate  = 0.01
epochs         = 1000

my_models      = train_models(5, n_hidden, n_output_nodes, learning_rate, epochs, processed_data)
evaluate_models(my_models, processed_data)

Stock: AAPL, Epoch: 1, TrainLoss: 0.269170, TrainAcc: 0.031116, Val.Loss: 0.122506, ValAcc: 0.000000
Stock: AAPL, Epoch: 200, TrainLoss: 0.000293, TrainAcc: 0.822441, Val.Loss: 0.000299, ValAcc: 0.822254
Stock: AAPL, Epoch: 400, TrainLoss: 0.000285, TrainAcc: 0.823311, Val.Loss: 0.000295, ValAcc: 0.827229
Stock: AAPL, Epoch: 600, TrainLoss: 0.000281, TrainAcc: 0.825291, Val.Loss: 0.000291, ValAcc: 0.828668
Stock: AAPL, Epoch: 800, TrainLoss: 0.000276, TrainAcc: 0.829093, Val.Loss: 0.000285, ValAcc: 0.830805
Stock: AAPL, Epoch: 1000, TrainLoss: 0.000271, TrainAcc: 0.832736, Val.Loss: 0.000280, ValAcc: 0.833647
------------------------
Stock: AMZN, Epoch: 1, TrainLoss: 0.010662, TrainAcc: 0.142074, Val.Loss: 0.015338, ValAcc: 0.082143
Stock: AMZN, Epoch: 200, TrainLoss: 0.000101, TrainAcc: 0.963331, Val.Loss: 0.000108, ValAcc: 0.959296
Stock: AMZN, Epoch: 400, TrainLoss: 0.000096, TrainAcc: 0.966500, Val.Loss: 0.000100, ValAcc: 0.965010
Stock: AMZN, Epoch: 600, TrainLoss: 0.000091, Train