In [86]:
import os
import time

import pandas as pd
import FinanceDataReader as fdr
import datetime
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import numpy as np
import argparse
from copy import deepcopy 
from sklearn.metrics import mean_absolute_error

import seaborn as sns 
import matplotlib.pyplot as plt

In [87]:
def load_data_set(filename):
    dir_path = './data_set'
    
    with open(join(dir_path, filename), 'r') as f:
        results = json.load(f)
        
    return results 

In [88]:
result = load_data_set('my_data_set.json')

In [89]:
def get_data(sym):
    file_path = "./stockData/"
    csv_path = os.path.join(file_path, f"{sym}.csv")
    df = pd.read_csv(csv_path, parse_dates=True, index_col = ['Date'])
    df.drop('Symbol', axis=1, inplace=True)
    return df

In [90]:
def Min_Max_Scaler(dataframe):
    d_min = np.min(dataframe, 0)
    d_max = np.max(dataframe, 0)

    numerator = dataframe - d_min
    denominator = d_max - d_min

    df = numerator / (denominator + 1e-7)


    return df

In [91]:
class StockDataset(Dataset):
    
    def __init__(self, data, x_frames, y_frames, start, end):
        self.x_frames = x_frames
        self.y_frames = y_frames

        self.start = start
        self.end = end

        self.data = data.loc[self.start : self.end]


    def __len__(self):
        return len(self.data) - (self.x_frames + self.y_frames) + 1
    
    def __getitem__(self, idx):
        idx += self.x_frames
        data = self.data.iloc[idx-self.x_frames:idx+self.y_frames]
        data = data[['Close','Open', 'High', 'Low', 'Volume', 'Change']]
        data = data.values
        X = data[:self.x_frames]
        y = data[self.x_frames:]
        
        return X, y

In [92]:
class LSTM(nn.Module):
    
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, batch_size, dropout, use_bn):
        super(LSTM, self).__init__()
        self.input_dim = input_dim 
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers

        self.batch_size = batch_size
        self.dropout = dropout
        self.use_bn = use_bn 
        
        self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers)
        self.hidden = self.init_hidden()
        self.regressor = self.make_regressor()
        
    def init_hidden(self):
        return (torch.zeros(self.num_layers, self.batch_size, self.hidden_dim),
                torch.zeros(self.num_layers, self.batch_size, self.hidden_dim))
    
    def make_regressor(self):
        layers = []
        if self.use_bn:
            layers.append(nn.BatchNorm1d(self.hidden_dim))
        layers.append(nn.Dropout(self.dropout))
        
        layers.append(nn.Linear(self.hidden_dim, self.hidden_dim // 2))
        layers.append(nn.ReLU())
        layers.append(nn.Linear(self.hidden_dim // 2, self.output_dim))
        regressor = nn.Sequential(*layers)
        return regressor
    
    def forward(self, x):
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        y_pred = self.regressor(lstm_out[-1].view(self.batch_size, -1))
        return y_pred

In [109]:
def metric(y_pred, y_true):
    perc_y_pred = (y_pred.detach().numpy())
    perc_y_true = (y_true.detach().numpy())
    mae = mean_absolute_error(perc_y_true, perc_y_pred, multioutput='raw_values')
    return mae*100

In [94]:
def train(model, partition, optimizer, loss_fn, args):

    model.train()
    model.zero_grad()
    optimizer.zero_grad()

    train_acc = 0.0
    train_loss = 0.0
    
    length = 0
    
    for i in range(len(partition['train'])):
        length += len(partition['train'][i])
    
    for _ in range(70):
        for i in range(len(partition['train'])):
            X, y = next(iter(partition['train'][i]))

            X = X.to(args.device)
            y = y.to(args.device)

            X = X.transpose(0, 1).float()
            y_true = y[:, :, 0].float()

            model.zero_grad()
            optimizer.zero_grad()
            model.hidden = [hidden.to(args.device) for hidden in model.init_hidden()]


            y_pred = model(X)
            loss = loss_fn(y_pred.view(-1), y_true.view(-1))
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
            train_acc += metric(y_pred, y_true)[0]

    train_loss = train_loss / length
    train_acc = train_acc / length
    return model, train_loss, train_acc

In [111]:
def validate(model, partition, loss_fn, args):
    
    model.eval()

    val_acc = 0.0
    val_loss = 0.0
    
    length = 0
    for i in range(len(partition['val'])):
        length += len(partition['val'][i])
        
    with torch.no_grad():
        for _ in range(3):
            for i in range(len(partition['val'])):
                X, y = next(iter(partition['val'][i]))        
                X = X.to(args.device)
                y = y.to(args.device)

                X = X.transpose(0, 1).float()
                y_true = y[:, :, 0].float()

                model.hidden = [hidden.to(args.device) for hidden in model.init_hidden()]

                y_pred = model(X)

                loss = loss_fn(y_pred.view(-1), y_true.view(-1))
                
                val_loss += loss.item()
                val_acc += metric(y_pred, y_true)[0]

    val_loss = val_loss / length
    val_acc = val_acc / length
    return val_loss, val_acc

In [108]:
def experiment(partition, args):
    hash_key = hashlib.sha1(str(vars(args)).encode()).hexdigest()[:6]

    model = LSTM(args.input_dim, args.hid_dim, args.y_frames, args.n_layers, args.batch_size, args.dropout, args.use_bn)
    model.to(args.device)

    loss_fn = torch.nn.MSELoss()

    loss_fn = nn.MSELoss()
    if args.optim == 'SGD':
        optimizer = optim.RMSprop(model.parameters(), lr=args.lr, weight_decay=args.l2)
    elif args.optim == 'RMSprop':
        optimizer = optim.RMSprop(model.parameters(), lr=args.lr, weight_decay=args.l2)
    elif args.optim == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.l2)
    else:
        raise ValueError('In-valid optimizer choice. Choose one of SGD, RMSprop, Adam')
    
    # ===== List for epoch-wise data ====== #
    train_losses = []
    val_losses = []
    train_accs = []
    val_accs = []
    # ===================================== #

    for epoch in range(args.epoch):  # loop over the dataset multiple times
        
        start = time.time()
        model, train_loss, train_acc = train(model, partition, optimizer, loss_fn, args)

        val_loss, val_acc = validate(model, partition, loss_fn, args)

        end = time.time()

        print('Epoch {}, Acc(train/val): {:2.2f}/{:2.2f}, Loss(train/val) {:2.2f}/{:2.2f}. Took {:2.2f} sec'
              .format(epoch+1, train_acc, val_acc, train_loss*10, val_loss*10, end-start))
        
        # ====== Add Epoch Data ====== #
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accs.append(train_acc)
        val_accs.append(val_acc)
        # ============================ #

    # ======= Add Result to Dictionary ======= #
    result = {}
    result['train_losses'] = [x for x in train_losses]
    result['val_losses'] = [x for x in val_losses]
    result['train_accs'] = [x for x in train_accs]
    result['val_accs'] = [x for x in val_accs]
    result['train_acc'] = train_acc
    result['val_acc'] = val_acc
    
    return model, vars(args), result

In [97]:
import hashlib
import json
from os import listdir
from os.path import isfile, join


def save_exp_result(setting, result):
    exp_name = setting['exp_name']

    hash_key = hashlib.sha1(str(setting).encode()).hexdigest()[:6]
    
    filepath = './results'
    if not os.path.isdir(filepath):
        os.mkdir(filepath)

    filename = './results/{}-{}.json'.format(exp_name, hash_key)
    result.update(setting)
    with open(filename, 'w') as f:
        json.dump(result, f)

    
def load_exp_result(exp_name):
    dir_path = './results'
    filenames = [f for f in listdir(dir_path) if isfile(join(dir_path, f)) if '.json' in f]
    list_result = []
    for filename in filenames:
        if exp_name in filename:
            with open(join(dir_path, filename), 'r') as infile:
                results = json.load(infile)
                list_result.append(results)
    df = pd.DataFrame(list_result)
    return df

In [98]:
# ====== Random Seed Initialization ====== #
seed = 666
np.random.seed(seed)
torch.manual_seed(seed)

parser = argparse.ArgumentParser()
args = parser.parse_args("")
args.exp_name = "exp1_lr"
args.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

print(args.device)

cpu


In [99]:
# ====== Data Loading ====== #
# args.symbol = 'AAPL'
args.batch_size = 16
args.x_frames = 5
args.y_frames = 1

In [100]:
dataset_list=[]

for sym in result[0]:
    df = get_data(sym)
    df = Min_Max_Scaler(df)
    ds_train = StockDataset(df, args.x_frames, args.y_frames, '2000-01-01', '2018-01-01')
    ds_valid = StockDataset(df, args.x_frames, args.y_frames, '2018-01-01', '2019-01-01')
    
    partition = {'train': ds_train, 'val':ds_valid}
    dataset_list.append(partition)

In [101]:
loader_train = []

for dataset in dataset_list:
    trainloader = DataLoader(dataset['train'], batch_size=args.batch_size, shuffle=False, drop_last=True)
    loader_train.append(trainloader)

In [102]:
loader_valid = []

for dataset in dataset_list:
    validloader = DataLoader(dataset['val'], batch_size=args.batch_size, shuffle=False, drop_last=True)
    loader_valid.append(validloader)

In [103]:
partition = {'train': loader_train, 'val':loader_valid}

In [104]:
# ====== Model Capacity ===== #
args.input_dim = 6
args.hid_dim = 50
args.n_layers = 1

In [105]:
# ====== Regularization ======= #
args.l2 = 0.0001
args.dropout = 0.3
args.use_bn = True

In [114]:
# ====== Optimizer & Training ====== #
args.optim = 'Adam' #'RMSprop' #SGD, RMSprop, ADAM...
args.lr = 0.0001
args.epoch = 50
args.path = './model_checkpoint/'

In [115]:
# ====== Experiment Variable ====== #
name_var1 = 'hid_dim'
name_var2 = 'n_layers'
list_var1 = [10, 50, 100]
list_var2 = [1,2,3]

model_dict = {}

for var1 in list_var1:
    for var2 in list_var2:
        setattr(args, name_var1, var1)
        setattr(args, name_var2, var2)
        print(args)
                
        model, setting, result = experiment(partition, deepcopy(args))
        hash_key = hashlib.sha1(str(setting).encode()).hexdigest()[:6]
        model_dict[hash_key] = model
        save_exp_result(setting, result)

Namespace(batch_size=16, device='cpu', dropout=0.3, epoch=50, exp_name='exp1_lr', hid_dim=10, input_dim=6, l2=0.0001, lr=0.0001, n_layers=1, optim='Adam', path='./model_checkpoint/', use_bn=True, x_frames=5, y_frames=1)
Epoch 1, Acc(train/val): 3.21/129.13, Loss(train/val) 0.01/9.55. Took 50.48 sec
Epoch 2, Acc(train/val): 2.51/180.34, Loss(train/val) 0.00/20.10. Took 47.83 sec
Epoch 3, Acc(train/val): 2.02/194.41, Loss(train/val) 0.00/24.24. Took 48.41 sec
Epoch 4, Acc(train/val): 1.79/178.19, Loss(train/val) 0.00/20.96. Took 48.64 sec
Epoch 5, Acc(train/val): 1.72/194.17, Loss(train/val) 0.00/25.36. Took 49.23 sec
Epoch 6, Acc(train/val): 1.69/178.30, Loss(train/val) 0.00/21.34. Took 50.98 sec
Epoch 7, Acc(train/val): 1.66/175.02, Loss(train/val) 0.00/20.45. Took 49.39 sec
Epoch 8, Acc(train/val): 1.64/169.43, Loss(train/val) 0.00/19.08. Took 49.96 sec
Epoch 9, Acc(train/val): 1.63/173.30, Loss(train/val) 0.00/19.89. Took 48.81 sec
Epoch 10, Acc(train/val): 1.62/175.46, Loss(train/va

Epoch 46, Acc(train/val): 1.18/152.56, Loss(train/val) 0.00/15.86. Took 54.38 sec
Epoch 47, Acc(train/val): 1.18/158.59, Loss(train/val) 0.00/16.90. Took 59.72 sec
Epoch 48, Acc(train/val): 1.17/150.54, Loss(train/val) 0.00/15.18. Took 52.68 sec
Epoch 49, Acc(train/val): 1.18/152.03, Loss(train/val) 0.00/15.63. Took 52.74 sec
Epoch 50, Acc(train/val): 1.18/151.57, Loss(train/val) 0.00/15.42. Took 53.65 sec
Namespace(batch_size=16, device='cpu', dropout=0.3, epoch=50, exp_name='exp1_lr', hid_dim=10, input_dim=6, l2=0.0001, lr=0.0001, n_layers=3, optim='Adam', path='./model_checkpoint/', use_bn=True, x_frames=5, y_frames=1)
Epoch 1, Acc(train/val): 4.01/61.56, Loss(train/val) 0.01/2.07. Took 59.38 sec
Epoch 2, Acc(train/val): 3.10/94.91, Loss(train/val) 0.01/5.12. Took 60.73 sec
Epoch 3, Acc(train/val): 2.60/139.08, Loss(train/val) 0.00/11.62. Took 58.70 sec
Epoch 4, Acc(train/val): 2.21/163.43, Loss(train/val) 0.00/16.42. Took 68.81 sec
Epoch 5, Acc(train/val): 2.08/167.31, Loss(train/v

Epoch 42, Acc(train/val): 0.58/32.75, Loss(train/val) 0.00/0.85. Took 55.27 sec
Epoch 43, Acc(train/val): 0.57/26.41, Loss(train/val) 0.00/0.61. Took 64.62 sec
Epoch 44, Acc(train/val): 0.57/24.34, Loss(train/val) 0.00/0.49. Took 65.11 sec
Epoch 45, Acc(train/val): 0.57/26.18, Loss(train/val) 0.00/0.58. Took 66.02 sec
Epoch 46, Acc(train/val): 0.57/25.30, Loss(train/val) 0.00/0.54. Took 66.58 sec
Epoch 47, Acc(train/val): 0.56/31.08, Loss(train/val) 0.00/0.78. Took 73.54 sec
Epoch 48, Acc(train/val): 0.56/23.73, Loss(train/val) 0.00/0.43. Took 67.81 sec
Epoch 49, Acc(train/val): 0.56/22.24, Loss(train/val) 0.00/0.37. Took 72.57 sec
Epoch 50, Acc(train/val): 0.56/20.24, Loss(train/val) 0.00/0.32. Took 70.60 sec
Namespace(batch_size=16, device='cpu', dropout=0.3, epoch=50, exp_name='exp1_lr', hid_dim=50, input_dim=6, l2=0.0001, lr=0.0001, n_layers=2, optim='Adam', path='./model_checkpoint/', use_bn=True, x_frames=5, y_frames=1)
Epoch 1, Acc(train/val): 2.45/164.26, Loss(train/val) 0.00/1

Epoch 39, Acc(train/val): 0.54/25.41, Loss(train/val) 0.00/0.39. Took 65.07 sec
Epoch 40, Acc(train/val): 0.53/24.85, Loss(train/val) 0.00/0.37. Took 66.20 sec
Epoch 41, Acc(train/val): 0.53/26.62, Loss(train/val) 0.00/0.42. Took 65.25 sec
Epoch 42, Acc(train/val): 0.53/31.50, Loss(train/val) 0.00/0.60. Took 65.77 sec
Epoch 43, Acc(train/val): 0.53/28.20, Loss(train/val) 0.00/0.47. Took 78.11 sec
Epoch 44, Acc(train/val): 0.53/31.64, Loss(train/val) 0.00/0.60. Took 84.05 sec
Epoch 45, Acc(train/val): 0.52/31.43, Loss(train/val) 0.00/0.59. Took 81.87 sec
Epoch 46, Acc(train/val): 0.52/26.17, Loss(train/val) 0.00/0.41. Took 80.12 sec
Epoch 47, Acc(train/val): 0.52/24.05, Loss(train/val) 0.00/0.34. Took 81.06 sec
Epoch 48, Acc(train/val): 0.52/27.36, Loss(train/val) 0.00/0.44. Took 81.40 sec
Epoch 49, Acc(train/val): 0.52/22.87, Loss(train/val) 0.00/0.31. Took 80.80 sec
Epoch 50, Acc(train/val): 0.52/24.64, Loss(train/val) 0.00/0.36. Took 74.44 sec
Namespace(batch_size=16, device='cpu', d

Epoch 36, Acc(train/val): 0.53/18.16, Loss(train/val) 0.00/0.21. Took 86.40 sec
Epoch 37, Acc(train/val): 0.53/18.13, Loss(train/val) 0.00/0.21. Took 87.55 sec
Epoch 38, Acc(train/val): 0.53/10.65, Loss(train/val) 0.00/0.09. Took 77.75 sec
Epoch 39, Acc(train/val): 0.52/14.55, Loss(train/val) 0.00/0.14. Took 86.35 sec
Epoch 40, Acc(train/val): 0.52/16.18, Loss(train/val) 0.00/0.16. Took 82.59 sec
Epoch 41, Acc(train/val): 0.52/13.06, Loss(train/val) 0.00/0.12. Took 84.82 sec
Epoch 42, Acc(train/val): 0.52/16.64, Loss(train/val) 0.00/0.17. Took 69.26 sec
Epoch 43, Acc(train/val): 0.51/19.59, Loss(train/val) 0.00/0.24. Took 65.14 sec
Epoch 44, Acc(train/val): 0.51/22.08, Loss(train/val) 0.00/0.30. Took 66.00 sec
Epoch 45, Acc(train/val): 0.51/20.46, Loss(train/val) 0.00/0.26. Took 65.67 sec
Epoch 46, Acc(train/val): 0.51/17.21, Loss(train/val) 0.00/0.19. Took 66.00 sec
Epoch 47, Acc(train/val): 0.51/18.25, Loss(train/val) 0.00/0.21. Took 64.70 sec
Epoch 48, Acc(train/val): 0.51/22.45, Lo