In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tqdm._tqdm_notebook import tqdm_notebook

## Data Preprocessing

In [2]:
aapl = pd.read_csv("stocks_data/AAPL.csv")

In [3]:
aapl.describe()

Unnamed: 0,High,Low,Open,Close,Volume,Adj Close
count,4947.0,4947.0,4947.0,4947.0,4947.0,4947.0
mean,56.837998,55.746272,56.303765,56.303756,116447600.0,52.701389
std,60.897304,59.873248,60.377622,60.393604,98919590.0,58.828695
min,0.942143,0.908571,0.927857,0.937143,5420532.0,0.817857
25%,4.715402,4.526072,4.643929,4.62,49157800.0,4.031936
50%,28.198572,27.507143,27.881428,27.818571,88923800.0,24.277634
75%,97.451427,95.653572,96.514999,96.625,152868400.0,89.866283
max,233.470001,229.779999,230.779999,232.070007,1855410000.0,228.523819


In [4]:
aapl.shape

(4947, 7)

In [5]:
aapl.head(-5)

Unnamed: 0,Date,High,Low,Open,Close,Volume,Adj Close
0,2000-01-03,4.017857,3.631696,3.745536,3.997768,133949200.0,3.488905
1,2000-01-04,3.950893,3.613839,3.866071,3.660714,128094400.0,3.194754
2,2000-01-05,3.948661,3.678571,3.705357,3.714286,194580400.0,3.241507
3,2000-01-06,3.821429,3.392857,3.790179,3.392857,191993200.0,2.960991
4,2000-01-07,3.607143,3.410714,3.446429,3.553571,115183600.0,3.101249
...,...,...,...,...,...,...,...
4937,2019-08-19,212.729996,210.029999,210.619995,210.350006,24413600.0,210.350006
4938,2019-08-20,213.350006,210.320007,210.880005,210.360001,26884300.0,210.360001
4939,2019-08-21,213.649994,211.600006,212.990005,212.639999,21535400.0,212.639999
4940,2019-08-22,214.440002,210.750000,213.190002,212.460007,22253700.0,212.460007


In [6]:
# Chronological 80/20 split: shuffle=False keeps the rows in file order, so the
# first 80% of rows become the training set and the last 20% the test set.
df_train, df_test = train_test_split(aapl, train_size=0.8, test_size=0.2, shuffle=False)

In [7]:
print("train: {0} test:{1}".format(df_train.shape,df_test.shape))

train: (3957, 7) test:(990, 7)


In [8]:
# Feature columns extracted from the frame. The order is load-bearing: later
# cells address the target column by position (index 3 == "Close").
selected_cols = ["Open","High","Low","Close","Volume"]

In [9]:
x_train = df_train.loc[:,selected_cols].values

In [10]:
x_train[:5]

array([[3.74553561e+00, 4.01785707e+00, 3.63169646e+00, 3.99776793e+00,
        1.33949200e+08],
       [3.86607146e+00, 3.95089293e+00, 3.61383939e+00, 3.66071439e+00,
        1.28094400e+08],
       [3.70535707e+00, 3.94866061e+00, 3.67857146e+00, 3.71428561e+00,
        1.94580400e+08],
       [3.79017854e+00, 3.82142854e+00, 3.39285707e+00, 3.39285707e+00,
        1.91993200e+08],
       [3.44642854e+00, 3.60714293e+00, 3.41071439e+00, 3.55357146e+00,
        1.15183600e+08]])

In [11]:
scaler = MinMaxScaler()
# fit_transform returns a new array and does not modify its input, so the
# result must be bound to a name. Previously it was discarded, which left
# x_train in raw units while x_test was scaled to the training min/max range.
# The input is re-derived from df_train so re-running this cell always fits
# the scaler on the raw training rows.
x_train = scaler.fit_transform(df_train.loc[:,selected_cols].values)
x_train

array([[0.02110112, 0.02302218, 0.02086823, 0.02317552, 0.0672496 ],
       [0.0220038 , 0.02252095, 0.02073139, 0.0206233 , 0.06407726],
       [0.02080023, 0.02250424, 0.02122745, 0.02102895, 0.1001018 ],
       ...,
       [0.84213532, 0.84760239, 0.85531618, 0.85158584, 0.02195045],
       [0.84400753, 0.85164438, 0.86129364, 0.8585522 , 0.01404532],
       [0.8411618 , 0.85748279, 0.85416673, 0.86370127, 0.0218818 ]])

In [12]:
# Scale the test rows with the scaler that was fitted on the training rows
# (transform only, no re-fit), so the test data does not influence min/max.
x_test = scaler.transform(df_test.loc[:,selected_cols].values)

In [13]:
x_test[:5]

array([[0.86505119, 0.86639013, 0.86681119, 0.86150534, 0.02509619],
       [0.84565509, 0.8505216 , 0.85470316, 0.84431658, 0.0229056 ],
       [0.8380165 , 0.84258735, 0.81960505, 0.81872267, 0.03442309],
       [0.81809618, 0.8278416 , 0.82627218, 0.82811218, 0.03068854],
       [0.80985847, 0.81347012, 0.81539019, 0.82266022, 0.02931016]])

In [14]:
# Sanity check: neither the training nor the test feature matrix may contain
# NaNs. (The second operand previously re-checked x_train instead of x_test.)
print(np.isnan(x_train).any(), np.isnan(x_test).any())

False False


In [15]:
x_train.shape

(3957, 5)

In [18]:
def trim_dataset(mat,batch_size):
    """Cut trailing rows so the row count is a whole multiple of ``batch_size``.

    Args:
        mat: array whose first axis is the sample axis.
        batch_size: number of samples per batch.

    Returns:
        ``mat`` itself when its length already divides evenly, otherwise a
        slice of ``mat`` without the leftover rows at the end.
    """
    total_rows = mat.shape[0]
    leftover = total_rows % batch_size

    # Nothing to drop: hand back the very same object.
    if not leftover > 0:
        return mat

    return mat[:total_rows - leftover]

In [16]:
def build_timeseries(mat, y_col_index,time_steps=50):
    """Turn a 2-D feature matrix into sliding windows and next-row targets.

    Sample ``i`` is the block of rows ``i .. i + time_steps - 1`` (all
    columns); its target is column ``y_col_index`` of row ``i + time_steps``,
    i.e. the row immediately after the window.

    The windows are gathered with a single NumPy indexing operation instead of
    a Python loop, so no progress bar is shown.

    Args:
        mat: NumPy array of shape (n_rows, n_features).
        y_col_index: column of ``mat`` used as the prediction target.
        time_steps: number of consecutive rows per window.

    Returns:
        Tuple ``(x, y)`` of float64 arrays with shapes
        ``(n_rows - time_steps, time_steps, n_features)`` and
        ``(n_rows - time_steps,)``.

    Raises:
        ValueError: if ``time_steps`` is negative or ``mat`` has fewer than
            ``time_steps`` rows.
    """
    n_rows = mat.shape[0]
    dim_0 = n_rows - time_steps
    if time_steps < 0 or dim_0 < 0:
        raise ValueError(
            "time_steps must be between 0 and the number of rows in mat "
            "(got time_steps={0}, rows={1})".format(time_steps, n_rows)
        )

    # window_idx[i, j] == i + j: row j of window i. Broadcasting a column of
    # window starts against a row of in-window offsets yields every index.
    window_idx = np.arange(dim_0)[:, None] + np.arange(time_steps)[None, :]

    # Integer-array indexing already copies; only convert if dtype differs.
    x = mat[window_idx].astype(np.float64, copy=False)
    # Target for window i lives at row i + time_steps, so the targets are
    # simply the chosen column from row `time_steps` onwards.
    y = mat[time_steps:, y_col_index].astype(np.float64)

    print("length of time-series i/o",x.shape,y.shape)
    return x, y

In [17]:
# Index 3 is "Close" in selected_cols: each sample is a 50-row window (the
# default time_steps) of all five features, and its target is the Close value
# of the row right after the window.
# Note: this rebinds x_train from the 2-D feature matrix to the 3-D window
# array, so the cell should not be re-run without re-running the cells above.
x_train,y_train = build_timeseries(x_train,3)

HBox(children=(IntProgress(value=0, max=3907), HTML(value='')))


length of time-series i/o (3907, 50, 5) (3907,)


In [19]:
# Drop the trailing samples so the sample count is an exact multiple of 20
# (presumably the training batch size -- confirm). x and y are trimmed with
# the same arguments so windows and targets stay aligned.
x_train = trim_dataset(x_train,20)
y_train = trim_dataset(y_train,20)

In [20]:
x_train.shape

(3900, 50, 5)

## Model Definition

In [21]:
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
class LSTM(nn.Module):
    """Recurrent regressor: stacked ``nn.RNN`` layers followed by a linear layer.

    NOTE(review): despite the class name, the recurrent layer is a plain
    ``nn.RNN``, not ``nn.LSTM``. It is left unchanged here because switching
    would turn the returned hidden state into an ``(h, c)`` tuple -- confirm
    which was intended.

    Args:
        input_size: number of features per time step.
        output_size: number of values produced by the linear layer.
        hidden_dim: size of the recurrent hidden state.
        n_layers: number of stacked recurrent layers.
    """

    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        # Zero-argument super(): the previous `super(Model, self)` referenced
        # a name that does not exist and raised NameError on construction.
        super().__init__()

        # Defining some parameters
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        # Defining the layers
        # RNN layer; batch_first=True means inputs are (batch, seq_len, features)
        self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first=True)
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x):
        """Run the recurrent stack and apply the linear layer to every step.

        Args:
            x: tensor of shape (batch, seq_len, input_size).

        Returns:
            Tuple ``(out, hidden)``. ``out`` has shape
            ``(batch * seq_len, output_size)`` -- one row per time step of
            every sequence, not one row per sequence. ``hidden`` is the final
            hidden state returned by the recurrent layer.
        """
        batch_size = x.size(0)

        # Fresh zero hidden state for every forward pass
        hidden = self.init_hidden(batch_size)

        # Passing in the input and hidden state into the model and obtaining outputs
        out, hidden = self.rnn(x, hidden)

        # Flatten (batch, seq_len, hidden_dim) -> (batch * seq_len, hidden_dim)
        # so the fully connected layer is applied to each time step.
        out = out.contiguous().view(-1, self.hidden_dim)
        out = self.fc(out)

        return out, hidden

    def init_hidden(self, batch_size):
        """Return a zero hidden state of shape (n_layers, batch_size, hidden_dim).

        The tensor is created with the dtype and device of the model's own
        parameters so it stays compatible after ``model.to(device)`` or a
        dtype change.
        """
        # `th` is this file's alias for torch; the bare name `torch` is not
        # imported and previously raised NameError here.
        reference = next(self.parameters())
        hidden = th.zeros(
            self.n_layers,
            batch_size,
            self.hidden_dim,
            dtype=reference.dtype,
            device=reference.device,
        )
        return hidden