In [1]:
import d2l
import math
import mxnet as mx
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import loss as gloss, nn, rnn
from mxnet.gluon import data as gdata
import time
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
df = pd.read_csv('all_stocks_5yr.csv')
df.describe()

Unnamed: 0,open,high,low,close,volume
count,619029.0,619032.0,619032.0,619040.0,619040.0
mean,83.023334,83.778311,82.256096,83.043763,4321823.0
std,97.378769,98.207519,96.507421,97.389748,8693610.0
min,1.62,1.69,1.5,1.59,0.0
25%,40.22,40.62,39.83,40.245,1070320.0
50%,62.59,63.15,62.02,62.62,2082094.0
75%,94.37,95.18,93.54,94.41,4284509.0
max,2044.0,2067.99,2035.11,2049.0,618237600.0


In [4]:
df1 = df.ffill().copy()

In [4]:

df1['open'] = np.log(df1['open'])
df1['high'] = 10* (np.log(df1['high']) - np.log(df1['open']))
df1['low'] = 10* (np.log(df1['low']) - np.log(df1['open']))
df1['close'] = np.log(df1['close'])
df1['volume'] = np.log(df1['volume'])
df1.loc[:, 'date'] = pd.to_datetime(df1.loc[:,'date'], format="%Y/%m/%d")
df1['year'] = pd.DatetimeIndex(df1['date']).year
df1.head()

Unnamed: 0,date,open,high,low,close,volume,Name,year
0,2013-02-08,2.712706,17.180717,16.851275,2.691243,15.944635,AAL,2013
1,2013-02-11,2.70069,17.152094,16.639512,2.671386,15.999537,AAL,2013
2,2013-02-12,2.670694,16.924995,16.638363,2.658159,15.910579,AAL,2013
3,2013-02-13,2.66026,17.256185,16.783332,2.685123,16.143715,AAL,2013
4,2013-02-14,2.704042,17.106322,15.824342,2.638343,17.277486,AAL,2013


In [5]:
df2 = df1[df1['year'] == 2018]
df2.to_csv (r'test.csv', index = None, header=True)
df3 = df1[df1['year'] != 2018]
df3.to_csv (r'train.csv', index = None, header=True)

In [6]:
name = list(np.unique(df3[df3['year'] == 2017].Name))

## Generate the feature matrix

In [7]:
def create_feature(df3):
    
    grouped = df3.groupby('date')
    i = 0
    price = nd.zeros((2525, len(grouped)))
    
    for date, group in grouped:
        rec = grouped.get_group(date).reset_index(drop = True)
        price_date = np.zeros((505,5))
        for a in range(len(name)):
            if len(rec[rec['Name'] == name[a]]) > 0:
                price_date[a] = rec[rec['Name'] == name[a]].iloc[:,1:6].values
            '''
            else:
                print('here == 0: ', name[a])
                unnamed.append(name[a])
            '''    
        price[:,i] = price_date.flatten()
        i+=1
    return price

test_feature = create_feature(df2) #test
print('test feature matrix: \n',test_feature)

train_feature = create_feature(df3) #train
print('train feature matrix: \n',train_feature)

test feature matrix: 
 
[[ 4.210942   4.213904   4.2419024 ...  4.260706   4.2040954  4.222298 ]
 [27.802025  28.027933  28.009087  ... 28.199827  27.955805  27.949581 ]
 [27.72068   27.752186  27.859013  ... 27.727163  27.555634  27.777302 ]
 ...
 [28.142883  28.150318  28.246767  ... 28.252508  28.229082  28.307703 ]
 [ 4.2734666  4.278054   4.2840004 ...  4.301765   4.2941513  4.302171 ]
 [14.574242  14.660737  14.745314  ... 14.901385  15.409698  15.327316 ]]
<NDArray 2525x26 @cpu(0)>
train feature matrix: 
 
[[ 3.8082168  3.8104331  3.8024313 ...  4.207822   4.2112384  4.2121277]
 [24.772491  24.729116  24.6991    ... 27.745897  27.740746  27.753443 ]
 [24.695015  24.566221  24.598486  ... 27.699837  27.672344  27.656794 ]
 ...
 [22.293644  22.405806  22.458382  ... 28.25052   28.238503  28.2227   ]
 [ 3.4980216  3.5043554  3.5186841 ...  4.2828965  4.2820683  4.2772217]
 [14.770726  14.211676  14.308546  ... 13.963733  13.473722  14.34856  ]]
<NDArray 2525x1233 @cpu(0)>


In [5]:
train_feature2 = train_feature
for i in range(2525):
    if train_feature2[i].min() == 0:
        max_index = np.nonzero(train_feature2[i])[0].max()
        min_index = np.nonzero(train_feature2[i])[0].min()
        if max_index == 1232:
            #print(train_feature2[i])
            train_feature2[i,:min_index+1] = train_feature2[i,min_index+1]
            #print(train_feature2[i])
        elif min_index == 0:
            train_feature2[i,:max_index+1:] = train_feature2[i,max_index]

NameError: name 'train_feature' is not defined

In [4]:
train_feature2 = train_feature2.T
feature = train_feature2.asnumpy()
np.savetxt("feature.csv", feature, delimiter=",")

NameError: name 'train_feature2' is not defined

### Run this when import from csv file

## Generate the label (train and test)

In [2]:
def create_label(df3):
    
    grouped = df3.groupby('date')
    i = 0
    price = nd.zeros((505, len(grouped)))
    
    for date, group in grouped:
        rec = grouped.get_group(date).reset_index(drop = True)
        price_date = nd.zeros((505,))
        for a in range(len(name)):
            if len(rec[rec['Name'] == name[a]]) > 0:
                price_date[a] = rec[rec['Name'] == name[a]].iloc[:,1].values
        price[:,i] = price_date
        i+=1
    return price

In [3]:
train_label = create_label(df3)
print('train label matrix: \n',train_label)

NameError: name 'df3' is not defined

In [20]:
test_label = create_label(df2)
print('test label matrix: \n',test_label)

test label matrix: 
 
[[4.210942  4.213904  4.2419024 ... 4.260706  4.2040954 4.222298 ]
 [3.9575698 3.9676468 3.960432  ... 3.9510515 3.8983297 3.9300594]
 [4.61413   4.6673937 4.679814  ... 4.7278304 4.6847205 4.719302 ]
 ...
 [4.7957907 4.825991  4.8306313 ... 4.8266325 4.7957907 4.801148 ]
 [3.9545076 3.9257286 3.9322176 ... 3.980429  3.9201896 3.955657 ]
 [4.284138  4.2734666 4.287029  ... 4.339119  4.2868915 4.286341 ]]
<NDArray 505x26 @cpu(0)>


In [25]:
train_label2 = train_label
for i in range(505):
    if train_label2[i].min() == 0:
        max_index = np.nonzero(train_label2[i])[0].max()
        min_index = np.nonzero(train_label2[i])[0].min()
        if max_index == 1232:
            #print(train_feature2[i])
            train_label2[i,:min_index+1] = train_label2[i,min_index+1]
            #print(train_feature2[i])
        elif min_index == 0:
            train_label2[i,:max_index+1:] = train_label2[i,max_index]

In [26]:
test_label = test_label.T

train_label2 = train_label2.T
label = train_label2.asnumpy()
np.savetxt("label.csv", label, delimiter=",")

### Label file Read from CSV

In [59]:
feature_df = pd.read_csv("feature.csv", header = None)
featureMatrix = nd.array(feature_df.values)

In [60]:
label_df = pd.read_csv("label.csv", header = None)
labelMatrix = nd.array(label_df.values)

In [61]:
ctx = d2l.try_gpu()
featureMatrix = featureMatrix[226:1233,:].as_in_context(ctx)
labelMatrix = labelMatrix[226:,:].as_in_context(ctx)
print(featureMatrix.shape, labelMatrix.shape)
batch_size = 10
train_iter = gdata.DataLoader(gdata.ArrayDataset(featureMatrix, labelMatrix), batch_size=batch_size,\
                              shuffle=False,last_batch = 'discard')

(1007, 2525) (1007, 505)


## Regression

In [9]:
import sys
sys.path.insert(0, '..')
import math
import time

In [10]:
class RNNReg(nn.Block):
    def __init__(self, rnn_layer, out_size=505, **kwargs):
        super(RNNReg, self).__init__(**kwargs)
        self.rnn = rnn_layer
        self.out_size = out_size # we only predict the open price next day
        self.dense = nn.Dense(out_size)

    def forward(self, inputs, state):
        # the shape is (batch_size, time_step_forward, sample_length)
        X = inputs.reshape(1, inputs.shape[0], inputs.shape[1])
        Y, state = self.rnn(X, state)
        output = self.dense(Y.reshape((-1, Y.shape[-1])))
        return output, state

    def begin_state(self, *args, **kwargs):
        return self.rnn.begin_state(*args, **kwargs)

In [11]:
def predict_rnn_gluon(inputs, step_forward, model, ctx):
    # inputs should be of dimension (days of that year)*2525
    state = model.begin_state(batch_size=1, ctx=ctx)
    output = [inputs[0]]
    for t in range(len(inputs) + step_forward - 1):
        X = nd.array(output[-1], ctx=ctx)
        (Y, state) = model(X, state)
        if t < len(inputs) - 1:
            output.append(inputs[t+1])
        else:
            output.append(Y.reshape((-1, Y.shape[-1])))
    return output

In [12]:
def grad_clipping_gluon(model, theta, ctx):
    params = [p.data() for p in model.collect_params().values()]
    d2l.grad_clipping(params, theta, ctx)

In [52]:

# this is the dummy data, need to be replaced with
# X = (batch_size, all_data_per_day), Y = (batch_size, all_open_price_next_day)
# where X and Y are both ndarrays, so that just treat them as train_features and train_labels
#train_features = nd.zeros((60, 2525), ctx=ctx)
#train_labels = nd.ones((60, 505), ctx=ctx)


def train_and_predict_rnn_gluon(model, num_hiddens, data_iter, ctx, num_epochs,\
                                num_steps, lr, clipping_theta, batch_size):
    loss = gloss.L2Loss()
    model.initialize(ctx=ctx, force_reinit=True, init=init.Normal(0.01))
    trainer = gluon.Trainer(model.collect_params(), 'adam',
                            {'learning_rate': lr, 'wd': 0})
    start = time.time()
    i = 0
    
    for epoch in range(num_epochs):
        state = model.begin_state(batch_size=batch_size, ctx=ctx)
        for X, Y in data_iter:
            i += 1
            for s in state:
                s.detach()
            with autograd.record():
                (output, state) = model(X, state)
                # y = Y.T.reshape((-1,))
                l = loss(output, Y).mean()
                if math.isnan(l.asscalar()):
                    print('Y=', Y, '\n------------------------------\n', 'output=', output)
            l.backward()
            # Clip the gradient
            grad_clipping_gluon(model, clipping_theta, ctx)
            # Since the error has already taken the mean, the gradient does
            # not need to be averaged
            trainer.step(1)
        if (epoch + 1) % 50 == 0:
            print('epoch: ', epoch+1, ', loss: ', l.asscalar())
    #return checkX, checkY, checkout

count = 0
for i in range(20):
    for j in range(505):
        count+=1
        if count == 5051:
            print(checkout[i,j].asscalar())
        elif count == 5053:
            print(checkout[i,j].asscalar())

## RNN - single layer

In [64]:
num_steps = 1
num_epochs, lr, clipping_theta = 2, .0001, 1e-3

num_hiddens = 1024
rnn_layer = rnn.RNN(num_hiddens)
rnn_layer.initialize(ctx=ctx)
single_rnn_model = RNNReg(rnn_layer, 505)
single_rnn_model.initialize(force_reinit=True, ctx = ctx)

train_and_predict_rnn_gluon(single_rnn_model, num_hiddens, train_iter, ctx, num_epochs,\
                            num_steps, lr, clipping_theta, batch_size)

Y= 
[[3.6488385 3.717224  4.9939637 ... 4.611947  3.1954024 3.7812307]
 [3.6467545 3.741709  4.999507  ... 4.6395717 3.1846983 3.8122027]
 [3.609295  3.6941156 4.9570937 ... 4.619369  3.1398327 3.769076 ]
 ...
 [3.635215  3.6888795 4.97632   ... 4.5978436 3.0694473 3.7748277]
 [3.6216707 3.6893792 4.9766645 ... 4.5897517 3.06152   3.7669973]
 [3.6346872 3.6566145 4.9797635 ... 4.600961  3.0320642 3.7681527]]
<NDArray 10x505 @gpu(0)> 
------------------------------
 output= 
[[      nan       nan       nan ...       nan       nan       nan]
 [3.571809  3.9056041 4.9104524 ... 4.5692925 3.1931314 3.7374125]
 [3.5656755 3.8939724 4.8847666 ... 4.5427556 3.205308  3.733468 ]
 ...
 [3.5879107 3.9067445 4.8730226 ... 4.5769267 3.19339   3.712761 ]
 [3.581739  3.9055588 4.8731585 ... 4.5701766 3.195555  3.706445 ]
 [3.5847254 3.8843894 4.857658  ... 4.570254  3.194002  3.7087502]]
<NDArray 10x505 @gpu(0)>
Y= 
[[3.6214032 3.6926224 5.018273  ... 4.6238937 3.0521126 3.7766614]
 [3.6181886 3.679

Y= 
[[4.0775375 3.9104216 4.757462  ... 4.8511705 3.7873664 4.1345263]
 [4.0874877 3.9275026 4.7660127 ... 4.85989   3.7896295 4.1335654]
 [4.0807524 3.9207845 4.7016616 ... 4.856318  3.8013148 4.1385207]
 ...
 [4.0998297 3.9725535 4.6154175 ... 4.8556175 3.8093257 4.134366 ]
 [4.114964  3.9778109 4.6120467 ... 4.8404794 3.8044379 4.136126 ]
 [4.1154537 3.9852734 4.641116  ... 4.861826  3.7918873 4.145513 ]]
<NDArray 10x505 @gpu(0)> 
------------------------------
 output= 
[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
<NDArray 10x505 @gpu(0)>
Y= 
[[4.1206617 3.9928653 4.6231084 ... 4.886658  3.7990782 4.1502523]
 [4.1178985 3.985831  4.6228137 ... 4.884921  3.783735  4.1431346]
 [4.116106  3.9562314 4.609859  ... 4.8778653 3.7857792 4.1385207]
 ...
 [4.103965  3.9245443 4.6608887 ... 4.858416  3.8286414 4.108905 ]
 [4.09134   3.9187999 4.682

 output= 
[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
<NDArray 10x505 @gpu(0)>
Y= 
[[3.6643305 3.900153  5.0232224 ... 4.7617464 3.2132602 3.749504 ]
 [3.6719875 3.8945715 5.018405  ... 4.7638817 3.2112467 3.7674596]
 [3.6331024 3.8995454 4.9972124 ... 4.743627  3.1974478 3.7565382]
 ...
 [3.6581626 4.0233855 5.048252  ... 4.7612333 3.2051826 3.7850983]
 [3.6558397 3.9887989 5.0631647 ... 4.7549686 3.188829  3.775057 ]
 [3.6638181 3.9663217 5.0719814 ... 4.756775  3.2104404 3.7780344]]
<NDArray 10x505 @gpu(0)> 
------------------------------
 output= 
[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
<NDArray 10x505 @gpu(0)>
Y= 
[[3.637586  3.9327075 5.0732346 ... 4.7242856 3.174715  3.7623622]
 [3.63

Y= 
[[3.7912104 3.3703945 5.0904317 ... 4.7923965 3.1974478 3.85397  ]
 [3.7762034 3.3235962 5.081777  ... 4.7859907 3.148024  3.8546057]
 [3.7948148 3.346037  5.0847526 ... 4.800737  3.198673  3.8732822]
 ...
 [3.8471644 3.558486  5.096568  ... 4.8369164 3.2623184 3.904394 ]
 [3.849296  3.5698144 5.108669  ... 4.8414273 3.267666  3.9136217]
 [3.8416007 3.584352  5.1054606 ... 4.8303914 3.2580965 3.9058037]]
<NDArray 10x505 @gpu(0)> 
------------------------------
 output= 
[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
<NDArray 10x505 @gpu(0)>
Y= 
[[3.8431015 3.577389  5.0947924 ... 4.8263116 3.249987  3.9019728]
 [3.849296  3.5854614 5.103579  ... 4.8309503 3.2557862 3.9023767]
 [3.849722  3.558201  5.0967517 ... 4.835885  3.2596338 3.9162142]
 ...
 [3.854182  3.5854614 5.1039433 ... 4.8402424 3.3279095 3.9090185]
 [3.8605192 3.5907152 5.135

In [None]:
inputs = test_feature.T
predict = predict_rnn_gluon(inputs, 1, single_rnn_model, ctx)
print(predict)

## GRU

In [None]:
num_steps = 30
num_epochs, batch_size, lr, clipping_theta = 200, 20, .1, 1e-2

gru_layer = rnn.GRU(num_hiddens)
gru_layer.initialize(ctx=ctx)
gru_model = RNNReg(rnn_layer, 505)
gru_model.initialize(force_reinit=True, ctx = ctx)

In [None]:
train_and_predict_rnn_gluon(gru_model, num_hiddens, train_iter, ctx, num_epochs,\
                            num_steps, lr, clipping_theta, batch_size)

In [None]:
inputs = test_feature.T
predict = predict_rnn_gluon(inputs, 1, single_rnn_model, ctx)
print(predict)