In [1]:
from pandas_datareader import data as web
import os
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import torch

In [91]:
# Assets analysed in this notebook.
# The raw index symbols ('^GSPC', '^DJI', '^IXIC', '^RUT') are kept here for
# reference only; the ETF tickers below are what actually gets loaded.
assets = [
    'SPY',
    'DIA',
    'ONEQ',
    'IWM',
]

# Name used by the following cells; it refers to the very same list object.
asset_list = assets

In [94]:
# Build `asset_dict`: one DataFrame of daily price data per asset.
# For every asset we first look for a cached CSV in `directory`; if it is not
# there the data is downloaded from Yahoo! and written to the cache.

start_date = '01-01-2005'
stop_date = '05-01-2018'
directory = 'yahoo_data'

# Create the cache directory if needed (no error when it already exists).
os.makedirs(directory, exist_ok=True)

asset_dict = {}

for asset in asset_list:
    # NOTE(review): the cache file name only encodes start_date, so changing
    # stop_date alone will silently reuse a previously cached file.
    pathname = directory + '/' + asset + '_' + start_date + '.csv'
    if os.path.isfile(pathname):
        print('Loading from file: ', pathname)
        df = pd.read_csv(pathname, index_col='Date')
    else:
        print('Downloading from Yahoo! - ', asset)
        df = web.DataReader(asset, data_source='yahoo', start=start_date, end=stop_date)
        df.to_csv(pathname)

    # Just before we put it in the dictionary, copy the index to a Date field
    # and make a new index which enumerates all the entries.
    df.insert(0, 'Date', df.index)
    df.index = np.arange(df.shape[0])

    # Normalise 'Date' to datetime64 on both paths. No explicit format is
    # given: the previous '%Y/%m/%d' did not match the ISO 'YYYY-MM-DD' text
    # stored in the cached CSV, which made the load-from-file path depend on
    # lenient parsing.
    df['Date'] = pd.to_datetime(df['Date'])

    asset_dict[asset] = df

Downloading from Yahoo! -  SPY
Downloading from Yahoo! -  DIA
Downloading from Yahoo! -  ONEQ
Downloading from Yahoo! -  IWM


In [95]:
# Show each asset's table, followed by its summary statistics prefixed with
# the ticker.
for asset in asset_list:
    frame = asset_dict[asset]
    display(frame)
    print(asset, frame.describe())

Unnamed: 0,Date,High,Low,Open,Close,Volume,Adj Close
0,2005-01-03,121.760002,119.900002,121.559998,120.300003,55748000.0,87.421600
1,2005-01-04,120.540001,118.440002,120.459999,118.830002,69167600.0,86.353340
2,2005-01-05,119.250000,118.000000,118.739998,118.010002,65667300.0,85.757454
3,2005-01-06,119.150002,118.260002,118.440002,118.610001,47814700.0,86.193451
4,2005-01-07,119.230003,118.129997,118.970001,118.440002,55847700.0,86.069946
...,...,...,...,...,...,...,...
3350,2018-04-25,264.130005,260.850006,262.910004,263.630005,103840900.0,250.251343
3351,2018-04-26,267.250000,264.290009,264.790009,266.309998,67731900.0,252.795288
3352,2018-04-27,267.339996,265.500000,267.000000,266.559998,57053600.0,253.032639
3353,2018-04-30,267.890015,264.429993,267.260010,264.510010,82182300.0,251.086670


SPY               High          Low         Open        Close        Volume  \
count  3355.000000  3355.000000  3355.000000  3355.000000  3.355000e+03   
mean    158.618683   156.850125   157.777961   157.785174  1.480847e+08   
std      47.267962    47.257414    47.268697    47.273572  1.009376e+08   
min      70.000000    67.099998    67.949997    68.110001  1.527000e+07   
25%     124.024998   122.645000   123.509998   123.364998  7.687135e+07   
50%     141.869995   140.380005   141.160004   141.190002  1.196369e+08   
75%     199.370003   197.560005   198.470001   198.644997  1.860118e+08   
max     286.630005   284.500000   285.929993   286.579987  8.710263e+08   

         Adj Close  
count  3355.000000  
mean    133.794391  
std      50.344772  
min      53.503864  
25%      94.983700  
50%     112.221672  
75%     176.385826  
max     270.951447  


Unnamed: 0,Date,High,Low,Open,Close,Volume,Adj Close
0,2005-01-03,108.589996,106.930000,108.150002,107.320000,11691300.0,73.506165
1,2005-01-04,107.599998,105.889999,107.500000,106.300003,10628300.0,72.807533
2,2005-01-05,106.779999,105.709999,106.300003,105.709999,10885300.0,72.403397
3,2005-01-06,106.660004,105.830002,106.129997,106.040001,8238900.0,72.629478
4,2005-01-07,106.500000,105.629997,106.330002,105.839996,8555700.0,72.492485
...,...,...,...,...,...,...,...
3350,2018-04-25,241.330002,238.089996,240.289993,240.699997,5269200.0,226.737381
3351,2018-04-26,243.880005,241.119995,241.429993,243.119995,3845800.0,229.017075
3352,2018-04-27,243.440002,241.770004,242.970001,242.949997,3209800.0,228.856888
3353,2018-04-30,244.839996,241.509995,244.100006,241.539993,3475700.0,227.528656


DIA               High          Low         Open        Close        Volume  \
count  3355.000000  3355.000000  3355.000000  3355.000000  3.355000e+03   
mean    141.717285   140.264581   141.021648   141.029833  9.071673e+06   
std      40.051009    39.990225    40.029948    40.023839  8.081543e+06   
min      67.169998    64.779999    65.510002    65.440002  1.006800e+06   
25%     111.160004   109.685001   110.279999   110.400002  4.433950e+06   
50%     129.460007   128.000000   128.820007   128.800003  6.657000e+06   
75%     170.584999   169.260002   169.889999   170.020004  1.030350e+07   
max     265.929993   264.119995   265.559998   265.910004  9.169520e+07   

         Adj Close  
count  3355.000000  
mean    116.519076  
std      43.828723  
min      49.443554  
25%      81.676556  
50%      99.338135  
75%     147.937439  
max     249.256119  


Unnamed: 0,Date,High,Low,Open,Close,Volume,Adj Close
0,2005-01-03,86.449997,84.730003,86.250000,84.910004,40300.0,73.121384
1,2005-01-04,85.489998,82.839996,85.489998,83.250000,68800.0,71.691879
2,2005-01-05,83.480003,82.500000,83.339996,82.540001,112800.0,71.080414
3,2005-01-06,83.019997,82.339996,82.419998,82.449997,38300.0,71.002930
4,2005-01-07,82.930000,81.870003,82.400002,82.400002,38300.0,70.959862
...,...,...,...,...,...,...,...
3350,2018-04-25,275.579987,271.809998,274.510010,275.109985,37700.0,265.009979
3351,2018-04-26,280.200012,277.049988,278.000000,279.410004,40900.0,269.152161
3352,2018-04-27,282.549988,278.619995,282.549988,279.100006,28800.0,268.853485
3353,2018-04-30,281.290009,277.570007,280.010010,277.670013,18900.0,267.475983


ONEQ               High          Low         Open        Close        Volume  \
count  3355.000000  3355.000000  3355.000000  3355.000000  3.355000e+03   
mean    134.788319   133.137759   134.124319   133.999705  4.808244e+04   
std      57.932287    57.563535    57.831186    57.771240  1.506505e+05   
min      51.750000    49.500000    50.230000    49.799999  8.000000e+02   
25%      90.505001    89.160004    90.000000    89.800003  9.500000e+03   
50%     109.260002   108.000000   108.709999   108.650002  1.740000e+04   
75%     182.885002   179.529999   181.714996   181.320000  3.250000e+04   
max     299.750000   296.820007   299.250000   297.859985  2.490400e+06   

         Adj Close  
count  3355.000000  
mean    123.096690  
std      57.440780  
min      44.096684  
25%      78.924301  
50%      97.158524  
75%     170.002983  
max     286.494904  


Unnamed: 0,Date,High,Low,Open,Close,Volume,Adj Close
0,2005-01-03,65.209999,63.570000,65.095001,63.910000,16062600.0,51.335308
1,2005-01-04,64.084999,62.430000,64.035004,62.540001,27450000.0,50.234875
2,2005-01-05,62.639999,61.290001,62.470001,61.290001,29884200.0,49.230827
3,2005-01-06,62.200001,61.224998,61.669998,61.615002,23061200.0,49.491879
4,2005-01-07,61.950001,60.895000,61.755001,60.930000,20906200.0,48.941639
...,...,...,...,...,...,...,...
3350,2018-04-25,154.800003,152.960007,154.360001,154.199997,21981400.0,148.582062
3351,2018-04-26,155.410004,153.940002,154.419998,154.910004,14862900.0,149.266220
3352,2018-04-27,155.369995,153.779999,154.880005,154.690002,17734300.0,149.054199
3353,2018-04-30,155.649994,153.289993,155.020004,153.320007,15624600.0,147.734131


IWM               High          Low         Open        Close        Volume  \
count  3355.000000  3355.000000  3355.000000  3355.000000  3.355000e+03   
mean     90.878000    89.442976    90.204790    90.202952  4.973638e+07   
std      28.328728    28.271442    28.302914    28.304778  3.133952e+07   
min      35.639999    34.259998    34.730000    34.389999  5.060000e+04   
25%      69.939999    68.225002    69.174999    69.165001  2.836570e+07   
50%      81.150002    79.910004    80.570000    80.620003  4.178410e+07   
75%     115.150002   113.645000   114.405003   114.430000  6.193485e+07   
max     160.630005   158.770004   160.360001   159.960007  3.551170e+08   

         Adj Close  
count  3355.000000  
mean     80.354959  
std      29.464536  
min      29.028275  
25%      57.749250  
50%      69.029083  
75%     105.353165  
max     153.777161  


In [96]:
# Display a Plotly interactive candlestick chart

def plotly_candlestick(asset_dict, asset_name):
    """Show an interactive Plotly candlestick chart for one asset.

    Parameters
    ----------
    asset_dict : dict
        Maps asset names to DataFrames. The chart reads the 'Date', 'Open',
        'Close', 'High' and 'Low' columns of the selected frame.
    asset_name : str
        Key into ``asset_dict``; also used as the trace name and in the
        figure title.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()`` and not returned.
    """
    df = asset_dict[asset_name]

    # A typed trace object instead of a raw dict, so misspelled properties
    # are rejected when the trace is built.
    candles = go.Candlestick(
        x=df['Date'],
        open=df['Open'],
        close=df['Close'],
        high=df['High'],
        low=df['Low'],
        name=asset_name,
        showlegend=True,
    )

    layout = go.Layout(
        title={
            'text': 'Asset: ' + asset_name,
            'font': {'size': 20},
        }
    )

    fig = go.Figure(data=[candles], layout=layout)
    fig.show()

# Render one interactive candlestick figure per asset in the list.
for asset in asset_list:
    plotly_candlestick(asset_dict=asset_dict, asset_name=asset)

In [99]:
# Boundaries of the train / validation / test periods, as date strings.
# NOTE(review): these read as MM-DD-YYYY (month first) - confirm, since a
# value such as '01-02-2008' is ambiguous on its own.
train_start, valid_start = '01-01-2005', '01-02-2008'
test_start, test_end = '01-01-2010', '05-01-2018'

In [101]:
# Buy-and-hold gain over the test window, computed per asset from the first
# and last 'Adj Close' values inside the window.
test_asset_dict = {}
for asset in asset_list:
    df = asset_dict[asset]

    # Keep rows dated strictly after test_start and up to (including) test_end.
    after_start = df['Date'] > test_start
    upto_end = df['Date'] <= test_end
    mask = after_start & upto_end
    test_df = df.loc[mask]

    # Renumber the rows from 0 so the first/last row can be addressed by label.
    n_rows = test_df.shape[0]
    test_df.index = np.arange(n_rows)
    test_asset_dict[asset] = test_df

    adj_close = test_df['Adj Close']
    start = adj_close.loc[0]
    end = adj_close.loc[n_rows - 1]
    gain = 100.0 * (end - start) / start  # percentage change over the window

    print(asset, f'{start:1.2f}', f'{end:1.2f}', f'{gain:1.2f}')


SPY 91.09 251.53 176.14
DIA 81.86 226.79 177.04
ONEQ 81.00 269.47 232.68
IWM 54.72 148.46 171.30


Table 8 of the paper has the following buy and hold results:

- S&P500: 136.4%
- DJI: 136.6%
- NASDAQ: 228.9%
- R2000: 163.5%

This is odd: the gains computed above are reasonably close to the paper's for ONEQ and IWM, but much higher than the paper's for SPY and DIA.

In [2]:
# Author: Robert Guthrie

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed PyTorch's random number generator with a fixed value.
torch.manual_seed(1)

<torch._C.Generator at 0x203a2639d10>

In [3]:
# An LSTM whose input size and hidden size are both 3.
lstm = nn.LSTM(3, 3)

# A sequence of five steps, each stored as a (1, 3) tensor.
inputs = [torch.randn(1, 3) for _ in range(5)]

# Random initial (hidden state, cell state) pair.
h0 = torch.randn(1, 1, 3)
c0 = torch.randn(1, 1, 3)
hidden = (h0, c0)

# Variant 1: feed the sequence one element at a time. Each call returns the
# updated state, which is threaded into the next call.
for step in inputs:
    # step[None] adds the leading sequence axis: (1, 3) -> (1, 1, 3).
    out, hidden = lstm(step[None], hidden)

# Variant 2: feed the whole sequence in a single call.
# The first return value holds the hidden state for every step of the
# sequence; the second holds only the most recent state. The last slice of
# "out" therefore equals the hidden part of "hidden" (compare the two prints).
# "out" gives access to all hidden states in the sequence, while "hidden" can
# be passed back to the LSTM later to continue the sequence and
# backpropagate through it.

# Concatenate the steps into (5, 3) and insert the middle axis -> (5, 1, 3).
inputs = torch.cat(inputs)[:, None, :]
# Start again from a fresh random state.
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)

tensor([[[-0.0187,  0.1713, -0.2944]],

        [[-0.3521,  0.1026, -0.2971]],

        [[-0.3191,  0.0781, -0.1957]],

        [[-0.1634,  0.0941, -0.1637]],

        [[-0.3368,  0.0959, -0.0538]]], grad_fn=<StackBackward>)
(tensor([[[-0.3368,  0.0959, -0.0538]]], grad_fn=<StackBackward>), tensor([[[-0.9825,  0.4715, -0.0633]]], grad_fn=<StackBackward>))


In [4]:
def prepare_sequence(seq, to_ix):
    """Encode a sequence of tokens as a 1-D tensor of integer indices.

    Args:
        seq: iterable of tokens (e.g. words or tags).
        to_ix: mapping from token to integer index; every token in ``seq``
            is looked up with ``to_ix[token]``.

    Returns:
        A ``torch.long`` tensor with one index per token, in order.
    """
    return torch.tensor([to_ix[token] for token in seq], dtype=torch.long)


# Toy corpus: each entry pairs a list of words with its list of tags.
# Tags are: DET - determiner; NN - noun; V - verb
# (for example, the word "The" is a determiner).
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]),
]

# Word vocabulary: every distinct word receives the next free integer, in
# order of first appearance across the sentences.
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        # setdefault leaves already-known words untouched.
        word_to_ix.setdefault(word, len(word_to_ix))
print(word_to_ix)

# Tag vocabulary: one fixed index per tag.
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

# Real models would typically use something like 32 or 64 dimensions; the
# sizes are kept tiny here so weight changes during training are easy to see.
EMBEDDING_DIM = HIDDEN_DIM = 6

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}


In [5]:
class LSTMTagger(nn.Module):
    """Sequence tagger: embedding -> LSTM -> linear layer -> log-softmax."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        """Create the three layers.

        Args:
            embedding_dim: size of each word embedding vector.
            hidden_dim: size of the LSTM hidden state.
            vocab_size: number of rows in the embedding table.
            tagset_size: number of output scores produced per word.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        # Lookup table turning word indices into embedding vectors.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Consumes the embeddings and emits hidden states of size hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        # Projects each hidden state onto one score per tag.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Return log-probabilities over tags, one row per element of ``sentence``."""
        seq_len = len(sentence)
        # Reshape to (seq_len, 1, features): a single-sequence batch.
        embedded = self.word_embeddings(sentence).view(seq_len, 1, -1)
        hidden_states, _ = self.lstm(embedded)
        # Drop the batch axis again before the linear projection.
        logits = self.hidden2tag(hidden_states.view(seq_len, -1))
        return F.log_softmax(logits, dim=1)

In [6]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Baseline: scores of the untrained model for the first training sentence.
# Element (i, j) of the output is the score of tag j for word i.
# Nothing is trained here, hence the torch.no_grad() context.
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    print(model(inputs))

# 300 epochs is far more than one would normally use; this is toy data.
for epoch in range(300):
    for sentence, tags in training_data:
        # Turn the words and the tags into tensors of indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # PyTorch accumulates gradients, so reset them for every instance.
        model.zero_grad()

        # Forward pass.
        tag_scores = model(sentence_in)

        # Loss, backward pass, and parameter update.
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# Scores for the same sentence once training has finished.
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple". Entry (i, j) is the score of
    # tag j for word i, and the predicted tag is the highest-scoring one.
    # Reading the printed rows, the predicted sequence is 0 1 2 0 1:
    # index 0 is the maximum of the first row, index 1 of the second, etc.
    # That is DET NOUN VERB DET NOUN, the correct sequence!
    print(tag_scores)

tensor([[-1.1389, -1.2024, -0.9693],
        [-1.1065, -1.2200, -0.9834],
        [-1.1286, -1.2093, -0.9726],
        [-1.1190, -1.1960, -0.9916],
        [-1.0137, -1.2642, -1.0366]])
tensor([[-0.0462, -4.0106, -3.6096],
        [-4.8205, -0.0286, -3.9045],
        [-3.7876, -4.1355, -0.0394],
        [-0.0185, -4.7874, -4.6013],
        [-5.7881, -0.0186, -4.1778]])


In [7]:
import torch
import torch.nn as nn

In [8]:
# Sizes used by the LSTM and by the tensors built in the following cells.
input_dim, hidden_dim, n_layers = 5, 10, 1

# batch_first=True: the layer takes and returns (batch, seq, feature) tensors.
lstm_layer = nn.LSTM(
    input_size=input_dim,
    hidden_size=hidden_dim,
    num_layers=n_layers,
    batch_first=True,
)

In [14]:
batch_size, seq_len = 1, 1

# Random input batch laid out as (batch, seq, feature).
inp = torch.randn(batch_size, seq_len, input_dim)

# Random initial hidden and cell states, both (layers, batch, hidden).
state_shape = (n_layers, batch_size, hidden_dim)
hidden_state = torch.randn(state_shape)
cell_state = torch.randn(state_shape)
hidden = (hidden_state, cell_state)

for tensor in (inp, hidden_state, cell_state):
    print(tensor.shape)


torch.Size([1, 1, 5])
torch.Size([1, 1, 10])
torch.Size([1, 1, 10])


In [10]:
# Run the LSTM layer on `inp`, starting from the state in `hidden`.
# `hidden` is rebound to the returned (hidden state, cell state) tuple, so
# re-running this cell continues from the new state instead of repeating the
# same computation.
out, hidden = lstm_layer(inp, hidden)
print("Output shape: ", out.shape)
print("Hidden: ", hidden)

Output shape:  torch.Size([1, 1, 10])
Hidden:  (tensor([[[ 0.4440, -0.5991, -0.1967, -0.0285, -0.3774,  0.1672,  0.2565,
          -0.0973,  0.0308,  0.3004]]], grad_fn=<StackBackward>), tensor([[[ 0.9670, -1.0159, -1.0160, -0.1539, -0.6550,  0.2409,  0.4990,
          -0.1687,  0.0931,  0.4196]]], grad_fn=<StackBackward>))


In [15]:
# Feed a 3-step random sequence, continuing from whatever state `hidden`
# currently holds; `out` then has one row per time step on its second axis.
seq_len = 3
inp = torch.randn(batch_size, seq_len, input_dim)
out, hidden = lstm_layer(inp, hidden)
print(out.shape)

torch.Size([1, 3, 10])


In [16]:
# Obtaining the last output
# `lstm_layer` was created with batch_first=True, so `out` is laid out as
# (batch, seq, hidden). Index the sequence axis explicitly rather than calling
# a bare squeeze(): squeeze() removes every size-1 axis, which fails when
# seq_len is 1 and selects along the wrong axis when batch_size > 1.
# squeeze(0) then drops only the batch axis, and only when it has size 1, so
# the result is still (hidden,) for a single-sequence batch.
out = out[:, -1, :].squeeze(0)
print(out.shape)

torch.Size([10])


In [17]:
# Show the values of `out` (the last-step output selected in the cell above).
print(out)

tensor([ 0.0084,  0.4925,  0.0722,  0.1287, -0.0090, -0.1334, -0.2742, -0.1923,
         0.0494, -0.0402], grad_fn=<SliceBackward>)
