In [4]:
import numpy as np
import tensorflow as tf
import time

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras import optimizers
from keras.callbacks import LearningRateScheduler
from keras.callbacks import ModelCheckpoint
import keras.backend as K
from keras.models import load_model

In [5]:
import pandas as pd
import pandas_datareader.data as web
import fix_yahoo_finance
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split

In [20]:
def getAppleData():
    """Return daily log returns of the NASDAQ index and Apple as one DataFrame.

    Adjusted closing prices for "aapl" and "^ixic" are requested through
    ``web.get_data_yahoo`` for 2006-01-01 .. 2015-12-31, converted to log
    returns and merged on their index.

    Returns:
        DataFrame with the columns 'nasdaq' and 'aapl'.
    """
    # The old Yahoo Finance API was deprecated; the original author credits
    # Ran Aroussi's fix for keeping Yahoo Finance accessible.
    first_day, last_day = "2006-01-01", "2015-12-31"

    def log_returns(ticker):
        # log(P_t / P_{t-1}); the first row has no predecessor and is dropped.
        prices = web.get_data_yahoo(ticker, start=first_day, end=last_day)['Adj Close']
        return pd.DataFrame(np.log(prices / prices.shift(1)).dropna())

    aapl_returns = log_returns("aapl")
    nasdaq_returns = log_returns("^ixic")

    # Both frames carry a column called 'Adj Close', so the merge suffixes them
    # with _x (left, NASDAQ) and _y (right, Apple) before they are renamed.
    merged = pd.merge(nasdaq_returns, aapl_returns, left_index=True, right_index=True)
    return merged.rename(columns={'Adj Close_x':'nasdaq', 'Adj Close_y':'aapl'})

def getAmazonData():
    """Return Amazon price data joined with the factor table from 'ff5f.csv'.

    Amazon quotes are requested through ``web.get_data_yahoo`` for
    1998-01-01 .. 2017-3-30 and extended with the column 'AMZN_r', the daily
    log return of 'Close'. The factor file is read with its 'Date' column as
    index, the index is parsed to datetimes, 'Mkt-RF' is renamed to 'Mkt_RF'
    and every column is divided by 100 (presumably percent -> decimal; confirm
    against the file). Rows containing any missing value after the merge are
    dropped.

    NOTE(review): the file name suggests a five-factor file although the
    original comment spoke of 3-factor data — confirm which columns it holds.
    """
    first_day, last_day = "1998-01-01", "2017-3-30"
    prices = web.get_data_yahoo('amzn', first_day, last_day)

    factors = pd.read_csv('ff5f.csv', index_col='Date')
    factors = factors.set_index(pd.to_datetime(factors.axes[0].astype(str)))
    factors = factors.rename(columns = {'Mkt-RF':'Mkt_RF'}) / 100

    close = prices['Close']
    # Column assignment realigns on the index, so the first row ends up NaN
    # again despite the dropna(); the final dropna() below removes that row.
    prices['AMZN_r'] = np.log(close / close.shift(1)).dropna()

    return pd.merge(prices, factors, left_index=True, right_index=True).dropna()

# Build both data sets once when the cell runs; each call requests quotes via
# web.get_data_yahoo, and getAmazonData additionally reads the local 'ff5f.csv'.
dataApple = getAppleData()
dataAmazon = getAmazonData()

In [54]:
class MLPModel:
    """Factory for a fully connected (multi-layer perceptron) regression model.

    Exposes the three members evalModels relies on: ``name``, ``create`` and
    ``prepareData``.
    """
    # Label used in the progress/report lines.
    name = 'MLP'

    @staticmethod
    def create(layers=(500,1000,1000,1000,500), input_dim=1):
        """Build and compile the network.

        Args:
            layers: iterable with the number of units of each hidden ReLU
                layer, in order. May be empty, in which case the model is a
                single linear output unit.
            input_dim: number of input features.

        Returns:
            A compiled ``Sequential`` model with one linear output unit,
            MSE loss and the 'adam' optimizer.
        """
        specs = [dict(units=u, activation='relu') for u in layers]
        specs.append(dict(units=1))
        # Only the first layer of a Sequential model declares the input size;
        # later layers infer theirs from the layer before them.
        specs[0]['input_dim'] = input_dim

        model = Sequential()
        for spec in specs:
            model.add(Dense(**spec))
        model.compile(loss='mse', optimizer='adam', metrics=['mse'])
        return model

    @staticmethod
    def prepareData(X, Y, **kwargs):
        """Return X and Y unchanged; extra keyword arguments are ignored."""
        return X, Y

class LSTMModel:
    """Factory for a single-layer LSTM regression model on sliding windows.

    Exposes the three members evalModels relies on: ``name``, ``create`` and
    ``prepareData``.
    """
    # Label used in the progress/report lines.
    name = 'LSTM'

    @staticmethod
    def create(blocks=4, look_back=1, input_dim=1):
        """Build and compile the network.

        Args:
            blocks: number of LSTM units.
            look_back: number of past rows in each window; a window holds
                ``look_back + 1`` rows (the past rows plus the current one).
            input_dim: number of input features.

        Returns:
            A compiled ``Sequential`` model expecting inputs of shape
            ``(input_dim, look_back + 1)`` per sample, with one linear output
            unit, MSE loss and the 'adam' optimizer.
        """
        shape = (input_dim,look_back+1)
        model = Sequential()
        model.add(LSTM(blocks, input_shape=shape))
        model.add(Dense(units=1))
        model.compile(loss='mse', optimizer='adam', metrics=['mse'])
        return model

    @staticmethod
    def createDataset(X, Y, look_back=1):
        """Cut X into overlapping windows and pair each with its target.

        Window ``i`` is ``X[i : i+look_back+1]`` and its target is
        ``Y[i+look_back]``, i.e. the target belonging to the last row of the
        window.

        Returns:
            ``(windows, targets)`` as numpy arrays with
            ``len(X) - look_back`` entries each (empty when X has
            ``look_back`` rows or fewer).
        """
        # The last valid window ends at the final row, so there are
        # len(X) - look_back windows (the previous bound of
        # len(X) - look_back - 1 silently dropped the final sample).
        nWindows = len(X) - look_back
        dataX = [X[i:i+look_back+1] for i in range(nWindows)]
        dataY = [Y[i+look_back] for i in range(nWindows)]
        return np.array(dataX), np.array(dataY)

    @staticmethod
    def prepareData(X, Y, **kwargs):
        """Turn raw (rows, features) inputs into LSTM-ready windows.

        Args:
            X: 2-D array-like of shape (rows, features).
            Y: targets, one per row of X.
            **kwargs: ``look_back`` is read (default 1, matching ``create``);
                any other keys are ignored.

        Returns:
            ``(windows, targets)`` where windows has shape
            ``(samples, features, look_back + 1)``.

        Raises:
            ValueError: if no 3-D window array can be built (X is not 2-D or
                has too few rows for the requested look_back).
        """
        look_back = kwargs.get('look_back', 1)
        windows, targets = LSTMModel.createDataset(X, Y, look_back)
        if windows.ndim != 3:
            raise ValueError(
                'X must be 2-D with more than look_back=%d rows to build windows'
                % look_back)
        # Swap the window and feature axes. A plain reshape would only relabel
        # the dimensions and interleave values of different features whenever
        # there is more than one feature; a transpose keeps every feature's
        # history in its own row.
        return np.ascontiguousarray(np.transpose(windows, (0, 2, 1))), targets

In [57]:
def evalModels(data, 
               models=[],
               epochs=20, batch_size=100,
               nIter=9, testSplit=0.2,
               xVars=['nasdaq'], yVar='aapl',
              ):
    """Compare an OLS baseline with neural models on one train/test split.

    The data is split once (fixed random_state), then the OLS regression and
    every model in ``models`` are fitted ``nIter`` times each. For every
    iteration the test-set MSE is printed, followed by the average MSE and the
    average wall-clock time per iteration.

    Args:
        data: DataFrame containing the columns named in xVars and yVar.
        models: iterable of ``(modelClass, modelParams)`` pairs. modelClass
            must provide ``name``, ``prepareData(X, Y, **modelParams)`` and
            ``create(input_dim=..., **modelParams)``.
        epochs, batch_size: forwarded to ``model.fit``.
        nIter: number of repetitions per model (at least 1).
        testSplit: fraction of rows held out for testing.
        xVars: list of regressor column names.
        yVar: target column name.

    Returns:
        None; results are printed.

    Raises:
        ValueError: if nIter is smaller than 1.
    """
    # Function-scope import: only needed for the newline-free progress prefix.
    import sys

    if nIter < 1:
        raise ValueError('nIter must be at least 1, got %r' % (nIter,))

    def announceIteration(i):
        # Written without a newline so the MSE printed after fitting lands on
        # the same line; behaves the same on Python 2 and 3.
        sys.stdout.write('Iteration %d/%d ' % (i+1, nIter))
        sys.stdout.flush()

    def printSummary(sumMSE, startTime):
        print ('The average MSE is %f' % (sumMSE/nIter))
        print ('The average running time is: %0.2f seconds\n' % ((time.time()-startTime)/nIter))

    # Prep data
    # NOTE(review): train_test_split shuffles rows by default, so windowed
    # models such as the LSTM see windows over shuffled rather than
    # consecutive observations — confirm this is intended.
    datatrain, datatest = train_test_split(data, test_size = testSplit, random_state=1114)
    # .values instead of .as_matrix(), which newer pandas releases removed.
    rawTrainX = datatrain[xVars].values
    rawTrainY = datatrain[yVar].values
    rawTestX = datatest[xVars].values
    rawTestY = datatest[yVar].values

    # Run OLS
    print ('Run OLS')
    startTime = time.time()
    sumMSE = 0
    for i in range(nIter):
        announceIteration(i)
        mod = smf.ols(formula='%s ~ %s' % (yVar, ' + '.join(xVars)), data = datatrain).fit()
        fitted = mod.predict(exog = datatest)
        mse = ((datatest[yVar] - fitted)**2).mean()
        print('- the MSE is %f' % mse)
        sumMSE += mse
    printSummary(sumMSE, startTime)

    # Run model
    np.random.seed(1115)
    for (modelClass, modelParams) in models:
        modelName = modelClass.name
        trainX, trainY = modelClass.prepareData(rawTrainX, rawTrainY, **modelParams)
        testX, testY = modelClass.prepareData(rawTestX, rawTestY, **modelParams)
        expected = np.asarray(testY).reshape(-1)
        print ('Run %s - %s' % (modelName, modelParams))
        startTime = time.time()
        sumMSE = 0
        for i in range(nIter):
            announceIteration(i)
            model = modelClass.create(input_dim=trainX.shape[1], **modelParams)
            model.fit(trainX, trainY, epochs=epochs, batch_size=batch_size, verbose=0)
            predicted = np.asarray(model.predict(testX)).reshape(-1)
            # Score against the targets returned by prepareData: they are
            # aligned one-to-one with the predictions. Padding the predictions
            # with true values up to the full test length (as done before)
            # adds zero-error rows that deflate the MSE and can shift
            # predictions against their targets.
            mse = ((expected - predicted)**2).mean()
            print('- the MSE is %f' % mse)
            sumMSE += mse
        printSummary(sumMSE, startTime)
    return




In [60]:
# Amazon experiment: OLS baseline vs. an MLP and two LSTM look-back settings
# (30 and 90), 5 repetitions each, run inside a tf.device('/cpu:0') scope.
with tf.device('/cpu:0'):
    evalModels(dataAmazon, nIter=5,
               models=((MLPModel, {'layers': [500,1000,1000,1000,500]}),
                       (LSTMModel, {'blocks': 10, 'look_back': 30}),
                       (LSTMModel, {'blocks': 10, 'look_back': 90}),
                      ),
               epochs=10, batch_size=100,
               xVars= ['RF', 'Mkt_RF', 'SMB', 'HML'], yVar='AMZN_r')

Run OLS
Iteration 1/5 - the MSE is 0.001076
Iteration 2/5 - the MSE is 0.001076
Iteration 3/5 - the MSE is 0.001076
Iteration 4/5 - the MSE is 0.001076
Iteration 5/5 - the MSE is 0.001076
The average MSE is 0.001076
The average running time is: 0.02 seconds

Run MLP - {'layers': [500, 1000, 1000, 1000, 500]}
Iteration 1/5 - the MSE is 0.001067
Iteration 2/5 - the MSE is 0.001072
Iteration 3/5 - the MSE is 0.001129
Iteration 4/5 - the MSE is 0.001101
Iteration 5/5 - the MSE is 0.001112
The average MSE is 0.001096
The average running time is: 9.77 seconds



In [26]:
# Apple experiment: same model line-up as the Amazon run, regressing the
# 'aapl' returns on the 'nasdaq' returns, inside a tf.device('/cpu:0') scope.
with tf.device('/cpu:0'):
    evalModels(dataApple, nIter=5,
               models=((MLPModel, {'layers': [500,1000,1000,1000,500]}),
                       (LSTMModel, {'blocks': 10, 'look_back': 30}),
                       (LSTMModel, {'blocks': 10, 'look_back': 90}),
                      ),
               epochs=10, batch_size=100,
               xVars=['nasdaq'], yVar='aapl')

Run OLS
Iteration 1/5 - the MSE is 0.000236
Iteration 2/5 - the MSE is 0.000236
Iteration 3/5 - the MSE is 0.000236
Iteration 4/5 - the MSE is 0.000236
Iteration 5/5 - the MSE is 0.000236
The average MSE is 0.000236
The average running time is: 0.01 seconds



# Experiment Setup

* Our experiment was done on a cluster consisting of 16 machines, each with 64 cores and 256GB of RAM.

* The TensorFlow backend was set to use all cores, thus in total, there were 1024 cores involved in the experiments.

* Each model was evaluated for 1024 iterations, with the work distributed evenly across the 16 machines (64 iterations each).

* Each batch of 64 iterations was performed on the same train/test split.

* For each model, the running times and MSEs were summed over all iterations and divided by 1024 at the end.

* The average running time is the time for evaluating the model given the allocation of 64 cores.

In [7]:
def printResults(results, nIter=64*16):
    """Print the average MSE and running time per model.

    Args:
        results: DataFrame with the columns ``sum_mse_<key>`` and
            ``sum_time_<key>`` for the keys 'ols', 'mlp', 'lstm30' and
            'lstm90'; each row holds the sums of one batch of iterations.
        nIter: total number of iterations the sums cover. Defaults to
            64*16, the iteration count previously hard-coded here.

    Returns:
        None; one line per model is printed.
    """
    allSum = results.sum()
    # float() keeps the division exact even if the sums happen to be integers
    # under Python 2.
    total = float(nIter)
    report = (('OLS', 'ols'),
              ('MLP', 'mlp'),
              ('LSTM 30-day', 'lstm30'),
              ('LSTM 90-day', 'lstm90'))
    for label, key in report:
        # A single parenthesised argument prints identically on Python 2 and 3;
        # %-11s pads every label to the width of the longest one.
        print('Avg MSE/Time for %-11s: %f / %6.3fs' % (
            label, allSum['sum_mse_' + key]/total, allSum['sum_time_' + key]/total))

In [8]:
# Load the per-batch sums of the Apple experiment, print the averages,
# and show the raw table as the cell output.
results_apple = pd.read_csv('results_apple.csv')
printResults(results_apple)
results_apple

Avg MSE/Time for OLS        : 0.000264 /  0.009s
Avg MSE/Time for MLP        : 0.000274 /  8.557s
Avg MSE/Time for LSTM 30-day: 0.000436 / 12.170s
Avg MSE/Time for LSTM 90-day: 0.000385 / 19.064s


Unnamed: 0,sum_mse_ols,sum_mse_mlp,sum_mse_lstm30,sum_mse_lstm90,sum_time_ols,sum_time_mlp,sum_time_lstm30,sum_time_lstm90,avg_mse_ols,avg_mse_mlp,avg_mse_lstm30,avg_mse_lstm90,avg_time_ols,avg_time_mlp,avg_time_lstm30,avg_time_lstm90
0,0.013919,0.014524,0.023353,0.021361,0.599649,571.538367,759.04431,1137.6034,0.000217,0.000227,0.000365,0.000334,0.00937,8.930287,11.860067,17.775053
1,0.016751,0.018019,0.028311,0.025657,0.602322,568.95505,738.300886,1132.38996,0.000262,0.000282,0.000442,0.000401,0.009411,8.889923,11.535951,17.693593
2,0.014358,0.014819,0.026758,0.021152,0.597244,532.876408,778.115347,1190.824929,0.000224,0.000232,0.000418,0.00033,0.009332,8.326194,12.158052,18.60664
3,0.017773,0.018158,0.027796,0.024916,0.594912,532.511121,786.928174,1206.412115,0.000278,0.000284,0.000434,0.000389,0.009296,8.320486,12.295753,18.850189
4,0.019862,0.020779,0.031269,0.027091,0.603526,577.549403,782.794428,1244.476547,0.00031,0.000325,0.000489,0.000423,0.00943,9.024209,12.231163,19.444946
5,0.021847,0.022348,0.028534,0.025409,0.59772,531.897036,753.867752,1168.345011,0.000341,0.000349,0.000446,0.000397,0.009339,8.310891,11.779184,18.255391
6,0.021044,0.022538,0.035652,0.032028,0.592847,530.34771,757.251138,1173.282044,0.000329,0.000352,0.000557,0.0005,0.009263,8.286683,11.832049,18.332532
7,0.023034,0.023442,0.036216,0.03386,0.601955,571.544838,794.068499,1268.347815,0.00036,0.000366,0.000566,0.000529,0.009406,8.930388,12.40732,19.817935
8,0.016341,0.016779,0.024582,0.020754,0.594186,535.277359,766.951741,1172.986672,0.000255,0.000262,0.000384,0.000324,0.009284,8.363709,11.983621,18.327917
9,0.013408,0.013862,0.022742,0.019485,0.605273,575.349695,789.769718,1257.083521,0.000209,0.000217,0.000355,0.000304,0.009457,8.989839,12.340152,19.64193


In [9]:
# Load the per-batch sums of the Amazon experiment, print the averages,
# and show the raw table as the cell output.
results_amazon = pd.read_csv('results_amazon.csv')
printResults(results_amazon)
results_amazon

Avg MSE/Time for OLS        : 0.001087 /  0.025s
Avg MSE/Time for MLP        : 0.001106 / 13.424s
Avg MSE/Time for LSTM 30-day: 0.001422 / 14.093s
Avg MSE/Time for LSTM 90-day: 0.001330 / 22.236s


Unnamed: 0,sum_mse_ols,sum_mse_mlp,sum_mse_lstm30,sum_mse_lstm90,sum_time_ols,sum_time_mlp,sum_time_lstm30,sum_time_lstm90,avg_mse_ols,avg_mse_mlp,avg_mse_lstm30,avg_mse_lstm90,avg_time_ols,avg_time_mlp,avg_time_lstm30,avg_time_lstm90
0,0.074353,0.074672,0.094065,0.089332,1.502055,904.900912,919.935083,1420.733386,0.001162,0.001167,0.00147,0.001396,0.02347,14.139077,14.373986,22.198959
1,0.063293,0.064499,0.083912,0.078888,1.737169,902.965093,923.859376,1425.137317,0.000989,0.001008,0.001311,0.001233,0.027143,14.10883,14.435303,22.267771
2,0.071138,0.072608,0.091345,0.085963,1.628179,825.208503,865.916178,1427.326239,0.001112,0.001134,0.001427,0.001343,0.02544,12.893883,13.52994,22.301972
3,0.065022,0.065576,0.08872,0.084029,1.475621,826.561748,867.046117,1417.19981,0.001016,0.001025,0.001386,0.001313,0.023057,12.915027,13.547596,22.143747
4,0.065917,0.066793,0.081673,0.074274,1.784764,935.956677,962.059321,1462.055797,0.00103,0.001044,0.001276,0.001161,0.027887,14.624323,15.032177,22.844622
5,0.073881,0.075607,0.100148,0.094884,1.560523,823.66025,843.948646,1360.458118,0.001154,0.001181,0.001565,0.001483,0.024383,12.869691,13.186698,21.257158
6,0.077519,0.078986,0.096707,0.086757,1.616146,838.784271,901.63005,1375.185785,0.001211,0.001234,0.001511,0.001356,0.025252,13.106004,14.08797,21.487278
7,0.068419,0.069883,0.088692,0.086586,1.607048,906.21675,942.728714,1454.68725,0.001069,0.001092,0.001386,0.001353,0.02511,14.159637,14.730136,22.729488
8,0.071592,0.073383,0.095444,0.089334,1.465034,822.767257,871.115288,1411.297358,0.001119,0.001147,0.001491,0.001396,0.022891,12.855738,13.611176,22.051521
9,0.075153,0.076364,0.095544,0.089293,1.472571,905.171817,933.873805,1436.662569,0.001174,0.001193,0.001493,0.001395,0.023009,14.14331,14.591778,22.447853
