In [1]:
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

Using TensorFlow backend.


In [3]:
import pandas as pd
import pandas_datareader.data as web
import fix_yahoo_finance
import statsmodels.api as sm
import statsmodels.formula.api as smf

  from pandas.core import datetools


In [4]:
# Grab 10 years of Apple and NASDAQ data.  
# Note that the old Yahoo Finance API was deprecated.  (Thanks, Verizon!  That Marissa pay-out was steep.)
# Ran Aroussi has developed a fix that saves accessibility to Yahoo Finance. Thanks, Ran! Great stuff. 

start, end = "2006-01-01", "2015-12-31"
aapl_all = web.get_data_yahoo("aapl", start=start, end=end)
nasdaq_all = web.get_data_yahoo("^ixic", start=start, end=end)
aapl = aapl_all['Adj Close']
nasdaq = nasdaq_all['Adj Close']

[*********************100%***********************]  1 of 1 downloaded

In [5]:
nasdaq_all.head(10)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2006-01-03,2216.530029,2249.679932,2189.909912,2243.73999,2243.73999,1998300000
2006-01-04,2246.959961,2265.280029,2246.070068,2263.459961,2263.459961,1887560000
2006-01-05,2264.929932,2277.560059,2264.5,2276.870117,2276.870117,1891750000
2006-01-06,2289.209961,2306.719971,2281.01001,2305.620117,2305.620117,2233640000
2006-01-09,2306.179932,2322.629883,2303.129883,2318.689941,2318.689941,1949140000
2006-01-10,2306.219971,2320.320068,2303.929932,2320.320068,2320.320068,1978160000
2006-01-11,2321.409912,2332.919922,2316.48999,2331.360107,2331.360107,2380600000
2006-01-12,2327.169922,2330.310059,2313.219971,2316.689941,2316.689941,2011460000
2006-01-13,2317.73999,2321.699951,2308.159912,2317.040039,2317.040039,1784410000
2006-01-17,2302.560059,2305.870117,2294.050049,2302.689941,2302.689941,1702260000


In [6]:
# Calculate daily returns.

aapl_returns = np.log(aapl / aapl.shift(1))
nasdaq_returns = np.log(nasdaq / nasdaq.shift(1))
aapl_returns = aapl_returns.dropna()
nasdaq_returns = nasdaq_returns.dropna()
aapl_returns = pd.DataFrame(aapl_returns)
nasdaq_returns = pd.DataFrame(nasdaq_returns)

In [7]:
print(aapl_returns.describe())
print(nasdaq_returns.describe())

         Adj Close
count  2516.000000
mean      0.000938
std       0.021618
min      -0.197470
25%      -0.009446
50%       0.000897
75%       0.012243
max       0.130194
         Adj Close
count  2516.000000
mean      0.000319
std       0.013934
min      -0.095877
25%      -0.005501
50%       0.000957
75%       0.006919
max       0.111594


In [8]:
from sklearn.model_selection import train_test_split

In [9]:
data = pd.merge(nasdaq_returns, aapl_returns, left_index=True, right_index=True)
data.rename(columns={'Adj Close_x':'nasdaq', 'Adj Close_y':'aapl'}, inplace=True)
datatrain, datatest = train_test_split(data, test_size = 0.2)

In [10]:
# We need a baseline against which to compare the deep learner.  
# The most obvious is regression and its prediction or out-of-sample error, the MSE.

mod = smf.ols(formula='aapl ~ nasdaq', data = datatrain).fit()
datatest['fitted'] = mod.predict(exog = datatest)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [11]:
# The baseline against which to measure.

print('The average out-sample AAPL return is %f' % datatest['aapl'].mean())
print('The average predicted AAPL return is %f' % datatest['fitted'].mean())
print('The MSE is %f' % ((datatest['aapl'] - datatest['fitted'])**2).mean())

The average out-sample AAPL return is -0.000107
The average predicted AAPL return is 0.001462
The MSE is 0.000268


In [12]:
train_X = datatrain['nasdaq'].as_matrix()
train_y = datatrain['aapl'].as_matrix()
test_X = datatest['nasdaq'].as_matrix()
test_y = datatest['aapl'].as_matrix()

dim = data.shape[1]

In [13]:
model = Sequential()
model.add(Dense(units=500, activation='relu', input_dim=1))
model.add(Dense(units=1000, activation='relu', input_dim=1))
model.add(Dense(units=1000, activation='relu', input_dim=1))
model.add(Dense(units=1000, activation='relu', input_dim=1))
model.add(Dense(units=500, activation='relu', input_dim=1))
model.add(Dense(units=1))

# Compile, fit, and generate scores and predicted probabilities.
model.compile(loss='mse', optimizer='adam', metrics=['mse'])

#model.fit(data, labels, epochs=20, batch_size=100)
model.fit(train_X, train_y, epochs=20, batch_size=100, verbose=0)
target = model.predict(test_X)
datatest['target'] = target

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [14]:
# Baseline.

print('The average out-sample AAPL return is %f' % datatest['aapl'].mean())
print('The average predicted AAPL return is %f' % datatest['fitted'].mean())
print('The MSE is %f' % ((datatest['aapl'] - datatest['fitted'])**2).mean())

The average out-sample AAPL return is -0.000107
The average predicted AAPL return is 0.001462
The MSE is 0.000268


In [15]:
# MLP.

print('The average out-sample AAPL return is %f' % datatest['aapl'].mean())
print('The average predicted AAPL return is %f' % datatest['target'].mean())
print('The MSE is %f' % ((datatest['aapl'] - datatest['target'])**2).mean())

The average out-sample AAPL return is -0.000107
The average predicted AAPL return is 0.001370
The MSE is 0.000280


In [16]:
for i in range(9):
    print(i)
    model = Sequential()
    model.add(Dense(units=500, activation='relu', input_dim=1))
    model.add(Dense(units=1000, activation='relu', input_dim=1))
    model.add(Dense(units=1000, activation='relu', input_dim=1))
    model.add(Dense(units=1000, activation='relu', input_dim=1))
    model.add(Dense(units=500, activation='relu', input_dim=1))
    model.add(Dense(units=1))
    model.compile(loss='mse', optimizer='adam', metrics=['mse'])
    model.fit(train_X, train_y, epochs=20, batch_size=100, verbose=0)
    target = model.predict(test_X)
    datatest['target'] = target
    print('The MSE is %f' % ((datatest['aapl'] - datatest['target'])**2).mean())    

0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


The MSE is 0.000284
1
The MSE is 0.000309
2
The MSE is 0.000306
3
The MSE is 0.000289
4
The MSE is 0.000298
5
The MSE is 0.000277
6
The MSE is 0.000293
7
The MSE is 0.000281
8
The MSE is 0.000290


In [19]:
# Grab Amazon and Fama-French 3 Factor data.
# Note that the Yahoo Finance API was recently deprecated.
# (Thanks, Marissa!  Obviously your golden parachute had a steep opportunity cost.)
import quandl
#start, end = dt.datetime(1998, 1, 1), dt.datetime(2017, 3, 30)
start, end = "1998-01-01", "2017-3-30"
#aapl_all = web.get_data_yahoo("aapl", start=start, end=end)
amzn_all = web.DataReader('amzn', 'google', start, end)
ff3f = quandl.get("KFRENCH/FACTORS_D", start_date="1998-01-01", end_date="2017-03-30").rename(columns = {'Mkt-RF':'Mkt_RF'}) / 100 

In [20]:
amzn_all['AMZN_r'] = np.log(amzn_all['Close'] / amzn_all['Close'].shift(1)).dropna()
data = pd.merge(amzn_all, ff3f, left_index=True, right_index=True).dropna()

In [21]:
data.describe()

Unnamed: 0,Open,High,Low,Close,Volume,AMZN_r,Mkt_RF,SMB,HML,RF
count,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0
mean,186.994071,189.148172,184.754221,187.075434,6711516.0,0.000978,0.000281,0.000127,9.2e-05,5e-05
std,207.907913,209.489151,206.067163,207.898473,5399765.0,0.029765,0.012155,0.005801,0.006156,6.4e-05
min,5.91,6.1,5.51,5.97,986435.0,-0.284568,-0.0895,-0.0378,-0.0422,0.0
25%,39.44,40.0,38.815,39.44,3747206.0,-0.011867,-0.0049,-0.0033,-0.0026,0.0
50%,84.66,86.24,83.38,84.6,5615482.0,0.000329,0.0007,0.0002,0.0,1e-05
75%,267.07,269.99,264.23,267.23,7947571.0,0.014016,0.0058,0.0036,0.0027,8e-05
max,874.95,877.06,871.66,876.34,104404600.0,0.296181,0.1135,0.0385,0.048,0.00022


In [23]:
from sklearn.model_selection import train_test_split
datatrain, datatest = train_test_split(data, test_size = 0.8, random_state = 2062661000)

In [24]:
mod = smf.ols(formula='AMZN_r ~ RF + Mkt_RF + SMB + HML', data = datatrain).fit()
#print(mod.summary())
#datatrain['fitted'] = mod.predict(exog = datatrain)
datatest['fitted'] = mod.predict(exog = datatest)

                            OLS Regression Results                            
Dep. Variable:                 AMZN_r   R-squared:                       0.205
Model:                            OLS   Adj. R-squared:                  0.201
Method:                 Least Squares   F-statistic:                     51.19
Date:                Wed, 02 Aug 2017   Prob (F-statistic):           2.29e-38
Time:                        12:14:47   Log-Likelihood:                 1724.2
No. Observations:                 799   AIC:                            -3438.
Df Residuals:                     794   BIC:                            -3415.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0010      0.001      0.829      0.4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [27]:
# Baseline.

print('The average out-sample AMZN return is %f' % datatest['AMZN_r'].mean())
print('The average predicted AMZN return is %f' % datatest['fitted'].mean())
print('The MSE is %f' % ((datatest['AMZN_r'] - datatest['fitted'])**2).mean())

The average out-sample AMZN return is 0.001057
The average predicted AMZN return is 0.000513
The MSE is 0.000629


In [32]:
#RF + Mkt_RF + SMB + HML

train_X = datatrain[['RF', 'Mkt_RF', 'SMB', 'HML']].as_matrix()
train_y = datatrain['AMZN_r'].as_matrix()
test_X = datatest[['RF', 'Mkt_RF', 'SMB', 'HML']].as_matrix()
test_y = datatest['AMZN_r'].as_matrix()

dim = train_X.shape[1]
dim

4

In [33]:
model = Sequential()
model.add(Dense(units=500, activation='relu', input_dim=dim))
model.add(Dense(units=1000, activation='relu', input_dim=dim))
model.add(Dense(units=1000, activation='relu', input_dim=dim))
model.add(Dense(units=1000, activation='relu', input_dim=dim))
model.add(Dense(units=500, activation='relu', input_dim=dim))
model.add(Dense(units=1))

# Compile, fit, and generate scores and predicted probabilities.
model.compile(loss='mse', optimizer='adam', metrics=['mse'])

#model.fit(data, labels, epochs=20, batch_size=100)
model.fit(train_X, train_y, epochs=20, batch_size=100, verbose=0)
target = model.predict(test_X)
datatest['target'] = target

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [34]:
# MLP.

print('The average out-sample AMZN return is %f' % datatest['AMZN_r'].mean())
print('The average predicted AZMN return is %f' % datatest['target'].mean())
print('The MSE is %f' % ((datatest['AMZN_r'] - datatest['target'])**2).mean())

The average out-sample AMZN return is 0.001057
The average predicted AZMN return is -0.002734
The MSE is 0.000679


In [38]:
for i in range(9):
    print(i)
    model = Sequential()
    model.add(Dense(units=500, activation='relu', input_dim=dim))
    model.add(Dense(units=1000, activation='relu', input_dim=dim))
    model.add(Dense(units=1000, activation='relu', input_dim=dim))
    model.add(Dense(units=1000, activation='relu', input_dim=dim))
    model.add(Dense(units=500, activation='relu', input_dim=dim))
    model.add(Dense(units=1))
    model.compile(loss='mse', optimizer='adam', metrics=['mse'])
    model.fit(train_X, train_y, epochs=20, batch_size=100, verbose=0)
    target = model.predict(test_X)
    datatest['target'] = target
    print('The MSE is %f' % ((datatest['AMZN_r'] - datatest['target'])**2).mean())

0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


The MSE is 0.000749
1
The MSE is 0.000629
2
The MSE is 0.000637
3
The MSE is 0.000647
4
The MSE is 0.000648
5
The MSE is 0.000659
6
The MSE is 0.000633
7
The MSE is 0.000643
8
The MSE is 0.000678
