In [1]:
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

Using TensorFlow backend.


In [3]:
import pandas as pd
import pandas_datareader.data as web
import fix_yahoo_finance
import statsmodels.api as sm
import statsmodels.formula.api as smf

  from pandas.core import datetools


In [4]:
# Grab 10 years of Apple and NASDAQ data.  
# Note that the old Yahoo Finance API was deprecated.  (Thanks, Verizon!  That Marissa pay-out was steep.)
# Ran Aroussi has developed a fix that saves accessibility to Yahoo Finance. Thanks, Ran! Great stuff. 

start, end = "2006-01-01", "2015-12-31"
aapl_all = web.get_data_yahoo("aapl", start=start, end=end)
nasdaq_all = web.get_data_yahoo("^ixic", start=start, end=end)
aapl = aapl_all['Adj Close']
nasdaq = nasdaq_all['Adj Close']

[*********************100%***********************]  1 of 1 downloaded

In [5]:
nasdaq_all.head(10)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2006-01-03,2216.530029,2249.679932,2189.909912,2243.73999,2243.73999,1998300000
2006-01-04,2246.959961,2265.280029,2246.070068,2263.459961,2263.459961,1887560000
2006-01-05,2264.929932,2277.560059,2264.5,2276.870117,2276.870117,1891750000
2006-01-06,2289.209961,2306.719971,2281.01001,2305.620117,2305.620117,2233640000
2006-01-09,2306.179932,2322.629883,2303.129883,2318.689941,2318.689941,1949140000
2006-01-10,2306.219971,2320.320068,2303.929932,2320.320068,2320.320068,1978160000
2006-01-11,2321.409912,2332.919922,2316.48999,2331.360107,2331.360107,2380600000
2006-01-12,2327.169922,2330.310059,2313.219971,2316.689941,2316.689941,2011460000
2006-01-13,2317.73999,2321.699951,2308.159912,2317.040039,2317.040039,1784410000
2006-01-17,2302.560059,2305.870117,2294.050049,2302.689941,2302.689941,1702260000


In [6]:
# Calculate daily returns.

aapl_returns = np.log(aapl / aapl.shift(1))
nasdaq_returns = np.log(nasdaq / nasdaq.shift(1))
aapl_returns = aapl_returns.dropna()
nasdaq_returns = nasdaq_returns.dropna()
aapl_returns = pd.DataFrame(aapl_returns)
nasdaq_returns = pd.DataFrame(nasdaq_returns)

In [7]:
print(aapl_returns.describe())
print(nasdaq_returns.describe())

         Adj Close
count  2516.000000
mean      0.000938
std       0.021618
min      -0.197470
25%      -0.009446
50%       0.000897
75%       0.012243
max       0.130194
         Adj Close
count  2516.000000
mean      0.000319
std       0.013934
min      -0.095877
25%      -0.005501
50%       0.000957
75%       0.006919
max       0.111594


In [8]:
from sklearn.model_selection import train_test_split

In [9]:
data = pd.merge(nasdaq_returns, aapl_returns, left_index=True, right_index=True)
data.rename(columns={'Adj Close_x':'nasdaq', 'Adj Close_y':'aapl'}, inplace=True)
datatrain, datatest = train_test_split(data, test_size = 0.2)

In [10]:
# We need a baseline against which to compare the deep learner.  
# The most obvious is regression and its prediction or out-of-sample error, the MSE.

mod = smf.ols(formula='aapl ~ nasdaq', data = datatrain).fit()
datatest['fitted'] = mod.predict(exog = datatest)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [11]:
# The baseline against which to measure.

print('The average out-sample AAPL return is %f' % datatest['aapl'].mean())
print('The average predicted AAPL return is %f' % datatest['fitted'].mean())
print('The MSE is %f' % ((datatest['aapl'] - datatest['fitted'])**2).mean())

The average out-sample AAPL return is 0.000271
The average predicted AAPL return is 0.000285
The MSE is 0.000232


In [12]:
train_X = datatrain['nasdaq'].as_matrix()
train_y = datatrain['aapl'].as_matrix()
test_X = datatest['nasdaq'].as_matrix()
test_y = datatest['aapl'].as_matrix()

dim = data.shape[1]

In [13]:
model = Sequential()
model.add(Dense(units=500, activation='relu', input_dim=1))
model.add(Dense(units=1000, activation='relu', input_dim=1))
model.add(Dense(units=1000, activation='relu', input_dim=1))
model.add(Dense(units=1000, activation='relu', input_dim=1))
model.add(Dense(units=500, activation='relu', input_dim=1))
model.add(Dense(units=1))

# Compile, fit, and generate scores and predicted probabilities.
model.compile(loss='mse', optimizer='adam', metrics=['mse'])

#model.fit(data, labels, epochs=20, batch_size=100)
model.fit(train_X, train_y, epochs=20, batch_size=100, verbose=0)
target = model.predict(test_X)
datatest['target'] = target

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [14]:
# Baseline.

print('The average out-sample AAPL return is %f' % datatest['aapl'].mean())
print('The average predicted AAPL return is %f' % datatest['fitted'].mean())
print('The MSE is %f' % ((datatest['aapl'] - datatest['fitted'])**2).mean())

The average out-sample AAPL return is 0.000271
The average predicted AAPL return is 0.000285
The MSE is 0.000232


In [15]:
# MLP.

print('The average out-sample AAPL return is %f' % datatest['aapl'].mean())
print('The average predicted AAPL return is %f' % datatest['target'].mean())
print('The MSE is %f' % ((datatest['aapl'] - datatest['target'])**2).mean())

The average out-sample AAPL return is 0.000271
The average predicted AAPL return is -0.001049
The MSE is 0.000247


In [16]:
for i in range(9):
    print(i)
    model = Sequential()
    model.add(Dense(units=500, activation='relu', input_dim=1))
    model.add(Dense(units=1000, activation='relu', input_dim=1))
    model.add(Dense(units=1000, activation='relu', input_dim=1))
    model.add(Dense(units=1000, activation='relu', input_dim=1))
    model.add(Dense(units=500, activation='relu', input_dim=1))
    model.add(Dense(units=1))
    model.compile(loss='mse', optimizer='adam', metrics=['mse'])
    model.fit(train_X, train_y, epochs=20, batch_size=100, verbose=0)
    target = model.predict(test_X)
    datatest['target'] = target
    print('The MSE is %f' % ((datatest['aapl'] - datatest['target'])**2).mean())    

0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


The MSE is 0.000291
1
The MSE is 0.000242
2
The MSE is 0.000327
3
The MSE is 0.000250
4
The MSE is 0.000250
5
The MSE is 0.000245
6
The MSE is 0.000269
7
The MSE is 0.000247
8
The MSE is 0.000269
