In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.linear_model import LinearRegression

In [2]:
# Load the price history from the CSV (path is relative to the working
# directory) and preview the first rows to check how the columns were read.
csv_path = 'sphist.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2015-12-07,2090.419922,2090.419922,2066.780029,2077.070068,4043820000.0,2077.070068
1,2015-12-04,2051.23999,2093.840088,2051.23999,2091.689941,4214910000.0,2091.689941
2,2015-12-03,2080.709961,2085.0,2042.349976,2049.620117,4306490000.0,2049.620117
3,2015-12-02,2101.709961,2104.27002,2077.110107,2079.51001,3950640000.0,2079.51001
4,2015-12-01,2082.929932,2103.370117,2082.929932,2102.629883,3712120000.0,2102.629883


In [3]:
# Show the dtype pandas inferred for each column when reading the CSV.
df.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Volume       float64
Adj Close    float64
dtype: object

In [4]:
# Parse the Date column into real timestamps. The parsed values go into a new
# DateTime column, so the original Date column is kept unchanged next to it.
df['DateTime'] = pd.to_datetime(df['Date'])
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,DateTime
0,2015-12-07,2090.419922,2090.419922,2066.780029,2077.070068,4043820000.0,2077.070068,2015-12-07
1,2015-12-04,2051.23999,2093.840088,2051.23999,2091.689941,4214910000.0,2091.689941,2015-12-04
2,2015-12-03,2080.709961,2085.0,2042.349976,2049.620117,4306490000.0,2049.620117,2015-12-03
3,2015-12-02,2101.709961,2104.27002,2077.110107,2079.51001,3950640000.0,2079.51001,2015-12-02
4,2015-12-01,2082.929932,2103.370117,2082.929932,2102.629883,3712120000.0,2102.629883,2015-12-01


In [5]:
# Put the rows in chronological order, oldest first (ascending is the
# sort_values default). The result is a new frame; df itself is left untouched.
df_ordered = df.sort_values(by='DateTime')
df_ordered.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,DateTime
16589,1950-01-03,16.66,16.66,16.66,16.66,1260000.0,16.66,1950-01-03
16588,1950-01-04,16.85,16.85,16.85,16.85,1890000.0,16.85,1950-01-04
16587,1950-01-05,16.93,16.93,16.93,16.93,2550000.0,16.93,1950-01-05
16586,1950-01-06,16.98,16.98,16.98,16.98,2010000.0,16.98,1950-01-06
16585,1950-01-09,17.08,17.08,17.08,17.08,2520000.0,17.08,1950-01-09


In [6]:
# Number the chronologically sorted rows 0..n-1 in a column named 'index'.
# The length is taken from df_ordered itself rather than from df, so the
# values always match the frame they are assigned to.
df_ordered['index'] = range(len(df_ordered))
df_ordered.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,DateTime,index
16589,1950-01-03,16.66,16.66,16.66,16.66,1260000.0,16.66,1950-01-03,0
16588,1950-01-04,16.85,16.85,16.85,16.85,1890000.0,16.85,1950-01-04,1
16587,1950-01-05,16.93,16.93,16.93,16.93,2550000.0,16.93,1950-01-05,2
16586,1950-01-06,16.98,16.98,16.98,16.98,2010000.0,16.98,1950-01-06,3
16585,1950-01-09,17.08,17.08,17.08,17.08,2520000.0,17.08,1950-01-09,4


In [7]:
# Promote the positional 'index' column to the row index, so that row labels
# run 0..n-1 in chronological order.
# NOTE: set_index removes the column by default, so this cell cannot be
# re-run on its own without first re-creating the 'index' column.
df_ordered = df_ordered.set_index(keys='index')

In [8]:
# Flag rows dated strictly after 2015-04-01 (a row on that date gets False).
cutoff_2015 = datetime(2015, 4, 1)
df_ordered['date_after_april1_2015'] = df_ordered['DateTime'] > cutoff_2015
df_ordered.head()

Unnamed: 0_level_0,Date,Open,High,Low,Close,Volume,Adj Close,DateTime,date_after_april1_2015
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1950-01-03,16.66,16.66,16.66,16.66,1260000.0,16.66,1950-01-03,False
1,1950-01-04,16.85,16.85,16.85,16.85,1890000.0,16.85,1950-01-04,False
2,1950-01-05,16.93,16.93,16.93,16.93,2550000.0,16.93,1950-01-05,False
3,1950-01-06,16.98,16.98,16.98,16.98,2010000.0,16.98,1950-01-06,False
4,1950-01-09,17.08,17.08,17.08,17.08,2520000.0,17.08,1950-01-09,False


In [9]:
# Rolling mean of Close over the previous 5 and the previous 365 rows.
# The windows count rows (the trading days present in the data), not
# calendar days.
# shift(1) moves every value down one row, so the feature on a given row is
# built only from earlier rows and never from that row's own Close.
data_mean_5day = df_ordered['Close'].rolling(window=5).mean().shift(periods=1)
data_mean_365day = df_ordered['Close'].rolling(window=365).mean().shift(periods=1)
# Short-window average expressed relative to the long-window average.
data_mean_ratio = data_mean_5day.div(data_mean_365day)

In [10]:
# Rolling standard deviation of Close over the previous 5 and the previous
# 365 rows (windows count rows, not calendar days).
# shift(1) lags each value by one row so a row's feature never includes that
# row's own Close.
data_std_5day = df_ordered['Close'].rolling(window=5).std().shift(periods=1)
data_std_365day = df_ordered['Close'].rolling(window=365).std().shift(periods=1)
# Short-window volatility expressed relative to the long-window volatility.
data_std_ratio = data_std_5day.div(data_std_365day)

In [11]:
# Attach the lagged rolling statistics to df_ordered as columns. The mapping
# keeps insertion order, so the columns are added in the order listed here.
rolling_features = {
    'data_mean_5day': data_mean_5day,
    'data_mean_365day': data_mean_365day,
    'data_mean_ratio': data_mean_ratio,
    'data_std_5day': data_std_5day,
    'data_std_365day': data_std_365day,
    'data_std_ratio': data_std_ratio,
}
for column_name, values in rolling_features.items():
    df_ordered[column_name] = values

In [12]:
# Keep only rows dated after 1951-01-02, then drop every row that still has a
# NaN in any column (e.g. rows whose 365-row rolling window is not yet full).
cutoff_1951 = datetime(1951, 1, 2)
df_new = df_ordered[df_ordered['DateTime'] > cutoff_1951]
df_no_NA = df_new.dropna()
df_no_NA.head()

Unnamed: 0_level_0,Date,Open,High,Low,Close,Volume,Adj Close,DateTime,date_after_april1_2015,data_mean_5day,data_mean_365day,data_mean_ratio,data_std_5day,data_std_365day,data_std_ratio
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
365,1951-06-19,22.02,22.02,22.02,22.02,1100000.0,22.02,1951-06-19,False,21.8,19.447726,1.120954,0.256223,1.790253,0.143121
366,1951-06-20,21.91,21.91,21.91,21.91,1120000.0,21.91,1951-06-20,False,21.9,19.462411,1.125246,0.213659,1.789307,0.119409
367,1951-06-21,21.780001,21.780001,21.780001,21.780001,1100000.0,21.780001,1951-06-21,False,21.972,19.476274,1.128142,0.092574,1.788613,0.051758
368,1951-06-22,21.549999,21.549999,21.549999,21.549999,1340000.0,21.549999,1951-06-22,False,21.96,19.489562,1.126757,0.115108,1.787659,0.06439
369,1951-06-25,21.290001,21.290001,21.290001,21.290001,2440000.0,21.290001,1951-06-25,False,21.862,19.502082,1.121008,0.204132,1.786038,0.114293


In [13]:
# Chronological train/test split: rows dated before 2013-01-01 are used for
# training; rows on or after that date are held out for testing.
split_date = datetime(2013, 1, 1)
df_train = df_no_NA[df_no_NA['DateTime'] < split_date]
df_test = df_no_NA[df_no_NA['DateTime'] >= split_date]

In [14]:
# Predictor columns: the lagged rolling statistics added to the frame earlier.
features = [
    'data_mean_5day',
    'data_mean_365day',
    'data_mean_ratio',
    'data_std_5day',
    'data_std_365day',
    'data_std_ratio',
]

# Unfitted linear regression estimator.
model = LinearRegression()

# Feature matrices and Close targets for the training and test splits.
X, y = df_train[features], df_train['Close']
X_test, y_test = df_test[features], df_test['Close']

In [15]:
# Fit on the training split, then predict Close for the held-out test rows.
# fit() returns the fitted estimator itself, so the two steps can be chained.
pred = model.fit(X, y).predict(X_test)

In [16]:
# Mean absolute error of the predictions on the test split.
abs_errors = abs(pred - y_test)
MAE = sum(abs_errors) / len(pred)
print(MAE)
# R^2 of the fitted model on the training data (X, y), not on the test split.
print(model.score(X, y))

16.13112382117241
0.9995187793989123
