In [204]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [205]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [206]:
# Fetch the dataset; -nc (no-clobber) skips the download when the file already exists locally.
!wget -nc https://lazyprogrammer.me/course_files/sp500sub.csv

File ‘sp500sub.csv’ already there; not retrieving.



In [207]:
# Load the price table for all tickers; the 'Date' column is parsed and used as the index.
df0 = pd.read_csv(
    'sp500sub.csv',
    index_col='Date',
    parse_dates=True,
)

In [208]:
# Preview the first rows to check the available columns and the Date index.
df0.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Name
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-04,9.22,9.51,9.17,9.5,9.5,1865400.0,INCY
2010-01-05,9.51,10.29,9.45,10.27,10.27,7608900.0,INCY
2010-01-06,10.38,11.09,10.35,11.0,11.0,8046700.0,INCY
2010-01-07,11.0,11.06,10.62,10.82,10.82,3680300.0,INCY
2010-01-08,10.82,11.0,10.75,10.94,10.94,1529300.0,INCY


In [209]:
# Keep only IBM's closing prices. A single .loc selects rows and column together,
# and .copy() makes df independent of df0 before new columns are added to it.
df = df0.loc[df0['Name'] == 'IBM', ['Close']].copy()

In [210]:
# Natural log of the closing price; differences of this column are log returns.
df['LogClose'] = df['Close'].pipe(np.log)

In [211]:
# Daily log return: today's log close minus the previous row's.
# The first row has no predecessor, so its value is NaN.
df['LogReturn'] = df['LogClose'] - df['LogClose'].shift(1)

In [212]:
# Hold out the final Ntest rows as the test period (252 is presumably one
# trading year); everything before that is the training period.
Ntest = 252
train, test = df.iloc[:-Ntest], df.iloc[-Ntest:]

In [213]:
# Build the supervised dataset: each sample is a window of T consecutive
# log returns, and its label says whether the return immediately after
# the window is positive.
series = df['LogReturn'].to_numpy()[1:]  # skip the first entry, which is NaN
target = (series > 0) * 1  # boolean -> 1 (positive return) / 0 (otherwise)
T = 21  # window length: the past month of daily returns
n_windows = len(series) - T  # each window needs one following value to serve as its label

# Features are x(t), x(t+1), ..., x(t+T-1); the label is whether x(t+T) > 0.
X = np.array([series[t:t + T] for t in range(n_windows)]).reshape(-1, T)
Y = np.array([target[t + T] for t in range(n_windows)])
N = len(X)
print("X.shape", X.shape, "Y.shape", Y.shape)

X.shape (2241, 21) Y.shape (2241,)


In [214]:
# Chronological split (no shuffling): the last Ntest samples form the test set,
# everything before them is used for training.
Xtrain, Xtest = X[:-Ntest], X[-Ntest:]
Ytrain, Ytest = Y[:-Ntest], Y[-Ntest:]

In [215]:
# Logistic regression with default settings, trained on the return windows.
lr = LogisticRegression().fit(Xtrain, Ytrain)  # fit() returns the fitted estimator
lr.score(Xtrain, Ytrain)  # in-sample accuracy

0.5093011563599799

In [216]:
# Out-of-sample accuracy of the logistic regression on the held-out test samples.
lr.score(Xtest, Ytest)

0.49603174603174605

In [217]:
# Support vector classifier with default settings, trained on the same windows.
svc = SVC().fit(Xtrain, Ytrain)  # fit() returns the fitted estimator
svc.score(Xtrain, Ytrain)  # in-sample accuracy

0.7551533433886375

In [218]:
# Out-of-sample accuracy of the SVC on the held-out test samples.
svc.score(Xtest, Ytest)

0.49603174603174605

In [219]:
# Random forest with default hyper-parameters. The seed is fixed because the
# forest is randomised (bootstrap resampling and random feature subsets);
# without it the scores change on every Restart & Run All.
rf = RandomForestClassifier(random_state=42)
rf.fit(Xtrain, Ytrain)
# Each tree partitions the feature space with axis-aligned threshold splits and,
# with the default unlimited depth, keeps splitting until its leaves are pure.
# The forest can therefore essentially memorise the training set (in-sample
# accuracy is typically 1.0) as long as identical feature vectors do not carry
# opposite labels. This says nothing about out-of-sample skill.
rf.score(Xtrain, Ytrain)

1.0

In [220]:
# Out-of-sample accuracy of the random forest on the held-out test samples.
rf.score(Xtest, Ytest)

0.4880952380952381

In [221]:
# Exercise: maybe you believe walk-forward validation will be
# more realistic - will it lead to better results?

In [None]:
# Exercise: do you think using raw (log) prices would have worked?
# Using raw (log) prices doesn't work to predict if next day's return > 0