Importing the required libraries 

In [12]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize)

We also need to import some custom functions and data files related to our textbook

In [13]:
from ISLP import confusion_table
from ISLP.models import contrast
from sklearn.discriminant_analysis import \
     (LinearDiscriminantAnalysis as LDA,
      QuadraticDiscriminantAnalysis as QDA)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Classification on Stock Data
This data set consists of percentage returns for the S&P 500 stock index over 1,250 days, from the beginning of 2001 until the end of 2005. For each date, we have recorded the percentage returns for each of the five previous trading days, Lag1 through Lag5. We have also recorded Volume (the number of shares traded on the previous day, in billions), Today (the percentage return on the date in question) and Direction (whether the market was Up or Down on this date).

In [14]:
# Viewing the data
Smarket = load_data('Smarket')
print(Smarket)

      Year   Lag1   Lag2   Lag3   Lag4   Lag5   Volume  Today Direction
0     2001  0.381 -0.192 -2.624 -1.055  5.010  1.19130  0.959        Up
1     2001  0.959  0.381 -0.192 -2.624 -1.055  1.29650  1.032        Up
2     2001  1.032  0.959  0.381 -0.192 -2.624  1.41120 -0.623      Down
3     2001 -0.623  1.032  0.959  0.381 -0.192  1.27600  0.614        Up
4     2001  0.614 -0.623  1.032  0.959  0.381  1.20570  0.213        Up
...    ...    ...    ...    ...    ...    ...      ...    ...       ...
1245  2005  0.422  0.252 -0.024 -0.584 -0.285  1.88850  0.043        Up
1246  2005  0.043  0.422  0.252 -0.024 -0.584  1.28581 -0.955      Down
1247  2005 -0.955  0.043  0.422  0.252 -0.024  1.54047  0.130        Up
1248  2005  0.130 -0.955  0.043  0.422  0.252  1.42236 -0.298      Down
1249  2005 -0.298  0.130 -0.955  0.043  0.422  1.38254 -0.489      Down

[1250 rows x 9 columns]


# Running Logisitc Regression on the data

In [25]:
train = (Smarket.Year < 2005)
Smarket_train = Smarket.loc[train]
Smarket_test = Smarket.loc[~train]
Smarket_test.shape

(252, 9)

In [31]:
allvars = Smarket.columns.drop(['Today', 'Direction', 'Year'])
print(allvars)
print('===========================================================================')
design = MS(allvars)

X = design.fit_transform(Smarket)
y = Smarket.Direction == 'Up'
print(y)
print('===========================================================================')

X_train, X_test = X.loc[train], X.loc[~train]
y_train, y_test = y.loc[train], y.loc[~train]

glm_train = sm.GLM(y_train,
                   X_train,
                   family=sm.families.Binomial()) # Binomial family is useful for binary outcomes.
results = glm_train.fit()

probs = results.predict(exog=X_test) # to apply the model to the test data
print(probs)

labels = np.array(['Down']*Smarket_test.shape[0])
labels[probs>0.5] = "Up"
print('===========================================================================')
print(confusion_table(labels, Smarket.Direction[~train]))
# extracting the true labels from the data
L_test = Smarket.Direction.loc[~train]
print('===========================================================================')
print(np.mean(labels == L_test), np.mean(labels != L_test))

Index(['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume'], dtype='object')
0        True
1        True
2       False
3        True
4        True
        ...  
1245     True
1246    False
1247     True
1248    False
1249    False
Name: Direction, Length: 1250, dtype: bool
998     0.528220
999     0.515669
1000    0.522652
1001    0.513854
1002    0.498334
          ...   
1245    0.483637
1246    0.506048
1247    0.516658
1248    0.516124
1249    0.508072
Length: 252, dtype: float64
Truth      Down  Up
Predicted          
Down         77  97
Up           34  44
0.4801587301587302 0.5198412698412699


The results are rather disappointing: the test error rate is 52%, which is worse than random guessing! Let's reduce the number of features and see if we can further improve the results

In [26]:
model = MS(['Lag1', 'Lag2']).fit(Smarket)
print(Smarket)
X = model.transform(Smarket)
X_train, X_test = X.loc[train], X.loc[~train]
glm_train = sm.GLM(y_train,
                   X_train,
                   family=sm.families.Binomial())
results = glm_train.fit()
probs = results.predict(exog=X_test)
print(probs)
labels = np.array(['Down']*Smarket_test.shape[0])
labels[probs>0.5] = "Up"
print('===========================================================================')
print(confusion_table(labels, Smarket.Direction[~train]))
# extracting the true labels from the data
L_test = Smarket.Direction.loc[~train]
print('===========================================================================')
print(np.mean(labels == L_test), np.mean(labels != L_test))

      Year   Lag1   Lag2   Lag3   Lag4   Lag5   Volume  Today Direction
0     2001  0.381 -0.192 -2.624 -1.055  5.010  1.19130  0.959        Up
1     2001  0.959  0.381 -0.192 -2.624 -1.055  1.29650  1.032        Up
2     2001  1.032  0.959  0.381 -0.192 -2.624  1.41120 -0.623      Down
3     2001 -0.623  1.032  0.959  0.381 -0.192  1.27600  0.614        Up
4     2001  0.614 -0.623  1.032  0.959  0.381  1.20570  0.213        Up
...    ...    ...    ...    ...    ...    ...      ...    ...       ...
1245  2005  0.422  0.252 -0.024 -0.584 -0.285  1.88850  0.043        Up
1246  2005  0.043  0.422  0.252 -0.024 -0.584  1.28581 -0.955      Down
1247  2005 -0.955  0.043  0.422  0.252 -0.024  1.54047  0.130        Up
1248  2005  0.130 -0.955  0.043  0.422  0.252  1.42236 -0.298      Down
1249  2005 -0.298  0.130 -0.955  0.043  0.422  1.38254 -0.489      Down

[1250 rows x 9 columns]
998     0.509827
999     0.520824
1000    0.533263
1001    0.526057
1002    0.507210
          ...   
1245    0

# How good is this result?
Now the results appear to be a little better: 56% of the daily movements have been correctly predicted. It is worth noting that in this case, a much simpler strategy of predicting that the market will increase every day will also be correct 56% of the time! Hence, in terms of overall error rate, the logistic regression method is no better than the naive approach.

# ========================================================
# Running LDA

In [18]:
lda = LDA(store_covariance=True)
# Since the LDA estimator automatically adds an intercept, we should remove the column corresponding to 
# the intercept in both X_train and X_test. We can also directly use the labels rather than the Boolean 
# vectors y_train.
X_train, X_test = [M.drop(columns=['intercept'])
                   for M in [X_train, X_test]]
L_train = Smarket.Direction.loc[train]
lda.fit(X_train, L_train)

In [19]:
lda_pred = lda.predict(X_test)
print(confusion_table(lda_pred, L_test))
print(np.mean(lda_pred == L_test), np.mean(lda_pred != L_test))

Truth      Down   Up
Predicted           
Down         35   35
Up           76  106
0.5595238095238095 0.44047619047619047


We see that the results are not that different than the logistic regression

# ========================================================
# Running QDA

In [20]:
qda = QDA(store_covariance=True)
qda.fit(X_train, L_train)
qda_pred = qda.predict(X_test)
print(confusion_table(qda_pred, L_test))
print(np.mean(qda_pred == L_test), np.mean(qda_pred != L_test))

Truth      Down   Up
Predicted           
Down         30   20
Up           81  121
0.5992063492063492 0.4007936507936508


# How good is this result?
This level of accuracy is quite impressive for stock market data, which is known to be quite hard to model accurately. This suggests that the quadratic form assumed by QDA may capture the true relationship more accurately than the linear forms assumed by LDA and logistic regression. However, we recommend evaluating this method’s performance on a larger test set before betting that this approach will consistently beat the market!