# Logistic Regression, LDA, QDA, and Naive Bayes



In [4]:
!pip install ISLP



In [5]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize)

In [6]:
from ISLP import confusion_table
from ISLP.models import contrast
from sklearn.discriminant_analysis import \
     (LinearDiscriminantAnalysis as LDA,
      QuadraticDiscriminantAnalysis as QDA)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


In [7]:
# Mount the user's Google Drive under /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [8]:
# Load the Weekly stock-market data set that ships with ISLP by its name.
# Passing an absolute Google Drive path tied the notebook to a private CSV copy
# and to the Colab filesystem; the bundled data set makes the notebook
# reproducible on a fresh kernel anywhere ISLP is installed.
Weekly = load_data('Weekly')
Weekly

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
0,1990,0.816,1.572,-3.936,-0.229,-3.484,0.154976,-0.270,Down
1,1990,-0.270,0.816,1.572,-3.936,-0.229,0.148574,-2.576,Down
2,1990,-2.576,-0.270,0.816,1.572,-3.936,0.159837,3.514,Up
3,1990,3.514,-2.576,-0.270,0.816,1.572,0.161630,0.712,Up
4,1990,0.712,3.514,-2.576,-0.270,0.816,0.153728,1.178,Up
...,...,...,...,...,...,...,...,...,...
1084,2010,-0.861,0.043,-2.173,3.599,0.015,3.205160,2.969,Up
1085,2010,2.969,-0.861,0.043,-2.173,3.599,4.242568,1.281,Up
1086,2010,1.281,2.969,-0.861,0.043,-2.173,4.835082,0.283,Up
1087,2010,0.283,1.281,2.969,-0.861,0.043,4.454044,1.034,Up


## Logistic Regression

In [10]:
# Logistic regression of Direction on every predictor except Year and Today
# (i.e. Lag1-Lag5 and Volume), fitted on the complete data set.
allvars = Weekly.columns.drop(['Today', 'Direction', 'Year'])
design = MS(allvars)
X = design.fit_transform(Weekly)

# Boolean response: True for weeks whose Direction is 'Up'.
y = Weekly['Direction'] == 'Up'

glm = sm.GLM(y, X, family=sm.families.Binomial())
results = glm.fit()
summarize(results)


Unnamed: 0,coef,std err,z,P>|z|
intercept,0.2669,0.086,3.106,0.002
Lag1,-0.0413,0.026,-1.563,0.118
Lag2,0.0584,0.027,2.175,0.03
Lag3,-0.0161,0.027,-0.602,0.547
Lag4,-0.0278,0.026,-1.05,0.294
Lag5,-0.0145,0.026,-0.549,0.583
Volume,-0.0227,0.037,-0.616,0.538


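Lag2 is the only predictor that is statistically significant at the 5% level (coefficient 0.0584, p = 0.030); none of the other lags, nor Volume, has a p-value below 0.1.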

In [11]:
# Fitted probabilities of 'Up' from the full-data model; with no exog supplied,
# predict() is evaluated on the data the model was fit to. Show the first ten.
probs = results.predict()
probs[:10]


array([0.60862494, 0.60103144, 0.58756995, 0.48164156, 0.61690129,
       0.56841902, 0.57860971, 0.51519724, 0.57151998, 0.55542873])

In [12]:
# Turn fitted probabilities into class labels with a 0.5 threshold.
# np.where sizes the result from `probs` itself, so no row count has to be
# hard-coded (previously a literal 1089) and the cell stays correct if the data changes.
labels = np.where(probs > 0.5, 'Up', 'Down')


In [13]:
# Cross-tabulate the predicted labels (rows) against the true Direction (columns).
confusion_table(labels, Weekly.Direction)


Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,54,48
Up,430,557


In [14]:
# Training accuracy computed two ways as a cross-check: from the diagonal of the
# confusion table above (54 + 557 correct out of 1089) and directly from the labels.
(54+557)/1089, np.mean(labels == Weekly.Direction)


(0.5610651974288338, 0.5610651974288338)

In [47]:
# Boolean mask for the training period: all weeks up to and including 2008.
# The remaining weeks (2009-2010) are held out as the test set.
train = Weekly['Year'] <= 2008
Weekly_train = Weekly.loc[train]
Weekly_test = Weekly.loc[~train]
Weekly_test.shape


(104, 9)

In [48]:
# Split the response labels with the same year-based mask used for the predictors.
D = Weekly['Direction']
L_train = D.loc[train]
L_test = D.loc[~train]

In [51]:
# Refit the logistic regression with Lag2 as the only predictor, using the
# training period only, then evaluate it on the held-out weeks.
model = MS(['Lag2']).fit(Weekly)
X = model.transform(Weekly)
X_train, X_test = X.loc[train], X.loc[~train]
y_train, y_test = y.loc[train], y.loc[~train]
glm_train = sm.GLM(y_train,
                   X_train,
                   family=sm.families.Binomial())
results = glm_train.fit()
probs = results.predict(exog=X_test)
# Threshold at 0.5; the label array is sized from `probs`, not from a
# hard-coded test-set length (previously a literal 104).
labels = np.where(probs > 0.5, 'Up', 'Down')
confusion_table(labels, L_test)


Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,9,5
Up,34,56


In [52]:
# Test accuracy of the Lag2-only logistic regression, computed from the
# predictions instead of numbers copied by hand from the confusion table.
np.mean(labels == L_test)


0.625

## Linear Discriminant Analysis

In [53]:
# LDA estimator; store_covariance=True asks sklearn to keep the estimated covariance matrix.
lda = LDA(store_covariance=True)


In [54]:
# Remove the constant 'intercept' column added by ModelSpec so that Lag2 is the
# only feature passed to the discriminant/naive Bayes classifiers below.
# errors='ignore' makes the cell idempotent: re-running it after the column has
# already been dropped no longer raises KeyError.
X_train, X_test = [M.drop(columns=['intercept'], errors='ignore')
                   for M in [X_train, X_test]]
lda.fit(X_train, L_train)


In [58]:
# Class labels known to the fitted LDA model.
lda.classes_


array(['Down', 'Up'], dtype='<U4')

In [59]:
# Class prior probabilities stored on the fitted LDA model.
lda.priors_


array([0.44771574, 0.55228426])

In [60]:
# Predicted class labels for the held-out test weeks.
lda_pred = lda.predict(X_test)


In [61]:
# Confusion table of LDA predictions (rows) against the true test labels (columns).
confusion_table(lda_pred, L_test)


Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,9,5
Up,34,56


In [62]:
# LDA test accuracy, computed from the predictions rather than from counts
# copied by hand out of the confusion table.
np.mean(lda_pred == L_test)

0.625

Accuracy of LDA = 0.625

## Quadratic Discriminant Analysis


In [63]:
# Fit QDA on the same training data used for LDA
# (X_train here is the Lag2 design matrix with the intercept column already dropped).
qda = QDA(store_covariance=True)
qda.fit(X_train, L_train)


In [64]:
# Predict the test weeks with QDA and tabulate predictions against the truth.
qda_pred = qda.predict(X_test)
confusion_table(qda_pred, L_test)


Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,0,0
Up,43,61


In [65]:
# QDA test accuracy, computed from the predictions rather than from counts
# copied by hand out of the confusion table.
np.mean(qda_pred == L_test)

0.5865384615384616

Accuracy of QDA = 0.5865384615384616

## Naive Bayes


In [66]:
# Fit a Gaussian naive Bayes classifier on the same training data used for LDA and QDA.
NB = GaussianNB()
NB.fit(X_train, L_train)


In [67]:
# Predict the test weeks with naive Bayes and tabulate predictions against the truth.
nb_labels = NB.predict(X_test)
confusion_table(nb_labels, L_test)


Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,0,0
Up,43,61


In [69]:
# Naive Bayes test accuracy, computed from the predictions rather than from
# counts copied by hand out of the confusion table.
np.mean(nb_labels == L_test)

0.5865384615384616

Accuracy of Naive Bayes = 0.5865384615384616

(i)
It is difficult to separate the models because all of them predict poorly. Accuracy is a reasonable metric for this problem (unlike, for example, disease screening, where one class is rare and the two kinds of error have very different costs). The highest accuracy comes from logistic regression and LDA, which produce identical confusion tables (62.5%). QDA and naive Bayes adopt a particularly bullish strategy of predicting 'Up' for every test week, so their accuracy (58.65%) is exactly the share of 'Up' weeks in the test set (61/104): they are no better than always guessing the majority class, although still above a 50% coin flip.

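In conclusion, no model predicts much better than a naive baseline on this data: the best models (logistic regression and LDA, 62.5%) beat the always-'Up' baseline (58.65%) by only about four percentage points.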

Accuracy (%) for each model:  
Logistic reg = 62.5%  
LDA = 62.5%    
QDA = 58.65%  
Naive Bayes = 58.65%