# Lab: Logistic Regression, LDA, QDA, and KNN

## 4.6.1 The Data

In [41]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import OLSInfluence

from sklearn.metrics import confusion_matrix, classification_report

import seaborn as sns

%matplotlib inline

In [6]:
# Import data
df = pd.read_csv('./datasets/Smarket.csv',usecols=range(1,10), index_col=0, parse_dates=True)
df.head()

Unnamed: 0_level_0,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2001-01-01,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up
2001-01-01,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up
2001-01-01,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down
2001-01-01,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up
2001-01-01,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up


In [7]:
# Features
print(df.columns)

Index(['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume', 'Today', 'Direction'], dtype='object')


In [8]:
# Shape
print(df.shape)

(1250, 8)


In [10]:
# Summary
print(df.describe())

              Lag1         Lag2         Lag3         Lag4        Lag5  \
count  1250.000000  1250.000000  1250.000000  1250.000000  1250.00000   
mean      0.003834     0.003919     0.001716     0.001636     0.00561   
std       1.136299     1.136280     1.138703     1.138774     1.14755   
min      -4.922000    -4.922000    -4.922000    -4.922000    -4.92200   
25%      -0.639500    -0.639500    -0.640000    -0.640000    -0.64000   
50%       0.039000     0.039000     0.038500     0.038500     0.03850   
75%       0.596750     0.596750     0.596750     0.596750     0.59700   
max       5.733000     5.733000     5.733000     5.733000     5.73300   

            Volume        Today  
count  1250.000000  1250.000000  
mean      1.478305     0.003138  
std       0.360357     1.136334  
min       0.356070    -4.922000  
25%       1.257400    -0.639500  
50%       1.422950     0.038500  
75%       1.641675     0.596750  
max       3.152470     5.733000  


In [13]:
# Info
print(df.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1250 entries, 2001-01-01 to 2005-01-01
Data columns (total 8 columns):
Lag1         1250 non-null float64
Lag2         1250 non-null float64
Lag3         1250 non-null float64
Lag4         1250 non-null float64
Lag5         1250 non-null float64
Volume       1250 non-null float64
Today        1250 non-null float64
Direction    1250 non-null object
dtypes: float64(7), object(1)
memory usage: 87.9+ KB
None


In [14]:
# Correlation
print(df.corr())

            Lag1      Lag2      Lag3      Lag4      Lag5    Volume     Today
Lag1    1.000000 -0.026294 -0.010803 -0.002986 -0.005675  0.040910 -0.026155
Lag2   -0.026294  1.000000 -0.025897 -0.010854 -0.003558 -0.043383 -0.010250
Lag3   -0.010803 -0.025897  1.000000 -0.024051 -0.018808 -0.041824 -0.002448
Lag4   -0.002986 -0.010854 -0.024051  1.000000 -0.027084 -0.048414 -0.006900
Lag5   -0.005675 -0.003558 -0.018808 -0.027084  1.000000 -0.022002 -0.034860
Volume  0.040910 -0.043383 -0.041824 -0.048414 -0.022002  1.000000  0.014592
Today  -0.026155 -0.010250 -0.002448 -0.006900 -0.034860  0.014592  1.000000


## 4.6.2 Logistic Regression

We now fit a logistic regression in order to predict **Direction** using **Lag1** through **Lag5** and **Volume**. We use the *glm()* function from statsmodels. It takes as an input a formula, a dataset, and a family. In our case, we use the Binomial family to perform the logistic regression.

In [24]:
formula = 'Direction ~ Lag1+Lag2+Lag3+Lag4+Lag5+Volume'

In [26]:
model = sm.formula.glm(formula,df,family = sm.families.Binomial())
result = model.fit()

In [27]:
print(result.summary())

                          Generalized Linear Model Regression Results                           
Dep. Variable:     ['Direction[Down]', 'Direction[Up]']   No. Observations:                 1250
Model:                                              GLM   Df Residuals:                     1243
Model Family:                                  Binomial   Df Model:                            6
Link Function:                                    logit   Scale:                             1.0
Method:                                            IRLS   Log-Likelihood:                -863.79
Date:                                  Mon, 09 Jul 2018   Deviance:                       1727.6
Time:                                          20:22:22   Pearson chi2:                 1.25e+03
No. Iterations:                                       4                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------

The smallest p-value is associated with lag1. However, at a value of 0.145, there is no clear evidence of a real association between **Lag1** and **Direction**. We now list the coefficient of the model.

In [30]:
print("Coeffieients")
print(result.params)
print
print("p-Values")
print(result.pvalues)
print
print("Dependent variables")
print(result.model.endog_names)

Coeffieients
Intercept    0.126000
Lag1         0.073074
Lag2         0.042301
Lag3        -0.011085
Lag4        -0.009359
Lag5        -0.010313
Volume      -0.135441
dtype: float64
p-Values
Intercept    0.600700
Lag1         0.145232
Lag2         0.398352
Lag3         0.824334
Lag4         0.851445
Lag5         0.834998
Volume       0.392404
dtype: float64
Dependent variables
['Direction[Down]', 'Direction[Up]']


The ${\tt predict()}$ function can be used to predict the probability that the market will go down, given values of the predictors. If no data set is supplied to the ${\tt predict()}$ function, then the probabilities are computed for the training data that was used to fit the logistic regression model.

In [32]:
predictions = result.predict()
print(predictions[0:10])

[0.49291587 0.51853212 0.51886117 0.48477764 0.48921884 0.49304354
 0.50734913 0.49077084 0.48238647 0.51116222]


In [34]:
# Assign predictions
glm_pred = ['Up' if x<0.5 else 'Down' for x in predictions]

In [39]:
# Confusion matrix
confusion_matrix(df.Direction,glm_pred)

array([[145, 457],
       [141, 507]])

In [49]:
tn, fp, fn, tp = confusion_matrix(df.Direction,glm_pred).ravel()
print("True Negative = ", tn)
print("False Positive = ", fp)
print("False Negative = ", fn)
print("True Positive = ", tp)

True Negative =  145
False Positive =  457
False Negative =  141
True Positive =  507


The diagonal elements of the confusion matrix indicate correct predictions, while the off-diagonals represent incorrect predictions. Hence our model correctly predicted that the market would go up on 507 days and that it would go down on 145 days, for a total of 507 + 145 = 652 correct predictions. In this case, logistic regression correctly predicted the movement of the market 52.2% of the time. this is confirmed by checking the output of the  𝚌𝚕𝚊𝚜𝚜𝚒𝚏𝚒𝚌𝚊𝚝𝚒𝚘𝚗⎯𝚛𝚎𝚙𝚘𝚛𝚝()classification_report()  function.

In [50]:
# Classification report
print(classification_report(df.Direction,glm_pred))

             precision    recall  f1-score   support

       Down       0.51      0.24      0.33       602
         Up       0.53      0.78      0.63       648

avg / total       0.52      0.52      0.48      1250



At first glance, it appears that the logistic regression model is working
a little better than random guessing. But remember, this result is misleading
because we trained and tested the model on the same set of 1,250 observations.
In other words, 100− 52.2 = 47.8% is the **training error rate**. As we
have seen previously, the training error rate is often overly optimistic — it
tends to underestimate the _test_ error rate. 

In order to better assess the accuracy
of the logistic regression model in this setting, we can fit the model
using part of the data, and then examine how well it predicts the held out
data. This will yield a more realistic error rate, in the sense that in practice
we will be interested in our model’s performance not on the data that
we used to fit the model, but rather on days in the future for which the
market’s movements are unknown.

We creaate a vector containing the observations from 2001 to 2004 and make predictions on the 2005 data.

In [51]:
X_train = df[:'2004'][:]
y_train = df[:'2004']['Direction']

X_test = df['2005':][:]
y_test = df['2005':]['Direction']

In [52]:
# Shape of test set
print(X_test.shape)

(252, 8)


In [54]:
# We create and fit the logistic regression
model = sm.formula.glm(formula=formula, data=X_train, family=sm.families.Binomial())
result = model.fit()

In [56]:
# Make predictions
predictions = result.predict(X_test)
print(predictions[0:10])

Year
2005-01-01    0.471780
2005-01-01    0.484331
2005-01-01    0.477348
2005-01-01    0.486146
2005-01-01    0.501666
2005-01-01    0.498909
2005-01-01    0.497230
2005-01-01    0.490432
2005-01-01    0.495989
2005-01-01    0.489359
dtype: float64


In [57]:
# Assign predictions
glm_pred = ['Up' if x<0.5 else 'Down' for x in predictions]

In [59]:
# Confusion matrix
confusion_matrix(y_test,glm_pred)

array([[77, 34],
       [97, 44]])

In [61]:
tn, fp, fn, tp = confusion_matrix(y_test,glm_pred).ravel()
print("True Negative = ", tn)
print("False Positive = ", fp)
print("False Negative = ", fn)
print("True Positive = ", tp)

True Negative =  77
False Positive =  34
False Negative =  97
True Positive =  44


The results are rather disappointing: the test error rate (1 -  𝚛𝚎𝚌𝚊𝚕𝚕recall ) is 52%, which is worse than random guessing! Of course, this result is not all that surprising, given that one would not generally expect to be able to use previous days’ returns to predict future market performance. (After all, if it were possible to do so, then the authors of this book [along with your professor] would probably be out striking it rich rather than teaching statistics.)

We recall that the logistic regression model had very underwhelming p-values associated with all of the predictors, and that the smallest p-value, though not very small, corresponded to  𝙻𝚊𝚐𝟷Lag1 . Perhaps by removing the variables that appear not to be helpful in predicting  𝙳𝚒𝚛𝚎𝚌𝚝𝚒𝚘𝚗Direction , we can obtain a more effective model. After all, using predictors that have no relationship with the response tends to cause a deterioration in the test error rate (since such predictors cause an increase in variance without a corresponding decrease in bias), and so removing such predictors may in turn yield an improvement.

In the space below, refit a logistic regression using just  𝙻𝚊𝚐𝟷  and  𝙻𝚊𝚐𝟸 , which seemed to have the highest predictive power in the original logistic regression model.

In [62]:
formula = 'Direction ~ Lag1+Lag2+Volume'

In [63]:
model = sm.formula.glm(formula,df,family = sm.families.Binomial())
result = model.fit()

In [64]:
print(result.summary())

                          Generalized Linear Model Regression Results                           
Dep. Variable:     ['Direction[Down]', 'Direction[Up]']   No. Observations:                 1250
Model:                                              GLM   Df Residuals:                     1246
Model Family:                                  Binomial   Df Model:                            3
Link Function:                                    logit   Scale:                             1.0
Method:                                            IRLS   Log-Likelihood:                -863.85
Date:                                  Mon, 09 Jul 2018   Deviance:                       1727.7
Time:                                          21:07:09   Pearson chi2:                 1.25e+03
No. Iterations:                                       4                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------

In [66]:
# Make predictions
predictions = result.predict(X_test)

# Assign predictions
glm_pred = ['Up' if x<0.5 else 'Down' for x in predictions]

In [67]:
# Confusion matrix
confusion_matrix(y_test,glm_pred)

array([[  4, 107],
       [  1, 140]])

In [68]:
tn, fp, fn, tp = confusion_matrix(y_test,glm_pred).ravel()
print("True Negative = ", tn)
print("False Positive = ", fp)
print("False Negative = ", fn)
print("True Positive = ", tp)

True Negative =  4
False Positive =  107
False Negative =  1
True Positive =  140
