# Lab 2: classification methods

This lab is due by midnight Saturday Feb 19th

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import neighbors
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split


In [2]:
# You will need to change this for your environment
DATA_ROOT = 'Data/'

In [3]:
# Note the 'index_col' argument here, which makes slicing easier below.
market = pd.read_csv(DATA_ROOT + 'Smarket.csv', index_col=0, parse_dates=True)
market.head()

Unnamed: 0_level_0,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2001-01-01,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up
2001-01-01,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up
2001-01-01,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down
2001-01-01,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up
2001-01-01,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up


## Logistic Regression


In [4]:
# We will re-use this formula with other learning methods below
all_lags = 'Direction ~ Lag1+Lag2+Lag3+Lag4+Lag5+Volume'

marklr = smf.glm(formula=all_lags, data=market, family=sm.families.Binomial())
mlr_res = marklr.fit()
print(mlr_res.summary())

# The predicted values are probabilities
mlr_prob = mlr_res.predict()
print('predicted probabilities:', mlr_prob[0:10])

# Here we create a set of qualitative predictions by thresholding on the probabilities
predictions_nominal = ["Up" if x < 0.5 else "Down" for x in mlr_prob]
print('qualitative predictions:', predictions_nominal[0:10])

# Note: the '.T' here to take the transpose so that the true classes are columns and the predicted classes are rows,
# matching the class slides
print('confusion matrix:\n', confusion_matrix(market["Direction"], predictions_nominal).T)

print(classification_report(market["Direction"], predictions_nominal, digits=3))

                          Generalized Linear Model Regression Results                           
Dep. Variable:     ['Direction[Down]', 'Direction[Up]']   No. Observations:                 1250
Model:                                              GLM   Df Residuals:                     1243
Model Family:                                  Binomial   Df Model:                            6
Link Function:                                    logit   Scale:                          1.0000
Method:                                            IRLS   Log-Likelihood:                -863.79
Date:                                  Sat, 12 Feb 2022   Deviance:                       1727.6
Time:                                          16:11:05   Pearson chi2:                 1.25e+03
No. Iterations:                                       4                                         
Covariance Type:                              nonrobust                                         
                 coef    std e

In [6]:
# Split the data into training and test sets, training on everything up to and including 2004 data
# and testing on 2005 and later data:
x_train = market[:'2004'][:]
y_train = market[:'2004']['Direction']

x_test = market['2005':][:]
y_test = market['2005':]['Direction']

In [7]:
# Fit a logistic regression to the training data and (below) evaluate it using the test data
mlr_04 = smf.glm(formula=all_lags, data=x_train, family=sm.families.Binomial())
res_04 = mlr_04.fit()
print(res_04.summary())

# Build predictions of the test data using a 0.5 threshold
prob_04 = res_04.predict(x_test)
pred_04 = ['Up' if x < 0.5 else 'Down' for x in prob_04]

print('confusion matrix:\n', confusion_matrix(y_test, pred_04).T)
print(classification_report(y_test, pred_04))

                          Generalized Linear Model Regression Results                           
Dep. Variable:     ['Direction[Down]', 'Direction[Up]']   No. Observations:                  998
Model:                                              GLM   Df Residuals:                      991
Model Family:                                  Binomial   Df Model:                            6
Link Function:                                    logit   Scale:                          1.0000
Method:                                            IRLS   Log-Likelihood:                -690.55
Date:                                  Sat, 12 Feb 2022   Deviance:                       1381.1
Time:                                          16:16:57   Pearson chi2:                     998.
No. Iterations:                                       4                                         
Covariance Type:                              nonrobust                                         
                 coef    std e

## Your job: build and test a LR model with only the two predictors with the best p-values above

Looking at the model summary above, that will be Lag1 and Lag2.

Build the new model below, and generate a new confusion matrix and classification report as above.

In [8]:
# Build a model using just lag1 and lag2 and test it (skip the code for the lab)

slr = smf.glm(formula='Direction ~ Lag1 + Lag2', data=x_train, family=sm.families.Binomial())
slr_fit = slr.fit()
print(slr_fit.summary())
prob_slr = slr_fit.predict(x_test)
pred_slr = ['Up' if x < 0.5 else 'Down' for x in prob_slr]
print('confusion matrix:\n', confusion_matrix(y_test, pred_slr).T)
print(classification_report(y_test, pred_slr))

                          Generalized Linear Model Regression Results                           
Dep. Variable:     ['Direction[Down]', 'Direction[Up]']   No. Observations:                  998
Model:                                              GLM   Df Residuals:                      995
Model Family:                                  Binomial   Df Model:                            2
Link Function:                                    logit   Scale:                          1.0000
Method:                                            IRLS   Log-Likelihood:                -690.70
Date:                                  Sat, 12 Feb 2022   Deviance:                       1381.4
Time:                                          16:17:18   Pearson chi2:                     998.
No. Iterations:                                       4                                         
Covariance Type:                              nonrobust                                         
                 coef    std e

## Questions 1 - 3

Question 1: How does the overall accuracy of this smaller model compare

A:The overall accuracy of the smaller model actually seems to be better than that of the larger model.


Question 2: Show how to use the confusion matrix to derive the overall accuracy as shown in the classification report.
(The calculations can be typed here and do not have to be shown with code.)

A:35+106/(35+35+76+106) = 55.9%


Question 3: How does the interpretability of the second model compare with the first in your opinion? Justify your answer.

A: Naturally when the amount of variables/lags decreases, the interpretability improves. Thus, when we have a model with 2 lags vs. another model with 5, and the model with 2 also has better accuracy, it strongly indicates that the other three lags were unecessary or unproductive to the efficacy of our model.

## K-Nearest Neighbors

We now build a model for the same data with K-Nearest neighbors

In [9]:
knn = neighbors.KNeighborsClassifier(n_neighbors=1)

# Restrict the training and test data to only have the 'Lag1' and 'Lag2' predictor variables.
# (This code fits the model and makes predictions in one line.)
pred = knn.fit(x_train[['Lag1', 'Lag2']], y_train).predict(x_test[['Lag1', 'Lag2']])

print('KNN confusion matrix:\n', confusion_matrix(y_test, pred).T)
print(classification_report(y_test, pred))

KNN confusion matrix:
 [[43 58]
 [68 83]]
              precision    recall  f1-score   support

        Down       0.43      0.39      0.41       111
          Up       0.55      0.59      0.57       141

    accuracy                           0.50       252
   macro avg       0.49      0.49      0.49       252
weighted avg       0.50      0.50      0.50       252



In [10]:
# KNN with K of 1 performed poorly, let's try K of 3

knn = neighbors.KNeighborsClassifier(n_neighbors=3)
pred = knn.fit(x_train[['Lag1', 'Lag2']], y_train).predict(x_test[['Lag1', 'Lag2']])

print('KNN confusion matrix:\n', confusion_matrix(y_test, pred).T)
print(classification_report(y_test, pred))

KNN confusion matrix:
 [[48 55]
 [63 86]]
              precision    recall  f1-score   support

        Down       0.47      0.43      0.45       111
          Up       0.58      0.61      0.59       141

    accuracy                           0.53       252
   macro avg       0.52      0.52      0.52       252
weighted avg       0.53      0.53      0.53       252



## Your task: try some more values for K (number of neighbors) and report on which has best overall accuracy

In [15]:
# That was an improvement, try some other values to compare

for k in range(1,100,1):
    knn = neighbors.KNeighborsClassifier(n_neighbors=k)
    pred = knn.fit(x_train[['Lag1', 'Lag2']], y_train).predict(x_test[['Lag1', 'Lag2']])
    print(k)
    print('KNN confusion matrix:\n', confusion_matrix(y_test, pred).T)
    print(classification_report(y_test, pred))

1
KNN confusion matrix:
 [[43 58]
 [68 83]]
              precision    recall  f1-score   support

        Down       0.43      0.39      0.41       111
          Up       0.55      0.59      0.57       141

    accuracy                           0.50       252
   macro avg       0.49      0.49      0.49       252
weighted avg       0.50      0.50      0.50       252

2
KNN confusion matrix:
 [[74 93]
 [37 48]]
              precision    recall  f1-score   support

        Down       0.44      0.67      0.53       111
          Up       0.56      0.34      0.42       141

    accuracy                           0.48       252
   macro avg       0.50      0.50      0.48       252
weighted avg       0.51      0.48      0.47       252

3
KNN confusion matrix:
 [[48 55]
 [63 86]]
              precision    recall  f1-score   support

        Down       0.47      0.43      0.45       111
          Up       0.58      0.61      0.59       141

    accuracy                           0.53       

## Question 4:

Question 4: Which of the other K values that you tried for K-Nearest neighbors worked the best, based on overall accuracy?

A: The highest accuracy I observed was for k=74, with an accuracy of 0.55. Others, such as k=71 and k=67 and k=97 had accuracy of 0.54.

# Linear discriminant analysis

In [16]:
lda = LinearDiscriminantAnalysis()
ldm = lda.fit(x_train[['Lag1', 'Lag2']], y_train)

print('Priors:', ldm.priors_)
print('Means:', ldm.means_)
print('Coefficients:', ldm.coef_)

pred = ldm.predict(x_test[['Lag1', 'Lag2']])
print(confusion_matrix(pred, y_test).T)
print(classification_report(y_test, pred))


Priors: [0.49198397 0.50801603]
Means: [[ 0.04279022  0.03389409]
 [-0.03954635 -0.03132544]]
Coefficients: [[-0.05544078 -0.0443452 ]]
[[ 35  76]
 [ 35 106]]
              precision    recall  f1-score   support

        Down       0.50      0.32      0.39       111
          Up       0.58      0.75      0.66       141

    accuracy                           0.56       252
   macro avg       0.54      0.53      0.52       252
weighted avg       0.55      0.56      0.54       252



## Quadratic discriminant analysis

In [17]:
qda = QuadraticDiscriminantAnalysis()
qdm = qda.fit(x_train[['Lag1', 'Lag2']], y_train)

print('Priors:', qdm.priors_)
print('Means:', qdm.means_)

q_pred = qdm.predict(x_test[['Lag1', 'Lag2']])
print(confusion_matrix(q_pred, y_test).T)
print(classification_report(y_test, q_pred))

Priors: [0.49198397 0.50801603]
Means: [[ 0.04279022  0.03389409]
 [-0.03954635 -0.03132544]]
[[ 30  81]
 [ 20 121]]
              precision    recall  f1-score   support

        Down       0.60      0.27      0.37       111
          Up       0.60      0.86      0.71       141

    accuracy                           0.60       252
   macro avg       0.60      0.56      0.54       252
weighted avg       0.60      0.60      0.56       252



## Question 5

Question 5: which of the methods that you tried produced the best results for predicting Direction from Lag1 and Lag2?

A: QDA appears to have had the best results.

# Carseats data

Now load the carseats data and try to predict whether the store is located in the US from the other predictor variables.

Report below on your findings about (at least) three different learning approaches, comparing their overall accuracy.

If you use K-nearest neighbors, be sure to try a few different values for K and report on the best one, showing your work.

If you use logistic regression, try to find a simple model with good accuracy by dropping predictors with high p-values.

In [18]:
seats = pd.read_csv(DATA_ROOT + 'Carseats.csv')
seats.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [26]:
# Pick random training and test sets for your analysis:
x_train, x_test, y_train, y_test = train_test_split(seats, seats['US'],
                                                    train_size=0.8, test_size=0.2)

# Hint: if you need to remove some predictors for training or testing in any of the learning methods,
# you can use the pandas 'drop' function to drop the corresponding columns, e.g.
x_train.drop(columns=['US']).head()

# Hint 2: if you want to write a formula and include a lot of columns, you could use the method
# that was shown in lab 1, e.g.:
#sm.OLS.from_formula('medv ~ ' + '+'.join(df.columns.difference(['medv', 'age', 'indus'])), df)
all_vals = 'US ~ Sales+CompPrice+Income+Advertising+Population+Price+ShelveLoc+Age+Education+Urban'
dropped_vals = 'US ~ Advertising+Population'

In [25]:
# Your code goes here. I would recommend using a different cell for each learning method:

# learning method 1
# using all values in linear regression to see which predictors to drop
mlr_04 = smf.glm(formula=all_vals, data=x_train, family=sm.families.Binomial())
res_04 = mlr_04.fit()
print(res_04.summary())

# Build predictions of the test data using a 0.5 threshold
prob_04 = res_04.predict(x_test)
pred_04 = ['Yes' if x < 0.5 else 'No' for x in prob_04]

print('confusion matrix:\n', confusion_matrix(y_test, pred_04).T)
print(classification_report(y_test, pred_04))
print("----------------------------")

mlr_04 = smf.glm(formula=dropped_vals, data=x_train, family=sm.families.Binomial())
res_04 = mlr_04.fit()
print(res_04.summary())

# Build predictions of the test data using a 0.5 threshold
prob_04 = res_04.predict(x_test)
pred_04 = ['Yes' if x < 0.5 else 'No' for x in prob_04]

print('confusion matrix:\n', confusion_matrix(y_test, pred_04).T)
print(classification_report(y_test, pred_04))

                   Generalized Linear Model Regression Results                   
Dep. Variable:     ['US[No]', 'US[Yes]']   No. Observations:                  320
Model:                               GLM   Df Residuals:                      308
Model Family:                   Binomial   Df Model:                           11
Link Function:                     logit   Scale:                          1.0000
Method:                             IRLS   Log-Likelihood:                -78.553
Date:                   Sat, 12 Feb 2022   Deviance:                       157.11
Time:                           18:11:59   Pearson chi2:                     286.
No. Iterations:                        8                                         
Covariance Type:               nonrobust                                         
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
Inte

In [27]:
# learning method 2
for k in range(1,100,1):
    knn = neighbors.KNeighborsClassifier(n_neighbors=k)
    pred = knn.fit(x_train[['Advertising', 'Population']], y_train).predict(x_test[['Advertising', 'Population']])
    print(k)
    print('KNN confusion matrix:\n', confusion_matrix(y_test, pred).T)
    print(classification_report(y_test, pred))

1
KNN confusion matrix:
 [[23 10]
 [ 5 42]]
              precision    recall  f1-score   support

          No       0.70      0.82      0.75        28
         Yes       0.89      0.81      0.85        52

    accuracy                           0.81        80
   macro avg       0.80      0.81      0.80        80
weighted avg       0.82      0.81      0.82        80

2
KNN confusion matrix:
 [[28 15]
 [ 0 37]]
              precision    recall  f1-score   support

          No       0.65      1.00      0.79        28
         Yes       1.00      0.71      0.83        52

    accuracy                           0.81        80
   macro avg       0.83      0.86      0.81        80
weighted avg       0.88      0.81      0.82        80

3
KNN confusion matrix:
 [[24 10]
 [ 4 42]]
              precision    recall  f1-score   support

          No       0.71      0.86      0.77        28
         Yes       0.91      0.81      0.86        52

    accuracy                           0.82       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


65
KNN confusion matrix:
 [[ 0  0]
 [28 52]]
              precision    recall  f1-score   support

          No       0.00      0.00      0.00        28
         Yes       0.65      1.00      0.79        52

    accuracy                           0.65        80
   macro avg       0.33      0.50      0.39        80
weighted avg       0.42      0.65      0.51        80

66
KNN confusion matrix:
 [[ 0  0]
 [28 52]]
              precision    recall  f1-score   support

          No       0.00      0.00      0.00        28
         Yes       0.65      1.00      0.79        52

    accuracy                           0.65        80
   macro avg       0.33      0.50      0.39        80
weighted avg       0.42      0.65      0.51        80

67
KNN confusion matrix:
 [[ 0  0]
 [28 52]]
              precision    recall  f1-score   support

          No       0.00      0.00      0.00        28
         Yes       0.65      1.00      0.79        52

    accuracy                           0.65    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


80
KNN confusion matrix:
 [[ 0  0]
 [28 52]]
              precision    recall  f1-score   support

          No       0.00      0.00      0.00        28
         Yes       0.65      1.00      0.79        52

    accuracy                           0.65        80
   macro avg       0.33      0.50      0.39        80
weighted avg       0.42      0.65      0.51        80

81
KNN confusion matrix:
 [[ 0  0]
 [28 52]]
              precision    recall  f1-score   support

          No       0.00      0.00      0.00        28
         Yes       0.65      1.00      0.79        52

    accuracy                           0.65        80
   macro avg       0.33      0.50      0.39        80
weighted avg       0.42      0.65      0.51        80

82
KNN confusion matrix:
 [[ 0  0]
 [28 52]]
              precision    recall  f1-score   support

          No       0.00      0.00      0.00        28
         Yes       0.65      1.00      0.79        52

    accuracy                           0.65    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
# learning method 3
qda = QuadraticDiscriminantAnalysis()
qdm = qda.fit(x_train[['Advertising', 'Population']], y_train)

print('Priors:', qdm.priors_)
print('Means:', qdm.means_)

q_pred = qdm.predict(x_test[['Advertising', 'Population']])
print(confusion_matrix(q_pred, y_test).T)
print(classification_report(y_test, q_pred))


Priors: [0.35625 0.64375]
Means: [[  0.55263158 252.80701754]
 [ 10.24271845 280.19417476]]
[[27  1]
 [12 40]]
              precision    recall  f1-score   support

          No       0.69      0.96      0.81        28
         Yes       0.98      0.77      0.86        52

    accuracy                           0.84        80
   macro avg       0.83      0.87      0.83        80
weighted avg       0.88      0.84      0.84        80



## Questions 6-9

(Each of the three questions below carries the same weight as the earlier questions.)

Question 6: What was the first method you tried, and what was its best overall accuracy?

A: Using linear regression with all the variables allowed me to drop all except for 2. The model had an overall accuracy of 0.88.


Question 7: What was the second method you tried, and what was its best overall accuracy?

A: The second method I tried was KNN. The highest accuracy I received using KNN was 0.82 with K=3.


Question 8: What was the third method you tried, and what was its best overall accuracy?

A: The third method I tried was QDA, with an accuracy of 0.84.