In [3]:
import pandas as pd
import numpy as np

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.metrics import confusion_matrix, classification_report, precision_score

## 4.6.3 Linear Discriminant Analysis

In [126]:
Smarket = pd.read_csv('Smarket.csv', usecols=range(1,10), index_col=0, parse_dates=True)

X_train = Smarket[:'2004'][['Lag1','Lag2']]
y_train = Smarket[:'2004']['Direction']
X_test = Smarket['2005':][['Lag1','Lag2']]
y_test = Smarket['2005':]['Direction']


In [89]:
lda = LDA()
model = lda.fit(X_train, y_train)

In [90]:
model.priors_
#49.2%of the training observations were when the market went up

array([0.49198397, 0.50801603])

In [28]:
model.means_
#tendency for previous days' returns to be positive on the days when market declines
#tendancy for previous days' to be negative, on days when market increases

array([[ 0.04279022,  0.03389409],
       [-0.03954635, -0.03132544]])

In [33]:
model.coef_
#coefficients differ from R

array([[-0.05544078, -0.0443452 ]])

In [43]:
#predict Up or Down

pred = model.predict(X_test)
print(np.unique(pred, return_counts=True)) #predicted 70Down, 182 Up
print()
print(confusion_matrix(pred,y_test))
print()
print(classification_report(y_test, pred, digits=3))

(array(['Down', 'Up'], dtype='<U4'), array([ 70, 182], dtype=int64))

[[ 35  35]
 [ 76 106]]

              precision    recall  f1-score   support

        Down      0.500     0.315     0.387       111
          Up      0.582     0.752     0.656       141

    accuracy                          0.560       252
   macro avg      0.541     0.534     0.522       252
weighted avg      0.546     0.560     0.538       252



In [45]:
#prediction probabilities 

pred_p = model.predict_proba(X_test)
pred_p

array([[0.49017925, 0.50982075],
       [0.4792185 , 0.5207815 ],
       [0.46681848, 0.53318152],
       [0.47400107, 0.52599893],
       [0.49278766, 0.50721234],
       [0.49385615, 0.50614385],
       [0.49510156, 0.50489844],
       [0.4872861 , 0.5127139 ],
       [0.49070135, 0.50929865],
       [0.48440262, 0.51559738],
       [0.49069628, 0.50930372],
       [0.51199885, 0.48800115],
       [0.48951523, 0.51048477],
       [0.47067612, 0.52932388],
       [0.47445929, 0.52554071],
       [0.47995834, 0.52004166],
       [0.49357753, 0.50642247],
       [0.50308938, 0.49691062],
       [0.49788061, 0.50211939],
       [0.48863309, 0.51136691],
       [0.50065681, 0.49934319],
       [0.51087353, 0.48912647],
       [0.50399248, 0.49600752],
       [0.49163351, 0.50836649],
       [0.50417721, 0.49582279],
       [0.50267505, 0.49732495],
       [0.49140429, 0.50859571],
       [0.48059641, 0.51940359],
       [0.48827181, 0.51172819],
       [0.50621869, 0.49378131],
       [0.

In [46]:
np.unique(pred_p[:,1]>0.5, return_counts=True)

#this is the default - same as above 

(array([False,  True]), array([ 70, 182], dtype=int64))

In [48]:
np.stack((pred_p[10:20,1], pred[10:20])).T

#these probabilities output by the model corresponds to the probability that that market will go UP

array([['0.5093037238790318', 'Up'],
       ['0.4880011537380811', 'Down'],
       ['0.510484773063352', 'Up'],
       ['0.5293238777881214', 'Up'],
       ['0.5255407143881711', 'Up'],
       ['0.5200416608518921', 'Up'],
       ['0.5064224705341396', 'Up'],
       ['0.4969106228816935', 'Down'],
       ['0.5021193878585957', 'Up'],
       ['0.5113669134834818', 'Up']], dtype='<U32')

#### Changing the threshold

In [50]:
print(np.unique(pred_p[:,1]>0.9, return_counts=True))

#we want to prodict a market decrease ONLY if posterior proability is at least 90% 
# --- no days in 2005 meet that threshold 

(array([False]), array([252], dtype=int64))


In [51]:
#the greatest posterior probability of decrease in 2005 was: 
max(pred_p[:,1])

0.5422132554518978

## 4.6.4 Quadratic Discriminant Analysis

In [53]:
qda = QDA()
model2 = qda.fit(X_train, y_train)

In [54]:
print(model2.priors_)
print(model2.means_)

[0.49198397 0.50801603]
[[ 0.04279022  0.03389409]
 [-0.03954635 -0.03132544]]


In [58]:
pred2 = model2.predict(X_test)
print(np.unique(pred2, return_counts=True))
print(confusion_matrix(pred2, y_test))
print(classification_report(y_test, pred2, digits=3))

(array(['Down', 'Up'], dtype=object), array([ 50, 202], dtype=int64))
[[ 30  20]
 [ 81 121]]
              precision    recall  f1-score   support

        Down      0.600     0.270     0.373       111
          Up      0.599     0.858     0.706       141

    accuracy                          0.599       252
   macro avg      0.600     0.564     0.539       252
weighted avg      0.599     0.599     0.559       252



## An Application to Carseats Data???

In [129]:
df2 = pd.read_csv('Carseats.csv')
df2 = pd.get_dummies(df2, drop_first=False, columns=['ShelveLoc'])

print(df2.head())
print(df2.shape)
print(df2.columns)

X_train = df2.loc[:319,'Sales':'Education']
y_train_bad = df2.loc[:319,'ShelveLoc_Bad']
y_train_good = df2.loc[:319,'ShelveLoc_Good']
y_train_medium = df2.loc[:319,'ShelveLoc_Medium']

X_test = df2.loc[320:,'Sales':'Education']
y_test_bad = df2.loc[320:,'ShelveLoc_Bad']
y_test_good = df2.loc[320:,'ShelveLoc_Good']
y_test_medium = df2.loc[320:,'ShelveLoc_Medium']

   Sales  CompPrice  Income  Advertising  Population  Price  Age  Education  \
0   9.50        138      73           11         276    120   42         17   
1  11.22        111      48           16         260     83   65         10   
2  10.06        113      35           10         269     80   59         12   
3   7.40        117     100            4         466     97   55         14   
4   4.15        141      64            3         340    128   38         13   

  Urban   US  ShelveLoc_Bad  ShelveLoc_Good  ShelveLoc_Medium  
0   Yes  Yes              1               0                 0  
1   Yes  Yes              0               1                 0  
2   Yes  Yes              0               0                 1  
3   Yes  Yes              0               0                 1  
4   Yes   No              1               0                 0  
(400, 13)
Index(['Sales', 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',
       'Age', 'Education', 'Urban', 'US', 'ShelveLoc_B

In [137]:
lda = LDA()
model_bad = lda.fit(X_train, y_train_bad)
pred_p_bad = model_bad.predict_proba(X_train)

#pred_p_bad
print(np.unique(pred_p_bad, return_counts=True))
print(confusion_matrix(pred_p_bad, y_test_bad))
print(classification_report(y_test_bad, pred_p_bad, digits=3))


(array([4.73656651e-04, 4.80696661e-04, 6.19421571e-04, 8.03844206e-04,
       8.94722899e-04, 9.76682569e-04, 1.05497540e-03, 1.11166565e-03,
       1.15184580e-03, 1.17247201e-03, 1.18029478e-03, 1.18632760e-03,
       1.67843342e-03, 1.79958728e-03, 1.82168677e-03, 1.91364501e-03,
       1.97304885e-03, 2.07333144e-03, 2.25304478e-03, 2.33320173e-03,
       2.64228926e-03, 2.64380835e-03, 2.74581553e-03, 2.75889308e-03,
       2.79164640e-03, 2.79596491e-03, 3.01588054e-03, 3.03263050e-03,
       3.34721004e-03, 3.91072368e-03, 4.23790064e-03, 4.24521697e-03,
       4.36536683e-03, 4.56216788e-03, 5.46575855e-03, 5.76584682e-03,
       5.76811406e-03, 6.20098580e-03, 6.20689930e-03, 6.32953915e-03,
       6.39150154e-03, 6.52847438e-03, 7.10447980e-03, 7.85451167e-03,
       7.87525706e-03, 7.94009141e-03, 8.01702820e-03, 8.66455538e-03,
       8.81568635e-03, 9.57452793e-03, 9.72638886e-03, 1.01983609e-02,
       1.04897127e-02, 1.17158803e-02, 1.22883141e-02, 1.24378583e-02,
     

ValueError: Found input variables with inconsistent numbers of samples: [320, 80]

In [None]:
pred_p