In [1]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS, summarize)

In [2]:
from ISLP import confusion_table
from ISLP.models import contrast
from sklearn.discriminant_analysis import \
(LinearDiscriminantAnalysis as LDA,
QuadraticDiscriminantAnalysis as QDA)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## Logistic Regression

In [3]:
Smarket = load_data('Smarket')

#allvars = Smarket.columns.drop(['Today', 'Direction', 'Year']) # all predictors
allvars = ['Lag1','Lag2'] # by removing other less related variable we can improve the correct rate
design = MS(allvars)

X = design.fit_transform(Smarket)
y = Smarket.Direction == 'Up' # True/False response

train = (Smarket.Year < 2005) # Separate the data into train and test
Smarket_train = Smarket.loc[train]
Smarket_test = Smarket.loc[~train]
X_train, X_test = X.loc[train], X.loc[~train]
y_train, y_test = y.loc[train], y.loc[~train]
D = Smarket.Direction
L_train, L_test = D.loc[train], D.loc[~train]

glm_train = sm.GLM(y_train, X_train, family=sm.families.Binomial())
results = glm_train.fit()

probs = results.predict(exog=X_test)
labels = np.array(['Down'] * len(probs))
labels[probs > 0.5] = 'Up'

print(confusion_table(labels, L_test))
print("Correct rate for test set is ", np.mean(labels == L_test))

Truth      Down   Up
Predicted           
Down         35   35
Up           76  106
Correct rate for test set is  0.5595238095238095


## LDA

In [4]:
X_train, X_test = [M.drop(columns = ['intercept']) for M in [X_train, X_test]]

In [5]:
model = LDA(store_covariance = True)
model.fit(X_train, L_train)
model_test = model.predict(X_test)
print(confusion_table(model_test, L_test))

model_prob = model.predict_proba(X_test)
np.all([model.classes_[i] for i in np.argmax(model_prob, axis=1)] == model_test)  # check decision boundary, np.all(np.where(model_prob[:, 1] >= 0.5, 'Up', 'Down') == model_test)

Truth      Down   Up
Predicted           
Down         35   35
Up           76  106


True

## QDA

In [6]:
model = QDA(store_covariance = True)

model.fit(X_train, L_train)
model_test = model.predict(X_test)
print(confusion_table(model_test, L_test))
model_prob = model.predict_proba(X_test)
np.all([model.classes_[i] for i in np.argmax(model_prob, axis=1)] == model_test)  # check decision boundary, np.all(np.where(model_prob[:, 1] >= 0.5, 'Up', 'Down') == model_test)

Truth      Down   Up
Predicted           
Down         30   20
Up           81  121


True

## Naive Bayes

In [7]:
model = GaussianNB()
model.fit(X_train, L_train)
model_test = model.predict(X_test)
print(confusion_table(model_test, L_test))

model_prob = model.predict_proba(X_test)
np.all([model.classes_[i] for i in np.argmax(model_prob, axis=1)] == model_test)  # check decision boundary, np.all(np.where(model_prob[:, 1] >= 0.5, 'Up', 'Down') == model_test)

Truth      Down   Up
Predicted           
Down         29   20
Up           82  121


True

## KNN

In [8]:
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, L_train)
model_test = model.predict(X_test)
print(confusion_table(model_test, L_test))

model_prob = model.predict_proba(X_test)
np.all([model.classes_[i] for i in np.argmax(model_prob, axis=1)] == model_test)  # check decision boundary, np.all(np.where(model_prob[:, 1] >= 0.5, 'Up', 'Down') == model_test)

Truth      Down  Up
Predicted          
Down         48  55
Up           63  86


True

### Example: Caravan

In [16]:
Caravan = load_data('Caravan')
Purchase = Caravan.Purchase
#Purchase.value_counts()
feature_df = Caravan.drop(columns=['Purchase'])

In [24]:
scaler = StandardScaler(with_mean = True, with_std = True, copy = True)
scaler.fit(feature_df)
X_std = scaler.transform(feature_df)
feature_std = pd.DataFrame(X_std, columns = feature_df.columns)

MOSTYPE    -7.749822e-17
MAANTHUI   -1.614038e-16
MGEMOMV    -2.099164e-16
MGEMLEEF    2.154084e-16
MOSHOOFD   -2.929067e-17
                ...     
AZEILPL    -6.102222e-19
APLEZIER   -1.830667e-18
AFIETS      0.000000e+00
AINBOED     3.905422e-17
ABYSTAND    3.356222e-18
Length: 85, dtype: float64

In [64]:
(X_train, X_test, y_train, y_test) = train_test_split(feature_std, Purchase, test_size = 1000, random_state = 0) # randomly devide train and test set
model = KNeighborsClassifier(n_neighbors = 1)
model.fit(X_train, y_train)

model_test = model.predict(X_test)
print(confusion_table(model_test, y_test))

model_prob = model.predict_proba(X_test)
np.all([model.classes_[i] for i in np.argmax(model_prob, axis=1)] == model_test)  # check decision boundary, np.all(np.where(model_prob[:, 1] >= 0.5, 'Up', 'Down') == model_test)

Truth       No  Yes
Predicted          
No         880   58
Yes         53    9


True

In [65]:
# 
print("Prediction accuracy is ", np.mean(model_test == y_test))
print("Prediction accuracy for Yes is ", sum((y_test == "Yes") & (model_test == "Yes")) / sum(y_test == "Yes"))
print("Better than null rate ", np.mean(y_test == "Yes"))

Prediction accuracy is  0.889
Prediction accuracy for Yes is  0.13432835820895522
Better than null rate  0.067
