In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS, summarize)

In [2]:
from ISLP import confusion_table
from ISLP.models import contrast
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis as LDA, QuadraticDiscriminantAnalysis as QDA)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [3]:
Boston = load_data('Boston')
Boston

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,6.48,22.0


In [4]:
Boston['crim01'] = Boston['crim'] > np.median(Boston['crim'])

In [5]:
Boston.corr()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat,medv,crim01
crim,1.0,-0.200469,0.406583,-0.055892,0.420972,-0.219247,0.352734,-0.37967,0.625505,0.582764,0.289946,0.455621,-0.388305,0.409395
zn,-0.200469,1.0,-0.533828,-0.042697,-0.516604,0.311991,-0.569537,0.664408,-0.311948,-0.314563,-0.391679,-0.412995,0.360445,-0.436151
indus,0.406583,-0.533828,1.0,0.062938,0.763651,-0.391676,0.644779,-0.708027,0.595129,0.72076,0.383248,0.6038,-0.483725,0.60326
chas,-0.055892,-0.042697,0.062938,1.0,0.091203,0.091251,0.086518,-0.099176,-0.007368,-0.035587,-0.121515,-0.053929,0.17526,0.070097
nox,0.420972,-0.516604,0.763651,0.091203,1.0,-0.302188,0.73147,-0.76923,0.611441,0.668023,0.188933,0.590879,-0.427321,0.723235
rm,-0.219247,0.311991,-0.391676,0.091251,-0.302188,1.0,-0.240265,0.205246,-0.209847,-0.292048,-0.355501,-0.613808,0.69536,-0.156372
age,0.352734,-0.569537,0.644779,0.086518,0.73147,-0.240265,1.0,-0.747881,0.456022,0.506456,0.261515,0.602339,-0.376955,0.61394
dis,-0.37967,0.664408,-0.708027,-0.099176,-0.76923,0.205246,-0.747881,1.0,-0.494588,-0.534432,-0.232471,-0.496996,0.249929,-0.616342
rad,0.625505,-0.311948,0.595129,-0.007368,0.611441,-0.209847,0.456022,-0.494588,1.0,0.910228,0.464741,0.488676,-0.381626,0.619786
tax,0.582764,-0.314563,0.72076,-0.035587,0.668023,-0.292048,0.506456,-0.534432,0.910228,1.0,0.460853,0.543993,-0.468536,0.608741


In [19]:
features = Boston.drop(columns= ['crim','crim01','ptratio', 'medv', 'rm', 'chas'])
X = MS(features).fit_transform(Boston)
Y = Boston.crim01

In [31]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y , train_size=0.8)

### Logit

In [32]:
logit = sm.Logit(Y_train, X_train)
results = logit.fit()
prob = results.predict(exog= X_test)
label = np.array([False]*len(X_test))
label[prob > 0.5] = True
confusion_table(label, Y_test)

Optimization terminated successfully.
         Current function value: 0.226777
         Iterations 10


Truth,False,True
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
False,45,4
True,9,44


In [33]:
np.mean(label == Y_test)

0.8725490196078431

### LAD

In [34]:
lda = LDA()
results = lda.fit(X_train, Y_train)
pred = results.predict(X_test)
confusion_table(pred,Y_test)

Truth,False,True
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
False,52,12
True,2,36


In [35]:
np.mean(pred == Y_test)

0.8627450980392157

### QDA

In [36]:
Xqda = X.drop(columns= 'intercept')
Xqda_train, Xqda_test = train_test_split(Xqda, train_size=0.8, random_state=0)
qda = QDA()
results = qda.fit(Xqda_train,Y_train)
predqda = results.predict(Xqda_test)
confusion_table(predqda, Y_test)

Truth,False,True
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
False,24,21
True,30,27


In [37]:
np.mean(predqda == Y_test)

0.5

### NB

In [38]:
NB = GaussianNB()
results = NB.fit(X_train, Y_train)
predNB = results.predict(X_test)
confusion_table(predNB, Y_test)

Truth,False,True
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
False,44,10
True,10,38


In [39]:
np.mean(predNB == Y_test)

0.803921568627451

### KNN

In [40]:
knn1 = KNeighborsClassifier(n_neighbors=4)
results = knn1.fit(X_train,Y_train)
pred = knn1.predict(X_test)
confusion_table(pred, Y_test)

Truth,False,True
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
False,52,3
True,2,45


In [41]:
np.mean(pred==Y_test)

0.9509803921568627

### Conclusions

Logit, KNN and LDA seem to be the best models for prediction crimerates higher than the median in Boston based on the available data, performing consistently well for different random samplings of test data 
