In [2]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, GridSearchCV,cross_val_score
from sklearn.preprocessing import MinMaxScaler, label_binarize
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.naive_bayes import GaussianNB

In [4]:
# Read the raw table; header=None makes pandas label the columns 0..6.
car = pd.read_csv('car.data.txt', header=None)
# Positions 0-5 are the input features, position 6 is the target label.
X = car.iloc[:, 0:6]
y = car.iloc[:, 6]

In [5]:
# Display every feature row that has at least one missing value
# (an empty frame means there is nothing to impute).
X.loc[X.isna().any(axis='columns')]

Unnamed: 0,0,1,2,3,4,5


In [6]:
# Count the rows per target class to see how imbalanced the labels are.
y.value_counts()

unacc    1210
acc       384
good       69
vgood      65
Name: 6, dtype: int64

In [7]:
# Peek at the first three raw rows (all columns are still categorical strings here).
car.head(3)

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc


In [8]:
# Encode the six ordinal features as evenly spaced floats in [0, 1].
#
# The previous version called `.replace(..., inplace=True)` on `X.iloc[...]`
# selections. That only changes X when pandas happens to hand back a view
# (hence the SettingWithCopy warning); with Copy-on-Write it silently does
# nothing and the models would receive raw strings. Here X is rebuilt
# explicitly from `car`, which also makes the cell safe to re-run.
ordinal_levels = {
    0: {'low': 0, 'med': 1/3, 'high': 2/3, 'vhigh': 1},
    1: {'low': 0, 'med': 1/3, 'high': 2/3, 'vhigh': 1},
    2: {'2': 0, '3': 1/3, '4': 2/3, '5more': 1},
    3: {'2': 0, '4': 0.5, 'more': 1},
    4: {'small': 0, 'med': 0.5, 'big': 1},
    5: {'low': 0, 'med': 0.5, 'high': 1},
}

encoded_columns = {}
for position, level_map in ordinal_levels.items():
    # Compare as strings so a column parsed as integers still matches the '2', '4', ... keys.
    raw_levels = car.iloc[:, position].astype(str)
    unmapped = set(raw_levels) - set(level_map)
    if unmapped:
        # Fail loudly instead of passing NaN / leftover strings to the models.
        raise ValueError(
            'Unexpected level(s) %s in feature column %s' % (sorted(unmapped), position)
        )
    encoded_columns[car.columns[position]] = raw_levels.map(level_map).astype(float)

# Same column labels and row index as the original feature slice, now float64.
X = pd.DataFrame(encoded_columns, index=car.index)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  limit=limit, regex=regex)


In [9]:
# Hold out 20% of the rows as a test split; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=45, test_size=0.2
)
# One empty slot per model; the model cells below store their CV 'mean' and 'std' here.
f_measure_score = {
    model_name: {} for model_name in ('decision_tree', 'knn', 'logistic', 'NB', 'svm')
}

In [10]:
# Splitter shared by every grid search and cross_val_score call below:
# 10 shuffled, class-stratified folds with a fixed seed.
cv = StratifiedKFold(
    random_state=45,
    shuffle=True,
    n_splits=10,
)

#### Decision Tree

In [11]:
# Search space for the tree: entropy splits, depth 8-10, leaf size 1-3, fixed seed.
para_tree = {
    'criterion': ['entropy'],
    'max_depth': [8, 9, 10],
    'min_samples_leaf': [1, 2, 3],
    'random_state': [45],
}
d_tree = DecisionTreeClassifier()
grid_tree = GridSearchCV(estimator=d_tree, param_grid=para_tree, scoring='f1_micro', cv=cv)
# Tune on the training split, then predict the held-out split with the selected tree.
y_pred_tree = grid_tree.fit(X_train, y_train).predict(X_test)
# Cross-validate the whole grid search on the full data set (search nested inside each fold).
nested_score_tree = cross_val_score(grid_tree, X=X, y=y, cv=cv)
f_measure_score['decision_tree'].update(
    mean=nested_score_tree.mean(),
    std=nested_score_tree.std(),
)

In [13]:
# Per-class precision / recall / F1 of the tuned tree on the held-out split.
report_tree = classification_report(y_test, y_pred_tree)
print('precision,recall,f-measure\n', report_tree)

precision,recall,f-measure
              precision    recall  f1-score   support

        acc       0.92      0.99      0.95        74
       good       0.81      0.87      0.84        15
      unacc       1.00      0.98      0.99       243
      vgood       1.00      1.00      1.00        14

avg / total       0.98      0.97      0.97       346



In [14]:
# Hyper-parameter combination the grid search selected for the decision tree.
grid_tree.best_params_

{'criterion': 'entropy',
 'max_depth': 10,
 'min_samples_leaf': 2,
 'random_state': 45}

#### KNN

In [15]:
# Search space for k-NN: k from 12 to 16, with uniform or distance-weighted votes.
para_knn = {
    'n_neighbors': [12, 13, 14, 15, 16],
    'weights': ['uniform', 'distance'],
}
knn = KNeighborsClassifier()
grid_knn = GridSearchCV(estimator=knn, param_grid=para_knn, scoring='f1_micro', cv=cv)
# Tune on the training split, then predict the held-out split with the selected model.
y_pred_knn = grid_knn.fit(X_train, y_train).predict(X_test)
# Cross-validate the whole grid search on the full data set (search nested inside each fold).
nested_score_knn = cross_val_score(grid_knn, X=X, y=y, cv=cv)
f_measure_score['knn'].update(
    mean=nested_score_knn.mean(),
    std=nested_score_knn.std(),
)

In [16]:
# Per-class precision / recall / F1 of the tuned k-NN model on the held-out split.
report_knn = classification_report(y_test, y_pred_knn)
print('precision,recall,f-measure\n', report_knn)

precision,recall,f-measure
              precision    recall  f1-score   support

        acc       0.84      0.99      0.91        74
       good       0.80      0.53      0.64        15
      unacc       1.00      0.98      0.99       243
      vgood       1.00      0.64      0.78        14

avg / total       0.95      0.95      0.95       346



In [17]:
# Hyper-parameter combination the grid search selected for k-NN.
grid_knn.best_params_

{'n_neighbors': 14, 'weights': 'distance'}

#### Logistic Regression

In [18]:
# Only the inverse regularisation strength C is searched.
para_log = {'C': [10, 100, 1000]}
logistic = LogisticRegression(
    penalty='l2',
    solver='lbfgs',
    multi_class='multinomial',
    random_state=45,
)
grid_log = GridSearchCV(estimator=logistic, param_grid=para_log, scoring='f1_micro', cv=cv)
# Tune on the training split, then predict the held-out split with the selected model.
y_pred_log = grid_log.fit(X_train, y_train).predict(X_test)
# Cross-validate the whole grid search on the full data set (search nested inside each fold).
nested_score_log = cross_val_score(grid_log, X=X, y=y, cv=cv)
f_measure_score['logistic'].update(
    mean=nested_score_log.mean(),
    std=nested_score_log.std(),
)

In [19]:
# Per-class precision / recall / F1 of the tuned logistic regression on the held-out split.
report_log = classification_report(y_test, y_pred_log)
print('precision,recall,f-measure\n', report_log, '\n')

precision,recall,f-measure
              precision    recall  f1-score   support

        acc       0.67      0.59      0.63        74
       good       0.75      0.60      0.67        15
      unacc       0.88      0.91      0.89       243
      vgood       0.78      1.00      0.88        14

avg / total       0.82      0.83      0.83       346
 



In [20]:
# Regularisation strength the grid search selected for logistic regression.
grid_log.best_params_

{'C': 100}

#### Naive Bayes

In [21]:
# Gaussian Naive Bayes is used with its defaults (no grid search), so it is fit
# directly on the training split and then used to predict the held-out split.
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
# Request micro-F1 explicitly. Without `scoring`, cross_val_score falls back to the
# estimator's default score (accuracy), whereas every other model stored in
# f_measure_score is scored with 'f1_micro' through its GridSearchCV.
nested_score_nb = cross_val_score(nb, X = X, y = y, cv = cv, scoring = 'f1_micro')
f_measure_score['NB']['mean'] = np.mean(nested_score_nb)
f_measure_score['NB']['std'] = np.std(nested_score_nb)

In [22]:
# Per-class precision / recall / F1 of Naive Bayes on the held-out split.
report_nb = classification_report(y_test, y_pred_nb)
print('precision,recall,f-measure\n', report_nb, '\n')

precision,recall,f-measure
              precision    recall  f1-score   support

        acc       0.59      0.18      0.27        74
       good       0.86      0.40      0.55        15
      unacc       0.86      0.84      0.85       243
      vgood       0.18      1.00      0.30        14

avg / total       0.77      0.68      0.69       346
 



#### SVM

In [23]:
# The grid holds a single candidate (RBF kernel, C=10, gamma=5).
para_svm = {
    'kernel': ['rbf'],
    'C': [10],
    'gamma': [5],
}
svm = SVC(probability=True, random_state=45)
grid_svm = GridSearchCV(estimator=svm, param_grid=para_svm, scoring='f1_micro', cv=cv)
# Fit on the training split, then predict the held-out split.
y_pred_svm = grid_svm.fit(X_train, y_train).predict(X_test)
# Cross-validate the grid-searched SVM on the full data set.
nested_score_svm = cross_val_score(grid_svm, X=X, y=y, cv=cv)
f_measure_score['svm'].update(
    mean=nested_score_svm.mean(),
    std=nested_score_svm.std(),
)

In [24]:
# Per-class precision / recall / F1 of the SVM on the held-out split.
report_svm = classification_report(y_test, y_pred_svm)
print('precision,recall,f-measure\n', report_svm, '\n')

precision,recall,f-measure
              precision    recall  f1-score   support

        acc       0.99      0.95      0.97        74
       good       0.94      1.00      0.97        15
      unacc       0.99      1.00      0.99       243
      vgood       1.00      1.00      1.00        14

avg / total       0.99      0.99      0.99       346
 



In [25]:
# Parameters of the selected SVM (the grid contains only this one combination).
grid_svm.best_params_

{'C': 10, 'gamma': 5, 'kernel': 'rbf'}

#### Model Comparison

In [26]:
# Print each model's cross-validated mean / std, in the order the models were registered.
for model_name in f_measure_score:
    print(model_name, ': ', f_measure_score[model_name])

decision_tree :  {'mean': 0.9722400027855788, 'std': 0.00990523827453036}
knn :  {'mean': 0.9554795430881123, 'std': 0.015399220191967707}
logistic :  {'mean': 0.8298503468460193, 'std': 0.025511416781890617}
NB :  {'mean': 0.6978159935533751, 'std': 0.029350048530706916}
svm :  {'mean': 0.9948008160802164, 'std': 0.004026972842290787}


In [27]:
# GridSearchCV.score() reports the metric passed as `scoring` ('f1_micro' for
# grid_svm), not accuracy. Score with the refit best estimator instead: a
# classifier's own score() is mean accuracy, so the value matches its label even
# if the grid's scoring metric is changed later.
accuracy_svm = grid_svm.best_estimator_.score(X_test, y_test)
print('accuracy of SVM: ', accuracy_svm)

accuracy of SVM:  0.9855491329479769
