In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from xgboost import XGBClassifier
import xgboost as xgb


import category_encoders as ce
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_recall_fscore_support,
    precision_score,
    recall_score,
)

In [2]:
# Column names are supplied explicitly via `names=`, so pandas reads the first
# line of the file as data rather than as a header row.
column_headers = [
    'buying',
    'maint',
    'doors',
    'persons',
    'lug_boot',
    'safety',
    'overall_score',
]

data = pd.read_csv(
    './car.data',
    names=column_headers,
    index_col=False,
)

# Preview the first five rows.
data.head(5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,overall_score
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
# Size of the loaded frame as a (rows, columns) tuple.
data.shape

(1728, 7)

## Exploratory Data Analysis

In [4]:
# Per-column non-null counts and dtypes. In the recorded output every column has
# as many non-null entries as the frame has rows (i.e. no null values), and all
# columns are stored with dtype `object`.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   buying         1728 non-null   object
 1   maint          1728 non-null   object
 2   doors          1728 non-null   object
 3   persons        1728 non-null   object
 4   lug_boot       1728 non-null   object
 5   safety         1728 non-null   object
 6   overall_score  1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [5]:
# Collect the names of all columns stored with dtype `object`
categorical_variables = []
for column_name, column_dtype in data.dtypes.items():
    if column_dtype == 'object':
        categorical_variables.append(column_name)

# Summary statistics (count / unique / top / freq) for those columns
categorical_summary = data[categorical_variables].describe()
categorical_summary

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,overall_score
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,low,low,5more,more,big,low,unacc
freq,432,432,432,576,576,576,1210


In [6]:
# Print the share (in percent, rounded to 3 decimals) of each level in every column.
# The denominator is taken from the frame itself rather than a hard-coded row
# count, so the percentages stay correct if the dataset is filtered or replaced.
n_rows = len(data)
for column in column_headers:
    print(round(data[column].value_counts() / n_rows * 100, 3))

low      25.0
med      25.0
vhigh    25.0
high     25.0
Name: buying, dtype: float64
low      25.0
med      25.0
vhigh    25.0
high     25.0
Name: maint, dtype: float64
5more    25.0
2        25.0
3        25.0
4        25.0
Name: doors, dtype: float64
more    33.333
2       33.333
4       33.333
Name: persons, dtype: float64
big      33.333
small    33.333
med      33.333
Name: lug_boot, dtype: float64
low     33.333
med     33.333
high    33.333
Name: safety, dtype: float64
unacc    70.023
acc      22.222
good      3.993
vgood     3.762
Name: overall_score, dtype: float64


In [7]:
# Ordinal encodings: each categorical level is assigned its 1-based rank,
# with levels listed from lowest to highest.
def _ordinal_codes(*levels):
    """Return a dict mapping each level to its 1-based position in `levels`."""
    return {level: rank for rank, level in enumerate(levels, start=1)}


buying_mapper = _ordinal_codes('low', 'med', 'high', 'vhigh')
maint_mapper = _ordinal_codes('low', 'med', 'high', 'vhigh')
doors_mapper = _ordinal_codes('2', '3', '4', '5more')
persons_mapper = _ordinal_codes('2', '4', 'more')
lug_boot = _ordinal_codes('small', 'med', 'big')
safety_mapper = _ordinal_codes('low', 'med', 'high')
score_mapper = _ordinal_codes('unacc', 'acc', 'good', 'vgood')

In [8]:
# Ordinal-encode every column except `buying`.
# `buying` is deliberately left as raw strings because it is used as the class
# label (`y`) further down, so `buying_mapper` is not applied here.
ordinal_mappers = {
    'maint': maint_mapper,
    'doors': doors_mapper,
    'persons': persons_mapper,
    'lug_boot': lug_boot,
    'safety': safety_mapper,
    'overall_score': score_mapper,
}

for column, mapper in ordinal_mappers.items():
    # Make the cell safe to re-run: once a column holds only encoded values,
    # mapping it again through the string-keyed dict would turn it into NaN.
    if data[column].isin(list(mapper.values())).all():
        continue

    encoded = data[column].map(mapper)

    # Series.map yields NaN for labels missing from the dict; fail loudly here
    # instead of letting NaNs reach the estimators.
    unmapped = data.loc[encoded.isna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(f'Unmapped values in column {column!r}: {list(unmapped)}')

    data[column] = encoded

In [9]:
# Target is the `buying` column; every other column is a feature.
target_column = 'buying'
y = data[target_column]
X = data.drop(columns=target_column)

In [10]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

In [11]:
# Candidate weak learners for AdaBoost: shallow decision trees (depth 2-4)
# for both split criteria, gini first and then entropy.
estimators = [
    DecisionTreeClassifier(criterion=criterion, max_depth=depth, random_state=123)
    for criterion in ('gini', 'entropy')
    for depth in (2, 3, 4)
]

# AdaBoost's weak-learner argument was renamed from `base_estimator` to
# `estimator` in scikit-learn 1.2 and the old name was removed in 1.4.
# Pick whichever name the installed version accepts so the grid is valid on both.
weak_learner_param = (
    'estimator' if 'estimator' in AdaBoostClassifier().get_params() else 'base_estimator'
)

# Hyperparameter grid explored by the search.
hyperparameters = {
    weak_learner_param: estimators,
    'n_estimators': [5, 15, 25, 50],
    'learning_rate': [0.01, 0.1, 0.5, 1],
}

# 5-fold cross-validated grid search scored on accuracy, using all CPU cores.
adaboost = GridSearchCV(AdaBoostClassifier(random_state=123), hyperparameters, cv=5, scoring="accuracy", n_jobs=-1)
adaboost.fit(X_train, y_train.values.ravel())
print(f'Best Accuracy score: {adaboost.best_score_:.3f} with parameters: {adaboost.best_params_}')

Best Accuracy score: 0.311 with parameters: {'base_estimator': DecisionTreeClassifier(max_depth=2, random_state=123), 'learning_rate': 0.01, 'n_estimators': 25}


In [12]:
# Score the tuned AdaBoost search object on the held-out test split.
adaboost_prediction = adaboost.predict(X_test)

adaboost_accuracy = accuracy_score(y_test, adaboost_prediction)
adaboost_f1 = f1_score(y_test, adaboost_prediction, average="weighted")

print(f'Accuracy: {adaboost_accuracy*100:.1f}')
print(f'F1 Score: {adaboost_f1*100:.1f}')

Accuracy: 29.7
F1 Score: 27.4


In [13]:
# Per-class precision / recall / F1 for the AdaBoost predictions on the test split.
print(classification_report(y_test, adaboost_prediction))

              precision    recall  f1-score   support

        high       0.24      0.23      0.24       120
         low       0.85      0.12      0.22       137
         med       0.21      0.22      0.22       134
       vhigh       0.33      0.62      0.43       128

    accuracy                           0.30       519
   macro avg       0.41      0.30      0.27       519
weighted avg       0.42      0.30      0.27       519



## SVM 

In [14]:
# Hyperparameter grid for the support-vector classifier.
hyperparameters = dict(
    kernel=['linear', 'rbf', 'poly', 'sigmoid'],
    C=[0.01, 0.1],
    tol=[0.001, 0.01],
    decision_function_shape=['ovr'],
    gamma=[1e-1, 1e-2],
)

# 5-fold cross-validated grid search scored on accuracy, using all CPU cores.
svm = GridSearchCV(
    estimator=SVC(random_state=123),
    param_grid=hyperparameters,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
svm.fit(X_train, y_train.values.ravel())

print(f'Best Accuracy score: {svm.best_score_:.3f} with parameters: {svm.best_params_}')

Best Accuracy score: 0.313 with parameters: {'C': 0.1, 'decision_function_shape': 'ovr', 'gamma': 0.1, 'kernel': 'linear', 'tol': 0.001}


In [15]:
# Score the tuned SVM search object on the held-out test split.
svm_prediction = svm.predict(X_test)

svm_accuracy = accuracy_score(y_test, svm_prediction)
svm_f1 = f1_score(y_test, svm_prediction, average="weighted")

print(f'Accuracy: {svm_accuracy*100:.1f}')
print(f'F1 Score: {svm_f1*100:.1f}')

Accuracy: 32.2
F1 Score: 31.9


In [16]:
# Per-class precision / recall / F1 for the SVM predictions on the test split.
print(classification_report(y_test, svm_prediction))

              precision    recall  f1-score   support

        high       0.27      0.33      0.30       120
         low       0.68      0.19      0.30       137
         med       0.30      0.35      0.32       134
       vhigh       0.31      0.42      0.36       128

    accuracy                           0.32       519
   macro avg       0.39      0.32      0.32       519
weighted avg       0.40      0.32      0.32       519



## Train the best model on the whole dataset

In [17]:
# Rebuild an SVC with the hyperparameters selected by the SVM grid search
# (same seed as before) and fit it on every row, not just the training split.
final_svc_params = {'random_state': 123, **svm.best_params_}
clf = SVC(**final_svc_params)
clf.fit(X, y.values.ravel())

SVC(C=0.1, gamma=0.1, kernel='linear', random_state=123)