## UCI car evaluation - Final CodeAcademy project


In [114]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

In [115]:
cars = pd.read_csv("car.data", header=None)
cars[1000:1100]

Unnamed: 0,0,1,2,3,4,5,6
1000,med,high,3,2,small,med,unacc
1001,med,high,3,2,small,high,unacc
1002,med,high,3,2,med,low,unacc
1003,med,high,3,2,med,med,unacc
1004,med,high,3,2,med,high,unacc
...,...,...,...,...,...,...,...
1095,med,med,2,4,big,low,unacc
1096,med,med,2,4,big,med,acc
1097,med,med,2,4,big,high,vgood
1098,med,med,2,more,small,low,unacc


In [116]:
columns = ["buying","maint","doors","persons","lug_boot","safety","class"]

In [117]:
cars.columns = columns
cars.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


### Preliminary Data analysis

In [118]:
cars.shape

(1728, 7)

In [119]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   class     1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [120]:
cars.describe()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,high,high,3,2,big,high,unacc
freq,432,432,432,576,576,576,1210


In [121]:
cars.isnull().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64

In [122]:
cars.columns

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class'], dtype='object')

In [123]:
cars["maint"].value_counts()

high     432
vhigh    432
med      432
low      432
Name: maint, dtype: int64

In [124]:
# checking distribution of target variable class 
cars["class"].value_counts()

unacc    1210
acc       384
good       69
vgood      65
Name: class, dtype: int64

In [125]:
round(cars["class"].value_counts()/len(cars)*100,2)

unacc    70.02
acc      22.22
good      3.99
vgood     3.76
Name: class, dtype: float64

Conclusion: Looks like there is a high percentage, that the price of a car is unaccurate.

### Data Preprocessing

Encode categorical features to integers

In [126]:
encoder = OrdinalEncoder()

In [127]:
data = encoder.fit_transform(cars.drop('class',axis=1))
df = pd.DataFrame(data,columns=columns[:-1])
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,3.0,3.0,0.0,0.0,2.0,1.0
1,3.0,3.0,0.0,0.0,2.0,2.0
2,3.0,3.0,0.0,0.0,2.0,0.0
3,3.0,3.0,0.0,0.0,1.0,1.0
4,3.0,3.0,0.0,0.0,1.0,2.0
...,...,...,...,...,...,...
1723,1.0,1.0,3.0,2.0,1.0,2.0
1724,1.0,1.0,3.0,2.0,1.0,0.0
1725,1.0,1.0,3.0,2.0,0.0,1.0
1726,1.0,1.0,3.0,2.0,0.0,2.0


### Modelling

In [133]:
X_train, X_test, y_train, y_test = train_test_split(df, cars["class"], test_size=0.2, random_state=101)

In [140]:
len(y_test)

346

In [272]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(df, cars["class"], test_size=0.15, random_state=1000)

### Decision Tree 1)

In [273]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [274]:
tree = DecisionTreeClassifier()

In [275]:
tree.fit(X_train,y_train)

DecisionTreeClassifier()

In [276]:
y_pred = tree.predict(X_test)

In [277]:
accuracy_score(y_test,y_pred)

0.9682080924855492

In [278]:
print(confusion_matrix(y_test, y_pred))

[[ 66   1   2   1]
 [  4   9   0   0]
 [  2   1 249   0]
 [  0   0   0  11]]


In [279]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         acc       0.92      0.94      0.93        70
        good       0.82      0.69      0.75        13
       unacc       0.99      0.99      0.99       252
       vgood       0.92      1.00      0.96        11

    accuracy                           0.97       346
   macro avg       0.91      0.91      0.91       346
weighted avg       0.97      0.97      0.97       346



### Decision Tree 2)

In [280]:
tree.fit(X_train2,y_train2)

DecisionTreeClassifier()

In [281]:
y_pred2 = tree.predict(X_test2)

In [282]:
accuracy_score(y_test2,y_pred2)

0.9807692307692307

In [283]:
print(confusion_matrix(y_test2, y_pred2))

[[ 62   1   1   1]
 [  1   4   0   0]
 [  1   0 176   0]
 [  0   0   0  13]]


In [284]:
print(classification_report(y_test2, y_pred2))

              precision    recall  f1-score   support

         acc       0.97      0.95      0.96        65
        good       0.80      0.80      0.80         5
       unacc       0.99      0.99      0.99       177
       vgood       0.93      1.00      0.96        13

    accuracy                           0.98       260
   macro avg       0.92      0.94      0.93       260
weighted avg       0.98      0.98      0.98       260

