In [69]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

%matplotlib inline

In [None]:
df = pd.read_csv("car_evaluation.csv", names=["buying", "maint", "doors", "persons", "lug_boot", "safety", "class"])

In [4]:
df.shape

(1728, 7)

In [11]:
df["class"].value_counts()

unacc    1210
acc       384
good       69
vgood      65
Name: class, dtype: int64

# Preprocess

In [26]:
le = LabelEncoder().fit(df["buying"])
le_nm_1 = dict(zip(le.classes_, le.transform(le.classes_)))
df["buying"] = df["buying"].apply(lambda x: le_nm_1[x])

In [27]:
le_nm_1

{'high': 0, 'low': 1, 'med': 2, 'vhigh': 3}

In [29]:
le = LabelEncoder().fit(df["maint"])
le_nm_2 = dict(zip(le.classes_, le.transform(le.classes_)))
df["maint"] = df["maint"].apply(lambda x: le_nm_2[x])

In [30]:
le_nm_2

{'high': 0, 'low': 1, 'med': 2, 'vhigh': 3}

In [32]:
le = LabelEncoder().fit(df["lug_boot"])
le_nm_3 = dict(zip(le.classes_, le.transform(le.classes_)))
df["lug_boot"] = df["lug_boot"].apply(lambda x: le_nm_3[x])

In [81]:
le_nm_3

{'big': 0, 'med': 1, 'small': 2}

In [33]:
le = LabelEncoder().fit(df["safety"])
le_nm_4 = dict(zip(le.classes_, le.transform(le.classes_)))
df["safety"] = df["safety"].apply(lambda x: le_nm_4[x])

In [34]:
le_nm_4

{'high': 0, 'low': 1, 'med': 2}

In [35]:
le = LabelEncoder().fit(df["class"])
le_nm_5 = dict(zip(le.classes_, le.transform(le.classes_)))
df["class"] = df["class"].apply(lambda x: le_nm_5[x])

In [36]:
le_nm_5

{'acc': 0, 'good': 1, 'unacc': 2, 'vgood': 3}

In [44]:
le = LabelEncoder().fit(df["doors"])
le_nm_6 = dict(zip(le.classes_, le.transform(le.classes_)))
df["doors"] = df["doors"].apply(lambda x: le_nm_6[x])

In [45]:
le_nm_6

{'2': 0, '3': 1, '4': 2, '5more': 3}

In [46]:
le = LabelEncoder().fit(df["persons"])
le_nm_7 = dict(zip(le.classes_, le.transform(le.classes_)))
df["persons"] = df["persons"].apply(lambda x: le_nm_7[x])

In [47]:
le_nm_7

{'2': 0, '4': 1, 'more': 2}

In [48]:
X = df.drop(["class"], axis=1)
y = df["class"]

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4242)

# Random Forest

In [50]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Random Forest: ", accuracy_score(y_test, y_pred))

Random Forest:  0.9826589595375722


In [55]:
rf_params = {
    "n_estimators": [100, 250, 500,],
    "max_depth": [1, 15, 30],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [2, 5],
    "max_features": ["sqrt"],
    "ccp_alpha": [1, 2, 2.5]
}

In [56]:
rf = RandomForestClassifier()
rf_cv_model = GridSearchCV(rf, rf_params, cv=10, n_jobs=-1).fit(X_train, y_train)
rf_cv_model.best_params_

{'ccp_alpha': 1,
 'max_depth': 1,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 100}

In [61]:
rf_tuned = RandomForestClassifier(ccp_alpha=1, max_depth=1, max_features="sqrt", min_samples_leaf=2, min_samples_split=2, n_estimators=100).fit(X_train, y_train)
y_pred = rf_tuned.predict(X_test)
print("Random Forest: ", accuracy_score(y_test, y_pred))

Random Forest:  0.6878612716763006


# Decision Tree

In [63]:
dtr = DecisionTreeClassifier()
dtr.fit(X_train, y_train)
y_pred = dtr.predict(X_test)
print("Decision Tree: ", accuracy_score(y_test, y_pred))

Decision Tree:  0.976878612716763


In [64]:
dtr_params = {
    "splitter": ["best", "random"],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [2, 5, 10],
    "max_features": ["sqrt"],
    "ccp_alpha": [1, 2, 2.5, 5]
}

In [65]:
dtr = DecisionTreeClassifier()
dtr_cv_model = GridSearchCV(dtr, dtr_params, cv=10, n_jobs=-1).fit(X_train, y_train)
dtr_cv_model.best_params_

{'ccp_alpha': 1,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'splitter': 'best'}

In [66]:
dtr_tuned = DecisionTreeClassifier(ccp_alpha=1, max_features="sqrt", min_samples_leaf=2, min_samples_split=2, splitter="best").fit(X_train, y_train)
y_pred = dtr_tuned.predict(X_test)
print("Decision Tree: ", accuracy_score(y_test, y_pred))

Decision Tree:  0.6878612716763006


# SVC

In [70]:
svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
print("SVC: ", accuracy_score(y_test, y_pred))

SVC:  0.9277456647398844


In [71]:
svc_params = {
    "C": [1,2,3,4,5]
}

In [72]:
svc = SVC()
svc_cv_model = GridSearchCV(svc, svc_params, cv=10, n_jobs=-1).fit(X_train, y_train)
svc_cv_model.best_params_

{'C': 5}

In [73]:
svc_tuned = SVC(C=5).fit(X_train, y_train)
y_pred = svc_tuned.predict(X_test)
print("SVC: ", accuracy_score(y_test, y_pred))

SVC:  0.9826589595375722


# KNN

In [74]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("KNN: ", accuracy_score(y_test, y_pred))

KNN:  0.9161849710982659


In [75]:
knn_params = {
    "n_neighbors": np.arange(1, 100)
}

In [76]:
knn = KNeighborsClassifier()
knn_cv_model = GridSearchCV(knn, knn_params, cv=10, n_jobs=-1).fit(X_train, y_train)
knn_cv_model.best_params_

{'n_neighbors': 7}

In [77]:
knn_tuned = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
y_pred = knn_tuned.predict(X_test)
print("KNN: ", accuracy_score(y_test, y_pred))

KNN:  0.9450867052023122


# Logistic Regression

In [78]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print("Logistic Regression: ", accuracy_score(y_test, y_pred))

Logistic Regression:  0.6878612716763006


In [79]:
pickle.dump(svc_tuned, open("SVCmodel.pkl", "wb"))