In [77]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_curve, auc, roc_auc_score
import joblib

In [6]:
columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

In [21]:
data = pd.read_csv("car.data", names=columns)
data

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [22]:
data.columns

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class'], dtype='object')

In [23]:
def label(data, feature):
    feature_label_name = {ni: n for n, ni in enumerate(set(data[feature]))}
    return feature_label_name
buying_label = label(data, 'buying')
maint_label = label(data, 'maint')
doors_label = label(data, 'doors')
person_label = label(data, 'persons')
lug_boot_label = label(data, 'lug_boot')
safety_label = label(data, 'safety')
class_label = label(data, 'class')

In [24]:
df1 = data
df1['buying'] = df1['buying'].map(buying_label)
df1['maint'] = df1['maint'].map(maint_label)
df1['doors'] = df1['doors'].map(doors_label)
df1['persons'] = df1['persons'].map(person_label)
df1['lug_boot'] = df1['lug_boot'].map(lug_boot_label)
df1['safety'] = df1['safety'].map(safety_label)
df1['class'] = df1['class'].map(class_label)
df1

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,0,0,0,0,2,1,1
1,0,0,0,0,2,0,1
2,0,0,0,0,2,2,1
3,0,0,0,0,0,1,1
4,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...
1723,1,1,3,1,0,0,0
1724,1,1,3,1,0,2,2
1725,1,1,3,1,1,1,1
1726,1,1,3,1,1,0,0


In [76]:
lb = LabelEncoder()
df2 = data
for i in df2.columns:
    df2[i] = lb.fit_transform(df2[i])
    
df2['class'].value_counts()

1    1210
3     384
0      69
2      65
Name: class, dtype: int64

In [74]:
X = df1[['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']]
y = df1['class']

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [79]:
def classifier(model, X_train, X_test, y_train, y_test):
    clf = model
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_score = clf.fit(X_train, y_train).decision_function(X_test)
    score = accuracy_score(y_test, y_pred)
    return score, model, y_score

In [80]:
score, lr_model, y_score = classifier(LogisticRegression(), X_train, X_test, y_train, y_test)

In [81]:
score

0.7013888888888888

In [63]:
score, rf = classifier(RandomForestClassifier(), X_train, X_test, y_train, y_test)

In [64]:
score

0.9328703703703703

In [70]:
score, nn = classifier(MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5,2)), X_train, X_test, y_train, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [71]:
score

0.7291666666666666

In [65]:
logit_model = open("logit_model.pkl", "wb")
joblib.dump(lr_model, logit_model)
logit_model.close()

In [67]:
rf_model = open("rf_model.pkl", "wb")
joblib.dump(rf, rf_model)
rf_model.close()

In [72]:
nn_model = open("nn_model.pkl", "wb")
joblib.dump(nn, nn_model)
nn_model.close()