In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_validate, cross_val_predict

In [3]:
df = pd.read_csv("car.data", header=None)

df.columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety','label']


In [4]:
# 결측치 확인
missing_values = df.isna().sum()
print (missing_values)
# 결측치가 있는 행 제거
df = df.dropna()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
label       0
dtype: int64


In [6]:
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,label
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [5]:
df_encoded = pd.get_dummies(df, columns=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'])


target = df_encoded['label']


features = df_encoded.drop(columns=['label'])


print(features.head())
print(target.head())

   buying_high  buying_low  buying_med  buying_vhigh  maint_high  maint_low  \
0        False       False       False          True       False      False   
1        False       False       False          True       False      False   
2        False       False       False          True       False      False   
3        False       False       False          True       False      False   
4        False       False       False          True       False      False   

   maint_med  maint_vhigh  doors_2  doors_3  ...  doors_5more  persons_2  \
0      False         True     True    False  ...        False       True   
1      False         True     True    False  ...        False       True   
2      False         True     True    False  ...        False       True   
3      False         True     True    False  ...        False       True   
4      False         True     True    False  ...        False       True   

   persons_4  persons_more  lug_boot_big  lug_boot_med  lug_boot_sma

In [18]:
x = features

y = target

In [19]:
LR = LogisticRegression()
RF = RandomForestClassifier()
DT = DecisionTreeClassifier()
SVM = SVC(kernel = 'rbf',probability=True)

In [20]:
def df_maker(col_num, ind_num, fill):
    col = []
    ind = []
    con = []
    for i in range(0,col_num):
        col.append(fill)
    for i in range(0,ind_num):
        ind.append(fill)
    for i in range(0,ind_num):
        con.append(col)
    return pd.DataFrame(con, columns = col, index = ind)

In [21]:
def cross_val(classlist,train_x,train_y):
    kf = StratifiedKFold(n_splits=5, shuffle=True)
    LR = classlist[0]
    RF = classlist[1]
    DT = classlist[2]
    SVM = classlist[3]
    
    LR.fit(train_x,train_y)
    RF.fit(train_x,train_y)
    DT.fit(train_x,train_y)
    SVM.fit(train_x,train_y)
    
    LR_scores = cross_val_score(LR, train_x, train_y, cv = kf)
    RF_scores = cross_val_score(RF, train_x, train_y, cv = kf)
    DT_scores = cross_val_score(DT, train_x, train_y, cv = kf)
    SVM_scores = cross_val_score(SVM, train_x, train_y, cv = kf)
        
    score_mean = [LR_scores.mean(), RF_scores.mean(), DT_scores.mean(), SVM_scores.mean()]
    
    score = df_maker(4,5,0)
    score.columns = ['LR_scores','RF_scores','DT_scores','SVM_scores']
    score[score.columns[0]] = LR_scores
    score[score.columns[1]] = RF_scores
    score[score.columns[2]] = DT_scores
    score[score.columns[3]] = SVM_scores
    score.index = [1,2,3,4,5]
    
    return score_mean

In [22]:
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=423)

In [23]:
classlist = [LR,RF,DT,SVM]
cross_val(classlist,x,y)

[0.9108888330401272, 0.9618128508000335, 0.9699003099606267, 0.972808913462344]