### 필수과제 1
- 수업시간에 제공한 데이터를 가지고
- DT나 다른 알고리즘으로 진행했을 때 어떤 결과가 나오는지?
- 수치만 적는 게 아니라 꼭 모델링별로 설명도 부탁드립니다.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): presumably this also hides scikit-learn convergence and
# failed-fit warnings raised during the grid searches below - confirm before
# trusting those results.
warnings.filterwarnings("ignore")

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import category_encoders as ce

In [3]:
# Load the car-evaluation data from the Excel workbook into a DataFrame.
df = pd.read_excel("car_evaluation.xlsx")

In [4]:
# Show the loaded DataFrame as the cell output.
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [5]:
# Print each column's name, non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   class     1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [6]:
# Check for missing values: count the NaN entries in each column.
df.isna().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64

In [7]:
# Separate the target column ("class") from the feature columns.
y = df["class"]
X = df.drop(columns=["class"])

# Hold out 30% of the rows as the test set; the fixed seed keeps the split
# repeatable.
# NOTE(review): the split is not stratified on y - check whether the class
# proportions need to match between the training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [8]:
# Feature columns whose category labels are replaced by integer codes.
# NOTE(review): no explicit `mapping` is passed, so presumably the integer
# given to each label does not follow the labels' natural order
# (e.g. low < med < high < vhigh) - confirm against the category_encoders
# documentation.
CATEGORICAL_COLUMNS = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
encoder = ce.OrdinalEncoder(cols=CATEGORICAL_COLUMNS)

In [9]:
# Fit the encoder on the training features only, then apply that same fitted
# encoder to the test features.
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

### LogisticRegression

In [10]:
# Baseline: logistic regression with default hyperparameters and a fixed seed.
lr_clf = LogisticRegression(random_state=111).fit(X_train, y_train)
y_pred = lr_clf.predict(X_test)

# Report accuracy on the held-out test set.
print("모델 정확도 {0:0.4f}".format(accuracy_score(y_true=y_test, y_pred=y_pred)))

모델 정확도 0.6821


In [11]:
# Coarse grid search over penalty type and regularisation strength for
# logistic regression (3-fold CV, scored by accuracy).
#
# The estimator's default "lbfgs" solver only accepts penalty "l2" or None, so
# a single flat grid makes every "l1"/"elasticnet" candidate fail to fit (and,
# with warnings silenced, silently score NaN). Those penalties are therefore
# searched with the "saga" solver, which supports them; "elasticnet" also
# requires an l1_ratio. The total stays at 48 candidates.
lr_shared = {"C": [0.01, 0.1, 1, 5, 10, 100], "max_iter": [100, 500]}
lr_params = [
    {"penalty": ["l2", None], **lr_shared},
    {"penalty": ["l1"], "solver": ["saga"], **lr_shared},
    {"penalty": ["elasticnet"], "solver": ["saga"], "l1_ratio": [0.5], **lr_shared},
]

grid_cv = GridSearchCV(lr_clf, param_grid=lr_params, scoring="accuracy", cv=3, verbose=1)
grid_cv.fit(X_train, y_train)
# Best mean cross-validated accuracy and the parameters that produced it.
print("평균 정확도:{0:.4f}".format(grid_cv.best_score_))
print("최적의 하이퍼파라미터", grid_cv.best_params_)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
평균 정확도:0.6964
최적의 하이퍼파라미터 {'C': 0.01, 'max_iter': 100, 'penalty': 'l2'}


In [12]:
# Finer grid search around the coarse result (penalty "l2", C near 0.01),
# additionally varying the class weighting and the solver.
#
# C is the inverse regularisation strength and must be strictly positive:
# C=0 is rejected by LogisticRegression, so every candidate using it fails to
# fit. A small positive value (0.001) is searched instead; the grid still has
# 36 candidates.
lr_params = {
    "penalty": ["l2"],
    "C": [0.001, 0.01, 0.05],
    "class_weight": ["balanced", None],
    "max_iter": [100],
    "solver": ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"]
}

grid_cv = GridSearchCV(lr_clf, param_grid=lr_params, scoring="accuracy", cv=3, verbose=1)
grid_cv.fit(X_train, y_train)
# Best mean cross-validated accuracy and the parameters that produced it.
print("평균 정확도:{0:.4f}".format(grid_cv.best_score_))
print("최적의 하이퍼파라미터", grid_cv.best_params_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
평균 정확도:0.6998
최적의 하이퍼파라미터 {'C': 0.01, 'class_weight': None, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}


In [13]:
# Refit logistic regression with the hyperparameters chosen by the grid
# search, then score it on the held-out test set.
lr_best_params = {"C": 0.01, "max_iter": 100, "penalty": "l2", "solver": "liblinear"}
lr_clf = LogisticRegression(random_state=111, **lr_best_params)
lr_clf.fit(X_train, y_train)
y_pred = lr_clf.predict(X_test)

print("모델 정확도 {0:0.4f}".format(accuracy_score(y_true=y_test, y_pred=y_pred)))

모델 정확도 0.7071


- 튜닝 전 정확도 0.6821, 튜닝 후 정확도 0.7071
- 로지스틱 회귀는 특성과 로그 오즈(log-odds) 사이의 선형 관계를 가정하기 때문에, 그러한 선형 관계가 성립하지 않는 데이터에서는 성능이 떨어진다.

### KNeighborsClassifier

In [14]:
# Sweep k = 1..20 for k-nearest neighbours, printing the accuracy for each k
# and collecting it in acc_val (acc_val[i] holds the accuracy for k = i + 1).
# NOTE(review): accuracy is measured on the test split, so a k chosen from
# this sweep has been selected using the test data - consider selecting k by
# cross-validation on the training split instead.
acc_val = []
for K in range(1, 21):  # iterate over k directly instead of rebinding the loop variable
    model = KNeighborsClassifier(n_neighbors=K)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    acc_val.append(accuracy)
    print("k", "=", K, accuracy)

k = 1 0.7071290944123314
k = 2 0.6820809248554913
k = 3 0.7938342967244701
k = 4 0.74373795761079
k = 5 0.7996146435452793
k = 6 0.7842003853564548
k = 7 0.7803468208092486
k = 8 0.7784200385356455
k = 9 0.7784200385356455
k = 10 0.7649325626204239
k = 11 0.7687861271676301
k = 12 0.7726396917148363
k = 13 0.7475915221579962
k = 14 0.7398843930635838
k = 15 0.7398843930635838
k = 16 0.7418111753371869
k = 17 0.7456647398843931
k = 18 0.7475915221579962
k = 19 0.7360308285163777
k = 20 0.7456647398843931


In [17]:
# k-nearest neighbours with k=5, scored on the held-out test set.
knn_clf = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn_clf.predict(X_test)

print("모델 정확도 {0:0.4f}".format(accuracy_score(y_true=y_test, y_pred=y_pred)))

모델 정확도 0.7996


- k=5일 때 정확도 0.7996
- 약 0.8 정도의 정확도로, 로지스틱 회귀보다 높게 나타난다.

### DecisionTreeClassifier

In [18]:
# Baseline: decision tree with default hyperparameters and a fixed seed.
dt_clf = DecisionTreeClassifier(random_state=111).fit(X_train, y_train)
y_pred = dt_clf.predict(X_test)

# Report accuracy on the held-out test set.
print("모델 정확도 {0:0.4f}".format(accuracy_score(y_true=y_test, y_pred=y_pred)))

모델 정확도 0.9056


In [19]:
# Coarse grid search for the decision tree (3-fold CV, scored by accuracy).
# NOTE(review): the feature matrix appears to have 6 columns (df.info() lists
# 7 including "class"), so max_features=7 exceeds it - confirm how the
# installed scikit-learn handles that value.
# NOTE(review): "entropy" and "log_loss" presumably select the same split
# criterion, which would make one of them redundant - verify in the
# DecisionTreeClassifier documentation.
dt_params = dict(
    criterion=["gini", "entropy", "log_loss"],
    max_depth=[4, 8, 12, 16],
    min_samples_split=[5, 10, 15, 20],
    max_features=list(range(1, 8)),
)

grid_cv = GridSearchCV(
    estimator=dt_clf,
    param_grid=dt_params,
    scoring="accuracy",
    cv=3,
    verbose=1,
)
grid_cv.fit(X_train, y_train)

# Best mean cross-validated accuracy and the parameters that produced it.
print("평균 정확도:{0:.4f}".format(grid_cv.best_score_))
print("최적의 하이퍼파라미터", grid_cv.best_params_)

Fitting 3 folds for each of 336 candidates, totalling 1008 fits
평균 정확도:0.8718
최적의 하이퍼파라미터 {'criterion': 'gini', 'max_depth': 12, 'max_features': 6, 'min_samples_split': 5}


In [20]:
# Finer grid search for the decision tree: depths 9-15 and
# min_samples_split 2-9, with max_features held at 5.
# NOTE(review): the coarse search reported max_features=6 as best, yet this
# grid pins it to 5 - confirm that is intended.
dt_params = dict(
    max_depth=list(range(9, 16)),
    min_samples_split=list(range(2, 10)),
    max_features=[5],
)

grid_cv = GridSearchCV(
    estimator=dt_clf,
    param_grid=dt_params,
    scoring="accuracy",
    cv=3,
    verbose=1,
)
grid_cv.fit(X_train, y_train)

# Best mean cross-validated accuracy and the parameters that produced it.
print("평균 정확도:{0:.4f}".format(grid_cv.best_score_))
print("최적의 하이퍼파라미터", grid_cv.best_params_)

Fitting 3 folds for each of 56 candidates, totalling 168 fits
평균 정확도:0.8850
최적의 하이퍼파라미터 {'max_depth': 11, 'max_features': 5, 'min_samples_split': 3}


In [21]:
# Refit the decision tree with the hyperparameters chosen by the grid search,
# then score it on the held-out test set.
dt_best_params = {"max_depth": 11, "max_features": 5, "min_samples_split": 3}
dt_clf = DecisionTreeClassifier(random_state=111, **dt_best_params)
dt_clf.fit(X_train, y_train)
y_pred = dt_clf.predict(X_test)

print("모델 정확도 {0:0.4f}".format(accuracy_score(y_true=y_test, y_pred=y_pred)))

모델 정확도 0.9056


- 튜닝 전 정확도 0.9056, 튜닝 후 정확도 0.9056
- 로지스틱 회귀와 KNN 분류기보다 높은 정확도를 보인다.

### VotingClassifier

In [25]:
# Hard voting with default hyperparameters: each base model casts one vote and
# the majority class is predicted (voting is left at its default).
# The base estimators are seeded like the other models in this file so that
# repeated runs give the same score; they were previously unseeded.
lr_clf = LogisticRegression(random_state=111)
knn_clf = KNeighborsClassifier(n_neighbors=5)
dt_clf = DecisionTreeClassifier(random_state=111)

vo_clf = VotingClassifier(estimators=[("LR",lr_clf),("KNN",knn_clf),("DT",dt_clf)])

vo_clf.fit(X_train, y_train)
pred = vo_clf.predict(X_test)
# Accuracy of the ensemble on the held-out test set.
print(accuracy_score(y_test, pred))

0.8208092485549133


In [26]:
# Soft voting with default hyperparameters: the base models' predicted class
# probabilities are combined instead of their hard votes.
# The base estimators are seeded like the other models in this file so that
# repeated runs give the same score; they were previously unseeded.
lr_clf = LogisticRegression(random_state=111)
knn_clf = KNeighborsClassifier(n_neighbors=5)
dt_clf = DecisionTreeClassifier(random_state=111)

vo_clf = VotingClassifier(estimators=[("LR",lr_clf),("KNN",knn_clf),("DT",dt_clf)], 
                          voting="soft")

vo_clf.fit(X_train, y_train)
pred = vo_clf.predict(X_test)
# Accuracy of the ensemble on the held-out test set.
print(accuracy_score(y_test, pred))

0.8921001926782274


In [27]:
# Hard voting with the tuned base models (voting is left at its default).
lr_clf = LogisticRegression(
    C=0.01, max_iter=100, penalty="l2", solver="liblinear", random_state=111
)
knn_clf = KNeighborsClassifier(n_neighbors=5)
dt_clf = DecisionTreeClassifier(
    max_depth=11, max_features=5, min_samples_split=3, random_state=111
)

# (name, estimator) pairs that make up the ensemble.
voters = [("LR", lr_clf), ("KNN", knn_clf), ("DT", dt_clf)]
vo_clf = VotingClassifier(estimators=voters)
vo_clf.fit(X_train, y_train)

# Accuracy of the ensemble on the held-out test set.
pred = vo_clf.predict(X_test)
print(accuracy_score(y_test, pred))

0.8188824662813102


In [28]:
# Soft voting with the tuned base models: predicted class probabilities are
# combined instead of hard votes.
lr_clf = LogisticRegression(
    C=0.01, max_iter=100, penalty="l2", solver="liblinear", random_state=111
)
knn_clf = KNeighborsClassifier(n_neighbors=5)
dt_clf = DecisionTreeClassifier(
    max_depth=11, max_features=5, min_samples_split=3, random_state=111
)

# (name, estimator) pairs that make up the ensemble.
voters = [("LR", lr_clf), ("KNN", knn_clf), ("DT", dt_clf)]
vo_clf = VotingClassifier(estimators=voters, voting="soft")
vo_clf.fit(X_train, y_train)

# Accuracy of the ensemble on the held-out test set.
pred = vo_clf.predict(X_test)
print(accuracy_score(y_test, pred))

0.882466281310212


- 기본값 파라미터의 soft voting이 가장 높은 정확도 0.8921을 보인다.
- 전반적으로 모든 모델에서 과적합이 발생하였고, 모두 Random Forest에 비해 성능이 떨어진다는 공통점이 있다.