# Total process

라이브러리 임포트 -> 데이터셋 로드 -> X,y에 데이터 할당 -> train_test_split 두 번 사용해서 train, valid, test 나눔 -> 하이퍼 파라미터 리스트 만들고 for 반복문으로 accuracy와 confusion matrix 비교, accuracy로 한계가 있어 f1-score도 활용 (pandas dataframe) -> optimal 값을 선정하고 test셋에 적용 후 accuracy와 confusion matrix 도출  

+gridsearchcv도 사용해보았습니다.


In [1]:
# import Library

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

In [2]:
# Load Dataset
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()

print(cancer['DESCR'])

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [3]:
# assign X and y
X, y = cancer.data, cancer.target

In [4]:
# Simple Explore data

# size of X (feature num)
print(X.shape)

# size of shape (target class num)
print(y.shape)

(569, 30)
(569,)


In [5]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=3)

In [6]:
# size of train, test, valid

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)
print(X_val.shape, y_val.shape)

(364, 30) (114, 30)
(364,) (114,)
(91, 30) (91,)


# Hyperparameter
## max_depth, max_features

max_depth : depth of tree, default=None->divide until class value perfectly decided->Overfitting  
max_features : considering number of feature for optimal division. default=None->use all of feature of dataset.

In [7]:
# validation

# setting candidate of hyperparameter

max_depth_list = [3, 5, 7, 10]
max_features_list = [4, 5, 6] # usually use sqrt or log2() of features

accuracy_score_val_list = []
f1_score_val_list = []

accuracy_score_train_list = []
f1_score_train_list = []

for max_depth in max_depth_list:
    for max_features in max_features_list:
        
        clf = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features, random_state=3)
        clf.fit(X_train, y_train)
        
        prediction_val = clf.predict(X_val)
        
        accuracy_val = accuracy_score(prediction_val, y_val)
        f1_val = f1_score(y_val, prediction_val)
        cf_val = confusion_matrix(y_val, prediction_val)
        accuracy_score_val_list.append(accuracy_val)
        f1_score_val_list.append(f1_val)
        
        prediction_train = clf.predict(X_train)
        accuracy_train = accuracy_score(prediction_train, y_train)
        accuracy_score_train_list.append(accuracy_train)
        f1_train = f1_score(y_train, prediction_train)
        f1_score_train_list.append(f1_train)
        cf_train = confusion_matrix(y_train,prediction_train)
        
        print("max_depth: {}, max_features: {}".format(max_depth, max_features))
        print("accuracy score_train: {}".format(accuracy_train))
        print("accuracy score_val: {}".format(accuracy_val))
        print("confusion_matrix_train: {}".format(cf_train))
        print("confusion_matrix_val: \n{}".format(cf_val))
        print("f1 score_train: {}".format(f1_train))
        print("f1 score_val: {}".format(f1_val))
        print()
        

max_depth: 3, max_features: 4
accuracy score_train: 0.9615384615384616
accuracy score_val: 0.9230769230769231
confusion_matrix_train: [[127  12]
 [  2 223]]
confusion_matrix_val: 
[[27  6]
 [ 1 57]]
f1 score_train: 0.9695652173913043
f1 score_val: 0.9421487603305785

max_depth: 3, max_features: 5
accuracy score_train: 0.9532967032967034
accuracy score_val: 0.978021978021978
confusion_matrix_train: [[133   6]
 [ 11 214]]
confusion_matrix_val: 
[[33  0]
 [ 2 56]]
f1 score_train: 0.9617977528089887
f1 score_val: 0.9824561403508771

max_depth: 3, max_features: 6
accuracy score_train: 0.978021978021978
accuracy score_val: 0.967032967032967
confusion_matrix_train: [[134   5]
 [  3 222]]
confusion_matrix_val: 
[[31  2]
 [ 1 57]]
f1 score_train: 0.9823008849557522
f1 score_val: 0.9743589743589743

max_depth: 5, max_features: 4
accuracy score_train: 0.9835164835164835
accuracy score_val: 0.945054945054945
confusion_matrix_train: [[133   6]
 [  0 225]]
confusion_matrix_val: 
[[29  4]
 [ 1 57]]
f

In [8]:
# hyper parameter comparison pandas dataframe
temp_hp_list =[]
for max_depth in max_depth_list:
    for max_features in max_features_list:
        temp_hp_list.append((max_depth, max_features))
        
temp_hp_dict = {'hyper parameter(max_depth, max_features)':temp_hp_list, 'accuracy score_val': accuracy_score_val_list, 'f1 score_val': f1_score_val_list, 'accuracy score_train':accuracy_score_train_list, 'f1 score_train':f1_score_train_list}

df = pd.DataFrame(temp_hp_dict)
df.sort_values(by=['accuracy score_val', 'f1 score_val'], ascending=False)

Unnamed: 0,"hyper parameter(max_depth, max_features)",accuracy score_val,f1 score_val,accuracy score_train,f1 score_train
4,"(5, 5)",0.989011,0.991453,0.986264,0.988864
1,"(3, 5)",0.978022,0.982456,0.953297,0.961798
2,"(3, 6)",0.967033,0.974359,0.978022,0.982301
7,"(7, 5)",0.967033,0.973913,0.997253,0.997783
6,"(7, 4)",0.956044,0.966102,0.997253,0.997783
8,"(7, 6)",0.956044,0.965517,1.0,1.0
10,"(10, 5)",0.956044,0.965517,1.0,1.0
11,"(10, 6)",0.956044,0.965517,1.0,1.0
3,"(5, 4)",0.945055,0.957983,0.983516,0.986842
5,"(5, 6)",0.945055,0.957265,0.994505,0.995556


In [9]:
top = (df.sort_values(by=['accuracy score_val', 'f1 score_val'], ascending=False)).head(1)
top = top.iloc[0,0]
top

(5, 5)

In [10]:
# optimal hyper parameter -> max_depth: 5, max_features: 5
optimal_max_depth, optimal_max_features = top
# fit on test set

clf = DecisionTreeClassifier(max_depth=optimal_max_depth, max_features=optimal_max_features, random_state = 3)
clf.fit(X_train, y_train)

prediction = clf.predict(X_test)

accuracy = accuracy_score(prediction, y_test)
cf = confusion_matrix(y_test, prediction)

print("accuracy_score : {}".format(accuracy))
print("confusion_matrix : \n{}".format(cf))


accuracy_score : 0.9385964912280702
confusion_matrix : 
[[36  4]
 [ 3 71]]


In [11]:
# 추가 (Grid search cv 활용)

from sklearn.model_selection import GridSearchCV

tree_model = DecisionTreeClassifier(random_state=3)

hyper_parameter = {
    'max_depth':[3, 5, 7, 10],
    'max_features':[4, 5, 6]
}

hyper_parameter_tuner = GridSearchCV(tree_model, hyper_parameter, cv=5)
hyper_parameter_tuner.fit(X, y)

best_params = hyper_parameter_tuner.best_params_
print(best_params)

{'max_depth': 3, 'max_features': 6}


In [12]:
# optimal hyper parameter -> max_depth: 3, max_features: 6
optimal_max_depth, optimal_max_features = best_params['max_depth'], best_params['max_features']
# fit on test set

clf = DecisionTreeClassifier(max_depth=optimal_max_depth, max_features=optimal_max_features, random_state=3)
clf.fit(X_train, y_train)

prediction = clf.predict(X_test)

accuracy = accuracy_score(prediction, y_test)
cf = confusion_matrix(y_test, prediction)

print("accuracy_score : {}".format(accuracy))
print("confusion_matrix : \n{}".format(cf))


accuracy_score : 0.956140350877193
confusion_matrix : 
[[37  3]
 [ 2 72]]
