In [2]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:83%; align: left; }</style>"))
display(HTML("<style>#toc-wrapper{ position: relative; width: 20%; top: 130px; left: 0px; }</style>"))

## 필요한 라이브러리 로드

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## 데이터셋 로드

In [4]:
df = pd.read_csv("data/diabetes_feature.csv")
df.shape

(768, 16)

In [5]:
df.head() #다 숫자로 되어있어서 전처리 불필요

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Pregnancies_high,Age_low,Age_middle,Age_high,Insulin_nan,Insulin_log,low_glu_insulin
0,6,0.848324,72,35,0,33.6,0.468492,50,1,False,False,True,False,169.5,5.138735,False
1,1,-1.123396,66,29,0,26.6,-0.365061,31,0,False,False,True,False,102.5,4.639572,True
2,8,1.943724,64,0,0,23.3,0.604397,32,1,True,False,True,False,169.5,5.138735,False
3,1,-0.998208,66,23,94,28.1,-0.920763,21,0,False,True,False,False,94.0,4.553877,True
4,0,0.504055,40,35,168,43.1,5.484909,33,1,False,False,True,False,168.0,5.129899,False


## 학습과 예측에 사용할 데이터셋 만들기

In [6]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'Pregnancies_high',
       'Age_low', 'Age_middle', 'Age_high', 'Insulin_nan', 'Insulin_log',
       'low_glu_insulin'],
      dtype='object')

In [7]:
X = df[['Glucose', 'BloodPressure', 'SkinThickness',
    'BMI', 'DiabetesPedigreeFunction', 'Age', 'Pregnancies_high',
    'Insulin_nan','low_glu_insulin']]
X.shape

(768, 9)

In [8]:
y = df['Outcome']
y.shape

(768,)

In [9]:
# 사이킷런에서 제공하는 model_selection 의 train_test_split 으로 만듭니다.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [10]:
# train 세트의 문제와 정답의 데이터 수를 확인해 주세요
X_train.shape, y_train.shape

((614, 9), (614,))

In [11]:
# test 세트의 문제와 정답의 데이터 수를 확인해 주세요
X_test.shape, y_test.shape

((154, 9), (154,))

## 여러 개의 알고리즘을 사용해서 비교하기

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

estimators = [DecisionTreeClassifier(random_state=42),
             RandomForestClassifier(random_state=42),
             GradientBoostingClassifier(random_state=42)
            ]
estimators

[DecisionTreeClassifier(random_state=42),
 RandomForestClassifier(random_state=42),
 GradientBoostingClassifier(random_state=42)]

In [13]:
max_depth = np.random.randint(2,20,10)
max_depth

array([16,  3, 11, 17,  4,  9, 10, 12,  9, 15])

In [14]:
max_features = np.random.uniform(0.3, 1.0, 10)
max_features

array([0.6608967 , 0.77243797, 0.36463401, 0.42345136, 0.56228387,
       0.87330686, 0.4474438 , 0.91092421, 0.98734106, 0.5910585 ])

In [15]:
result = []
for estimator in estimators:
    result.append(estimator.__class__.__name__)
result

['DecisionTreeClassifier',
 'RandomForestClassifier',
 'GradientBoostingClassifier']

In [16]:
from sklearn.model_selection import RandomizedSearchCV

max_depth = np.random.randint(2, 20, 10)
max_features = np.random.uniform(0.3, 1.0, 10)

param_distributions = {"max_depth":max_depth, 
                       "max_features":max_features}

results = []
for estimator in estimators:
    result = []
    if estimator.__class__.__name__ != "DecisionTreeClassifier":
        param_distributions["n_estimators"] = np.random.randint(100,200,10)
        
    clf = RandomizedSearchCV(estimator,
                       param_distributions, 
                       n_iter=100,
                       scoring="accuracy",
                       n_jobs=-1,
                       cv=5,
                       verbose=2
                      )
    clf.fit(X_train, y_train)
    result.append(estimator.__class__.__name__)
    result.append(clf.best_params_)
    result.append(clf.best_score_)
    result.append(clf.score(X_test, y_test))
    result.append(clf.cv_results_)
    results.append(result)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [17]:
df = pd.DataFrame(results, columns=["estimator", "best_params", "train_score", "test_score", "cv_result"])

In [19]:
pd.DataFrame(df.loc[1, "cv_result"]).sort_values(by="rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
76,0.557924,0.098609,0.020025,0.007335,143,0.850385,7,"{'n_estimators': 143, 'max_features': 0.850384...",0.886179,0.934959,0.869919,0.894309,0.942623,0.905598,0.028321,1
79,0.631082,0.036215,0.023318,0.006716,143,0.862418,7,"{'n_estimators': 143, 'max_features': 0.862418...",0.886179,0.934959,0.869919,0.894309,0.942623,0.905598,0.028321,1
80,0.640576,0.030772,0.032438,0.012124,165,0.850385,7,"{'n_estimators': 165, 'max_features': 0.850384...",0.886179,0.951220,0.869919,0.886179,0.934426,0.905584,0.031431,3
97,0.487063,0.023312,0.021391,0.008078,143,0.900075,8,"{'n_estimators': 143, 'max_features': 0.900074...",0.886179,0.943089,0.869919,0.894309,0.934426,0.905584,0.028335,3
60,0.573984,0.022285,0.020109,0.004406,165,0.862418,7,"{'n_estimators': 165, 'max_features': 0.862418...",0.886179,0.951220,0.869919,0.886179,0.934426,0.905584,0.031431,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22,0.288041,0.040073,0.015323,0.004381,133,0.607796,2,"{'n_estimators': 133, 'max_features': 0.607795...",0.788618,0.878049,0.853659,0.813008,0.934426,0.853552,0.050992,96
14,0.373950,0.026830,0.015727,0.002291,198,0.470839,2,"{'n_estimators': 198, 'max_features': 0.470838...",0.796748,0.886179,0.845528,0.804878,0.926230,0.851913,0.048976,97
43,0.429952,0.016673,0.022933,0.003699,139,0.850385,2,"{'n_estimators': 139, 'max_features': 0.850384...",0.788618,0.869919,0.837398,0.813008,0.942623,0.850313,0.053423,98
99,0.303079,0.024870,0.011036,0.005660,139,0.862418,2,"{'n_estimators': 139, 'max_features': 0.862418...",0.788618,0.869919,0.837398,0.813008,0.942623,0.850313,0.053423,98
