## 의사결정나무

In [75]:
# 데이터 로드
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer

def make_dataset(test_size=0.5, random_state=1004):
    """Load the breast-cancer dataset and split it into train/test parts.

    Args:
        test_size: Forwarded to ``train_test_split``. The default of 0.5
            keeps the original half/half split.
        random_state: Seed forwarded to ``train_test_split`` so the split
            is reproducible. The default keeps the original seed.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``. The feature parts
        are DataFrames whose columns are the dataset's feature names; the
        label parts are Series named ``'target'``.
    """
    bc = load_breast_cancer()
    # Keep features and labels separate from the start instead of adding a
    # 'target' column to the frame only to drop it again for the split.
    features = pd.DataFrame(bc.data, columns=bc.feature_names)
    target = pd.Series(bc.target, name='target')
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

# Build the train/test parts once, bind them to the names the later cells
# use, and display the shape of each part as the cell's result.
splits = make_dataset()
X_train, X_test, y_train, y_test = splits
tuple(part.shape for part in splits)

((284, 30), (285, 30), (284,), (285,))

### 학습

In [5]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: a decision tree with default hyperparameters; only the seed is
# fixed so repeated runs give the same tree.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the held-out half; the accuracy is the cell's displayed result.
pred = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

0.9157894736842105

### 하이퍼파라미터
- criterion(gini)
- max_depth(None)
- min_samples_split(2)
- min_samples_leaf(1)

In [35]:
from sklearn.tree import DecisionTreeClassifier

# Hyperparameters that replace the defaults listed above this cell.
tree_params = {
    "random_state": 42,
    "criterion": "entropy",
    "max_depth": 4,
    "min_samples_leaf": 2,
    "min_samples_split": 3,
}

model = DecisionTreeClassifier(**tree_params)
model.fit(X_train, y_train)

# Predict on the held-out half; the accuracy is the cell's displayed result.
pred = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

0.9228070175438596

## 랜덤포레스트
- n_estimators(100): 트리의 수
- criterion(gini): 불순도 지표
- max_depth(None): 트리의 최대 깊이
- min_samples_split(2): 노드를 분할하기 위해 필요한 최소 샘플 수
- min_samples_leaf(1): 리프 노드가 되기 위한 최소 샘플 수

In [62]:
from sklearn.ensemble import RandomForestClassifier

# Forest configuration: 100 entropy-based trees, each limited to depth 4.
forest_params = {
    "random_state": 42,
    "n_estimators": 100,
    "criterion": "entropy",
    "max_depth": 4,
    "min_samples_leaf": 3,
    "min_samples_split": 3,
}

model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

# Predict on the held-out half; the accuracy is the cell's displayed result.
pred = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

0.9473684210526315

## XGBoost
- booster(gbtree): 부스팅 알고리즘 (dart, gblinear)
- objective(binary:logistic): 이진분류 (다중분류: multi:softmax)
- max_depth(6): 트리의 최대 깊이
- learning_rate(0.3): 학습률
- n_estimators(100): 트리의 수
- subsample(1): 훈련 샘플 개수의 비율
- colsample_bytree(1): 특성 개수의 비율
- n_jobs(1): 사용 코어 수(-1:모든 코어를 다 사용)

In [92]:
from xgboost import XGBClassifier

# Booster configuration: 500 DART rounds at a learning rate of 0.05, using
# every row and every feature for each tree, on all available cores.
xgb_params = {
    "random_state": 42,
    "booster": "dart",
    "objective": "binary:logistic",
    "n_estimators": 500,
    "learning_rate": 0.05,
    "subsample": 1,
    "colsample_bytree": 1,
    "n_jobs": -1,
    "max_depth": 5,
}

model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Predict on the held-out half; the accuracy is the cell's displayed result.
pred = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

0.9649122807017544

In [97]:
# Early stopping
from xgboost import XGBClassifier

model = XGBClassifier(
    random_state=42,
    booster="dart",
    objective="binary:logistic",
    n_estimators=500,
    learning_rate=0.05,
    subsample=1,
    colsample_bytree=1,
    n_jobs=-1,
    max_depth=5,
    # Stop boosting once the metric on eval_set has not improved for 50
    # consecutive rounds. With this option left out, all 500 rounds are
    # trained and eval_set is only used for logging.
    # NOTE(review): passing this to the constructor needs a recent xgboost
    # release; older releases take it as a fit() argument -- confirm the
    # installed version.
    early_stopping_rounds=50,
)
# NOTE(review): the test split doubles as the early-stopping validation set,
# so the accuracy reported below is measured on data that also chose the
# stopping point. Consider carving a validation split out of X_train.
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, eval_set=eval_set)

# Predict on the held-out half; the accuracy is the cell's displayed result.
pred = model.predict(X_test)
accuracy_score(y_test, pred)

[0]	validation_0-logloss:0.65133
[1]	validation_0-logloss:0.61622
[2]	validation_0-logloss:0.58479


[3]	validation_0-logloss:0.55575
[4]	validation_0-logloss:0.53043
[5]	validation_0-logloss:0.50622
[6]	validation_0-logloss:0.48445
[7]	validation_0-logloss:0.46342
[8]	validation_0-logloss:0.44414
[9]	validation_0-logloss:0.42749
[10]	validation_0-logloss:0.41021
[11]	validation_0-logloss:0.39427
[12]	validation_0-logloss:0.38104
[13]	validation_0-logloss:0.36810
[14]	validation_0-logloss:0.35693
[15]	validation_0-logloss:0.34518
[16]	validation_0-logloss:0.33359
[17]	validation_0-logloss:0.32386
[18]	validation_0-logloss:0.31363
[19]	validation_0-logloss:0.30526
[20]	validation_0-logloss:0.29646
[21]	validation_0-logloss:0.28918
[22]	validation_0-logloss:0.28134
[23]	validation_0-logloss:0.27427
[24]	validation_0-logloss:0.26860
[25]	validation_0-logloss:0.26207
[26]	validation_0-logloss:0.25601
[27]	validation_0-logloss:0.25163
[28]	validation_0-logloss:0.24645
[29]	validation_0-logloss:0.24143
[30]	validation_0-logloss:0.23782
[31]	validation_0-logloss:0.23378
[32]	validation_0-log

0.9649122807017544