# [실습] gridSearchCV - xgboost 모델 적용하여 iris, cancer, wine 데이터로 실행한 후 결과 출력(최적의 파라미터 찾기)

# 팀 프로젝트에 적용

In [7]:
# pip install xgboost

In [8]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import MaxAbsScaler, RobustScaler

import tensorflow as tf
tf.random.set_seed(77)  # weight 의 난수값 조정

In [9]:
# 1. 데이터
datasets = load_breast_cancer()  # 다중분류
x = datasets['data']
y = datasets.target

x_train, x_test, y_train,y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42

)
# kfold
n_splits = 5    # 보통 홀수로 들어감
random_state = 42
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, 
              random_state=random_state)


# Scaler
scaler = MinMaxScaler()
scaler.fit(x_train)                 # train 은 fit, transform 모두 해줘야 함
x = scaler.transform(x_train) # train 은 fit, transform 모두 해줘야 함
x = scaler.transform(x_test) 

In [10]:
# parameters
param = {
    'n_estimators': [100,200], #default 100 / 1~inf(무한대) / 정수
    'learning_rate' : [0.1, 0.01], #default 0.3/ 0~1 / learning_rate는 eta라고 해도 적용됨
    'max_depth' : [3,4,5], #default 3/ 0~inf(무한대) / 정수 => 소수점은 정수로 변환하여 적용해야 함
    'gamma': [4], #default 0 / 0~inf
    'min_child_weight': [0,0.1,0.5], #default 1 / 0~inf
    'subsample' : [0,0.1,0.2], #default 1 / 0~1
    'colsample_bytree' : [0,0.1], #default 1 / 0~1
    'colsample_bylevel' : [0,0.1], #default 1 / 0~1
    'colsample_bynode' : [0,0.1], #default 1 / 0~1
    'reg_alpha' : [0, 0.1], #default 0 / 0~inf / L1 절대값 가중치 규제 / 그냥 alpha도 적용됨
    'reg_lambda' : [1] #default 1 / 0~inf / L2 제곱 가중치 규제 / 그냥 lambda도 적용됨
}

In [11]:
# 2. 모델
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
xgb = XGBClassifier(colsample_bylevel= 0, colsample_bynode = 0, 
                    colsample_bytree = 0.1, gamma = 4, learning_rate = 0.1, 
                    max_depth = 3, min_child_weight = 0, n_estimators = 200, 
                    reg_alpha = 0, reg_lambda = 1, subsample = 0.2)
# model = GridSearchCV(xgb, param, cv=kfold, 
#                      refit=True, verbose=1, n_jobs=-1)


# 3. 훈련
import time
start_time = time.time()
model.fit(x_train, y_train)
end_time = time.time() - start_time

print('최적의 파라미터 : ', model.best_params_)
print('최적의 매개변수 : ', model.best_estimator_)
print('best_score : ', model.best_score_)       # 가장 좋은 score
print('model_score : ', model.score(x_test, y_test))    # 실제 데이터를 넣었을 때의 socre
print('걸린 시간 : ', end_time, '초')

Fitting 5 folds for each of 1728 candidates, totalling 8640 fits
최적의 파라미터 :  {'colsample_bylevel': 0, 'colsample_bynode': 0, 'colsample_bytree': 0.1, 'gamma': 4, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 0, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 0.2}
최적의 매개변수 :  XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=0, colsample_bynode=0, colsample_bytree=0.1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=4, gpu_id=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=3, max_leaves=None,
              min_child_weight=0, missing=nan, monotone_constraints=None,
              n_estimators=200, n_jobs=None, num_parallel_tree=None,
            

In [12]:
# 4. 평가, 예측
score = cross_val_score(model, 
                        x_train, y_train, 
                        cv=kfold)   # cv : corss validation
# print('cv acc : ', score)   # kfold 에 있는 n_splits 숫자만큼 나옴 
y_predict = cross_val_predict(model,
                              x_test, y_test,
                              cv=kfold)
acc = accuracy_score(y_test, y_predict)
print('cv pred acc : ', acc)

Fitting 5 folds for each of 1728 candidates, totalling 8640 fits
Fitting 5 folds for each of 1728 candidates, totalling 8640 fits
Fitting 5 folds for each of 1728 candidates, totalling 8640 fits
Fitting 5 folds for each of 1728 candidates, totalling 8640 fits
Fitting 5 folds for each of 1728 candidates, totalling 8640 fits
Fitting 5 folds for each of 1728 candidates, totalling 8640 fits
Fitting 5 folds for each of 1728 candidates, totalling 8640 fits
Fitting 5 folds for each of 1728 candidates, totalling 8640 fits
Fitting 5 folds for each of 1728 candidates, totalling 8640 fits
Fitting 5 folds for each of 1728 candidates, totalling 8640 fits
cv pred acc :  0.9473684210526315
