# Hyper-parameter 최적화: Spaceship Titanic
#### 작성: 고우주 | kubwa 쿱와

## 1. 패키지 불러오기

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

## 2. 데이터 불러오기 & 탐색

In [None]:
# Load the Spaceship Titanic table (the file name suggests it has already been
# preprocessed). The path is relative to the notebook's working directory.
df = pd.read_csv('../dataset/spaceship-preprocessing.csv')
# Preview the first rows as the cell output.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Row count per distinct value of 'Transported', the column used as the
# prediction target further down.
df['Transported'].value_counts()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Annotated heatmap of the pairwise Pearson correlations between all columns
# of df (target column included).
plt.figure(figsize=(12, 12))
sns.heatmap(
    data=df.corr(method='pearson'),
    cmap='coolwarm',
    annot=True,
)
plt.show()

## 2-Split the data

In [None]:
# Features are every column except the target; the target is 'Transported'.
X = df.drop(columns='Transported')
y = df['Transported']

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=100
)

# Cell output: shapes of the two feature matrices.
X_train.shape, X_test.shape

## 3-Standardize Scale

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits so no test information leaks into the scaler.
# NOTE: X_train / X_test are overwritten here, so re-running this cell
# re-fits the scaler on data that has already been scaled.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score

def get_best_score(model):
    """Print a fitted search object's best result and return its best score.

    Prints, one per line and in this order: ``model.best_score_``,
    ``model.best_params_`` and ``model.best_estimator_``.

    Args:
        model: Any object exposing those three attributes (in this notebook,
            a fitted GridSearchCV / RandomizedSearchCV).

    Returns:
        The value of ``model.best_score_``.
    """
    for attribute in ('best_score_', 'best_params_', 'best_estimator_'):
        print(getattr(model, attribute))

    return model.best_score_

In [None]:
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, RocCurveDisplay


def plot_confusion_matrix(model, X_test, y_test):
    """Draw and show the confusion matrix of ``model`` on the given data.

    Args:
        model: Fitted estimator (or fitted search object) accepted by
            ``ConfusionMatrixDisplay.from_estimator``.
        X_test: Feature matrix to predict on.
        y_test: True labels matching ``X_test``.

    Returns:
        None. The figure is rendered as a side effect.
    """
    ConfusionMatrixDisplay.from_estimator(model,
                                          X_test, y_test,
                                          cmap=plt.cm.Blues)
    # Render right away, as plot_roc_curve does; otherwise the figure is only
    # flushed by whichever plt.show() happens to run next, and it appears
    # after any text printed in between.
    plt.show()

def plot_roc_curve(model, X_test, y_test):
    """Draw and show the ROC curve of ``model`` on the given data.

    Args:
        model: Fitted estimator (or fitted search object) accepted by
            ``RocCurveDisplay.from_estimator``.
        X_test: Feature matrix to score.
        y_test: True labels matching ``X_test``.

    Returns:
        None. The figure is rendered as a side effect.
    """
    RocCurveDisplay.from_estimator(estimator=model, X=X_test, y=y_test)
    plt.show()
    
def evaluate(model, y_pred):
    """Report test-set quality: confusion matrix, text report, ROC curve.

    NOTE: ``X_test`` and ``y_test`` are not parameters. They are read from
    the enclosing (notebook-global) scope, so they must already be defined
    and must be the data ``y_pred`` was produced from.

    Args:
        model: Fitted estimator or search object used for the two plots.
        y_pred: Predicted labels, compared against the global ``y_test`` in
            the classification report.

    Returns:
        None. Everything is printed or plotted.
    """
    plot_confusion_matrix(model, X_test, y_test)

    report = classification_report(y_test, y_pred)
    print(report)

    plot_roc_curve(model, X_test, y_test)

## 4-Parameter tuning `GridSearchCV`

### XGBoost

https://xgboost.readthedocs.io/en/latest/parameter.html

> 주요 Hyper-parameter
- max_depth: 개별 나무의 최대 깊이 (나무를 단순하게 유지해 과적합을 줄이려면 작은 값 사용)
- learning_rate: Boosting 단계별로 가중치를 두는 학습률
- n_estimators: 나무의 수
- subsample: 각 나무를 학습할 때 사용하는 포인트 수의 비율 (0 초과 1 이하의 숫자로 비율 설정)
- colsample_bytree: 각 나무에서 사용하는 features 수의 비율 (0 초과 1 이하의 숫자로 비율 설정)
- gamma: 각 나무에서 분할하는데 필요한 최소 손실감소량
- reg_lambda: 각 나무에 배정되는 weights에 대하여 L2-regularization 강도

In [None]:
# Candidate values tried by the grid search and the randomized search below.
# Seven hyper-parameters with two values each give 2**7 = 128 combinations.
param_grid = {'max_depth': [3, 10],
              'learning_rate': [0.001, 0.1],
              'n_estimators': [100, 200],
              'subsample': [0.7, 0.8],
              'reg_lambda': [0.01, 0.1],
              'colsample_bytree': [0.8, 1],
              'gamma': [0.1, 0.9]
              }

In [None]:
from xgboost import XGBClassifier

# Exhaustive search: every combination in param_grid is fitted once per fold
# (cv=3), using all CPU cores. scoring is left unset, so the estimator's
# default score is used.
xgb = XGBClassifier(random_state=1234)
xgb_grid = GridSearchCV(
    estimator=xgb, param_grid=param_grid, cv=3, verbose=2, n_jobs=-1
).fit(X_train, y_train)

# The trailing ';' keeps the notebook from echoing the returned score, which
# get_best_score has already printed.
get_best_score(xgb_grid);

In [None]:
# Predict the held-out split through the fitted grid-search object, then
# show the confusion matrix, the classification report and the ROC curve.
y_pred_grid = xgb_grid.predict(X_test)
evaluate(xgb_grid, y_pred_grid)

## 5-Parameter tuning `RandomizedSearchCV`

In [None]:
# Randomized search over the same candidate values as the grid search, but
# only n_iter sampled combinations are evaluated (10 is the library default,
# spelled out here).
# random_state seeds the sampling so the same candidates, and therefore the
# same best parameters, are obtained on every run.
xgb = XGBClassifier(random_state=1234)
xgb_rand = RandomizedSearchCV(xgb,
                              param_distributions=param_grid,
                              n_iter=10,
                              cv=3,
                              verbose=2,
                              n_jobs=-1,
                              random_state=1234)

xgb_rand.fit(X_train, y_train)
# The trailing ';' keeps the notebook from echoing the returned score, which
# get_best_score has already printed.
get_best_score(xgb_rand);

In [None]:
# Predict the held-out split through the fitted randomized-search object,
# then show the confusion matrix, the classification report and the ROC curve.
y_pred_rand = xgb_rand.predict(X_test)
evaluate(xgb_rand, y_pred_rand)

## 6-Parameter tuning `BayesianOptimization`
- https://github.com/fmfn/BayesianOptimization

In [None]:
#%pip install bayesian-optimization

In [None]:
from bayes_opt import BayesianOptimization

In [None]:
def xgb_cv(max_depth,
           learning_rate,
           n_estimators,
           subsample,
           reg_lambda,
           colsample_bytree,
           gamma):
    """Objective for the Bayesian optimizer: mean 3-fold CV accuracy.

    Builds an XGBClassifier from the proposed hyper-parameters and scores it
    with ``cross_val_score`` on ``X_train`` / ``y_train``. Those two are read
    from the enclosing (notebook-global) scope, not passed in.

    Args:
        max_depth: Proposed tree depth; truncated with ``int()``.
        learning_rate: Passed through unchanged.
        n_estimators: Proposed number of trees; truncated with ``int()``.
        subsample: Passed through unchanged.
        reg_lambda: Passed through unchanged.
        colsample_bytree: Passed through unchanged.
        gamma: Passed through unchanged.

    Returns:
        Mean of the accuracy scores over the 3 folds.
    """
    estimator_kwargs = {
        'n_jobs': -1,
        'random_state': 1234,
        # These two arrive as arbitrary numbers from the optimizer and are
        # cut down to integers before being handed to the classifier.
        'max_depth': int(max_depth),
        'n_estimators': int(n_estimators),
        'learning_rate': learning_rate,
        'subsample': subsample,
        'reg_lambda': reg_lambda,
        'colsample_bytree': colsample_bytree,
        'gamma': gamma,
    }
    candidate = XGBClassifier(**estimator_kwargs)

    fold_scores = cross_val_score(candidate, X_train, y_train,
                                  scoring='accuracy', cv=3)
    return fold_scores.mean()

In [None]:
# Optimizer that maximizes xgb_cv over continuous [lower, upper] ranges.
# The ranges use the same end points as the two values per key in param_grid.
xgb_opt = BayesianOptimization(
    f=xgb_cv,
    pbounds={
        'max_depth': [3, 10],
        'learning_rate': [0.001, 0.1],
        'n_estimators': [100, 200],
        'subsample': [0.7, 0.8],
        'reg_lambda': [0.01, 0.1],
        'colsample_bytree': [0.8, 1],
        'gamma': [0.1, 0.9],
    },
    random_state=1234,
)

In [None]:
# Run the search: 2 initial points followed by 5 optimizer-guided iterations.
# Every evaluation calls xgb_cv once, i.e. one 3-fold cross-validation.
xgb_opt.maximize(init_points=2, n_iter=5)

In [None]:
# Best result found by the optimizer; its 'params' entry is read below.
xgb_opt.max

In [None]:
# Hyper-parameters of the best evaluation. max_depth and n_estimators are
# passed through int() where they are used to build the final model.
best_params = xgb_opt.max['params']
best_params

In [None]:
# Inspect the raw max_depth value; it is passed through int() before use.
best_params['max_depth']

In [None]:
# Rebuild the classifier from the best point found by the Bayesian search.
# Every tuned parameter is forwarded, including colsample_bytree and gamma,
# and the random_state matches the one used inside xgb_cv. The fitted model
# is therefore the configuration whose CV score the optimizer reported.
xgb_bayes = XGBClassifier(max_depth=int(best_params['max_depth']),
                          learning_rate=best_params['learning_rate'],
                          n_estimators=int(best_params['n_estimators']),
                          subsample=best_params['subsample'],
                          reg_lambda=best_params['reg_lambda'],
                          colsample_bytree=best_params['colsample_bytree'],
                          gamma=best_params['gamma'],
                          random_state=1234)

In [None]:
# Train the final model on the (standardized) training split.
xgb_bayes.fit(X_train, y_train)

In [None]:
# Predict the held-out split with the Bayesian-tuned model, then show the
# confusion matrix, the classification report and the ROC curve.
y_pred_bayes = xgb_bayes.predict(X_test)
evaluate(xgb_bayes, y_pred_bayes)