# Hyper-parameter 최적화: Spaceship Titanic
#### 작성: 고우주 | kubwa 쿱와

## 1. 패키지 불러오기

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

## 2. 데이터 불러오기 & 탐색

In [None]:
# Load the preprocessed Spaceship Titanic dataset and preview its first rows.
df = pd.read_csv('../dataset/spaceship-preprocessing.csv')
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Class distribution of the target column.
df['Transported'].value_counts()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Pearson correlation between every pair of columns, shown as an
# annotated heatmap on a 12x12 inch figure.
corr_matrix = df.corr(method='pearson')
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
plt.show()

## 2-Split the data

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns.
y = df['Transported']
X = df.drop(columns='Transported')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=100
)

X_train.shape, X_test.shape

## 3-Standardize Scale

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same
# transform to both splits so no test statistics reach the scaler.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## 4-Parameter tuning `Optuna`

### XGBoost

https://xgboost.readthedocs.io/en/latest/parameter.html

> 주요 Hyper-parameter
- max_depth: 개별 나무의 깊이 (각 나무를 약한 학습기로 유지하기 위해 보통 작은 값을 사용)
- learning_rate: Boosting 단계 별로 가중치를 두는 학습율
- n_estimators: 나무의 수
- subsample: 각 나무를 학습할 때 사용하는 포인트 수의 비율 (0 초과 1 이하의 숫자로 비율 설정)
- colsample_bytree: 각 나무에서 사용하는 features 수의 비율 (0 초과 1 이하의 숫자로 비율 설정)
- gamma: 각 나무에서 분할하는데 필요한 최소 손실감소량
- reg_lambda: 각 나무에 배정되는 weights에 대하여 L2-regularization 강도

In [None]:
#%pip install optuna

In [None]:
import optuna 
from optuna import Trial, visualization
from optuna.samplers import TPESampler

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

def objective(trial):
    """Train one XGBoost classifier with trial-suggested hyper-parameters.

    Args:
        trial: Optuna trial object used to sample the hyper-parameters.

    Returns:
        Accuracy of the fitted model on ``X_test`` / ``y_test``.
    """
    # NOTE(review): the score is computed on the hold-out split, so the best
    # value found by the search is optimistically biased. Consider scoring
    # with cross-validation on the training split instead.

    # suggest_float(..., log=True) samples from the same log-uniform range
    # as the deprecated suggest_loguniform.
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        # 'Transported' is a binary target, so use the binary log-loss
        # rather than the multiclass 'mlogloss'.
        'eval_metric': 'logloss',
        # 'use_label_encoder' is intentionally not passed: it is deprecated
        # and recent XGBoost releases only warn that it is unused.
    }

    # Fit the model on the training split.
    xgb_model = XGBClassifier(**params)
    xgb_model.fit(X_train, y_train)

    # Score the predictions on the hold-out split.
    y_pred = xgb_model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [None]:
# Seed the TPE sampler so the sequence of sampled hyper-parameters is
# reproducible between runs (same seed value as the train/test split).
study = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=100),
)
study.optimize(objective, n_trials=100)

# Report the best trial found by the search.
print(f'Number of finished trials: {len(study.trials)}')
print('Best trial:')
trial = study.best_trial

print(f'  Value: {trial.value}')
print('  Params: ')

for key, value in trial.params.items():
    print(f'    {key}: {value}')

In [None]:
# Visualization: objective value across the trials of the study.
visualization.plot_optimization_history(study)

In [None]:
# Relationships between the parameters (parallel-coordinate view).
visualization.plot_parallel_coordinate(study)

In [None]:
# Pairwise relationships between the selected parameters, as contour plots.
contour_params = [
    "max_depth",
    "learning_rate",
    "n_estimators",
    "subsample",
    "reg_lambda",
    "colsample_bytree",
    "gamma",
]
visualization.plot_contour(study, params=contour_params)

In [None]:
# Hyper-parameter importances.
visualization.plot_param_importances(study)