<a href="https://colab.research.google.com/github/zhh25/Titanic/blob/main/Fine_Tune_XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

# Attach Google Drive to the Colab VM; the data, pipeline script and outputs
# used below all live under this mount point.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import os
# Project folder on the mounted Drive; every other path is built from it.
path_root = '/content/drive/MyDrive/titanic'

# Read the three competition CSV files from <path_root>/data.
train, test, sample_submission = (
    pd.read_csv(os.path.join(path_root, 'data', csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Separate the features from the target column.
titanic = train.drop(columns='Survived')
titanic_label = train['Survived'].copy()

# Transformation Pipeline

In [4]:
# Run the shared preprocessing script in this notebook's namespace.
# Presumably this is what defines `full_pipeline` and `attribs_names`, which
# later cells use -- verify against the script.
# NOTE(review): exec() runs whatever code is in the file; only use a script
# you control. Importing it as a module would be safer.
pipeline_path = os.path.join(path_root, 'titanic_data_pipeline.py')
with open(pipeline_path) as pipeline_file:
    # `with` closes the file handle; compile() attaches the file name so
    # tracebacks point at the script rather than at "<string>".
    exec(compile(pipeline_file.read(), pipeline_path, 'exec'))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Fit the preprocessing pipeline on the training features and transform them.
# `full_pipeline` is not defined in this notebook; presumably it comes from
# the executed pipeline script -- TODO confirm.
titanic_prepared = full_pipeline.fit_transform(titanic)
# Reuse the already-fitted pipeline on the test set (transform only, no refit).
test_prepared = full_pipeline.transform(test)

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the prepared training data for validation.
# random_state is fixed so the split is the same on every run, matching the
# seed of 0 used in the other cells of this notebook.
X_train, X_val, y_train, y_val = train_test_split(
    titanic_prepared, titanic_label, test_size = 0.25, random_state = 0
)

In [7]:
# Column names for the prepared feature matrix; passed to xgb.DMatrix below
# as `feature_names`.
# NOTE(review): `attribs_names` is not defined in this notebook -- presumably
# it comes from the executed pipeline script; confirm it matches the column
# order of `titanic_prepared`.
cols_full = attribs_names()

# Tuning XGBoost with Hyperopt

Some helpful links:

[XGBoost - HyperOPT + CV via Python API](https://www.kaggle.com/code/felipeleiteantunes/xgboost-hyperopt-cv-via-python-api/notebook)

[A Guide on XGBoost hyperparameters tuning](https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning/notebook)


In [8]:
from xgboost import XGBClassifier
from hyperopt.pyll.stochastic import sample
import xgboost as xgb
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import random

In [9]:
# Wrap the full prepared training set in XGBoost's native container so it can
# be passed to xgb.cv.
xgb_train = xgb.DMatrix(
    data=titanic_prepared,
    label=titanic_label,
    feature_names=cols_full,
)

### Initialize domain space

In [19]:
# Search space explored by hyperopt. Each hyperopt label (first argument of
# hp.*) is kept identical to its dictionary key so the entries recorded in
# `trials` are easy to map back to XGBoost parameters.
space = {
    # hp.randint draws from [0, 10); the +1 shifts the depth to 1..10.
    'max_depth': hp.randint('max_depth', 10) + 1,
    'eta': hp.choice('eta', [0.0001, 0.001, 0.01, 0.1, 1]),
    # Candidate column subsampling ratios: 0.5, 0.6, 0.7, 0.8, 0.9.
    'colsample_bytree': hp.choice('colsample_bytree', np.arange(5, 10) * 0.1),
    'alpha': hp.choice('alpha', [1e-5, 1e-2, 0.1, 1, 10, 100]),
    'lambda': hp.choice('lambda', [1e-5, 1e-2, 0.1, 1, 10, 100]),
    # Fixed (not searched): the same tree method is used for every trial.
    'tree_method': 'approx',
}

In [21]:
# Draw one random configuration from the search space as a sanity check that
# it evaluates to a plain parameter dictionary.
sample(space)

{'alpha': 1e-05,
 'colsample_bytree': 0.7000000000000001,
 'eta': 0.01,
 'lambda': 100,
 'max_depth': 6,
 'tree_method': 'approx'}

### Define objective function

In [22]:
#Define objective function
def objective(space, num_round = 1000, nfold = 4, early_stopping_rounds = 50, seed = 0):
    """Score one hyperparameter configuration with stratified cross-validation.

    Parameters
    ----------
    space : dict
        The concrete parameter values proposed by ``fmin`` for this trial.
        They are used as-is: drawing a new sample here would score a point
        other than the one the optimizer proposed.
    num_round : int
        Upper bound on the number of boosting rounds.
    nfold : int
        Number of cross-validation folds.
    early_stopping_rounds : int
        Stop when the CV error has not improved for this many rounds.
    seed : int
        Seed for the fold assignment, so every trial uses the same folds.

    Returns
    -------
    dict
        ``{'loss': <mean CV error of the last kept round>, 'status': STATUS_OK}``
        in the format hyperopt expects.
    """
    xgb_params = dict(space)

    cvresult = xgb.cv(
        xgb_params,
        xgb_train,
        num_round,
        nfold = nfold,
        metrics = ['error'],
        early_stopping_rounds = early_stopping_rounds,
        stratified = True,
        seed = seed
      )

    # Take the last row of the returned history by position rather than by
    # computing its index label.
    loss = float(cvresult['test-error-mean'].iloc[-1])

    print(xgb_params, '\n')
    print(loss)
    return {'loss': loss,
            'status': STATUS_OK
           }

### Optimization algorithm

To make `fmin` return the actual values of the best hyperparameters instead of their indices into the `hp.choice` lists, set `return_argmin=False`. See:

[hyperopt result exceeds my hp.choice restriction, why? (XGBoost)](https://stackoverflow.com/questions/54978278/hyperopt-result-exceeds-my-hp-choice-restriction-why-xgboost)

In [23]:
# Container in which hyperopt records every evaluated configuration.
trials = Trials()

# Run 100 TPE-guided evaluations of `objective` over `space`.
# return_argmin=False makes fmin hand back the parameter values themselves
# rather than the indices into the hp.choice lists.
best_hyperparams = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
    return_argmin=False,
)

{'alpha': 10, 'colsample_bytree': 0.8, 'eta': 0.1, 'lambda': 0.1, 'max_depth': 6, 'tree_method': 'approx'}
0.1705905
{'alpha': 1, 'colsample_bytree': 0.9, 'eta': 0.0001, 'lambda': 1, 'max_depth': 3, 'tree_method': 'approx'}
0.1683635
{'alpha': 1, 'colsample_bytree': 0.6000000000000001, 'eta': 1, 'lambda': 1e-05, 'max_depth': 2, 'tree_method': 'approx'}
0.1560065
{'alpha': 0.1, 'colsample_bytree': 0.9, 'eta': 0.0001, 'lambda': 10, 'max_depth': 10, 'tree_method': 'approx'}
0.16612125
{'alpha': 0.01, 'colsample_bytree': 0.8, 'eta': 0.0001, 'lambda': 0.1, 'max_depth': 9, 'tree_method': 'approx'}
0.1807355
{'alpha': 10, 'colsample_bytree': 0.6000000000000001, 'eta': 0.001, 'lambda': 100, 'max_depth': 6, 'tree_method': 'approx'}
0.16724224999999998
{'alpha': 1e-05, 'colsample_bytree': 0.9, 'eta': 0.01, 'lambda': 0.01, 'max_depth': 9, 'tree_method': 'approx'}
0.182973
{'alpha': 1e-05, 'colsample_bytree': 0.9, 'eta': 0.01, 'lambda': 1e-05, 'max_depth': 4, 'tree_method': 'approx'}
0.16723249999

In [24]:
# Announce the result, leaving a blank line before the dictionary itself.
print("The best hyperparameters are : ", "\n", sep=" ")
print(best_hyperparams, end="\n")

The best hyperparameters are :  

{'alpha': 1, 'colsample_bytree': 0.7000000000000001, 'eta': 0.01, 'lambda': 0.01, 'max_depth': 6, 'tree_method': 'approx'}


### Retrain the model using the best hyperparameters
Two steps:

1. Find the best number of boosting rounds using cross-validation.
2. Retrain the model on the whole training set.

In [26]:
# Re-run cross-validation with the winning configuration; the length of the
# returned history is used below as the number of boosting rounds.
cv_result = xgb.cv(
    params=best_hyperparams,
    dtrain=xgb_train,
    num_boost_round=1000,
    nfold=4,
    stratified=True,
    metrics=['error'],
    early_stopping_rounds=50,
    seed=0,
)

In [27]:
# One row per boosting round kept by xgb.cv, so the row count is the number
# of rounds to train the final model with.
best_round = len(cv_result)
best_round

260

In [28]:
# Rebuild the tuned configuration with the scikit-learn wrapper, mapping the
# native parameter names used during the search (alpha, lambda, eta) to their
# wrapper equivalents (reg_alpha, reg_lambda, learning_rate).
xgb_model = XGBClassifier(
    n_estimators = best_round,
    reg_alpha = best_hyperparams['alpha'],
    reg_lambda = best_hyperparams['lambda'],
    learning_rate = best_hyperparams['eta'],
    colsample_bytree = best_hyperparams['colsample_bytree'],
    max_depth = best_hyperparams['max_depth'],
    # Use the same tree method that was cross-validated; leaving it out would
    # train the final model with the library default instead.
    tree_method = best_hyperparams['tree_method'],
    random_state = 0
)

In [29]:
# Train on the complete prepared training set, then predict the test set.
xgb_prediction = (
    xgb_model
    .fit(titanic_prepared, titanic_label)
    .predict(test_prepared)
)

In [30]:
# Build the submission table: one row per test passenger with its predicted
# label.
xgb_submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': xgb_prediction})
# Write under a name that identifies this model. The previous file name
# referred to a random forest, which mislabels these XGBoost predictions.
xgb_submission.to_csv(os.path.join(path_root, 'submission/XGBoost_hyperopt.csv'), index = False)

### Save the model

In [31]:
import joblib

# Persist the fitted classifier under <path_root>/models.
joblib.dump(
    value=xgb_model,
    filename=os.path.join(path_root, 'models', 'best_xgb.sav'),
)

['/content/drive/MyDrive/titanic/models/best_xgb.sav']