## In this notebook classification is perfromed with the XGBClassifier 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

### Load and prepare data for training 

In [2]:
df = pd.read_csv('~/spaceship_titanic/data/cleaned_train.csv')
df

Unnamed: 0.1,Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Dec,Side,CabinCrowdness
0,0,1,0,2,0.701070,0,-0.342904,-0.279405,-0.308027,-0.269982,-0.266422,0,3,0,-0.563210
1,1,0,0,2,-0.335103,0,-0.171788,-0.273878,-0.263923,0.223638,-0.227164,1,5,1,-0.563210
2,2,1,0,2,2.013556,1,-0.275399,1.916461,-0.308027,5.767644,-0.222703,0,4,1,0.271984
3,3,1,0,2,0.286601,0,-0.342904,0.508430,0.346484,2.723206,-0.094224,0,4,1,0.271984
4,4,0,0,2,-0.887728,0,0.132767,-0.236421,-0.041636,0.238024,-0.264637,1,5,1,-0.563210
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7374,8688,1,0,1,0.839227,1,-0.342904,3.907846,-0.308027,1.207281,-0.200398,0,4,0,-0.563210
7375,8689,0,1,0,-0.749572,0,-0.342904,-0.279405,-0.308027,-0.269982,-0.266422,0,6,1,-0.563210
7376,8690,0,0,2,-0.196946,0,-0.342904,-0.279405,2.994519,-0.269082,-0.266422,1,6,1,-0.563210
7377,8691,1,0,1,0.217523,0,-0.342904,0.364741,-0.308027,0.047410,2.619897,0,0,1,0.271984


In [3]:
X = df.drop('Transported', axis=1)
y = df['Transported']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=0)

### Basic hyperparameter tuning

Grid search on max_depth and min_child_weight

In [4]:

param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = XGBClassifier(objective= 'binary:logistic', learning_rate =0.1, max_depth=5, 
min_child_weight=1, gamma=0.1, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=1, seed=0), 
 param_grid = param_test1, scoring='accuracy', cv = 5)
gsearch1.fit(X_train,y_train)
gsearch1.best_params_, gsearch1.best_score_

({'max_depth': 5, 'min_child_weight': 1}, 0.8084564606948058)

Grid search on regularization weight gamma

In [5]:
param_test2 = {
 'gamma':[i/10.0 for i in range(0,5)],
    'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]
}
gsearch2 = GridSearchCV(estimator = XGBClassifier(objective= 'binary:logistic', learning_rate =0.1, max_depth=5,
 min_child_weight=1, gamma=0.1, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=1, seed=0), 
 param_grid = param_test2, scoring='accuracy', cv = 5)
gsearch2.fit(X_train,y_train)
gsearch2.best_params_, gsearch2.best_score_

({'gamma': 0.3, 'reg_alpha': 0.05}, 0.8111668145011768)

Grid search on num_of_estimators

In [6]:
param_test3 = {
 'n_estimators':range(100,160,10),
}
gsearch3 = GridSearchCV(estimator = XGBClassifier(objective= 'binary:logistic', learning_rate =0.1, max_depth=5,
 min_child_weight=1, gamma=0.3, reg_alpha = 0.05, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=1, seed=0), 
 param_grid = param_test2, scoring='accuracy', cv = 5)
gsearch3.fit(X_train,y_train)
gsearch3.best_params_, gsearch3.best_score_

({'gamma': 0.3, 'reg_alpha': 0.05}, 0.8111668145011768)

### Start training with optimal hyperparameters

In [7]:
# Set the parameters for the XGBoost model
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 5,
    'min_child_weight': 1,
    'n_estimators': 100,
    'learning_rate': 0.1,
    'gamma': 0.3,
    'reg_alpha':0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'scale_pos_weight':1, 
    'seed': 0
}
# Train the XGBoost model
clf = XGBClassifier(**params)
clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True);

# Predict on the validation set
y_pred = clf.predict(X_val)

# Convert probabilities to binary predictions
y_pred_binary = [1 if p >= 0.5 else 0 for p in y_pred]



[0]	validation_0-logloss:0.65705
[1]	validation_0-logloss:0.62674
[2]	validation_0-logloss:0.60341
[3]	validation_0-logloss:0.58242
[4]	validation_0-logloss:0.56156
[5]	validation_0-logloss:0.54513
[6]	validation_0-logloss:0.52848
[7]	validation_0-logloss:0.51714
[8]	validation_0-logloss:0.50591
[9]	validation_0-logloss:0.49541
[10]	validation_0-logloss:0.48666
[11]	validation_0-logloss:0.47906
[12]	validation_0-logloss:0.47148
[13]	validation_0-logloss:0.46581
[14]	validation_0-logloss:0.45990
[15]	validation_0-logloss:0.45508
[16]	validation_0-logloss:0.45099
[17]	validation_0-logloss:0.44770
[18]	validation_0-logloss:0.44403
[19]	validation_0-logloss:0.44072
[20]	validation_0-logloss:0.43802
[21]	validation_0-logloss:0.43512
[22]	validation_0-logloss:0.43332
[23]	validation_0-logloss:0.43140
[24]	validation_0-logloss:0.42963
[25]	validation_0-logloss:0.42605
[26]	validation_0-logloss:0.42483
[27]	validation_0-logloss:0.42301
[28]	validation_0-logloss:0.42050
[29]	validation_0-loglos

In [8]:
# Generate classification report
classification_rep = classification_report(y_val, y_pred_binary)
print(classification_rep)


              precision    recall  f1-score   support

           0       0.79      0.81      0.80       902
           1       0.82      0.79      0.81       943

    accuracy                           0.80      1845
   macro avg       0.80      0.80      0.80      1845
weighted avg       0.80      0.80      0.80      1845

