# Titanic - XGBoost with GridSearch

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV, cross_val_score
from xgboost import XGBClassifier

pd.set_option('display.max_columns', None)

In [2]:
# Read the training and test splits from the local data/ directory.
train, test = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

def _encode_title(names):
    """Turn a Series of passenger names into integer title codes.

    The title is the first run of letters that follows a space and is
    terminated by a period (e.g. ``'Mr'`` in ``'Smith, Mr. John'``).

    Codes: 0 = Mr, 1 = Miss (incl. Mlle, Ms), 2 = Mrs (incl. Mme),
    3 = Master, 4 = Rare.
    """
    common_codes = {
        'Mr': 0,
        'Miss': 1, 'Mlle': 1, 'Ms': 1,
        'Mrs': 2, 'Mme': 2,
        'Master': 3,
    }
    rare_code = 4

    titles = names.str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Every other title ('Dr', 'Rev', 'Col', ...) and any name without an
    # extractable title lands in the 'Rare' bucket. Falling back to 0 here
    # would silently relabel those passengers as 'Mr'.
    return titles.map(common_codes).fillna(rare_code).astype(int)


def preprocess(df):
    """Return a cleaned, numerically encoded copy of a passenger frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Age``, ``Embarked``, ``Fare``, ``Name``,
        ``SibSp``, ``Parch`` and ``Sex``. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which

        * missing ``Age`` / ``Fare`` are filled with the column median and
          missing ``Embarked`` with the column mode (all computed on ``df``);
        * ``Title`` (int) is derived from ``Name`` -- see ``_encode_title``;
        * ``FamilySize`` = ``SibSp + Parch + 1`` and ``IsAlone`` is 1 when
          ``FamilySize == 1``, else 0;
        * ``Sex`` is mapped male -> 0, female -> 1 and ``Embarked`` is mapped
          S -> 0, C -> 1, Q -> 2 (values outside these maps become NaN).
    """
    result = df.copy()

    # Impute missing values from the frame's own statistics.
    result['Age'] = result['Age'].fillna(result['Age'].median())
    result['Fare'] = result['Fare'].fillna(result['Fare'].median())
    embarked_modes = result['Embarked'].mode()
    # mode() is empty when every Embarked value is missing; indexing it
    # would raise, so leave the column unfilled in that case.
    if not embarked_modes.empty:
        result['Embarked'] = result['Embarked'].fillna(embarked_modes.iloc[0])

    result['Title'] = _encode_title(result['Name'])

    result['FamilySize'] = result['SibSp'] + result['Parch'] + 1
    result['IsAlone'] = (result['FamilySize'] == 1).astype(int)

    result['Sex'] = result['Sex'].map({'male': 0, 'female': 1})
    result['Embarked'] = result['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    return result

train_processed = preprocess(train)

# Fill the test split's gaps with statistics taken from the training split
# before preprocessing it. preprocess() derives its fill values from whichever
# frame it receives, so without this step train and test would be imputed
# with different medians/modes.
train_fill_values = {
    'Age': train['Age'].median(),
    'Fare': train['Fare'].median(),
    'Embarked': train['Embarked'].mode()[0],
}
test_processed = preprocess(test.fillna(value=train_fill_values))

# Columns fed to the model, in a fixed order shared by train and test.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
            'Title', 'FamilySize', 'IsAlone']

X = train_processed[features]
y = train_processed['Survived']
X_test = test_processed[features]

## GridSearch

In [3]:
# Search space: 3 * 4 * 3 * 2 = 72 candidate combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 4, 5, 6],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.8, 1.0],
)

# Base estimator with a fixed seed and library logging silenced.
xgb = XGBClassifier(verbosity=0, random_state=42)

# Exhaustive search over param_grid, scored by accuracy with 5-fold CV.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X, y)

print(f"Best CV score: {grid_search.best_score_:.4f}")
print(f"Best params: {grid_search.best_params_}")

Best CV score: 0.8451
Best params: {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 200, 'subsample': 0.8}


In [4]:
# Top 10 combinations
results = pd.DataFrame(grid_search.cv_results_)
results[['params', 'mean_test_score', 'std_test_score']].sort_values(
    'mean_test_score', ascending=False
).head(10)

Unnamed: 0,params,mean_test_score,std_test_score
34,"{'learning_rate': 0.05, 'max_depth': 4, 'n_est...",0.845126,0.024969
59,"{'learning_rate': 0.1, 'max_depth': 4, 'n_esti...",0.844009,0.030574
28,"{'learning_rate': 0.05, 'max_depth': 3, 'n_est...",0.844009,0.017781
35,"{'learning_rate': 0.05, 'max_depth': 4, 'n_est...",0.844009,0.02776
40,"{'learning_rate': 0.05, 'max_depth': 5, 'n_est...",0.84399,0.024496
57,"{'learning_rate': 0.1, 'max_depth': 4, 'n_esti...",0.842891,0.028312
63,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",0.841761,0.021351
58,"{'learning_rate': 0.1, 'max_depth': 4, 'n_esti...",0.841749,0.021741
53,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.840631,0.027179
62,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",0.840625,0.019678


## Submit Best Model

In [5]:
# Predict on the test features with the estimator the grid search selected.
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)

# Pair each prediction with its passenger id, then write it out without the
# DataFrame index.
submission_columns = {
    'PassengerId': test['PassengerId'],
    'Survived': predictions,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission_xgb_tuned.csv', index=False)

print(f"Submission saved: {submission.shape}")

Submission saved: (418, 2)
