# Titanic: training a Gradient Boosting Classifier



https://www.kaggle.com/competitions/titanic


The goal of this notebook is to train a **Gradient Boosting** model to solve the famous **Titanic Competition** on the Kaggle platform. In this particular project, the trained model is used to make predictions in our *streamlit app* (refer to ```titanic_streamlit/main_app/streamlit_app.py```).

https://www.kaggle.com/code/marcpaulo/titanic-playground-for-new-kagglers-0-78

In another notebook named ```titanic_streamlit/notebooks/training_playground.ipynb``` you can find an exhaustive exploration of the *Titanic Classification problem*. It includes in-depth analysis and testing: *Exploratory Data Analysis*, *Data Preprocessing* with *Sklearn Pipelines*, *Hyperparameter Optimization*, and *Model Selection*.

The notebook presented here is a short version of the aforementioned ```training_playground```. Here, we directly train the *Gradient Boosting Classifier* (the best model found) and save it with *pickle* for future use in ```titanic_streamlit/main_app/streamlit_app.py```.

**P.S.** The model trained here achieves a **0.78468 score in the Kaggle competition**, occupying position *1777/14731*, **TOP 13%** (last update: 07/09/2023).

In [None]:
# Notebook-wide settings.
# Replace <YOUR_PATH_HERE> with the directory in which the
# 'titanic_streamlit' folder is located.

notebook_config = dict(
    your_project_path='<YOUR_PATH_HERE>/titanic_streamlit',  # TODO: fill this!!!

    # model / cross-validation settings
    random_state=12345,  # seed for the GradientBoostingClassifier
    n_jobs=1,            # passed to cross_val_score
    cv=10,               # passed to cross_val_score

    # persistence settings
    save_model=True,
    model_file_name='trained_grad_boost.pkl',  # file the model is pickled to
    run_sanity_check=True,  # try to load the model after it's saved
)

In [None]:
import os
import pickle

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the training set from <your_project_path>/data/train.csv
data_path = f"{notebook_config['your_project_path']}/data/train.csv"
df_train = pd.read_csv(data_path)

In [None]:
# Normalise the header: every column name becomes lowercase
df_train.columns = df_train.columns.map(str.lower)

# 1. Exploratory Data Analysis & Preprocessing

In [None]:
# Peek at the first five rows
df_train.head(n=5)

In [None]:
# Dataset dimensions, printed as a (rows, columns) tuple
n_rows, n_cols = df_train.shape
print((n_rows, n_cols))

In [None]:
# These four columns are not used as model features
df_train = df_train.drop(
    labels=['passengerid', 'name', 'cabin', 'ticket'],
    axis='columns',
)

In [None]:
# The 'fare' column contains some outliers:
# cap every value at a maximum of 300 (missing values stay missing)
df_train['fare'] = df_train['fare'].clip(upper=300)

In [None]:
# Engineered feature: num_relatives = sibsp + parch
df_train = df_train.assign(
    num_relatives=df_train['sibsp'].add(df_train['parch'])
)

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Features that need two chained transformations (impute, then scale or
# encode) get their own small Pipeline. Features that need a single
# transformation are wired directly into the final 'preprocessor' below.

# age: fill missing values with the median, then scale to [0, 1]
age_pipe = Pipeline([
    ('age_imp', SimpleImputer(strategy='median')),
    ('age_scale', MinMaxScaler()),
])

# fare: fill missing values with the mean, then scale to [0, 1]
fare_pipe = Pipeline([
    ('fare_imp', SimpleImputer(strategy='mean')),
    ('fare_scale', MinMaxScaler()),
])

# embarked: fill missing values with the most frequent value, then one-hot
# encode keeping every category (drop=None)
embarked_pipe = Pipeline([
    ('embarked_imp', SimpleImputer(strategy='most_frequent')),
    ('embarked_onehot', OneHotEncoder(drop=None)),
])

# The final 'preprocessor': each entry is (name, transformer, columns).
# The step names are kept stable because they become part of the saved model.
preprocessor = ColumnTransformer([
    ('age_pipe', age_pipe, ['age']),
    ('fare_pipe', fare_pipe, ['fare']),
    ('embarked_pipe', embarked_pipe, ['embarked']),
    ('minmax_scaler', MinMaxScaler(), ['sibsp', 'parch', 'num_relatives']),
    ('pclass_onehot', OneHotEncoder(drop=None), ['pclass']),
    # drop='first' removes the first 'sex' category from the encoded output
    ('sex_onehot', OneHotEncoder(drop='first'), ['sex']),
])

# 2. Gradient Boosting Classifier

In [None]:
# Split off the target ('survived', values in [0,1]) as a NumPy array,
# then remove it from the feature frame
y_train = df_train['survived'].to_numpy()
df_train = df_train.drop('survived', axis='columns')

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score


# Full model: preprocessing followed by the Gradient Boosting classifier
grad_boost = Pipeline([
    ('preprocessor', preprocessor),
    ('grad_boost', GradientBoostingClassifier(
        random_state=notebook_config['random_state'],
        learning_rate=0.1,
        n_estimators=100,
    )),
])

# Cross-validated accuracy; the number of folds and the parallelism come
# from notebook_config. The result holds one accuracy value per fold.
grad_boost_acc = cross_val_score(
    grad_boost,
    df_train,
    y_train,
    scoring='accuracy',
    cv=notebook_config['cv'],
    n_jobs=notebook_config['n_jobs'],
)

# Summarise the per-fold accuracies
print('best GradientBoosting acc (mean) =', round(grad_boost_acc.mean(), 2))
print('best GradientBoosting acc (std)  =', round(grad_boost_acc.std(), 2))

In [None]:
# Build a fresh pipeline with the same settings as the cross-validated one
# and fit it on the whole training set; this is the model that gets saved.
grad_boost = Pipeline([
    ('preprocessor', preprocessor),
    ('grad_boost', GradientBoostingClassifier(
        random_state=notebook_config['random_state'],
        learning_rate=0.1,
        n_estimators=100,
    )),
])

grad_boost.fit(X=df_train, y=y_train)

In [None]:
# Save the trained model

# Destination file: <your_project_path>/models/<model_file_name>
save_model_path = (
    notebook_config['your_project_path'] +
    '/models/' +
    notebook_config['model_file_name']
)

if notebook_config['save_model']:

    # Create the 'models' folder if it is missing; otherwise open() below
    # raises FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(save_model_path), exist_ok=True)

    # Serialise the whole fitted pipeline (preprocessing + classifier)
    with open(save_model_path, 'wb') as out_file:
        pickle.dump(grad_boost, out_file)
    print(f"Grad Boost model saved in:\n'{save_model_path}'")

else:

    print('According to the notebook_config, the model is NOT saved')

In [None]:
# SANITY CHECK: Load the model
#
# The check reads whatever file is at 'save_model_path'. When
# notebook_config['save_model'] is False nothing was written by this run,
# so the file may be missing (handled below) or come from an earlier run.

if not notebook_config['run_sanity_check']:

    print('According to the notebook_config, do NOT run Sanity Check')

elif not os.path.isfile(save_model_path):

    # Explicit message instead of an unhandled FileNotFoundError from open()
    print(f"Sanity Check skipped, no saved model found in:\n'{save_model_path}'")

else:

    # NOTE: pickle.load can execute arbitrary code; only load model files
    # that you created yourself or otherwise trust.
    with open(save_model_path, 'rb') as in_file:
        loaded_model = pickle.load(in_file)

    print(f"Grad Boost model loaded from:\n'{save_model_path}'")
    # Accuracy on the training data itself: confirms the reloaded pipeline
    # can predict, not how well it generalises.
    print('train score:', loaded_model.score(df_train, y_train))