In [19]:
import os

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV

tqdm.pandas()

# <div style="font-family: 'Garamond', serif; font-size: 22px; color: #ffffff; background-color: #34568B; text-align: center; padding: 15px; border-radius: 10px; border: 2px solid #FF6F61; box-shadow: 0 6px 12px rgba(0, 0, 0, 0.3); margin-bottom: 20px;">Step 1: Load the data</div>

## <div style="font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;">Dataset Metadata</div>

# Overview

The data has been split into two groups:

- **Training set** (`train.csv`)
- **Test set** (`test.csv`)

The training set should be used to build your machine learning models. For the training set, we provide the outcome (also known as the **“ground truth”**) for each passenger. Your model will be based on **features** like passengers’ gender and class. You can also use **feature engineering** to create new features.

The test set should be used to see how well your model performs on **unseen data**. For the test set, we do not provide the ground truth for each passenger. It is your job to **predict these outcomes**. For each passenger in the test set, use the model you trained to predict whether or not they survived the sinking of the Titanic.

We also include `gender_submission.csv`, a set of predictions that assume **all and only female passengers survive**, as an example of what a submission file should look like.

---

# Data Dictionary

| Variable   | Definition                             | Key                                           |
|------------|----------------------------------------|-----------------------------------------------|
| survival   | Survival                               | 0 = No, 1 = Yes                               |
| pclass     | Ticket class                           | 1 = 1st, 2 = 2nd, 3 = 3rd                      |
| sex        | Sex                                    |                                               |
| age        | Age in years                           |                                               |
| sibsp      | # of siblings / spouses aboard Titanic |                                               |
| parch      | # of parents / children aboard Titanic |                                               |
| ticket     | Ticket number                          |                                               |
| fare       | Passenger fare                         |                                               |
| cabin      | Cabin number                           |                                               |
| embarked   | Port of Embarkation                    | C = Cherbourg, Q = Queenstown, S = Southampton |

---

# Variable Notes

- **pclass**: A proxy for socio-economic status (SES)
  - 1st = Upper
  - 2nd = Middle
  - 3rd = Lower

- **age**: Age is fractional if less than 1. If the age is estimated, it is in the form of `xx.5`

- **sibsp**: The dataset defines family relations in this way:
  - **Sibling** = brother, sister, stepbrother, stepsister
  - **Spouse** = husband, wife (mistresses and fiancés were ignored)

- **parch**: The dataset defines family relations in this way:
  - **Parent** = mother, father
  - **Child** = daughter, son, stepdaughter, stepson
  - Some children travelled only with a nanny, therefore `parch = 0` for them.

## <div style="font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;">Load the data and display head</div>

In [6]:
# Check environment if in Kaggle
extract_folder_path = '/kaggle/input/titanic' if os.path.exists('/kaggle/input/titanic') else './data'

# Load the training data
train_file_path = os.path.join(extract_folder_path, 'train.csv')
train_data = pd.read_csv(train_file_path)

# Displaying the first few rows of the dataset
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## <div style="font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;">See the statistical summary of the dataset</div>

In [9]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [10]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


We find out that some columns are 'object' type, which means they are categorical variables. We will need to convert them into numerical format before feeding them into the model.

# <div style="font-family: 'Garamond', serif; font-size: 22px; color: #ffffff; background-color: #34568B; text-align: center; padding: 15px; border-radius: 10px; border: 2px solid #FF6F61; box-shadow: 0 6px 12px rgba(0, 0, 0, 0.3); margin-bottom: 20px;">Step 2: Preprocess the data</div>

## <div style="font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;">Separating features and target variable</div>

In [None]:
# Separating the features and the target variable
X = train_data.drop(columns=['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin'])
y = train_data['Survived']

## <div style="font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;">Preprocessing the data</div>

For preprocessing, we will handle missing values and encode categorical variables. We will use `SimpleImputer` to fill in missing values and `OneHotEncoder` to convert categorical variables into a format that can be provided to ML algorithms to do a better job in prediction.

In [15]:
# Identifying numerical and categorical columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Check the preprocessing results
X_preprocessed = preprocessor.fit_transform(X)
X_preprocessed[0:5]

array([[ 3.    , 22.    ,  1.    ,  0.    ,  7.25  ,  0.    ,  1.    ,
         0.    ,  0.    ,  1.    ],
       [ 1.    , 38.    ,  1.    ,  0.    , 71.2833,  1.    ,  0.    ,
         1.    ,  0.    ,  0.    ],
       [ 3.    , 26.    ,  0.    ,  0.    ,  7.925 ,  1.    ,  0.    ,
         0.    ,  0.    ,  1.    ],
       [ 1.    , 35.    ,  1.    ,  0.    , 53.1   ,  1.    ,  0.    ,
         0.    ,  0.    ,  1.    ],
       [ 3.    , 35.    ,  0.    ,  0.    ,  8.05  ,  0.    ,  1.    ,
         0.    ,  0.    ,  1.    ]])

# <div style="font-family: 'Garamond', serif; font-size: 22px; color: #ffffff; background-color: #34568B; text-align: center; padding: 15px; border-radius: 10px; border: 2px solid #FF6F61; box-shadow: 0 6px 12px rgba(0, 0, 0, 0.3); margin-bottom: 20px;">Step 3: Train the model</div>

<div style="font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;">Splitting the data into training and validation sets</div>

In [16]:
# Split data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the resulting datasets
print(f"X_train shape: {X_train.shape}")
print(f"X_valid shape: {X_valid.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_valid shape: {y_valid.shape}")

X_train shape: (712, 7)
X_valid shape: (179, 7)
y_train shape: (712,)
y_valid shape: (179,)


<div style="font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;">Training the XGBoost model</div>

In [18]:
# Define the XGBoost model
xgb_model = XGBClassifier( eval_metric='logloss')

# Create and evaluate the pipeline
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', xgb_model)])

# Preprocessing of training data, fit model
xgb_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get probability predictions
valid_probs = xgb_pipeline.predict_proba(X_valid)

# Extracting the probabilities for the 'Transported' class
valid_transported_probs = valid_probs[:, 1]

# Evaluate the model using AUC
auc_score = roc_auc_score(y_valid, valid_transported_probs)
auc_score

0.8709137709137709

# <div style="font-family: 'Garamond', serif; font-size: 22px; color: #ffffff; background-color: #34568B; text-align: center; padding: 15px; border-radius: 10px; border: 2px solid #FF6F61; box-shadow: 0 6px 12px rgba(0, 0, 0, 0.3); margin-bottom: 20px;">Step 4: Randomized Search Cross Validation</div>

<div style="font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;">Hyperparameter Tuning with Randomized Search CV</div>

Sometimes, the default parameters of a model may not yield the best performance. To optimize the model's performance, we can use techniques like Randomized Search Cross-Validation to find the best hyperparameters.

In [22]:
# Define the parameter grid for XGBoost
param_grid = {
    'model__n_estimators': np.arange(50, 200, 10),
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__max_depth': np.arange(3, 10, 2),
    'model__min_child_weight': np.arange(1, 6, 2)
}

# Create the randomized search with 10 fold cross-validation
xgb_random_search = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=param_grid,
    n_iter=10,  # Number of parameter settings that are sampled
    scoring='roc_auc',
    cv=10,  # 10-fold cross-validation
    random_state=42,
    verbose=1
)

# Fit the random search model
xgb_random_search.fit(X_train, y_train)

# Best parameters and score
best_params = xgb_random_search.best_params_
best_score = xgb_random_search.best_score_

best_params, best_score

Fitting 10 folds for each of 10 candidates, totalling 100 fits


({'model__n_estimators': np.int64(160),
  'model__min_child_weight': np.int64(5),
  'model__max_depth': np.int64(5),
  'model__learning_rate': 0.01},
 np.float64(0.8576796454574233))

<div style="font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;">Evaluating the best hyperparameters performance</div>

In [23]:
xgb_model_tuned = XGBClassifier(
    n_estimators=best_params['model__n_estimators'],
    learning_rate=best_params['model__learning_rate'],
    max_depth=best_params['model__max_depth'],
    min_child_weight=best_params['model__min_child_weight'],
    eval_metric='logloss'
)

# Update the pipeline
xgb_pipeline.set_params(model=xgb_model)

# Fit the model with the training data
xgb_pipeline.fit(X_train, y_train)

# Predict probabilities on the validation set
valid_probs_tuned = xgb_pipeline.predict_proba(X_valid)

# Extracting the probabilities for the 'Survived' class
valid_survived_probs_tuned = valid_probs_tuned[:, 1]

# Evaluate the tuned model using AUC
auc_score_tuned = roc_auc_score(y_valid, valid_survived_probs_tuned)
auc_score_tuned

0.8709137709137709

<div style="font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;">Comparing AUC scores before and after tuning</div>

In [26]:
# Display the AUC scores before and after tuning
print(f"AUC Score before tuning: {auc_score:.4f}")
print(f"AUC Score after tuning: {auc_score_tuned:.4f}")

AUC Score before tuning: 0.8709
AUC Score after tuning: 0.8709


<div style="font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;">Determine the probability threshold for classification by accuracy.</div>

In [30]:
# Search for the best threshold
best_threshold = 0.5
best_accuracy = 0
thresholds = np.arange(0.1, 0.9, 0.01)

for threshold in thresholds:
    y_pred = (valid_survived_probs_tuned >= threshold).astype(int)
    accuracy = accuracy_score(y_valid, y_pred)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_threshold = threshold

print(f"Best Threshold: {best_threshold:.2f} with Accuracy: {best_accuracy:.4f}")

Best Threshold: 0.77 with Accuracy: 0.8156


# <div style="font-family: 'Garamond', serif; font-size: 22px; color: #ffffff; background-color: #34568B; text-align: center; padding: 15px; border-radius: 10px; border: 2px solid #FF6F61; box-shadow: 0 6px 12px rgba(0, 0, 0, 0.3); margin-bottom: 20px;">Step 5: Make predictions on the test set</div>

<div style="font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;">Create the submission file and check the format</div>

In [33]:
# Load the test dataset
test_data_path = os.path.join(extract_folder_path, 'test.csv')
test_data = pd.read_csv(test_data_path)

# Preprocessing of test data, make probability predictions
test_probs = xgb_pipeline.predict_proba(test_data)

# Extracting the probabilities for the 'Survived' class
test_survived_probs = test_probs[:, 1]

# Applying the best threshold to get binary predictions
test_predictions = (test_survived_probs >= best_threshold).astype(int)

# Load the sample submission file to compare formats
sample_submission_path = os.path.join(extract_folder_path, 'gender_submission.csv')
sample_submission = pd.read_csv(sample_submission_path)

# Create the submission DataFrame
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': test_predictions
})

# See if the submission format matches the sample submission
print('Submission shape:', submission.shape)
print('Sample submission shape:', sample_submission.shape)

print('submission head:')
print(submission.head())

print('sample submission head:')
print(sample_submission.head())

Submission shape: (418, 2)
Sample submission shape: (418, 2)
submission head:
   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         0
sample submission head:
   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1


<div style="font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;">Save the submission file</div>

In [35]:
# Define the file path for the output CSV file
output_csv_path = 'submission.csv'

# Save the output DataFrame to a CSV file
submission.to_csv(output_csv_path, index=False)

# Return the path of the saved file
output_csv_path

'submission.csv'