Import libraries

In [None]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import ppscore as pps
import seaborn as sns
from sklearn.svm import SVC
from lightgbm import LGBMClassifier, plot_importance as plot_importance_lgbm
from xgboost import XGBClassifier, plot_importance as plot_importance_xgb
from sklearn.model_selection import StratifiedShuffleSplit, RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss, precision_recall_curve, average_precision_score, roc_curve, roc_auc_score

# 1 - Load data

In [None]:
# Both CSV files are addressed by relative path, i.e. resolved against the working directory.
train_path, test_path = "titanic-train.csv", "titanic-test.csv"
train_data = pd.read_csv(train_path)  # labelled rows (contains the Survived column)
test_data = pd.read_csv(test_path)  # no ground truth available

# 2 - Data exploration

In [None]:
# Preview the first five rows of the training frame
train_data.head(n=5)

PassengerId and Name are irrelevant as features.

Each column and its value counts:

In [None]:
# Frequency table for every column except the identifier, the name and the label
explored = train_data.drop(columns=['PassengerId', 'Name', 'Survived'])
for _, values in explored.items():
    print(values.value_counts(), '\n')

In [None]:
# Cardinality of every column; `unique()` keeps NaN, so a missing value counts as one value
template = "Number of unique values in {}: {}"
for name, values in train_data.items():
    print(template.format(name, len(values.unique())))

In [None]:
# Count missing entries per column, then report only the columns that have any
missing_values = train_data.isna().sum()
columns_with_gaps = missing_values[missing_values > 0]
row_count = train_data.shape[0]

print("Missing values per column:\n", columns_with_gaps)
print("\n Percentage missing values per column:\n", columns_with_gaps / row_count * 100)

In [None]:
# Summary statistics of the Age column
train_data["Age"].describe()

In [None]:
# Summary statistics of the Pclass column
train_data["Pclass"].describe()

In [None]:
# Summary statistics of the Embarked column
train_data["Embarked"].describe()

In [None]:
# Summary statistics of the Fare column
train_data["Fare"].describe()

We can see:
- 3 types of Pclass
- 2 types of Sex
- SibSp takes the values 0–5 and 8
- Parch takes the values 0–6
- 3 types of Embarked (mostly S) + null
- A lot of unique Ticket values
- Lots of Cabin info missing
- Age and Fare have high variance

In [None]:
# Long-format PPS table -> square matrix (rows: target y, columns: predictor x), rounded to 2 decimals
pps_long = pps.matrix(train_data)[['x', 'y', 'ppscore']]
matrix_df = pps_long.pivot(index='y', columns='x', values='ppscore').round(2)

sns.heatmap(matrix_df, vmin=0, vmax=1, cmap="Blues", linewidths=0.75, annot=True)

Looking at this PPS matrix, we can see: 
- Best univariate predictors of Survived: Ticket (0.19), Sex (0.13), Fare (0.09)

Also: Cabin, Fare, Pclass and Ticket are probably related to each other in some way.

# 3 - Feature engineering

In [None]:
# Features we will use: every column except the ones listed here.
# Survived is the label and is kept separately as `target`.
excluded_columns = ["Name", "Cabin", "Ticket", "Survived", "PassengerId"]
train_data_sub = train_data.drop(columns=excluded_columns)
target = train_data["Survived"]

train_data_sub.columns

In [None]:
# Bin the numeric Age and Fare columns into categorical AgeCat / FareCat columns.
#
# Bins are half-open [lower, upper), so every value lands in exactly one category.
# Missing values stay missing here; the preprocessing pipeline imputes them later.
# Each band is defined on its own column only: the fare bands must never look at Age,
# otherwise mid-range fares would be categorised by the passenger's age and rows with
# a missing age would get no FareCat at all.
age_bins = [-np.inf, 18, 65, np.inf]
age_labels = ['child', 'adult', 'senior']

fare_bins = [-np.inf, 10, 20, 30, 50, np.inf]
fare_labels = ['cheap', 'fair', 'medium', 'pricy', 'expensive']

# pd.cut returns a `category` dtype; cast back to object so that the later
# `select_dtypes(include=['object'])` still recognises these columns as categorical.
age_category = pd.cut(train_data_sub.Age, bins=age_bins, labels=age_labels, right=False).astype(object)
fare_category = pd.cut(train_data_sub.Fare, bins=fare_bins, labels=fare_labels, right=False).astype(object)

# Add the categorical columns and drop the original numeric Age and Fare columns
train_data_sub = (
    train_data_sub
    .assign(AgeCat=age_category, FareCat=fare_category)
    .drop(["Age", "Fare"], axis=1)
)

In [None]:
# Split the feature names by dtype: object columns are treated as categorical,
# everything else as numerical.
object_features = train_data_sub.select_dtypes(include='object')
other_features = train_data_sub.select_dtypes(exclude='object')

categorical_columns = object_features.columns
numerical_columns = other_features.columns

# 4 - Pipelines and hyper parameter optimization

In [None]:
# Evaluates model's performance on the test set (which will be created in model_best_pipe)
def model_eval_test(best_model_pipeline, X_test, y_test):
    """Score the refitted best estimator on held-out data and print a short report.

    Parameters
    ----------
    best_model_pipeline : fitted search object exposing ``best_estimator_``.
    X_test : features of the held-out rows.
    y_test : true labels of the held-out rows.

    Returns
    -------
    list
        ``[accuracy, precision, recall, f1]`` (unrounded), in that order.
    """
    y_pred = best_model_pipeline.best_estimator_.predict(X_test)

    # (report label, metric function) pairs, in the order they are returned and printed
    scorers = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1-Score:", f1_score),
    )
    results = [metric(y_test, y_pred) for _, metric in scorers]

    print("\n\nTest set results using best classifier:")
    for (label, _), value in zip(scorers, results):
        print(label, round(value, 3))

    return results

In [None]:
# Numerical preprocessing: two alternative imputation strategies (mean vs. most frequent)
numerical_transformer_1, numerical_transformer_2 = (
    SimpleImputer(strategy=strategy) for strategy in ('mean', 'most_frequent')
)

# Categorical preprocessing: fill gaps with the most frequent value, then one-hot encode.
# Categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer_1 = Pipeline(steps=categorical_steps)

# One ColumnTransformer per numerical strategy; the categorical branch is shared
data_transformer_1, data_transformer_2 = (
    ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_columns),
            ('cat', categorical_transformer_1, categorical_columns),
        ])
    for numerical_transformer in (numerical_transformer_1, numerical_transformer_2)
)

In [None]:
def _build_data_transformer(numerical_strategy, numerical_columns, categorical_columns):
    """Build a ColumnTransformer: impute numerical columns with `numerical_strategy`;
    impute (most frequent) and one-hot encode categorical columns."""
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    return ColumnTransformer(
        transformers=[
            ('num', SimpleImputer(strategy=numerical_strategy), numerical_columns),
            ('cat', categorical_transformer, categorical_columns)
        ])


def model_best_pipe(data, target, numerical_columns, categorical_columns,
                    test_size=0.10, n_iter=50, random_state=1):
    """Hold out a test split, then run a randomized search over preprocessors and classifiers.

    The preprocessing candidates are built from `numerical_columns` and
    `categorical_columns`, so the function does not depend on transformer
    objects defined in other notebook cells.

    Parameters
    ----------
    data : feature frame.
    target : labels aligned with `data`.
    numerical_columns : columns imputed as numerical features.
    categorical_columns : columns imputed and one-hot encoded.
    test_size : fraction of rows held out before the search (default 0.10).
    n_iter : number of parameter settings sampled by the search (default 50).
    random_state : seed used for the hold-out split, the CV splitter and the search (default 1).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, best_model_pipeline)`` where
        ``best_model_pipeline`` is the fitted ``RandomizedSearchCV`` object.
    """
    # Hold out a test split that the search never sees (avoids information leakage)
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=test_size, random_state=random_state)

    # Candidate preprocessors: numerical imputation by mean vs. by most frequent value
    data_transformers = [
        _build_data_transformer(strategy, numerical_columns, categorical_columns)
        for strategy in ('mean', 'most_frequent')
    ]

    # The steps given here are placeholders; the search overrides both of them
    pipeline = Pipeline(steps=[('data_transformer', data_transformers[0]),
                               ('clf', SVC())])

    param_grid = [
                    # NOTE(review): clf__random_state is searched like a hyper-parameter, which
                    # selects a seed that happens to suit the CV folds; consider fixing it instead.
                    {'data_transformer': data_transformers,
                     'clf': [RandomForestClassifier()],
                     'clf__n_estimators': [int(x) for x in np.linspace(5, 30, num=15)],
                     'clf__max_features': [None, "sqrt", "log2"],
                     'clf__max_depth': [int(x) for x in np.linspace(3, 10, num=5)],
                     'clf__random_state': [int(x) for x in np.linspace(1, 49, num=30)]},

                    {'data_transformer': data_transformers,
                     'clf': [XGBClassifier(verbosity=0, use_label_encoder=False)],
                     'clf__n_estimators': [int(x) for x in np.linspace(3, 15, num=10)],
                     'clf__eta': np.linspace(0.1, 0.9), # learning rate
                     'clf__max_depth': [int(x) for x in np.linspace(2, 7, num=5)],
                     'clf__gamma': np.linspace(0.1, 1), # min loss reduction required to make further partition on leaf node of tree
                     'clf__lambda': np.linspace(0.1, 1)} # L2 regularization term on weight
                ]

    # All metrics are recorded in cv_results_; 'accuracy' alone picks and refits the winner
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'average_precision', 'roc_auc']

    # Stratified 80/20 shuffle splits keep the class ratio in every fold
    cross_validator = StratifiedShuffleSplit(n_splits=5, train_size=0.8, test_size=0.2,
                                             random_state=random_state)

    # Creating the randomized search cv object and fitting it
    best_model_pipeline = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid,
                                             n_iter=n_iter, scoring=metrics, refit='accuracy',
                                             n_jobs=-1, cv=cross_validator, random_state=random_state)

    best_model_pipeline.fit(X_train, y_train)

    # Results
    print("Best Data Pipeline: \n{}".format(best_model_pipeline.best_estimator_[0]))
    print("\nBest Classifier: \n {}".format(best_model_pipeline.best_estimator_[1]))
    print("\nMean cross-validated score of the best_estimator: \n {}".format(best_model_pipeline.best_score_))

    return X_train, X_test, y_train, y_test, best_model_pipeline

In [None]:
# Run the search: returns the train/test split it made plus the fitted search object
X_train, X_test, y_train, y_test, best_model_pipeline = model_best_pipe(
    data=train_data_sub,
    target=target,
    numerical_columns=numerical_columns,
    categorical_columns=categorical_columns,
)

# Score the refitted best estimator on the held-out split
test_set_results = model_eval_test(best_model_pipeline, X_test=X_test, y_test=y_test)