In [381]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import numpy as np
from statistics import mode
import re
from pycaret.classification import *

# Read the training and test CSV files (paths are relative to the working directory).
# X holds the rows from train.csv, X_test_full the rows from test.csv.
X = pd.read_csv('data/train.csv')
X_test_full = pd.read_csv('data/test.csv')

In [382]:
# Stack the train and test rows into one frame so every cleaning step below is
# applied to both at once. Each frame keeps its own row labels, so index values
# repeat in the combined frame.
frames_to_stack = [X, X_test_full]
full = pd.concat(frames_to_stack)
full.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [383]:
# Fill missing embarkation ports with the most frequent port (the mode).
# Series.mode() ignores NaN by default and never raises on ties, unlike
# statistics.mode applied to a column that still contains missing values.
# For this data the mode was observed to be 'S'.
most_common_port = full["Embarked"].mode().iloc[0]
full["Embarked"] = full["Embarked"].fillna(most_common_port)

In [384]:
 # Passengers without a recorded cabin get the placeholder value 'U'
full['Cabin'] = full['Cabin'].fillna(value='U')

In [385]:
# The leading letters of a cabin code seem to refer to a specific part of the boat,
# so each Cabin value is reduced to the first run of letters it contains.
def _leading_letters(cabin_code):
    """Return the first run of ASCII letters found in ``cabin_code``."""
    return re.search("([a-zA-Z]+)", cabin_code).group()


full['Cabin'] = full['Cabin'].map(_leading_letters)

In [386]:
# Author's rationale: Pclass has the highest correlation with Age, so a missing
# age is replaced by the median age of the passenger's class.
ages_by_class = full.groupby("Pclass")['Age']
full['Age'] = ages_by_class.transform(lambda ages: ages.fillna(ages.median()))

In [387]:
# Author's rationale: Pclass has the highest correlation with Fare, so a missing
# fare is replaced by the median fare of the passenger's class.
fares_by_class = full.groupby("Pclass")['Fare']
full['Fare'] = fares_by_class.transform(lambda fares: fares.fillna(fares.median()))

In [388]:
# List the distinct cabin initials, in order of first appearance
distinct_cabins = full['Cabin'].unique()
list(distinct_cabins)

In [389]:
# Extract the salutation from each name: a run of letters that follows a space
# and ends with a '.' (e.g. "Mr." in "Braund, Mr. Owen Harris").
# The pattern is a raw string so that `\.` reaches the regex engine as an escaped
# dot instead of being an invalid Python string escape.
full['Title'] = full.Name.str.extract(r' ([A-Za-z]+)\.', expand = False)
full['Title'].unique().tolist()

In [390]:
# SibSp counts siblings / spouses aboard and Parch counts parents / children aboard.
# Family size is their sum plus one for the passenger themselves.
relatives_aboard = full['SibSp'].add(full['Parch'])
full['familySize'] = relatives_aboard.add(1)

In [391]:
# Name, SibSp and Parch have been condensed into Title and familySize above;
# they are removed here together with Ticket.
columns_to_drop = ['Name', 'SibSp', 'Parch', 'Ticket']
full = full.drop(columns=columns_to_drop)

In [392]:
# One-hot encode the frame (pandas' default column selection is used)
full_ohe = pd.get_dummies(data=full)



In [393]:
# Split the combined frame back into train and test: rows with a Survived value
# are training rows, rows without one are test rows (Survived is dropped from those).
has_label = full_ohe['Survived'].notna()
train = full_ohe[has_label]
test = full_ohe[~has_label].drop(columns=['Survived'])

In [394]:
# Convert Survived from float back to a small integer type; it became float when
# train and test were concatenated (test rows have no Survived value).
# `train` is a filtered selection of `full_ohe`, so assigning through
# `.loc[:, 'Survived']` raises SettingWithCopyWarning and, on recent pandas, sets
# the values in place while keeping the float dtype. Rebinding `train` to the
# result of `astype` performs the cast reliably on a fresh frame.
train = train.astype({'Survived': np.int8})

In [395]:
# Create the PyCaret transformation pipeline for the training frame.
titanicSetup = setup(
    train,
    target='Survived',
    session_id=123,                    # fixed seed so runs are repeatable
    # preprocessing / feature-generation switches
    normalize=True,
    polynomial_features=True,
    trigonometry_features=True,
    feature_interaction=True,
    bin_numeric_features=['Age'],
    # explicit column roles
    ignore_features=['PassengerId'],
    categorical_features=['Pclass'],
    numeric_features=['familySize'],
)

 
Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,123
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(891, 38)"
4,Missing Values,False
5,Numeric Features,36
6,Categorical Features,1
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


In [396]:
# Score the available classifiers and display the comparison table shown below
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,CatBoost Classifier,0.8427,0.8638,0.7696,0.8143,0.7894,0.6641
1,Gradient Boosting Classifier,0.8395,0.8565,0.7656,0.8075,0.7828,0.6562
2,Linear Discriminant Analysis,0.8363,0.8541,0.7194,0.833,0.7686,0.6436
3,Ridge Classifier,0.8347,0.0,0.7111,0.8346,0.7645,0.6391
4,Extreme Gradient Boosting,0.833,0.8594,0.7572,0.7989,0.7756,0.6431
5,Logistic Regression,0.8315,0.8592,0.7364,0.8131,0.768,0.6369
6,K Neighbors Classifier,0.8217,0.841,0.7529,0.78,0.7638,0.6209
7,Extra Trees Classifier,0.8171,0.8406,0.7489,0.7715,0.7588,0.6116
8,Ada Boost Classifier,0.8121,0.8362,0.7317,0.7681,0.7472,0.5981
9,Light Gradient Boosting Machine,0.8106,0.8498,0.7322,0.7718,0.7478,0.5967


In [397]:
# CatBoost Classifier has the highest accuracy in the comparison table, so we'll use it
my_model = create_model('catboost')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.8413,0.0,0.75,0.8182,0.7826,0.658
1,0.8571,0.0,0.75,0.8571,0.8,0.6897
2,0.7937,0.0,0.6667,0.7619,0.7111,0.5517
3,0.8387,0.0,0.6522,0.8824,0.75,0.6349
4,0.7903,0.0,0.5417,0.8667,0.6667,0.5253
5,0.8065,0.0,0.6667,0.8,0.7273,0.5792
6,0.9194,0.0,0.875,0.913,0.8936,0.8287
7,0.8548,0.0,0.75,0.8571,0.8,0.6869
8,0.7903,0.0,0.625,0.7895,0.6977,0.5405
9,0.8548,0.0,0.8333,0.8,0.8163,0.6964


In [398]:
# Tune the CatBoost estimator, selected by its 'catboost' ID string.
# NOTE(review): `my_model` created in the previous cell is not referenced here or
# anywhere later in the notebook — confirm whether it is still needed.
tuned_model = tune_model('catboost')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.8413,0.0,0.75,0.8182,0.7826,0.658
1,0.8571,0.0,0.75,0.8571,0.8,0.6897
2,0.7937,0.0,0.6667,0.7619,0.7111,0.5517
3,0.8387,0.0,0.6522,0.8824,0.75,0.6349
4,0.7903,0.0,0.5417,0.8667,0.6667,0.5253
5,0.8065,0.0,0.6667,0.8,0.7273,0.5792
6,0.9194,0.0,0.875,0.913,0.8936,0.8287
7,0.8548,0.0,0.75,0.8571,0.8,0.6869
8,0.7903,0.0,0.625,0.7895,0.6977,0.5405
9,0.8548,0.0,0.8333,0.8,0.8163,0.6964


In [369]:
# Plot a summary interpretation of the tuned model.
# NOTE(review): the recorded output is a "only supports tree based models" error and
# this cell's execution count (369) predates its neighbours, so it appears to come
# from an earlier run with a different estimator — re-run after a kernel restart
# and confirm it succeeds.
interpret_model(tuned_model, plot = 'summary')

SystemExit: (Type Error): This function only supports tree based models for binary classification.

In [399]:
# Score the tuned model; no `data` argument is given, and the trailing ';'
# suppresses display of the returned value.
predict_model(tuned_model);

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Ridge Classifier,0.8172,0,0.6699,0.8214,0.738,0.5998


In [400]:
# Finalize the model: finalize_model returns a model that has been trained on the
# entire dataset.
titanic_final_model = finalize_model(tuned_model)

In [401]:
# Show the hyperparameters of the finalized model
titanic_final_model.get_params()

{'alpha': 0.895,
 'class_weight': None,
 'copy_X': True,
 'fit_intercept': False,
 'max_iter': None,
 'normalize': True,
 'random_state': 123,
 'solver': 'auto',
 'tol': 0.001}

In [402]:
# Score the finalized model; no `data` argument is passed.
# NOTE(review): the recorded output has 267 rows (index 0-266), not all 891 training
# rows, so this looks like a hold-out subset rather than the full dataset. If so, the
# finalized model has already been trained on these rows and the metrics are
# optimistic — confirm against the PyCaret documentation.
predict_model(titanic_final_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Ridge Classifier,0.8396,0,0.7087,0.8488,0.7725,0.6501


Unnamed: 0,Fare,familySize,Sex_female,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,...,Fare_Power2_multiply_Fare,cos(Fare)_multiply_Sex_female,Age_Power2_multiply_Title_Mr,Fare_Power2_multiply_cos(Fare),Fare_multiply_Sex_female,Title_Mrs_multiply_Sex_female,Title_Mr_multiply_Fare,Pclass_2_multiply_Sex_female,Survived,Label
0,-0.386671,-0.560975,0,0,0,0,0,0,0,0,...,0.074465,0.000000,-0.137488,-0.264348,-0.000000,0,-0.386671,0.0,0,0
1,0.395814,0.059160,1,0,0,0,1,0,0,0,...,-0.018608,-0.115248,0.000000,0.005418,0.395814,1,0.000000,0.0,1,1
2,-0.061999,2.539699,0,0,0,0,0,0,0,0,...,0.009508,-0.000000,-0.000000,0.172436,-0.000000,0,-0.000000,0.0,0,0
3,-0.486337,-0.560975,0,0,0,0,0,0,0,0,...,0.096586,-0.000000,-0.493304,0.076232,-0.000000,0,-0.486337,0.0,0,0
4,0.831478,0.679295,0,0,0,0,0,0,0,0,...,0.091159,-0.000000,-0.641560,-0.064245,0.000000,0,0.831478,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,-0.486337,-0.560975,0,0,0,0,0,1,0,0,...,0.096586,-0.000000,-0.001312,0.076232,-0.000000,0,-0.486337,0.0,1,0
264,0.660333,1.299429,1,0,0,0,0,0,0,0,...,0.027503,-0.969806,-0.000000,-0.040392,0.660333,0,0.000000,1.0,1,1
265,-0.364355,0.679295,0,0,0,0,0,0,0,0,...,0.069536,-0.000000,0.720203,0.005242,-0.000000,0,-0.364355,0.0,0,0
266,2.042579,0.679295,0,0,0,0,0,0,0,0,...,1.693695,-0.000000,0.000000,-0.235249,0.000000,0,0.000000,0.0,1,0


In [403]:
# Generate predictions for the unlabelled test rows with the finalized model
predictions = predict_model(titanic_final_model, data = test)

In [404]:
# predict_model appends its output to the test frame: 'Label' is the predicted class.
# NOTE(review): the original notes also mention a 'Score' (probability) column, but
# it is absent from the recorded output below — confirm whether the chosen model
# produces one.
predictions

Unnamed: 0,PassengerId,Pclass,Age,Fare,familySize,Sex_female,Sex_male,Cabin_A,Cabin_B,Cabin_C,...,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir,Label
0,892,3,34.5,7.8292,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,893,3,47.0,7.0000,2,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,894,2,62.0,9.6875,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,895,3,27.0,8.6625,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,896,3,22.0,12.2875,3,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,24.0,8.0500,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
414,1306,1,39.0,108.9000,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
415,1307,3,38.5,7.2500,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
416,1308,3,24.0,8.0500,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [405]:
# Keep only the passenger identifier and the predicted label for the submission
submission_source_columns = ['PassengerId', 'Label']
submit = predictions[submission_source_columns]

In [406]:
# Rename the prediction column to 'Survived'.
# A flat list is assigned on purpose: a nested list ([[...]]) would make pandas
# build a MultiIndex for the columns instead of plain column names.
submit.columns = ['PassengerId', 'Survived']

In [347]:
# Inspect the submission frame before writing it out.
# NOTE(review): this cell's execution count (347) is out of sequence with its
# neighbours, so the recorded output may come from an earlier run.
submit

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [407]:
# Write the submission file; index = False keeps the row index out of the CSV
submission_path = 'data/submission_pycaret_FE_catboost.csv'
submit.to_csv(submission_path, index = False)