In [1]:
from pathlib import Path

import pandas as pd
from sklearn.metrics import classification_report
from pycaret.classification import *

In [2]:
# Kaggle Titanic CSVs, addressed relative to the notebook's working directory.
path_train, path_test = "dataset/titanic/train.csv", "dataset/titanic/test.csv"
# Load both splits into DataFrames (train first, then test).
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (path_train, path_test))

In [3]:
# Column names of the training frame.
df_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
# Column names of the test frame, for comparison with the training frame.
df_test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

# PyCaret magic

## Setup

In [5]:
# Initialise the PyCaret experiment on the training frame, assigning every
# non-target column an explicit role (numeric, categorical or ignored).
exp_clf = setup(
    data=df_train,
    target="Survived",
    session_id=123,  # fixed experiment seed
    numeric_features=["Age", "Fare"],
    categorical_features=["Pclass", "Sex", "SibSp", "Parch", "Embarked"],
    # Columns excluded from modelling.
    ignore_features=["PassengerId", "Name", "Cabin", "Ticket"],
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,Survived
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(891, 12)"
5,Missing Values,True
6,Numeric Features,2
7,Categorical Features,5
8,Ordinal Features,False
9,High Cardinality Features,False


## Model selection and tuning

In [6]:
# Compare PyCaret's candidate classifiers and keep the top-ranked one.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.8171,0.8593,0.7965,0.817,0.8141,0.6067,0.6114,0.374
gbc,Gradient Boosting Classifier,0.8138,0.8477,0.792,0.8152,0.8103,0.5988,0.6054,0.019
lr,Logistic Regression,0.806,0.8469,0.7931,0.806,0.8052,0.5905,0.5918,0.139
ada,Ada Boost Classifier,0.7978,0.835,0.7887,0.7986,0.798,0.5767,0.5771,0.017
ridge,Ridge Classifier,0.7947,0.0,0.7779,0.7937,0.7929,0.5633,0.5653,0.003
lda,Linear Discriminant Analysis,0.7947,0.8452,0.7779,0.7936,0.793,0.5634,0.5652,0.004
rf,Random Forest Classifier,0.7881,0.8477,0.7754,0.7915,0.7866,0.5531,0.5578,0.058
et,Extra Trees Classifier,0.7801,0.8232,0.7668,0.7846,0.7784,0.536,0.542,0.049
lightgbm,Light Gradient Boosting Machine,0.7768,0.8526,0.7596,0.7781,0.7745,0.5258,0.5299,0.01
dt,Decision Tree Classifier,0.7735,0.7644,0.7628,0.7776,0.7731,0.5262,0.5297,0.003


In [7]:
# Display the estimator that compare_models() returned.
best_model

<catboost.core.CatBoostClassifier at 0x7f206c7716a0>

In [8]:
# Tune the hyperparameters of the selected estimator.
best_model_tune = tune_model(best_model)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9048,0.8732,0.8868,0.9093,0.903,0.7955,0.803
1,0.8095,0.8653,0.7942,0.8082,0.808,0.5966,0.5979
2,0.7619,0.8363,0.7479,0.7605,0.761,0.4992,0.4995
3,0.8065,0.8043,0.7859,0.8068,0.803,0.5871,0.5928
4,0.7903,0.7429,0.7522,0.7955,0.7813,0.533,0.5507
5,0.8387,0.9189,0.8147,0.8398,0.8354,0.6493,0.6558
6,0.8226,0.9024,0.7939,0.8249,0.8177,0.6112,0.6209
7,0.9032,0.9441,0.8904,0.9035,0.9024,0.7929,0.7948
8,0.8065,0.8377,0.7961,0.8065,0.8065,0.5921,0.5921
9,0.8548,0.8805,0.8432,0.8541,0.8542,0.6917,0.6921


In [9]:
# Finalize the tuned model for inference. Per the PyCaret docs this refits it on the
# whole dataset given to setup(), hold-out split included — confirm for the installed version.
best_model_final = finalize_model(best_model_tune)

# Make predictions

In [10]:
# Score the test frame with the finalized model. Despite its name, y_pred is a
# DataFrame: the feature columns plus PyCaret's "Label" (predicted class) and "Score".
y_pred = predict_model(best_model_final, df_test)

In [11]:
# Preview the scored frame.
y_pred

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Label,Score
0,3,male,34.5,0,0,7.8292,Q,0,0.9383
1,3,female,47.0,1,0,7.0000,S,0,0.7155
2,2,male,62.0,0,0,9.6875,Q,0,0.8711
3,3,male,27.0,0,0,8.6625,S,0,0.8379
4,3,female,22.0,1,1,12.2875,S,0,0.5682
...,...,...,...,...,...,...,...,...,...
413,3,male,,0,0,8.0500,S,0,0.8838
414,1,female,39.0,0,0,108.9000,C,1,0.9608
415,3,male,38.5,0,0,7.2500,S,0,0.9396
416,3,male,,0,0,8.0500,S,0,0.8838


# Prepare for Kaggle submission 

https://www.kaggle.com/c/titanic/overview

In [13]:
# Re-attach the passenger IDs, which setup() was told to ignore. The assignment
# aligns on the row index, so it relies on y_pred sharing df_test's index.
y_pred["PassengerId"] = df_test["PassengerId"]
# Fail loudly if alignment left any row without an ID, rather than writing
# NaN IDs into the submission file.
assert y_pred["PassengerId"].notna().all(), "index mismatch between y_pred and df_test"

In [15]:
# Keep only the two columns that go into the submission: the ID and the predicted class.
result_df = y_pred.loc[:, ["PassengerId", "Label"]]

In [16]:
# Rename "Label" to "Survived" for the submission file. This is a positional
# rename: it relies on result_df holding exactly these two columns in this order.
result_df.columns = ["PassengerId", "Survived"]

In [18]:
# Write the submission CSV without the DataFrame index column.
submission_path = Path("output/titanic_v1.csv")
# to_csv() does not create missing directories, so make sure output/ exists
# first; otherwise a fresh checkout fails on this cell.
submission_path.parent.mkdir(parents=True, exist_ok=True)
result_df.to_csv(submission_path, index=False)

Submit output/titanic_v1.csv to https://www.kaggle.com/c/titanic/overview