# Machine Learning on the Titanic Dataset

This notebook focuses on one of [Kaggle's](https://www.kaggle.com/c/titanic) "Getting Started" prediction competitions, the Titanic challenge.

In [1]:
# classifier models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# modules to handle data
import pandas as pd
import numpy as np

# visualization tools
import matplotlib.pyplot as plt
import seaborn as sns

# ignore warnings
# NOTE: the 'ignore' filter is installed without a category or module
# argument, so it is not limited to any particular kind of warning; every
# warning raised after this cell runs is hidden from the outputs.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# environment setup
# Plot styling: seaborn's defaults are applied first, then matplotlib's
# 'ggplot' style sheet is applied after them, so the call order matters.
sns.set()
plt.style.use('ggplot')
# IPython magic that selects the "notebook" matplotlib backend.
# NOTE(review): presumably this needs a front end that supports that
# backend; confirm it is available where the notebook is run.
%matplotlib notebook

## I. Data Wrangling

As in most data science projects, we are likely to encounter dirty or missing data and will need to do some wrangling before we can do anything else.

In [3]:
# read the training and test CSV files from the local Data directory,
# in that order, and bind them to `train` and `test`
train, test = map(pd.read_csv, ('./Data/train.csv', './Data/test.csv'))

In [4]:
# Star import of PyCaret's classification module into the notebook namespace.
# The cells in this notebook call setup, compare_models, create_model,
# tune_model, predict_model, finalize_model and plot_model; none of them is
# defined in the notebook itself, so they are expected to come from here.
from pycaret.classification import *

In [6]:
# Initialise the PyCaret experiment on the training frame.
#   target               - 'Survived' is the column to predict
#   ignore_features      - 'PassengerId' and 'Name' are excluded from modelling
#   bin_numeric_features - 'Age' and 'Fare' are discretised into bins
#   session_id           - fixed seed for the experiment. Without it a new
#                          random id is drawn on every run (the recorded run
#                          reported 3230), so the split and every table
#                          produced afterwards change between runs.
d = setup(
    data=train,
    target='Survived',
    ignore_features=['PassengerId', 'Name'],
    bin_numeric_features=['Age', 'Fare'],
    session_id=3230,
)

 
Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,3230
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(891, 12)"
4,Missing Values,True
5,Numeric Features,3
6,Categorical Features,8
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


In [7]:
# Compare the candidate classifiers; the resulting table of per-model
# metrics (Accuracy, AUC, Recall, Prec., F1, Kappa) is the cell output.
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Ridge Classifier,0.8315,0.0,0.7531,0.8101,0.7736,0.6404
1,SVM - Linear Kernel,0.8251,0.0,0.7696,0.7972,0.7689,0.6298
2,Logistic Regression,0.8235,0.8742,0.7362,0.7997,0.7598,0.6214
3,Ada Boost Classifier,0.8154,0.8691,0.7069,0.7952,0.7418,0.5999
4,Gradient Boosting Classifier,0.8154,0.8625,0.6527,0.8372,0.7262,0.5916
5,Extreme Gradient Boosting,0.8122,0.8605,0.6736,0.8149,0.7298,0.589
6,CatBoost Classifier,0.8106,0.8683,0.6442,0.8311,0.7204,0.5814
7,Decision Tree Classifier,0.8075,0.7882,0.7114,0.7737,0.7377,0.5866
8,Light Gradient Boosting Machine,0.8043,0.8561,0.6824,0.7833,0.7251,0.5751
9,Extra Trees Classifier,0.7979,0.8611,0.7029,0.7536,0.7235,0.5654


In [8]:
# Build a Ridge classifier ('ridge' is the estimator id passed to PyCaret).
# The output table has ten metric rows (0-9), presumably one per
# cross-validation fold.
# NOTE(review): AUC is reported as 0.0 in every row, presumably because this
# estimator provides no probability estimates; confirm before reading AUC.
rd=create_model('ridge')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.7937,0.0,0.5833,0.8235,0.6829,0.5365
1,0.8571,0.0,0.75,0.8571,0.8,0.6897
2,0.8413,0.0,0.8333,0.7692,0.8,0.6688
3,0.9355,0.0,0.875,0.9545,0.913,0.8619
4,0.7419,0.0,0.625,0.6818,0.6522,0.4477
5,0.9194,0.0,0.9167,0.88,0.898,0.8313
6,0.8387,0.0,0.625,0.9375,0.75,0.6379
7,0.7258,0.0,0.8333,0.6061,0.7018,0.4595
8,0.8387,0.0,0.75,0.8182,0.7826,0.6548
9,0.8226,0.0,0.7391,0.7727,0.7556,0.6164


In [9]:
# Hyper-parameter search for the Ridge classifier; the tuned model is kept
# in `tuned_rd` and used by the evaluation and finalisation cells.
# NOTE(review): the estimator id string is passed here, not the `rd` object
# created in the previous cell, so `rd` itself is never used afterwards.
# Whether tune_model expects an id string or a trained model object depends
# on the installed PyCaret version; confirm before upgrading.
tuned_rd=tune_model('ridge')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.7302,0.0,0.5,0.7059,0.5854,0.3939
1,0.8571,0.0,0.8333,0.8,0.8163,0.6995
2,0.8571,0.0,0.7917,0.8261,0.8085,0.6947
3,0.9355,0.0,0.9167,0.9167,0.9167,0.864
4,0.7581,0.0,0.625,0.7143,0.6667,0.4781
5,0.9194,0.0,0.9167,0.88,0.898,0.8313
6,0.8226,0.0,0.625,0.8824,0.7317,0.6049
7,0.7742,0.0,0.875,0.6562,0.75,0.5517
8,0.8226,0.0,0.7083,0.8095,0.7556,0.6173
9,0.8387,0.0,0.7826,0.7826,0.7826,0.6544


In [10]:
# Score the tuned model without passing any data. The output frame has 267
# rows (index 0-266), presumably the hold-out split that setup() kept back
# from the 891 training rows. 'Survived' is the true value and 'Label' the
# prediction.
predict_model(tuned_rd)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Ridge Classifier,0.8246,0,0.7961,0.7593,0.7773,0.6328


Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Sex_female,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,...,Fare_2.0,Fare_3.0,Fare_4.0,Fare_5.0,Fare_6.0,Fare_7.0,Fare_8.0,Fare_9.0,Survived,Label
0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1
2,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,1
3,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,1
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,1
264,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0
265,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
266,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0


In [11]:
# Produce the final model from the tuned one; `final_rd` is what the
# submission predictions are generated with.
# NOTE(review): presumably finalize_model refits on the complete dataset,
# hold-out rows included; confirm against the PyCaret documentation.
final_rd = finalize_model(tuned_rd)

In [12]:
# Same evaluation call as for the tuned model, now with the finalized model.
# NOTE(review): accuracy rises from 0.8246 (tuned_rd) to 0.9851 here. If the
# finalized model was refit on data that includes these rows, this score is
# in-sample and should not be read as an estimate of generalisation; confirm.
predict_model(final_rd)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Ridge Classifier,0.9851,0,0.9806,0.9806,0.9806,0.9685


Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Sex_female,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,...,Fare_2.0,Fare_3.0,Fare_4.0,Fare_5.0,Fare_6.0,Fare_7.0,Fare_8.0,Fare_9.0,Survived,Label
0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1
2,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,1
3,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,1
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,1
264,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0
265,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
266,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0


In [13]:
# Predict on the Kaggle test frame. The returned frame carries the original
# test columns plus a trailing 'Label' column with the predicted class.
unseen_predictions = predict_model(final_rd, data=test)
# preview the first rows only
unseen_predictions.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Label
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1


In [14]:
# keep the passenger identifiers of the test set; they become the id column
# of the submission
passengerId = test['PassengerId']

In [16]:
# rename the prediction column from 'Label' to 'Survived'
unseen_predictions = unseen_predictions.rename(columns={'Label': 'Survived'})

In [17]:
# display the prediction frame; the rendered output is truncated to the
# first and last rows
unseen_predictions

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,1
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,0


In [18]:
# Whole prediction frame as a NumPy array (all columns, all rows).
# NOTE(review): the frame mixes text and numeric columns, so the array is
# presumably object-dtype; confirm if numeric dtypes matter downstream.
k=unseen_predictions.values

In [21]:
# Take the predicted class by column name instead of slicing the last
# column out of an array of the whole frame. Selecting by name keeps the
# column's own dtype and still picks the right values if the prediction
# frame ever carries additional trailing columns.
pred = unseen_predictions['Survived'].values

In [22]:
# two-column submission frame: passenger id alongside the predicted label
kaggle = pd.DataFrame(
    data={'PassengerId': passengerId, 'Survived': pred},
)

In [23]:
# display the submission frame (PassengerId, Survived) as a final check
kaggle

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [25]:
# save to csv
# index=False leaves the pandas row index out of the file, so it contains
# only the PassengerId and Survived columns.
kaggle.to_csv('./Data/titanic_pycaret.csv', index=False)

In [30]:
# Draw the 'pr' plot (presumably the precision-recall curve) for the model.
# Note that the plot is made for tuned_rd, not for final_rd, which is the
# model the submission predictions came from.
plot_model(tuned_rd,plot='pr')