Importing modules

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold,GridSearchCV
from sklearn.preprocessing import PowerTransformer,OneHotEncoder
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.compose import ColumnTransformer, make_column_transformer
from xgboost import XGBClassifier

Reading data into dataframe

In [2]:
train_data=pd.read_csv(r"titanic_train.csv")
test_data=pd.read_csv(r"titanic_test.csv")

View head of dataframes

In [3]:
train_data.head()
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


Summary of training and testing data

In [4]:
train_data.info()
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float

In [5]:
train_data.describe()
test_data.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


PassengerID, Ticket and Cabin are removed as they are not expected to have any relevance to the survival target variable. Cabin also has a significant number of missing values.

In [6]:
train_data.drop(["PassengerId","Ticket","Cabin","Name"],axis=1,inplace=True)
test_data.drop(["Ticket","Cabin","Name"],axis=1,inplace=True)
test_PassengerId = pd.DataFrame(test_data.pop("PassengerId"))
test_data.head()
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


Checking total number of missing values in each column of the data

In [7]:
train_data.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

Replacing the two missing values in the Embarked column with the modal values 

In [8]:
train_data.Embarked.fillna(train_data.Embarked.mode()[0],inplace=True)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


Separating training dataset into train and test data, pipeline for imputing missing numberical features, transforming numerical features and fitting using RandomForestClassifier.

In [9]:
Xtr=train_data.drop("Survived",axis=1)
ytr=train_data.Survived

Xtr_train,Xtr_test,ytr_train,ytr_test=train_test_split(Xtr,ytr,test_size=0.3,random_state=1)


preprocess = make_column_transformer(
    (["Age","Fare"], make_pipeline(SimpleImputer(), PowerTransformer())),
    (["Sex","Embarked","Pclass"], OneHotEncoder()))

rfc_pipeline=make_pipeline(preprocess,RandomForestClassifier())


rfc_param_grid={"columntransformer__pipeline__simpleimputer__strategy":["mean","median","most_frequent"],'randomforestclassifier__n_estimators': [450,750],"randomforestclassifier__min_samples_leaf":[2,5,10,50],"randomforestclassifier__criterion":["gini","entropy"],"randomforestclassifier__max_depth":[4,8,16],"randomforestclassifier__max_features":['auto', 'sqrt', 'log2']}

rfc_grid=GridSearchCV(rfc_pipeline,cv=5,param_grid=rfc_param_grid,n_jobs=-1, verbose=1,scoring="accuracy")
rfc_grid.fit(Xtr_train,ytr_train)

print("Best RFC: {} using {}".format(rfc_grid.best_score_,rfc_grid.best_params_))


Fitting 5 folds for each of 432 candidates, totalling 2160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:   58.2s
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed:  2.7min finished


Best RFC: 0.8459069020866774 using {'columntransformer__pipeline__simpleimputer__strategy': 'mean', 'randomforestclassifier__criterion': 'entropy', 'randomforestclassifier__max_depth': 4, 'randomforestclassifier__max_features': 'auto', 'randomforestclassifier__min_samples_leaf': 2, 'randomforestclassifier__n_estimators': 450}


Using XGBClassifier

In [10]:

xgbc_param_grid={"columntransformer__pipeline__simpleimputer__strategy":["mean","median","most_frequent"],'xgbclassifier__n_estimators': [100,200,500,750],
    'xgbclassifier__max_depth': [3,5,7,9],
    'xgbclassifier__subsample':[0.6,0.8,1],
    'xgbclassifier__colsample_bytree':[0.6,0.8,1],
    'xgbclassifier__learning_rate': [0.01, 0.02, 0.05, 0.1]}
xgbc_pipeline=make_pipeline(preprocess,XGBClassifier())
xgbc_grid=GridSearchCV(xgbc_pipeline,cv=5,param_grid=xgbc_param_grid,n_jobs=-1, verbose=1,scoring="accuracy")
xgbc_grid.fit(Xtr_train,ytr_train)

print("Best XGBC: {} using {}".format(xgbc_grid.best_score_,xgbc_grid.best_params_))


Fitting 5 folds for each of 1728 candidates, totalling 8640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 828 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done 1492 tasks      | elapsed:   41.6s
[Parallel(n_jobs=-1)]: Done 1942 tasks      | elapsed:   54.6s
[Parallel(n_jobs=-1)]: Done 2492 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 3142 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 3892 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 4742 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 5692 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 6742 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 7892 tasks      | elapsed:  3.9min


Best XGBC: 0.8443017656500803 using {'columntransformer__pipeline__simpleimputer__strategy': 'mean', 'xgbclassifier__colsample_bytree': 0.6, 'xgbclassifier__learning_rate': 0.01, 'xgbclassifier__max_depth': 3, 'xgbclassifier__n_estimators': 200, 'xgbclassifier__subsample': 1}


[Parallel(n_jobs=-1)]: Done 8640 out of 8640 | elapsed:  4.3min finished


Using Logistic Regression

In [11]:

logreg_param_grid={"columntransformer__pipeline__simpleimputer__strategy":["mean","median","most_frequent"],'logisticregression__C': [0.1, 1.0, 1.0],'logisticregression__penalty': ["l1","l2"]}
logreg_pipeline=make_pipeline(preprocess,LogisticRegression())
logreg_grid=GridSearchCV(logreg_pipeline,cv=5,param_grid=logreg_param_grid,n_jobs=-1, verbose=1,scoring="accuracy")
logreg_grid.fit(Xtr_train,ytr_train)

print("Best LogReg: {} using {}".format(logreg_grid.best_score_,logreg_grid.best_params_))


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best LogReg: 0.8154093097913323 using {'columntransformer__pipeline__simpleimputer__strategy': 'mean', 'logisticregression__C': 1.0, 'logisticregression__penalty': 'l1'}


[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    0.4s finished


Using AdaBoostClassifier

In [40]:
ABC_param_grid={"columntransformer__pipeline__simpleimputer__strategy":["mean","median","most_frequent"],
               "adaboostclassifier__learning_rate":[0.4,0.6,1,1.5],
               "adaboostclassifier__n_estimators":[50,120,400],
               "adaboostclassifier__base_estimator":[DecisionTreeClassifier(), RandomForestClassifier(), LogisticRegression()]}

ABC_pipeline=make_pipeline(preprocess,AdaBoostClassifier())
ABC_grid=GridSearchCV(ABC_pipeline,cv=5,param_grid=ABC_param_grid,n_jobs=-1, verbose=1,scoring="accuracy")
ABC_grid.fit(Xtr_train,ytr_train)

print("Best AdaBoostClassifier: {} using {}".format(ABC_grid.best_score_,ABC_grid.best_params_))


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:  1.1min finished


Best AdaBoostClassifier: 0.8138041733547352 using {'adaboostclassifier__base_estimator': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), 'adaboostclassifier__learning_rate': 0.4, 'adaboostclassifier__n_estimators': 400, 'columntransformer__pipeline__simpleimputer__strategy': 'most_frequent'}


transforming the test data then using the results of the XGBoost Classifier to make predictions

In [None]:
test_PassengerId["Survived"] = xgbc_grid.predict(test_data)
test_PassengerId.to_csv("TitanicSubmission.csv",index=False)
