# Assignment 1 - Hands on - Titanic Prediction: Fine Tune your Decision Tree Model

Zhang Yichen 20328707

## Import necessary libraries and define help functions

In [538]:
import re
import math
import numpy as np
import pandas as pd

from collections import Counter
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": -1}
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Other": -1}

def countCabinSharedAmount(name, cabinList):
    if (str(name)!='nan'):
        row=cabinList.loc[cabinList['Cabin'] == name]
        count=row['Count']-1
        return count
    else:
        return -1

## Data preparation

In this step, we read training data from source files, clean the training dataset and generate new features if needs. As information shown in train.info(), the columns 'Age', 'Embarked' and 'Cabin' has null values. Hence, we replaced the corresponding data cell with the mean of 'Age' and the mode of 'Embarked' respectively. Though 'Cabin' has too many missing values and we simply dropped it before in labs, it looks like there is still some hidden information inside.

### Explanation of New Feature
**CabinSharedAmount and Deck:** Though 'Cabin' has too many missing values and we simply dropped it before in labs, it looks like there is still some hidden information inside. **CabinSharedAmount** is used to count if there is any passenger shared his/her cabin with others (same cabin number). This is reasonable as if they have shared cabin they might help with each other during evacuation. **Deck** refers to different floors in the Titanic. As shown in the picture, there were 7 different decks A-G inside the Titanic from top to bottom. If a passenger lived higher in the Titanic, generally speaking, he/she would reach the top deck easier to get on lifeboat or get lifeguards.

![image.png](attachment:image.png)

**Title:** Another feature we dropped in lab is the **Name**. Though the whole name is unique to every passengers and looks like not related to survival rate, the title inside the name is a intereting point. After extracting from the **Name**, the top 4 occurence among all titles are Mr, Miss, Mrs and Master. Others are all labelled as 'Other' due to the little occurence. As shown in the correlation table, the **Title** has a moderate correlation with **Survival**. It might be a good feature.

**FarePerPerson:** The original feature **Fare** seems a bit noisy as if a person spent a lot, it might be due to he is rich or he had many relatives, the amount of which is calculated by the amount of SibSp plus the amount of Parch. Hence, in order to eliminate the possible correlation between Fare and the amount of relateives, FarePerPerson is calculated.

In [596]:
train = pd.read_csv('./data/train.csv',index_col='PassengerId');

print("Dataset information before cleaning (train): ")
train.info()

# Replace the missing values with mean or mode
train['Age'].fillna(train['Age'].mean(),inplace = True)
train['Embarked'].fillna(train['Embarked'].value_counts().index[0],inplace= True)

# Get dummy variable for categorical feature 'Sex' and 'Embarked'
train = pd.get_dummies(train,columns = ['Sex', 'Embarked'])

# Find the distribution of people per cabin and get 'CabinSharedAmount'
cabinListTrain=train[train['Cabin'].notnull()==True].groupby('Cabin')['Name'].count()
cabinListTrain=cabinListTrain.reset_index()
cabinListTrain.columns=['Cabin','Count']
train['CabinSharedAmount'] = train['Cabin'].apply(lambda x: countCabinSharedAmount(x, cabinListTrain)).astype(int)

# Retrieve 'Deck' number from 'Cabin', should be the first character. 'Deck' will be marked as 'U0' if it is missing.
train['Cabin'] = train['Cabin'].fillna("U0")
train['Deck'] = train['Cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group())
train['Deck'] = train['Deck'].map(deck)
train['Deck'] = train['Deck'].fillna(-1)
train['Deck'] = train['Deck'].astype(int)

# Retrieve title from name. The regular expression is used to cut out the title with pattern 'xxx.'. Rarely appeared titles will be marked as 'Other'
train['Title'] = train.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
train['Title'] = train['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')
train['Title'] = train['Title'].replace('Mlle', 'Miss')
train['Title'] = train['Title'].replace('Ms', 'Miss')
train['Title'] = train['Title'].replace('Mme', 'Mrs')
train['Title'] = train['Title'].map(titles)
train['Title'] = train['Title'].fillna(-1)

# Calculate 'FarePerPerson' by 'Fare' devided by amount of relatives
train['FarePerPerson'] = train['Fare']/(train['SibSp'] + train['Parch'] +1)

train['AgeStage'] =  np.floor(train['Age']/10)

# Drop columns 'Name' and 'Cabin' as they are useless
train = train.drop(columns = 'Name')
train = train.drop(columns = 'Cabin')

print()
print("Dataset information after cleaning (test): ")
train.info()

Dataset information before cleaning (train): 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB

Dataset information after cleaning (test): 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Survived           891 non-n

Generate the **correlation table** to check correaltion among features. Irrelevant or partially relevant features can negatively impact model performance.

In [595]:
train.corr()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,CabinSharedAmount,Deck,Title,FarePerPerson,AgeStage
Survived,1.0,-0.338481,-0.069809,-0.035322,0.081629,0.257307,0.543351,-0.543351,0.16824,0.00365,-0.149683,0.30487,0.305989,0.44205,0.2216,-0.050972
Pclass,-0.338481,1.0,-0.331339,0.083081,0.018443,-0.5495,-0.1319,0.1319,-0.243292,0.221009,0.074053,-0.581854,-0.611216,-0.000986,-0.485079,-0.33961
Age,-0.069809,-0.331339,1.0,-0.232625,-0.179191,0.091566,-0.084153,0.084153,0.032024,-0.013855,-0.019336,0.110734,0.181301,-0.288031,0.141649,0.968798
SibSp,-0.035322,0.083081,-0.232625,1.0,0.414838,0.159651,0.114631,-0.114631,-0.059528,-0.026354,0.068734,0.016213,-0.032207,0.309763,-0.094682,-0.226973
Parch,0.081629,0.018443,-0.179191,0.414838,1.0,0.216225,0.245489,-0.245489,-0.011069,-0.081228,0.060814,0.125825,0.040943,0.391307,-0.068978,-0.158376
Fare,0.257307,-0.5495,0.091566,0.159651,0.216225,1.0,0.182333,-0.182333,0.269335,-0.117216,-0.162184,0.466034,0.377299,0.127972,0.840995,0.099775
Sex_female,0.543351,-0.1319,-0.084153,0.114631,0.245489,0.182333,1.0,-1.0,0.082853,0.074115,-0.119224,0.17517,0.150517,0.604275,0.115143,-0.074794
Sex_male,-0.543351,0.1319,0.084153,-0.114631,-0.245489,-0.182333,-1.0,1.0,-0.082853,-0.074115,0.119224,-0.17517,-0.150517,-0.604275,-0.115143,0.074794
Embarked_C,0.16824,-0.243292,0.032024,-0.059528,-0.011069,0.269335,0.082853,-0.082853,1.0,-0.148258,-0.782742,0.107769,0.144757,0.031324,0.271215,0.030254
Embarked_Q,0.00365,0.221009,-0.013855,-0.026354,-0.081228,-0.117216,0.074115,-0.074115,-0.148258,1.0,-0.499421,-0.106815,-0.113566,0.011626,-0.096038,-0.063081


Read testing data from source files and clean the testing dataset. Do the same preparation job as training set.

In [591]:
test = pd.read_csv('./data/test.csv',index_col='PassengerId')

print("Dataset information before cleaning (test): ")
test.info()

test['Age'].fillna(test['Age'].mean(),inplace = True)
test['Embarked'].fillna(test['Embarked'].value_counts().index[0],inplace= True)
test['Fare'].fillna(test['Fare'].mean(),inplace = True)

test = pd.get_dummies(test,columns = ['Sex', 'Embarked'])

cabinListTest=test[test['Cabin'].notnull()==True].groupby('Cabin')['Name'].count()
cabinListTest=cabinListTest.reset_index()
cabinListTest.columns=['Cabin','Count']
test['CabinSharedAmount'] = test['Cabin'].apply(lambda x: countCabinSharedAmount(x, cabinListTest)).astype(int)

test['Cabin'] = test['Cabin'].fillna("U0")
test['Deck'] = test['Cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group())
test['Deck'] = test['Deck'].map(deck)
test['Deck'] = test['Deck'].fillna(-1)
test['Deck'] = test['Deck'].astype(int)

test['Title'] = test.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
test['Title'] = test['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')
test['Title'] = test['Title'].replace('Mlle', 'Miss')
test['Title'] = test['Title'].replace('Ms', 'Miss')
test['Title'] = test['Title'].replace('Mme', 'Mrs')
test['Title'] = test['Title'].map(titles)
test['Title'] = test['Title'].fillna(-1)

test['FarePerPerson'] = test['Fare']/(test['SibSp'] + test['Parch'] + 1)

test['AgeStage'] =  np.floor(test['Age']/10)

test = test.drop(columns = 'Name')
test = test.drop(columns = 'Cabin')

print()
print("Dataset information after cleaning (test): ")
test.info()

Dataset information before cleaning (test): 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      417 non-null    float64
 8   Cabin     91 non-null     object 
 9   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB

Dataset information after cleaning (test): 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Pclass             418 non-null    int64  
 1   Age           

In [592]:
train.head(5)

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,CabinSharedAmount,Deck,Title,FarePerPerson
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0,3,22.0,1,0,A/5 21171,7.25,0,1,0,0,1,-1,-1,1,3.625
2,1,1,38.0,1,0,PC 17599,71.2833,1,0,1,0,0,0,3,3,35.64165
3,1,3,26.0,0,0,STON/O2. 3101282,7.925,1,0,0,0,1,-1,-1,2,7.925
4,1,1,35.0,1,0,113803,53.1,1,0,0,0,1,1,3,3,26.55
5,0,3,35.0,0,0,373450,8.05,0,1,0,0,1,-1,-1,1,8.05


In [593]:
test.head(5)

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,CabinSharedAmount,Deck,Title,FarePerPerson,AgeStage
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
892,3,34.5,0,0,330911,7.8292,0,1,0,1,0,-1,-1,1,7.8292,3.0
893,3,47.0,1,0,363272,7.0,1,0,0,0,1,-1,-1,3,3.5,4.0
894,2,62.0,0,0,240276,9.6875,0,1,0,1,0,-1,-1,1,9.6875,6.0
895,3,27.0,0,0,315154,8.6625,0,1,0,0,1,-1,-1,1,8.6625,2.0
896,3,22.0,1,1,3101298,12.2875,1,0,0,0,1,-1,-1,3,4.095833,2.0


In [599]:
features = ['Pclass', 'Sex_male', 'SibSp', 'Parch', 'Fare', 'Embarked_S', 'Embarked_C', 'CabinSharedAmount', 'Deck', 'Title', 'FarePerPerson']
target = ['Survived']

X = train[features]
y = train[target]

# Use a simple model to approximately check the accuracy
model = DecisionTreeClassifier(random_state=0)
model.fit(X,y)
score_cv = cross_val_score(model, X, y, cv=10)
score_cv.mean()

0.7912609238451935

In [600]:
depths = np.arange(1, 25)
num_leafs = np.arange(5, 200, 10)
min_samples_split = np.arange(10, 400, 20)
criterion = ("gini", "entropy")

param_grid = [{
    'max_depth': depths,
    'max_leaf_nodes': num_leafs,
    'min_samples_split': min_samples_split,
    'criterion': criterion,
}]

# Due to the large amount of combinations, the calculation is quite long. 2mins on i7-4770K with all 8 cores involved
model_gridSearchCV = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, n_jobs=-1,cv=10)
model_gridSearchCV.fit(X,y)

GridSearchCV(cv=10, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='d...
                          'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 

In [601]:
model_gridSearchCV.best_score_

0.8294631710362047

In [602]:
model_gridSearchCV.best_params_

{'criterion': 'gini',
 'max_depth': 9,
 'max_leaf_nodes': 95,
 'min_samples_split': 10}

Apply the best parameters got from grid search

In [603]:
model_gridSearchCVBest = DecisionTreeClassifier(criterion = 'gini', max_leaf_nodes = 35, max_depth = 9, min_samples_split = 10)
model_gridSearchCVBest.fit(X,y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=9, max_features=None, max_leaf_nodes=35,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [604]:
X_test = test[features]
pred_test = model_gridSearchCVBest.predict(X_test)

## Output the result as csv

In [605]:
df = pd.DataFrame(pred_test, columns=['Survived'])
df.insert(0,'PassengerId',np.array(test.index))
df.to_csv('result.csv', index=False)
df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
