# Titanic: Machine Learning from Disaster
### Ting-Wei Wu, 10/24/2018

**This is the submission for predicting survivors in the titanic problem from Kaggle competition: ** **https://www.kaggle.com/c/titanic**



Reference from:

https://medium.com/@yehjames/%E8%B3%87%E6%96%99%E5%88%86%E6%9E%90-%E6%A9%9F%E5%99%A8%E5%AD%B8%E7%BF%92-%E7%AC%AC4-1%E8%AC%9B-kaggle%E7%AB%B6%E8%B3%BD-%E9%90%B5%E9%81%94%E5%B0%BC%E8%99%9F%E7%94%9F%E5%AD%98%E9%A0%90%E6%B8%AC-%E5%89%8D16-%E6%8E%92%E5%90%8D-a8842fea7077

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)

from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submit = pd.read_csv("gender_submission.csv")

In [3]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
test.insert(1,"Survived", None)

In [7]:
#Combine train and test
data = train.append(test)

In [8]:
data.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [9]:
data.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,3.0,80.0,8.0,9.0,512.3292


In [10]:
data['Family_Size'] = data['Parch'] + data['SibSp']

# Feature Engineering

## **1. Title**
Make title a class

In [11]:
data.insert(3, "Title", data["Name"].str.split(',', expand=True)[1])
data["Title"] = data["Title"].str.split('.', expand=True)[0]

In [12]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Title,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_Size
0,1,0,3,Mr,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,Mrs,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,Miss,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,Mrs,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,Mr,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [13]:
pd.crosstab(data["Title"],data["Sex"]).T

Title,Capt,Col,Don,Dona,Dr,Jonkheer,Lady,Major,Master,Miss,Mlle,Mme,Mr,Mrs,Ms,Rev,Sir,the Countess
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
female,0,0,0,1,1,0,1,0,0,260,2,1,0,197,2,0,0,1
male,1,4,1,0,7,1,0,2,61,0,0,0,757,0,0,8,1,0


In [14]:
scarce_title_Mr = ['Capt', 'Col', 'Don', 'Dr', 'Jonkheer', 'Major', 'Master', 'Rev', 'Sir']
scarce_title_Miss = ['Mlle', 'Ms']
scarce_title_Mrs = ['Lady', 'Dona', 'Mme', 'the Countess']

data["Title"] = data["Title"].replace(scarce_title_Mr, ['Mr']*len(scarce_title_Mr))
data["Title"] = data["Title"].replace(scarce_title_Miss, ['Miss']*len(scarce_title_Miss))
data["Title"] = data["Title"].replace(scarce_title_Mrs, ['Mrs']*len(scarce_title_Mrs))

In [15]:
data.head(100)

Unnamed: 0,PassengerId,Survived,Pclass,Title,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_Size
0,1,0,3,Mr,"Braund, Mr. Owen Harris",male,22.00,1,0,A/5 21171,7.2500,,S,1
1,2,1,1,Mrs,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.00,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,Miss,"Heikkinen, Miss. Laina",female,26.00,0,0,STON/O2. 3101282,7.9250,,S,0
3,4,1,1,Mrs,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.00,1,0,113803,53.1000,C123,S,1
4,5,0,3,Mr,"Allen, Mr. William Henry",male,35.00,0,0,373450,8.0500,,S,0
5,6,0,3,Mr,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,0
6,7,0,1,Mr,"McCarthy, Mr. Timothy J",male,54.00,0,0,17463,51.8625,E46,S,0
7,8,0,3,Master,"Palsson, Master. Gosta Leonard",male,2.00,3,1,349909,21.0750,,S,4
8,9,1,3,Mrs,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.00,0,2,347742,11.1333,,S,2
9,10,1,2,Mrs,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.00,1,0,237736,30.0708,,C,1


## **2. Ticket**

Strip to good shape

In [16]:
data["Ticket"] = data["Ticket"].apply(lambda x: x.replace('.','').replace('/','').strip().split(' ')[0] if not x.isdigit() else 'X')

In [17]:
data["Ticket"].unique()

array(['A5', 'PC', 'STONO2', 'X', 'PP', 'CA', 'SCParis', 'SCA4', 'A4',
       'SP', 'SOC', 'WC', 'SOTONOQ', 'WEP', 'STONO', 'C', 'SCPARIS',
       'SOP', 'Fa', 'LINE', 'FCC', 'SWPP', 'SCOW', 'PPP', 'SC', 'SCAH',
       'AS', 'SOPP', 'FC', 'SOTONO2', 'CASOTON', 'SCA3', 'STONOQ', 'AQ4',
       'A', 'LP', 'AQ3'], dtype=object)

## **3.Embarked, Fare, Age**
Deal with missing value

In [18]:
data["Embarked"] = data["Embarked"].fillna('S')
data["Fare"] = data["Fare"].fillna(data["Fare"].mean())
data["Cabin"] = data["Cabin"].apply(lambda x: str(x)[0] if not pd.isnull(x) else "missing")

In [19]:
data.head(100)

Unnamed: 0,PassengerId,Survived,Pclass,Title,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_Size
0,1,0,3,Mr,"Braund, Mr. Owen Harris",male,22.00,1,0,A5,7.2500,missing,S,1
1,2,1,1,Mrs,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.00,1,0,PC,71.2833,C,C,1
2,3,1,3,Miss,"Heikkinen, Miss. Laina",female,26.00,0,0,STONO2,7.9250,missing,S,0
3,4,1,1,Mrs,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.00,1,0,X,53.1000,C,S,1
4,5,0,3,Mr,"Allen, Mr. William Henry",male,35.00,0,0,X,8.0500,missing,S,0
5,6,0,3,Mr,"Moran, Mr. James",male,,0,0,X,8.4583,missing,Q,0
6,7,0,1,Mr,"McCarthy, Mr. Timothy J",male,54.00,0,0,X,51.8625,E,S,0
7,8,0,3,Master,"Palsson, Master. Gosta Leonard",male,2.00,3,1,X,21.0750,missing,S,4
8,9,1,3,Mrs,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.00,0,2,X,11.1333,missing,S,2
9,10,1,2,Mrs,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.00,1,0,X,30.0708,missing,C,1


# Random Forest

In [20]:
data['Sex'] = data['Sex'].astype('category').cat.codes
data['Embarked'] = data['Embarked'].astype('category').cat.codes
data['Pclass'] = data['Pclass'].astype('category').cat.codes
data['Title'] = data['Title'].astype('category').cat.codes
data['Cabin'] = data['Cabin'].astype('category').cat.codes
data['Ticket'] = data['Ticket'].astype('category').cat.codes

In [21]:
data.head(6)

Unnamed: 0,PassengerId,Survived,Pclass,Title,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_Size
0,1,0,2,12,"Braund, Mr. Owen Harris",1,22.0,1,0,2,7.25,8,2,1
1,2,1,0,13,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,14,71.2833,2,0,1
2,3,1,2,9,"Heikkinen, Miss. Laina",0,26.0,0,0,31,7.925,8,2,0
3,4,1,0,13,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,36,53.1,2,2,1
4,5,0,2,12,"Allen, Mr. William Henry",1,35.0,0,0,36,8.05,8,2,0
5,6,0,2,12,"Moran, Mr. James",1,,0,0,36,8.4583,8,1,0


In [23]:
#Calculate the missing age by random forest depending on other features
dataAgeNull = data[data["Age"].isnull()]
dataAgeNotNull = data[data["Age"].notnull()]
remove_outlier = dataAgeNotNull[(np.abs(dataAgeNotNull["Fare"]-dataAgeNotNull["Fare"].mean())>(4*dataAgeNotNull["Fare"].std()))|
                      (np.abs(dataAgeNotNull["Family_Size"]-dataAgeNotNull["Family_Size"].mean())>(4*dataAgeNotNull["Family_Size"].std()))                     
                     ]
rfModel_age = RandomForestRegressor(n_estimators=2000,random_state=42)
ageColumns = ['Embarked', 'Fare', 'Pclass', 'Sex', 'Family_Size', 'Title','Cabin','Ticket']
rfModel_age.fit(remove_outlier[ageColumns], remove_outlier["Age"])

ageNullValues = rfModel_age.predict(X= dataAgeNull[ageColumns])
dataAgeNull.loc[:,"Age"] = ageNullValues
data = dataAgeNull.append(dataAgeNotNull)
data.reset_index(inplace=True, drop=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [24]:
dataTrain = data[pd.notnull(data['Survived'])].sort_values(by=["PassengerId"])
dataTest = data[~pd.notnull(data['Survived'])].sort_values(by=["PassengerId"])

In [25]:
dataTrain = dataTrain[['Survived', 'Age', 'Embarked', 'Fare', 'Pclass', 'Sex', 'Family_Size', 'Title', 'Ticket', 'Cabin']]
dataTest = dataTest[['Age', 'Embarked', 'Fare', 'Pclass', 'Sex', 'Family_Size', 'Title', 'Ticket', 'Cabin']]

In [26]:
dataTrain

Unnamed: 0,Survived,Age,Embarked,Fare,Pclass,Sex,Family_Size,Title,Ticket,Cabin
263,0,22.000000,2,7.2500,2,1,1,12,2,8
264,1,38.000000,0,71.2833,0,0,1,13,14,2
265,1,26.000000,2,7.9250,2,0,0,9,31,8
266,1,35.000000,2,53.1000,0,0,1,13,36,2
267,0,35.000000,2,8.0500,2,1,0,12,36,8
0,0,41.188763,1,8.4583,2,1,0,12,36,8
268,0,54.000000,2,51.8625,0,1,0,12,36,4
269,0,2.000000,2,21.0750,2,1,4,8,36,8
270,1,27.000000,2,11.1333,2,0,2,13,36,8
271,1,14.000000,0,30.0708,1,0,1,13,36,8


In [29]:
from sklearn.ensemble import RandomForestClassifier
 
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=1000,
                             min_samples_split=12,
                             min_samples_leaf=1,
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1) 

y = dataTrain.iloc[:, 0]
y = y.astype('int')
rf.fit(dataTrain.iloc[:, 1:], y)
print("%.4f" % rf.oob_score_)

0.8283


In [30]:
submit['Survived'] = rf.predict(dataTest).astype(int)
submit.to_csv('submission.csv', index=False)

In [31]:
submit

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,0
7,899,0
8,900,1
9,901,0
