In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')
combine = [train_df, test_df]

In [3]:
## Add Title
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

pd.crosstab(train_df['Title'], train_df['Survived'])

Survived,0,1
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,1,0
Col,1,1
Countess,0,1
Don,1,0
Dr,4,3
Jonkheer,1,0
Lady,0,1
Major,1,1
Master,17,23
Miss,55,127


In [4]:
## Add Title
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Sir', 'Major', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

print(train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean())

title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5, "Rev": 6}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

    Title  Survived
0  Master  0.575000
1    Miss  0.702703
2      Mr  0.156673
3     Mrs  0.793651
4    Rare  0.470588
5     Rev  0.000000


In [5]:
## Add length of name
for dataset in combine:
    dataset['NameLength'] = (dataset.Name.str.len() / 15).astype(int)

pd.crosstab(train_df['NameLength'], train_df['Sex'])
pd.crosstab(test_df['NameLength'], test_df['Sex'])

Sex,female,male
NameLength,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,3
1,71,225
2,47,36
3,31,2
4,3,0


In [6]:
## Add Family Name and survival rate
for dataset in combine:
    dataset['FamilyName'] = dataset.Name.str.extract('(\w+),', expand=False).fillna("")

family_survial_rate = {}
for family_name in train_df.FamilyName.unique():
    family_survial_rate[family_name] = train_df[(train_df['FamilyName'] == family_name)]['Survived'].mean()

for dataset in combine:
    dataset['FamilySurvivedRate'] = 1
    for family_name in train_df.FamilyName.unique():
        if family_survial_rate.get(family_name) and family_survial_rate.get(family_name) < 0.5:
            dataset.loc[(dataset['FamilyName'] == family_name) & (dataset["SibSp"] + dataset["Parch"] > 0), 'FamilySurvivedRate'] = 0
        if family_survial_rate.get(family_name) and family_survial_rate.get(family_name) >= 0.5:
            dataset.loc[(dataset['FamilyName'] == family_name) & (dataset["SibSp"] + dataset["Parch"] > 0), 'FamilySurvivedRate'] = 2
    
pd.crosstab(train_df['FamilySurvivedRate'], train_df['Survived'])
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,NameLength,FamilyName,FamilySurvivedRate
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,1,1,Braund,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,3,3,Cumings,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,2,1,Heikkinen,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,3,2,Futrelle,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,1,1,Allen,1
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,1,1,Moran,1
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1,1,McCarthy,1
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S,4,2,Palsson,1
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,3,3,Johnson,2
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,3,2,Nasser,2


In [7]:
## Add Cabin type
for dataset in combine:
    dataset['CabinType'] = dataset.Cabin.str.extract('^(.)', expand=False).fillna('T').map( {'A': 1, 'G': 1, 'B': 2, 'D': 2, 'E': 2, 'C': 3, 'F': 3, 'T': 3} ).astype(int)

pd.crosstab(train_df['CabinType'], train_df['Sex'])
pd.crosstab(test_df['CabinType'], train_df['Sex'])

Sex,female,male
CabinType,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,4,4
2.0,10,30
3.0,146,224


In [8]:
## Convert Sex to int
for dataset in combine:
    dataset['SexType'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

pd.crosstab(train_df['SexType'], train_df['Survived'])

Survived,0,1
SexType,Unnamed: 1_level_1,Unnamed: 2_level_1
0,468,109
1,81,233


In [9]:
guess_ages = np.zeros((2,3))
guess_ages

for dataset in combine:
    for i in range(0, 2): # Sex
        for j in range(0, 3): # Pclass
            guess_df = dataset[(dataset['SexType'] == i) & (dataset['Pclass'] == j+1)]['Age'].dropna()

            age_guess = guess_df.median()

            # Convert random age float to nearest .5 age
            guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
            
    for i in range(0, 2):
        for j in range(0, 3):
            dataset.loc[ dataset.Age.isnull() & (dataset.SexType == i) & (dataset.Pclass == j + 1), 'Age'] = guess_ages[i,j]

    dataset['Age'] = dataset['Age'].astype(int)

train_df.Age

0      22
1      38
2      26
3      35
4      35
5      25
6      54
7       2
8      27
9      14
10      4
11     58
12     20
13     39
14     14
15     55
16      2
17     30
18     31
19     21
20     35
21     34
22     15
23     28
24      8
25     38
26     25
27     19
28     21
29     25
       ..
861    21
862    48
863    21
864    24
865    42
866    27
867    31
868    25
869     4
870    26
871    47
872    33
873    47
874    28
875    15
876    20
877    19
878    25
879    56
880    25
881    33
882    22
883    28
884    25
885    39
886    27
887    19
888    21
889    26
890    32
Name: Age, dtype: int64

In [10]:
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch']

train_df[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,FamilySize,Survived
3,3,0.724138
2,2,0.578431
1,1,0.552795
6,6,0.333333
0,0,0.303538
4,4,0.2
5,5,0.136364
7,7,0.0
8,10,0.0


In [11]:
## Add family features
for dataset in combine:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 0, 'IsAlone'] = 1

train_df[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()

Unnamed: 0,IsAlone,Survived
0,0,0.50565
1,1,0.303538


In [12]:
for dataset in combine:    
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age']
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked,Title,NameLength,FamilyName,FamilySurvivedRate,CabinType,SexType,FamilySize,IsAlone,manyFamily
0,1,0,3,"Braund, Mr. Owen Harris",male,1,1,0,A/5 21171,7.25,...,S,1,1,Braund,1,3,0,1,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,2,1,0,PC 17599,71.2833,...,C,3,3,Cumings,2,3,1,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,1,0,0,STON/O2. 3101282,7.925,...,S,2,1,Heikkinen,1,3,1,0,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,2,1,0,113803,53.1,...,S,3,2,Futrelle,2,3,1,1,0,0
4,5,0,3,"Allen, Mr. William Henry",male,2,0,0,373450,8.05,...,S,1,1,Allen,1,3,0,0,1,0


In [13]:
## Add Age*Class
for dataset in combine:
    dataset['Age*Class'] = dataset.Age * dataset.Pclass

train_df.loc[:, ['Age*Class', 'Age', 'Pclass']].head(10)

Unnamed: 0,Age*Class,Age,Pclass
0,3,1,3
1,2,2,1
2,3,1,3
3,2,2,1
4,6,2,3
5,3,1,3
6,3,3,1
7,0,0,3
8,3,1,3
9,0,0,2


In [14]:
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
    
train_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.339009


In [15]:
for dataset in combine:
    dataset['EmbarkedType'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,NameLength,FamilyName,FamilySurvivedRate,CabinType,SexType,FamilySize,IsAlone,manyFamily,Age*Class,EmbarkedType
0,1,0,3,"Braund, Mr. Owen Harris",male,1,1,0,A/5 21171,7.25,...,1,Braund,1,3,0,1,0,0,3,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,2,1,0,PC 17599,71.2833,...,3,Cumings,2,3,1,1,0,0,2,1
2,3,1,3,"Heikkinen, Miss. Laina",female,1,0,0,STON/O2. 3101282,7.925,...,1,Heikkinen,1,3,1,0,1,0,3,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,2,1,0,113803,53.1,...,2,Futrelle,2,3,1,1,0,0,2,0
4,5,0,3,"Allen, Mr. William Henry",male,2,0,0,373450,8.05,...,1,Allen,1,3,0,0,1,0,6,0


In [16]:
test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)

In [17]:
train_df['FareBand'] = pd.qcut(train_df['Fare'], 4)
train_df[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)

for dataset in combine:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

train_df = train_df.drop(['FareBand'], axis=1)
combine = [train_df, test_df]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,NameLength,FamilyName,FamilySurvivedRate,CabinType,SexType,FamilySize,IsAlone,manyFamily,Age*Class,EmbarkedType
0,1,0,3,"Braund, Mr. Owen Harris",male,1,1,0,A/5 21171,0,...,1,Braund,1,3,0,1,0,0,3,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,2,1,0,PC 17599,3,...,3,Cumings,2,3,1,1,0,0,2,1
2,3,1,3,"Heikkinen, Miss. Laina",female,1,0,0,STON/O2. 3101282,1,...,1,Heikkinen,1,3,1,0,1,0,3,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,2,1,0,113803,3,...,2,Futrelle,2,3,1,1,0,0,2,0
4,5,0,3,"Allen, Mr. William Henry",male,2,0,0,373450,1,...,1,Allen,1,3,0,0,1,0,6,0
5,6,0,3,"Moran, Mr. James",male,1,0,0,330877,1,...,1,Moran,1,3,0,0,1,0,3,2
6,7,0,1,"McCarthy, Mr. Timothy J",male,3,0,0,17463,3,...,1,McCarthy,1,2,0,0,1,0,3,0
7,8,0,3,"Palsson, Master. Gosta Leonard",male,0,3,1,349909,2,...,2,Palsson,1,3,0,4,0,1,0,0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,1,0,2,347742,1,...,3,Johnson,2,3,1,2,0,0,3,0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,0,1,0,237736,2,...,2,Nasser,2,3,1,1,0,0,0,1


In [18]:
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,NameLength,FamilyName,FamilySurvivedRate,CabinType,SexType,FamilySize,IsAlone,manyFamily,Age*Class,EmbarkedType
0,1,0,3,"Braund, Mr. Owen Harris",male,1,1,0,A/5 21171,0,...,1,Braund,1,3,0,1,0,0,3,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,2,1,0,PC 17599,3,...,3,Cumings,2,3,1,1,0,0,2,1
2,3,1,3,"Heikkinen, Miss. Laina",female,1,0,0,STON/O2. 3101282,1,...,1,Heikkinen,1,3,1,0,1,0,3,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,2,1,0,113803,3,...,2,Futrelle,2,3,1,1,0,0,2,0
4,5,0,3,"Allen, Mr. William Henry",male,2,0,0,373450,1,...,1,Allen,1,3,0,0,1,0,6,0
5,6,0,3,"Moran, Mr. James",male,1,0,0,330877,1,...,1,Moran,1,3,0,0,1,0,3,2
6,7,0,1,"McCarthy, Mr. Timothy J",male,3,0,0,17463,3,...,1,McCarthy,1,2,0,0,1,0,3,0
7,8,0,3,"Palsson, Master. Gosta Leonard",male,0,3,1,349909,2,...,2,Palsson,1,3,0,4,0,1,0,0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,1,0,2,347742,1,...,3,Johnson,2,3,1,2,0,0,3,0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,0,1,0,237736,2,...,2,Nasser,2,3,1,1,0,0,0,1


In [19]:
##
## Predict
##

### cross validation
from sklearn.model_selection import cross_val_score

### 
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb



In [20]:
X_train = train_df.drop(["PassengerId", "Survived", "Name", "Sex", "FamilySize", "manyFamily", "SibSp", "Parch", "Ticket", "Cabin", "Embarked", "EmbarkedType", "CabinType", "FamilyName"], axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.drop(["PassengerId",                      "Name", "Sex", "FamilySize", "manyFamily", "SibSp", "Parch", "Ticket", "Cabin", "Embarked", "EmbarkedType", "CabinType", "FamilyName"], axis=1).copy()
print(X_train.shape, Y_train.shape, X_test.shape)
X_train

(891, 9) (891,) (418, 9)


Unnamed: 0,Pclass,Age,Fare,Title,NameLength,FamilySurvivedRate,SexType,IsAlone,Age*Class
0,3,1,0,1,1,1,0,0,3
1,1,2,3,3,3,2,1,0,2
2,3,1,1,2,1,1,1,1,3
3,1,2,3,3,2,2,1,0,2
4,3,2,1,1,1,1,0,1,6
5,3,1,1,1,1,1,0,1,3
6,1,3,3,1,1,1,0,1,3
7,3,0,2,4,2,1,0,0,0
8,3,1,1,3,3,2,1,0,3
9,2,0,2,3,2,2,1,0,0


In [21]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators=200, max_depth=3)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
print(acc_random_forest)

print(cross_val_score(random_forest, X_train, Y_train, cv=4).mean())

86.2
0.847390431751


In [22]:
gbm = xgb.XGBClassifier(max_depth=3, n_estimators=30, learning_rate=0.05).fit(X_train, Y_train)
Y_pred = gbm.predict(X_test)

acc_xgboost= round(gbm.score(X_train, Y_train) * 100, 2)
print(acc_xgboost)
print(cross_val_score(gbm, X_train, Y_train, cv=4).mean())

86.2
0.856319048268


In [23]:
submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_pred
    })
print(submission)

submission.to_csv('./my_submission.csv', index=False)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         1
5            897         0
6            898         1
7            899         0
8            900         1
9            901         0
10           902         0
11           903         0
12           904         1
13           905         0
14           906         1
15           907         1
16           908         0
17           909         0
18           910         0
19           911         1
20           912         0
21           913         0
22           914         1
23           915         0
24           916         1
25           917         0
26           918         1
27           919         0
28           920         0
29           921         0
..           ...       ...
388         1280         0
389         1281         0
390         1282         0
391         1283         1
392         1284         1
3