In [8]:
##load required libraries
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import re

In [9]:
# Load the training and test CSV files; header=0 marks row 0 as the header row.
trainDf = pd.read_csv('data/train.csv', header=0)
testDf = pd.read_csv('data/test.csv', header=0)

# Remember which PassengerIds belong to each file so the combined frame can be
# split again at the end of preprocessing.
trainIds = trainDf.PassengerId
testIds = testDf.PassengerId

# Stack both files so every feature is engineered on all rows at once.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of the two files repeat, which makes label-based selection
# and assignment on the combined frame ambiguous.
df = pd.concat([trainDf, testDf], ignore_index=True)

In [10]:
# Numeric encoding of the Sex column: female -> 0, male -> 1.
df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)

# Family size: SibSp + Parch, plus one for the passenger.
df['FamilySize'] = 1 + df['SibSp'] + df['Parch']

# Count how many rows share each ticket and attach the count as Ticket_N.
ticket_counts = df['Ticket'].value_counts()
tickets = pd.DataFrame({'Ticket': ticket_counts.index,
                        'Ticket_N': ticket_counts.values})
# Left join on the column(s) the two frames have in common, keeping every passenger row.
df = df.merge(tickets, how='left')

# Fill missing fares with the median fare of 3rd class.
third_class_median_fare = df.groupby("Pclass")["Fare"].median()[3]
fare_missing = df['Fare'].isnull()
df.loc[fare_missing, 'Fare'] = third_class_median_fare

# Fare per person: the fare divided by the number of passengers on the ticket.
df['Fare_N'] = df['Fare'] / df['Ticket_N']

In [11]:
# Extract the title from the name field: splitting the name on "," or "."
# (each optionally followed by one whitespace character) leaves the title at position 1.
# The pattern is a raw string because "\s" in a plain literal is an invalid
# escape sequence.
title_splitter = re.compile(r'[,.]\s?')
df['Title'] = df.Name.apply(lambda x: title_splitter.split(x)[1])

# Map rare titles to the most common equivalents. No target value is itself a
# key of the table, so a single lookup gives the same result as applying the
# replacements one after another.
# NOTE(review): 'Mme' is mapped to "Miss" and 'Ms' to "Mrs" - confirm this is intended.
title_map = {
    'Lady': "Mrs",
    'the Countess': "Mrs",
    'Mme': "Miss",
    'Mlle': "Miss",
    'Ms': "Mrs",
    'Capt': "Sir",
    'Major': "Sir",
    'Col': "Sir",
    'Rev': "Sir",
    'Jonkheer': "Sir",
    'Don': "Mr",
    'Dona': "Mrs",
}
# Titles that are not in the table are kept unchanged.
df['Title'] = df['Title'].map(lambda title: title_map.get(title, title))

In [12]:
# Age_ is an imputed copy of Age; the original Age column is left as it is.
# Missing ages take the median age of the row's (Pclass, Title) group.
group_median_age = df.groupby(["Pclass", "Title"])["Age"].transform("median")
df['Age_'] = df['Age'].fillna(group_median_age)
# 1 where the original age was missing, 0 otherwise.
df['Age_isNull'] = df['Age'].isnull().astype(int)

# Engineered feature: imputed age multiplied by passenger class.
df['Age_x_Pclass'] = df['Age_'] * df['Pclass'].astype(int)

In [13]:
# Extract the surname: the part of the name before the first "," or ".".
# The pattern is a raw string because "\s" in a plain literal is an invalid
# escape sequence.
df['Surname'] = df.Name.apply(lambda x: re.split(r'[,.]\s?', x)[0])
# A family is identified by its surname combined with its family size.
df['FamilyId'] = df.Surname + "-" + df.FamilySize.astype(str)

# Pool small families (FamilySize below 3) under the catch-all id 'X-X';
# the original note says there are way too many of them and that they had a
# higher chance to survive.
df.loc[df.FamilySize < 3, 'FamilyId'] = 'X-X'

# Count the rows per FamilyId and pool every id that occurs on fewer than
# 3 rows as well.
families = df.FamilyId.value_counts()
families = pd.DataFrame({'FamilyId': families.index, 'Size': families.values})
wrongCounts = families.FamilyId[families.Size < 3]
df.loc[df.FamilyId.isin(wrongCounts), 'FamilyId'] = 'X-X'

In [14]:
# Children and women had priority when taking the boats; Priority = 1 marks them.
is_child = df['Age'] < 15              # rows with a missing Age compare as False
is_master = df['Title'] == "Master"    # master was used for kids until 12-15 years
is_female = df['Sex'] == "female"
df['Priority'] = (is_child | is_master | is_female).astype('int64')

In [15]:
# Ticket type: the first character of the ticket once the listed punctuation
# characters and spaces are removed; it can be used to cluster tickets/passengers
# into groups.
# The two-argument str.translate(None, chars) form only exists for Python 2 byte
# strings, so the same set of characters is stripped with a regex character class,
# which behaves identically on Python 2 and Python 3.
ticket_noise = re.compile(r"[\[\]!#$%()*,.:;<=>@^_`|~{} ]")
df['TicketType'] = df.Ticket.apply(lambda s: ticket_noise.sub("", s)[0])

# Impute missing embarkation values with "S" (described by the author as the most common case).
df.loc[df.Embarked.isnull(), 'Embarked'] = "S"
# Integer code for the port of embarkation.
df['Port'] = df['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

In [16]:
# scikit-learn needs numeric data, so each string column gets an integer-coded twin.
for encoded_col, source_col in [('Ticket_', 'TicketType'),
                                ('FamilyId_', 'FamilyId'),
                                ('Title_', 'Title')]:
    df[encoded_col] = pd.factorize(df[source_col])[0]

In [17]:
# Print a summary of the combined frame: index range, columns and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 1308
Data columns (total 28 columns):
Age             1046 non-null float64
Cabin           295 non-null object
Embarked        1309 non-null object
Fare            1309 non-null float64
Name            1309 non-null object
Parch           1309 non-null int64
PassengerId     1309 non-null int64
Pclass          1309 non-null int64
Sex             1309 non-null object
SibSp           1309 non-null int64
Survived        891 non-null float64
Ticket          1309 non-null object
Gender          1309 non-null int64
FamilySize      1309 non-null int64
Ticket_N        1309 non-null int64
Fare_N          1309 non-null float64
Title           1309 non-null object
Age_            1309 non-null float64
Age_isNull      1309 non-null int64
Age_x_Pclass    1309 non-null float64
Surname         1309 non-null object
FamilyId        1309 non-null object
Priority        1309 non-null int64
TicketType      1309 non-null object
Port      

In [18]:
# Separate the combined frame back into its train and test rows by PassengerId.
in_train = df['PassengerId'].isin(trainIds)
in_test = df['PassengerId'].isin(testIds)
trainDf = df[in_train]
testDf = df[in_test]


In [19]:
# Load the survival labels for the test dataset.
survTestDf = pd.read_csv("data/test-full.csv", header=0)
# print(...) with a single argument prints the same text on Python 2 and Python 3.
print(survTestDf.shape)
# Drop the Survived column carried over from the combined frame so that it does
# not collide with the Survived column of survTestDf in the merge below.
testDf = testDf.drop('Survived', axis=1)
# Right join: the result has one row per PassengerId listed in survTestDf.
testDf2 = pd.merge(testDf, survTestDf, on="PassengerId", how='right')
print(testDf2.shape)
testDf2.head(5)

(390, 2)
(390, 28)


Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,...,Age_x_Pclass,Surname,FamilyId,Priority,TicketType,Port,Ticket_,FamilyId_,Title_,Survived
0,47,,S,7.0,"Wilkes, Mrs. James (Ellen Needs)",0,893,3,female,1,...,141,Wilkes,X-X,1,3,0,4,0,1,1
1,62,,Q,9.6875,"Myles, Mr. Thomas Francis",0,894,2,male,0,...,124,Myles,X-X,0,2,2,5,0,0,0
2,27,,S,8.6625,"Wirz, Mr. Albert",0,895,3,male,0,...,81,Wirz,X-X,0,3,0,4,0,0,0
3,22,,S,12.2875,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,896,3,female,1,...,66,Hirvonen,X-X,1,3,0,4,0,1,1
4,14,,S,9.225,"Svensson, Mr. Johan Cervin",0,897,3,male,0,...,42,Svensson,X-X,1,7,0,7,0,0,1


In [20]:
from sklearn.cross_validation import cross_val_score, LeaveOneOut
from scipy.stats import sem

def loo_cv(X_train,y_train,clf):
    """Run leave-one-out cross-validation of ``clf`` and print the mean accuracy.

    Each sample is held out once: ``clf`` is fitted on all other samples and
    asked to predict the held-out one. The per-sample score is 1.0 when the
    prediction equals the label (both cast to int) and 0.0 otherwise.

    Parameters
    ----------
    X_train : array-like (NumPy array, pandas DataFrame, nested list, ...)
        Feature rows; converted with ``np.asarray`` so rows are always
        selected by position.
    y_train : array-like
        One label per row of ``X_train``.
    clf : object with ``fit(X, y)`` and ``predict(X)``
        The estimator to evaluate. It is refitted once per sample, so the
        object passed in is the one being fitted.

    Returns
    -------
    None
        The result is printed as ``Mean score: <mean> (+/-<standard error>)``.

    Raises
    ------
    ValueError
        If fewer than 2 samples are given (the training fold would be empty).
    """
    X = np.asarray(X_train)
    y = np.asarray(y_train)
    n_samples = X.shape[0]
    if n_samples < 2:
        raise ValueError("leave-one-out cross-validation needs at least 2 samples")

    all_rows = np.arange(n_samples)
    scores = np.zeros(n_samples)
    # One fit/predict round per sample.
    for held_out in all_rows:
        train_index = all_rows[all_rows != held_out]
        test_index = all_rows[held_out:held_out + 1]
        clf = clf.fit(X[train_index], y[train_index])
        y_pred = np.asarray(clf.predict(X[test_index]))
        # Accuracy of a single prediction, computed directly with NumPy.
        scores[held_out] = np.mean(y[test_index].astype(int) == y_pred.astype(int))
    # Format first, then print: works on both Python 2 and Python 3.
    print("Mean score: {0:.3f} (+/-{1:.3f})".format(np.mean(scores), sem(scores)))

In [23]:
# Import the random forest package
from sklearn.cross_validation import cross_val_score, LeaveOneOut
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation

# Random forest with all the parameters for the fit.
# (The original note lists max_depth = 6-7 as an alternative setting.)
forest = RandomForestClassifier(
    n_estimators=500,
    max_features=3,
    max_depth=20,
    criterion="gini",
    min_samples_leaf=3,
    min_samples_split=6,
    oob_score=True,
    n_jobs=2,
    random_state=1234,
)

# Feature columns used by the model.
predictors = ['Pclass', 'Gender', 'Age_','FamilySize', 'Title_', 'Ticket_', 'Fare_N', 'Priority', 'Ticket_N', 'FamilyId_']

# Fit the training data to the Survived labels and create the decision trees.
model_rf = forest.fit(trainDf[predictors],trainDf['Survived'])
# Mean cross-validated score on the training data.
rf_score = cross_val_score(forest, trainDf[predictors], trainDf['Survived'], n_jobs=-1).mean()
print("cross val score -> {0})".format(rf_score))

# Score on the test rows that have labels (testDf2).
# The earlier predict() call on testDf2 was dropped: its result was overwritten
# below before ever being used.
print("score on test set -> {0})".format(forest.score(testDf2[predictors],testDf2['Survived'])))

# Predict the rows of testDf and write them out as a CSV file.
output = model_rf.predict(testDf[predictors])
outDf = pd.DataFrame({'PassengerId': testDf.PassengerId, 'Survived': output.astype(int)})
outDf.to_csv('data/rf-test.csv', index = False)



cross val score -> 0.832772166105)
score on test set -> 0.776923076923)


In [24]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost (SAMME) using the random forest defined above as its base estimator.
# random_state is fixed so repeated runs give the same model; the other
# stochastic models in this notebook use the same seed value.
bdt = AdaBoostClassifier(forest, algorithm="SAMME", n_estimators=100, random_state=1234)

model_bdt = bdt.fit(trainDf[predictors],trainDf['Survived'])
# Predictions for the labelled test rows.
out = model_bdt.predict(testDf2[predictors])
print("score on test set -> {0})".format(model_bdt.score(testDf2[predictors],testDf2['Survived'])))


score on test set -> 0.753846153846)


In [92]:
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import cross_val_score

# Feature columns used by the extra-trees model.
predictors = ['Pclass', 'Gender', 'Age_', 'FamilySize', 'FamilyId_','Title_', 'Ticket_', 'Fare_N', 'Priority', 'Ticket_N']
et = ExtraTreesClassifier(
    n_estimators=480,
    criterion="gini",
    max_features=3,
    max_depth=18,
    min_samples_split=6,
    random_state=1234,
)

# Mean cross-validated score on the training data, passed as plain arrays.
features = trainDf[predictors].values
labels = trainDf["Survived"].values
et_score = cross_val_score(et, features, labels, n_jobs=-1).mean()
print("cross val score -> {0})".format(et_score))

# Fit on the training rows, then score the training rows and the labelled test rows.
model = et.fit(trainDf[predictors],trainDf['Survived'])
output = model.predict(testDf2[predictors])
train_accuracy = et.score(trainDf[predictors],trainDf['Survived'])
print("score on train set -> {0})".format(train_accuracy))
test_accuracy = et.score(testDf2[predictors],testDf2['Survived'])
print("score on test set -> {0})".format(test_accuracy))

# Predict the rows of testDf and write them out as a CSV file.
out = model.predict(testDf[predictors])
outDf = pd.DataFrame({'PassengerId': testDf.PassengerId, 'Survived': out.astype(int)})
outDf.to_csv('data/et-test.csv', index = False)


cross val score -> 0.83950617284)
 score on train set -> 0.937149270483)
 score on test set -> 0.774358974359)


In [28]:
from sklearn.ensemble import GradientBoostingClassifier

# Feature columns used by the gradient boosting model.
predictors = [
    'Pclass', 'Gender', 'Age_', 'FamilySize', 'FamilyId_',
    'Title_', 'Ticket_', 'Fare_N', 'Priority', 'Ticket_N',
]

# 600 boosting stages of depth-1 trees with learning rate 0.5.
gbm = GradientBoostingClassifier(
    n_estimators=600,
    learning_rate=0.5,
    max_depth=1,
    random_state=1234,
)
model_gbm = gbm.fit(trainDf[predictors],trainDf['Survived'])

# Score on the labelled test rows.
gbm_test_accuracy = model_gbm.score(testDf2[predictors],testDf2['Survived'])
print("score on test set -> {0})".format(gbm_test_accuracy))


score on test set -> 0.766666666667)


In [27]:
from sklearn.naive_bayes import GaussianNB
# Feature columns used by the Gaussian naive Bayes model.
predictors = [
    'Pclass', 'Gender', 'Age_', 'FamilySize', 'FamilyId_',
    'Title_', 'Ticket_', 'Fare_N', 'Priority', 'Ticket_N',
]
nb = GaussianNB()
model_nb = nb.fit(trainDf[predictors],trainDf['Survived'])

# Score on the labelled test rows.
nb_test_accuracy = model_nb.score(testDf2[predictors],testDf2['Survived'])
print("score on test set -> {0})".format(nb_test_accuracy))


score on test set -> 0.748717948718)


In [26]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours model: 20 neighbours, p = 1, kd-tree search.
# Uses the `predictors` list defined by an earlier cell.
knn = KNeighborsClassifier(
    n_neighbors=20,
    algorithm='kd_tree',
    p=1,
    leaf_size=50,
)
model_knn = knn.fit(trainDf[predictors],trainDf['Survived'])
# Score on the labelled test rows.
knn_test_accuracy = model_knn.score(testDf2[predictors],testDf2['Survived'])
print("score on test set -> {0})".format(knn_test_accuracy))


score on test set -> 0.730769230769)
