In [217]:
# Load libraries
import pandas as pd
import numpy as np
import re
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
# warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [218]:
# Load the data set
titanic_df = pd.read_csv('input/train.csv')
test_df = pd.read_csv('input/test.csv')

In [219]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [220]:
titanic_df.info()
print('=============')
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float

# Feature Engineering

In [221]:
full_data = [titanic_df, test_df]

In [222]:
# # New Feature Name_Length
# titanic_df['Name_Length'] = titanic_df['Name'].apply(len)
# test_df['Name_Length'] = test_df['Name'].apply(len)

In [223]:
# Feature that tells if a passenger has a cabin
for dataset in full_data:
    dataset['Has_Cabin'] = dataset['Cabin'].apply(lambda x:0 if type(x)==float else 1)
    
# Feature that counts family number
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch']
    
# Feature that tells if a passenger has family
for dataset in full_data:
    dataset['Has_Family'] = dataset['FamilySize'].apply(lambda x:0 if x==0 else 1)

In [224]:
# Get sex dummy feature
for dataset in full_data:
    dataset['Male'] = 0
    dataset['Male'] = dataset['Sex'].apply(lambda x:1 if x=='male' else 0)
    dataset['Female'] = 0
    dataset['Female'] = dataset['Sex'].apply(lambda x:1 if x=='female' else 0)

In [225]:
# fill missing Embarked column
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
    
# get Embarked dummy column
dummies_Embarked_titanic = pd.get_dummies(titanic_df['Embarked'])
titanic_df = titanic_df.join(dummies_Embarked_titanic)

dummies_Embarked_test = pd.get_dummies(test_df['Embarked'])
test_df = test_df.join(dummies_Embarked_test)

In [226]:
# fill missing Age column
age_avg = titanic_df['Age'].mean()
age_std = titanic_df['Age'].std()
age_null_count = titanic_df['Age'].isnull().sum()
age_null_random = np.random.randint(age_avg - age_std, age_avg + age_std, age_null_count)
titanic_df['Age'][titanic_df['Age'].isnull()] = age_null_random
titanic_df['Age'] = titanic_df['Age'].astype(int)

# Mapping Age
titanic_df.loc[ titanic_df['Age'] <= 16, 'Age'] = 1
titanic_df.loc[(titanic_df['Age'] > 16) & (titanic_df['Age'] <= 32), 'Age'] = 2
titanic_df.loc[(titanic_df['Age'] > 32) & (titanic_df['Age'] <= 48), 'Age'] = 3
titanic_df.loc[(titanic_df['Age'] > 48) & (titanic_df['Age'] <= 64), 'Age'] = 3
titanic_df.loc[ titanic_df['Age'] > 64, 'Age'] = 4 

age_avg = test_df['Age'].mean()
age_std = test_df['Age'].std()
age_null_count = test_df['Age'].isnull().sum()
age_null_random = np.random.randint(age_avg - age_std, age_avg + age_std, age_null_count)
test_df['Age'][test_df['Age'].isnull()] = age_null_random
test_df['Age'] = test_df['Age'].astype(int)

# Mapping Age
test_df.loc[ test_df['Age'] < 16, 'Age'] = 1
test_df.loc[(test_df['Age'] >= 16) & (test_df['Age'] <= 32), 'Age'] = 2
test_df.loc[(test_df['Age'] > 32) & (test_df['Age'] <= 48), 'Age'] = 3
test_df.loc[(test_df['Age'] > 48) & (test_df['Age'] <= 64), 'Age'] = 4
test_df.loc[ test_df['Age'] > 64, 'Age'] = 5



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [227]:
titanic_df['IsChild'] = titanic_df['Age']
titanic_df['IsChild'] = titanic_df['Age'].apply(lambda x: 1 if x == 1 else 0)
test_df['IsChild'] = test_df['Age']
test_df['IsChild'] = test_df['Age'].apply(lambda x: 1 if x == 1 else 0)

In [228]:
titanic_df['IsChild']

0      0
1      0
2      0
3      0
4      0
5      0
6      0
7      1
8      0
9      1
10     1
11     0
12     0
13     0
14     1
15     0
16     1
17     0
18     0
19     0
20     0
21     0
22     1
23     0
24     1
25     0
26     0
27     0
28     0
29     0
      ..
861    0
862    0
863    0
864    0
865    0
866    0
867    0
868    0
869    1
870    0
871    0
872    0
873    0
874    0
875    1
876    0
877    0
878    0
879    0
880    0
881    0
882    0
883    0
884    0
885    0
886    0
887    0
888    0
889    0
890    0
Name: IsChild, Length: 891, dtype: int64

In [229]:
# function to extract titles from passenger names
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""
# Create a new feature Title
titanic_df['Title'] = titanic_df['Name'].apply(get_title)
test_df['Title'] = test_df['Name'].apply(get_title)

# Group all non-common titles into one single grouping "Rare"
titanic_df['Title'] = titanic_df['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
titanic_df['Title'] = titanic_df['Title'].replace('Mlle', 'Miss')
titanic_df['Title'] = titanic_df['Title'].replace('Ms', 'Miss')
titanic_df['Title'] = titanic_df['Title'].replace('Mme', 'Mrs')

test_df['Title'] = test_df['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
test_df['Title'] = test_df['Title'].replace('Mlle', 'Miss')
test_df['Title'] = test_df['Title'].replace('Ms', 'Miss')
test_df['Title'] = test_df['Title'].replace('Mme', 'Mrs')
# Getting Title dummy column
dummies_title_titanic = pd.get_dummies(titanic_df['Title'])
titanic_df = titanic_df.join(dummies_title_titanic)
dummies_title_test = pd.get_dummies(test_df['Title'])
test_df = test_df.join(dummies_title_test)

In [230]:
# fill missing Fare column of test.csv
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())
    
#Mapping Fare
titanic_df.loc[ titanic_df['Fare'] <= 7.91, 'Fare']       = 0
titanic_df.loc[(titanic_df['Fare'] > 7.91) & (titanic_df['Fare'] <= 14.454), 'Fare'] = 1
titanic_df.loc[(titanic_df['Fare'] > 14.454) & (titanic_df['Fare'] <= 31), 'Fare']   = 2
titanic_df.loc[ titanic_df['Fare'] > 31, 'Fare']        = 3

test_df.loc[ test_df['Fare'] <= 7.91, 'Fare']       = 0
test_df.loc[(test_df['Fare'] > 7.91) & (test_df['Fare'] <= 14.454), 'Fare'] = 1
test_df.loc[(test_df['Fare'] > 14.454) & (test_df['Fare'] <= 31), 'Fare']   = 2
test_df.loc[ test_df['Fare'] > 31, 'Fare']        = 3

titanic_df['Fare'] = titanic_df['Fare'].astype(int)
test_df['Fare'] = test_df['Fare'].astype(int)

In [231]:
titanic_df.drop(['PassengerId','Name','Sex','SibSp','Parch','Ticket','Cabin','Embarked','Has_Cabin','Title'],axis=1, inplace=True)

In [232]:
test_df.drop(['Name','Sex','SibSp','Parch','Ticket','Embarked','Cabin','Has_Cabin','Title'],axis=1,inplace=True)

In [233]:
titanic_df

Unnamed: 0,Survived,Pclass,Age,Fare,FamilySize,Has_Family,Male,Female,C,Q,S,IsChild,Master,Miss,Mr,Mrs,Rare
0,0,3,2,0,1,1,1,0,0,0,1,0,0,0,1,0,0
1,1,1,3,3,1,1,0,1,1,0,0,0,0,0,0,1,0
2,1,3,2,1,0,0,0,1,0,0,1,0,0,1,0,0,0
3,1,1,3,3,1,1,0,1,0,0,1,0,0,0,0,1,0
4,0,3,3,1,0,0,1,0,0,0,1,0,0,0,1,0,0
5,0,3,2,1,0,0,1,0,0,1,0,0,0,0,1,0,0
6,0,1,3,3,0,0,1,0,0,0,1,0,0,0,1,0,0
7,0,3,1,2,4,1,1,0,0,0,1,1,1,0,0,0,0
8,1,3,2,1,2,1,0,1,0,0,1,0,0,0,0,1,0
9,1,2,1,2,1,1,0,1,1,0,0,1,0,0,0,1,0


In [240]:
# RandomForest
rf = RandomForestClassifier(n_estimators=500,max_depth=7)
rf.fit(x_train, y_train)
rf_pred = rf.predict(test_df.drop('PassengerId',axis=1))
rf.score(x_train, y_train)

0.85746352413019078

In [245]:
# KNN
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(x_train, y_train)
knn_pred = knn.predict(x_test)
knn.score(x_train, y_train)

0.81032547699214363

In [None]:
# Gaussian Naive Bayes

gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
gaussian.score(X_train, Y_train)
Y_pred_gaussian = gaussian.predict(X_test)
gaussian.score(X_train, Y_train)

In [236]:
submission.to_csv('titanic_RF.csv',index=False)