In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show the entries directly under ../input (os.listdir does not recurse).
print(os.listdir("../input"))
# Any results you write to the current directory are saved as output.

In [None]:
# Load both Kaggle splits and tag every row with its origin
# (Train: 1 = train.csv, 0 = test.csv) so the combined frame can be split again
# once preprocessing is done.
titanic_train = pd.read_csv('../input/train.csv')
titanic_train['Train'] = 1
titanic_test = pd.read_csv('../input/test.csv')
titanic_test['Train'] = 0

# Stack the feature columns of both splits; the label stays behind in titanic_train.
titanic = pd.concat([titanic_train.drop(columns=['Survived']), titanic_test])

# Clean Data

In [None]:
# Median age over the combined train + test rows.
titanic['Age'].median()

In [None]:
# Check for null value %
# The mean of the boolean null mask is nulls / total rows. Dividing by
# titanic.count() (the non-null rows) would overstate the rate and can exceed 100%.
titanic.isnull().mean() * 100

In [None]:
# Fill missing ages with the median age of the combined train + test rows.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())

# Fill missing embarkation ports with the most frequent port.
emb = titanic['Embarked'].mode()[0]
titanic['Embarked'] = titanic['Embarked'].fillna(emb)

# Fill missing fares with the median fare, so no numeric model feature is left
# null and the value does not depend on whichever row happens to precede it.
titanic['Fare'] = titanic['Fare'].fillna(titanic['Fare'].median())

# Remaining nulls per column.
titanic.isnull().sum(axis=0)

# Feature Engineering


There are a number of options for engineering variables from the existing data. Two candidates for new variables are Salutation and Family Size. Let's create Family Size from SibSp and Parch, which count the siblings/spouses and the parents/children each passenger had aboard the Titanic.

$\text{Family Size} = \text{SibSp} + \text{Parch}$

In [None]:
# Relatives travelling with the passenger: the SibSp count plus the Parch count.
titanic['Family_Size'] = titanic['SibSp'].add(titanic['Parch'])

In [None]:
# Pull the salutation out of 'Name': take the piece after the first comma and
# keep what precedes its first period. The space that follows the comma is
# retained, so values look like ' Mr'.
titanic['Salutation'] = titanic['Name'].apply(
    lambda full_name: full_name.split(",")[1].split(".")[0]
)

In [None]:
#Create age groups to analyse the relationship between age and survival
def _age_group(age):
    """Return the age-band label for ``age``.

    Bands are checked in ascending order and each upper bound is inclusive, so
    a fractional age falls into the first band whose upper bound it does not
    exceed (e.g. 10.5 -> '11-20'). Anything above 70, and any value for which
    every comparison is false (such as NaN), maps to '71+'.
    """
    if age <= 10:
        return '0-10'
    if age <= 20:
        return '11-20'
    if age <= 30:
        return '21-30'
    if age <= 50:
        return '31-50'
    if age <= 70:
        return '51-70'
    return '71+'


# Plain upper-bound tests are used on purpose: a chained comparison such as
# ``11 >= x <= 20`` only checks x <= 11 and would push ages into the wrong band.
titanic['Age_Group'] = titanic['Age'].apply(_age_group)

# One Hot Encoding

In [None]:
# Clean up the variables before one hot encoding by removing the columns the
# model cannot use:
#   Name   - free text that cannot be used in the model
#   Ticket - the string format is inconsistent
#   Cabin  - the values are inconsistent and many are missing
# PassengerId is only a unique ID assigned to passengers, but it is kept in the
# frame for now rather than dropped here.
titanic.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

In [None]:
# Next let's create dummy variables ready to train the model

def get_dummies(df):
    """One-hot encode every object-dtype column of ``df``.

    Each object column is removed and its indicator columns are appended at the
    end of the frame, named ``<column>__<value>`` (the prefix already ends in an
    underscore and pandas adds its own separator). The frame passed in is not
    modified; when it has no object columns it is returned as-is.
    """
    encoded = df
    for column in df.select_dtypes(include='object').columns:
        indicators = pd.get_dummies(encoded[column], prefix=column + '_')
        encoded = pd.concat([encoded.drop(columns=[column]), indicators], axis=1)
    return encoded

# Encode the combined train + test frame in one pass so both splits end up
# with the same set of indicator columns.
titanic_new = get_dummies(titanic)

In [None]:
#titanic_new = titanic_new[['PassengerId','Age', 'Fare','Family_Size','Sex__female', 'Sex__male','Embarked__C','Embarked__S', 'Salutation__ Mr','Train']]

# Train Model

In [None]:
# Separate the combined frame back into its two splits using the Train flag
# (1 = rows from train.csv, 0 = rows from test.csv).
titanic_train_new = titanic_new.loc[titanic_new['Train'].eq(1)]
titanic_test_new = titanic_new.loc[titanic_new['Train'].eq(0)]

In [None]:
# The Train flag was only a bookkeeping column; remove it from both splits so
# it is not passed to the model as a feature.
titanic_train_new = titanic_train_new.drop(columns=['Train'])
titanic_test_new = titanic_test_new.drop(columns=['Train'])

In [None]:
# Labels come from the raw training file; features are every column of the
# encoded training split except the first one (assumed to be PassengerId).
train_y = titanic_train['Survived']
train_x = titanic_train_new.iloc[:, 1:]
# Create a random forest Classifier. By convention, clf means 'Classifier'.
# n_jobs=-1 uses all available cores; a hard-coded 100 workers oversubscribes
# most machines and, with random_state fixed, does not change the fitted model.
clf = RandomForestClassifier(bootstrap=False, n_jobs=-1, random_state=0, max_features='sqrt')
clf.fit(train_x, train_y)

In [None]:
# Pair every feature name with the importance the fitted forest assigned to it.
importance = [
    (feature, weight)
    for feature, weight in zip(train_x.columns, clf.feature_importances_)
]
importance

In [None]:
 # Accuracy on the same rows the model was fitted on, so this is a training
 # score rather than a held-out estimate.
 accuracy_score(train_y, clf.predict(train_x))

In [None]:
# confusion_matrix puts true labels on the rows and predictions on the columns,
# both ordered as given in `labels`: Survived == 0 (died) first, then 1 (survived).
pd.DataFrame(
    confusion_matrix(train_y, clf.predict(train_x), labels=[0, 1]),
    columns=['Predicted Died', 'Predicted Survived'],
    index=['True Died', 'True Survived']
)

# Apply to test

In [None]:
# Keep the passenger ids aside; they identify each row of the submission.
key = titanic_test_new['PassengerId']
# Drop the first column (assumed to be PassengerId), mirroring how train_x is built.
test_x = titanic_test_new.iloc[:, 1:]
# Forward-fill any remaining gaps. DataFrame.ffill() is the supported spelling of
# the deprecated fillna(method='ffill') and produces the same values.
test_x = test_x.ffill()

In [None]:
# Predict survival for the test passengers and wrap the result in a one-column frame.
predict_y = pd.DataFrame(clf.predict(test_x), columns=['Survived'])
# pd.concat(axis=1) aligns on index labels. predict_y is numbered 0..n-1, so the
# ids are renumbered the same way to guarantee a row-by-row pairing regardless of
# the index the test split carries.
results = pd.concat([key.reset_index(drop=True), predict_y], axis=1)

In [None]:
# Write the predictions to the working directory with a header row and without
# the DataFrame index.
submission_path = 'titanic_sink_swim_2.csv'
results.to_csv(submission_path, header=True, index=False)