# Titanic Survival Prediction

In [116]:
import pandas as pd # Pandas for data frames
import numpy as np # Standard
import seaborn as sns # Data visualization
import matplotlib.pyplot as plt # Plotting
import sklearn # Machine learning library
import sklearn.preprocessing as preprocess # Preprocessing nulls
from sklearn.linear_model import LogisticRegression #Logistic Regression
from sklearn.ensemble import RandomForestClassifier #Random Forests
from sklearn.neighbors import KNeighborsClassifier #KNearestNeighbors

Read the training and test datasets from CSV files:

In [145]:
# Locations of the two CSV files on the local machine
TRAIN_CSV = "C:\\Users\\Tom\\Documents\\Titanic\\train.csv"
TEST_CSV = "C:\\Users\\Tom\\Documents\\Titanic\\test.csv"

train = pd.read_csv(TRAIN_CSV)  # training dataset
test = pd.read_csv(TEST_CSV)  # testing dataset

Before doing any modeling, I want to take a look at the dataset. The target variable is `Survived`. Other variables that might be useful are `Sex`, `Age`, `SibSp` (siblings or spouses aboard), `Parch` (parents or children aboard), `Fare` (ticket price), and `Embarked` (port of embarkation: C = Cherbourg, Q = Queenstown, S = Southampton).

In [148]:
# Preview the first five rows of the training data
print(train.head(n=5))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


I also want to check for null values. In this case, most of the nulls are in `Age` (177) and `Cabin` (687); `Embarked` is missing for only 2 passengers.

In [119]:
# Count the missing values in each column of the training data
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

I could drop cabin, but there is some nice information there. First, I want to check to see if cabin is redundant with passenger class. For example, if all first class passengers had cabins and only first class passengers had cabins, I could drop cabin as a column. 

In [120]:
# Count the passengers with no recorded cabin in each passenger class,
# to see whether every first class passenger had a cabin.
no_cabin = train['Cabin'].isnull()
train[no_cabin].groupby('Pclass').size()

Pclass
1     40
2    168
3    479
dtype: int64

In [121]:
# Passengers per class minus passengers with no cabin gives the number with a
# recorded cabin, which shows whether 2nd and 3rd class passengers had cabins.
class_totals = train.groupby('Pclass').size()
class_missing_cabin = train[train['Cabin'].isnull()].groupby('Pclass').size()
class_totals - class_missing_cabin


Pclass
1    176
2     16
3     12
dtype: int64

In [122]:
# Derive a cabin level from the first character of each cabin value.
# Passengers with no cabin recorded get the placeholder level 'N'.
cabin_initial = train['Cabin'].str[0]
train['Cabin_Level'] = cabin_initial.fillna('N')

In [123]:
# According to Encyclopedia Titanica, both passengers with a missing port embarked at Southampton.
# https://www.encyclopedia-titanica.org/titanic-survivor/amelia-icard.html
# Fill only the Embarked column: filling whole rows would also write 'S' into
# any other column that happens to be missing for these passengers.
train['Embarked'] = train['Embarked'].fillna('S')

In [124]:
# Missing ages are handled next. The approach comes from a Datacamp exercise.

# Group the training passengers by sex and passenger class: by_sex_class
by_sex_class = train.groupby(by=['Sex', 'Pclass'])

# Helper that fills the gaps in a series with that series' own median
def impute_median(series):
    """Return a copy of `series` with missing values replaced by its median."""
    middle_value = series.median()
    return series.fillna(middle_value)

# Fill each missing age with the median age of the passenger's (Sex, Pclass) group
# and store the result back in train['Age']
imputed_age = by_sex_class['Age'].transform(impute_median)
train['Age'] = imputed_age


In [125]:
# Sanity check: every column that feeds the models should now have zero missing values.
# Cabin still has nulls, but it is not used in any of the models, so that is fine.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Cabin_Level      0
dtype: int64

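I apply the same preprocessing to the test set that I applied to the training set. Open question: is this the right way to handle missing data in the test set? I am concerned about data leakage; the usual practice is to learn imputation values (here, the group medians) from the training set only and reuse them on the test set.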

In [126]:
# Apply the training-set cleaning steps to the test set.
# Imputation values are learned from the training data only, so the test set is
# transformed exactly as the training set was and no test-set statistics are used.

# Replace each cabin with its cabin level; missing cabins get the level 'N'.
test['Cabin_Level'] = test['Cabin'].str[0].fillna('N')

# Median Age and Fare of every (Sex, Pclass) group in the training data.
group_keys = ['Sex', 'Pclass']
train_medians = train.groupby(group_keys)[['Age', 'Fare']].median()

# For every test passenger, look up the training medians of their group.
# `join` keeps the test index, so the result lines up row-for-row with `test`.
test_fill_values = test[group_keys].join(train_medians, on=group_keys)

# Fill only the missing entries; observed Age and Fare values are left untouched.
for column in ['Age', 'Fare']:
    test[column] = test[column].fillna(test_fill_values[column])

In [127]:
# Number of training passengers on each cabin level
train.groupby(by='Cabin_Level').size()


Cabin_Level
A     15
B     47
C     59
D     33
E     32
F     13
G      4
N    687
T      1
dtype: int64

In [128]:
# There is no deck 'T' on the Titanic, so replacing that with 'N'.
# Only the Cabin_Level column is touched: a row-wide replace would also rewrite
# any other column whose value is exactly 'T' (such as the raw Cabin value).
train.loc[train['Cabin_Level'] == 'T', 'Cabin_Level'] = 'N'

In [143]:
# Re-count passengers per cabin level to confirm that level 'T' is gone
train.groupby(by='Cabin_Level').size()

Cabin_Level
A     15
B     47
C     59
D     33
E     32
F     13
G      4
N    688
dtype: int64

In [130]:
# Split the target from the features and drop the columns not needed in the model.
# `columns=` is used instead of a positional axis argument, which pandas 2.0+ rejects.
DROPPED_COLUMNS = ['PassengerId', 'Name', 'Cabin', 'Ticket']
X_train = train.drop(columns=['Survived'] + DROPPED_COLUMNS)
Y_train = train['Survived']
X_test = test.drop(columns=DROPPED_COLUMNS)


In [131]:
# One-hot encode the categorical variables.
X_train = pd.get_dummies(X_train)
# Encode the test set, then force it onto the training columns in the same order.
# A category seen only in training becomes an all-zero column, and a category
# seen only in the test set is dropped, so the models always receive the
# feature layout they were fitted on.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

In [132]:
# Standardising the age and fare columns (zero mean, unit variance).
from scipy import stats

for column in ['Fare', 'Age']:
    # Learn the scaling from the training data only and reuse it for the test
    # set, so both sets are expressed on the same scale.
    train_mean = X_train[column].mean()
    train_std = X_train[column].std(ddof=0)  # population std, the same convention as stats.zscore
    # Each test column is scaled from its own values (previously Age was
    # overwritten with the z-scores of Fare).
    X_test[column] = (X_test[column] - train_mean) / train_std
    X_train[column] = stats.zscore(X_train[column])

In [133]:
# Create the classifier: logreg
# Fit it on the training features and predict survival for the test set.
# The target is stored as `Y_train` (capital Y); the lowercase spelling is undefined.
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
y_pred_log = logreg.predict(X_test)

In [134]:
# Create random forest model with 1000 trees.
# A fixed random_state makes the fitted forest, and therefore its predictions,
# reproducible from one run of the notebook to the next.
random_forest = RandomForestClassifier(n_estimators=1000, random_state=42)
random_forest.fit(X_train, Y_train)
y_pred_rf = random_forest.predict(X_test)

In [135]:
# Build a k-nearest-neighbours classifier with default settings,
# fit it on the training data and predict for the test set
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
y_pred_knn = knn.predict(X_test)


In [139]:
# Pair every test passenger id with the logistic regression prediction.
# Building the frame from a dict gives the prediction column an explicit
# 'Survived' label, matching the target column of the training data.
Submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': y_pred_log,
})

In [140]:
# Save file to CSV.
# index=False keeps the DataFrame's row index out of the file; otherwise it is
# written as an extra, unnamed first column.
Submission.to_csv("C:\\Users\\Tom\\Documents\\Titanic\\Submission.csv", index=False)