# Titanic Survival Prediction

In [164]:
import pandas as pd # Pandas for data frames
import numpy as np # Standard
import seaborn as sns # Data visualization
import matplotlib.pyplot as plt # Plotting
import sklearn # Machine learning library
import sklearn.preprocessing as preprocess # Preprocessing nulls

Read data file in from a CSV:

In [165]:
train = pd.read_csv("C:\\Users\\Tom\\Documents\\Titanic\\train.csv") # Read in training dataset
test = pd.read_csv("C:\\Users\\Tom\\Documents\\Titanic\\test.csv") # Read in testing dataset

Before doing any type of modeling, I want to take a look at the dataset. The key variable here is "Survived". Other variables that might be useful are sex, Age, sibsp (sibling or spouse), parch (parent or child), fare price, and destination embarked (C = Cherbourg, Q = Queenstown, S = Southampton). 

In [166]:
train.head()
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


I also want to check for null values. In this case, most of the nulls are in age (177) and cabin (687).

In [167]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

I could drop cabin, but there is some nice information there. First, I want to check to see if cabin is redundant with passenger class. For example, if all first class passengers had cabins and only first class passengers had cabins, I could drop cabin as a column. 

In [168]:
# Grouped nulls by passenger class to see if all first class had cabins.
train[train['Cabin'].isnull()].groupby('Pclass').size()

Pclass
1     40
2    168
3    479
dtype: int64

In [169]:
# Checked to see if 2nd and 3rd class passengers had cabins.
train.groupby('Pclass').size() - train[train['Cabin'].isnull()].groupby('Pclass').size()

Pclass
1    176
2     16
3     12
dtype: int64

In [170]:
# Replaced cabins with a cabin level. NaNs for level have 'n'. 
train['Cabin_Level'] = train['Cabin'].str[0].fillna('N')

In [171]:
# According to Encyclopedia Titanica, both of the null passengers embarked at Southampton. 
# https://www.encyclopedia-titanica.org/titanic-survivor/amelia-icard.html
train[train['Embarked'].isnull()] = train[train['Embarked'].isnull()].fillna('S')

In [172]:
# Next part is dealing with the missing ages. This comes from a Datacamp exercise.

# Create a groupby object: by_sex_class
by_sex_class = train.groupby(['Sex', 'Pclass'])

# Write a function that imputes median
def impute_median(series):
    return series.fillna(series.median())

# Impute age and assign to titanic['Age']
train['Age'] = by_sex_class['Age'].transform(impute_median)


In [173]:
# Sanity check to make sure all missing values are taken care of. Cabin won't be in any of the models, so we're good.
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Cabin_Level      0
dtype: int64

I am doing the same methods for testing that I did for training. Is this the right way to handle missing data for testing? Concerned about data leakage.

In [174]:
# Do all of the above for the test data set
# Replaced cabins with a cabin level. NaNs for level have 'n'. 
test['Cabin_Level'] = test['Cabin'].str[0].fillna('N')

# Create a groupby object: by_sex_class
by_sex_class = test.groupby(['Sex', 'Pclass'])

# Write a function that imputes median
def impute_median(series):
    return series.fillna(series.median())

# Impute age and assign to titanic['Age']
test['Age'] = by_sex_class['Age'].transform(impute_median)

# Do the same thing for 'Fare' on test data
test['Fare'] = by_sex_class['Fare'].transform(impute_median)

In [175]:
train.groupby('Cabin_Level').size()


Cabin_Level
A     15
B     47
C     59
D     33
E     32
F     13
G      4
N    687
T      1
dtype: int64

In [176]:
# There is no deck 'T' on the Titanic, so replacing that with 'N':
train[train['Cabin_Level'] == 'T'] = train[train['Cabin_Level'] == 'T'].replace('T', 'N')

In [177]:
train.groupby('Cabin_Level').size()

Cabin_Level
A     15
B     47
C     59
D     33
E     32
F     13
G      4
N    688
dtype: int64

In [178]:
# Dropping columns not needed in the model. 
X_train = train.drop(['Survived', 'PassengerId', 'Name', 'Cabin', 'Ticket'], 1) # 1 is to specify columns
y_train = train['Survived']
X_test = test.drop(['PassengerId', 'Cabin', 'Name', 'Ticket'], 1)


In [230]:
# Fit and transform categorical variables to integers
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

     Pclass   Age  SibSp  Parch      Fare  Sex_female  Sex_male  Embarked_C  \
0         3  22.0      1      0    7.2500           0         1           0   
1         1  38.0      1      0   71.2833           1         0           1   
2         3  26.0      0      0    7.9250           1         0           0   
3         1  35.0      1      0   53.1000           1         0           0   
4         3  35.0      0      0    8.0500           0         1           0   
5         3  25.0      0      0    8.4583           0         1           0   
6         1  54.0      0      0   51.8625           0         1           0   
7         3   2.0      3      1   21.0750           0         1           0   
8         3  27.0      0      2   11.1333           1         0           0   
9         2  14.0      1      0   30.0708           1         0           1   
10        3   4.0      1      1   16.7000           1         0           0   
11        1  58.0      0      0   26.5500           

# TODO: Normalize columns

In [180]:
# First model is logistic regression
from sklearn.linear_model import LogisticRegression

In [181]:
# Create the classifier: logreg
logreg = LogisticRegression()

In [182]:
# Fit the classifier to the training data
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [204]:
# Predict the labels of the test set: y_pred
y_pred = logreg.predict(X_test)

In [225]:
Submission = pd.DataFrame([test['PassengerId'], y_pred]).transpose()

In [229]:
# Save file to CSV
Submission.to_csv("C:\\Users\\Tom\\Documents\\Titanic\\Submission.csv")