# Titanic Passengers Survival Predictor

In this project I created a Logistic Regression model that predicts which passengers survived the sinking of the Titanic, based on features like age and class.

The data I used for training the model was provided by Kaggle. 

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the passenger data
passengers = pd.read_csv('passengers.csv')
#print(passengers.head())

# Update sex column to numerical
passengers['Sex'] = passengers.Sex.map({'male':0, 'female':1})
#print(passengers.head())
# Fill the nan values in the age column
#print(passengers['Age'].values)
avg_Age = np.mean(passengers.Age)
print(avg_Age)
passengers['Age'].fillna(value=round(avg_Age,0), inplace=True)
#print(passengers.head(10))

# Create a first class column
passengers['FirstClass'] = passengers.Pclass.apply(lambda x: 1 if x == 1 else 0)
print(passengers.head(10))
# Create a second class column
passengers['SecondClass'] = passengers.Pclass.apply(lambda x: 1 if x == 2 else 0)
print(passengers.head(10))

29.69911764705882
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    0  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.0      1      0   
2                             Heikkinen, Miss. Laina    1  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  35.0      1      0   
4                           Allen, Mr. William Henry    0  35.0      0      0   
5                                   Moran, Mr. James    0  30.0      0      0   
6                 

In [3]:
# Select the desired features
features = passengers[['Sex', 'Age', 'FirstClass', 'SecondClass']]
survival = passengers['Survived']

# Perform train, test, split
X_train, X_test, y_train, y_test = train_test_split(features, survival, test_size = 0.3)

#Since sklearn‘s Logistic Regression implementation uses Regularization, we need to scale
#our feature data. Create a StandardScaler object, .fit_transform() it on the training features,
#and .transform() the test features.
# Scale the feature data so it has mean = 0 and standard deviation = 1
scaler = StandardScaler()
train_features = scaler.fit_transform(X_train)
test_features = scaler.transform(X_test)

# Create and train the model
titanic_model = LogisticRegression()
titanic_model.fit(X_train, y_train)

# Score the model on the train data
training_score = titanic_model.score(X_train, y_train)
print(training_score)
# Score the model on the test data
test_score = titanic_model.score(X_test, y_test)
print(test_score)

0.78330658105939
0.832089552238806


In [4]:
# Analyze the coefficients
#print(titanic_model.coef_)

#To print each feature with its respective coefficient value, you can use the following expression:
print(list(zip(['Sex','Age','FirstClass','SecondClass'],titanic_model.coef_[0])))

# Sample passenger features
jack = np.array([0.0,20.0,0.0,0.0])
rose = np.array([1.0,17.0,1.0,0.0])
will = np.array([0.0,60.0,1.0,0.0])
kristin = np.array([1.0,55.0,1.0,0.0])

# Combine passenger arrays
sample_passengers = np.array([jack, rose, will, kristin])

# Scale the sample passenger features
sample_passengers = scaler.transform(sample_passengers)
print(sample_passengers)

# Make survival predictions!
print(titanic_model.predict(sample_passengers))

print(titanic_model.predict_proba(sample_passengers))

[('Sex', 2.309169693177409), ('Age', -0.020936852924071124), ('FirstClass', 2.0049475239620285), ('SecondClass', 1.0485463693407606)]
[[-0.74926865 -0.78151043 -0.59774449 -0.49598387]
 [ 1.33463478 -1.00910469  1.67295561 -0.49598387]
 [-0.74926865  2.25307958  1.67295561 -0.49598387]
 [ 1.33463478  1.87375582  1.67295561 -0.49598387]]
[0 1 0 1]
[[0.9931873  0.0068127 ]
 [0.01228087 0.98771913]
 [0.62082015 0.37917985]
 [0.01303499 0.98696501]]




I'm not sure why I'm getting this warning message about not using valid feature names. I'm going to investigate at a later time. 

For our sample passengers array of Jack, Rose, Will (me), and Kristin (my wife), the model returned [0, 1, 1, 1]. Thus, the model predicted correctly that Jack would die and Rose would live. (But why didn't Jack find another piece of wreckage for his own use?) Furthermore, the model predicted that both my wife and I would survive, probably since we were both traveling First Class!