# Predicting which passengers survived the Titanic shipwreck

## Loading data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing

# Read both splits the same way; the first CSV column (PassengerId) becomes the index.
train_data, test_data = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("Dataset/train.csv", "Dataset/test.csv")
)

## First look at data
| Variable | Definition | Key |
| --- | --- | --- |
| survival | Survival | 0 = No, 1 = Yes |
| pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd |
| sex | Sex |  |
| Age | Age in years |  |
| sibsp | # of siblings / spouses aboard the Titanic |  |
| parch | # of parents / children aboard the Titanic |  |
| ticket | Ticket number |  |
| fare | Passenger fare |  |
| cabin | Cabin number |  |
| embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton |

In [2]:
# Stack train and test rows into one frame so both splits go through the same
# preprocessing. `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` is the supported equivalent and yields the same row order and index.
# Test rows carry no `Survived` value, so that column is NaN for them.
all_data = pd.concat([train_data, test_data], sort=False)
all_data

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


## Counting empty values in each column

In [3]:
# Build (column, number of missing cells) pairs, then show them with the most
# incomplete columns first; ties keep their original column order.
empty_values = []
for column in all_data.columns.values:
    empty_values.append((column, all_data[column].isna().sum()))
sorted(empty_values, key=lambda pair: pair[1], reverse=True)

[('Cabin', 1014),
 ('Survived', 418),
 ('Age', 263),
 ('Embarked', 2),
 ('Fare', 1),
 ('Pclass', 0),
 ('Name', 0),
 ('Sex', 0),
 ('SibSp', 0),
 ('Parch', 0),
 ('Ticket', 0)]

## One Hot Encoding for Sex and Embarked

In [4]:
# One-hot encode the two categorical columns and swap them in for the originals.
# `dtype=int` pins the indicator columns to 0/1 integers: pandas >= 2.0 would
# otherwise emit booleans (True/False), which no longer matches the table below.
# With the default `dummy_na=False`, rows whose Embarked is missing get 0 in all
# of C, Q and S.
prep_data = all_data
sex_dummies = pd.get_dummies(prep_data.Sex, drop_first=False, dtype=int)
embarked_dummies = pd.get_dummies(prep_data.Embarked, drop_first=False, dtype=int)
# Append both indicator groups in one concat, then remove the encoded source columns.
prep_data = pd.concat([prep_data, sex_dummies, embarked_dummies], axis=1)
prep_data = prep_data.drop(["Sex", "Embarked"], axis=1)
prep_data

Unnamed: 0_level_0,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,female,male,C,Q,S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0.0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,,0,1,0,0,1
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,1,0,1,0,0
3,1.0,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,,1,0,0,0,1
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,C123,1,0,0,0,1
5,0.0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,"Spector, Mr. Woolf",,0,0,A.5. 3236,8.0500,,0,1,0,0,1
1306,,1,"Oliva y Ocana, Dona. Fermina",39.0,0,0,PC 17758,108.9000,C105,1,0,1,0,0
1307,,3,"Saether, Mr. Simon Sivertsen",38.5,0,0,SOTON/O.Q. 3101262,7.2500,,0,1,0,0,1
1308,,3,"Ware, Mr. Frederick",,0,0,359309,8.0500,,0,1,0,0,1


## Correlation matrix

In [5]:
# Pairwise correlation between the numeric features.
# At this point the frame still contains the text columns Name, Ticket and Cabin.
# Older pandas dropped such columns silently inside `DataFrame.corr`; pandas >= 2.0
# raises on them instead, so the numeric (and boolean) columns are selected
# explicitly, which behaves the same on both old and new versions.
numeric_features = prep_data.select_dtypes(include=["number", "bool"])
corr = numeric_features.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
Survived,1.0,-0.338481,-0.0772211,-0.0353225,0.0816294,0.257307,0.543351,-0.543351,0.16824,0.00365038,-0.15566
Pclass,-0.338481,1.0,-0.408106,0.060832,0.0183222,-0.558629,-0.124617,0.124617,-0.269658,0.230491,0.0963345
Age,-0.0772211,-0.408106,1.0,-0.243699,-0.150917,0.17874,-0.0636449,0.0636449,0.0857773,-0.019458,-0.0759716
SibSp,-0.0353225,0.060832,-0.243699,1.0,0.373587,0.160238,0.109609,-0.109609,-0.048396,-0.0486777,0.0751977
Parch,0.0816294,0.0183222,-0.150917,0.373587,1.0,0.221539,0.213125,-0.213125,-0.00863513,-0.100943,0.0732584
Fare,0.257307,-0.558629,0.17874,0.160238,0.221539,1.0,0.185523,-0.185523,0.286269,-0.130059,-0.172683
female,0.543351,-0.124617,-0.0636449,0.109609,0.213125,0.185523,1.0,-1.0,0.0665636,0.0886512,-0.119504
male,-0.543351,0.124617,0.0636449,-0.109609,-0.213125,-0.185523,-1.0,1.0,-0.0665636,-0.0886512,0.119504
C,0.16824,-0.269658,0.0857773,-0.048396,-0.00863513,0.286269,0.0665636,-0.0665636,1.0,-0.164166,-0.775441
Q,0.00365038,0.230491,-0.019458,-0.0486777,-0.100943,-0.130059,0.0886512,-0.0886512,-0.164166,1.0,-0.489874


## Dropping unwanted features

In [6]:
# Remove the free-text columns (Name, Ticket, Cabin).
# Dropping Age as well was considered because it has many missing values,
# but the column is kept here.
prep_data = prep_data.drop(columns=["Name", "Ticket", "Cabin"])

## Fill NaN in the Fare column with the mean fare

In [7]:
# Impute missing fares with the mean fare; the mean is taken over the combined
# train + test frame (NaN entries are skipped by `mean`).
fare_mean = prep_data["Fare"].mean()
prep_data["Fare"] = prep_data["Fare"].fillna(value=fare_mean)

## Normalize data

In [8]:
# Min-max scale every column of the preprocessed frame and rebuild a DataFrame
# with the original column labels and PassengerId index.
# NOTE(review): the scaler is fitted on train and test rows together, and the
# Survived label column is passed through it as well — confirm this is intended.
df = prep_data
min_max_scaler = preprocessing.MinMaxScaler()
scaled_values = min_max_scaler.fit_transform(df.values)
normalized_data = pd.DataFrame(scaled_values, index=df.index, columns=df.columns)
normalized_data

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,1.0,0.273456,0.125,0.000000,0.014151,0.0,1.0,0.0,0.0,1.0
2,1.0,0.0,0.473882,0.125,0.000000,0.139136,1.0,0.0,1.0,0.0,0.0
3,1.0,1.0,0.323563,0.000,0.000000,0.015469,1.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.436302,0.125,0.000000,0.103644,1.0,0.0,0.0,0.0,1.0
5,0.0,1.0,0.436302,0.000,0.000000,0.015713,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
1305,,1.0,,0.000,0.000000,0.015713,0.0,1.0,0.0,0.0,1.0
1306,,0.0,0.486409,0.000,0.000000,0.212559,1.0,0.0,1.0,0.0,0.0
1307,,1.0,0.480145,0.000,0.000000,0.014151,0.0,1.0,0.0,0.0,1.0
1308,,1.0,,0.000,0.000000,0.015713,0.0,1.0,0.0,0.0,1.0


## Save normalized data

In [9]:
# Write the scaled frame to disk; the index is included as the first CSV column.
normalized_data.to_csv(path_or_buf="normalized_data.csv", index=True)