# Predicting which passengers survived the Titanic shipwreck

## Loading data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing

# Read both Kaggle splits the same way: the first CSV column becomes the index.
train_data, test_data = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("Dataset/train.csv", "Dataset/test.csv")
]

## First look at data
| Variable | Definition | Key |
| --- | --- | --- |
| survival | Survival | 0 = No, 1 = Yes |
| pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd |
| sex | Sex |  |
| Age | Age in years |  |
| sibsp | # of siblings / spouses aboard the Titanic |  |
| parch | # of parents / children aboard the Titanic |  |
| ticket | Ticket number |  |
| fare | Passenger fare |  |
| cabin | Cabin number |  |
| embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton |

In [2]:
# Stack train and test rows into one frame so that every preprocessing step
# below is applied identically to both splits.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the resulting frame is the same.
all_data = pd.concat([train_data, test_data], sort=False)
all_data

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


## Counting empty values in each column

In [3]:
# Collect one (column label, number of missing entries) pair per column.
empty_values = []
for column_name in all_data.columns:
    missing_count = all_data[column_name].isna().sum()
    empty_values.append((column_name, missing_count))

# Show the columns with the most missing entries first.
sorted(empty_values, key=lambda pair: pair[1], reverse=True)

[('Cabin', 1014),
 ('Survived', 418),
 ('Age', 263),
 ('Embarked', 2),
 ('Fare', 1),
 ('Pclass', 0),
 ('Name', 0),
 ('Sex', 0),
 ('SibSp', 0),
 ('Parch', 0),
 ('Ticket', 0)]

## One Hot Encoding for Sex and Embarked

In [4]:
# Build one indicator column per distinct value of Sex and of Embarked.
sex_dummies = pd.get_dummies(all_data["Sex"], drop_first=False)
embarked_dummies = pd.get_dummies(all_data["Embarked"], drop_first=False)

# Append the indicator columns to the combined frame, then remove the two
# categorical source columns they replace.
prep_data = pd.concat(
    [all_data, sex_dummies, embarked_dummies], axis=1
).drop(columns=["Sex", "Embarked"])
prep_data

Unnamed: 0_level_0,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,female,male,C,Q,S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0.0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,,0,1,0,0,1
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,1,0,1,0,0
3,1.0,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,,1,0,0,0,1
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,C123,1,0,0,0,1
5,0.0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,"Spector, Mr. Woolf",,0,0,A.5. 3236,8.0500,,0,1,0,0,1
1306,,1,"Oliva y Ocana, Dona. Fermina",39.0,0,0,PC 17758,108.9000,C105,1,0,1,0,0
1307,,3,"Saether, Mr. Simon Sivertsen",38.5,0,0,SOTON/O.Q. 3101262,7.2500,,0,1,0,0,1
1308,,3,"Ware, Mr. Frederick",,0,0,359309,8.0500,,0,1,0,0,1


## Count words in name column

In [5]:
# Split every name on whitespace and concatenate the per-row token lists into
# one flat list (summing lists concatenates them).
name_tokens = prep_data["Name"].str.split().sum()

# Count how often each token occurs and show the most frequent ones.
pd.Series(name_tokens, name="words").value_counts().head()

Mr.        757
Miss.      260
Mrs.       197
William     85
John        72
Name: words, dtype: int64

## One hot encoding for name

In [6]:
# New column name -> literal text searched for in the Name column.
# The trailing dot keeps "Mr." from matching inside "Mrs.".
title_markers = {"Mr": "Mr.", "Miss": "Miss.", "Mrs": "Mrs."}
for title_column, marker in title_markers.items():
    has_title = prep_data["Name"].str.contains(marker, regex=False)
    prep_data[title_column] = has_title.astype(int)  # True/False -> 1/0
prep_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,female,male,C,Q,S,Mr,Miss,Mrs
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,0.0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,0,1,0,0,1,1,0,0
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,1,0,1,0,0,0,0,1
3,1.0,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,1,0,0,0,1,0,1,0
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,1,0,0,0,1,0,0,1
5,0.0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,0,1,0,0,1,1,0,0


## Count word occurrences in the ticket column

In [7]:
# Split every ticket on whitespace and concatenate the per-row token lists
# into one flat list (summing lists concatenates them).
ticket_tokens = prep_data["Ticket"].str.split().sum()

# Frequency of each ticket token, most common first.
ticket_word_occurency = pd.Series(ticket_tokens, name="words").value_counts()
ticket_word_occurency.head(20)

PC            92
C.A.          46
SOTON/O.Q.    16
2.            15
W./C.         14
STON/O        14
CA.           12
A/5           12
SC/PARIS      11
2343          11
CA            10
A/5.          10
F.C.C.         9
SOTON/OQ       8
C              8
2144           8
1601           8
347082         7
14879          7
3101295        7
Name: words, dtype: int64

## One hot encoding for ticket

In [8]:
# How many of the most frequent ticket tokens get their own indicator column.
number_of_new_columns_for_ticket_groups = 12
most_common_ticket_words = ticket_word_occurency.index[:number_of_new_columns_for_ticket_groups]

# NOTE(review): the tokens were counted as whole whitespace-separated words, but
# the flag below is a plain substring test, so e.g. the "A/5" column is also set
# for tickets containing "A/5." -- confirm this overlap is intended.
for ticket_word in most_common_ticket_words:
    contains_word = prep_data["Ticket"].str.contains(ticket_word, regex=False)
    prep_data[ticket_word] = contains_word.astype(int)  # True/False -> 1/0
prep_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,female,...,SOTON/O.Q.,2.,W./C.,STON/O,CA.,A/5,SC/PARIS,2343,CA,A/5.
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,0,...,0,0,0,0,0,1,0,0,0,0
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,1,...,0,0,0,0,0,0,0,0,0,0
3,1.0,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,1,...,0,1,0,1,0,0,0,0,0,0
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,1,...,0,0,0,0,0,0,0,0,0,0
5,0.0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,0,...,0,0,0,0,0,0,0,0,0,0


## Correlation matrix

In [9]:
# prep_data still contains the string columns Name, Ticket and Cabin at this
# point. Since pandas 2.0 DataFrame.corr() no longer drops non-numeric columns
# silently and raises instead, so restrict the frame explicitly. Boolean columns
# are kept because newer pandas emits bool indicator columns from get_dummies.
numeric_features = prep_data.select_dtypes(include=["number", "bool"])
corr = numeric_features.corr()

# Render the matrix as a heat map: blue = negative, red = positive correlation.
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S,Mr,Miss,Mrs,PC,C.A.,SOTON/O.Q.,2.,W./C.,STON/O,CA.,A/5,SC/PARIS,2343,CA,A/5.
Survived,1.0,-0.338481,-0.0772211,-0.0353225,0.0816294,0.257307,0.543351,-0.543351,0.16824,0.00365038,-0.15566,-0.549199,0.327093,0.33904,0.147062,0.0297964,-0.0506608,0.0178947,-0.0566492,0.0178947,-0.0506608,-0.0763381,0.0024965,-0.0751262,-0.0811647,-0.0440945
Pclass,-0.338481,1.0,-0.408106,0.060832,0.0183222,-0.558629,-0.124617,0.124617,-0.269658,0.230491,0.0963345,0.121492,0.0304781,-0.178808,-0.425095,0.00559176,0.0936547,0.110076,0.0520677,0.110076,0.0809822,0.110076,-0.0324126,0.0714115,0.0958838,0.0738694
Age,-0.0772211,-0.408106,1.0,-0.243699,-0.150917,0.17874,-0.0636449,0.0636449,0.0857773,-0.019458,-0.0759716,0.183965,-0.282033,0.217502,0.199506,-0.0522914,-0.0326051,-0.0224748,-0.0243789,-0.0209889,-0.0338176,0.00389827,-0.022889,0.00415875,-0.0723305,-0.00282388
SibSp,-0.0353225,0.060832,-0.243699,1.0,0.373587,0.160238,0.109609,-0.109609,-0.048396,-0.0486777,0.0751977,-0.243104,0.0796221,0.0650979,-0.0370091,-0.0135892,-0.0399396,-0.0341007,0.0357769,-0.0341007,0.523572,-0.0226858,-0.0119546,0.523572,0.548056,0.00852284
Parch,0.0816294,0.0183222,-0.150917,0.373587,1.0,0.221539,0.213125,-0.213125,-0.00863513,-0.100943,0.0732584,-0.30478,0.0683683,0.217673,-0.0187295,0.0659814,-0.01736,-0.0581811,0.125403,-0.0581811,0.290708,-0.0307067,-0.0409655,0.290708,0.353935,0.00151838
Fare,0.257307,-0.558629,0.17874,0.160238,0.221539,1.0,0.185523,-0.185523,0.286269,-0.130059,-0.172683,-0.192246,0.0903913,0.140518,0.494636,-0.054447,-0.053308,-0.0625762,-0.0216708,-0.0625762,0.0578178,-0.0588522,-0.0296937,0.0586625,0.0525475,-0.0346794
female,0.543351,-0.124617,-0.0636449,0.109609,0.213125,0.185523,1.0,-1.0,0.0665636,0.0886512,-0.119504,-0.870678,0.669607,0.566111,0.08269,-0.00627596,-0.0391458,-0.0103257,0.0467812,-0.0227375,0.0289285,-0.0723844,-0.0160095,0.0121878,0.0269095,-0.0285854
male,-0.543351,0.124617,0.0636449,-0.109609,-0.213125,-0.185523,-1.0,1.0,-0.0665636,-0.0886512,0.119504,0.870678,-0.669607,-0.566111,-0.08269,0.00627596,0.0391458,0.0103257,-0.0467812,0.0227375,-0.0289285,0.0723844,0.0160095,-0.0121878,-0.0269095,0.0285854
C,0.16824,-0.269658,0.0857773,-0.048396,-0.00863513,0.286269,0.0665636,-0.0665636,1.0,-0.164166,-0.775441,-0.0655385,-0.0219047,0.0969789,0.391634,-0.0882292,-0.0567068,-0.0666494,-0.0530033,-0.0666494,-0.0490337,-0.0666494,0.159903,-0.0490337,-0.0666494,-0.044727
Q,0.00365038,0.230491,-0.019458,-0.0486777,-0.100943,-0.130059,0.0886512,-0.0886512,-0.164166,1.0,-0.489874,-0.0802244,0.20061,-0.106257,-0.0885439,-0.0621483,-0.0358237,-0.0421049,-0.0334842,-0.0421049,-0.0309764,-0.0217371,-0.0296462,-0.0309764,-0.0421049,0.0018148


## Dropping unwanted features

In [10]:
# The raw string columns are no longer needed.
string_columns = ["Name", "Ticket", "Cabin"]
prep_data = prep_data.drop(columns=string_columns)
# Age is kept for now; the disabled alternative below would drop it because it
# has too many empty samples:
# prep_data = prep_data.drop(["Age"], axis=1)

## Fill NaN with the mean value for the Fare column

In [11]:
# Impute missing fares with the mean fare over the combined frame.
fare_mean = prep_data["Fare"].mean()
prep_data["Fare"] = prep_data["Fare"].fillna(fare_mean)

In [12]:
# Manual min-max scaling of Age on a copy of the data, so that prep_data itself
# is left untouched.
ble = prep_data.copy()

# Use dedicated names for the bounds: assigning to `min` / `max` would shadow
# the Python builtins for every cell executed afterwards in this kernel.
age_min = ble.Age.min()
age_max = ble.Age.max()
ble.Age = (ble.Age - age_min) / (age_max - age_min)

# Sanity check: the scaled column should span exactly [0, 1].
print(ble.Age.max())
print(ble.Age.min())
ble

1.0
0.0


Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,...,SOTON/O.Q.,2.,W./C.,STON/O,CA.,A/5,SC/PARIS,2343,CA,A/5.
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,3,0.273456,1,0,7.2500,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1.0,1,0.473882,1,0,71.2833,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,3,0.323563,0,0,7.9250,1,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4,1.0,1,0.436302,1,0,53.1000,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0.0,3,0.436302,0,0,8.0500,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,,0,0,8.0500,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1306,,1,0.486409,0,0,108.9000,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1307,,3,0.480145,0,0,7.2500,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
1308,,3,,0,0,8.0500,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


## Separate test and train data

In [13]:
# A row counts as "complete" when both its label (Survived) and its Age are present.
is_complete = prep_data[["Survived", "Age"]].notna().all(axis=1)

# train_data: complete rows only; test_data: every remaining row.
# NOTE(review): rows that have a Survived label but no Age therefore end up in
# test_data -- confirm this is the intended split.
train_data = prep_data[is_complete]
test_data = prep_data[~is_complete]

## Normalize data

In [14]:
# Learn the per-column minimum and maximum from the training rows only.
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(train_data.values)


def _scaled_like(frame):
    """Apply the fitted scaler to `frame`, keeping its column labels and index."""
    return pd.DataFrame(min_max_scaler.transform(frame.values),
                        columns=frame.columns,
                        index=frame.index)


normalized_train_data = _scaled_like(train_data)
normalized_test_data = _scaled_like(test_data)

# Stack the two scaled parts back together: training rows first, then the rest.
normalized_data = pd.concat([normalized_train_data, normalized_test_data])
normalized_data

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,...,SOTON/O.Q.,2.,W./C.,STON/O,CA.,A/5,SC/PARIS,2343,CA,A/5.
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,1.0,0.271174,0.2,0.000000,0.014151,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.472229,0.2,0.000000,0.139136,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.321438,0.0,0.000000,0.015469,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.434531,0.2,0.000000,0.103644,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,1.0,0.434531,0.0,0.000000,0.015713,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,1.0,,0.0,0.000000,0.015713,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1306,,0.0,0.484795,0.0,0.000000,0.212559,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1307,,1.0,0.478512,0.0,0.000000,0.014151,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1308,,1.0,,0.0,0.000000,0.015713,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Save normalized data

In [15]:
# Write the scaled frame to disk; the index is written as the first CSV column.
normalized_csv_path = "normalized_data.csv"
normalized_data.to_csv(normalized_csv_path, index=True)