# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.models import Sequential

# Load & describe data

Informations:  
survival: 0 = No, 1 = Yes  
pclass(Ticket class): 1 = 1st, 2 = 2nd, 3 = 3rd  
sex: male, female  
Age: in years  
sibsp # of siblings / spouses aboard the Titanic  
parch # of parents / children aboard the Titanic  
ticket: ticket number  
fare: Passenger fare  
cabin: Cabin number  
embarked Port of Embarkation: C = Cherbourg, Q = Queenstown, S = Southampton  

Loading datasets

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Describing both train and test sets

In [4]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
test_df.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


a) We want to predict whether someone survived or not

In [6]:
print(train_df['Survived'][:5])

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


b) We want to know the datatypes

In [7]:
print('Train data')
print(train_df.dtypes)
print('\nTest data')
print(test_df.dtypes)

Train data
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

Test data
PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


Categorical - Survived(numeric), Sex(object), pclass(numeric), embarked(object)  
Numeric - PassengerID, Age, Fare, SibSp, Parch  
Others - Ticket, Name, Cabin

c) Checking missing values

In [8]:
print('Train data')
print(train_df.isnull().sum())
print('\nTest data')
print(test_df.isnull().sum())

Train data
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Test data
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In my opinion we should delete cabin column because there are so many missing values. Column Age can be filled with means.

d) Dataset doesn't seem to have any outliers within important features

e) For me columns like: PassengerId, Name, Ticket, Cabin are not giving many information so i will skip them in first try

In [9]:
train_df.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


# Prepare data

In [10]:
from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.impute import SimpleImputer

Creating imputer and encoder objects

In [11]:
imputer = SimpleImputer(strategy='mean')
encoder = LabelEncoder()

Filling missing values with mean of each column

In [12]:
train_df['Age'] = imputer.fit_transform(train_df[['Age']])
test_df['Age'] = imputer.fit_transform(test_df[['Age']])

Dropping Cabin column (too many missing values), and rows with missing values (2 rows)

In [13]:
train_df.drop('Cabin', axis=1, inplace=True)
test_df.drop('Cabin', axis=1, inplace=True)
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

Checking if everyting is correct

In [14]:
print('Train data')
print(train_df.isnull().sum())
print('\nTest data')
print(test_df.isnull().sum())

Train data
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

Test data
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


Imputing and encoding 

In [15]:
train_df['Sex'] = encoder.fit_transform(train_df['Sex'])
train_df['Embarked'] = encoder.fit_transform(train_df['Embarked'])
test_df['Sex'] = encoder.fit_transform(test_df['Sex'])
test_df['Embarked'] = encoder.fit_transform(test_df['Embarked'])

In [16]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,2


In [17]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,2
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,1
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,2
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,2


Preparing X and y

In [18]:
X_train = train_df.drop(['Survived', 'PassengerId', 'Name', 'Ticket'], axis=1)
y_train = train_df['Survived']
X_test = test_df.drop(['PassengerId', 'Name', 'Ticket'], axis=1)

In [19]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,2
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.925,2
3,1,0,35.0,1,0,53.1,2
4,3,1,35.0,0,0,8.05,2


In [20]:
X_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,34.5,0,0,7.8292,1
1,3,0,47.0,1,0,7.0,2
2,2,1,62.0,0,0,9.6875,1
3,3,1,27.0,0,0,8.6625,2
4,3,0,22.0,1,1,12.2875,2


In [21]:
from sklearn.preprocessing import StandardScaler

Creating scaler object to normalize data

In [22]:
scaler = StandardScaler()

In [23]:
X_train.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')

Normalizing data

In [24]:
for col in X_train.columns:
    X_train[col] = scaler.fit_transform(X_train[[col]])
    X_test[col] = scaler.fit_transform(X_test[[col]]) 

In [25]:
X_train.shape

(889, 7)

In [26]:
X_test.shape

(417, 7)

In [27]:
X_test.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,417.0,417.0,417.0,417.0,417.0,417.0,417.0
mean,-1.064962e-17,-6.549517000000001e-17,6.519149000000001e-17,-4.2598490000000005e-17,-6.789134000000001e-17,-5.2582510000000006e-17,2.007454e-16
std,1.001201,1.001201,1.001201,1.001201,1.001201,1.001201,1.001201
min,-1.502602,-1.320387,-2.393349,-0.5002182,-0.4008043,-0.638017,-1.639919
25%,-1.502602,-1.320387,-0.5738362,-0.5002182,-0.4008043,-0.4966178,-0.4689494
50%,0.8753298,0.7573539,0.005777156,-0.5002182,-0.4008043,-0.379169,0.7020201
75%,0.8753298,0.7573539,0.3825438,0.6152416,-0.4008043,-0.07391031,0.7020201
max,0.8753298,0.7573539,3.650176,8.42346,8.77126,8.536851,0.7020201


# Create Neural Network

Building Neural Network model

In [28]:
model = Sequential()
model.add(Flatten(dtype='float64'))
model.add(Dense(128, activation='relu', dtype='float64'))
model.add(Dense(128, activation='relu', dtype='float64'))
model.add(Dense(64, activation='relu', dtype='float64'))
model.add(Dense(2, activation='softmax', dtype='float64'))

Fitting and compiling model. I've tried with different activation functions, loss function and number of neurons in each layer

In [29]:
model.compile(optimizer='adam', loss='SparseCategoricalCrossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=12, validation_split = 0.15)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<tensorflow.python.keras.callbacks.History at 0x2038ce57dc0>

Making predicitons for test set

In [30]:
preds = model.predict(X_test)

I've used softmax activation to get probabilities of belonging to each class

In [31]:
preds[0]

array([0.91693471, 0.08306529])

Predicting for all test set data

In [32]:
results = np.array([], dtype='int8')
for pred in preds:
    results = np.append(results, np.argmax(pred))
print(results)

[0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 0 0 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 1 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1 0
 1 0 0 0 0 0 0 0 1 0 1 1 0 0 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0 1
 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1 0
 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0 1
 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 1
 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 1 1 0 0
 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0
 1 1 1 1 0 0 1 0 0 0]


# Show results

Adding new column to test dataset to see who survived

In [33]:
test_df['Survived'] = results
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survived
0,892,3,"Kelly, Mr. James",1,34.50000,0,0,330911,7.8292,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.00000,1,0,363272,7.0000,2,0
2,894,2,"Myles, Mr. Thomas Francis",1,62.00000,0,0,240276,9.6875,1,0
3,895,3,"Wirz, Mr. Albert",1,27.00000,0,0,315154,8.6625,2,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.00000,1,1,3101298,12.2875,2,0
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",1,30.27259,0,0,A.5. 3236,8.0500,2,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",0,39.00000,0,0,PC 17758,108.9000,0,1
415,1307,3,"Saether, Mr. Simon Sivertsen",1,38.50000,0,0,SOTON/O.Q. 3101262,7.2500,2,0
416,1308,3,"Ware, Mr. Frederick",1,30.27259,0,0,359309,8.0500,2,0


Show people who survived

In [34]:
stats_df = test_df[test_df['Survived'] == 1].reset_index(drop=True)
stats_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survived
0,898,3,"Connolly, Miss. Kate",0,30.00000,0,0,330972,7.6292,1,1
1,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",0,18.00000,0,0,2657,7.2292,0,1
2,904,1,"Snyder, Mrs. John Pillsbury (Nelle Stevenson)",0,23.00000,1,0,21228,82.2667,2,1
3,906,1,"Chaffee, Mrs. Herbert Fuller (Carrie Constance...",0,47.00000,1,0,W.E.P. 5734,61.1750,2,1
4,907,2,"del Carlo, Mrs. Sebastiano (Argenia Genovesi)",0,24.00000,1,0,SC/PARIS 2167,27.7208,0,1
...,...,...,...,...,...,...,...,...,...,...,...
125,1300,3,"Riordan, Miss. Johanna Hannah""""",0,30.27259,0,0,334915,7.7208,1,1
126,1301,3,"Peacock, Miss. Treasteall",0,3.00000,1,1,SOTON/O.Q. 3101315,13.7750,2,1
127,1302,3,"Naughton, Miss. Hannah",0,30.27259,0,0,365237,7.7500,1,1
128,1303,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",0,37.00000,1,0,19928,90.0000,1,1


Some stacistics

Coutning the percentages of alive and dead people

In [35]:
percentageAlive = test_df[test_df['Survived'] == 1]['Survived'].count() / test_df['Survived'].count()
percentageDead = test_df[test_df['Survived'] == 0]['Survived'].count() / test_df['Survived'].count()
print(f'Alive = {percentageAlive.round(2)}%')
print(f'Dead = {percentageDead.round(2)}%')

Alive = 0.31%
Dead = 0.69%


Sex percentage 

In [36]:
percentageMale = stats_df[stats_df['Sex'] == 1]['Sex'].count() / stats_df['Sex'].count()
percentageFemale = stats_df[stats_df['Sex'] == 0]['Sex'].count() / stats_df['Sex'].count()
print(f'Female = {percentageFemale.round(2)}%')
print(f'Male = {percentageMale.round(2)}%')

Female = 0.92%
Male = 0.08%


Class percentage

In [37]:
percentageClass1 = stats_df[stats_df['Pclass'] == 1]['Pclass'].count() / stats_df['Pclass'].count()
percentageClass2 = stats_df[stats_df['Pclass'] == 2]['Pclass'].count() / stats_df['Pclass'].count()
percentageClass3 = stats_df[stats_df['Pclass'] == 3]['Pclass'].count() / stats_df['Pclass'].count()

print(f'Class 1 = {percentageClass1.round(2)}%')
print(f'Class 2 = {percentageClass2.round(2)}%')
print(f'Class 3 = {percentageClass3.round(2)}%')

Class 1 = 0.4%
Class 2 = 0.25%
Class 3 = 0.35%
