## Import dependencies

In [10]:
import numpy as np
import pandas as pd
import tensorflow as tf

## Load target data

In [3]:
# Location of the passenger records to score (relative to the notebook).
test_data_file = "test.csv"
# Parse the CSV into a DataFrame; pandas assigns a default integer index.
test_data = pd.read_csv(filepath_or_buffer=test_data_file)
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


## Select columns which will be used as an input for ML model

In [13]:
# Columns used as model features; the remaining columns of test_data
# (PassengerId, Name, Ticket, Cabin) are left out.
feature_columns = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
# Take an independent copy so the edits made in later cells never touch test_data.
input_data = test_data.loc[:, feature_columns].copy()
input_data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0000,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S
...,...,...,...,...,...,...,...
413,3,male,,0,0,8.0500,S
414,1,female,39.0,0,0,108.9000,C
415,3,male,38.5,0,0,7.2500,S
416,3,male,,0,0,8.0500,S


## Count NaN values in each column

In [14]:
# Per-column count of missing entries (isna is the pandas alias of isnull).
input_data.isna().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

## Fill NaN values in the Age column with randomly sampled non-NaN ages

In [15]:
# Impute missing ages by sampling, with replacement, from the observed ages.
# The pool of observed values is built once (instead of once per missing row),
# and a seeded generator makes the imputation -- and therefore the final
# predictions -- reproducible under Restart & Run All.
age_is_missing = input_data["Age"].isnull()
if age_is_missing.any():
    observed_ages = input_data.loc[~age_is_missing, "Age"].to_numpy()
    age_rng = np.random.default_rng(42)  # fixed seed: same fill values on every run
    # One vectorised draw covering every missing row.
    input_data.loc[age_is_missing, "Age"] = age_rng.choice(
        observed_ages, size=int(age_is_missing.sum())
    )

## Fill NaN values in the Fare column with randomly sampled non-NaN fares

In [16]:
# Impute missing fares by sampling, with replacement, from the observed fares.
# The pool of observed values is built once (instead of once per missing row),
# and a seeded generator makes the imputation reproducible under
# Restart & Run All.
fare_is_missing = input_data["Fare"].isnull()
if fare_is_missing.any():
    observed_fares = input_data.loc[~fare_is_missing, "Fare"].to_numpy()
    fare_rng = np.random.default_rng(42)  # fixed seed: same fill values on every run
    # One vectorised draw covering every missing row.
    input_data.loc[fare_is_missing, "Fare"] = fare_rng.choice(
        observed_fares, size=int(fare_is_missing.sum())
    )

## Check if any NaN values are present

In [17]:
# Recount the missing entries per column after imputation.
input_data.isna().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

## Quantify Sex data

In [18]:
# Remove the text column from the frame and keep it just long enough to
# derive the binary flag: 1 where the value is "male", 0 otherwise.
sex_labels = input_data.pop("Sex")
input_data["Is_male"] = sex_labels.eq("male").astype(int)
input_data

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,Is_male
0,3,34.5,0,0,7.8292,Q,1
1,3,47.0,1,0,7.0000,S,0
2,2,62.0,0,0,9.6875,Q,1
3,3,27.0,0,0,8.6625,S,1
4,3,22.0,1,1,12.2875,S,0
...,...,...,...,...,...,...,...
413,3,17.0,0,0,8.0500,S,1
414,1,39.0,0,0,108.9000,C,0
415,3,38.5,0,0,7.2500,S,1
416,3,6.0,0,0,8.0500,S,1


## Quantify Embarked data

In [19]:
# One-hot encode the port of embarkation. The mapping keeps the indicator
# columns in the original S, C, Q order.
embarked_labels = input_data.pop("Embarked")
indicator_columns = {"Embarked_S": "S", "Embarked_C": "C", "Embarked_Q": "Q"}
for indicator_name, port_code in indicator_columns.items():
    # 1 where the passenger embarked at this port, 0 otherwise.
    input_data[indicator_name] = embarked_labels.eq(port_code).astype(int)
input_data

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Is_male,Embarked_S,Embarked_C,Embarked_Q
0,3,34.5,0,0,7.8292,1,0,0,1
1,3,47.0,1,0,7.0000,0,1,0,0
2,2,62.0,0,0,9.6875,1,0,0,1
3,3,27.0,0,0,8.6625,1,1,0,0
4,3,22.0,1,1,12.2875,0,1,0,0
...,...,...,...,...,...,...,...,...,...
413,3,17.0,0,0,8.0500,1,1,0,0
414,1,39.0,0,0,108.9000,0,0,1,0
415,3,38.5,0,0,7.2500,1,1,0,0
416,3,6.0,0,0,8.0500,1,1,0,0


## Normalize Pclass

In [21]:
# Replace the Pclass column by its value divided by 3, which puts the
# classes seen in the table above (1, 2, 3) at 1/3, 2/3 and 1.
passenger_class = input_data.pop("Pclass")
input_data["Pclass_norm"] = passenger_class.div(3)
input_data

Unnamed: 0,Age,SibSp,Parch,Fare,Is_male,Embarked_S,Embarked_C,Embarked_Q,Pclass_norm
0,34.5,0,0,7.8292,1,0,0,1,1.000000
1,47.0,1,0,7.0000,0,1,0,0,1.000000
2,62.0,0,0,9.6875,1,0,0,1,0.666667
3,27.0,0,0,8.6625,1,1,0,0,1.000000
4,22.0,1,1,12.2875,0,1,0,0,1.000000
...,...,...,...,...,...,...,...,...,...
413,17.0,0,0,8.0500,1,1,0,0,1.000000
414,39.0,0,0,108.9000,0,0,1,0,0.333333
415,38.5,0,0,7.2500,1,1,0,0,1.000000
416,6.0,0,0,8.0500,1,1,0,0,1.000000


## Load ML model

In [22]:
# Path of the previously saved Keras model, relative to the notebook.
model_filename = "ML_model_titanic2"
# Restore the saved model from disk.
model = tf.keras.models.load_model(filepath=model_filename)

## Calculate survival rate

In [46]:
# Raw model output for every row of input_data.
# NOTE(review): presumably input_data's column order must match the one used
# at training time -- confirm against the training notebook.
raw_predictions = model.predict(input_data)
# Round to the nearest integer, then cast to NumPy's default integer type.
survival_rate = np.rint(raw_predictions).astype(np.int_)

## Format output data

In [47]:
# Wrap the predictions in a single-column frame named "Survived".
survived_column = pd.DataFrame(survival_rate, columns=["Survived"])
# Put the passenger ids and the predictions side by side; both frames carry
# the default integer index, so rows pair up by position.
data = [test_data[["PassengerId"]], survived_column]
output_data = pd.concat(data, axis="columns")
output_data

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


## Save output to file

In [49]:
# Destination of the results file (relative to the notebook).
output_file = "output.csv"
# Write only the PassengerId and Survived columns; the row index is omitted.
output_data.to_csv(path_or_buf=output_file, index=False)