# Titanic Casualty Prediction

### Imports

In [60]:
import pandas as pd
from tensorflow import keras

### Data Preparation and Cleaning

In [61]:
# Load data
data = pd.read_csv("data\\train.csv")
test = pd.read_csv("data\\test.csv")

In [62]:
def clean(data):

    # Drop the cabin column since it has lots of missing data
    data = data.drop(['Cabin'], axis=1)

    # Fill in missing data from other columns with median values
    cols = ['SibSp', 'Parch', 'Fare', 'Age']
    for col in cols:
        data[col].fillna(data[col].median(), inplace=True)

    # Fill in missing values using mode imputation
    data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode().to_string())
    data['Fare'] = data['Fare'].fillna(data['Fare'].mode().to_string())

    data.loc[(data['Embarked'] != 'S')&(data['Embarked'] != 'C')&(data['Embarked'] != 'Q'), 'Embarked'] = data['Embarked'].mode()[0]
    data['Embarked'] = data['Embarked'].map({'S':1, 'C':2, 'Q':3}).astype(int)

    data['Sex'] = data['Sex'].map({'female':0, 'male':1})

    data['Title'] = data['Name'].str.extract('([A-Za-z]+)\.', expand=False)

    data['Title'] = data['Title'].replace(['Rev', 'Dr', 'Col', 'Mlle', 'Major', 'Ms', 'Lady', 'Sir', 'Mme', 'Don', 'Capt', 'Countess', 'Jonkheer', 'Dona'], 'Others')
    data['Title'] = data['Title'].map({'Mr':1, 'Miss':2, 'Mrs':3, 'Master':4, 'Others':5}) 

    data.loc[(data['Age'] > 0)&(data['Age'] <= 1), 'AgePeriod'] = 1
    data.loc[(data['Age'] > 1)&(data['Age'] <= 13), 'AgePeriod'] = 2
    data.loc[(data['Age'] > 13)&(data['Age'] <= 25), 'AgePeriod'] = 3
    data.loc[(data['Age'] > 25)&(data['Age'] <= 61), 'AgePeriod'] = 4
    data.loc[(data['Age'] > 61)&(data['Age'] <= 73), 'AgePeriod'] = 5
    data.loc[(data['Age'] > 73)&(data['Age'] <= 85), 'AgePeriod'] = 6
    data.loc[(data['Age'] > 85), 'AgePeriod'] = 7
    data['AgePeriod'] = data['AgePeriod'].astype(int)

    return data

data = clean(data)
test = clean(test)

# Check for null values 
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     891 non-null    int32  
 11  Title        891 non-null    int64  
 12  AgePeriod    891 non-null    int32  
dtypes: float64(2), int32(2), int64(7), object(2)
memory usage: 83.7+ KB


### Model Experimentation

In [65]:
# Simple DNN

model = keras.Sequential()
model.add(Dense(12, input_shape=(8,), activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))