# Titanic Casualty Prediction

### Imports

In [1]:
import pandas as pd
from tensorflow import keras

### Data Preparation and Cleaning

In [2]:
# Load data
data = pd.read_csv("data\\train.csv")
test = pd.read_csv("data\\test.csv")

test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [3]:
def clean(data):

    # Drop the cabin column since it has lots of missing data
    data = data.drop(['Cabin'], axis=1)

    # Fill in missing data from other columns with median values
    cols = ['SibSp', 'Parch', 'Fare', 'Age']
    for col in cols:
        data[col].fillna(data[col].median(), inplace=True)

    # Fill in missing values using mode imputation
    data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode().to_string())
    data['Fare'] = data['Fare'].fillna(data['Fare'].mode().to_string())

    data.loc[(data['Embarked'] != 'S')&(data['Embarked'] != 'C')&
             (data['Embarked'] != 'Q'), 'Embarked'] = data['Embarked'].mode()[0]
    data['Embarked'] = data['Embarked'].map({'S':1, 'C':2, 'Q':3}).astype(int)

    data['Sex'] = data['Sex'].map({'female':0, 'male':1})

    data['Title'] = data['Name'].str.extract('([A-Za-z]+)\.', expand=False)

    data['Title'] = data['Title'].replace(['Rev', 'Dr', 'Col', 'Mlle', 'Major', 'Ms', 'Lady', 'Sir', 
                                           'Mme', 'Don', 'Capt', 'Countess', 'Jonkheer', 'Dona'], 'Others')
    data['Title'] = data['Title'].map({'Mr':1, 'Miss':2, 'Mrs':3, 'Master':4, 'Others':5}) 

    data.loc[(data['Age'] > 0)&(data['Age'] <= 1), 'AgePeriod'] = 1
    data.loc[(data['Age'] > 1)&(data['Age'] <= 13), 'AgePeriod'] = 2
    data.loc[(data['Age'] > 13)&(data['Age'] <= 25), 'AgePeriod'] = 3
    data.loc[(data['Age'] > 25)&(data['Age'] <= 61), 'AgePeriod'] = 4
    data.loc[(data['Age'] > 61)&(data['Age'] <= 73), 'AgePeriod'] = 5
    data.loc[(data['Age'] > 73)&(data['Age'] <= 85), 'AgePeriod'] = 6
    data.loc[(data['Age'] > 85), 'AgePeriod'] = 7
    data['AgePeriod'] = data['AgePeriod'].astype(int)

    return data

    
data = clean(data)
#test = clean(test)

# Check for null values 
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     891 non-null    int32  
 11  Title        891 non-null    int64  
 12  AgePeriod    891 non-null    int32  
dtypes: float64(2), int32(2), int64(7), object(2)
memory usage: 83.7+ KB


In [4]:
dropped_column = ['Age', 'Fare', 'Name', 'Parch', 'SibSp', 'Ticket']
for i in dropped_column:
    data = data.drop([i], axis=1)

### Model Experimentation

In [5]:

X = data.drop(['Survived', 'PassengerId'], axis=1)
y = data['Survived']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)


In [6]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
predictions = dtc.predict(test)
acc_score = dtc.score(X_train, y_train) * 100
print(acc_score)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Age
- Cabin
- Fare
- Name
- Parch
- ...
Feature names seen at fit time, yet now missing:
- AgePeriod
- Title
