# Titanic - Random Forest

## Preparation

All of the data preparation steps were already worked out for the decision tree model, so the same preprocessing and feature engineering are simply reapplied here.

In [1]:
import warnings
warnings.filterwarnings("ignore")
from os.path import join
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# %matplotlib inline

from sklearn import tree
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Project-relative folders: raw inputs, generated results, and saved artifacts.
input_dir, output_dir, log_dir = (
    join('..', 'data', 'raw'),
    join('..', 'data', 'result'),
    join('..', 'log'),
)

# Read the train and test CSV files from the raw-data folder.
train, test = (pd.read_csv(join(input_dir, fname)) for fname in ('train.csv', 'test.csv'))

# Show the first rows of the training frame as the cell output.
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [3]:
# Work on copies so the raw `train` / `test` frames stay untouched
# (`test['PassengerId']` is still needed when the submission is built).
df_train = train.copy()
df_test = test.copy()

# ===== Preprocessing =====
df_train = df_train.drop(['Ticket'], axis=1)
df_test = df_test.drop(['Ticket'], axis=1)

# Handle missing value for Age
# Passengers titled "Master" get their own median age; everyone else gets the
# overall median. The dot is escaped so only the literal "Master." matches, and
# na=False keeps the mask strictly boolean even if a name were missing.
master_pattern = r',\s*Master\.'
master_train = df_train['Name'].str.contains(master_pattern, regex=True, na=False)
master_test = df_test['Name'].str.contains(master_pattern, regex=True, na=False)

# All medians are computed on the raw ages, before any value is filled in.
# NOTE(review): test ages are imputed from test-set medians while Fare (below)
# uses the training median -- confirm whether this asymmetry is intended.
median_age_train = df_train['Age'].median()
median_age_master_train = df_train.loc[master_train, 'Age'].median()
median_age_test = df_test['Age'].median()
median_age_master_test = df_test.loc[master_test, 'Age'].median()

# Fill only the Age column of the "Master" rows instead of re-assigning whole rows.
df_train.loc[master_train, 'Age'] = (
    df_train.loc[master_train, 'Age'].fillna(median_age_master_train)
)
df_test.loc[master_test, 'Age'] = (
    df_test.loc[master_test, 'Age'].fillna(median_age_master_test)
)

# Every remaining gap falls back to the overall median of its own split.
df_train['Age'] = df_train['Age'].fillna(median_age_train)
df_test['Age'] = df_test['Age'].fillna(median_age_test)

# Handle missing value for Cabin, Embarked
df_train = df_train.drop(['Cabin'], axis=1)
df_test = df_test.drop(['Cabin'], axis=1)

# Missing ports are filled with "S" (presumably the most frequent port -- confirm).
df_train['Embarked'] = df_train['Embarked'].fillna("S")
df_test['Embarked'] = df_test['Embarked'].fillna("S")

# Missing test fares are filled with the *median* training fare.
median_fare_train = df_train['Fare'].median()
df_test['Fare'] = df_test['Fare'].fillna(median_fare_train)


# Feature encoding: turn the two categorical columns into integer codes,
# using the same lookup tables for the training and the test frame.
sex_codes = {'female': 0, 'male': 1}
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}

for frame in (df_train, df_test):
    frame['Sex'] = frame['Sex'].map(sex_codes).astype(int)
    frame['Embarked'] = frame['Embarked'].map(embarked_codes).astype(int)

# ===== Feature Engineering =====

def _engineer_features(frame):
    """Return a new frame with the derived features replacing their source columns.

    - FamilySize      = SibSp + Parch + 1 (the passenger counts as one member)
    - AgeAndPclass    = Age * Pclass
    - FareTransformed = log10(Fare + 1)

    The source columns (SibSp, Parch, Age, Pclass, Fare) and PassengerId are
    dropped; each new column is appended at the end, in the order listed above.
    """
    return (
        frame
        .assign(FamilySize=lambda d: d["SibSp"] + d["Parch"] + 1)
        .drop(['Parch', 'SibSp'], axis=1)
        .assign(AgeAndPclass=lambda d: d["Age"] * d["Pclass"])
        .drop(['Age', 'Pclass', 'PassengerId'], axis=1)
        .assign(FareTransformed=lambda d: np.log10(d["Fare"] + 1))
        .drop(["Fare"], axis=1)
    )


df_train = _engineer_features(df_train)
df_test = _engineer_features(df_test)

# Title feature
full_data = [df_train, df_test]
def get_title(name):
    """Extract the title (e.g. "Mr", "Miss") from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, as in "Braund, Mr. Owen Harris".

    Parameters
    ----------
    name : str
        Full passenger name. Non-string values (such as NaN) are tolerated.

    Returns
    -------
    str
        The title without the trailing period, or "" when `name` is not a
        string or contains no such token.
    """
    if not isinstance(name, str):
        return ""
    # Raw string: "\." must reach the regex engine as an escaped literal dot.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

# Uncommon titles are collapsed into a single "Rare" group, and spelling
# variants are folded into their common form before the integer encoding.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
# Mapping titles
title_mapping = {"Mr": 1, "Master": 2, "Mrs": 3, "Miss": 4, "Rare": 5}

for dataset in full_data:
    raw_titles = dataset['Name'].apply(get_title)
    grouped_titles = raw_titles.replace(rare_titles, 'Rare').replace(title_aliases)
    # Titles missing from the mapping become NaN and are encoded as 0.
    dataset['Title'] = grouped_titles.map(title_mapping).fillna(0)

# The raw name is no longer needed once the title has been extracted.
df_train = df_train.drop(['Name'], axis=1)
df_test = df_test.drop(['Name'], axis=1)

df_train.head(3)

Unnamed: 0,Survived,Sex,Embarked,FamilySize,AgeAndPclass,FareTransformed,Title
0,0,1,0,2,66.0,0.916454,1
1,1,0,1,2,38.0,1.859038,3
2,1,0,0,1,78.0,0.950608,4


## Modelling

In [4]:
# Compare forest depths with 10-fold cross-validation: for every max_depth from
# 1 up to the number of feature columns, record the mean validation accuracy.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
max_attributes = len(df_test.columns)
depth_range = range(1, max_attributes + 1)

# Split features and target once instead of on every fold.
cv_features = df_train.drop(['Survived'], axis=1)
cv_target = df_train["Survived"]

accuracies = []
for depth in depth_range:
    fold_accuracy = []
    forest = RandomForestClassifier(max_depth=depth, n_estimators=100, random_state=42)
    for train_fold, valid_fold in kf.split(df_train):
        # KFold.split yields *positional* row indices, so rows must be selected
        # with .iloc; label-based .loc is only correct on a default RangeIndex.
        forest.fit(X=cv_features.iloc[train_fold], y=cv_target.iloc[train_fold])
        valid_acc = forest.score(X=cv_features.iloc[valid_fold], y=cv_target.iloc[valid_fold])
        fold_accuracy.append(valid_acc)

    accuracies.append(sum(fold_accuracy) / len(fold_accuracy))

df = pd.DataFrame(
    {"Max Depth": depth_range, "Average Accuracy": accuracies},
    columns=["Max Depth", "Average Accuracy"],
)
print(df.to_string(index=False))

 Max Depth  Average Accuracy
         1          0.782235
         2          0.801323
         3          0.814794
         4          0.831623
         5          0.832747
         6          0.830487


### Create submission file

In [5]:
# Build the model inputs: NumPy feature matrices for train and test, plus the
# target column (kept as a pandas Series).
train_features = df_train.drop(['Survived'], axis=1)
# The estimator sees bare arrays, so train and test columns must line up exactly.
assert list(train_features.columns) == list(df_test.columns), \
    "train/test feature columns are misaligned"

y_train = df_train['Survived']
x_train = train_features.values
x_test = df_test.values

# Seed and tree count match the cross-validation cell so the fit is reproducible.
# NOTE(review): max_depth is hard-coded to 6 -- confirm this is still the
# intended choice given the cross-validation averages from the previous cell.
rf = RandomForestClassifier(max_depth=6, n_estimators=100, random_state=42)
rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)
# PassengerId comes from the untouched raw test frame; predictions are in the same row order.
submission = pd.DataFrame({
        "PassengerId": test['PassengerId'],
        "Survived": y_pred
    })
submission.to_csv(join(output_dir, 'submission_rf_titanic.csv'), index=False)

# Persist the fitted model in the configured log directory.
joblib.dump(rf, join(log_dir, 'rf_model.pkl'))

['..\\log\\rf_model.pkl']

In [6]:
# Accuracy of the fitted forest on the data it was trained on, as a percentage
# rounded to two decimals. This is a training-set score, not a validation score.
train_score = rf.score(x_train, y_train)
acc_rf = round(train_score * 100, 2)
acc_rf

86.53