# Titanic Dataset - Classification
### Genesis Adam D. Mendoza

Initialize the required packages

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


In [2]:
file_path = 'Datasets\\Titanic Dataset\\'
titanic = pd.read_csv(file_path + 'train.csv', index_col = 'PassengerId')
titanic.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
titanic['Honorific'] = titanic['Name'].str.extract('([A-Za-z]+)\.')
titanic['Surname'] = titanic['Name'].str.extract('([A-Za-z]+)\,')
titanic['CabinName'] = titanic.Cabin.str.extract(r'([A-Z])')
titanic['CabinNumber'] = titanic.Cabin.str.extract(r'(\d+)', expand=False)
titanic_dropped = titanic.drop(['Name', 'Cabin'], axis = 1)
titanic_dropped.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Honorific,Surname,CabinName,CabinNumber
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,3,male,22.0,1,0,A/5 21171,7.25,S,Mr,Braund,,
2,1,1,female,38.0,1,0,PC 17599,71.2833,C,Mrs,Cumings,C,85.0
3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,S,Miss,Heikkinen,,
4,1,1,female,35.0,1,0,113803,53.1,S,Mrs,Futrelle,C,123.0
5,0,3,male,35.0,0,0,373450,8.05,S,Mr,Allen,,


In [4]:
target_cols = ['Survived']
feature_cols = [col for col in titanic_dropped.columns if col not in target_cols]

In [5]:
num_cols = [col for col in feature_cols if titanic_dropped[col].dtype in ['int64', 'float64']]
cat_cols = [col for col in feature_cols if titanic_dropped[col].dtype in ['object']]                                                                                   

In [6]:
x_feats = titanic_dropped[feature_cols]
y_targ = titanic_dropped[target_cols]
x_train, x_test, y_train, y_test = train_test_split(x_feats, y_targ, train_size=0.5, random_state=0)

In [7]:
numerical_transformer = SimpleImputer(strategy = 'mean')
categorical_transformer = Pipeline(steps = [('imputer', SimpleImputer(strategy = 'most_frequent')),('onehot', OneHotEncoder(handle_unknown = 'ignore'))])
preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, num_cols), ('cat', categorical_transformer, cat_cols)])

In [8]:
params = {}
for node_val in range(10,500,50):
    for estim in range(1, 200, 20):
        for depth in range(1, 100, 1):
            model = RandomForestClassifier(n_estimators = estim, max_leaf_nodes = node_val, max_depth = depth, random_state = 0)
            pipeline = Pipeline(steps = [('preprocess', preprocessor),('model', model)])
            pipeline.fit(x_train, y_train.values.ravel())
            predicted_vals = pipeline.predict(x_test)
            y_pred = pd.DataFrame(predicted_vals, index = y_test.index, columns = {'SurvivedPred'})
            correct = y_pred.SurvivedPred[y_pred.SurvivedPred == y_test.Survived].count()
            total = y_test.Survived.count()
            params[(node_val, estim, depth)] = 100*correct/total
optimal_params = max(params, key = params.get)
opt_nodes, opt_estim, opt_depth = optimal_params
print('The optimal parameters are (max_leaf_nodes, n_estimators, max_depth) = {} with an accuracy of {:.2f}%'.format(optimal_params, params[optimal_params]))

The optimal parameters are (max_leaf_nodes, n_estimators, max_depth) = (160, 61, 36) with an accuracy of 84.08%


In [14]:
file_path = 'Datasets\\Titanic Dataset\\'
titanic_test = pd.read_csv(file_path + 'test.csv', index_col = 'PassengerId')
titanic_test['Honorific'] = titanic_test['Name'].str.extract('([A-Za-z]+)\.')
titanic_test['Surname'] = titanic_test['Name'].str.extract('([A-Za-z]+)\,')
titanic_test['CabinName'] = titanic_test.Cabin.str.extract(r'([A-Z])')
titanic_test['CabinNumber'] = titanic_test.Cabin.str.extract(r'(\d+)', expand=False)
titanic_test_dropped = titanic_test.drop(['Name', 'Cabin'], axis = 1)
titanic_test_dropped.head()

good_cols = [col for col in feature_cols if col in titanic_test.columns]
x_deploy = titanic_test[good_cols]

In [15]:
model = RandomForestClassifier(n_estimators = opt_estim, max_leaf_nodes = opt_nodes, max_depth = opt_depth, random_state = 0)
pipeline = Pipeline(steps = [('preprocess', preprocessor),('model', model)])
pipeline.fit(x_feats, y_targ.values.ravel())
final_pred = pipeline.predict(x_deploy)

output = pd.DataFrame({'PassengerId': titanic_test.index, 'Survived': final_pred})
output.to_csv('titanic_predict.csv', index=False)

Kaggle's accuracy score is given below:

<div>
<img src="Images/Kaggle_Titanic.jpg" width="800"/>
</div>

Attempt at using RandomizedSearchCV

In [16]:
from sklearn.model_selection import RandomizedSearchCV

model = RandomForestClassifier(random_state=0)
pipeline = Pipeline(steps=[('preprocess', preprocessor), ('model', model)])

max_leaf_nodes = [node_val for node_val in range(1, 500, 1)]
max_depth = [depth for depth in range(1, 100, 1)]

random_grid = {'model__max_leaf_nodes': max_leaf_nodes, 'model__max_depth': max_depth}
n_estimators = [estim for estim in range(1, 200, 1)]

model_random = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions={**random_grid, 'model__n_estimators': n_estimators},
    n_iter=100, cv=5, verbose=3, random_state=0, n_jobs=-1
)

model_random.fit(x_feats, y_targ.values.ravel())
print('The best score given the optimum parameters is {:.2f}%'.format(100*model_random.best_score_))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
The best score given the optimum parameters is 84.40%


In [17]:
opt_estim = model_random.best_params_['model__n_estimators']
opt_nodes = model_random.best_params_['model__max_leaf_nodes']
opt_depth = model_random.best_params_['model__max_depth']

In [18]:
model = RandomForestClassifier(n_estimators = opt_estim, max_leaf_nodes = opt_nodes, max_depth = opt_depth, random_state = 0)
pipeline = Pipeline(steps = [('preprocess', preprocessor),('model', model)])
pipeline.fit(x_feats, y_targ.values.ravel())
final_pred = pipeline.predict(x_deploy)

output = pd.DataFrame({'PassengerId': titanic_test.index, 'Survived': final_pred})
output.to_csv('titanic_predict_rscv.csv', index=False)