In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.model_selection as ms
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn import tree
from sklearn import datasets
from sklearn import linear_model
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from datetime import datetime
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

**Read and clean data**

In [10]:
df = pd.read_csv("titanic.csv").dropna()

X = df.drop(["Name", "Survived", "Sex"], axis = 1)

# Transform 1st, 2nd, 3rd to 1, 2, 3
X['PClass'] = pd.factorize(X['PClass'])[0] + 1

y = df.Survived

**Split into train/test** 

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


**Define model type and fit**

In [12]:
rf_model = ensemble.RandomForestClassifier(random_state=1, n_estimators=200, n_jobs=-1)
# print(rf_model)

rf_model.fit(X_train, y_train)

RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=1)

**Scoring**

In [13]:
train_score = rf_model.score(X_train, y_train)
train_error = 1 - train_score

print("Score on data used in training =", train_score, "==>error =", train_error)

Score on data used in training = 0.8758278145695364 ==>error = 0.1241721854304636


In [14]:
test_score = rf_model.score(X_test, y_test)
test_error = 1 - test_score

print("Score on test (unseen) data =", test_score, "==>error =", test_error)

Score on test (unseen) data = 0.7828947368421053 ==>error = 0.2171052631578947


**Hyperparameter Tuning**

In [15]:
# This can take long 'cause runs down a lot of paths
grid_param_forest = {"criterion" : ["gini", "entropy"],
                     "max_depth" : range(1, 5),
                     "n_estimators" : range(10, 300, 20),
                     "max_features" : range(1, len(df.columns))}

print(str(datetime.now()))
grid_search_tree = ms.GridSearchCV(rf_model,
                                   grid_param_forest,
                                   cv=3,
                                   n_jobs=-1,
                                   verbose=True,
#                                   random_state=1,
                                   scoring="accuracy").fit(X_train, y_train)
print(str(datetime.now()))

print("Best parameters:", grid_search_tree.best_params_,
      "\nBest Score:", grid_search_tree.best_score_)

2022-08-20 22:39:19.511648
Fitting 3 folds for each of 600 candidates, totalling 1800 fits
2022-08-20 22:44:55.025298
Best parameters: {'criterion': 'gini', 'max_depth': 2, 'max_features': 1, 'n_estimators': 210} 
Best Score: 0.8162241597294058


In [16]:
grid_train_score = grid_search_tree.best_estimator_.score(X_train, y_train)
grid_train_error = 1 - grid_train_score

print("Best estimator score on data used in training =", grid_train_score, "==>error =", grid_train_error)

grid_test_score = grid_search_tree.best_estimator_.score(X_test, y_test)
grid_test_error = 1 - grid_test_score

print("Best estimator score on test (unseen) data =", grid_test_score, "==>error =", grid_test_error)

Best estimator score on data used in training = 0.8195364238410596 ==>error = 0.18046357615894038
Best estimator score on test (unseen) data = 0.8092105263157895 ==>error = 0.1907894736842105


**Error on test data has decreased by over 2.5% - though this does fluctuate with each run**

**Note: Not as good as DT with Hyperparameter Tuning**