# Importing Required Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import pickle

import warnings
warnings.filterwarnings("ignore")

sns.set()

# Reading Data

In [2]:
train_features_backup = pd.read_csv("../Data/train_features_df.csv")
test_features_backup = pd.read_csv("../Data/test_features_df.csv")

train_features_df = train_features_backup.copy()
train_features_df = train_features_df.drop(["Age_label_enc", "Fare_label_enc"], axis = 1)
train_features_df["Indices"] = train_features_df.index

test_features_df = test_features_backup.copy()
test_features_df = test_features_df.drop(["Age_label_enc", "Fare_mean_enc"], axis = 1)

In [3]:
train_df = pd.read_csv("../Data/train.csv")
test_df = pd.read_csv("../Data/test.csv")

train_preprocessed_df = pd.read_csv("../Data/preprocessed_train_df.csv")
test_preprocessed_df = pd.read_csv("../Data/preprocessed_test_df.csv")

# Train Test Split

In [4]:
X, y = train_features_df.values, train_df["Survived"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Base Random Forests Model

In [5]:
RandomForestClassifier(random_state = 42).fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [6]:
base_rf = RandomForestClassifier(random_state = 42).fit(X_train[:, :-1], y_train)

## Classification Accuracy

In [7]:
np.where(base_rf.predict(X_test[:, :-1]) == y_test, 1, 0).mean()

0.8547486033519553

## F1 Score

In [8]:
f1_score(y_true = y_test, y_pred = base_rf.predict(X_test[:, :-1]))

0.8243243243243243

## Classification Summary

In [9]:
preds = base_rf.predict(X_test[:, :-1])

f1_df = pd.DataFrame(index = ["Predictions:0", "Predictions:1"], columns = ["Actuals:0", "Actuals:1"])

f1_df.at["Predictions:0", "Actuals:0"] = np.where((preds == 0) & (y_test == 0), 1, 0).sum()
f1_df.at["Predictions:0", "Actuals:1"] = np.where((preds == 0) & (y_test == 1), 1, 0).sum()
f1_df.at["Predictions:1", "Actuals:0"] = np.where((preds == 1) & (y_test == 0), 1, 0).sum()
f1_df.at["Predictions:1", "Actuals:1"] = np.where((preds == 1) & (y_test == 1), 1, 0).sum()

f1_df

Unnamed: 0,Actuals:0,Actuals:1
Predictions:0,92,13
Predictions:1,13,61


## Error Analysis

In [10]:
preds = base_rf.predict(X_test[:, :-1])

p0_a1 = X_test[:, -1][(preds == 0) & (y_test == 1)]
p1_a0 = X_test[:, -1][(preds == 1) & (y_test == 0)]


print(base_rf.classes_)


preds_proba = base_rf.predict_proba(X_test[:, :-1])

error_df = pd.concat([train_df.loc[p0_a1], train_df.loc[p1_a0]], axis = 0)
error_df["Prediction_Proba"] = preds_proba[:, 1][((preds == 0) & (y_test == 1)) | ((preds == 1) & (y_test == 0))]

error_df

[0 1]


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Prediction_Proba
39.0,40,1,3,"Nicola-Yarred, Miss. Jamila",female,14.0,1,0,2651,11.2417,,C,0.32
447.0,448,1,1,"Seward, Mr. Frederic Kimber",male,34.0,0,0,113794,26.55,,S,0.38
192.0,193,1,3,"Andersen-Jensen, Miss. Carla Christine Nielsine",female,19.0,1,0,350046,7.8542,,S,0.24
673.0,674,1,2,"Wilhelms, Mr. Charles",male,31.0,0,0,244270,13.0,,S,0.314952
204.0,205,1,3,"Cohen, Mr. Gurshon ""Gus""",male,18.0,0,0,A/5 3540,8.05,,S,0.93
23.0,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5,A6,S,0.311893
712.0,713,1,1,"Taylor, Mr. Elmer Zebley",male,48.0,1,0,19996,52.0,C126,S,0.27
338.0,339,1,3,"Dahl, Mr. Karl Edwart",male,45.0,0,0,7598,8.05,,S,0.94
286.0,287,1,3,"de Mulder, Mr. Theodore",male,30.0,0,0,345774,9.5,,S,0.92
209.0,210,1,1,"Blank, Mr. Henry",male,40.0,0,0,112277,31.0,A31,C,0.736005


# Base Random Forest with class balancing

In [11]:
class_weight = pd.Series(y_train).value_counts()[0] / pd.Series(y_train).value_counts()[1]

In [12]:
RandomForestClassifier(random_state = 42, class_weight = {0: 1, 1: class_weight})

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight={0: 1, 1: 1.6567164179104477},
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [13]:
balanced_base_rf = RandomForestClassifier(
                        random_state = 42, 
                        class_weight = {0: 1, 1: class_weight}
                    ).fit(X_train[:, :-1], y_train)

In [14]:
np.where(balanced_base_rf.predict(X_test[:, :-1]) == y_test, 1, 0).mean()

0.8435754189944135

In [15]:
f1_score(y_pred = balanced_base_rf.predict(X_test[:, :-1]), y_true = y_test)

0.8082191780821918

## Creating Base Submission (Balanced and Unbalanced)

## Unbalanced

In [16]:
X, y = train_features_df.values, train_df["Survived"].values

full_base_rf = RandomForestClassifier(random_state = 42).fit(X[:, :-1], y)

base_sub6 = pd.read_csv("../Data/gender_submission.csv")
base_sub6["Survived"] = full_base_rf.predict(test_features_df.values)
base_sub6.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1


In [17]:
base_sub6.to_csv("sub6_base_rf.csv", index = False)

## Balanced

In [18]:
X, y = train_features_df.values, train_df["Survived"].values

class_weight = pd.Series(y).value_counts()[0] / pd.Series(y).value_counts()[1]

full_bal_base_rf = RandomForestClassifier(
                        random_state = 42, 
                        class_weight = {0: 1, 1: class_weight}
                    ).fit(X[:, :-1], y)

bal_base_sub6 = pd.read_csv("../Data/gender_submission.csv")
bal_base_sub6["Survived"] = full_bal_base_rf.predict(test_features_df.values)
bal_base_sub6.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1


In [19]:
bal_base_sub6.to_csv("sub6_bal_base_rf.csv", index = False)

# Optimizing Decision Trees

## Data

In [20]:
X, y = train_features_df.values, train_df["Survived"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Base Model

In [21]:
base_rf = RandomForestClassifier(random_state = 42, n_jobs = -1)

## Parameter Grid

In [22]:
# param_grid = [
#     {
#         "n_estimators" : [300],
#         "max_features" : ["log2", "sqrt", "auto"],
#         "criterion" : ["entropy", "gini"],
#         "max_depth" : [2, 5, 10],
#         "min_samples_split" : [2, 10, 20],
#         "min_samples_leaf" : [2, 10, 20]
#     }
# ]


param_grid = [
    {
        'max_depth' : [3, 5 ,10, None],
        'n_estimators' : [10, 100, 200, 300, 400, 500, 1000],
        'max_features' : ["log2", "sqrt", "auto"],
        'criterion' : ['gini', 'entropy'],
        'bootstrap' : [True, False],
        'min_samples_leaf' : [i for i in range(2, 100)],
        'min_samples_split' : [i for i in range(2, 100)]
    }
]

## Random Search

### Grid Object

In [23]:
# rf_grid = GridSearchCV(base_rf, param_grid = param_grid, cv = 3, verbose=True, n_jobs=-1)
rf_grid = RandomizedSearchCV(base_rf, 
                             param_distributions = param_grid, 
                             cv = 3, verbose=True, n_jobs=-1, n_iter = 40)

### Fitting All Models in the Grid

In [24]:
rf_grid_fit = rf_grid.fit(X_train[:, :-1], y_train)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   39.0s finished


### Best Estimator

In [25]:
rf_grid_fit.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=5, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=27,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [26]:
int_best_rf = rf_grid_fit.best_estimator_

**Classification Accuracy**

In [27]:
np.where(int_best_rf.predict(X_test[:, :-1]) == y_test, 1, 0).mean()

0.8156424581005587

**F1 Score**

In [28]:
f1_score(y_true = y_test, y_pred = int_best_rf.predict(X_test[:, :-1]))

0.7659574468085106

### Exporting Best Internal Model

In [30]:
filename = "best_int_rf.sav"
pickle.dump(int_best_rf, open(filename, 'wb'))

In [32]:
filename = "best_int_rf.sav"
loaded_model = pickle.load(open(filename, 'rb'))

In [33]:
loaded_model

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=5, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=27,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

# Final Submission File

## Fitting Model on Full Training data

In [34]:
X, y = train_features_df.values, train_df["Survived"].values

final_rf = rf_grid.fit(X[:, :-1], y)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   48.6s finished


In [35]:
final_rf.best_estimator_

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=9, min_samples_split=59,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [36]:
best_ext_rf = final_rf.best_estimator_

## Predictions

In [37]:
final_sub6 = pd.read_csv("../Data/gender_submission.csv")
final_sub6["Survived"] = best_ext_rf.predict(test_features_df.values)
final_sub6.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [38]:
final_sub6.to_csv("sub6_final_rf.csv", index = False)

## Exporting Model

In [39]:
filename = "best_ext_rf.sav"
pickle.dump(best_ext_rf, open(filename, 'wb'))

In [40]:
filename = "best_ext_rf.sav"
loaded_model = pickle.load(open(filename, 'rb'))

In [41]:
loaded_model

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=9, min_samples_split=59,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)