# Importing Required Packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

import pickle

import warnings
warnings.filterwarnings("ignore")

sns.set()

# Reading Data

In [3]:
train_features_backup = pd.read_csv("../Data/train_features_df.csv")
test_features_backup = pd.read_csv("../Data/test_features_df.csv")

train_features_df = train_features_backup.copy()
train_features_df = train_features_df.drop(["Age_label_enc", "Fare_label_enc"], axis = 1)
train_features_df["Indices"] = train_features_df.index

test_features_df = test_features_backup.copy()
test_features_df = test_features_df.drop(["Age_label_enc", "Fare_mean_enc"], axis = 1)

In [4]:
train_df = pd.read_csv("../Data/train.csv")
test_df = pd.read_csv("../Data/test.csv")

train_preprocessed_df = pd.read_csv("../Data/preprocessed_train_df.csv")
test_preprocessed_df = pd.read_csv("../Data/preprocessed_test_df.csv")

# Train Test Split

In [5]:
X, y = train_features_df.values, train_df["Survived"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Base Decision Tree Model

In [8]:
DecisionTreeClassifier(random_state = 42).fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [9]:
base_dt = DecisionTreeClassifier(random_state = 42).fit(X_train[:, :-1], y_train)

## Classification Accuracy

In [10]:
np.where(base_dt.predict(X_test[:, :-1]) == y_test, 1, 0).mean()

0.7877094972067039

## F1 Score

In [11]:
f1_score(y_true = y_test, y_pred = base_dt.predict(X_test[:, :-1]))

0.7432432432432431

## Classification Summary

In [13]:
preds = base_dt.predict(X_test[:, :-1])

f1_df = pd.DataFrame(index = ["Predictions:0", "Predictions:1"], columns = ["Actuals:0", "Actuals:1"])

f1_df.at["Predictions:0", "Actuals:0"] = np.where((preds == 0) & (y_test == 0), 1, 0).sum()
f1_df.at["Predictions:0", "Actuals:1"] = np.where((preds == 0) & (y_test == 1), 1, 0).sum()
f1_df.at["Predictions:1", "Actuals:0"] = np.where((preds == 1) & (y_test == 0), 1, 0).sum()
f1_df.at["Predictions:1", "Actuals:1"] = np.where((preds == 1) & (y_test == 1), 1, 0).sum()

f1_df

Unnamed: 0,Actuals:0,Actuals:1
Predictions:0,86,19
Predictions:1,19,55


## Error Analysis

In [14]:
preds = base_dt.predict(X_test[:, :-1])

p0_a1 = X_test[:, -1][(preds == 0) & (y_test == 1)]
p1_a0 = X_test[:, -1][(preds == 1) & (y_test == 0)]


print(base_dt.classes_)


preds_proba = base_dt.predict_proba(X_test[:, :-1])

error_df = pd.concat([train_df.loc[p0_a1], train_df.loc[p1_a0]], axis = 0)
error_df["Prediction_Proba"] = preds_proba[:, 1][((preds == 0) & (y_test == 1)) | ((preds == 1) & (y_test == 0))]

error_df

[0 1]


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Prediction_Proba
709.0,710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C,0.0
447.0,448,1,1,"Seward, Mr. Frederic Kimber",male,34.0,0,0,113794,26.55,,S,1.0
673.0,674,1,2,"Wilhelms, Mr. Charles",male,31.0,0,0,244270,13.0,,S,0.0
204.0,205,1,3,"Cohen, Mr. Gurshon ""Gus""",male,18.0,0,0,A/5 3540,8.05,,S,0.333333
23.0,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5,A6,S,1.0
712.0,713,1,1,"Taylor, Mr. Elmer Zebley",male,48.0,1,0,19996,52.0,C126,S,0.5
338.0,339,1,3,"Dahl, Mr. Karl Edwart",male,45.0,0,0,7598,8.05,,S,0.0
286.0,287,1,3,"de Mulder, Mr. Theodore",male,30.0,0,0,345774,9.5,,S,1.0
323.0,324,1,2,"Caldwell, Mrs. Albert Francis (Sylvia Mae Harb...",female,22.0,1,1,248738,29.0,,S,1.0
78.0,79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29.0,,S,1.0


## Creating Base Submission

In [15]:
X, y = train_features_df.values, train_df["Survived"].values

full_base_dt = DecisionTreeClassifier(random_state = 42).fit(X[:, :-1], y)

base_sub5 = pd.read_csv("../Data/gender_submission.csv")
base_sub5["Survived"] = base_dt.predict(test_features_df.values)
base_sub5.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,1
3,895,1
4,896,1


In [16]:
base_sub5.to_csv("sub5_base_dt.csv", index = False)

# Optimizing Decision Trees

## Data

In [25]:
X, y = train_features_df.values, train_df["Survived"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Base Model

In [22]:
base_dt = DecisionTreeClassifier(random_state = 42)

## Parameter Grid

In [67]:
param_grid = [
    {
        "max_features" : ["log2", "sqrt", "auto"],
        "criterion" : ["entropy", "gini"],
        "max_depth" : [i for i in range(2, 21)],
        "min_samples_split" : [i for i in range(1, 21)],
        "min_samples_leaf" : [i for i in range(1, 21)]
    }
]

## Grid Search

### Grid Object

In [68]:
dt_grid = GridSearchCV(base_dt, param_grid = param_grid, cv = 3, verbose=True, n_jobs=-1)

### Fitting All Models in the Grid

In [69]:
dt_grid_fit = dt_grid.fit(X_train[:, :-1], y_train)

Fitting 3 folds for each of 45600 candidates, totalling 136800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  71 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 8460 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 24460 tasks      | elapsed:   22.7s
[Parallel(n_jobs=-1)]: Done 46860 tasks      | elapsed:   42.5s
[Parallel(n_jobs=-1)]: Done 75660 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 110860 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 136800 out of 136800 | elapsed:  2.0min finished


### Best Estimator

In [70]:
dt_grid_fit.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=9, max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [71]:
int_best_dt = dt_grid_fit.best_estimator_

**Classification Accuracy**

In [72]:
np.where(int_best_dt.predict(X_test[:, :-1]) == y_test, 1, 0).mean()

0.8044692737430168

**F1 Score**

In [73]:
f1_score(y_true = y_test, y_pred = int_best_dt.predict(X_test[:, :-1]))

0.7586206896551724

### Exporting Best Internal Model

In [46]:
filename = "best_int_dt.sav"
pickle.dump(int_best_dt, open(filename, 'wb'))

In [47]:
filename = "best_int_dt.sav"
loaded_model = pickle.load(open(filename, 'rb'))

In [48]:
loaded_model

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=9, max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

# Final Submission File

## Fitting Model on Full Training data

In [49]:
X, y = train_features_df.values, train_df["Survived"].values

final_dt = dt_grid.fit(X[:, :-1], y)

Fitting 3 folds for each of 45600 candidates, totalling 136800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 8460 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 24460 tasks      | elapsed:   23.7s
[Parallel(n_jobs=-1)]: Done 46860 tasks      | elapsed:   43.5s
[Parallel(n_jobs=-1)]: Done 75660 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 110860 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 136800 out of 136800 | elapsed:  2.0min finished


In [58]:
final_dt.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=10, max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=15,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [51]:
best_ext_dt = final_dt.best_estimator_

## Predictions

In [52]:
final_sub5 = pd.read_csv("../Data/gender_submission.csv")
final_sub5["Survived"] = best_ext_dt.predict(test_features_df.values)
final_sub5.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [53]:
final_sub5.to_csv("sub5_final_dt.csv", index = False)

## Exporting Model

In [59]:
filename = "best_ext_dt.sav"
pickle.dump(best_ext_dt, open(filename, 'wb'))

In [60]:
filename = "best_ext_dt.sav"
loaded_model = pickle.load(open(filename, 'rb'))

In [61]:
loaded_model

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=10, max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=15,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')