# Import Statements

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Load Data

In [2]:
# Load the preprocessed dataset. The context manager guarantees the file
# handle is closed even if unpickling raises part-way through.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only point this at a file produced by a trusted pipeline.
with open('../../processed_data.pkl', 'rb') as file:
    data = pickle.load(file)

In [3]:
# Pull the four train/test arrays out of the loaded dictionary in one pass.
train_x, train_y, test_x, test_y = [
    data[split_key]
    for split_key in ('train_x', 'train_y', 'test_x', 'test_y')
]

# Decision Tree

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [5]:
# Base estimator handed to the hyperparameter search; the seed is pinned so
# repeated runs build the same trees.
decision_tree_model = DecisionTreeClassifier(
    random_state=0,
)

In [6]:
# Candidate values for each hyperparameter; the grid is their full cross
# product (5 depths x 5 leaf sizes x 2 criteria = 50 combinations).
depth_options = [2, 3, 5, 10, 20]
leaf_size_options = [5, 10, 20, 50, 100]
criterion_options = ["gini", "entropy"]

params = {
    'max_depth': depth_options,
    'min_samples_leaf': leaf_size_options,
    'criterion': criterion_options,
}

In [7]:
# Set up an exhaustive grid search over the parameters specified by params:
# 5-fold cross-validation, candidates ranked by accuracy, n_jobs=-1 to run
# the fits in parallel.
grid_search = GridSearchCV(
    estimator=decision_tree_model,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [8]:
# Run the cross-validated search on the training split only; the test split
# is not passed in here.
grid_search.fit(X=train_x, y=train_y)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=0), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 5, 10, 20],
                         'min_samples_leaf': [5, 10, 20, 50, 100]},
             scoring='accuracy', verbose=1)

In [9]:
# Display the estimator the grid search selected as best; its repr lists the
# winning hyperparameters (only non-default values are shown).

grid_search.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_depth=20, min_samples_leaf=5,
                       random_state=0)

In [10]:
# Best Decision Tree model, rebuilt from the hyperparameters the grid search
# selected rather than from values copied by hand out of a previous run's
# output, so the model cannot drift out of sync if the data or the grid changes.
# For the run recorded in this notebook this resolves to
# criterion='entropy', max_depth=20, min_samples_leaf=5.
# best_params_ only holds the searched keys, so random_state is passed
# explicitly to keep the same seed as the base estimator.
best_decision_tree_model = DecisionTreeClassifier(random_state=0, **grid_search.best_params_)

In [11]:
# Fit the selected tree on the training split, then predict labels for the
# test split. fit() returns the estimator itself, which allows the chaining.
best_decision_tree_model_predictions = (
    best_decision_tree_model
    .fit(train_x, train_y)
    .predict(test_x)
)

In [12]:
# Report score() as a percentage on the training split and then on the test
# split, so the gap between the two is visible side by side.
for report_template, features, labels in (
    ('Train Accuracy: {} %', train_x, train_y),
    ('Test Accuracy: {} %', test_x, test_y),
):
    print(report_template.format(100*best_decision_tree_model.score(features, labels)))

Train Accuracy: 98.26849910707352 %
Test Accuracy: 93.63354037267081 %
