# Decision Tree Classification

##### Author: Cody Pierce
##### Part of Step 4: Train Algorithms

In [1]:
from sklearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeClassifier

import math
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from creditcard_preparation import create_creditcard_pipeline, prepare_creditcard_data

In [2]:
# Load the credit-card data as separate train / dev / test feature and label sets.
# NOTE(review): the (1/10, 1/10) tuple presumably gives the dev and test
# fractions -- confirm against prepare_creditcard_data.
(X_train, X_dev, X_test,
 y_train, y_dev, y_test) = prepare_creditcard_data((1 / 10, 1 / 10))

### Grid search for best hyperparameters

In [3]:
# Focus on main hyperparameters of interest: Criterion and Max Depth
# Tested separately to save time

def grid_search_hyper_params_mainfocus(X_train, y_train, random_state=42):
    """Grid-search the two main decision-tree hyperparameters.

    Runs a 3-fold cross-validated ``GridSearchCV`` over every combination of
    ``criterion`` ('gini', 'entropy', 'log_loss') and ``max_depth``
    (2, 4, 6, 8, 10, None) for a ``DecisionTreeClassifier``, then prints the
    best cross-validation score and the best hyperparameters.

    No ``scoring`` is passed, so the estimator's own ``score`` method is used.

    Parameters
    ----------
    X_train
        Training features, passed unchanged to ``GridSearchCV.fit``.
    y_train
        Training labels, passed unchanged to ``GridSearchCV.fit``.
    random_state : int, RandomState instance or None, default=42
        Seed forwarded to the ``DecisionTreeClassifier``. A fixed seed makes
        the scores and the selected parameters repeatable across kernel
        restarts. Pass ``None`` for an unseeded estimator.

    Returns
    -------
    GridSearchCV
        The fitted search object. Because ``refit=True``, its
        ``best_estimator_`` is retrained on all of ``X_train`` with the best
        parameters.
    """
    param_grid = {'criterion': ['gini', 'entropy', 'log_loss'],
                  'max_depth': [2, 4, 6, 8, 10, None]}

    print("Training ...")
    grid = GridSearchCV(DecisionTreeClassifier(random_state=random_state),
                        param_grid,
                        return_train_score=True,
                        refit=True,
                        verbose=3,   # prints one line per (candidate, fold) fit
                        n_jobs=1,
                        cv=3)

    # Fit model for grid search
    grid.fit(X_train, y_train)

    # Print best parameters after tuning
    print("Grid searching is done!")
    print("The best score: ", grid.best_score_)
    print("The best hyperparameters:")
    print(grid.best_params_)
    return grid

In [4]:
# Run the criterion / max-depth grid search on the training split.
# The returned GridSearchCV object is left as the cell's displayed value.

grid_search_hyper_params_mainfocus(X_train=X_train, y_train=y_train)

Training ...
Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV 1/3] END criterion=gini, max_depth=2;, score=(train=0.936, test=0.938) total time=   4.4s
[CV 2/3] END criterion=gini, max_depth=2;, score=(train=0.938, test=0.936) total time=   4.0s
[CV 3/3] END criterion=gini, max_depth=2;, score=(train=0.934, test=0.934) total time=   3.9s
[CV 1/3] END criterion=gini, max_depth=4;, score=(train=0.953, test=0.953) total time=   7.7s
[CV 2/3] END criterion=gini, max_depth=4;, score=(train=0.953, test=0.951) total time=   7.7s
[CV 3/3] END criterion=gini, max_depth=4;, score=(train=0.950, test=0.950) total time=   7.6s
[CV 1/3] END criterion=gini, max_depth=6;, score=(train=0.965, test=0.965) total time=  11.3s
[CV 2/3] END criterion=gini, max_depth=6;, score=(train=0.965, test=0.964) total time=  11.2s
[CV 3/3] END criterion=gini, max_depth=6;, score=(train=0.965, test=0.965) total time=  11.4s
[CV 1/3] END criterion=gini, max_depth=8;, score=(train=0.975, test=0.973) total

#### Criterion:
* The default value for `criterion` is `gini`; `entropy` seems to perform about the same but takes less time to train (`log_loss` was also searched; in scikit-learn it is equivalent to `entropy`)


#### Max Depth:
* The default value for `max_depth` is `None` (no depth limit), and it is clearly the best option in terms of accuracy score
* The increased runtime seems worthwhile


In [5]:
# Try other options with max depth at 1 to decrease runtime
# Focus on those with default values that are not None (or 0)

def grid_search_hyper_params_other(X_train, y_train, random_state=42):
    """Grid-search the secondary decision-tree hyperparameters.

    With ``criterion='entropy'`` and ``max_depth=1`` held fixed, runs a 3-fold
    cross-validated ``GridSearchCV`` over ``splitter``, ``min_samples_split``,
    ``min_samples_leaf`` and ``min_weight_fraction_leaf`` for a
    ``DecisionTreeClassifier``, then prints the best cross-validation score
    and the best hyperparameters.

    No ``scoring`` is passed, so the estimator's own ``score`` method is used.

    Parameters
    ----------
    X_train
        Training features, passed unchanged to ``GridSearchCV.fit``.
    y_train
        Training labels, passed unchanged to ``GridSearchCV.fit``.
    random_state : int, RandomState instance or None, default=42
        Seed forwarded to the ``DecisionTreeClassifier``. The grid compares
        ``splitter='random'``, whose result depends directly on the random
        stream, so a fixed seed is needed for repeatable scores. Pass
        ``None`` for an unseeded estimator.

    Returns
    -------
    GridSearchCV
        The fitted search object. Because ``refit=True``, its
        ``best_estimator_`` is retrained on all of ``X_train`` with the best
        parameters.
    """
    # NOTE(review): max_depth=1 allows only a single (root) split, so
    # min_samples_split / min_samples_leaf can only influence that one split.
    # Conclusions about them may not carry over to deeper trees -- confirm
    # with a deeper tree if runtime allows.
    param_grid = {'criterion': ['entropy'],
                  'splitter': ['best', 'random'],
                  'min_samples_split': [2, 3, 5],
                  'min_samples_leaf': [1, 3, 5],
                  'min_weight_fraction_leaf': [0.0, 0.5],
                  'max_depth': [1]}

    print("Training ...")
    grid = GridSearchCV(DecisionTreeClassifier(random_state=random_state),
                        param_grid,
                        return_train_score=True,
                        refit=True,
                        verbose=3,   # prints one line per (candidate, fold) fit
                        n_jobs=1,
                        cv=3)

    # Fit model for grid search
    grid.fit(X_train, y_train)

    # Print best parameters after tuning
    print("Grid searching is done!")
    print("The best score: ", grid.best_score_)
    print("The best hyperparameters:")
    print(grid.best_params_)
    return grid

In [6]:
# Run the grid search over the remaining hyperparameters
# (everything except criterion and max depth) on the training split.

grid_search_hyper_params_other(X_train=X_train, y_train=y_train)

Training ...
Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV 1/3] END criterion=entropy, max_depth=1, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, splitter=best;, score=(train=0.929, test=0.931) total time=   1.9s
[CV 2/3] END criterion=entropy, max_depth=1, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, splitter=best;, score=(train=0.931, test=0.930) total time=   1.8s
[CV 3/3] END criterion=entropy, max_depth=1, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, splitter=best;, score=(train=0.930, test=0.929) total time=   1.8s
[CV 1/3] END criterion=entropy, max_depth=1, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, splitter=random;, score=(train=0.733, test=0.733) total time=   0.1s
[CV 2/3] END criterion=entropy, max_depth=1, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, splitter=random;, score=(train=0.693, test=0.693) total time=   0.1s
[CV 3/

* Overall: the default values seem to be the best choices for all hyperparameters except `criterion` ("entropy" seems to train faster while still maintaining performance)

### Test model on test set using chosen hyperparameters

In [7]:
# Build the preprocessing pipeline from the project helper
pipeline = create_creditcard_pipeline()


# Create the classifier with the hyperparameters chosen during grid search:
# everything is left at its default except criterion.
# random_state is fixed so the fitted tree, and therefore the test metrics
# reported below, are reproducible on Restart & Run All.
DTC_model = DecisionTreeClassifier(criterion="entropy", random_state=42)


# Chain preprocessing and the classifier so both are fitted on the training
# split only and applied identically at prediction time
pipeline_with_algo = Pipeline(steps=[
    ('preprocessor', pipeline),
    ('algo', DTC_model),
])

# Fit on the training split, then predict the held-out test split
pipeline_with_algo.fit(X_train, y_train)
y_test_pred = pipeline_with_algo.predict(X_test)

In [9]:
# Score the test-set predictions with each metric, in the order
# accuracy, precision, recall, F1 (all called with default arguments).
accuracy, precision, recall, f1 = (
    metric(y_test, y_test_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the four test-set metrics, one per line
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.9983293178340925
Precision: 0.9975114787424205
Recall: 0.999157421710434
F1 Score: 0.9983337718144348
