## Important Points

* Created a synthetic dataset using the `make_moons` function from scikit-learn
* Applied GridSearchCV with params max_leaf_nodes and max_depth

|   Model      |   Training   |  Validation |
|--------------|--------------|-------------|
| Decision Tree |    86.28%    |   85.68%    |

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

np.random.seed(42)
%matplotlib inline

In [2]:
# Creating data and train-test split 
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# 10,000 noisy two-moons points. The seed is written as the integer 1: the
# previous `random_state=True` only worked because a bool is an integer
# (True == 1), so this should generate the same data while being explicit.
x, y = make_moons(n_samples=10000, noise=0.4, random_state=1)
# Hold out 25% of the samples for validation; the split itself is seeded too.
x_train, x_valid, y_train, y_valid = train_test_split(
    x, y, test_size=0.25, random_state=42
)

In [4]:
# Print (features shape, labels shape) for the training split, then the validation split.
for features, labels in ((x_train, y_train), (x_valid, y_valid)):
    print(features.shape, labels.shape)

(7500, 2) (7500,)
(2500, 2) (2500,)


In [6]:
# Distinct class labels present in the training targets
class_labels = np.unique(y_train)
class_labels

array([0, 1])

In [8]:
from sklearn.metrics import accuracy_score

def score(y, y_pred, train=False):
    """Print the accuracy of predictions against the true labels.

    Parameters
    ----------
    y : true labels, passed to ``accuracy_score`` as the first argument.
    y_pred : predicted labels, passed to ``accuracy_score`` as the second argument.
    train : if truthy the line is labelled as training accuracy,
        otherwise as validation accuracy.

    Returns
    -------
    None. The result is only printed.
    """
    label = "Training accuracy: " if train else "Validation accuracy: "
    print(label, accuracy_score(y, y_pred))

In [9]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: a fully grown tree with no limits on depth or leaf count.
# The seed matches the estimator used in the grid search below, so this cell
# gives the same tree every time it is re-run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train, y_train)

# The model overfits because no regularization has been applied:
# compare the validation accuracy with the training accuracy printed below.
score(y_valid, dt.predict(x_valid))
score(y_train, dt.predict(x_train), True)

Validation accuracy:  0.8008
Training accuracy:  1.0


In [11]:
from sklearn.model_selection import GridSearchCV

params = {"max_leaf_nodes": list(range(2, 50)), "max_depth": list(range(8, 12))}
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), verbose=1, param_grid=params, cv=3)
%time
grid_search.fit(x_train, y_train)

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 11 µs
Fitting 3 folds for each of 192 candidates, totalling 576 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 576 out of 576 | elapsed:    6.6s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=42,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [8, 9, 10, 11],
                         '

In [12]:
# Estimator configured with the best parameter combination found by the search
best_tree = grid_search.best_estimator_
best_tree

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=8, max_features=None, max_leaf_nodes=39,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [14]:
# With refit=True the search object has already retrained the best estimator
# on the complete x_train, so it can be used for prediction directly.
train_pred = grid_search.predict(x_train)
valid_pred = grid_search.predict(x_valid)

# Training score first, then validation score
score(y_train, train_pred, train=True)
score(y_valid, valid_pred)

Training accuracy:  0.8628
Validation accuracy:  0.8568
