## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Make the data set

In [3]:
from sklearn.datasets import make_moons as mm 

In [4]:
X_m, y_m = mm(n_samples=10000, noise=0.4) # make clustering set
plt.scatter(X_m[:, 0], X_m[:, 1], c=y_m) # plot them

## Hyper Parameter Tuning

In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold as skf, GridSearchCV as gsc
from sklearn.model_selection import train_test_split as tts

In [21]:
# define the search space for the grid search
hyperparams = [
    {
        "min_samples_split" : [10] , "max_leaf_nodes" : [2], "max_depth" : [2]
    },
    {
        "min_samples_split" : [10] , "max_leaf_nodes" : [10] , "max_depth" : [10]
    },
    {
        "min_samples_split" : [10] , "max_leaf_nodes" : [20], "max_depth" : [20]
    },
    {
        "min_samples_split" : [10] , "max_leaf_nodes" : [30], "max_depth" : [30]
    },
    {
        "min_samples_split" : [10] , "max_leaf_nodes" : [50], "max_depth" : [50]
    },
    {
        "min_samples_split" : [50] , "max_leaf_nodes" : [2], "max_depth" : [2]
    },
    {
        "min_samples_split" : [50] , "max_leaf_nodes" : [10], "max_depth" : [10]
    },
    {
        "min_samples_split" : [50] , "max_leaf_nodes" : [20], "max_depth" : [20]
    },
    {
        "min_samples_split" : [50] , "max_leaf_nodes" : [50], "max_depth" : [50]
    }
]

In [22]:
dectree = DecisionTreeClassifier() # define the model
crosval = skf(n_splits=5, shuffle=True, random_state=0) # prepare the cross validation set
clf = gsc(estimator= dectree, param_grid= hyperparams,cv= crosval) 

In [25]:
X_train, X_test, y_train, y_test = tts(X_m, y_m, train_size=0.8, random_state= 42)
clf.fit(X_train, y_train)

In [29]:
#create a table with the results of the combinations, sorting them from best to worst:
scores = pd.DataFrame(clf.cv_results_)
scores = scores.sort_values(by="rank_test_score").set_index("rank_test_score")
int_cols = ["param_max_depth", "param_max_leaf_nodes", "param_min_samples_split"]
scores[int_cols].head() # only look at the first 5

Unnamed: 0_level_0,param_max_depth,param_max_leaf_nodes,param_min_samples_split
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,10,10,10
1,10,10,50
3,20,20,10
3,20,20,50
5,50,50,50


In [30]:
fin_tree = clf.best_estimator_

## Results

In [None]:
from sklearn.metrics import accuracy_score as acc_s

In [34]:
y_pred = fin_tree.predict(X_test)
y_t_pred = fin_tree.predict(X_train)

In [36]:
results = [[acc_s(y_pred= y_pred, y_true= y_test), acc_s(y_pred= y_t_pred, y_true= y_train)]]
result_df = pd.DataFrame(results, columns= ["Test Set", "Train Set"])
result_df.head()

Unnamed: 0,Test Set,Train Set
0,0.8625,0.86375


## Visualizing the tree

In [37]:
from sklearn.tree import export_graphviz

In [38]:
export_graphviz(
    fin_tree,
    out_file= "moonsres.dot",
    feature_names = ["z1","z2"],
    class_names = ["first", "second"],
    rounded = True,
    filled = True
)

In [41]:
from graphviz import Source as src

In [42]:
src.from_file("moonsres.dot")

ExecutableNotFound: failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH

<graphviz.sources.Source at 0x22d87c83b20>