<h1><center>DecisionTree - Classification</center></h1>

In [9]:
# Standard library
import random
import warnings

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

# Plot styling: font sizes for axis labels and for x/y tick labels.
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Seeds Python's built-in `random` module only.
random.seed(10)

In [14]:
# Load the Iris dataset from a CSV file (path is relative to the notebook's working directory).
iris_data = pd.read_csv('../data/Iris.csv')

In [13]:
# Preview the first rows: an Id column, four measurements in cm, and the Species label.
iris_data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


<img height="600" width="750" src="../img/classification/iris_flowers.png">

### Scikit-Learn uses the CART (Classification And Regression Tree) algorithm, which produces a binary tree.
  * The algorithm first splits the training set into two subsets using a single feature K and a threshold $t_k$ (for example, "petal length < 2.45 cm").
  * The algorithm searches for the pair $(K, t_k)$ that produces the purest subsets; the cost function is selected by the hyperparameter "criterion".
    * **For Classification** this can be "gini" for the **Gini impurity** or "entropy" for the **information gain**.
    * **For Regression** this can be "mse" or "mae"
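    * The CART algorithm is "greedy": it chooses the best split at the current node without checking whether that split leads to the lowest possible impurity several levels down.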
    * Algorithm complexity: finding the optimal tree requires $O(\exp(m))$ time, where $m$ is the number of training samples. This is the reason why we should settle for a "reasonably good" tree.

<img height="600" width="550" src="../img/classification/dtree_iris_sample.png">

### Regularization parameters
  * The algorithm stops recursing once it reaches the maximum depth (the "max_depth" hyperparameter), or if it cannot find a split that reduces impurity.
  * min_samples_split : Minimum number of samples a node must have before it can be split.
  * min_samples_leaf : Minimum number of samples a leaf must have.
  * min_weight_fraction_leaf: Same as min_samples_leaf but expressed as a fraction of the total number of weighted instances.
  * max_leaf_nodes : Maximum number of leaf nodes.
  * max_features : Maximum number of features that are evaluated for splitting at each node.
  * **Increasing min_* OR reducing max_* hyperparameters will regularize the model.** 


In [4]:
# Features: every column except the first (Id) and the last (Species).
X = iris_data.iloc[:, 1:-1]
# Target: the last column (Species).
y = iris_data.iloc[:, -1]

In [5]:
# Sanity check: number of target labels (one per sample).
y.shape

(150,)

In [6]:
# Hold out 20% of the samples for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## DecisionTree Classification

In [7]:
from sklearn.tree import DecisionTreeClassifier
# Baseline model: a shallow tree (max_depth=2); random_state is fixed so repeated fits match.
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=2, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [8]:
# Class labels seen during fit; the columns of predict_proba follow this order.
tree_clf.classes_

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [9]:
# Class probabilities for the first five training samples (one column per class).
tree_clf.predict_proba(X_train)[:5, :]

array([[1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.97297297, 0.02702703],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ]])

In [10]:
# Predicted labels on the training set, scored further below.
pred_y_train = tree_clf.predict(X_train)

In [11]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

def printCLFAccuracyScores(y, pred_y):
    """Print the confusion matrix and macro-averaged precision, recall and F1.

    Parameters
    ----------
    y : array-like
        True class labels.
    pred_y : array-like
        Predicted class labels, aligned element-wise with ``y``.

    Returns
    -------
    None
        Results are only written to stdout.
    """
    print("conf_matrix : ", confusion_matrix(y, pred_y))

    # (label, scorer) pairs; each scorer is evaluated and printed in this order.
    macro_scorers = (
        ("precision_score : ", precision_score),
        ("recall_score : ", recall_score),
        ("fl_score : ", f1_score),
    )
    for label, scorer in macro_scorers:
        # 'macro' takes the unweighted mean of the per-class scores.
        print(label, scorer(y, pred_y, average='macro'))

In [12]:
# Training-set metrics for the depth-2 baseline tree.
printCLFAccuracyScores(y_train, pred_y_train)

conf_matrix :  [[40  0  0]
 [ 0 36  5]
 [ 0  1 38]]
precision_score :  0.9522313010685104
recall_score :  0.9508025849489264
fl_score :  0.9499687304565354


In [13]:
# Held-out test-set metrics for the same depth-2 baseline tree.
pred_y_test = tree_clf.predict(X_test)
printCLFAccuracyScores(y_test, pred_y_test)

conf_matrix :  [[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]
precision_score :  0.9722222222222222
recall_score :  0.9629629629629629
fl_score :  0.9658994032395567


### This looks good; let's try GridSearchCV to find the best parameters.

In [14]:
# Candidate hyperparameters: all 7 x 7 combinations of max_leaf_nodes and max_depth are tried.
param_grid = [
    {
        'max_leaf_nodes': [2, 4, 5, 6, 8, 10, 12],
        'max_depth': list(range(4, 11)),
    },
]

# 5-fold cross-validated search over the grid, ranking candidates by macro-averaged F1.
grd_tree_clf = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=grd_tree_clf,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=42,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid=[{'max_depth': [4, 5, 6, 7, 8, 9, 10],
                 

In [15]:
# Hyperparameter combination with the best mean cross-validated score.
grid_search.best_params_

{'max_depth': 6, 'max_leaf_nodes': 8}

In [16]:
# The estimator configured with the best parameters found by the search.
grid_search.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=8,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [17]:
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refit on the whole of (X_train, y_train) with the winning parameters; calling
# fit() on it again would only repeat that work.
best_tree_clf = grid_search.best_estimator_
pred_y_train = best_tree_clf.predict(X_train)

# Training-set metrics for the tuned tree.
printCLFAccuracyScores(y_train, pred_y_train)

conf_matrix :  [[40  0  0]
 [ 0 40  1]
 [ 0  0 39]]
precision_score :  0.9916666666666667
recall_score :  0.991869918699187
fl_score :  0.991665364379851


In [18]:
# Report the importance score the tuned tree assigns to each input feature.
importance_line = 'Feature - "{0}" - Importance Score {1} : '
for feature, score in zip(X.columns, best_tree_clf.feature_importances_):
    print(importance_line.format(feature, score))

Feature - "SepalLengthCm" - Importance Score 0.017085151173078687 : 
Feature - "SepalWidthCm" - Importance Score 0.0 : 
Feature - "PetalLengthCm" - Importance Score 0.9038067790135142 : 
Feature - "PetalWidthCm" - Importance Score 0.07910806981340704 : 


In [19]:
# Held-out test-set metrics for the tuned tree.
pred_y_test = best_tree_clf.predict(X_test)
printCLFAccuracyScores(y_test, pred_y_test)

conf_matrix :  [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
precision_score :  1.0
recall_score :  1.0
fl_score :  1.0
