__[Video Tutorial](https://www.youtube.com/watch?v=sgQAhG5Q7iY&list=PLM8wYQRetTxAl5FpMIJCcJbfZjSB0IeC_&index=2)__

<b>Import tools</b>

In [170]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from IPython.display import display

In [171]:
# Load the iris measurements into a DataFrame and append the class label
# as the last column, named 'target'.
iris = load_iris()
dataset = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)
display(dataset)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


<b>Node Class</b>

In [172]:
class Node:
    """A single node of the decision tree.

    A decision node carries ``featureIndex``, ``threshold``, ``left``,
    ``right`` and ``infoGain``; a leaf node carries only ``value``.
    Every field defaults to ``None``.
    """

    def __init__(self, featureIndex=None, threshold=None, left=None, right=None, infoGain=None, value=None):
        # Decision-node fields: which feature is tested against which
        # threshold, the two child nodes, and the gain recorded for the split.
        self.featureIndex, self.threshold = featureIndex, threshold
        self.left, self.right = left, right
        self.infoGain = infoGain

        # Leaf-node field: the label stored at this leaf.
        self.value = value

In [173]:
class Split:
    """Record describing one candidate split of a dataset.

    Attributes:
        featureIndex: Column index of the feature that was tested.
        threshold: Value the feature was compared against.
        datasetLeft: Rows placed on the left side of the split.
        datasetRight: Rows placed on the right side of the split.
        infoGain: Gain computed for this split.
    """

    def __init__(self, featureIndex: int, threshold: float, datasetLeft: np.ndarray, datasetRight: np.ndarray, infoGain: float):
        # The two partitions are arrays of dataset rows, not tree nodes.
        self.featureIndex = featureIndex
        self.threshold = threshold
        self.datasetLeft = datasetLeft
        self.datasetRight = datasetRight
        self.infoGain = infoGain

In [174]:
class DecisionTreeClassifier:
    """Binary decision-tree classifier grown greedily by impurity reduction.

    Every decision node tests ``feature <= threshold``; every leaf stores the
    most frequent label among the training rows that reached it.
    """

    def __init__(self, minSamplesSplit=2, maxDepth=2, criterion="gini") -> None:
        """Configure the stopping rules and the impurity measure.

        Args:
            minSamplesSplit: A node holding fewer rows than this becomes a leaf.
            maxDepth: A node is split only while its depth (root = 0) is
                ``<= maxDepth``, so the deepest leaves sit at depth
                ``maxDepth + 1``.
            criterion: ``"gini"`` selects Gini impurity; any other value
                selects entropy (the rule applied by ``calcInfoGain``).
        """
        self.root = None
        self.minSamplesSplit = minSamplesSplit
        self.maxDepth = maxDepth
        self.criterion = criterion

    def buildTree(self, dataset, currDepth=0):
        """Recursively grow the subtree for ``dataset`` and return its root node.

        Args:
            dataset: 2-D array whose last column is the label and whose other
                columns are the features.
            currDepth: Depth of the node being built (root = 0).

        Returns:
            A decision ``Node`` when a split with positive gain exists and the
            stopping rules allow it, otherwise a leaf ``Node``.
        """
        x, y = dataset[:, :-1], dataset[:, -1]
        numSamples, numFeatures = np.shape(x)

        if numSamples >= self.minSamplesSplit and currDepth <= self.maxDepth:
            bestSplit = self.getBestSplit(dataset, numFeatures)
            # getBestSplit returns None when no threshold separates the rows
            # (e.g. all feature values identical); fall through to a leaf.
            if bestSplit is not None and bestSplit.infoGain > 0:
                left = self.buildTree(bestSplit.datasetLeft, currDepth + 1)
                right = self.buildTree(bestSplit.datasetRight, currDepth + 1)
                return Node(bestSplit.featureIndex, bestSplit.threshold, left, right, bestSplit.infoGain)

        return Node(value=self.calcLeafValue(y))

    def getBestSplit(self, dataset, numFeatures) -> "Split | None":
        """Search every feature/threshold pair for the highest-gain split.

        Args:
            dataset: 2-D array whose last column is the label.
            numFeatures: Number of leading columns to consider as features.

        Returns:
            The best ``Split`` found, or ``None`` when no threshold leaves
            rows on both sides.
        """
        bestSplit = None
        maxInfoGain = -float('inf')
        y = dataset[:, -1]

        for featureIndex in range(numFeatures):
            # Every distinct observed value is tried as a threshold.
            for threshold in np.unique(dataset[:, featureIndex]):
                datasetLeft, datasetRight = self.split(dataset, featureIndex, threshold)
                if len(datasetLeft) == 0 or len(datasetRight) == 0:
                    continue

                currInfoGain = self.calcInfoGain(y, datasetLeft[:, -1], datasetRight[:, -1], self.criterion)
                # Strict '>' keeps the first split found among equal gains.
                if currInfoGain > maxInfoGain:
                    bestSplit = Split(featureIndex, threshold, datasetLeft, datasetRight, currInfoGain)
                    maxInfoGain = currInfoGain

        return bestSplit

    def split(self, dataset, featureIndex, threshold):
        """Partition rows by ``dataset[:, featureIndex] <= threshold``.

        Returns:
            ``(datasetLeft, datasetRight)``: rows satisfying the test and rows
            whose feature value is strictly greater than ``threshold``.
        """
        column = dataset[:, featureIndex]
        # Boolean masks select whole rows in one vectorised step.
        return dataset[column <= threshold], dataset[column > threshold]

    def calcInfoGain(self, parent, lChild, rChild, mode="entropy"):
        """Return parent impurity minus the size-weighted impurity of the children.

        Args:
            parent: Labels before the split.
            lChild: Labels in the left partition.
            rChild: Labels in the right partition.
            mode: ``"gini"`` uses Gini impurity; anything else uses entropy.
        """
        weightL = len(lChild) / len(parent)
        weightR = len(rChild) / len(parent)

        impurity = self.gini if mode == "gini" else self.entropy
        return impurity(parent) - (weightL * impurity(lChild) + weightR * impurity(rChild))

    def entropy(self, y):
        """Return the base-2 Shannon entropy of the label array ``y``."""
        entropy = 0

        for typeLabel in np.unique(y):
            pType = len(y[y == typeLabel]) / len(y)
            entropy += -pType * np.log2(pType)

        return entropy

    def gini(self, y):
        """Return the Gini impurity ``1 - sum(p_i ** 2)`` of the label array ``y``."""
        gini = 0

        for typeLabel in np.unique(y):
            pType = len(y[y == typeLabel]) / len(y)
            gini += pType ** 2

        return 1 - gini

    def calcLeafValue(self, y):
        """Return the most frequent label in ``y`` (first seen wins a tie)."""
        y = list(y)
        return max(y, key=y.count)

    def printTree(self, node=None, indent=" ", columns=None) -> None:
        """Print the subtree rooted at ``node`` (the whole tree by default).

        Args:
            node: Node to start from; ``None`` means ``self.root``.
            indent: Prefix for the child lines; it doubles at every level.
            columns: Optional sequence of feature names indexed by feature
                index. When missing or empty the numeric index is printed.
        """
        if node is None:
            node = self.root

        if node.value is not None:
            print(node.value)
            return

        # Guard against the default columns=None as well as empty sequences.
        hasNames = columns is not None and len(columns) > 0
        label = columns[node.featureIndex] if hasNames else node.featureIndex
        print("X_" + str(label), "<=", node.threshold, "?", node.infoGain)
        print("%sleft:" % (indent), end="")
        self.printTree(node.left, indent + indent, columns)
        print("%sright:" % (indent), end="")
        self.printTree(node.right, indent + indent, columns)

    def fit(self, x, y):
        """Grow the tree from features ``x`` and labels ``y``.

        Args:
            x: 2-D array-like of shape (samples, features).
            y: Labels, either a column of shape (samples, 1) or a 1-D
                sequence of length ``samples``.
        """
        y = np.asarray(y)
        if y.ndim == 1:
            # Accept flat label vectors by turning them into a column.
            y = y.reshape(-1, 1)
        dataset = np.concatenate((x, y), axis=1)
        self.root = self.buildTree(dataset)

    def predict(self, x):
        """Return a list with the predicted label for every row of ``x``."""
        return [self.makePrediction(row, self.root) for row in x]

    def makePrediction(self, x, node: "Node"):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its label."""
        if node.value is not None:
            return node.value

        if x[node.featureIndex] <= node.threshold:
            return self.makePrediction(x, node.left)
        return self.makePrediction(x, node.right)
        

<b>Train-test split</b>

In [175]:
from sklearn.model_selection import train_test_split

# Features are every column except the last; the last column is the label,
# kept as a single-column 2-D array.
x = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy().reshape(-1, 1)

# Hold out 20% of the rows for testing, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=41
)

<b>Fit the model</b>

In [176]:
# Train the tree on the training rows, then print its structure using the
# DataFrame's column names as feature names.
classifier = DecisionTreeClassifier(maxDepth=3, minSamplesSplit=3)
classifier.fit(x_train, y_train)
classifier.printTree(columns=dataset.columns.to_numpy())

X_petal length (cm) <= 1.9 ? 0.33741385372714494
 left:0.0
 right:X_petal width (cm) <= 1.5 ? 0.427106638180289
  left:X_petal length (cm) <= 4.9 ? 0.05124653739612173
    left:1.0
    right:2.0
  right:X_petal length (cm) <= 5.0 ? 0.019631171921475288
    left:X_sepal width (cm) <= 2.8 ? 0.20833333333333334
        left:2.0
        right:1.0
    right:2.0


<b>Test the model</b>

In [179]:
from sklearn.metrics import accuracy_score

# Predict the held-out rows and report the fraction classified correctly.
yPred = classifier.predict(x_test)
accuracy_score(y_true=y_test, y_pred=yPred)

0.9333333333333333