In [2]:
import pandas as pd
import numpy as np

In [4]:
# Labels applied to the loaded columns in place of the CSV's own header row.
col_names = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'type',
]

# skiprows=1 discards the first line of the file and header=None tells pandas
# not to look for a header in what remains, so `names` supplies the labels.
data = pd.read_csv(
    "iris.csv",
    skiprows=1,
    header=None,
    names=col_names,
)

# Last expression of the notebook cell: previews the first rows of the frame.
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,type
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa


## Node Class

In [None]:
class Node:
    """One node of the decision tree; a plain container of attributes.

    Every constructor argument is optional and defaults to ``None``; each is
    stored unchanged on the instance under the same name.

    Attributes:
        feature_index: index of the feature column the node splits on.
        threshold: value of that feature used to partition the data.
        right_node: child node built from the right partition.
        left_node: child node built from the left partition.
        value: value carried by the node (leaves are created with only this).
        info_gain: information gain recorded for the node's split.
    """

    def __init__(
        self,
        feature_index=None,
        threshold=None,
        right_node=None,
        left_node=None,
        value=None,
        info_gain=None,
    ):
        # Description of the split performed at this node.
        self.feature_index, self.threshold = feature_index, threshold
        # Sub-trees hanging off this node.
        self.right_node, self.left_node = right_node, left_node
        # Payload and the gain associated with the split.
        self.value, self.info_gain = value, info_gain

In [None]:
class DecisionTreeClasssifier():
    def __init__(self,min_samples_split,max_depth):
        self.root=None
        self.min_samples_split=min_samples_split
        self.max_depth=max_depth

    def build_tree(self, dataset, curr_depth=0):
        """Recursively build a (sub)tree from ``dataset`` and return its root Node.

        Args:
            dataset: 2-D array whose last column holds the labels and whose
                remaining columns hold the features.
            curr_depth: depth of the node being built; 0 for the root. It is
                incremented by one for every level of recursion.

        Returns:
            A decision Node (feature_index, threshold, both children and
            info_gain set) when a split with positive information gain exists
            and the stopping conditions allow it; otherwise a leaf Node whose
            ``value`` comes from ``self.calculate_leaf_value``.
        """
        X, Y = dataset[:, :-1], dataset[:, -1]
        num_samples, num_features = np.shape(X)

        # Only try to split while the node is large enough and not too deep.
        # The limit is the instance attribute; a bare `max_depth` is undefined.
        if num_samples >= self.min_samples_split and curr_depth <= self.max_depth:
            best_split = self.get_best_split(dataset, num_samples, num_features)

            # get_best_split returns an empty dict when it had no candidate to
            # score, so fall back to a gain of 0 and make a leaf in that case.
            if best_split.get('info_gain', 0) > 0:
                # Children sit one level deeper; without the increment the
                # max_depth limit would never take effect.
                right_node = self.build_tree(best_split['dataset_right'], curr_depth + 1)
                left_node = self.build_tree(best_split['dataset_left'], curr_depth + 1)

                # The info gain of a node is the gain obtained by splitting it
                # into its children, which is why leaves carry none. Keyword
                # arguments keep the gain out of the positional `value` slot.
                return Node(
                    feature_index=best_split['feature_index'],
                    threshold=best_split['threshold'],
                    right_node=right_node,
                    left_node=left_node,
                    info_gain=best_split['info_gain'],
                )

        # No worthwhile split (or a stopping condition hit): make a leaf node.
        leaf_value = self.calculate_leaf_value(Y)
        return Node(value=leaf_value)

    def get_best_split(self, dataset, num_samples, num_features):
        """Scan every (feature, threshold) candidate and keep the highest gain.

        For each column index in ``range(num_features)``, every distinct value
        of that column (``np.unique``) is tried as a threshold. The data is
        partitioned with ``self.split`` and the candidate is scored by
        ``calculate_info_gain(..., "gini")`` on the last-column labels of the
        whole dataset and of the two partitions.

        Args:
            dataset: array indexed as 2-D; its last column is read as labels.
            num_samples: accepted but not used by this method.
            num_features: number of leading columns to scan for thresholds.

        Returns:
            dict with keys 'feature_index', 'threshold', 'dataset_right',
            'dataset_left' and 'info_gain' describing the best candidate. On
            ties the earliest candidate wins (the comparison is strict). The
            dict is empty when no candidate's gain compared greater than -inf,
            e.g. when there was nothing to scan.
        """
        winner = {}
        top_gain = float("-inf")

        for column in range(num_features):
            for cut in np.unique(dataset[:, column]):
                left_part, right_part = self.split(dataset, column, cut)

                parent_labels = dataset[:, -1]
                left_labels = left_part[:, -1]
                right_labels = right_part[:, -1]

                # NOTE(review): this is called as a bare name, unlike
                # self.split above. Confirm a module-level calculate_info_gain
                # exists, or whether self.calculate_info_gain was intended.
                gain = calculate_info_gain(parent_labels, left_labels, right_labels, "gini")

                if gain > top_gain:
                    winner = {
                        'feature_index': column,
                        'threshold': cut,
                        'dataset_right': right_part,
                        'dataset_left': left_part,
                        'info_gain': gain,
                    }
                    top_gain = gain

        return winner