In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the Iris CSV into a DataFrame.
# NOTE(review): this is an absolute path on one specific machine, so the
# notebook will not run anywhere else as-is; consider a path relative to the
# notebook or a configurable data directory.
data = pd.read_csv("/Users/vedantsingh/Downloads/iris-species/Iris.csv")

In [3]:
# Remove the row-identifier column. The frame is rebound (not mutated in
# place) and a missing 'Id' is tolerated, so re-running this cell is harmless
# instead of raising KeyError on the second execution.
data = data.drop(columns = ['Id'], errors = 'ignore')
# Column names after the drop; do_split indexes this list to print the name
# of the feature chosen for a split.
col = list(data.columns.values)
# Plain ndarray of the table; the tree functions below slice it positionally
# and treat the last column as the label.
train_data = data.to_numpy()

In [4]:
def isPure(data):
    """Tell whether all rows of `data` share a single label.

    The label is read from the last column of the 2-D array `data`.
    Returns True when exactly one distinct label is present, else False
    (an array with no rows therefore yields False).
    """
    distinct_labels = np.unique(data[:, -1])
    return len(distinct_labels) == 1

def predict(data):
    """Return the majority label of `data` and how many rows carry it.

    Labels are taken from the last column. The result is a pair
    (label, count). On a tie the label that sorts first wins, because
    np.unique returns sorted labels and argmax picks the first maximum.
    """
    classes, freqs = np.unique(data[:, -1], return_counts=True)
    winner = np.argmax(freqs)
    return classes[winner], freqs[winner]

In [5]:
def potential_splits(data):
    """Collect the candidate thresholds for every feature column.

    Every column except the last (the label) is a feature. Returns a dict
    mapping the feature's column index to the sorted array of distinct
    values found in that column.
    """
    _, width = data.shape
    return {idx: np.unique(data[:, idx]) for idx in range(width - 1)}

In [6]:
def entropy(data):
    """Shannon entropy, in bits, of the label column of `data`.

    Labels are read from the last column. Each distinct label contributes
    p * -log2(p), where p is its relative frequency.
    """
    _, class_counts = np.unique(data[:, -1], return_counts=True)
    p = class_counts / class_counts.sum()
    surprisal = -np.log2(p)
    return sum(p * surprisal)

In [7]:
def total_entropy(left, right):
    """Size-weighted mean of the entropies of the two partitions.

    Each side's entropy is weighted by the fraction of all rows
    (len(left) + len(right)) that fall on that side.
    """
    n_left, n_right = len(left), len(right)
    n_all = n_left + n_right
    weight_left = n_left / n_all
    weight_right = n_right / n_all

    return weight_left * entropy(left) + weight_right * entropy(right)

In [8]:
def split(data, column, value):
    """Partition the rows of `data` on a threshold in one feature column.

    Returns (left, right): rows whose entry in `column` is <= value, and
    rows whose entry is > value.
    """
    feature = data[:, column]
    at_or_below = feature <= value
    above = feature > value

    return data[at_or_below], data[above]

In [9]:
def gain_ratio(data, left, right):
    """Gain ratio of splitting `data` into `left` and `right`.

    gain ratio = information gain / split information, where
    information gain = entropy(data) - total_entropy(left, right) and
    split information is the entropy (in bits) of the left/right size
    proportions.

    Returns NaN when either side is empty. For such a degenerate split the
    split information is zero and the ratio is undefined. NaN compares False
    against every number, so a `ratio > best` search never selects it.
    Returning it up front avoids evaluating 0 * -log2(0), which produces the
    same NaN but emits "divide by zero" / "invalid value" RuntimeWarnings.
    """
    n_left, n_right = len(left), len(right)
    if n_left == 0 or n_right == 0:
        return float("nan")

    info_gain = entropy(data) - total_entropy(left, right)

    total = n_left + n_right
    p_left = n_left / total
    p_right = n_right / total

    # Both proportions are strictly between 0 and 1 here, so split_info > 0.
    split_info = (p_left * -np.log2(p_left)) + (p_right * -np.log2(p_right))
    return info_gain / split_info

In [10]:
def best_split(data):
    """Find the (column, threshold) split of `data` with the highest gain ratio.

    Every distinct value of every feature column (see potential_splits) is
    tried as a threshold. Returns (column_index, threshold, gain_ratio) for
    the best candidate; on ties the first one encountered is kept.

    Raises:
        ValueError: if no candidate separates the rows into two non-empty
            parts (e.g. all feature values are identical, or `data` has no
            rows), so there is nothing to split on.
    """
    best_column, best_value = None, None
    best_ratio = -np.inf  # any real gain ratio beats this

    for column, thresholds in potential_splits(data).items():
        for threshold in thresholds:
            left, right = split(data, column, threshold)

            # A threshold that leaves one side empty (typically the column's
            # maximum) is not a real split; its gain ratio is undefined, so
            # skip it rather than score it.
            if len(left) == 0 or len(right) == 0:
                continue

            ratio = gain_ratio(data, left, right)
            if ratio > best_ratio:
                best_column = column
                best_value = threshold
                best_ratio = ratio

    if best_column is None:
        raise ValueError("best_split: no threshold separates the data into two non-empty parts")
    return best_column, best_value, best_ratio

In [11]:
def do_split(data, counter):
    """Report an impure node and recurse into its two children.

    Prints the node's level (`counter`), per-class row counts, entropy, and
    the feature chosen by best_split together with its gain ratio, then
    calls printTree on the left and right partitions at level counter + 1.
    The feature name is looked up in the module-level list `col`.
    """
    # Choose and apply the split before printing anything, so a failure in
    # the search leaves no partial report for this node.
    chosen_column, threshold, ratio = best_split(data)
    left, right = split(data, chosen_column, threshold)

    print ("Level {}".format(counter))
    classes, freqs = np.unique(data[:, -1], return_counts = True)
    for class_name, n_rows in zip(classes, freqs):
        print ("Count of {} = {}".format(class_name, n_rows))
    print ("Current entropy is = {}".format(entropy(data)))
    print ("Splitting on feature {} with gain ratio {}".format(col[chosen_column], ratio))
    print (" ")

    for child in (left, right):
        printTree(child, counter + 1)

In [12]:
def printTree(data, counter):
    """Print the decision tree grown from `data`, depth-first.

    `counter` is the level of the current node (the root call passes 0).
    An impure node is handed to do_split, which prints it and recurses.
    A pure node (all rows share one label) is printed as a leaf with its
    label, row count and an entropy of 0.0.
    """
    if not isPure(data):
        do_split(data, counter)
        return

    # Leaf: predict returns the single remaining label and its row count.
    pred, count = predict(data)
    print ("Level {}".format(counter))
    print ("Count of {} = {}".format(pred, count))
    print ("Current Entropy = 0.0")
    print ("Reached Leaf Node") 
    print (" ")

In [13]:
printTree(train_data, 0)

Level 0
Count of Iris-setosa = 50
Count of Iris-versicolor = 50
Count of Iris-virginica = 50
Current entropy is = 1.584962500721156
Splitting on feature PetalLengthCm with gain ratio 0.9999999999999999
 
Level 1
Count of Iris-setosa = 50
Current Entropy = 0.0
Reached Leaf Node
 
Level 1
Count of Iris-versicolor = 50
Count of Iris-virginica = 50
Current entropy is = 1.0
Splitting on feature PetalWidthCm with gain ratio 0.6933647985912663
 
Level 2
Count of Iris-versicolor = 49
Count of Iris-virginica = 5
Current entropy is = 0.44506485705083865
Splitting on feature PetalLengthCm with gain ratio 0.606617822020301
 
Level 3
Count of Iris-versicolor = 49
Count of Iris-virginica = 3
Current entropy is = 0.31821529768323314
Splitting on feature PetalLengthCm with gain ratio 0.2720453440631925
 
Level 4
Count of Iris-versicolor = 47
Count of Iris-virginica = 1
Current entropy is = 0.1460942501201363
Splitting on feature PetalWidthCm with gain ratio 1.0
 
Level 5
Count of Iris-versicolor = 47


  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
