# Decision Tree Classifier on Iris Dataset

## Import modules

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import random
from pprint import pprint

## Load And Prepare Data

In [2]:
# Read the Iris dataset from a CSV file; the path is relative to the
# current working directory.
df=pd.read_csv("Datasets/Iris.csv")
# Show the first rows as a quick sanity check of the loaded columns.
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Drop "Id": the tree-building helpers below treat every column except the
# last one as a feature, so a row identifier must not stay in the frame.
df=df.drop("Id",axis=1)
# Rename the target column to "label"; calculate_accuracy reads df.label and
# the array helpers take the label from the last column.
df=df.rename(columns={"Species":"label"})
# Print column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SepalLengthCm  150 non-null    float64
 1   SepalWidthCm   150 non-null    float64
 2   PetalLengthCm  150 non-null    float64
 3   PetalWidthCm   150 non-null    float64
 4   label          150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


## Train-Test Split

In [4]:
def train_test_split(df, test_size):
    """Randomly partition ``df`` into a training frame and a test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; rows are selected by index label.
    test_size : int or float
        Number of rows to hold out. A float is read as a fraction of
        ``len(df)`` and rounded to a whole number of rows.

    Returns
    -------
    tuple
        ``(train_df, test_df)``; the two frames share no index labels.
    """
    n_held_out = round(test_size * len(df)) if isinstance(test_size, float) else test_size

    # Sampling uses the stdlib ``random`` module, so ``random.seed`` makes
    # the split reproducible.
    held_out_labels = random.sample(population=df.index.tolist(), k=n_held_out)

    return df.drop(held_out_labels), df.loc[held_out_labels]

In [5]:
# Fix the seed of the stdlib RNG used by train_test_split so the same rows
# are held out on every run.
random.seed(0)
# An integer test_size is taken as an absolute row count: 20 test rows.
train_df,test_df=train_test_split(df,test_size=20)

In [6]:
len(test_df)

20

In [7]:
len(train_df)

130

In [8]:
test_df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,label
98,5.1,2.5,3.0,1.1,Iris-versicolor
107,7.3,2.9,6.3,1.8,Iris-virginica
10,5.4,3.7,1.5,0.2,Iris-setosa
66,5.6,3.0,4.5,1.5,Iris-versicolor
130,7.4,2.8,6.1,1.9,Iris-virginica


## Convert the DataFrame to a NumPy Array

In [9]:
# The helper functions below index with data[:, i], so they operate on the
# underlying 2-D NumPy array rather than on the DataFrame itself.
data=train_df.values
# Peek at the first five rows (features followed by the label).
data[:5]

array([[5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
       [4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
       [4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
       [4.6, 3.1, 1.5, 0.2, 'Iris-setosa'],
       [5.0, 3.6, 1.4, 0.2, 'Iris-setosa']], dtype=object)

## Check Purity of Data

In [10]:
def check_purity(data):
    """Tell whether every row of ``data`` carries the same class label.

    ``data`` is a 2-D array whose last column holds the label.
    Returns ``True`` when exactly one distinct label is present and
    ``False`` otherwise.
    """
    distinct_labels = np.unique(data[:, -1])
    return len(distinct_labels) == 1

In [11]:
check_purity(data)

False

In [12]:
check_purity(train_df[train_df.PetalWidthCm < 0.8].values)

True

## Classify Data

In [13]:
def classify_data(data):
    """Return the most frequent label found in the last column of ``data``.

    ``data`` is a 2-D array whose last column holds the class label.
    """
    labels, frequencies = np.unique(data[:, -1], return_counts=True)

    # np.unique returns the labels sorted and argmax picks the first maximum,
    # so a tie is resolved in favour of the label that sorts first.
    return labels[np.argmax(frequencies)]

In [14]:
classify_data(data)

'Iris-setosa'

In [15]:
classify_data(train_df[(train_df.PetalWidthCm>0.8)&(train_df.PetalWidthCm<2)].values)

'Iris-versicolor'

## Potential Splits

In [16]:
def get_potential_splits(data):
    """List the candidate split thresholds for every feature column.

    Every column of ``data`` except the last (the label) is treated as a
    feature. For each feature the candidates are the midpoints between
    consecutive distinct values.

    Returns
    -------
    dict
        Maps column index to a list of thresholds in ascending order. A
        feature with a single distinct value maps to an empty list.
    """
    _, total_columns = data.shape
    potential_splits = {}

    for column_index in range(total_columns - 1):
        # np.unique returns the distinct values already sorted.
        distinct_values = np.unique(data[:, column_index])
        potential_splits[column_index] = [
            (lower + upper) / 2
            for lower, upper in zip(distinct_values[:-1], distinct_values[1:])
        ]

    return potential_splits

## Split Data

In [17]:
def split_data(data, split_column, split_value):
    """Partition the rows of ``data`` on one feature threshold.

    Returns ``(data_below, data_above)``: the rows whose value in
    ``split_column`` is ``<= split_value``, and the rows whose value is
    ``> split_value``.
    """
    feature_values = data[:, split_column]

    at_or_below = feature_values <= split_value
    strictly_above = feature_values > split_value

    return data[at_or_below], data[strictly_above]

## Lowest Overall Entropy Function
* Calculate Entropy Function
* Calculate Overall Entropy Function
* Determine Best Split

In [18]:
def calculate_entropy(data):
    """Compute the Shannon entropy (in bits) of the labels in ``data``.

    ``data`` is a 2-D array whose last column holds the class label.
    The result is 0 when all rows share one label.
    """
    _, class_counts = np.unique(data[:, -1], return_counts=True)

    class_probabilities = class_counts / class_counts.sum()
    information_content = -np.log2(class_probabilities)

    return sum(class_probabilities * information_content)

In [19]:
def calculate_overall_entropy(data_below, data_above):
    """Return the size-weighted average entropy of the two partitions.

    Each partition's entropy (from ``calculate_entropy``) is weighted by
    the fraction of all rows that fall into that partition.
    """
    n_below = len(data_below)
    n_above = len(data_above)
    n_total = n_below + n_above

    weight_below = n_below / n_total
    weight_above = n_above / n_total

    weighted_below = weight_below * calculate_entropy(data_below)
    weighted_above = weight_above * calculate_entropy(data_above)

    return weighted_below + weighted_above

In [20]:
def determine_best_split(data, potential_splits):
    """Pick the (column, threshold) pair with the lowest overall entropy.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose last column holds the class label.
    potential_splits : dict
        Maps column index to a list of candidate thresholds, as produced
        by ``get_potential_splits``.

    Returns
    -------
    tuple
        ``(best_split_column, best_split_value)``.

    Raises
    ------
    ValueError
        If ``potential_splits`` contains no candidate threshold at all
        (for example when every feature column is constant), because
        there is then nothing to choose from.
    """
    lowest_entropy = float("inf")
    best_split_column = None
    best_split_value = None

    for column_index, candidate_values in potential_splits.items():
        for value in candidate_values:
            data_below, data_above = split_data(data, split_column=column_index, split_value=value)
            current_overall_entropy = calculate_overall_entropy(data_below, data_above)

            # "<=" (not "<") is deliberate: among equally good candidates the
            # one visited last wins.
            if current_overall_entropy <= lowest_entropy:
                lowest_entropy = current_overall_entropy
                best_split_column = column_index
                best_split_value = value

    if best_split_column is None:
        raise ValueError("potential_splits contains no candidate split values")

    return best_split_column, best_split_value

## Decision Tree Algorithm

In [21]:
def decision_tree_algorithm(df , counter = 0 , min_samples = 2 , max_depth = 5):
    """Recursively grow a decision tree from labelled data.

    Parameters
    ----------
    df : pandas.DataFrame or numpy.ndarray
        On the initial call (``counter == 0``) a DataFrame whose last
        column is the label. On recursive calls, the 2-D array of the
        rows that reached this node.
    counter : int
        Current depth; callers normally leave it at 0.
    min_samples : int
        A node with fewer rows than this becomes a leaf.
    max_depth : int
        A node at this depth becomes a leaf.

    Returns
    -------
    str or dict
        A class label for a leaf, otherwise
        ``{"<feature> <= <threshold>": [yes_subtree, no_subtree]}``.

    Notes
    -----
    The initial call stores the column names in the module-level global
    ``column_headers`` so that recursive calls can name the split feature.
    """
    global column_headers

    if counter == 0:
        column_headers = df.columns
        data = df.values
    else:
        data = df

    # Base case: the node is pure, too small, or the depth limit is reached.
    if check_purity(data) or len(data) < min_samples or counter == max_depth:
        return classify_data(data)

    potential_splits = get_potential_splits(data)

    # If every feature is constant there is no threshold left to split on
    # (e.g. identical feature rows with different labels). Fall back to a
    # majority-vote leaf instead of failing while searching for a split.
    if not any(potential_splits.values()):
        return classify_data(data)

    counter += 1

    split_column, split_value = determine_best_split(data, potential_splits)
    data_below, data_above = split_data(data, split_column, split_value)

    feature = column_headers[split_column]
    question = "{} <= {}".format(feature, split_value)

    yes_answer = decision_tree_algorithm(data_below, counter, min_samples, max_depth)
    no_answer = decision_tree_algorithm(data_above, counter, min_samples, max_depth)

    # When both branches give the same answer the question is redundant,
    # so the node collapses into that answer.
    if yes_answer == no_answer:
        return yes_answer

    return {question: [yes_answer, no_answer]}
         

In [22]:
# Grow the tree on the training frame with the default limits
# (min_samples = 2, max_depth = 5).
tree = decision_tree_algorithm( train_df )
# Pretty-print the nested dict: each key is a question, its list holds the
# [yes, no] answers.
pprint(tree)

{'PetalWidthCm <= 0.8': ['Iris-setosa',
                         {'PetalWidthCm <= 1.65': [{'PetalLengthCm <= 4.95': ['Iris-versicolor',
                                                                              {'PetalWidthCm <= 1.55': ['Iris-virginica',
                                                                                                        'Iris-versicolor']}]},
                                                   {'PetalLengthCm <= 4.85': [{'SepalWidthCm <= 3.1': ['Iris-virginica',
                                                                                                       'Iris-versicolor']},
                                                                              'Iris-virginica']}]}]}


## Classification of examples

In [23]:
def classify_example(example, tree):
    """Classify one example by walking the decision tree.

    Parameters
    ----------
    example : mapping
        Anything indexable by feature name (a pandas row, a dict, ...).
    tree : dict or leaf label
        A node of the form ``{"<feature> <= <threshold>": [yes, no]}`` or
        a bare class label. A bare label is returned unchanged, which
        covers trees that collapsed to a single leaf.

    Returns
    -------
    The class label stored in the leaf that the example reaches.

    Raises
    ------
    ValueError
        If a question key does not have the form
        ``"<feature> <= <threshold>"``.
    """
    node = tree

    while isinstance(node, dict):
        question = next(iter(node))

        # Split on the last " <= " rather than on whitespace so that feature
        # names containing spaces are handled correctly.
        feature_name, separator, threshold = question.rpartition(" <= ")
        if not separator:
            raise ValueError("malformed tree question: {!r}".format(question))

        # Ask the question: index 0 is the "yes" branch, index 1 the "no" branch.
        branches = node[question]
        if example[feature_name] <= float(threshold):
            node = branches[0]
        else:
            node = branches[1]

    return node

## Accuracy

In [24]:
def calculate_accuracy(df, tree):
    """Return the fraction of rows in ``df`` that ``tree`` classifies correctly.

    ``df`` must have a ``label`` column. The frame is modified in place:
    a ``classification`` column (the predicted label) and a
    ``classification_correct`` column (prediction equals label) are added.
    """
    predicted_labels = df.apply(classify_example, axis=1, args=(tree,))
    df["classification"] = predicted_labels

    is_correct = df["classification"] == df["label"]
    df["classification_correct"] = is_correct

    return df["classification_correct"].mean()

In [25]:
# Score the tree on the held-out rows. This also adds the "classification"
# and "classification_correct" columns to test_df in place.
calculate_accuracy(test_df , tree)

0.9

In [26]:
test_df

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,label,classification,classification_correct
98,5.1,2.5,3.0,1.1,Iris-versicolor,Iris-versicolor,True
107,7.3,2.9,6.3,1.8,Iris-virginica,Iris-virginica,True
10,5.4,3.7,1.5,0.2,Iris-setosa,Iris-setosa,True
66,5.6,3.0,4.5,1.5,Iris-versicolor,Iris-versicolor,True
130,7.4,2.8,6.1,1.9,Iris-virginica,Iris-virginica,True
124,6.7,3.3,5.7,2.1,Iris-virginica,Iris-virginica,True
103,6.3,2.9,5.6,1.8,Iris-virginica,Iris-virginica,True
77,6.7,3.0,5.0,1.7,Iris-versicolor,Iris-virginica,False
122,7.7,2.8,6.7,2.0,Iris-virginica,Iris-virginica,True
91,6.1,3.0,4.6,1.4,Iris-versicolor,Iris-versicolor,True
