# Imports

In [9]:
import numpy as np 
import pandas as pd 
import time
from math import log
import xml.etree.ElementTree as ET

------------------------------

The main idea is to refer to columns by their header names rather than by positional index, which is why we use pandas.

# Helper Function

In [2]:
def entropy(target_col, n_class):
    """
    This function calculates the entropy of a column of class labels

    @param target_col: The column where the target values are stored
    @param n_class: Logarithm base (with the number of classes as base, a
                    uniform distribution over n_class labels has entropy 1)

    return: entropy of the target values; 0.0 when the column is empty or
            holds a single distinct value
    """
    _, counts = np.unique(target_col, return_counts=True)

    # A pure (or empty) column carries no uncertainty whatever the base is.
    # Returning early also avoids log(1, 1), which raises ZeroDivisionError
    # when a single-class dataset yields n_class == 1.
    if len(counts) <= 1:
        return 0.0

    # Sum the counts once instead of once per class.
    total = np.sum(counts)
    probabilities = [count / total for count in counts]
    return np.sum([-p * log(p, n_class) for p in probabilities])
    
def InfoGain(data,split_attribute_name, n_class, target_name="Play Tennis"):
    """
    This function computes the information gain of a feature by subtracting the weighted
    entropy of the target within each value of the feature from the total entropy

    @param data: input data set (pandas DataFrame)
    @param split_attribute_name: feature column
    @param n_class: for log base
    @param target_name: name of target column

    return: information gain
    """
    #Compute the entropy of the original dataset
    total_entropy = entropy(data[target_name], n_class)

    #Calculate the values and the corresponding counts for the split attribute
    split_col = data[split_attribute_name]
    vals, counts = np.unique(split_col, return_counts=True)
    n_rows = np.sum(counts)

    #Calculate the weighted entropy.
    #Rows are selected with a boolean mask on the target column only: the former
    #data.where(...).dropna() also discarded rows holding a NaN in any unrelated
    #column, so the subset no longer matched the count used as its weight.
    weighted_entropy = np.sum([
        (count / n_rows) * entropy(data.loc[split_col == val, target_name], n_class)
        for val, count in zip(vals, counts)
    ])

    #Calculate the information gain by subtracting weighted entropy from total entropy
    return total_entropy - weighted_entropy

def _majority_class(target_col):
    """Return the most frequent value of target_col (first in sorted order on ties)."""
    classes, counts = np.unique(target_col, return_counts=True)
    return classes[np.argmax(counts)]


def ID3_xml(data,originaldata,features,n_class,target_attribute_name="Play Tennis",best_feature = None,
            value = None, parent_node_class = None, space = ''):
    """
    This function computes the ID3 algorithm of a decision tree and prints a trace
    line for every visited node

    @param data: Data that the algorithm is currently running on
    @param originaldata: Dataset of the parent call; its majority class is used when data is empty
    @param features: A sequence containing feature column names
    @param n_class: Number of classes, used as log base
    @param target_attribute_name: Column name where the target values are stored.
    @param best_feature: Feature the parent node was split on (None at the root)
    @param value: The value of best_feature that leads to this node (None at the root)
    @param parent_node_class: Majority target value of the parent node
    @param space: Prefix for the printed leaf lines; one blank is appended for a leaf

    return: a class label for a leaf, otherwise a nested dict of the form
            {feature: {feature_value: subtree_or_class_label}}

    reference: https://www.python-course.eu/Decision_Trees.php
    """
    #Stopping criteria for creating a leaf node
    #Return the mode target value of the parent dataset if the dataset is empty.
    #This has to be tested first: an empty dataset also has <= 1 unique target
    #values, so the purity test below would index into an empty array.
    if len(data) == 0:
        if len(originaldata) == 0:
            #Nothing to vote on, fall back to whatever the caller supplied
            return parent_node_class
        return _majority_class(originaldata[target_attribute_name])

    target_col = data[target_attribute_name]
    target_classes = np.unique(target_col)

    #If all target_values have the same value, return this value, because entropy will be 0
    if len(target_classes) <= 1:
        space += ' '
        ent = entropy(target_col, n_class)
        target_val = target_classes[0]
        print(space+'entropy="{}"feature="{}"value"{}"class"{}"'.format(ent,best_feature, value, target_val))
        return target_val

    #If the feature space is empty, return the mode target feature value of the direct parent node
    if len(features) == 0:
        return parent_node_class

    #Grow tree
    print()
    #The root has no incoming branch, so there is nothing to report for it
    if value is not None:
        ent = entropy(target_col, n_class)
        print('entropy="{}"feature="{}"value"{}"'.format(ent,best_feature, value))

    #Set the default value for the children of this node
    parent_node_class = _majority_class(target_col)

    #Compute the gain of each feature and keep the one with the highest gain
    gains = [InfoGain(data, feature, n_class, target_attribute_name) for feature in features]
    best_feature = features[np.argmax(gains)]

    #The root gets the name of the feature (best_feature) with the maximum info gain
    tree = {best_feature: {}}

    #Remove the chosen feature from the feature space handed to the sub trees
    remaining_features = [name for name in features if name != best_feature]

    #Grow a branch under the root node for each possible value of the root node feature
    for branch_value in np.unique(data[best_feature]):
        #Boolean indexing keeps every matching row and the original dtypes;
        #where(...).dropna() dropped rows with a NaN in any other column.
        sub_data = data[data[best_feature] == branch_value]

        #Recursively compute the ID3 algorithm for the sub dataset
        subtree = ID3_xml(sub_data, data, remaining_features, n_class, target_attribute_name,
                          best_feature, branch_value, parent_node_class, space)

        #Add the sub tree to the tree under the root node (nesting dictionary)
        tree[best_feature][branch_value] = subtree

    return tree

# Data Processing

In [3]:
# Load the training set from disk and report how many columns were parsed.
input_path = 'simple.csv'
input_ds = pd.read_csv(filepath_or_buffer=input_path)
n_cols = input_ds.shape[1]
print("Number of columns:", n_cols)

Number of columns: 5


In [4]:
# Preview the first five rows to sanity-check the parsed columns.
print(input_ds.head(n=5))

    Outlook Temperature Humidity  Windy  Play Tennis
0     Sunny         Hot     High    Weak         No 
1     Sunny         Hot     High  Strong         No 
2  Overcast         Hot     High    Weak         Yes
3     Rainy        Mild     High    Weak         Yes
4     Rainy        Cool   Normal    Weak         Yes


In [5]:
# List the distinct values found in every column of the dataset.
for headings, column in input_ds.items():
    print("Values in {} is: {}".format(headings, np.unique(column)))

# The number of distinct target labels is later passed on as the log base.
target_labels = set(input_ds['Play Tennis'])
n_class = len(target_labels)
print("Number of classes: ", n_class)

Values in Outlook is: ['Overcast' 'Rainy' 'Sunny']
Values in Temperature is: ['Cool' 'Hot' 'Mild']
Values in Humidity is: ['High' 'Normal']
Values in Windy  is: ['Strong' 'Weak']
Values in Play Tennis is: ['No ' 'Yes']
Number of classes:  2


-----------------------------------------

In [6]:
# Entropy of the full target column before any split.
target_column = input_ds['Play Tennis']
init_ent = entropy(target_column, n_class)
print('Entropy: "{}"'.format(init_ent))

# Every column except the last one is offered as a candidate feature.
feature_columns = input_ds.columns[:-1]
tree = ID3_xml(data=input_ds, originaldata=input_ds, features=feature_columns, n_class=n_class)

Entropy: "0.9402859586706309"

 entropy="0.0"feature="Outlook"value"Overcast"class"Yes"

entropy="0.9709505944546686"feature="Outlook"value"Rainy"
 entropy="0.0"feature="Windy "value"Strong"class"No "
 entropy="0.0"feature="Windy "value"Weak"class"Yes"

entropy="0.9709505944546686"feature="Outlook"value"Sunny"
 entropy="0.0"feature="Humidity"value"High"class"No "
 entropy="0.0"feature="Humidity"value"Normal"class"Yes"


In [7]:
# Print each distinct Outlook value on its own line, in sorted order.
outlook_values = np.unique(input_ds['Outlook'])
for value in outlook_values:
    print(value)

Overcast
Rainy
Sunny


In [8]:
# Show the learned tree: nested dicts keyed by feature name, then by feature value.
print(tree)

{'Outlook': {'Overcast': 'Yes', 'Rainy': {'Windy ': {'Strong': 'No ', 'Weak': 'Yes'}}, 'Sunny': {'Humidity': {'High': 'No ', 'Normal': 'Yes'}}}}
