# Imports

In [1]:
import numpy as np 
import pandas as pd 
import time
from math import log
import xml.etree.ElementTree as ET

------------------------------

The main idea is to work with column headings rather than positional indices, which is why we use pandas.

# Helper Function

In [2]:
def entropy(target_col, n_class):
    '''
    Compute the entropy of a target column, using n_class as the log base.

    @param target_col: the column (any array-like) holding the target values
    @param n_class: the number of classes, used as log base

    return: entropy of target_col; 0.0 when it holds at most one distinct value
    '''
    elements, counts = np.unique(target_col, return_counts=True)
    # An empty or single-valued column carries no uncertainty. Returning early
    # also avoids log(1, 1), which raises ZeroDivisionError when n_class == 1.
    if len(elements) <= 1:
        return 0.0
    total = np.sum(counts)  # loop-invariant, so it is computed only once
    ent = 0
    for count in counts:
        ent += (-count/total)*log(count/total, n_class)
    return ent

def gain(data, att_name, n_class, target_col_name = 'class'):
    '''
    Compute the information gain obtained by splitting data on one attribute.

    @param data: DataFrame containing the attribute column and the target column
    @param att_name: name of the attribute column to split on (must be a str)
    @param n_class: the number of classes, used as log base
    @param target_col_name: name of the target column

    return: entropy of the target column minus the weighted entropy of the
            target values within each distinct value of the attribute
    '''
    assert isinstance(att_name, str), "Expected type str for att_name!"

    targets = data[target_col_name]
    att_col = data[att_name]
    total_entropy = entropy(targets, n_class)
    #distinct values of the attribute column and their frequencies
    vals, counts = np.unique(att_col, return_counts=True)
    n_rows = np.sum(counts)
    w_entropy = 0
    for val, count in zip(vals, counts):
        #select the target values of the rows holding this attribute value;
        #a boolean mask is used instead of where().dropna() because dropna()
        #would also discard rows with a missing value in an unrelated column
        att = targets[att_col == val]
        w_entropy += (count/n_rows)*entropy(att, n_class)
    return total_entropy - w_entropy

# Data Processing

In [3]:
# Read the raw data; header=None tells pandas not to treat the first row as column names.
input_path = 'car.csv'
input_ds = pd.read_csv(input_path, header=None)
n_cols = input_ds.shape[1]
print("Number of columns:", n_cols)

Number of columns: 7


## Naming columns 

In [4]:
#every column is named 'att<position>' except the last one,
#which holds the target value and is named 'class'
col_name = [
    'class' if i == n_cols - 1 else 'att{}'.format(i)
    for i in range(n_cols)
]

print(col_name)

['att0', 'att1', 'att2', 'att3', 'att4', 'att5', 'class']


In [5]:
#label the data set with the generated column names, then preview the first rows
input_ds.columns = col_name
print(input_ds.head())

    att0   att1 att2 att3   att4  att5  class
0  vhigh  vhigh    2    2  small   low  unacc
1  vhigh  vhigh    2    2  small   med  unacc
2  vhigh  vhigh    2    2  small  high  unacc
3  vhigh  vhigh    2    2    med   low  unacc
4  vhigh  vhigh    2    2    med   med  unacc


In [6]:
# Show the distinct values found in every column (attributes and target alike).
for headings in input_ds.columns:
    print("Attributes in {} is: {}".format(headings, np.unique(input_ds[headings])))

# The number of distinct target values; passed to entropy() as the log base.
n_class = len(set(input_ds['class']))
print("Number of classes: ", n_class)

Attributes in att0 is: ['high' 'low' 'med' 'vhigh']
Attributes in att1 is: ['high' 'low' 'med' 'vhigh']
Attributes in att2 is: ['2' '3' '4' '5more']
Attributes in att3 is: ['2' '4' 'more']
Attributes in att4 is: ['big' 'med' 'small']
Attributes in att5 is: ['high' 'low' 'med']
Attributes in class is: ['acc' 'good' 'unacc' 'vgood']
Number of classes:  4


-----------------------------------------

# Simulation of the first few steps (not so important)

## First step

First, we calculate the entropy of the class column of the original data.

In [7]:
#first step sanity check: entropy of the target column over the whole data set
total_entropy = entropy(input_ds['class'], n_class)
print(total_entropy)

0.6028704850060875


Assign the feature columns

In [8]:
#every column except the last one (the class column) is a feature
features = col_name[:n_cols - 1]
print(features)

['att0', 'att1', 'att2', 'att3', 'att4', 'att5']


Calculate the information gain of each feature column and pick the feature with the highest gain.

In [9]:
#score every candidate feature by its information gain on the full data set
info_gain = [gain(input_ds, feature, n_class) for feature in features]
#argmax gives the position of the largest gain, which maps back to the feature name
highest_gain_index = np.argmax(info_gain)
best_feature = features[highest_gain_index]

print("Best feature: {}".format(best_feature)) # this will be the root node

Best feature: att5


In [10]:
#drop the chosen feature from the list of candidates
features = [name for name in features if name != best_feature]
print(features)

['att0', 'att1', 'att2', 'att3', 'att4']


In [11]:
#split the data on each distinct value of the best feature column

sub_data_list = [] #subsets that still contain more than one class
value_list = [] #feature values that selected those subsets
best_feature_col = input_ds[best_feature]
for value in np.unique(best_feature_col):
    sub_data = input_ds.where(best_feature_col==value).dropna()
    ent = entropy(sub_data['class'], n_class)
    if len(np.unique(sub_data['class'])) > 1:
        #several classes remain: keep the subset so the tree can grow from it
        print("entropy = {}, feature = {}, value = {}".format(ent, best_feature, value))
        sub_data_list.append(sub_data)
        value_list.append(value)
    else:
        #a single class remains: this branch becomes a leaf, no further split is needed
        target = np.unique(sub_data['class'])[0]
        print("entropy = {}, feature = {}, value = {}, class = {}".format(ent, best_feature, value, target))

entropy = 0.8077559247898061, feature = att5, value = high
entropy = 0.0, feature = att5, value = low, class = unacc
entropy = 0.607578995397061, feature = att5, value = med


In [12]:
#values of the best feature whose subsets still need to be split further
value_list

['high', 'med']

In [13]:
#first subset kept for further splitting (rows where the best feature equals value_list[0])
sub_data_list[0]

Unnamed: 0,att0,att1,att2,att3,att4,att5,class
2,vhigh,vhigh,2,2,small,high,unacc
5,vhigh,vhigh,2,2,med,high,unacc
8,vhigh,vhigh,2,2,big,high,unacc
11,vhigh,vhigh,2,4,small,high,unacc
14,vhigh,vhigh,2,4,med,high,unacc
17,vhigh,vhigh,2,4,big,high,unacc
20,vhigh,vhigh,2,more,small,high,unacc
23,vhigh,vhigh,2,more,med,high,unacc
26,vhigh,vhigh,2,more,big,high,unacc
29,vhigh,vhigh,3,2,small,high,unacc


In [14]:
#entropy of the target column within the first subset kept for further splitting
ent = entropy(sub_data_list[0]['class'], n_class)
val_tmp = value_list[0]
print("Entropy for value :{} from feature: {} is: {}".format(val_tmp, best_feature, ent))

Entropy for value :high from feature: att5 is: 0.8077559247898061


In [15]:
#calculate gain within the first sub data set

#the feature already used for the first split ('att5') is left out of the candidates
features = col_name[:n_cols-1]
features_tmp = [i for i in features if i!='att5']

#loop through the remaining feature columns to calculate gain
info_gain = [gain(sub_data_list[0], feature, n_class) for feature in features_tmp]
#the gains were computed over features_tmp, so the argmax position has to be
#looked up in features_tmp as well (features still contains 'att5', which
#would shift the positions if it were not the last entry)
highest_gain_index = np.argmax(info_gain)
best_feature = features_tmp[highest_gain_index]

print("Best feature: {}".format(best_feature)) # best split for the first sub data set

Best feature: att3


---------------------------

# Main

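Rewrite the functions after debugging. Note that this cell redefines `entropy`, replacing the helper defined earlier in the notebook.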

In [16]:
def entropy(target_col, n_class):
    """
    This function calculates the entropy of a column of target values

    @param target_col: The column (any array-like) where the target values are stored
    @param n_class: For log base

    return: entropy of the target values; exactly 0.0 when the column holds
            at most one distinct value
    """
    values, counts = np.unique(target_col, return_counts=True)
    # An empty or single-valued column has no uncertainty. Returning early
    # avoids log(1, 1), which raises ZeroDivisionError when n_class == 1, and
    # keeps the result from depending on how a lone -0.0 term is summed.
    if len(values) <= 1:
        return 0.0
    total = np.sum(counts)  # computed once instead of once per class
    return np.sum([(-count/total)*log(count/total, n_class) for count in counts])
    
def info_gain(data,split_attribute_name, n_class, target_name):
    """
    This function computes the information gain of a feature by subtracting the weighted
    entropy of the target values under each value of the feature from the total entropy

    @param data: input data set (DataFrame)
    @param split_attribute_name: name of the feature column to split on
    @param n_class: for log base
    @param target_name: name of target column

    return: information gain
    """
    targets = data[target_name]
    split_col = data[split_attribute_name]

    #Compute the entropy of the target column before splitting
    total_entropy = entropy(targets, n_class)

    #Distinct values of the split attribute and how often each one occurs
    vals, counts = np.unique(split_col, return_counts=True)
    n_rows = np.sum(counts)

    #Weighted entropy: each subset's entropy weighted by its share of the rows.
    #Rows are selected with a boolean mask instead of where().dropna(), because
    #dropna() would also discard rows with a missing value in an unrelated column.
    weighted_entropy = np.sum([
        (count/n_rows)*entropy(targets[split_col == val], n_class)
        for val, count in zip(vals, counts)
    ])

    #Calculate the information gain by subtracting weighted entropy from total entropy
    return total_entropy - weighted_entropy

def ID3_xml(data,originaldata,features,n_class,target_attribute_name, tree_xml, best_feature = None,
            value = None, space = ''):
    """
    This function runs the ID3 algorithm recursively, attaching one XML node per
    tree node to tree_xml and printing a line for every node it creates

    @param data: Data that the algorithm is currently running on
    @param originaldata: Not used by this function; kept so existing calls keep working
    @param features: Feature column names still available for splitting
    @param n_class: Number of class as log base
    @param target_attribute_name: Column name where the target values are stored.
    @param tree_xml: XML element under which the node for `data` is attached
    @param best_feature: Feature whose value selected `data` (None on the initial call)
    @param value: Value of best_feature that selected `data` (None on the initial call)
    @param space: Prefix printed before leaf lines; a single blank is appended at each leaf

    return: the class label when `data` ends in a leaf, otherwise tree_xml

    reference: https://www.python-course.eu/Decision_Trees.php
    """
    target_col = data[target_attribute_name]
    classes, class_counts = np.unique(target_col, return_counts=True)
    is_pure = len(classes) <= 1

    #Stopping criteria for creating a leaf node:
    # - all target values are the same (entropy is 0), or
    # - no feature is left to split on; the most frequent class is used then,
    #   instead of failing on np.argmax of an empty list of gains
    if is_pure or len(features) == 0:
        space += ' '
        ent = entropy(target_col, n_class)
        target_val = classes[0] if is_pure else classes[np.argmax(class_counts)]
        #leaf node, attached to the element handed down by the caller
        leaf = ET.SubElement(tree_xml, 'node', entropy=str(ent),feature=str(best_feature),value=str(value))
        leaf.text = str(target_val)
        print(space+'entropy="{}"feature="{}"value"{}"class"{}"'.format(ent,best_feature, value, target_val))
        return target_val

    #Grow tree
    print()
    if value is None:
        #initial call: branches hang directly under the element passed in
        sub_tree = tree_xml
    else:
        ent = entropy(target_col, n_class)
        #inner node describing the branch that led to this data
        sub_tree = ET.SubElement(tree_xml, 'node', entropy=str(ent),feature=str(best_feature),value=str(value))
        print('entropy="{}"feature="{}"value"{}"'.format(ent,best_feature, value))

    #Compute the information gain of each remaining feature
    item_values = [info_gain(data,feature, n_class, target_attribute_name) for feature in features]
    #The feature with the highest gain is used for this split
    best_feature = features[np.argmax(item_values)]

    #Remove the chosen feature from the feature space handed to the sub trees
    features = [i for i in features if i != best_feature]

    #Grow a branch for each value of the chosen feature present in the data
    for value in np.unique(data[best_feature]):
        #Boolean mask instead of where().dropna(): dropna() would also discard
        #rows with a missing value in an unrelated column
        sub_data = data[data[best_feature] == value]

        #Recursively run ID3 on the sub data set; the nodes are attached to sub_tree
        ID3_xml(sub_data,data,features,n_class,target_attribute_name, sub_tree,
                best_feature, value, space)

    return tree_xml

In [17]:
#entropy of the whole data set becomes an attribute of the XML root element
init_ent = entropy(input_ds['class'], n_class)
print('entropy="{}"'.format(init_ent))
tree_xml = ET.Element('tree', entropy=str(init_ent))
#every column except the last one is offered as a feature; "class" is the target
tree_out = ID3_xml(
    input_ds,
    input_ds,
    input_ds.columns[:-1],
    n_class,
    "class",
    tree_xml,
)

entropy="0.6028704850060875"


entropy="0.8077559247898061"feature="att5"value"high"
 entropy="0.0"feature="att3"value"2"class"unacc"

entropy="0.829172609267362"feature="att3"value"4"

entropy="0.4056390622295664"feature="att0"value"high"
 entropy="0.0"feature="att1"value"high"class"acc"
 entropy="0.0"feature="att1"value"low"class"acc"
 entropy="0.0"feature="att1"value"med"class"acc"
 entropy="0.0"feature="att1"value"vhigh"class"unacc"

entropy="0.7806390622295662"feature="att0"value"low"

entropy="0.5"feature="att1"value"high"
 entropy="0.0"feature="att4"value"big"class"vgood"

entropy="0.5"feature="att4"value"med"
 entropy="0.0"feature="att2"value"2"class"acc"
 entropy="0.0"feature="att2"value"3"class"acc"
 entropy="0.0"feature="att2"value"4"class"vgood"
 entropy="0.0"feature="att2"value"5more"class"vgood"
 entropy="0.0"feature="att4"value"small"class"acc"

entropy="0.5"feature="att1"value"low"
 entropy="0.0"feature="att4"value"big"class"vgood"

entropy="0.5"feature="att4"value"med"

entropy="0.4591479170272448"feature="att2"value"2"
 entropy="0.0"feature="att4"value"big"class"acc"
 entropy="0.0"feature="att4"value"med"class"acc"
 entropy="0.0"feature="att4"value"small"class"unacc"
 entropy="0.0"feature="att2"value"3"class"acc"
 entropy="0.0"feature="att2"value"4"class"acc"
 entropy="0.0"feature="att2"value"5more"class"acc"

entropy="0.49749241409298506"feature="att0"value"vhigh"
 entropy="0.0"feature="att1"value"high"class"unacc"

entropy="0.20690842515181687"feature="att1"value"low"

entropy="0.4591479170272448"feature="att2"value"2"
 entropy="0.0"feature="att4"value"big"class"acc"
 entropy="0.0"feature="att4"value"med"class"acc"
 entropy="0.0"feature="att4"value"small"class"unacc"
 entropy="0.0"feature="att2"value"3"class"acc"
 entropy="0.0"feature="att2"value"4"class"acc"
 entropy="0.0"feature="att2"value"5more"class"acc"

entropy="0.20690842515181687"feature="att1"value"med"

entropy="0.4591479170272448"feature="att2"value"2"
 entropy="0.0"feature="att4"value"

 entropy="0.0"feature="att4"value"big"class"good"

entropy="0.4056390622295664"feature="att4"value"med"
 entropy="0.0"feature="att2"value"2"class"acc"
 entropy="0.0"feature="att2"value"3"class"good"
 entropy="0.0"feature="att2"value"4"class"good"
 entropy="0.0"feature="att2"value"5more"class"good"

entropy="0.4056390622295664"feature="att4"value"small"
 entropy="0.0"feature="att2"value"2"class"unacc"
 entropy="0.0"feature="att2"value"3"class"acc"
 entropy="0.0"feature="att2"value"4"class"acc"
 entropy="0.0"feature="att2"value"5more"class"acc"

entropy="0.48993437832557635"feature="att1"value"vhigh"
 entropy="0.0"feature="att4"value"big"class"acc"

entropy="0.4056390622295664"feature="att4"value"med"
 entropy="0.0"feature="att2"value"2"class"unacc"
 entropy="0.0"feature="att2"value"3"class"acc"
 entropy="0.0"feature="att2"value"4"class"acc"
 entropy="0.0"feature="att2"value"5more"class"acc"
 entropy="0.0"feature="att4"value"small"class"unacc"

entropy="0.6721428824256126"feature="att0"v

In [18]:
#wrap the tree in an ElementTree and save it as car.xml
xmlWrite = ET.ElementTree(tree_out)
xmlWrite.write("car.xml")