# Imports

In [1]:
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline
import pandas as pd 
import time
from math import log

------------------------------

# Helper Function

In [2]:
def set_array(inps):
    '''
    @param inps: sliced data frame
    Converts dataframe into numpy array for easy work out
    '''
    inps = np.asarray(inps)
    #squeeze to remove indexing column from pandas
    return np.squeeze(inps)


def entropy(target_col, n_class):
    '''
    Compute Entropy
    @param target_col: the columns of the target value
    @param n_class: the number of class as log base
    
    return: entropy 
    '''
    elements,counts = np.unique(target_col,return_counts = True)
    ent = 0
    for i in range(len(elements)):
        ent += (-counts[i]/np.sum(counts))*log(counts[i]/np.sum(counts), n_class)
    return ent    

def gain(att_col, target_col, n_class): 
    total_entropy = entropy(target_col, n_class)
    vals,counts= np.unique(att_col,return_counts=True)
    w_entropy = 0
    for i in range(len(vals)):
        split_att = np.where(att_col == vals[i], target_col, 0) #filter attributes 
        split_att = split_att[split_att!=0] #remove [row, column] that have 0 value
        weights = counts[i]/np.sum(counts) #prob w.r.t classes in that attribute
        w_entropy += weights*entropy(split_att, n_class)
    return total_entropy- w_entropy

# Data Processing

In [3]:
input_path = 'car.csv'
input_ds = pd.read_csv(input_path, header = None)
n_cols = len(input_ds.columns)
print("Number of columns:", n_cols)

Number of columns: 7


## Naming columns 

In [4]:
#initiate empty list for column name
col_name = []

for i in range(n_cols):
    #last column is the target value
    if (i == n_cols-1):
        col_name.append('class')
    else:
        col_name.append('att{}'.format(i))
        
print(col_name)

['att0', 'att1', 'att2', 'att3', 'att4', 'att5', 'class']


In [5]:
#assigning column names to data set
input_ds.columns = col_name
print (input_ds.head())

    att0   att1 att2 att3   att4  att5  class
0  vhigh  vhigh    2    2  small   low  unacc
1  vhigh  vhigh    2    2  small   med  unacc
2  vhigh  vhigh    2    2  small  high  unacc
3  vhigh  vhigh    2    2    med   low  unacc
4  vhigh  vhigh    2    2    med   med  unacc


In [6]:
#vectorization of data frame to arrays 
arr = set_array(input_ds)

In [7]:
for i in range(arr.shape[1]-1):
    print("Attributes in att{} is: {}".format(i, set(arr[:,i])))

class_ = arr[:,-1]
n_class = len(set(class_))
print("Attributes in class is:", set(class_))
print("Number of class is: ", n_class)

Attributes in att0 is: {'vhigh', 'low', 'high', 'med'}
Attributes in att1 is: {'vhigh', 'low', 'high', 'med'}
Attributes in att2 is: {'2', '5more', '3', '4'}
Attributes in att3 is: {'2', 'more', '4'}
Attributes in att4 is: {'med', 'small', 'big'}
Attributes in att5 is: {'low', 'med', 'high'}
Attributes in class is: {'vgood', 'unacc', 'good', 'acc'}
Number of class is:  4


-----------------------------------------

# Testing for First Step

In [8]:
entropy(arr[:,-1], n_class)

0.6028704850060875

In [9]:
max_gain_indx = 0
max_gain = 0
for i in range(n_cols-1):
    info_gain = gain(arr[:,i], arr[:,-1], n_class)
    if info_gain> max_gain:
        max_gain = info_gain
        max_gain_indx = i
print("Index {} has the highest gain: {}".format(max_gain_indx, info_gain))

Index 5 has the highest gain: 0.13109217827713188


In [10]:
#delete best feature column from the dataset 
sub_arr = np.delete(arr,max_gain_indx,1)
#sanity check
print(sub_arr[0])

['vhigh' 'vhigh' '2' '2' 'small' 'unacc']


In [11]:
features = np.unique(arr[:,max_gain_indx])
print(features)

['high' 'low' 'med']


In [12]:
for feature in np.unique(arr[:,max_gain_indx]):
    sub_data = np.where(arr[:,max_gain_indx]==feature, arr[:,-1], 0)
    sub_data = sub_data[sub_data!=0]
    if(len(set(sub_data))<=1):
        #if theres only one label, entropy should be 0
        ent = 0
        print("Entropy: {}, The best attribute is: {}".format(ent,feature))
        #node
    else:
        ent = entropy(sub_data, n_class)
        print("Entropy: {}, The best attribute is: {}".format(ent,feature))
        #grow tree

Entropy: 0.8077559247898061, The best attribute is: high
Entropy: 0, The best attribute is: low
Entropy: 0.607578995397061, The best attribute is: med


In [28]:
max_gain = 0
for i in range(n_cols-1):
    if i == max_gain_indx:
        pass
    else:
        print(i)
        info_gain = gain(arr[:,i], arr[:,-1], n_class)
        if info_gain> max_gain:
            max_gain = info_gain
            max_gain_indx = i
print("Index {} has the highest gain: {}".format(max_gain_indx, info_gain))

1
2
3
4
5
Index 0 has the highest gain: 0.13109217827713188


In [20]:
max_gain = 0
for i in range(n_cols-1):
    if i == max_gain_indx:
        pass
    else:
        print(i)
        info_gain = gain(arr[:,i], arr[:,-1], n_class)
        if info_gain> max_gain:
            max_gain = info_gain
            max_gain_indx = i
print("Index {} has the highest gain: {}".format(max_gain_indx, info_gain))

0
1
2
3
4
5
Index 5 has the highest gain: 0.13109217827713188


In [14]:
np.unique(arr[:,5],return_counts=True)

(array(['high', 'low', 'med'], dtype=object), array([576, 576, 576]))

In [15]:
parent_node_class = np.unique(arr[:,5])[np.argmax(np.unique(arr[:,5],return_counts=True)[1])]
print(parent_node_class)

high


In [16]:
input_ds.columns[:-1]

Index(['att0', 'att1', 'att2', 'att3', 'att4', 'att5'], dtype='object')

------------------------------------------------------------

# Recursive Tree

-----------------------------------------------