# Problem 4.3 

## Implementation of Iterative Dichotomiser 3 (ID3) Algorithm
## (Classification)

In [1]:
import numpy as np
import pandas as pd
import numbers

# 1. Entropy

In [2]:
def entropy(target):
    """
    Compute the Shannon entropy (in bits) of a response column.

    Input:
    1. target: the df column of response (a pandas Series, or any 1-D
       sequence of class labels that ``pd.Series`` accepts)

    Returns the entropy ``-sum(p * log2(p))`` over the observed classes as a
    float. An empty ``target`` has entropy 0.0.

    Missing labels (NaN/None) are not counted as a class, but they still
    count towards the length used as the denominator.
    """
    labels = pd.Series(target)
    total = len(labels)
    if total == 0:
        return 0.0

    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    probabilities = labels.value_counts() / total
    ent = -(probabilities * np.log2(probabilities)).sum()

    # Adding 0.0 turns an IEEE negative zero (single-class column) into +0.0.
    return float(ent) + 0.0

# 2. Information Gain

In [3]:
def information_gain(target_name, attribute_name, data):
    """
    Compute the information gain of splitting ``data`` on one attribute.

    Inputs:
    1. target_name: string, the column name of response
    2. attribute_name: string, the column name of the attribute
    3. data: dataframe

    The attribute may be numeric or categorical (any non-numeric dtype;
    boolean columns are treated as categorical).

    Returns a tuple ``(gain, node_info)``:
    - categorical attribute: ``node_info`` is the list of distinct attribute
      values (one branch per value) and ``gain`` is the entropy reduction of
      that multi-way split;
    - numeric attribute: ``node_info`` is ``[threshold]`` for the best binary
      split (``< threshold`` / ``> threshold``) and ``gain`` is its entropy
      reduction. If the column has fewer than two distinct values no split
      exists and ``(0.0, [])`` is returned.
    """
    ent = entropy(data[target_name])
    df_slice = data[[target_name, attribute_name]]
    column = data[attribute_name]
    n_rows = len(data)

    # pandas' dtype helpers also understand extension dtypes (category,
    # string, nullable ints), which np.issubdtype cannot interpret.
    is_numeric = (pd.api.types.is_numeric_dtype(column)
                  and not pd.api.types.is_bool_dtype(column))

    # information gain for categorical attributes
    if not is_numeric:
        node_info = list(column.value_counts().index)
        new_ent = 0
        for attr in node_info:
            dv = df_slice[df_slice[attribute_name] == attr]  # rows with this attribute value
            new_ent += len(dv) / n_rows * entropy(dv[target_name])  # weighted subset entropy
        return ent - new_ent, node_info

    # information gain for numerical attributes
    # Candidate thresholds are midpoints between consecutive DISTINCT values.
    # With duplicates left in, a midpoint can equal a data value, and the
    # strict < / > comparisons below would then drop those rows from both
    # subsets and overstate the gain.
    distinct = sorted(column.dropna().unique())
    points = [(low + high) / 2 for low, high in zip(distinct, distinct[1:])]
    if not points:
        # A constant (or empty) column cannot separate anything.
        return 0.0, []

    new_ent = np.inf
    node_info = []
    for point in points:
        # weighted entropy of the two subsets, below and above the threshold
        dv1 = df_slice[df_slice[attribute_name] < point]
        ent_dv1 = len(dv1) / n_rows * entropy(dv1[target_name])

        dv2 = df_slice[df_slice[attribute_name] > point]
        ent_dv2 = len(dv2) / n_rows * entropy(dv2[target_name])

        # keep the threshold with the smallest entropy sum (first one wins ties)
        if ent_dv1 + ent_dv2 < new_ent:
            new_ent = ent_dv1 + ent_dv2
            node_info = [point]

    return ent - new_ent, node_info
    

# 3. Build Tree

In [4]:
def best_split(data, target_name):
    """
    Return the best split information (tree stump) for a dataframe.

    Every column of ``data`` except ``target_name`` is evaluated with
    ``information_gain`` and the attribute with the largest strictly
    positive gain wins (the first one wins ties).

    Returns a tuple ``(node_column_name, split_info)`` where ``split_info``
    is the ``node_info`` list reported by ``information_gain`` for the
    winning attribute. If no attribute has a gain greater than zero,
    ``(None, [])`` is returned so the caller can stop splitting.
    """
    attributes = [column for column in data.columns if column != target_name]

    # Initialised up front so the function always has something to return,
    # even when no attribute improves on a gain of zero.
    best_gain = 0
    node_column_name = None
    split_info = []

    for attr in attributes:
        gain, node_info = information_gain(target_name, attr, data)
        if gain > best_gain:
            best_gain = gain
            node_column_name = attr
            split_info = node_info
    return node_column_name, split_info

In [5]:
def majorClass(data, target_name):
    """
    Return the class label that occurs most often in ``data[target_name]``.

    Inputs:
    1. data: dataframe
    2. target_name: string, the column name of response

    When several classes share the highest count, the one listed first by
    ``Series.value_counts`` is returned.

    Raises ValueError if the column contains no labels to count.
    """
    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    value_cnt = data[target_name].value_counts()
    if value_cnt.empty:
        raise ValueError(
            "majorClass: column %r contains no values" % (target_name,)
        )

    # idxmax returns the label of the first maximal count.
    return value_cnt.idxmax()

In [6]:
def partition_data(data, best_attr_name, value, isnumber = True, islarger = None):
    """
    Select the rows of ``data`` that belong to one branch of a split.

    Inputs:
    1. data: the dataframe to be sliced
    2. best_attr_name: string, the name of the attribute being split on
    3. value: the split value the attribute column is compared against
    4. isnumber: boolean, True for a numeric threshold split,
                 False for a categorical equality split
    5. islarger: boolean, only used for numeric splits; a truthy value keeps
                 rows strictly greater than ``value``, otherwise rows
                 strictly smaller than ``value`` are kept

    Returns the filtered dataframe.
    """
    column = data[best_attr_name]

    if not isnumber:
        keep = column == value
    elif islarger:
        keep = column > value
    else:
        keep = column < value

    return data[keep]
    

In [7]:
def ID3(data,originaldata,features,target_name='quality',parent_node_class = None):
    """
    ID3 Algorithm: recursively grow a decision tree. It takes five parameters:
    1. data = the data for which the ID3 algorithm should be run --> in the
       first run this equals the total dataset
    2. originaldata = the original dataset, used to obtain the majority
       class when ``data`` is empty
    3. features = list of feature names still available; the feature chosen
       at a node is removed before recursing, and an empty list stops the
       growth
    4. target_name = the name of the target column
    5. parent_node_class = the majority class of the parent node; returned
       as the leaf value when no features are left

    Returns either a leaf (a class label) or a nested dict of the form
    ``{feature: {branch_key: subtree_or_label, ...}}``. Branch keys of a
    categorical feature are its values; a numeric feature gets the two keys
    ``'<' + str(threshold)`` and ``'>' + str(threshold)``.
    """
    # Stopping criteria --> if one of these holds, return a leaf node.

    # The emptiness check must come first: an empty frame has zero unique
    # target values, and indexing [0] into that result raises IndexError.
    if len(data) == 0:
        return majorClass(originaldata, target_name)

    # If all target values are the same, return this value
    target_values = np.unique(data[target_name])
    if len(target_values) <= 1:
        return target_values[0]

    # No features left to split on: fall back to the parent's majority class
    if len(features) == 0:
        return parent_node_class

    # If none of the above holds true, grow the tree!

    # Default value for this node --> the majority class of the current node
    parent_node_class = majorClass(data, target_name)

    # Select the feature which best splits the dataset
    best_feature, best_split_info = best_split(data, target_name)

    # No usable split was reported: make this node a majority-class leaf.
    if best_feature is None or len(best_split_info) == 0:
        return parent_node_class

    # The root of this subtree is named after the chosen feature
    tree = {best_feature: {}}

    # Remove the chosen feature from the feature space
    features = [name for name in features if name != best_feature]

    if isinstance(best_split_info[0], numbers.Number):
        # Numeric feature: one branch below and one above the threshold
        threshold = best_split_info[0]
        for prefix, islarger in (('<', False), ('>', True)):
            sub_data = partition_data(data, best_feature, threshold, True, islarger)
            sub_tree = ID3(sub_data, originaldata, features, target_name, parent_node_class)
            tree[best_feature][prefix + str(threshold)] = sub_tree
    else:
        # Categorical feature: one branch per attribute value
        for value in best_split_info:
            sub_data = partition_data(data, best_feature, value, False)
            sub_tree = ID3(sub_data, originaldata, features, target_name, parent_node_class)
            tree[best_feature][value] = sub_tree

    return tree

# 4. Prediction

In [8]:
def check_float(value):
    """
    Check whether ``value`` can be converted to a float.

    Returns True if ``float(value)`` succeeds and False if the conversion
    raises TypeError, ValueError or OverflowError. Strings such as 'nan'
    and 'inf' are accepted by ``float`` and therefore give True.
    """
    try:
        float(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not numeric"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return False
    return True
    

In [9]:
def predict(input_df,tree,default = 'No prediction'):
    """
    Predict the class of a new/unseen instance by walking the tree recursively.

    Parameters:
    1. input_df: a one-row dataframe (or any mapping from column name to a
       sequence) holding the new instance; only the first value of each
       column is used
    2. tree: the built decision tree, a nested dict of the form
       ``{feature: {branch_key: subtree_or_label}}``, or a bare class label
    3. default: the value returned when the instance cannot be routed to a
       leaf, e.g. it has an attribute value the tree has no branch for, or
       none of the tree's features is present in ``input_df``

    Numeric nodes are recognised by a branch key starting with '<'. The
    threshold is parsed from that key; values below it follow the '<'
    branch and all other values follow the matching '>' branch.
    """
    # A tree that never split is just a class label.
    if not isinstance(tree, dict):
        return tree

    for column, branches in tree.items():
        if column not in input_df:
            continue

        attr_value = list(input_df[column])[0]
        branch_key = attr_value

        if isinstance(attr_value, numbers.Number):
            # Reuse the stored key text instead of rebuilding it from a
            # float, so the lookup cannot miss through a str/float round trip.
            lower_key = next(
                (key for key in branches
                 if isinstance(key, str) and key.startswith('<')),
                None,
            )
            if lower_key is not None:
                threshold = float(lower_key[1:])
                if attr_value < threshold:
                    branch_key = lower_key
                else:
                    branch_key = '>' + lower_key[1:]

        try:
            result = branches[branch_key]
        except (KeyError, TypeError):
            # Unseen attribute value (or an unhashable one): no branch to follow.
            return default

        if isinstance(result, dict):
            # Pass the caller's default down so deeper misses honour it too.
            return predict(input_df, result, default)
        return result

    # None of the tree's features is present in the input.
    return default

# 5. Train the tree

In [10]:
# Load the data set and discard the 'Id' column in one step.
data = pd.read_csv('../../data/data.txt', sep=',').drop(columns=['Id'])

In [11]:
data

Unnamed: 0,color,root,sound,stripes,umbilical,touch,density,sugar,quality
0,dark-green,roll-up,dull,clear,hollow,hard,0.697,0.46,good
1,pitch-dark,roll-up,dead,clear,hollow,hard,0.744,0.376,good
2,pitch-dark,roll-up,dull,clear,hollow,hard,0.634,0.264,good
3,dark-green,roll-up,dead,clear,hollow,hard,0.608,0.318,good
4,white,roll-up,dull,clear,hollow,hard,0.556,0.215,good
5,dark-green,slighly-curled,dull,clear,slightly-hollow,soft,0.403,0.237,good
6,pitch-dark,slighly-curled,dull,indistinct,slightly-hollow,soft,0.481,0.149,good
7,pitch-dark,slighly-curled,dull,clear,slightly-hollow,hard,0.437,0.211,good
8,pitch-dark,slighly-curled,dead,indistinct,slightly-hollow,hard,0.666,0.091,bad
9,dark-green,stiff,crisp,clear,plain,soft,0.243,0.267,bad


In [12]:
# Sample 90% of the rows for training; the fixed seed keeps the split reproducible.
train = data.sample(frac=0.9, random_state=4)

# The rows not sampled into `train` form the test set: features and labels are kept apart.
held_out = data.drop(train.index)
test_x = held_out.drop(columns=['quality'])
test_y = held_out['quality']

In [13]:
test_x

Unnamed: 0,color,root,sound,stripes,umbilical,touch,density,sugar
7,pitch-dark,slighly-curled,dull,clear,slightly-hollow,hard,0.437,0.211
14,pitch-dark,slighly-curled,dull,clear,slightly-hollow,soft,0.36,0.37


In [14]:
# Grow the tree on the training rows; every column of test_x is offered as a feature.
features_name = test_x.columns.tolist()
dt = ID3(train, train, features_name, target_name='quality', parent_node_class=None)

In [15]:
dt

{'stripes': {'clear': {'root': {'roll-up': 'good',
    'stiff': 'bad',
    'slighly-curled': 'good'}},
  'indistinct': {'touch': {'hard': 'bad', 'soft': 'good'}},
  'blurred': 'bad'}}

In [16]:
dt['stripes']

{'clear': {'root': {'roll-up': 'good',
   'stiff': 'bad',
   'slighly-curled': 'good'}},
 'indistinct': {'touch': {'hard': 'bad', 'soft': 'good'}},
 'blurred': 'bad'}

In [17]:
# Predict each test row; iloc[[i]] keeps the row as a one-row dataframe, as predict expects.
prediction_result = [predict(test_x.iloc[[i]], dt) for i in range(len(test_x))]

In [18]:
prediction_result

['good', 'good']

In [19]:
test_y

7     good
14     bad
Name: quality, dtype: object