# Decision Tree, Entropy, Gini, Gain

In [1]:
import pandas as pd
import io
import requests
import numpy as np

def compute_impurity(feature, impurity_criterion, name):
    """
    Compute the impurity of a feature and print the worked calculation.

    Supported impurity criteria: 'entropy', 'gini'

    input:
        feature: the values to measure (this needs to be a Pandas series)
        impurity_criterion: 'entropy' or 'gini'
        name: label shown for the feature in the printed calculation
    output: feature impurity, rounded to 3 decimal places
    raises: ValueError if impurity_criterion is not a supported criterion
    """
    # relative frequency of each distinct value in the series
    probs = feature.value_counts(normalize=True)
    if impurity_criterion == 'entropy':
        # A pure partition gives -1 * 0.0 == -0.0; adding 0.0 normalises the
        # negative zero so the trace and the result read "0.0".
        impurity = -1 * np.sum(np.log2(probs) * probs) + 0.0
        print(f"Entropy[{name}] = -1 * sum(log2(p) * p) = -1 * sum(log2({list(probs)}) * {list(probs)}) = {impurity}")
    elif impurity_criterion == 'gini':
        impurity = 1 - np.sum(np.square(probs))
        # the printed formula must match the computation above: 1 - sum(p^2)
        print(f"Gini[{name}] = 1 - sum(p^2) = 1 - sum({list(probs)}^2) = {impurity}")
    else:
        raise ValueError('Unknown impurity criterion')

    return round(impurity, 3)

def comp_feature_information_gain(df, target, descriptive_feature, split_criterion):
    """
    Compute the information gain for splitting a dataset on a particular
    descriptive feature under a given impurity criterion, printing each
    step of the calculation.

    The split info and the gain ratio are computed and printed as well,
    but only the information gain is returned.

    Supported split criterion: 'entropy', 'gini'

    input:
        df: Pandas dataframe containing the target and descriptive feature
        target: column name of the target feature
        descriptive_feature: column name of the feature to split on
        split_criterion: impurity criterion forwarded to compute_impurity
    output: information gain, i.e. the target impurity minus the weighted
            sum of the partition impurities
    """

    target_entropy = compute_impurity(df[target], split_criterion, target)
    print("-------------------")
    # we define two lists below:
    # entropy_list to store the impurity of each partition
    # weight_list to store the relative number of observations in each partition
    entropy_list = []
    weight_list = []

    # loop over each level of the descriptive feature
    # to partition the dataset with respect to that level
    # and compute the impurity and the weight of the level's partition
    print(f"{descriptive_feature}: ")
    total_rows = len(df)
    for level in df[descriptive_feature].unique():
        df_feature_level = df[df[descriptive_feature] == level]
        # compute_impurity already returns a value rounded to 3 decimals,
        # so no further rounding is applied here
        entropy_level = compute_impurity(df_feature_level[target], split_criterion, level)
        entropy_list.append(entropy_level)
        weight_level = len(df_feature_level) / total_rows
        print(f"weight[{level}] = {len(df_feature_level)}/{total_rows} = {weight_level}")
        # keep the exact weight: rounded weights (e.g. 1/3 -> 0.333) no longer
        # sum to 1 and would skew the remaining impurity and the split info
        weight_list.append(weight_level)

    entropies = np.array(entropy_list)
    weights = np.array(weight_list)
    # weights are rounded for display only
    shown_weights = [round(weight, 3) for weight in weight_list]

    print("-------------------")
    feature_remaining_impurity = np.sum(entropies * weights)
    print(f'total {split_criterion}[{descriptive_feature}] = SUM(entropy_of_partitions * weights_of_partitions) = SUM({entropy_list} * {shown_weights}) =', feature_remaining_impurity)

    # adding 0.0 normalises the negative zero produced by a single partition
    split_info = -1 * np.sum(weights * np.log2(weights)) + 0.0
    print(f'Split Info = - SUM(weights_of_partitions * log2(weights_of_partitions)) = - SUM({shown_weights} * log2({shown_weights})) =', split_info)

    information_gain = target_entropy - feature_remaining_impurity
    print(f'information gain = Gain(S,A) = Entropy(S) - SUM(weights * Entropy(Sv)) = {target_entropy} - SUM({shown_weights} * {entropy_list}) = ', information_gain)

    # a feature with a single level has zero split info, which leaves the
    # gain ratio undefined; report NaN instead of dividing by zero
    if split_info > 0:
        gain_ratio = information_gain / split_info
    else:
        gain_ratio = float('nan')
    print('gain_ratio = information_gain / split_info =', gain_ratio)

    print('========================================')

    return information_gain

In [2]:
# Load the whitespace-delimited table, using its first column as the index.
# sep=r'\s+' is the replacement for delim_whitespace=True, which pandas
# deprecated in version 2.2.
df = pd.read_table('../data/table6.txt', sep=r'\s+', index_col=0)
# bare expression so the notebook renders the dataframe
df

TIMESTAMP,OUTLOOK,HUMIDITY,WIND,TIME,PLAYTENNIS
1,Sunny,Normal,Pleasant,Morning,Yes
2,Sunny,Normal,Pleasant,Morning,Yes
3,Sunny,Normal,Heavy,Afternoon,No
4,Overcast,High,Heavy,Afternoon,No
5,Rainy,Normal,Pleasant,Afternoon,No
6,Rainy,High,Pleasant,Morning,No
7,Rainy,High,Heavy,Afternoon,No
8,Overcast,Normal,Pleasant,Morning,Yes
9,Overcast,High,Heavy,Afternoon,No
10,Sunny,High,Pleasant,Morning,Yes


In [3]:
# Target column to predict and the impurity measure used to score splits.
target = "PLAYTENNIS"
split_criterion = 'entropy'  # the other supported value is 'gini'

print('target feature:', target)
print('split criterion:', split_criterion)
print('====================', '====================', sep='\n')

# Report the information gain of every column other than the target.
for feature in df.columns.drop(target):
    feature_info_gain = comp_feature_information_gain(
        df, target, feature, split_criterion
    )
    

target feature: PLAYTENNIS
split criterion: entropy
Entropy[PLAYTENNIS] = -1 * sum(log2(p) * p) = -1 * sum(log2([0.6, 0.4]) * [0.6, 0.4]) = 0.9709505944546686
-------------------
OUTLOOK: 
Entropy[Sunny] = -1 * sum(log2(p) * p) = -1 * sum(log2([0.75, 0.25]) * [0.75, 0.25]) = 0.8112781244591328
weight[Sunny] = 4/10 = 0.4
Entropy[Overcast] = -1 * sum(log2(p) * p) = -1 * sum(log2([0.6666666666666666, 0.3333333333333333]) * [0.6666666666666666, 0.3333333333333333]) = 0.9182958340544896
weight[Overcast] = 3/10 = 0.3
Entropy[Rainy] = -1 * sum(log2(p) * p) = -1 * sum(log2([1.0]) * [1.0]) = -0.0
weight[Rainy] = 3/10 = 0.3
-------------------
total entropy[OUTLOOK] = SUM(entropy_of_partitions * weights_of_partitions) = SUM([0.811, 0.918, -0.0] * [0.4, 0.3, 0.3]) = 0.5998
Split Info = - SUM(weights_of_partitions * log2(weights_of_partitions)) = - SUM([0.4, 0.3, 0.3] * log2([0.4, 0.3, 0.3])) = 1.5709505944546684
information gain = Gain(S,A) = Entropy(S) - SUM(weights * Entropy(Sv) = 0.971 - SUM([