# Info Gain Calculator

In [8]:
from collections import Counter
import math
import pprint
from statistics import median

Functions to calculate the info gain

In [9]:

def is_number(s):
    """Return True when ``s`` can be converted to a number, otherwise False.

    The check delegates to the built-in ``complex`` constructor, which accepts
    ints, floats, complex values and strings holding a numeric literal.

    Args:
        s: Any value.

    Returns:
        bool: True if ``complex(s)`` succeeds, False otherwise.
    """
    try:
        complex(s)  # for int, long, float and complex
    except (TypeError, ValueError):
        # ValueError: a string that is not a numeric literal.
        # TypeError: an unsupported type such as None or a list; treat it as
        # "not a number" instead of letting the exception escape.
        return False

    return True
    
def entropy(records):
    """Base-2 entropy of the target column over ``records``.

    The target value of each record is read at index ``IDX_TARGET`` (a
    module-level global). An empty ``records`` yields 0.

    Args:
        records: Sequence of records (indexable rows).

    Returns:
        The negated sum of ``p * log2(p)`` over the distinct target values,
        where ``p`` is the relative frequency of each value.
    """
    total = len(records)
    label_counts = Counter(row[IDX_TARGET] for row in records)

    weighted_logs = []
    for occurrences in label_counts.values():
        proportion = occurrences / total
        weighted_logs.append(proportion * math.log(proportion, 2))

    return -1 * sum(weighted_logs)

def information_gain(split_cond, records):
    """Compute and print the information gain of one split over ``records``.

    Args:
        split_cond: Dict with a single entry ``{attribute: groups}``.
            For a categorical attribute, ``groups`` is a list of value lists,
            one list per branch, e.g. ``{"Size": [["S", "M"], ["L", "XL"]]}``.
            For a numeric attribute, ``groups[0][0]`` is the threshold and the
            records are split into ``<= threshold`` and ``> threshold``.
        records: Non-empty list of records. Columns follow the order of the
            module-level ``ATTRS``; the target column is ``IDX_TARGET``.

    Returns:
        The parent entropy minus the size-weighted entropy of the partitions.

    Raises:
        IndexError: If ``split_cond`` or ``records`` is empty.
        ValueError: If the attribute name is not in ``ATTRS``.

    Side effects:
        Prints every partition and the resulting gain.

    Note:
        For a categorical split, records whose value appears in none of the
        groups are left out of every partition, while the weights are still
        taken relative to the full record count.
    """
    p_records = len(records)
    p_entropy = entropy(records)

    # The split condition carries exactly one attribute -> groups pair.
    key, conditions = list(split_cond.items())[0]
    idx = ATTRS.index(key)

    # The attribute type is inferred from the first record. Strings are always
    # categorical, even when they look like a number ("1", "2", ...): otherwise
    # such values would be compared against a float threshold and raise
    # TypeError.
    sample = records[0][idx]
    is_numeric = not isinstance(sample, str) and is_number(sample)

    if is_numeric:
        threshold = float(conditions[0][0])
        records_list = [
            [record for record in records if record[idx] <= threshold],
            [record for record in records if record[idx] > threshold],
        ]
    else:
        records_list = [
            [record for record in records if record[idx] in cond]
            for cond in conditions
        ]

    each_sum = 0
    for partial_records in records_list:
        print("Splits")
        pprint.pprint(partial_records)
        entropy_vj = entropy(partial_records)
        each_sum += (len(partial_records) / p_records) * entropy_vj
    infogain = p_entropy - each_sum
    print("Info Gain : " + str(infogain))
    print("\n")
    return infogain

Set up the attribute names and the record table to match the problem at hand.
Numeric values must be stored as int or float.
All other (categorical) values must be strings!

# Usage

# Trial exam case

<img src="images/decision_tree_exam.png"/>

In [10]:
# Trial exam case: data set-up.

# Column names, in the same order as the values inside each record.
ATTRS = ["Gender","Size","Color","Class"]
# One row per example; every value here is a string (categorical).
RECORDS = [
        ["M","S" ,"red"  ,"No"],
        ["M","L" ,"blue" ,"Yes"],
        ["F","M" ,"red"  ,"Yes"],
        ["F","S" ,"blue" ,"No"],
        ["M","XL","blue" ,"Yes"],
        ["F","S" ,"green","No"],
        ["F","XL","red"  ,"No"],
        ["M","XL","green","Yes"],
        ["F","L" ,"green","No"],
        ["M","XL","red"  ,"No"],
]


# The target (class label) is the last column; entropy() reads this global.
IDX_TARGET = len(ATTRS)-1

split_cond is the most important part.
The rule is:
{"Attribute": [[conditionA], [conditionB], ...]}

Sometimes, as in the trial exam, several attribute values need to be grouped into one branch, like below:
{"Attribute": [[conditionA1, conditionA2], [conditionB1, conditionB2], ...]}

In [11]:
# Grouped condition case: "S" and "M" form one branch, "L" and "XL" the other.
records = RECORDS
split_cond = {"Size": [["S","M"],["L","XL"]]}
# Last expression of the cell, so the returned gain is also shown as cell output.
information_gain(split_cond, records )



Splits
[['M', 'S', 'red', 'No'],
 ['F', 'M', 'red', 'Yes'],
 ['F', 'S', 'blue', 'No'],
 ['F', 'S', 'green', 'No']]
Splits
[['M', 'L', 'blue', 'Yes'],
 ['M', 'XL', 'blue', 'Yes'],
 ['F', 'XL', 'red', 'No'],
 ['M', 'XL', 'green', 'Yes'],
 ['F', 'L', 'green', 'No'],
 ['M', 'XL', 'red', 'No']]
Info Gain : 0.0464393446710154




0.0464393446710154

This is not a tree builder. To go on to the next step, you have to choose the next subset of the data manually.

In [12]:
# Records copied by hand from the [S, M] partition printed above.
records = [
    ['M', 'S', 'red', 'No'],
    ['F', 'M', 'red', 'Yes'],
    ['F', 'S', 'blue', 'No'],
    ['F', 'S', 'green', 'No']
]
# Compare the candidate attributes for this branch.
split_cond = {"Color":[["red"],["blue"],["green"]]}
information_gain(split_cond, records )
split_cond = {"Gender":[["M"],["F"]]}
information_gain(split_cond, records )

# Records copied by hand from the [L, XL] partition printed above.
records = [
     ['M', 'L', 'blue', 'Yes'],
     ['M', 'XL', 'blue', 'Yes'],
     ['F', 'XL', 'red', 'No'],
     ['M', 'XL', 'green', 'Yes'],
     ['F', 'L', 'green', 'No'],
     ['M', 'XL', 'red', 'No']
]
# Compare the same candidate attributes for the other branch.
split_cond = {"Color":[["red"],["blue"],["green"]]}
information_gain(split_cond, records )
split_cond = {"Gender":[["M"],["F"]]}
# Last expression of the cell, so only this gain is shown as the cell's return value.
information_gain(split_cond, records )

Splits
[['M', 'S', 'red', 'No'], ['F', 'M', 'red', 'Yes']]
Splits
[['F', 'S', 'blue', 'No']]
Splits
[['F', 'S', 'green', 'No']]
Info Gain : 0.31127812445913283


Splits
[['M', 'S', 'red', 'No']]
Splits
[['F', 'M', 'red', 'Yes'], ['F', 'S', 'blue', 'No'], ['F', 'S', 'green', 'No']]
Info Gain : 0.12255624891826566


Splits
[['F', 'XL', 'red', 'No'], ['M', 'XL', 'red', 'No']]
Splits
[['M', 'L', 'blue', 'Yes'], ['M', 'XL', 'blue', 'Yes']]
Splits
[['M', 'XL', 'green', 'Yes'], ['F', 'L', 'green', 'No']]
Info Gain : 0.6666666666666667


Splits
[['M', 'L', 'blue', 'Yes'],
 ['M', 'XL', 'blue', 'Yes'],
 ['M', 'XL', 'green', 'Yes'],
 ['M', 'XL', 'red', 'No']]
Splits
[['F', 'XL', 'red', 'No'], ['F', 'L', 'green', 'No']]
Info Gain : 0.4591479170272448




0.4591479170272448

# Exercise 6 case

<img src="images/decision_tree_exercise.png"/>

Some attributes can be numeric. In that case, the values must be stored as int or float (not as strings).

In [13]:
# Exercise case: data set-up.
# Temp and Humidity are numeric (stored as int); the other columns are strings.
# Column names, in the same order as the values inside each record.
ATTRS = ["Outlook","Temp","Humidity","Windy", "Play"]
RECORDS = [
    ["sunny",85,85,"false","No"],
    ["sunny",80,90,"true","No"],
    ["overcast",83,78,"false","Yes"],
    ["rain",70,96,"false","Yes"],
    ["rain",68,80,"false","Yes"],
    ["rain",65,70,"true","No"],
    ["overcast",64,65,"true","Yes"],
    ["sunny",72,95,"false","No"],
    ["sunny",69,70,"false","Yes"],
    ["rain",75,80,"false","Yes"],
    ["sunny",75,70,"true","Yes"],
    ["overcast",72,90,"true","Yes"],
    ["overcast",81,75,"false","Yes"],
    ["rain",71,80,"true","No"],
]


# The target (class label) is the last column; entropy() reads this global.
IDX_TARGET = len(ATTRS)-1

In [14]:
records = RECORDS
# Option 1: derive the threshold from the data (uncomment to use):
#median_val = median([x[ATTRS.index("Temp")] for x in records])
# Option 2: use a fixed threshold given by the problem.
median_val = 75
# Numeric split: a single threshold; records are split into <= and > threshold.
split_cond = {"Temp": [[median_val]]}
information_gain(split_cond, records )

# Categorical split: one branch per value.
split_cond = {"Outlook": [["sunny"],["overcast"],["rain"]]}
information_gain(split_cond, records )

# Numeric split on Humidity with a fixed threshold.
median_val = 75
split_cond = {"Humidity": [[median_val]]}
information_gain(split_cond, records )


# Windy is stored as the strings "false"/"true", so it is a categorical split.
split_cond = {"Windy": [["false"],["true"]]}
# Last expression of the cell, so only this gain is shown as the cell's return value.
information_gain(split_cond, records )


Splits
[['rain', 70, 96, 'false', 'Yes'],
 ['rain', 68, 80, 'false', 'Yes'],
 ['rain', 65, 70, 'true', 'No'],
 ['overcast', 64, 65, 'true', 'Yes'],
 ['sunny', 72, 95, 'false', 'No'],
 ['sunny', 69, 70, 'false', 'Yes'],
 ['rain', 75, 80, 'false', 'Yes'],
 ['sunny', 75, 70, 'true', 'Yes'],
 ['overcast', 72, 90, 'true', 'Yes'],
 ['rain', 71, 80, 'true', 'No']]
Splits
[['sunny', 85, 85, 'false', 'No'],
 ['sunny', 80, 90, 'true', 'No'],
 ['overcast', 83, 78, 'false', 'Yes'],
 ['overcast', 81, 75, 'false', 'Yes']]
Info Gain : 0.0250781735058504


Splits
[['sunny', 85, 85, 'false', 'No'],
 ['sunny', 80, 90, 'true', 'No'],
 ['sunny', 72, 95, 'false', 'No'],
 ['sunny', 69, 70, 'false', 'Yes'],
 ['sunny', 75, 70, 'true', 'Yes']]
Splits
[['overcast', 83, 78, 'false', 'Yes'],
 ['overcast', 64, 65, 'true', 'Yes'],
 ['overcast', 72, 90, 'true', 'Yes'],
 ['overcast', 81, 75, 'false', 'Yes']]
Splits
[['rain', 70, 96, 'false', 'Yes'],
 ['rain', 68, 80, 'false', 'Yes'],
 ['rain', 65, 70, 'true', 'No'],


0.04812703040826927