In [1]:
import math
from pprint import pprint

import numpy as np
import pandas as pd

## Functions
# Function to calculate the total frequency of a certain attribute
def calcFreq(listAttributeType):
    """Count how many items iterating over ``listAttributeType`` yields.

    Parameters
    ----------
    listAttributeType : iterable
        Any iterable; it is consumed once and its items are not inspected.

    Returns
    -------
    int
        Number of items produced (0 for an empty iterable).
    """
    # Tally one per item instead of keeping a running counter variable.
    return sum(1 for _ in listAttributeType)

# Function to calculate the probability (relative frequency) of each distinct value of an attribute
def calcProb(dataset,classifier):
    """Return the relative frequency of each distinct value in one column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table holding the column to analyse.
    classifier : str
        Name of the column whose value distribution is wanted.

    Returns
    -------
    list of float
        One probability per distinct value, in the order produced by
        ``np.unique`` (sorted). A dataset with no rows yields an empty list.
    """
    # Rows are counted on the frame itself, so the function works for any
    # dataset and does not require a particular column to be present.
    totalRows = len(dataset)  # loop-invariant, computed once

    listAttrProb = []
    for x in np.unique(dataset[classifier]):  # distinct values of the column
        subdataset = dataset[dataset[classifier]==x]  # rows where column == x
        listAttrProb.append(len(subdataset)/totalRows)  # share of rows with this value
    return listAttrProb

# Function to calculate entropy
def calcEnt(dataset,classifier):
    """Compute the base-2 entropy of the value distribution of a column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table holding the column.
    classifier : str
        Column name; its value probabilities are obtained from ``calcProb``.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the probabilities returned by ``calcProb``.
    """
    weightedLogSum = 0.0
    for p in calcProb(dataset, classifier):
        term = p * math.log(p, 2)  # p * log2(p), never positive for 0 < p <= 1
        weightedLogSum += term
    # Flip the sign once at the end so the entropy comes out non-negative.
    return -weightedLogSum

# Function to calculate information gain
def IG(dataset,A,classifier='PlayTennis'):
    """Compute the information gain obtained by splitting ``dataset`` on ``A``.

    Gain = Ent(S) - sum_i P_i * Ent(S_i), where S_i is the subset of rows in
    which attribute ``A`` takes its i-th distinct value and P_i is the share
    of rows falling into that subset.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing both the attribute column and the target column.
    A : str
        Name of the attribute column to split on.
    classifier : str, optional
        Name of the target column whose entropy is measured. Defaults to
        ``'PlayTennis'``, so existing two-argument calls behave as before.

    Returns
    -------
    float
        The information gain of the split.

    Notes
    -----
    Entropies are computed through ``calcEnt``; any requirement that helper
    places on the dataset applies here as well.
    """
    totalRows = len(dataset)

    # Weight and entropy of each subset are accumulated in a single pass, so
    # they cannot get out of step with each other.
    subEnt = 0.0
    for x in np.unique(dataset[A]):  # distinct values of the attribute
        subdataset = dataset[dataset[A]==x]  # rows in which "Attribute == value"
        weight = len(subdataset)/totalRows  # Pi
        subEnt = subEnt + weight*calcEnt(subdataset, classifier)  # Pi*Ent(Si)

    # Main entropy minus the weighted sum of sub-entropies.
    gain = calcEnt(dataset, classifier) - subEnt
    return gain

In [25]:
# Load the tennis data set from the Excel workbook.
dfTennisData = pd.read_excel('Tennis data.xlsx')

# Print the information gain of every candidate attribute, one value per line.
for attrName in ('Outlook', 'Temperature', 'Humidity', 'Wind'):
    print(IG(dfTennisData, attrName))

0.2467498197744391
0.029222565658954647
0.15183550136234136
0.04812703040826927


In [168]:
# NOTE(review): `dataPassangerSurvival` and `class3` are not defined in any cell
# visible in this notebook; they presumably come from cells that were deleted or
# run elsewhere -- confirm and restore their definitions before a fresh re-run.
# Relative frequency of each distinct value in the 'Class' column.
print(calcProb(dataPassangerSurvival, 'Class'))
# Entropy of the 'Survived' column over the whole data set.
print(calcEnt(dataPassangerSurvival, 'Survived'))
# Entropy of 'Survived' within `class3` (presumably the Class == 3 rows -- verify).
print(calcEnt(class3, 'Survived'))

[0.25567765567765566, 0.20146520146520147, 0.5428571428571428]
0.9718012324831182
0.8654036268680916


In [209]:
# Reload the tennis data set (same workbook as read in the earlier cell).
dfTennisData = pd.read_excel('Tennis data.xlsx') 
# Candidate attribute columns; this list is what gets passed to ID3 as `attrNames`.
A = ['Outlook','Temperature','Humidity','Wind']

def ID3(dataset, classifier, attrNames, classDefault = None):
    """Recursively build a decision tree with the ID3 algorithm.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Training rows for the current node.
    classifier : str
        Name of the target (class label) column.
    attrNames : list of str
        Attribute columns still available for splitting. The list is only
        read; it is never modified, so the caller's list stays intact and
        every branch sees the same candidates.
    classDefault : object, optional
        Label returned when ``dataset`` has no rows. Recursive calls pass the
        majority label of the parent node.

    Returns
    -------
    dict or label
        A leaf is a class label. An inner node is
        ``{attribute: {attribute_value: subtree, ...}}``.

    Notes
    -----
    ``IG`` is called as ``IG(dataset, attr)``, i.e. it scores gain against
    its own default target column, so ``classifier`` should name that same
    column.
    """
    valueUnique = dataset[classifier].unique()

    # Scenario 1: homogeneous dataset -> leaf carrying the only label present.
    if len(valueUnique) == 1:
        return valueUnique[0]

    # Scenario 2: no rows at all -> fall back to the label handed down by the caller.
    if dataset.empty:
        return classDefault

    # Majority label of this node (an actual label, not a count). It is the
    # answer when no attribute is left and the default for child nodes.
    classMajority = dataset[classifier].value_counts().idxmax()

    # Scenario 3: rows disagree but there is nothing left to split on.
    if not attrNames:
        return classMajority

    # Information gain of every remaining attribute; the first maximum wins.
    listIG = [IG(dataset, attr) for attr in attrNames]
    attrMax = attrNames[listIG.index(max(listIG))]  # index into attrNames, not a global

    # Candidates for the children: a new list, so sibling branches are not
    # affected by what happens deeper in another branch.
    attrRest = [attr for attr in attrNames if attr != attrMax]

    # Start the node with the best attribute and grow one subtree per value.
    tree = {attrMax:{}}
    for attrType, dataSubset in dataset.groupby(attrMax):
        tree[attrMax][attrType] = ID3(dataSubset, classifier, attrRest, classMajority)
    return tree

In [210]:
# Build the decision tree for the tennis data and pretty-print the nested dict.
# (pprint is imported with the other imports at the top of the notebook.)
tree = ID3(dfTennisData, 'PlayTennis', A)
pprint(tree)

{'Outlook': {'Overcast': 'Yes',
             'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}},
             'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}}}
