In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df1=pd.read_csv("PlayTennis.csv")
df1

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


In [3]:
def probability(event,sample_space):
    
    p = np.count_nonzero(sample_space==event) / len(sample_space)
    return p


In [4]:
def conditional(event,event_sample,condition,condition_sample):
    
    num = np.count_nonzero((event_sample==event) & (condition_sample==condition))
    #num = P(A n B) for example we have to find when outlook == sunny and condition == yes
    
    deno = np.count_nonzero(condition_sample==condition)
    #deno = P(B)
    
    return num/deno 
    

In [72]:
def entropy(data,target):
    classs = np.unique(target)
    
    if isinstance(data,pd.Series):   # for single column
        data=data.to_frame()
        
    entropy=[]
    for col in data.columns:
        attr = np.unique(data[col])
        total_ent=0
        for i in attr:
            ent=0
            for j in classs:
                prob = conditional(j,target,i,data[col])
                ent += -(prob * np.log2(prob)) if prob>0 else 0    
            total_ent += probability(i,data[col]) * ent 
        entropy.append(np.trunc(total_ent*100)/100)
    feature_split = data.columns[entropy.index(min(entropy))]    
    return entropy,feature_split    

In [73]:
data=df1.drop(["play"],axis=1)
entropy(data,df1["play"])

([0.69, 0.91, 0.78, 0.89], 'outlook')

In [74]:
def entropy_parent(data,target):
    classs = np.unique(target)        
    attr = np.unique(data)
    total_ent=0
    for i in attr:
        ent=0      
        total_ent -= probability(i,data) * np.log2(probability(i,data)) 
    entropy=(np.trunc(total_ent*100)/100)
    return entropy    

In [75]:
entropy_parent(df1["play"],df1["play"])

0.94

In [80]:
def gini(data,target):
    classs = np.unique(target)
    
    if isinstance(data,pd.Series):   # for single column
        data=data.to_frame()
        
    gini=[]
    for col in pd.DataFrame(data):
        attr = np.unique(data[col])
        total_gin=0
        for i in attr:
            gin=0
            for j in classs:
                prob = conditional(j,target,i,data[col])
                gin += prob**2
            gin = 1 - gin    
            total_gin +=  probability(i,data[col]) * gin
        gini.append(np.trunc(total_gin*100)/100)
        
    feature_split = data.columns[gini.index(min(gini))]    
    return gini,feature_split   

In [81]:
data=df1.drop(["play"],axis=1)
gini(data,df1["play"])

([0.34, 0.44, 0.36, 0.42], 'outlook')

In [78]:
def info_gain(data,target):
    parent = entropy_parent(target,target)
    
    info_gain =[]
    for col in data.columns:
        
        children = entropy(data[col],target)[0]
        info = parent - children
        info_gain.append(info)
    
    feature_split = data.columns[info_gain.index(max(info_gain))]    
    return info_gain,feature_split
        

In [79]:
info_gain(data,df1["play"])

([array([0.25]), array([0.03]), array([0.16]), array([0.05])], 'outlook')

In [119]:
#chi2

a=pd.crosstab(df1["outlook"],df1["play"])
a_num=a.to_numpy()
total=[]
for i in range(len(a)):
    v=np.sum(a.values[i])
    total.append(v)
total=np.array(total)    
a["total"] = total
a["expected"] = total/2
a["deviation_no"] = a_num[:,0] - a["expected"]
a["deviation_yes"] = a_num[:,1] - a["expected"]
a["chi2_no"]=np.sqrt(a["deviation_no"]**2/a["expected"])
a["chi2_yes"]=np.sqrt(a["deviation_yes"]**2/a["expected"])
chi2 = np.sum(2*a["chi2_no"])
print("chi2 : ",chi2)

chi2 :  4.093338188813542


In [120]:
a

play,no,yes,total,expected,deviation_no,deviation_yes,chi2_no,chi2_yes
outlook,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
overcast,0,4,4,2.0,-2.0,2.0,1.414214,1.414214
rainy,2,3,5,2.5,-0.5,0.5,0.316228,0.316228
sunny,3,2,5,2.5,0.5,-0.5,0.316228,0.316228
