In [21]:
import numpy as np
import math
from tabulate import tabulate

In [18]:
def entropy(ps, i = ''):
    """Compute the base-2 entropy of a probability list and print the working.

    Args:
        ps: iterable of probabilities; it is traversed twice (once for the
            sum, once for the printed expansion).
        i: label appended to the leading 'H' in the printed line,
            e.g. '(Class)'.

    Returns:
        The sum of -p*log2(p) over ``ps`` as produced by ``np.sum``.
    """
    contributions = []
    for prob in ps:
        if prob:
            contributions.append(-prob * math.log2(prob))
        else:
            # A falsy probability (0) contributes nothing; this also avoids log2(0).
            contributions.append(0)
    total = np.sum(contributions)

    # The symbolic expansion shows every term, including the zero ones.
    expansion = ' + '.join(f'-{prob}*log2({prob})' for prob in ps)
    print('H' + i + ' = ' + expansion + f'= {total:.4g}')

    return total

def info_gain(p_class, p_features_class, feature_value, p_feature, feature_name, class_name):
    """Print the worked information-gain calculation for one feature and return it.

    Args:
        p_class: class probabilities, passed to ``entropy`` as H(class).
        p_features_class: one probability list per feature value;
            entry ``k`` is passed to ``entropy`` for ``feature_value[k]``.
        feature_value: feature value labels (strings, concatenated into
            the printed text).
        p_feature: weight applied to the entropy of each feature value.
        feature_name: feature label used in the printed text.
        class_name: class label used in the printed text.

    Returns:
        H(class) minus the weighted sum of the per-value entropies.
    """
    header = 'MI' + '(' + feature_name + ') = '
    class_entropy = entropy(p_class, '(' + class_name + ')')

    weighted_entropy = 0
    terms = []
    for idx, value in enumerate(feature_value):
        weight = p_feature[idx]
        condition = feature_name + '==' + value
        weighted_entropy += weight * entropy(p_features_class[idx], '(' + condition + ')')
        terms.append('P(' + condition + ')*H(' + condition + ')')

    # Terms are glued with '+ ' (no leading space) to match the printed layout.
    text = header + '+ '.join(terms)
    gain = class_entropy - weighted_entropy

    print(f'{text} = {weighted_entropy:.4g} ')
    print(f'IG({feature_name}|{class_name}) = H({class_name}) - MI'+'(' + feature_name + ')'+f' = {gain:.4g}\n')
    return gain

def gain_ratio(info_gain, p_feature, feature_name, class_name):
    """Print the worked gain-ratio calculation for one feature and return it.

    The gain ratio is the information gain divided by the entropy of the
    feature's own value distribution, H(feature).

    Args:
        info_gain: information gain of the feature (e.g. the value returned
            by the ``info_gain`` function; the parameter shadows that name
            inside this function only).
        p_feature: probability of each feature value, used for H(feature).
        feature_name: feature label used in the printed text.
        class_name: class label used in the printed text.

    Returns:
        ``info_gain / H(feature)``. The division is not guarded: a feature
        whose value distribution has zero entropy gives an undefined ratio.
    """
    # This is the entropy of the feature distribution, not of the class.
    feature_entropy = entropy(p_feature, '(' + feature_name + ')')
    ratio = info_gain / feature_entropy
    print(f'GR({feature_name}|{class_name}) = IG({feature_name}|{class_name})/H({feature_name}) = {ratio: .4g}')
    # Previously a bare `return` discarded the computed ratio.
    return ratio
    

In [20]:
"""Testing example from past exam _ 2020"""

# Labels used in the printed working.
feature_name = 'A1'
class_name = 'Class'
feature_value = ['True', 'False']

# P(class) and P(feature == value), the latter in feature_value order.
p_class = [0.5, 0.5]
p_feature = [0.5, 0.5]

# One class distribution per feature value, in feature_value order.
p_features_class = [
    [0.4, 0.6],
    [0.6, 0.4],
]

ig = info_gain(
    p_class,
    p_features_class,
    feature_value,
    p_feature,
    feature_name,
    class_name,
)

gain_ratio(ig, p_feature, feature_name, class_name)

H(Class) = -0.5*log2(0.5) + -0.5*log2(0.5)= 1
H(A1==True) = -0.4*log2(0.4) + -0.6*log2(0.6)= 0.971
H(A1==False) = -0.6*log2(0.6) + -0.4*log2(0.4)= 0.971
MI(A1) = P(A1==True)*H(A1==True)+ P(A1==False)*H(A1==False) = 0.971 
IG(A1|Class) = H(Class) - MI(A1) = 0.02905

H(A1) = -0.5*log2(0.5) + -0.5*log2(0.5)= 1
GR(A1|Class) = IG(A1|Class)/H(A1) =  0.02905


In [100]:
def chi_sq(observed_table, f='Feature',fval=['T', 'F'], cl='Class', cval=['T','F']):
    """Print a worked chi-square computation for a contingency table.

    Prints the observed table, each expected count
    (row total * column total / grand total), the expected table and the
    expanded chi-square sum.

    Args:
        observed_table: 2-D table of observed counts (nested sequence or
            array); rows are labelled with ``cval`` and columns with
            ``fval`` when printed.
        f: feature name used in the printed text.
        fval: labels of the feature values (column headers). The default
            list is never mutated here.
        cl: class name used in the printed text.
        cval: labels of the class values (row labels). The default list is
            never mutated here.

    Returns:
        The chi-square statistic, sum of (observed - expected)^2 / expected
        over all cells. The division is not guarded against expected
        counts of zero.
    """
    print(f'Feature: {f}\n')
    print(f'Observed table =')
    table(observed_table,f,fval,cl,cval)

    # Use the real shape on each axis so non-square tables are handled.
    n_rows = len(observed_table)
    n_cols = len(observed_table[0])

    sum_row = [sum(row) for row in observed_table]
    sum_col = [sum(row[col] for row in observed_table) for col in range(n_cols)]
    num_instances = sum(sum_row)

    def _expected_count(row_total, col_total, label):
        """Print and return the expected count for one cell."""
        res = row_total*col_total/num_instances
        print(f'E{label} = {row_total}*{col_total}/{num_instances} = {res}')
        return res

    expected_table = [
        [_expected_count(sum_row[i], sum_col[j], f'{i+1}{j+1}') for j in range(n_cols)]
        for i in range(n_rows)
    ]
    print(f'\nExpected table =')
    table(expected_table,f,fval,cl,cval)

    chi = 0
    terms = []
    for i in range(n_rows):
        for j in range(n_cols):
            observed = observed_table[i][j]
            expected = expected_table[i][j]
            terms.append(f'({observed} - {expected})^2/{expected}')
            chi += (observed - expected)**2/expected

    # Joining the terms avoids the stray leading ' + ' in the printed sum.
    text = ' + '.join(terms)
    print(f'Chi2({f}) = {text} = {chi}')
    return chi

def table(ot,f,fval,cl,cval):
    """Print a 2-D table with an added 'Total' column and 'Total' row.

    Args:
        ot: 2-D table of numbers (nested sequence or array).
        f: feature name; column headers are '<f>==<value>'.
        fval: feature value labels, one per column of ``ot``.
        cl: class name; row labels are '<cl>==<value>'.
        cval: class value labels, one per row of ``ot``.
    """
    counts = np.asarray(ot)
    # Append row totals as an extra column, then column totals as an extra
    # row. Axis sums follow the table's real shape, so the totals row has the
    # right width even when rows and columns differ in number.
    with_row_totals = np.column_stack([counts, counts.sum(axis=1)])
    mod_full = np.vstack([with_row_totals, with_row_totals.sum(axis=0)])
    print(tabulate(mod_full, headers=[f+'=='+v for v in fval]+['Total'], showindex=[cl+'=='+v for v in cval]+['Total'], tablefmt='pretty'))

In [101]:
"""Chi square"""

# Observed counts: one row per class value, one column per feature value.
observed_table = np.array([
    [2, 3],  # first class
    [3, 2],  # second class
])

chi_sq(observed_table, f='A1', fval=['T', 'F'], cl='Class', cval=['(+)', '(-)'])

Feature: A1

Observed table =
+------------+-------+-------+-------+
|            | A1==T | A1==F | Total |
+------------+-------+-------+-------+
| Class==(+) |   2   |   3   |   5   |
| Class==(-) |   3   |   2   |   5   |
|   Total    |   5   |   5   |  10   |
+------------+-------+-------+-------+
E11 = 5*5/10 = 2.5
E12 = 5*5/10 = 2.5
E21 = 5*5/10 = 2.5
E22 = 5*5/10 = 2.5

Expected table =
+------------+-------+-------+-------+
|            | A1==T | A1==F | Total |
+------------+-------+-------+-------+
| Class==(+) |  2.5  |  2.5  |  5.0  |
| Class==(-) |  2.5  |  2.5  |  5.0  |
|   Total    |  5.0  |  5.0  | 10.0  |
+------------+-------+-------+-------+
Chi2(A1) =  + (2 - 2.5)^2/2.5 + (3 - 2.5)^2/2.5 + (3 - 2.5)^2/2.5 + (2 - 2.5)^2/2.5 = 0.4


In [102]:
"""Chi square"""

# Observed counts: one row per class value, one column per feature value.
observed_table = np.array([
    [2, 3],  # first class
    [2, 3],  # second class
])

chi_sq(observed_table, f='A3', fval=['T', 'F'], cl='Class', cval=['(+)', '(-)'])

Feature: A3

Observed table =
+------------+-------+-------+-------+
|            | A3==T | A3==F | Total |
+------------+-------+-------+-------+
| Class==(+) |   2   |   3   |   5   |
| Class==(-) |   2   |   3   |   5   |
|   Total    |   4   |   6   |  10   |
+------------+-------+-------+-------+
E11 = 5*4/10 = 2.0
E12 = 5*6/10 = 3.0
E21 = 5*4/10 = 2.0
E22 = 5*6/10 = 3.0

Expected table =
+------------+-------+-------+-------+
|            | A3==T | A3==F | Total |
+------------+-------+-------+-------+
| Class==(+) |  2.0  |  3.0  |  5.0  |
| Class==(-) |  2.0  |  3.0  |  5.0  |
|   Total    |  4.0  |  6.0  | 10.0  |
+------------+-------+-------+-------+
Chi2(A3) =  + (2 - 2.0)^2/2.0 + (3 - 3.0)^2/3.0 + (2 - 2.0)^2/2.0 + (3 - 3.0)^2/3.0 = 0.0
