In [11]:
import pandas as pd
import numpy as np

In [12]:
def calc_var_entropy(df, var):
    """Return the Shannon entropy, in bits, of column ``var`` of ``df``.

    Each distinct level of ``df[var]`` gets the probability
    ``rows_with_that_level / len(df)`` and the result is
    ``-sum(p * log2(p))`` over the levels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the observations.
    var : str
        Name of the column whose entropy is computed.

    Returns
    -------
    float
        Entropy in bits. ``0.0`` for an empty frame or a column with a
        single level.
    """
    total_count = len(df)
    if total_count == 0:
        # No observations at all: report zero entropy instead of failing.
        return 0.0

    # size() counts the rows of every level directly. Counting the non-null
    # cells of "the first other column" would fail on a single-column frame
    # and would under-count whenever that other column has missing values.
    per_level_count = df.groupby(by=var).size().values

    # Levels with no rows (e.g. unobserved categories) contribute nothing;
    # dropping them avoids evaluating 0 * log2(0), which is NaN.
    per_level_count = per_level_count[per_level_count > 0]

    prob = per_level_count / float(total_count)
    entropy = -np.sum(prob * np.log2(prob))

    # Adding +0.0 turns the negative zero obtained for a single-level column
    # into a plain 0.0.
    return float(entropy) + 0.0

def gain_ratio(df, features, target):
    """Rank ``features`` by their information gain ratio with respect to ``target``.

    For every feature the function computes

    * the information gain: entropy of ``target`` minus the entropy of
      ``target`` within each level of the feature, weighted by the share of
      rows (out of ``len(df)``) falling in that level;
    * the split information: entropy of the feature itself;
    * the gain ratio: gain divided by split information.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the target column.
    features : iterable of str
        Names of the candidate feature columns.
    target : str
        Name of the target column.

    Returns
    -------
    pandas.DataFrame
        Columns ``'feature'`` and ``'gain_ratio'``, sorted by ``'gain_ratio'``
        in descending order with a fresh 0..n-1 index. A feature whose split
        information is zero (a single level) gets ``NaN``.
    """
    total_count = len(df)
    target_entropy = calc_var_entropy(df, target)

    # Collect plain records and build the frame once at the end; concatenating
    # a new one-row frame per feature is quadratic in the number of features.
    records = []
    for feat in features:
        # Weighted average of the target entropy inside each feature level.
        # Iterating the groups keeps each weight paired with its own sub-frame
        # and takes the level size from the row count itself.
        conditional_entropy = 0.0
        for _, level_df in df.groupby(by=feat):
            weight = len(level_df) / float(total_count)
            conditional_entropy += weight * calc_var_entropy(level_df, target)

        gain = target_entropy - conditional_entropy
        split_info = calc_var_entropy(df, feat)

        if split_info == 0:
            # Single-level feature: the ratio is 0/0. Report NaN explicitly
            # rather than relying on a floating-point division by zero.
            ratio = float('nan')
        else:
            ratio = gain / float(split_info)
        records.append((feat, ratio))

    gain_ratio_df = pd.DataFrame(records, columns=['feature', 'gain_ratio'])
    gain_ratio_df = gain_ratio_df.sort_values(by=['gain_ratio'], ascending=False)
    return gain_ratio_df.reset_index(drop=True)

In [13]:
# Toy data: Var1 is constant (a single level), while Var2 alternates evenly
# between two levels.
df = pd.DataFrame({'Var1': ['Yes'] * 10,
                   'Var2': ['Heads', 'Tail'] * 5},
                  columns=['Var1', 'Var2'])
# Parenthesized print works on both Python 2 and Python 3.
print(df)

  Var1   Var2
0  Yes  Heads
1  Yes   Tail
2  Yes  Heads
3  Yes   Tail
4  Yes  Heads
5  Yes   Tail
6  Yes  Heads
7  Yes   Tail
8  Yes  Heads
9  Yes   Tail


## Low entropy, low disorder. Information is concentrated

In [14]:
# Var1 has a single level. Parenthesized print works on Python 2 and 3.
print(calc_var_entropy(df, 'Var1'))

-0.0


## High entropy, high disorder. Information is random

In [15]:
# Var2 is split evenly between two levels. Parenthesized print works on Python 2 and 3.
print(calc_var_entropy(df, 'Var2'))

1.0
