In [24]:
from IPython.display import display, HTML

# Inject a CSS override that sets the widths of the notebook container,
# menubar container and main toolbar container divs.
notebook_css = """
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""
display(HTML(data=notebook_css))

In [6]:
import pandas as pd
import numpy as np

In [7]:
# Read the sample CSV directly from its public URL; the printed frame below
# shows the columns admit, gre, gpa and rank.
mydata = pd.read_csv(
    filepath_or_buffer="https://stats.idre.ucla.edu/stat/data/binary.csv"
)

In [9]:
# Plain-text dump of the whole frame; pandas elides the middle rows
# (the `..` line in the captured output).
print(mydata)

     admit  gre   gpa  rank
0        0  380  3.61     3
1        1  660  3.67     3
2        1  800  4.00     1
3        1  640  3.19     4
4        0  520  2.93     4
5        1  760  3.00     2
6        1  560  2.98     1
7        0  400  3.08     2
8        1  540  3.39     3
9        0  700  3.92     2
10       0  800  4.00     4
11       0  440  3.22     1
12       1  760  4.00     1
13       0  700  3.08     2
14       1  700  4.00     1
15       0  480  3.44     3
16       0  780  3.87     4
17       0  360  2.56     3
18       0  800  3.75     2
19       1  540  3.81     1
20       0  500  3.17     3
21       1  660  3.63     2
22       0  600  2.82     4
23       0  680  3.19     4
24       1  760  3.35     2
25       1  800  3.66     1
26       1  620  3.61     1
27       1  520  3.74     4
28       1  780  3.22     2
29       0  520  3.29     1
..     ...  ...   ...   ...
370      1  540  3.77     2
371      1  680  3.76     3
372      1  680  2.42     1
373      1  620  3.3

In [21]:
# Python Function to calculate Information Value and WOE

def iv_woe(data, target, bins=10, show_woe=False):
    """Compute Weight of Evidence (WoE) and Information Value (IV) per predictor.

    Every column of ``data`` other than ``target`` is treated as a predictor.
    A numeric predictor (dtype kind in ``'bifc'``) with more than 10 distinct
    values is first cut into quantile bins with ``pd.qcut``; any other
    predictor is grouped on its raw values. For each group::

        % of Events     = max(Events, 0.5)     / total Events
        % of Non-Events = max(Non-Events, 0.5) / total Non-Events
        WoE             = ln(% of Events / % of Non-Events)
        IV              = WoE * (% of Events - % of Non-Events)

    The 0.5 floor keeps the ratio and its logarithm finite when a group holds
    no events or no non-events.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the predictors and the target column.
    target : column label
        Outcome column. Events are obtained by summing it within each group,
        so it should be coded 1 for an event and 0 otherwise.
    bins : int, default 10
        Number of quantile bins requested from ``pd.qcut``. Duplicate bin
        edges are dropped, so fewer bins may be produced.
    show_woe : bool, default False
        When truthy, print the WoE table of each predictor.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(iv_table, woe_table)``. ``iv_table`` has one row per predictor with
        columns ``Variable`` and ``IV``. ``woe_table`` stacks the per-group
        tables with columns ``Variable, Cutoff, N, Events, % of Events,
        Non-Events, % of Non-Events, WoE, IV``. Row labels are not reset: each
        predictor's rows keep their own 0-based index. Both frames are empty
        when ``data`` has no predictor columns.

    Side effects
    ------------
    Prints one "Information value of ..." line per predictor.
    """
    # Collect per-variable pieces and concatenate once at the end instead of
    # re-copying an ever-growing frame on every iteration.
    iv_frames, woe_frames = [], []

    # Run WoE and IV on all the independent variables
    cols = data.columns
    for ivars in cols[~cols.isin([target])]:
        column = data[ivars]
        if (column.dtype.kind in 'bifc') and (len(np.unique(column)) > 10):
            # High-cardinality numeric predictor: group on quantile bins.
            grouping = pd.qcut(column, bins, duplicates='drop')
        else:
            # Low-cardinality or non-numeric predictor: group on raw values.
            grouping = column
        d0 = pd.DataFrame({'x': grouping, 'y': data[target]})

        # observed=False is stated explicitly so categorical groupers behave
        # the same regardless of the pandas default.
        d = d0.groupby("x", observed=False)["y"].agg(["count", "sum"]).reset_index()
        d.columns = ['Cutoff', 'N', 'Events']
        d['% of Events'] = np.maximum(d['Events'], 0.5) / d['Events'].sum()
        d['Non-Events'] = d['N'] - d['Events']
        d['% of Non-Events'] = np.maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()
        d['WoE'] = np.log(d['% of Events'] / d['% of Non-Events'])
        d['IV'] = d['WoE'] * (d['% of Events'] - d['% of Non-Events'])
        d.insert(loc=0, column='Variable', value=ivars)

        iv_total = d['IV'].sum()
        # str(ivars) keeps the message working for non-string column labels.
        print("Information value of " + str(ivars) + " is " + str(round(iv_total, 6)))

        iv_frames.append(
            pd.DataFrame({"Variable": [ivars], "IV": [iv_total]}, columns=["Variable", "IV"])
        )
        woe_frames.append(d)

        # Show WoE table
        if show_woe:
            print(d)

    if not iv_frames:
        # No predictors at all: pd.concat rejects an empty list.
        return pd.DataFrame(), pd.DataFrame()

    return pd.concat(iv_frames, axis=0), pd.concat(woe_frames, axis=0)

In [25]:
# Score every predictor against the `admit` target with 10 requested bins,
# printing each per-variable WoE table along the way, then print the IV
# summary followed by the stacked WoE table.
iv, woe = iv_woe(
    data=mydata,
    target='admit',
    bins=10,
    show_woe=True,
)
print(iv, woe, sep="\n")

Information value of gre is 0.312882
  Variable            Cutoff   N  Events  % of Events  Non-Events  \
0      gre  (219.999, 440.0]  48       6     0.047244          42   
1      gre    (440.0, 500.0]  51      12     0.094488          39   
2      gre    (500.0, 520.0]  24      10     0.078740          14   
3      gre    (520.0, 560.0]  51      15     0.118110          36   
4      gre    (560.0, 580.0]  29       6     0.047244          23   
5      gre    (580.0, 620.0]  53      21     0.165354          32   
6      gre    (620.0, 660.0]  45      17     0.133858          28   
7      gre    (660.0, 680.0]  20       9     0.070866          11   
8      gre    (680.0, 740.0]  44      12     0.094488          32   
9      gre    (740.0, 800.0]  35      19     0.149606          16   

   % of Non-Events       WoE        IV  
0         0.153846 -1.180625  0.125857  
1         0.142857 -0.413370  0.019994  
2         0.051282  0.428812  0.011774  
3         0.131868 -0.110184  0.001516 