In [24]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

print('numpy version : ',np.__version__)
print('pandas version : ',pd.__version__)
print('seaborn version : ',sns.__version__)

numpy version :  1.20.1
pandas version :  1.2.4
seaborn version :  0.11.1


In [25]:
mydata = pd.read_csv('german.csv')

In [12]:
def iv_woe(data, target, bins=10, show_woe=False):
    
    #Empty Dataframe
    newDF,woeDF = pd.DataFrame(), pd.DataFrame()
    
    #Extract Column Names
    cols = data.columns
    
    #Run WOE and IV on all the independent variables
    for ivars in cols[~cols.isin([target])]:
        if (data[ivars].dtype.kind in 'bifc') and (len(np.unique(data[ivars]))>10):
            binned_x = pd.qcut(data[ivars], bins,  duplicates='drop')
            d0 = pd.DataFrame({'x': binned_x, 'y': data[target]})
        else:
            d0 = pd.DataFrame({'x': data[ivars], 'y': data[target]})
        d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
        d.columns = ['Cutoff', 'N', 'Events']
        d['% of Events'] = np.maximum(d['Events'], 0.5) / d['Events'].sum()
        d['Non-Events'] = d['N'] - d['Events']
        d['% of Non-Events'] = np.maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()
        d['WoE'] = np.log(d['% of Events']/d['% of Non-Events'])
        d['IV'] = d['WoE'] * (d['% of Events'] - d['% of Non-Events'])
        d.insert(loc=0, column='Variable', value=ivars)
        print("Information value of " + ivars + " is " + str(round(d['IV'].sum(),6)))
        temp =pd.DataFrame({"Variable" : [ivars], "IV" : [d['IV'].sum()]}, columns = ["Variable", "IV"])
        newDF=pd.concat([newDF,temp], axis=0)
        woeDF=pd.concat([woeDF,d], axis=0)

        #Show WOE Table
        if show_woe == True:
            print(d)
    return newDF, woeDF

In [28]:
mydata[mydata['V21'] == 2]

Unnamed: 0.1,Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21


In [27]:
def segment(x):
    if x['V21'] == '2':
        class_encoded = '1'
    else:
        class_encoded = '0'
    return class_encoded

mydata['V21'] = mydata.apply(lambda x: segment(x), axis=1)

In [29]:
mydata["V21"] = pd.to_numeric(mydata["V21"], downcast="float")

In [30]:
def calc_iv(df, feature, target, pr=False):

    d1 = df.groupby(by=feature, as_index=True)
    data = pd.DataFrame()

    data['all'] = d1[target].count()
    data['bad'] = d1[target].sum()
    data['good'] = data['all']-data['bad']
    data['share'] = data['all'] / data['all'].sum()
    data['bad_rate'] = d1[target].mean()
    data['d_g'] = (data['all'] - data['bad']) / (data['all'] - data['bad']).sum()
    data['d_b'] = data['bad'] / data['bad'].sum()
    data['woe'] = np.log(data['d_g'] / data['d_b'])
    data = data.replace({'woe': {np.inf: 0, -np.inf: 0}})
    data['iv'] = data['woe'] * (data['d_g'] - data['d_b'])

    data.insert(0, 'variable', feature)
    data.insert(1, 'value', data.index)
    data.index = range(len(data))

    iv = data['iv'].sum()

    if pr:
        print(data)
        print('IV = %s' % iv)

    return iv, data

In [31]:
iv, data = calc_iv(mydata, 'V5', 'V21')

In [32]:
data

Unnamed: 0,variable,value,all,bad,good,share,bad_rate,d_g,d_b,woe,iv
0,V5,250,1,0.0,1.0,0.001,0.0,0.001,,,
1,V5,276,1,0.0,1.0,0.001,0.0,0.001,,,
2,V5,338,1,0.0,1.0,0.001,0.0,0.001,,,
3,V5,339,1,0.0,1.0,0.001,0.0,0.001,,,
4,V5,343,1,0.0,1.0,0.001,0.0,0.001,,,
...,...,...,...,...,...,...,...,...,...,...,...
916,V5,15653,1,0.0,1.0,0.001,0.0,0.001,,,
917,V5,15672,1,0.0,1.0,0.001,0.0,0.001,,,
918,V5,15857,1,0.0,1.0,0.001,0.0,0.001,,,
919,V5,15945,1,0.0,1.0,0.001,0.0,0.001,,,
