In [1]:
import numpy as np
import pandas as pd
from pprint import pprint
import scorecardpy as sc  # pip install scorecardpy -i https://pypi.douban.com/simple

# Load the demo dataset through scorecardpy. The cells below expect this frame
# to provide the columns "creditability", "credit.amount",
# "status.of.existing.checking.account" and "job".
# NOTE(review): presumably the German credit data bundled with scorecardpy — confirm.
dat = sc.germancredit()

In [2]:
# Encode the target as integers: "good" -> 0, "bad" -> 1. An explicit mapping is
# used instead of `replace`, which depends on implicit object->int downcasting.
dat["y"] = dat["creditability"].map({"good": 0, "bad": 1})

# Work on a copy restricted to three features plus the target, with short names:
#   x1 = credit.amount, x2 = status.of.existing.checking.account, x3 = job.
data = dat[["credit.amount", "status.of.existing.checking.account", "job", "y"]].copy()
data.columns = ["x1", "x2", "x3", "y"]

# Append 20 extra rows (index labels 1000..1019) whose features are all missing,
# so the binning calls further down have NaN values to handle. A single reindex
# creates the rows in one step (new rows are filled with NaN) instead of
# enlarging the frame one cell at a time.
na_labels = list(range(1000, 1020))
data = data.reindex(list(data.index) + na_labels)

# Target for the synthetic rows: 0 when the label is a multiple of 3, else 1.
data.loc[na_labels, "y"] = [1 if i % 3 else 0 for i in na_labels]

In [3]:
# Preview the first rows of the prepared frame (columns x1, x2, x3 and target y).
data.head()

Unnamed: 0,x1,x2,x3,y
0,1169.0,... < 0 DM,skilled employee / official,0.0
1,5951.0,0 <= ... < 200 DM,skilled employee / official,1.0
2,2096.0,no checking account,unskilled - resident,0.0
3,7882.0,... < 0 DM,skilled employee / official,0.0
4,4870.0,... < 0 DM,skilled employee / official,1.0


In [4]:
# Execute mono_bin.py with IPython's %run magic; names the script defines at
# top level become available in the notebook namespace afterwards.
# NOTE(review): mono_bin_fit and mono_bin_adjust used below presumably come
# from this script — its contents are not visible here.
%run mono_bin.py

In [5]:
# Fit a binning of the numeric feature x1 (credit amount) against target y.
# The option semantics are defined by mono_bin_fit in mono_bin.py and are not
# visible here; they are collected in one place so they are easy to scan/tune.
x1_fit_kwargs = dict(
    max_bin_cnt=3,
    min_total=5,
    min_bad=1,
    min_good=1,
    min_woe_diff=0.1,
    right=True,
    ascending=False,
    try_na_separate=False,
    return_rule=True,
)

# With return_rule=True the call yields a pair: a per-bin summary table and a
# rule dict (bins, woe, iv, NaN replacement value, ...).
data1, rule1 = mono_bin_fit(data, "x1", "y", **x1_fit_kwargs)

pprint(rule1)  # printed rule dict
data1  # last expression: rich display of the per-bin summary

{'bins': [-inf, 625.0, 10875.0, inf],
 'iv': 0.18404341023915186,
 'na_replace_val': 14179.0,
 'raw_is_continuous': True,
 'right': True,
 'woe': [-0.9815372176430663, -0.07943968794256466, 1.6856909889388887]}


Unnamed: 0,bin_no,bin,total,total_pct,bad,bad_pct,good,good_pct,bad_rate,woe,woe_diff,iv
0,1.0,"(-inf, 625.0]",28,0.027451,4.0,0.012739,24.0,0.033994,0.142857,-0.981537,-0.902098,0.020863
1,2.0,"(625.0, 10875.0]",941,0.922549,274.0,0.872611,667.0,0.944759,0.29118,-0.07944,-1.765131,0.005731
2,3.0,"(10875.0, inf] or nan",51,0.05,36.0,0.11465,15.0,0.021246,0.705882,1.685691,,0.157449


In [6]:
# Manually override the fitted rule for x1 (mutates rule1 in place):
#   - bins: a single cut point at 500 instead of the fitted edges
#   - na_replace_val: 99
# NOTE(review): judging by the outputs, NaN ends up in the bin that contains
# na_replace_val — confirm against mono_bin.py.
rule1.update(
    {
        "bins": [-np.inf, 500, np.inf],
        "na_replace_val": 99,
    }
)

In [7]:
# Re-evaluate x1 against y with the hand-edited rule1 (custom bin edges and NaN
# replacement value). The call returns a per-bin summary table and a rule dict;
# in the recorded output the returned rule keeps the supplied bin edges.
data2, rule2 = mono_bin_adjust(data, rule1, "x1", "y")
pprint(rule2)  # printed rule dict
data2  # last expression: rich display of the per-bin summary

{'bins': [-inf, 500, inf],
 'iv': 0.015231744700225813,
 'na_replace_val': 99,
 'raw_is_continuous': True,
 'right': True,
 'woe': [0.5989131579177818, -0.0254644478745369]}


Unnamed: 0,bin_no,bin,total,total_pct,bad,bad_pct,good,good_pct,bad_rate,woe,woe_diff,iv
0,1,"(-inf, 500.0] or nan",38,0.037255,17.0,0.05414,21.0,0.029745,0.447368,0.598913,0.624378,0.014611
1,2,"(500.0, inf]",982,0.962745,297.0,0.94586,685.0,0.970255,0.302444,-0.025464,,0.000621


In [8]:
# Fit a binning of the categorical feature x3 (job) against target y, using the
# same constraints as for x1. The option semantics are defined by mono_bin_fit
# in mono_bin.py and are not visible here.
x3_fit_kwargs = dict(
    max_bin_cnt=3,
    min_total=5,
    min_bad=1,
    min_good=1,
    min_woe_diff=0.1,
    right=True,
    ascending=False,
    try_na_separate=False,
    return_rule=True,
)

# With return_rule=True the call yields a pair: a per-bin summary table and a
# rule dict; for this feature the rule's bins are lists of category labels.
data3, rule3 = mono_bin_fit(data, "x3", "y", **x3_fit_kwargs)

pprint(rule3)  # printed rule dict
data3  # last expression: rich display of the per-bin summary

{'bins': [['unskilled - resident', 'skilled employee / official'],
          ['unemployed/ unskilled - non-resident'],
          ['management/ self-employed/ highly qualified employee/ officer',
           nan]],
 'iv': 0.026254530699283882,
 'raw_is_continuous': False,
 'woe': [-0.0775669701569518, 0.04808219953809194, 0.3498805332509901]}


Unnamed: 0,bin_no,bin,total,total_pct,bad,bad_pct,good,good_pct,bad_rate,woe,woe_diff,iv
0,1.0,"{unskilled - resident,skilled employee / offic...",830,0.813725,242.0,0.770701,588.0,0.832861,0.291566,-0.077567,-0.125649,0.004822
1,2.0,{unemployed/ unskilled - non-resident},22,0.021569,7.0,0.022293,15.0,0.021246,0.318182,0.048082,-0.301798,5e-05
2,3.0,{management/ self-employed/ highly qualified e...,168,0.164706,65.0,0.207006,103.0,0.145892,0.386905,0.349881,,0.021383
