In [1]:
import scorecardpy as sc
import BDMLtools as bm
import pandas as pd

+ load germancredit data

In [2]:
# Load the German credit sample bundled with scorecardpy and recode the
# target column as integers ('good' -> 0, 'bad' -> 1). `assign` returns a
# new frame, so the frame returned by the loader is left untouched.
dt = sc.germancredit().assign(
    creditability=lambda frame: frame['creditability'].map({'good': 0, 'bad': 1})
)

+ 将germancredit的有序category类型转换为有序数值类型
+ bm.dtypeAllocator的作用是将各列的类型分配为number或object两种类型,并限制数据精度
    + bm库的分箱及评分卡制作限于这两种类型

In [3]:
# Build the BDMLtools dtype allocator, fit it on the loaded frame and use it
# to re-type that same frame. Per the notebook notes, bm's binning and
# scorecard tooling only works with number/object columns.
# NOTE(review): assumes `fit` returns the estimator itself (sklearn
# convention), so `da` ends up bound to the fitted allocator.
da = bm.dtypeAllocator()
dt = da.fit(dt).transform(dt)

In [4]:
# Separate the target series from the feature frame.
y = dt['creditability']
X = dt.drop(columns='creditability')

## 分箱对比

### 分箱-tree

总结:

+ BDMLtools的tree分箱的底层使用numpy，按照scorecardpy分箱逻辑进行了重写
+ tree最优分箱中两者的分箱结果大体一致,在一些细节上有不同
+ BDMLtools的tree分箱运行效率更佳

+ scorecardpy

In [5]:
%%time
bins_sc = sc.woebin(dt, y = "creditability",method="tree",bin_num_limit=5,count_distr_limit=0.05,no_cores=1)

[INFO] creating woe binning ...
CPU times: user 3.29 s, sys: 19.8 ms, total: 3.31 s
Wall time: 3.33 s


+ BDMLtools

In [6]:
%%time
bins_bm=bm.binSelector(method='tree',bin_num_limit=5,distr_limit=0.05,iv_limit=0,n_jobs=1).fit(X,y).bins

CPU times: user 649 ms, sys: 5.05 ms, total: 654 ms
Wall time: 656 ms


+ 对比结果

In [7]:
# Display scorecardpy's tree-binning table for the 'age.in.years' feature.
bins_sc['age.in.years']

Unnamed: 0,variable,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values
0,age.in.years,"[-inf,26.0)",190,0.19,110,80,0.421053,0.528844,0.057921,0.130499,26.0,False
1,age.in.years,"[26.0,28.0)",101,0.101,74,27,0.267327,-0.16093,0.002529,0.130499,28.0,False
2,age.in.years,"[28.0,35.0)",257,0.257,172,85,0.330739,0.142455,0.005359,0.130499,35.0,False
3,age.in.years,"[35.0,37.0)",79,0.079,67,12,0.151899,-0.872488,0.04861,0.130499,37.0,False
4,age.in.years,"[37.0,inf)",373,0.373,277,96,0.257373,-0.212371,0.01608,0.130499,inf,False


In [8]:
# Display BDMLtools' tree-binning table for the same feature, for a
# side-by-side comparison with the scorecardpy table.
bins_bm['age.in.years']

Unnamed: 0_level_0,variable,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,ks,ks_max,breaks
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"[-inf, 26.0)",age.in.years,190,0.19,110.0,80.0,0.421053,0.528844,0.057921,0.130499,0.109524,0.131429,26.0
"[26.0, 28.0)",age.in.years,101,0.101,74.0,27.0,0.267327,-0.16093,0.002529,0.130499,0.09381,0.131429,28.0
"[28.0, 35.0)",age.in.years,257,0.257,172.0,85.0,0.330739,0.142455,0.005359,0.130499,0.131429,0.131429,35.0
"[35.0, 37.0)",age.in.years,79,0.079,67.0,12.0,0.151899,-0.872488,0.04861,0.130499,0.075714,0.131429,37.0
"[37.0, inf)",age.in.years,373,0.373,277.0,96.0,0.257373,-0.212371,0.01608,0.130499,0.0,0.131429,inf
missing,age.in.years,0,0.0,0.0,0.0,,0.0,0.0,0.130499,0.0,0.131429,missing


+ iv对比

In [9]:
# Per-variable IV for each library: stack all bin tables, sum `bin_iv`
# within each variable, and place the two resulting series side by side.
pd.concat(
    [
        pd.concat(tables).groupby('variable')['bin_iv'].sum().rename(label)
        for label, tables in (('iv_sc', bins_sc), ('iv_bm', bins_bm))
    ],
    axis=1,
)

Unnamed: 0_level_0,iv_sc,iv_bm
variable,Unnamed: 1_level_1,Unnamed: 2_level_1
age.in.years,0.130499,0.130499
credit.amount,0.18122,0.178117
credit.history,0.29183,0.29183
duration.in.month,0.282618,0.282618
foreign.worker,0.0,0.0
housing,0.083293,0.083293
installment.rate.in.percentage.of.disposable.income,0.025569,0.026322
job,0.008095,0.008095
number.of.existing.credits.at.this.bank,0.010084,0.010084
number.of.people.being.liable.to.provide.maintenance.for,4.3e-05,4.3e-05


### 分箱-chi2merge

+ BDMLtools的chi2分箱的底层使用numpy，按照scorecardpy的分箱逻辑进行了重写
+ chi2分箱中两者的分箱结果大体一致,部分不一致在于BDMLtools修复了scorecardpy中breaks与bin对应不一致问题
+ BDMLtools的chi2分箱运行效率更佳

In [10]:
%%time
bins_sc_chi = sc.woebin(dt, y = "creditability",method="chimerge",bin_num_limit=5,count_distr_limit=0.05,no_cores=1)

[INFO] creating woe binning ...
CPU times: user 5.63 s, sys: 52.4 ms, total: 5.68 s
Wall time: 5.76 s


In [11]:
%%time
bins_bm_chi=bm.binSelector(method='chi2',bin_num_limit=5,distr_limit=0.05,iv_limit=0,n_jobs=1).fit(X,y).bins

CPU times: user 1.07 s, sys: 5.02 ms, total: 1.07 s
Wall time: 1.08 s


+ 结果对比

In [12]:
# Display scorecardpy's ChiMerge binning table for 'age.in.years'.
bins_sc_chi['age.in.years']

Unnamed: 0,variable,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values
0,age.in.years,"[-inf,26.0)",190,0.19,110,80,0.421053,0.528844,0.057921,0.123935,26.0,False
1,age.in.years,"[26.0,35.0)",358,0.358,246,112,0.312849,0.060465,0.001324,0.123935,35.0,False
2,age.in.years,"[35.0,37.0)",79,0.079,67,12,0.151899,-0.872488,0.04861,0.123935,37.0,False
3,age.in.years,"[37.0,inf)",373,0.373,277,96,0.257373,-0.212371,0.01608,0.123935,inf,False


In [13]:
# Display BDMLtools' chi2 binning table for 'age.in.years', to compare
# against the scorecardpy ChiMerge table.
bins_bm_chi['age.in.years']

Unnamed: 0_level_0,variable,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,ks,ks_max,breaks
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"[-inf, 26.0)",age.in.years,190,0.19,110.0,80.0,0.421053,0.528844,0.057921,0.123935,0.109524,0.131429,26.0
"[26.0, 35.0)",age.in.years,358,0.358,246.0,112.0,0.312849,0.060465,0.001324,0.123935,0.131429,0.131429,35.0
"[35.0, 37.0)",age.in.years,79,0.079,67.0,12.0,0.151899,-0.872488,0.04861,0.123935,0.075714,0.131429,37.0
"[37.0, inf)",age.in.years,373,0.373,277.0,96.0,0.257373,-0.212371,0.01608,0.123935,0.0,0.131429,inf
missing,age.in.years,0,0.0,0.0,0.0,,0.0,0.0,0.123935,0.0,0.131429,missing


In [14]:
# Per-variable IV under chi-square binning: stack all bin tables, sum
# `bin_iv` within each variable, and place both libraries' series side by side.
pd.concat(
    [
        pd.concat(tables).groupby('variable')['bin_iv'].sum().rename(label)
        for label, tables in (('iv_sc', bins_sc_chi), ('iv_bm', bins_bm_chi))
    ],
    axis=1,
)

Unnamed: 0_level_0,iv_sc,iv_bm
variable,Unnamed: 1_level_1,Unnamed: 2_level_1
age.in.years,0.123935,0.123935
credit.amount,0.171431,0.114711
credit.history,0.291829,0.246868
duration.in.month,0.282618,0.282618
foreign.worker,0.0,0.0
housing,0.083293,0.038878
installment.rate.in.percentage.of.disposable.income,0.023859,0.019769
job,0.0,0.0
number.of.existing.credits.at.this.bank,0.0,0.0
number.of.people.being.liable.to.provide.maintenance.for,0.0,0.0


## BDMLtools对scorecardpy分箱算法的补充1:更多的分箱算法

bm.binSelector的method:
+ 'freq':数值等频分箱，分类特征按其类别分箱
+ 'freq-kmeans':基于Kmeans，对freq-cut结果进行自动调整，以将badrate近似的箱进行合并
+ 'pretty':使用Pretty Breakpoints获取数值特征分箱点
    + pretty分箱点更加美观，适合报告、绘图
    + 详见R的pretty函数
+ 'tree':决策树,分裂iv/ks增益最高的切分点形成新分箱直到达到终止条件
+ 'chi2':卡方,先等频预分箱,再逐步合并卡方值低于阈值(即交叉表卡方检验差异不显著)的相邻分箱

## BDMLtools对scorecardpy分箱算法的补充2:强制单调

bm.binSelector可以强制任意分箱算法实现数值特征的单调分箱,以tree为例
+ 参数coerce_monotonic=True即可实现单调分箱
+ 细节请查看binSelector的帮助文档

In [15]:
# Tree binning with the same limits as before, plus coerce_monotonic=True.
# The notebook text above describes this flag as forcing monotonic bins for
# numeric features. Only the fitted selector's `bins` attribute is kept.
bin_monotonic = (
    bm.binSelector(
        method='tree',
        bin_num_limit=5,
        distr_limit=0.05,
        iv_limit=0,
        n_jobs=1,
        coerce_monotonic=True,
    )
    .fit(X, y)
    .bins
)

+ age.in.years

In [16]:
# Display the monotonic-coerced bin table for 'age.in.years'.
bin_monotonic['age.in.years']

Unnamed: 0_level_0,variable,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,ks,ks_max,breaks
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"[-inf, 26.0)",age.in.years,190,0.19,110.0,80.0,0.421053,0.528844,0.057921,0.100182,0.109524,0.131429,26.0
"[26.0, 30.0)",age.in.years,181,0.181,124.0,57.0,0.314917,0.070068,0.000901,0.100182,0.122381,0.131429,30.0
"[30.0, 35.0)",age.in.years,177,0.177,122.0,55.0,0.310734,0.05061,0.000458,0.100182,0.131429,0.131429,35.0
"[35.0, inf)",age.in.years,452,0.452,344.0,108.0,0.238938,-0.311213,0.040902,0.100182,0.0,0.131429,inf
missing,age.in.years,0,0.0,0.0,0.0,,0.0,0.0,0.100182,0.0,0.131429,missing


+ credit.amount

In [17]:
# Display the monotonic-coerced bin table for 'credit.amount'.
bin_monotonic['credit.amount']

Unnamed: 0_level_0,variable,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,ks,ks_max,breaks
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"[-inf, 4000.0)",credit.amount,754,0.754,559.0,195.0,0.258621,-0.205852,0.030584,0.137631,0.148571,0.148571,4000.0
"[4000.0, 6200.0)",credit.amount,104,0.104,65.0,39.0,0.375,0.336472,0.012498,0.137631,0.111429,0.148571,6200.0
"[6200.0, 8000.0)",credit.amount,72,0.072,44.0,28.0,0.388889,0.395313,0.012048,0.137631,0.080952,0.148571,8000.0
"[8000.0, inf)",credit.amount,70,0.07,32.0,38.0,0.542857,1.019148,0.082502,0.137631,0.0,0.148571,inf
missing,credit.amount,0,0.0,0.0,0.0,,0.0,0.0,0.137631,0.0,0.148571,missing


In [18]:
# Report, for every binned variable, whether its bad rate is monotonic
# across bins (either direction).
for key, vtab in bin_monotonic.items():
    # The 'missing' row is left out: in the tables displayed above its
    # badprob is empty, which would break the monotonicity test.
    badprob = vtab.loc[vtab.index != 'missing', 'badprob']

    is_monotonic = badprob.is_monotonic_increasing or badprob.is_monotonic_decreasing
    template = (
        'monotonic trend shows in {}'
        if is_monotonic
        else 'no monotonic trend shows in {}'
    )
    print(template.format(key))

monotonic trend shows in age.in.years
monotonic trend shows in credit.amount
monotonic trend shows in credit.history
monotonic trend shows in duration.in.month
monotonic trend shows in foreign.worker
monotonic trend shows in housing
monotonic trend shows in installment.rate.in.percentage.of.disposable.income
monotonic trend shows in job
monotonic trend shows in number.of.existing.credits.at.this.bank
monotonic trend shows in number.of.people.being.liable.to.provide.maintenance.for
monotonic trend shows in other.debtors.or.guarantors
monotonic trend shows in other.installment.plans
monotonic trend shows in personal.status.and.sex
monotonic trend shows in present.employment.since
monotonic trend shows in present.residence.since
monotonic trend shows in property
monotonic trend shows in purpose
monotonic trend shows in savings.account.and.bonds
monotonic trend shows in status.of.existing.checking.account
monotonic trend shows in telephone
