# 使用BDMLtools做策略分析报告 <font size=3 >by 曾珂</font>

In [1]:
import scorecardpy as sc
import BDMLtools as bm
import pandas as pd
import numpy as np

+ load germancredit data

In [2]:
dt=sc.germancredit().copy()
dt['creditability']=dt['creditability'].map({'good':0,'bad':1})

+ 将germancredit的数据类型进行指定
    + num列-->float64
    + str列-->object
    + date列-->datetime

In [3]:
dtypes_dict={
    'num':['age.in.years',
         'credit.amount',
         'creditability',
         'duration.in.month',
         'installment.rate.in.percentage.of.disposable.income',
         'number.of.existing.credits.at.this.bank',
         'number.of.people.being.liable.to.provide.maintenance.for',
         'present.residence.since'],
    'str':['housing','telephone','foreign.worker','purpose','job','personal.status.and.sex','property',
           'credit.history','savings.account.and.bonds','present.employment.since',
           'status.of.existing.checking.account',
           'other.installment.plans','other.debtors.or.guarantors'],
    'date':[]
}

In [4]:
da=bm.dtypeAllocator(dtypes_dict={}).fit(dt)
dt=da.transform(dt)
X=dt.drop('creditability',axis=1)
y=dt['creditability']

## BDMLtools中的报告

实践进行策略分析时，有时希望快速生成策略特征的分析报告,BDMLtools提供了三种特征分析库:
    
+ varReport:特征分析报告
+ varReportSinge:单特征分析报告
+ varGroupsReport:组特征分析报告

## varReport

### 基本用法

一般特征分析需先进行分箱再生成报告,这里使用binSelector进行最优分箱

In [5]:
bin_tree=bm.binSelector(method='tree',bin_num_limit=8,n_jobs=1,iv_limit=0).fit(X,y)

+ bin_tree中的属性breaks_list为分箱字典,包含了所有特征的分箱结果,格式与scorecardpy一致
+ 将breaks_list作为varReport的入参以按照指定的breaks_list分箱产生特征分析报告

In [6]:
vtab=bm.varReport(bin_tree.breaks_list,n_jobs=1).fit(X,y)

+ vtab的var_report_dict保存了所有特征的分箱结果,结构与scorecardpy一致

In [7]:
vtab.var_report_dict.keys()

dict_keys(['age.in.years', 'credit.amount', 'credit.history', 'duration.in.month', 'foreign.worker', 'housing', 'installment.rate.in.percentage.of.disposable.income', 'job', 'number.of.existing.credits.at.this.bank', 'number.of.people.being.liable.to.provide.maintenance.for', 'other.debtors.or.guarantors', 'other.installment.plans', 'personal.status.and.sex', 'present.employment.since', 'present.residence.since', 'property', 'purpose', 'savings.account.and.bonds', 'status.of.existing.checking.account', 'telephone'])

In [8]:
vtab.var_report_dict['credit.amount']

Unnamed: 0_level_0,variable,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,ks,ks_max,breaks
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"[-inf, 1400.0)",credit.amount,267,0.267,185.0,82.0,0.307116,0.033661,0.000305,0.219101,0.009047619,0.148571,1400.0
"[1400.0, 1800.0)",credit.amount,105,0.105,87.0,18.0,0.171429,-0.728238,0.046815,0.219101,0.0552381,0.148571,1800.0
"[1800.0, 2000.0)",credit.amount,60,0.06,39.0,21.0,0.35,0.228259,0.003261,0.219101,0.04095238,0.148571,2000.0
"[2000.0, 3400.0)",credit.amount,242,0.242,182.0,60.0,0.247934,-0.262364,0.015742,0.219101,0.1009524,0.148571,3400.0
"[3400.0, 4000.0)",credit.amount,80,0.08,66.0,14.0,0.175,-0.7033,0.03349,0.219101,0.1485714,0.148571,4000.0
"[4000.0, 5000.0)",credit.amount,58,0.058,31.0,27.0,0.465517,0.709148,0.032418,0.219101,0.1028571,0.148571,5000.0
"[5000.0, 9200.0)",credit.amount,138,0.138,89.0,49.0,0.355072,0.250482,0.009065,0.219101,0.06666667,0.148571,9200.0
"[9200.0, inf)",credit.amount,50,0.05,21.0,29.0,0.58,1.170071,0.078005,0.219101,1.110223e-16,0.148571,inf
missing,credit.amount,0,0.0,0.0,0.0,,0.0,0.0,0.219101,1.110223e-16,0.148571,missing


这里每一个报表中bin索引,存在missing行用于标示缺失值,数据中的np.nan将被计算进缺失值行中,若数据没有缺失值则为默认值

In [9]:
pd.concat(vtab.var_report_dict)

Unnamed: 0_level_0,Unnamed: 1_level_0,variable,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,ks,ks_max,breaks
Unnamed: 0_level_1,bin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
age.in.years,"[-inf, 26.0)",age.in.years,190,0.190,110.0,80.0,0.421053,0.528844,0.057921,0.155013,0.109524,0.131429,26.0
age.in.years,"[26.0, 28.0)",age.in.years,101,0.101,74.0,27.0,0.267327,-0.160930,0.002529,0.155013,0.093810,0.131429,28.0
age.in.years,"[28.0, 30.0)",age.in.years,80,0.080,50.0,30.0,0.375000,0.336472,0.009613,0.155013,0.122381,0.131429,30.0
age.in.years,"[30.0, 35.0)",age.in.years,177,0.177,122.0,55.0,0.310734,0.050610,0.000458,0.155013,0.131429,0.131429,35.0
age.in.years,"[35.0, 37.0)",age.in.years,79,0.079,67.0,12.0,0.151899,-0.872488,0.048610,0.155013,0.075714,0.131429,37.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
status.of.existing.checking.account,"[3.0, inf)",status.of.existing.checking.account,394,0.394,348.0,46.0,0.116751,-1.176263,0.404410,0.666012,0.000000,0.367143,inf
status.of.existing.checking.account,missing,status.of.existing.checking.account,0,0.000,0.0,0.0,,0.000000,0.000000,0.666012,0.000000,0.367143,missing
telephone,"[-inf, 1.0)",telephone,596,0.596,409.0,187.0,0.313758,0.064691,0.002526,0.006378,0.039048,0.039048,1.0
telephone,"[1.0, inf)",telephone,404,0.404,291.0,113.0,0.279703,-0.098638,0.003852,0.006378,0.000000,0.039048,inf


使用pd.concat形成所有特征的特征分析报告,与scorecardpy一致

### 样本权重

若数据是经过抽样获取的，一般希望通过样本加权建模以利于还原其违约概率

+ 加入样本权重后各个分箱的坏样本率会产生一定变化
+ 加入样本权重后各个分箱的iv,ks也会产生变化(若只对好坏样本加权且breaks一致则无变化,因为好坏分布未变化)

BDMLtools的报告支持加入样本权重

In [10]:
sample_weight=pd.Series(y.map({0:10,1:1}),index=y.index) #假定数据经过抽样后好样本权重为10,坏样本权重为1
sample_weight_oth=pd.Series(np.random.randint(0,100,y.size)/100,index=y.index) #模拟复杂抽样情形下的样本权重

In [11]:
vtab_ws=bm.varReport(bin_tree.breaks_list,n_jobs=1,sample_weight=sample_weight).fit(X,y)
vtab_ws_oth=bm.varReport(bin_tree.breaks_list,n_jobs=1,sample_weight=sample_weight_oth).fit(X,y)

In [12]:
vtab_ws.var_report_dict['credit.amount'] #加权后(仅类加权)

Unnamed: 0_level_0,variable,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,ks,ks_max,breaks
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"[-inf, 1400.0)",credit.amount,1932,0.264658,1850.0,82.0,0.042443,0.033661,0.000305,0.219101,0.009047619,0.148571,1400.0
"[1400.0, 1800.0)",credit.amount,888,0.121644,870.0,18.0,0.02027,-0.728238,0.046815,0.219101,0.0552381,0.148571,1800.0
"[1800.0, 2000.0)",credit.amount,411,0.056301,390.0,21.0,0.051095,0.228259,0.003261,0.219101,0.04095238,0.148571,2000.0
"[2000.0, 3400.0)",credit.amount,1880,0.257534,1820.0,60.0,0.031915,-0.262364,0.015742,0.219101,0.1009524,0.148571,3400.0
"[3400.0, 4000.0)",credit.amount,674,0.092329,660.0,14.0,0.020772,-0.7033,0.03349,0.219101,0.1485714,0.148571,4000.0
"[4000.0, 5000.0)",credit.amount,337,0.046164,310.0,27.0,0.080119,0.709148,0.032418,0.219101,0.1028571,0.148571,5000.0
"[5000.0, 9200.0)",credit.amount,939,0.12863,890.0,49.0,0.052183,0.250482,0.009065,0.219101,0.06666667,0.148571,9200.0
"[9200.0, inf)",credit.amount,239,0.03274,210.0,29.0,0.121339,1.170071,0.078005,0.219101,1.110223e-16,0.148571,inf
missing,credit.amount,0,0.0,0.0,0.0,,0.0,0.0,0.219101,1.110223e-16,0.148571,missing


In [13]:
vtab_ws_oth.var_report_dict['credit.amount'] #加权后(样本加权)

Unnamed: 0_level_0,variable,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,ks,ks_max,breaks
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"[-inf, 1400.0)",credit.amount,125.78,0.251736,90.75,35.03,0.278502,-0.025025,0.000157,0.209546,0.006266,0.151417,1400.0
"[1400.0, 1800.0)",credit.amount,52.84,0.105754,41.82,11.02,0.208554,-0.406785,0.015883,0.209546,0.04531,0.151417,1800.0
"[1800.0, 2000.0)",credit.amount,34.28,0.068608,23.18,11.1,0.323804,0.190533,0.00259,0.209546,0.031719,0.151417,2000.0
"[2000.0, 3400.0)",credit.amount,123.19,0.246553,95.76,27.43,0.222664,-0.32333,0.023895,0.209546,0.105622,0.151417,3400.0
"[3400.0, 4000.0)",credit.amount,39.14,0.078335,32.69,6.45,0.164793,-0.696111,0.031879,0.209546,0.151417,0.151417,4000.0
"[4000.0, 5000.0)",credit.amount,28.59,0.05722,17.64,10.95,0.383001,0.450049,0.012605,0.209546,0.123408,0.151417,5000.0
"[5000.0, 9200.0)",credit.amount,69.19,0.138477,45.51,23.68,0.342246,0.273577,0.010944,0.209546,0.083404,0.151417,9200.0
"[9200.0, inf)",credit.amount,26.64,0.053317,10.62,16.02,0.601351,1.337977,0.111593,0.209546,0.0,0.151417,inf
missing,credit.amount,0.0,0.0,0.0,0.0,,0.0,0.0,0.209546,0.0,0.151417,missing


In [14]:
vtab.var_report_dict['credit.amount'] #未加权

Unnamed: 0_level_0,variable,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,ks,ks_max,breaks
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"[-inf, 1400.0)",credit.amount,267,0.267,185.0,82.0,0.307116,0.033661,0.000305,0.219101,0.009047619,0.148571,1400.0
"[1400.0, 1800.0)",credit.amount,105,0.105,87.0,18.0,0.171429,-0.728238,0.046815,0.219101,0.0552381,0.148571,1800.0
"[1800.0, 2000.0)",credit.amount,60,0.06,39.0,21.0,0.35,0.228259,0.003261,0.219101,0.04095238,0.148571,2000.0
"[2000.0, 3400.0)",credit.amount,242,0.242,182.0,60.0,0.247934,-0.262364,0.015742,0.219101,0.1009524,0.148571,3400.0
"[3400.0, 4000.0)",credit.amount,80,0.08,66.0,14.0,0.175,-0.7033,0.03349,0.219101,0.1485714,0.148571,4000.0
"[4000.0, 5000.0)",credit.amount,58,0.058,31.0,27.0,0.465517,0.709148,0.032418,0.219101,0.1028571,0.148571,5000.0
"[5000.0, 9200.0)",credit.amount,138,0.138,89.0,49.0,0.355072,0.250482,0.009065,0.219101,0.06666667,0.148571,9200.0
"[9200.0, inf)",credit.amount,50,0.05,21.0,29.0,0.58,1.170071,0.078005,0.219101,1.110223e-16,0.148571,inf
missing,credit.amount,0,0.0,0.0,0.0,,0.0,0.0,0.219101,1.110223e-16,0.148571,missing


### 并行

varReport使用joblib对列的报告生成进行了并行优化,
+ 当基础的数据量非常大列较多时可通过设定n_jobs=任务数进行并行以提高运行速度
+ 一般情况下建议n_jobs=1

In [49]:
#模拟较大大数据量,10w行3000个特征
X_big=pd.DataFrame(np.random.rand(30000,1000),columns=['f'+str(i) for i in range(1000)])
y_big=pd.Series(np.random.randint(0,2,30000),name='target')
breaks_list_big={col:[0.2,0.4,0.6,0.8] for col in X_big.columns}

In [50]:
X_big.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Columns: 1000 entries, f0 to f999
dtypes: float64(1000)
memory usage: 228.9 MB


In [51]:
%%time
res_p=bm.varReport(breaks_list_big,n_jobs=-1).fit(X_big,y_big) 

CPU times: user 953 ms, sys: 156 ms, total: 1.11 s
Wall time: 6.33 s


In [52]:
%%time
res=bm.varReport(breaks_list_big,n_jobs=1).fit(X_big,y_big)

CPU times: user 14.2 s, sys: 142 ms, total: 14.3 s
Wall time: 14.5 s


In [53]:
#结果一致
pd.concat(res.var_report_dict).equals(pd.concat(res_p.var_report_dict))

True

## varReportSinge

### 基本用法

对单一特征产生报告可用bm.varReportSinge,其可以灵活得调整分箱

In [20]:
bm.varReportSinge().report(X['age.in.years'],y,[20,30,40,50])

Unnamed: 0_level_0,variable,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,ks,ks_max,breaks
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"[-inf, 20.0)",age.in.years,2,0.002,1.0,1.0,0.5,0.847298,0.001614,0.064422,0.001905,0.122381,20.0
"[20.0, 30.0)",age.in.years,369,0.369,233.0,136.0,0.368564,0.308914,0.037217,0.064422,0.122381,0.122381,30.0
"[30.0, 40.0)",age.in.years,330,0.33,245.0,85.0,0.257576,-0.211309,0.014087,0.064422,0.055714,0.122381,40.0
"[40.0, 50.0)",age.in.years,174,0.174,130.0,44.0,0.252874,-0.236047,0.009217,0.064422,0.016667,0.122381,50.0
"[50.0, inf)",age.in.years,125,0.125,91.0,34.0,0.272,-0.137201,0.002287,0.064422,0.0,0.122381,inf
missing,age.in.years,0,0.0,0.0,0.0,,0.0,0.0,0.064422,0.0,0.122381,missing


In [21]:
bm.varReportSinge().report(X['age.in.years'],y,[25,35,45,55])

Unnamed: 0_level_0,variable,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,ks,ks_max,breaks
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"[-inf, 25.0)",age.in.years,149,0.149,88.0,61.0,0.409396,0.480835,0.037322,0.089385,0.077619,0.131429,25.0
"[25.0, 35.0)",age.in.years,399,0.399,268.0,131.0,0.328321,0.131508,0.007076,0.089385,0.131429,0.131429,35.0
"[35.0, 45.0)",age.in.years,251,0.251,193.0,58.0,0.231076,-0.354949,0.029241,0.089385,0.049048,0.131429,45.0
"[45.0, 55.0)",age.in.years,122,0.122,94.0,28.0,0.229508,-0.363792,0.014898,0.089385,0.008095,0.131429,55.0
"[55.0, inf)",age.in.years,79,0.079,57.0,22.0,0.278481,-0.104711,0.000848,0.089385,0.0,0.131429,inf
missing,age.in.years,0,0.0,0.0,0.0,,0.0,0.0,0.089385,0.0,0.131429,missing


当然也支持样本权重

In [22]:
sample_weight=pd.Series(y.map({0:10,1:1}),index=y.index)
bm.varReportSinge().report(X['age.in.years'],y,[25,35,45,55],sample_weight)

Unnamed: 0_level_0,variable,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,ks,ks_max,breaks
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"[-inf, 25.0)",age.in.years,941,0.128904,880.0,61.0,0.064825,0.480835,0.037322,0.089385,0.077619,0.131429,25.0
"[25.0, 35.0)",age.in.years,2811,0.385068,2680.0,131.0,0.046603,0.131508,0.007076,0.089385,0.131429,0.131429,35.0
"[35.0, 45.0)",age.in.years,1988,0.272329,1930.0,58.0,0.029175,-0.354949,0.029241,0.089385,0.049048,0.131429,45.0
"[45.0, 55.0)",age.in.years,968,0.132603,940.0,28.0,0.028926,-0.363792,0.014898,0.089385,0.008095,0.131429,55.0
"[55.0, inf)",age.in.years,592,0.081096,570.0,22.0,0.037162,-0.104711,0.000848,0.089385,0.0,0.131429,inf
missing,age.in.years,0,0.0,0.0,0.0,,0.0,0.0,0.089385,0.0,0.131429,missing


## varGroupsReport

在实践中,单一的特征分析报告往往仍旧无法满足实际需要,往往希望通过不同的月份\产品\客群查看各个策略的表现,那么就需要分组产生分析报告

varGroupsReport就是用来满足这种需要的

### 基本用法

In [23]:
#模拟月份(9,10,11,12月),模拟客群(3个)

X_all=X.join(y).assign(
    month=np.random.randint(9,13,y.size),
    client_group=pd.Series(np.random.randint(0,3,y.size),index=y.index).map({0:'g1',1:'g2',2:'g3'})
)

In [24]:
vtab_g=bm.varGroupsReport(bin_tree.breaks_list,columns=['month','client_group'],target=y.name,row_limit=0,
                          n_jobs=1).fit(X_all)

vtab_g中共五张报表:
+ report_all:各个组的特征报告
+ report_brief:各个组的简化版报告,只保留count,badprob,woe,total_iv,ks_max
+ report_count:各个组的简化版报告,只保留count
+ report_badprob:各个组的简化版报告,只保留badprob
+ report_iv:各个组的简化版报告,只保留total_iv
+ report_ks:各个组的简化版报告,只保留ks_max

以report_brief为例

In [25]:
vtab_g.report_dict['report_brief'][['variable','bin','10']] #只看10月份的报告

Unnamed: 0_level_0,variable,bin,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,g1,g1,g1,g1,g1,g2,g2,g2,g2,g2,g3,g3,g3,g3,g3
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,count,badprob,woe,total_iv,ks_max,count,badprob,woe,total_iv,ks_max,count,badprob,woe,total_iv,ks_max
0,age.in.years,"[-inf, 26.0)",20,0.500000,0.426084,1.393963,0.155612,12,0.166667,-0.664976,0.895660,0.177249,15,0.266667,0.283126,2.071199,0.193151
1,age.in.years,"[26.0, 28.0)",7,0.428571,0.138402,1.393963,0.155612,8,0.250000,-0.154151,0.895660,0.177249,7,0.428571,1.007045,2.071199,0.193151
2,age.in.years,"[28.0, 30.0)",3,0.333333,-0.267063,1.393963,0.155612,7,0.714286,1.860752,0.895660,0.177249,10,0.200000,-0.091567,2.071199,0.193151
3,age.in.years,"[30.0, 35.0)",13,0.461538,0.271934,1.393963,0.155612,11,0.363636,0.384846,0.895660,0.177249,19,0.263158,0.265108,2.071199,0.193151
4,age.in.years,"[35.0, 37.0)",3,0.000000,-20.232643,1.393963,0.155612,6,0.333333,0.251314,0.895660,0.177249,4,0.000000,-20.121686,2.071199,0.193151
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,status.of.existing.checking.account,"[3.0, inf)",29,0.137931,-1.406497,0.772264,0.385204,36,0.138889,-0.880088,0.721918,0.380952,46,0.130435,-0.602393,0.275140,0.247945
96,status.of.existing.checking.account,missing,0,,0.000000,0.772264,0.385204,0,,0.000000,0.721918,0.380952,0,,0.000000,0.275140,0.247945
97,telephone,"[-inf, 1.0)",48,0.354167,-0.174689,0.042432,0.101403,51,0.313725,0.161702,0.062821,0.113757,55,0.218182,0.018434,0.000498,0.010959
98,telephone,"[1.0, inf)",33,0.454545,0.243763,0.042432,0.101403,24,0.208333,-0.390539,0.062821,0.113757,38,0.210526,-0.027029,0.000498,0.010959


In [26]:
vtab_g.report_dict['report_brief'] #全部报告

Unnamed: 0_level_0,variable,bin,10,10,10,10,10,10,10,10,...,9,9,9,9,9,9,9,9,9,9
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,g1,g1,g1,g1,g1,g2,g2,g2,...,g2,g2,g2,g2,g2,g3,g3,g3,g3,g3
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,count,badprob,woe,total_iv,ks_max,count,badprob,woe,...,count,badprob,woe,total_iv,ks_max,count,badprob,woe,total_iv,ks_max
0,age.in.years,"[-inf, 26.0)",20,0.500000,0.426084,1.393963,0.155612,12,0.166667,-0.664976,...,12,0.500000,0.773190,4.366599,0.304487,12,0.333333,-0.175204,0.094282,0.094985
1,age.in.years,"[26.0, 28.0)",7,0.428571,0.138402,1.393963,0.155612,8,0.250000,-0.154151,...,8,0.500000,0.773190,4.366599,0.304487,9,0.333333,-0.175204,0.094282,0.094985
2,age.in.years,"[28.0, 30.0)",3,0.333333,-0.267063,1.393963,0.155612,7,0.714286,1.860752,...,6,0.500000,0.773190,4.366599,0.304487,7,0.428571,0.230261,0.094282,0.094985
3,age.in.years,"[30.0, 35.0)",13,0.461538,0.271934,1.393963,0.155612,11,0.363636,0.384846,...,12,0.333333,0.080043,4.366599,0.304487,16,0.312500,-0.270514,0.094282,0.094985
4,age.in.years,"[35.0, 37.0)",3,0.000000,-20.232643,1.393963,0.155612,6,0.333333,0.251314,...,7,0.000000,-21.020517,4.366599,0.304487,6,0.333333,-0.175204,0.094282,0.094985
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,status.of.existing.checking.account,"[3.0, inf)",29,0.137931,-1.406497,0.772264,0.385204,36,0.138889,-0.880088,...,35,0.057143,-2.030170,1.659732,0.551282,26,0.153846,-1.186805,0.602084,0.325228
96,status.of.existing.checking.account,missing,0,,0.000000,0.772264,0.385204,0,,0.000000,...,0,,0.000000,1.659732,0.551282,0,,0.000000,0.602084,0.325228
97,telephone,"[-inf, 1.0)",48,0.354167,-0.174689,0.042432,0.101403,51,0.313725,0.161702,...,40,0.400000,0.367725,0.173805,0.205128,49,0.346939,-0.114579,0.023761,0.073708
98,telephone,"[1.0, inf)",33,0.454545,0.243763,0.042432,0.101403,24,0.208333,-0.390539,...,36,0.222222,-0.479573,0.173805,0.205128,26,0.423077,0.207788,0.023761,0.073708


In [27]:
vtab_g.report_dict['report_ks']

Unnamed: 0_level_0,variable,10,10,10,11,11,11,12,12,12,9,9,9
Unnamed: 0_level_1,Unnamed: 1_level_1,g1,g2,g3,g1,g2,g3,g1,g2,g3,g1,g2,g3
Unnamed: 0_level_2,Unnamed: 1_level_2,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max
0,age.in.years,0.155612,0.177249,0.193151,0.31954,0.265276,0.236425,0.261538,0.233333,0.086335,0.202712,0.304487,0.094985
1,credit.amount,0.160077,0.306878,0.385616,0.169655,0.090909,0.205882,0.150769,0.122222,0.260064,0.27661,0.076923,0.158815
2,credit.history,0.292092,0.333333,0.145205,0.338851,0.137109,0.149321,0.169231,0.228889,0.119703,0.309831,0.214744,0.303951
3,duration.in.month,0.146046,0.383598,0.44863,0.329655,0.145306,0.38009,0.181538,0.185185,0.272775,0.270508,0.240385,0.218085
4,foreign.worker,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,housing,0.128827,0.354497,0.117808,0.022529,0.087183,0.179864,0.107692,0.15037,0.150953,0.126102,0.089744,0.143617
6,installment.rate.in.percentage.of.disposable.i...,0.094388,0.150794,0.113014,0.272184,0.139344,0.223982,0.110769,0.288148,0.090042,0.182373,0.426282,0.06003
7,job,0.085459,0.092593,0.135616,0.073103,0.154993,0.236425,0.04,0.008889,0.131356,0.145085,0.099359,0.048632
8,number.of.existing.credits.at.this.bank,0.007653,0.097884,0.024658,0.150805,0.035768,0.168552,0.003077,0.105926,0.00053,0.092881,0.044872,0.211246
9,number.of.people.being.liable.to.provide.maint...,0.033801,0.060847,0.049315,0.163218,0.112519,0.03733,0.006154,0.011852,0.083157,0.038644,0.025641,0.015198


### 排序组特征水平

上述报告中,月份排序有一定问题,那么若希望以指定顺序排序报告则可通过sort_columns参数设定 

In [28]:
sort_columns={
    'month':['9','10','11','12'],
    'client_group':['g3','g2','g1']
} 

In [29]:
vtab_g=bm.varGroupsReport(bin_tree.breaks_list,columns=['month','client_group'],target=y.name,row_limit=0,
                          sort_columns=sort_columns,
                          n_jobs=1).fit(X_all)

In [30]:
vtab_g.report_dict['report_ks'] #排序后的数据

Unnamed: 0_level_0,variable,9,9,9,10,10,10,11,11,11,12,12,12
Unnamed: 0_level_1,Unnamed: 1_level_1,g3,g2,g1,g3,g2,g1,g3,g2,g1,g3,g2,g1
Unnamed: 0_level_2,Unnamed: 1_level_2,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max
0,age.in.years,0.094985,0.304487,0.202712,0.193151,0.177249,0.155612,0.236425,0.265276,0.31954,0.086335,0.233333,0.261538
1,credit.amount,0.158815,0.076923,0.27661,0.385616,0.306878,0.160077,0.205882,0.090909,0.169655,0.260064,0.122222,0.150769
2,credit.history,0.303951,0.214744,0.309831,0.145205,0.333333,0.292092,0.149321,0.137109,0.338851,0.119703,0.228889,0.169231
3,duration.in.month,0.218085,0.240385,0.270508,0.44863,0.383598,0.146046,0.38009,0.145306,0.329655,0.272775,0.185185,0.181538
4,foreign.worker,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,housing,0.143617,0.089744,0.126102,0.117808,0.354497,0.128827,0.179864,0.087183,0.022529,0.150953,0.15037,0.107692
6,installment.rate.in.percentage.of.disposable.i...,0.06003,0.426282,0.182373,0.113014,0.150794,0.094388,0.223982,0.139344,0.272184,0.090042,0.288148,0.110769
7,job,0.048632,0.099359,0.145085,0.135616,0.092593,0.085459,0.236425,0.154993,0.073103,0.131356,0.008889,0.04
8,number.of.existing.credits.at.this.bank,0.211246,0.044872,0.092881,0.024658,0.097884,0.007653,0.168552,0.035768,0.150805,0.00053,0.105926,0.003077
9,number.of.people.being.liable.to.provide.maint...,0.015198,0.025641,0.038644,0.049315,0.060847,0.033801,0.03733,0.112519,0.163218,0.083157,0.011852,0.006154


+ 若希望client_group在第一行,month在第二行,那么只需要把columns=['month','client_group']换为columns=['client_group','month']

### 使用row_limit

有时组分得过细会导致组内样本量不足,指标可能不具备代表性,因此可通过row_limit进行限定,若组数量小于row_limit时,该组就不会统计其任何指标

In [31]:
vtab_g=bm.varGroupsReport(bin_tree.breaks_list,columns=['month','client_group'],target=y.name,
                          row_limit=80,
                          sort_columns=sort_columns,
                          n_jobs=1).fit(X_all)

group ('10', 'g2') has rows less than 80,output will return None
group ('11', 'g3') has rows less than 80,output will return None
group ('12', 'g2') has rows less than 80,output will return None
group ('9', 'g2') has rows less than 80,output will return None
group ('9', 'g3') has rows less than 80,output will return None


可以看到警告信息中相应剔除的组的信息

In [32]:
vtab_g.report_dict['report_ks']

Unnamed: 0_level_0,variable,9,10,10,11,11,12,12
Unnamed: 0_level_1,Unnamed: 1_level_1,g1,g3,g1,g2,g1,g3,g1
Unnamed: 0_level_2,Unnamed: 1_level_2,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max
0,age.in.years,0.202712,0.193151,0.155612,0.265276,0.31954,0.086335,0.261538
1,credit.amount,0.27661,0.385616,0.160077,0.090909,0.169655,0.260064,0.150769
2,credit.history,0.309831,0.145205,0.292092,0.137109,0.338851,0.119703,0.169231
3,duration.in.month,0.270508,0.44863,0.146046,0.145306,0.329655,0.272775,0.181538
4,foreign.worker,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,housing,0.126102,0.117808,0.128827,0.087183,0.022529,0.150953,0.107692
6,installment.rate.in.percentage.of.disposable.i...,0.182373,0.113014,0.094388,0.139344,0.272184,0.090042,0.110769
7,job,0.145085,0.135616,0.085459,0.154993,0.073103,0.131356,0.04
8,number.of.existing.credits.at.this.bank,0.092881,0.024658,0.007653,0.035768,0.150805,0.00053,0.003077
9,number.of.people.being.liable.to.provide.maint...,0.038644,0.049315,0.033801,0.112519,0.163218,0.083157,0.006154


### 产生psi报告

varGroupsReport可以产生psi报告以比较各个组中各个变量的分布变动情况

In [33]:
vtabs_g=bm.varGroupsReport(bin_tree.breaks_list,columns=['client_group'],target=y.name,
                    row_limit=0,output_psi=True,n_jobs=1).fit(X_all)

In [34]:
vtabs_g.report_dict['report_psi'].head(20)

Unnamed: 0_level_0,variable,bin,g1,g2,g3
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count_distr,count_distr,count_distr
0,age.in.years,"[-inf, 26.0)",0.225627,0.182109,0.158537
1,age.in.years,"[26.0, 28.0)",0.08078,0.105431,0.118902
2,age.in.years,"[28.0, 30.0)",0.069638,0.083067,0.088415
3,age.in.years,"[30.0, 35.0)",0.181058,0.159744,0.189024
4,age.in.years,"[35.0, 37.0)",0.066852,0.092652,0.079268
5,age.in.years,"[37.0, 48.0)",0.208914,0.220447,0.237805
6,age.in.years,"[48.0, 53.0)",0.083565,0.051118,0.027439
7,age.in.years,"[53.0, inf)",0.083565,0.105431,0.10061
8,age.in.years,missing,0.0,0.0,0.0
9,age.in.years,psi,0.028666,0.005765,0.030718


每一个变量的分组最后一行都会有psi用于显示各个组与基准分布比较而产生的psi

默认情况下psi的基准分布为全量数据分布,这里也可以指定某一组的分布为基准分布

+ 参数psi_base用于指定分布基准
+ 参数psi_base会在模块内部通过X.query(psi_base)传递,其语法为pd.DataFrame.query()的语法

In [35]:
#选择client_group=="g1"为psi基准分布
vtabs_g=bm.varGroupsReport(bin_tree.breaks_list,columns=['client_group'],target=y.name,
                    row_limit=0,output_psi=True,psi_base='client_group=="g1"',n_jobs=1).fit(X_all)

In [36]:
vtabs_g.report_dict['report_psi'].head(20) #可以看到g1的psi为0

Unnamed: 0_level_0,variable,bin,g1,g2,g3
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count_distr,count_distr,count_distr
0,age.in.years,"[-inf, 26.0)",0.225627,0.182109,0.158537
1,age.in.years,"[26.0, 28.0)",0.08078,0.105431,0.118902
2,age.in.years,"[28.0, 30.0)",0.069638,0.083067,0.088415
3,age.in.years,"[30.0, 35.0)",0.181058,0.159744,0.189024
4,age.in.years,"[35.0, 37.0)",0.066852,0.092652,0.079268
5,age.in.years,"[37.0, 48.0)",0.208914,0.220447,0.237805
6,age.in.years,"[48.0, 53.0)",0.083565,0.051118,0.027439
7,age.in.years,"[53.0, inf)",0.083565,0.105431,0.10061
8,age.in.years,missing,0.0,0.0,0.0
9,age.in.years,psi,0.0,0.050997,0.114766


### 并行

varGroupsReport使用joblib对各组的报告生成进行了并行优化,

+ 当基础的数据量非常大列较多组的数量较多时可通过设定n_jobs=任务数进行并行以提高运行速度
+ 一般情况下建议n_jobs=1

In [37]:
#模拟较大数据量,3w行1000个特征,一个组特征
X_big=pd.DataFrame(np.random.rand(30000,1000),columns=['f'+str(i) for i in range(1000)])
y_big=pd.Series(np.random.randint(0,2,30000),name='target')

breaks_list_big={col:[0.2,0.4,0.6,0.8] for col in X_big.columns}

X_big_all=X_big.join(y_big).assign(
    client_group=pd.Series(np.random.randint(0,5,y_big.size),index=y_big.index).map({0:'g1',
                                                                                     1:'g2',
                                                                                     2:'g3',
                                                                                     3:'g4',
                                                                                     4:'g5'
                                                                                    })
)

In [38]:
X_big_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Columns: 1002 entries, f0 to client_group
dtypes: float64(1000), int64(1), object(1)
memory usage: 229.3+ MB


In [39]:
%%time
vtabs_g_p=bm.varGroupsReport(breaks_list_big,columns=['client_group'],target=y_big.name,
                    row_limit=0,output_psi=True,psi_base='client_group=="g1"',n_jobs=-1).fit(X_big_all)

CPU times: user 1min 9s, sys: 1min 5s, total: 2min 15s
Wall time: 2min 39s


In [40]:
%%time
vtabs_g=bm.varGroupsReport(breaks_list_big,columns=['client_group'],target=y_big.name,
                    row_limit=0,output_psi=True,psi_base='client_group=="g1"',n_jobs=1).fit(X_big_all)

CPU times: user 2min 24s, sys: 1min 8s, total: 3min 32s
Wall time: 3min 34s


### 样本权重

varGroupsReport提供了参数sample_weight,以在报告中加入样本权重,使用方法与varReport一致

In [41]:
sample_weight=pd.Series(y.map({0:10,1:1}),index=y.index) #假定数据经过抽样后好样本权重为10,坏样本权重为1

In [42]:
vtab_g_ws=bm.varGroupsReport(bin_tree.breaks_list,columns=['split'],target=y.name,
                       sample_weight=sample_weight,
                       row_limit=0,n_jobs=1).fit(X_all.assign(split=1))

In [43]:
vtab_g_ws.report_dict['report_all'].head(10)

Unnamed: 0_level_0,variable,bin,1,1,1,1,1,1,1,1,1,1,1
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,ks,ks_max,breaks
0,age.in.years,"[-inf, 26.0)",1180,0.161644,1100.0,80.0,0.067797,0.528844,0.057921,0.155013,0.109524,0.131429,26.0
1,age.in.years,"[26.0, 28.0)",767,0.105068,740.0,27.0,0.035202,-0.16093,0.002529,0.155013,0.09381,0.131429,28.0
2,age.in.years,"[28.0, 30.0)",530,0.072603,500.0,30.0,0.056604,0.336472,0.009613,0.155013,0.122381,0.131429,30.0
3,age.in.years,"[30.0, 35.0)",1275,0.174658,1220.0,55.0,0.043137,0.05061,0.000458,0.155013,0.131429,0.131429,35.0
4,age.in.years,"[35.0, 37.0)",682,0.093425,670.0,12.0,0.017595,-0.872488,0.04861,0.155013,0.075714,0.131429,37.0
5,age.in.years,"[37.0, 48.0)",1698,0.232603,1640.0,58.0,0.034158,-0.192126,0.007868,0.155013,0.034762,0.131429,48.0
6,age.in.years,"[48.0, 53.0)",469,0.064247,460.0,9.0,0.01919,-0.784119,0.028004,0.155013,0.000952,0.131429,53.0
7,age.in.years,"[53.0, inf)",699,0.095753,670.0,29.0,0.041488,0.009901,9e-06,0.155013,0.0,0.131429,inf
8,age.in.years,missing,0,0.0,0.0,0.0,,0.0,0.0,0.155013,0.0,0.131429,missing
9,credit.amount,"[-inf, 1400.0)",1932,0.264658,1850.0,82.0,0.042443,0.033661,0.000305,0.219101,0.009048,0.148571,1400.0


### 导出为excel

varGroupsReport支持对组报告导出为excel,名称为var_report.xlsx

+ 参数out_path为报告输出路径,若无此路径则模块会创建该路径
+ 参数tab_suffix代表报告名称后缀,例如tab_suffix=‘_group’时,输出报告名称为var_report_group.xlsx
+ report_dict中所有的报告都会被导出,并写为不同的sheet

In [44]:
vtabs_g=bm.varGroupsReport(bin_tree.breaks_list,columns=['client_group'],
                           target=y.name,row_limit=0,
                           output_psi=True,n_jobs=1,
                           out_path='report/',
                           tab_suffix='_client'
                          ).fit(X_all)

to_excel done
