# 使用BDMLtools做策略分析报告 <font size=3 >by 曾珂</font>

In [1]:
import BDMLtools as bm
import pandas as pd
import numpy as np

In [2]:
bm.__version__

'0.1.7'

+ load germancredit data

In [3]:
import scorecardpy as sc
dt=sc.germancredit().copy()
dt['creditability']=dt['creditability'].map({'good':0,'bad':1})

+ 将germancredit的数据类型进行指定
    + num列-->float64
    + str列-->object
    + date列-->datetime

In [4]:
dtypes_dict={
    'num':['age.in.years',
         'credit.amount',
         'creditability',
         'duration.in.month',
         'installment.rate.in.percentage.of.disposable.income',
         'number.of.existing.credits.at.this.bank',
         'number.of.people.being.liable.to.provide.maintenance.for',
         'present.residence.since'],
    'str':['housing','telephone','foreign.worker','purpose','job','personal.status.and.sex','property',
           'credit.history','savings.account.and.bonds','present.employment.since',
           'status.of.existing.checking.account',
           'other.installment.plans','other.debtors.or.guarantors'],
    'date':[]
}

In [5]:
da=bm.dtypeAllocator(dtypes_dict=dtypes_dict).fit(dt)
dt=da.transform(dt)
X=dt.drop('creditability',axis=1)
y=dt['creditability']

## BDMLtools中的报告

在实践中进行策略分析时,有时希望快速生成策略特征的分析报告。BDMLtools提供了三种特征分析报告工具:
    
+ varReport:特征分析报告
+ varReportSinge:单特征分析报告
+ varGroupsReport:组特征分析报告

## varReport

### 基本用法

一般特征分析需先进行分箱再生成报告,这里使用binSelector进行最优分箱

In [6]:
bin_tree=bm.binSelector(method='tree',bin_num_limit=8,n_jobs=1,iv_limit=0).fit(X,y)

+ bin_tree中的属性breaks_list为分箱字典,包含了所有特征的分箱结果,格式与scorecardpy一致
+ 将breaks_list作为varReport的入参以按照指定的breaks_list分箱产生特征分析报告

In [7]:
vtab=bm.varReport(bin_tree.breaks_list,n_jobs=1).fit(X,y)

+ vtab的var_report_dict保存了所有特征的分箱结果,结构与scorecardpy一致

In [8]:
vtab.var_report_dict.keys()

dict_keys(['age.in.years', 'credit.amount', 'credit.history', 'duration.in.month', 'foreign.worker', 'housing', 'installment.rate.in.percentage.of.disposable.income', 'job', 'number.of.existing.credits.at.this.bank', 'number.of.people.being.liable.to.provide.maintenance.for', 'other.debtors.or.guarantors', 'other.installment.plans', 'personal.status.and.sex', 'present.employment.since', 'present.residence.since', 'property', 'purpose', 'savings.account.and.bonds', 'status.of.existing.checking.account', 'telephone'])

In [9]:
vtab.var_report_dict['credit.amount']

Unnamed: 0_level_0,variable,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,ks,ks_max,breaks
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"[-inf, 1400.0)",credit.amount,267,0.267,185.0,82.0,0.307116,0.033661,0.000305,0.219101,0.009047619,0.148571,1400.0
"[1400.0, 1800.0)",credit.amount,105,0.105,87.0,18.0,0.171429,-0.728238,0.046815,0.219101,0.0552381,0.148571,1800.0
"[1800.0, 2000.0)",credit.amount,60,0.06,39.0,21.0,0.35,0.228259,0.003261,0.219101,0.04095238,0.148571,2000.0
"[2000.0, 3400.0)",credit.amount,242,0.242,182.0,60.0,0.247934,-0.262364,0.015742,0.219101,0.1009524,0.148571,3400.0
"[3400.0, 4000.0)",credit.amount,80,0.08,66.0,14.0,0.175,-0.7033,0.03349,0.219101,0.1485714,0.148571,4000.0
"[4000.0, 5000.0)",credit.amount,58,0.058,31.0,27.0,0.465517,0.709148,0.032418,0.219101,0.1028571,0.148571,5000.0
"[5000.0, 9200.0)",credit.amount,138,0.138,89.0,49.0,0.355072,0.250482,0.009065,0.219101,0.06666667,0.148571,9200.0
"[9200.0, inf)",credit.amount,50,0.05,21.0,29.0,0.58,1.170071,0.078005,0.219101,1.110223e-16,0.148571,inf
special,credit.amount,0,0.0,0.0,0.0,,0.0,0.0,0.219101,1.110223e-16,0.148571,special
missing,credit.amount,0,0.0,0.0,0.0,,0.0,0.0,0.219101,1.110223e-16,0.148571,missing


每张报表的bin索引中:
+ missing行用于标示缺失值,数据中的np.nan将被统计进该行;若数据没有缺失值,则该行各统计量为默认值(计数为0)
+ special行用于标示除缺失值外的特殊编码,可通过special_values参数进行指定;若数据没有特殊值,则该行各统计量为默认值(计数为0),详情请见文档

In [10]:
pd.concat(vtab.var_report_dict)

Unnamed: 0_level_0,Unnamed: 1_level_0,variable,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,ks,ks_max,breaks
Unnamed: 0_level_1,bin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
age.in.years,"[-inf, 26.0)",age.in.years,190,0.190,110.0,80.0,0.421053,0.528844,0.057921,0.155013,0.109524,0.131429,26.0
age.in.years,"[26.0, 28.0)",age.in.years,101,0.101,74.0,27.0,0.267327,-0.160930,0.002529,0.155013,0.093810,0.131429,28.0
age.in.years,"[28.0, 30.0)",age.in.years,80,0.080,50.0,30.0,0.375000,0.336472,0.009613,0.155013,0.122381,0.131429,30.0
age.in.years,"[30.0, 35.0)",age.in.years,177,0.177,122.0,55.0,0.310734,0.050610,0.000458,0.155013,0.131429,0.131429,35.0
age.in.years,"[35.0, 37.0)",age.in.years,79,0.079,67.0,12.0,0.151899,-0.872488,0.048610,0.155013,0.075714,0.131429,37.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
status.of.existing.checking.account,missing,status.of.existing.checking.account,0,0.000,0.0,0.0,,0.000000,0.000000,0.666012,0.000000,0.367143,missing
telephone,"yes, registered under the customers name",telephone,404,0.404,291.0,113.0,0.279703,-0.098638,0.003852,0.006378,0.039048,0.039048,"yes, registered under the customers name"
telephone,none,telephone,596,0.596,409.0,187.0,0.313758,0.064691,0.002526,0.006378,0.000000,0.039048,none
telephone,special,telephone,0,0.000,0.0,0.0,,0.000000,0.000000,0.006378,0.000000,0.039048,special


使用pd.concat形成所有特征的特征分析报告,与scorecardpy一致

### 样本权重

若数据是经过抽样获取的，一般希望通过样本加权建模以利于还原其违约概率

+ 加入样本权重后各个分箱的坏样本率会产生一定变化
+ 加入样本权重后各个分箱的woe,iv,ks也会产生变化(若仅按好坏类别加权,即同一类别内权重相同,且breaks一致,则woe,iv,ks无变化,因为好样本与坏样本各自的分布未变化)

BDMLtools的报告支持加入样本权重

In [11]:
sample_weight=pd.Series(y.map({0:10,1:1}),index=y.index) #假定数据经过抽样后好样本权重为10,坏样本权重为1
sample_weight_oth=pd.Series(np.random.randint(0,100,y.size)/100,index=y.index) #模拟复杂抽样情形下的样本权重

In [12]:
vtab_ws=bm.varReport(bin_tree.breaks_list,n_jobs=1,sample_weight=sample_weight).fit(X,y)
vtab_ws_oth=bm.varReport(bin_tree.breaks_list,n_jobs=1,sample_weight=sample_weight_oth).fit(X,y)

In [13]:
vtab_ws.var_report_dict['credit.amount'] #加权后(仅类加权)

Unnamed: 0_level_0,variable,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,ks,ks_max,breaks
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"[-inf, 1400.0)",credit.amount,1932,0.264658,1850.0,82.0,0.042443,0.033661,0.000305,0.219101,0.009047619,0.148571,1400.0
"[1400.0, 1800.0)",credit.amount,888,0.121644,870.0,18.0,0.02027,-0.728238,0.046815,0.219101,0.0552381,0.148571,1800.0
"[1800.0, 2000.0)",credit.amount,411,0.056301,390.0,21.0,0.051095,0.228259,0.003261,0.219101,0.04095238,0.148571,2000.0
"[2000.0, 3400.0)",credit.amount,1880,0.257534,1820.0,60.0,0.031915,-0.262364,0.015742,0.219101,0.1009524,0.148571,3400.0
"[3400.0, 4000.0)",credit.amount,674,0.092329,660.0,14.0,0.020772,-0.7033,0.03349,0.219101,0.1485714,0.148571,4000.0
"[4000.0, 5000.0)",credit.amount,337,0.046164,310.0,27.0,0.080119,0.709148,0.032418,0.219101,0.1028571,0.148571,5000.0
"[5000.0, 9200.0)",credit.amount,939,0.12863,890.0,49.0,0.052183,0.250482,0.009065,0.219101,0.06666667,0.148571,9200.0
"[9200.0, inf)",credit.amount,239,0.03274,210.0,29.0,0.121339,1.170071,0.078005,0.219101,1.110223e-16,0.148571,inf
special,credit.amount,0,0.0,0.0,0.0,,0.0,0.0,0.219101,1.110223e-16,0.148571,special
missing,credit.amount,0,0.0,0.0,0.0,,0.0,0.0,0.219101,1.110223e-16,0.148571,missing


In [14]:
vtab_ws_oth.var_report_dict['credit.amount'] #加权后(样本加权)

Unnamed: 0_level_0,variable,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,ks,ks_max,breaks
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"[-inf, 1400.0)",credit.amount,139.9,0.275442,96.61,43.29,0.309435,0.014833,6.077495e-05,0.212791,0.004097,0.152123,1400.0
"[1400.0, 1800.0)",credit.amount,53.53,0.105393,44.43,9.1,0.169998,-0.768047,0.05191835,0.212791,0.063501,0.152123,1800.0
"[1800.0, 2000.0)",credit.amount,27.37,0.053887,19.0,8.37,0.305809,-0.002192,2.587187e-07,0.212791,0.063619,0.152123,2000.0
"[2000.0, 3400.0)",credit.amount,117.82,0.23197,88.12,29.7,0.252079,-0.269959,0.01597324,0.212791,0.122788,0.152123,3400.0
"[3400.0, 4000.0)",credit.amount,38.84,0.07647,30.11,8.73,0.224768,-0.420498,0.01233531,0.212791,0.152123,0.152123,4000.0
"[4000.0, 5000.0)",credit.amount,32.46,0.063909,16.93,15.53,0.478435,0.73128,0.03786863,0.212791,0.100339,0.152123,5000.0
"[5000.0, 9200.0)",credit.amount,73.61,0.144927,47.7,25.91,0.35199,0.207291,0.00646391,0.212791,0.069156,0.152123,9200.0
"[9200.0, inf)",credit.amount,24.38,0.048001,9.45,14.93,0.612387,1.274951,0.08817042,0.212791,0.0,0.152123,inf
special,credit.amount,0.0,0.0,0.0,0.0,,0.0,0.0,0.212791,0.0,0.152123,special
missing,credit.amount,0.0,0.0,0.0,0.0,,0.0,0.0,0.212791,0.0,0.152123,missing


In [15]:
vtab.var_report_dict['credit.amount'] #未加权

Unnamed: 0_level_0,variable,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,ks,ks_max,breaks
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"[-inf, 1400.0)",credit.amount,267,0.267,185.0,82.0,0.307116,0.033661,0.000305,0.219101,0.009047619,0.148571,1400.0
"[1400.0, 1800.0)",credit.amount,105,0.105,87.0,18.0,0.171429,-0.728238,0.046815,0.219101,0.0552381,0.148571,1800.0
"[1800.0, 2000.0)",credit.amount,60,0.06,39.0,21.0,0.35,0.228259,0.003261,0.219101,0.04095238,0.148571,2000.0
"[2000.0, 3400.0)",credit.amount,242,0.242,182.0,60.0,0.247934,-0.262364,0.015742,0.219101,0.1009524,0.148571,3400.0
"[3400.0, 4000.0)",credit.amount,80,0.08,66.0,14.0,0.175,-0.7033,0.03349,0.219101,0.1485714,0.148571,4000.0
"[4000.0, 5000.0)",credit.amount,58,0.058,31.0,27.0,0.465517,0.709148,0.032418,0.219101,0.1028571,0.148571,5000.0
"[5000.0, 9200.0)",credit.amount,138,0.138,89.0,49.0,0.355072,0.250482,0.009065,0.219101,0.06666667,0.148571,9200.0
"[9200.0, inf)",credit.amount,50,0.05,21.0,29.0,0.58,1.170071,0.078005,0.219101,1.110223e-16,0.148571,inf
special,credit.amount,0,0.0,0.0,0.0,,0.0,0.0,0.219101,1.110223e-16,0.148571,special
missing,credit.amount,0,0.0,0.0,0.0,,0.0,0.0,0.219101,1.110223e-16,0.148571,missing


### 并行

varReport使用joblib对列的报告生成进行了并行优化,
+ 当数据量非常大且列较多时,可通过设定n_jobs=任务数进行并行以提高运行速度
+ 一般情况下建议n_jobs=1

In [16]:
#模拟较大数据量,3w行1000个特征
X_big=pd.DataFrame(np.random.rand(30000,1000),columns=['f'+str(i) for i in range(1000)])
y_big=pd.Series(np.random.randint(0,2,30000),name='target')
breaks_list_big={col:[0.2,0.4,0.6,0.8] for col in X_big.columns}

In [17]:
X_big.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Columns: 1000 entries, f0 to f999
dtypes: float64(1000)
memory usage: 228.9 MB


In [18]:
%%time
res_p=bm.varReport(breaks_list_big,n_jobs=-1).fit(X_big,y_big) 

CPU times: user 1.01 s, sys: 314 ms, total: 1.32 s
Wall time: 12 s


In [19]:
%%time
res=bm.varReport(breaks_list_big,n_jobs=1).fit(X_big,y_big)

CPU times: user 13.7 s, sys: 123 ms, total: 13.9 s
Wall time: 14 s


In [20]:
#结果一致
pd.concat(res.var_report_dict).equals(pd.concat(res_p.var_report_dict))

True

## varReportSinge

### 基本用法

对单一特征产生报告可用bm.varReportSinge,其可以灵活地调整分箱

In [21]:
bm.varReportSinge().report(X['age.in.years'],y,[20,30,40,50])

Unnamed: 0_level_0,variable,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,ks,ks_max,breaks
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"[-inf, 20.0)",age.in.years,2,0.002,1.0,1.0,0.5,0.847298,0.001614,0.064422,0.001905,0.122381,20.0
"[20.0, 30.0)",age.in.years,369,0.369,233.0,136.0,0.368564,0.308914,0.037217,0.064422,0.122381,0.122381,30.0
"[30.0, 40.0)",age.in.years,330,0.33,245.0,85.0,0.257576,-0.211309,0.014087,0.064422,0.055714,0.122381,40.0
"[40.0, 50.0)",age.in.years,174,0.174,130.0,44.0,0.252874,-0.236047,0.009217,0.064422,0.016667,0.122381,50.0
"[50.0, inf)",age.in.years,125,0.125,91.0,34.0,0.272,-0.137201,0.002287,0.064422,0.0,0.122381,inf
special,age.in.years,0,0.0,0.0,0.0,,0.0,0.0,0.064422,0.0,0.122381,special
missing,age.in.years,0,0.0,0.0,0.0,,0.0,0.0,0.064422,0.0,0.122381,missing


In [22]:
bm.varReportSinge().report(X['age.in.years'],y,[25,35,45,55])

Unnamed: 0_level_0,variable,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,ks,ks_max,breaks
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"[-inf, 25.0)",age.in.years,149,0.149,88.0,61.0,0.409396,0.480835,0.037322,0.089385,0.077619,0.131429,25.0
"[25.0, 35.0)",age.in.years,399,0.399,268.0,131.0,0.328321,0.131508,0.007076,0.089385,0.131429,0.131429,35.0
"[35.0, 45.0)",age.in.years,251,0.251,193.0,58.0,0.231076,-0.354949,0.029241,0.089385,0.049048,0.131429,45.0
"[45.0, 55.0)",age.in.years,122,0.122,94.0,28.0,0.229508,-0.363792,0.014898,0.089385,0.008095,0.131429,55.0
"[55.0, inf)",age.in.years,79,0.079,57.0,22.0,0.278481,-0.104711,0.000848,0.089385,0.0,0.131429,inf
special,age.in.years,0,0.0,0.0,0.0,,0.0,0.0,0.089385,0.0,0.131429,special
missing,age.in.years,0,0.0,0.0,0.0,,0.0,0.0,0.089385,0.0,0.131429,missing


当然也支持样本权重

In [23]:
sample_weight=pd.Series(y.map({0:10,1:1}),index=y.index)
bm.varReportSinge().report(X['age.in.years'],y,[25,35,45,55],sample_weight)

Unnamed: 0_level_0,variable,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,ks,ks_max,breaks
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"[-inf, 25.0)",age.in.years,941,0.128904,880.0,61.0,0.064825,0.480835,0.037322,0.089385,0.077619,0.131429,25.0
"[25.0, 35.0)",age.in.years,2811,0.385068,2680.0,131.0,0.046603,0.131508,0.007076,0.089385,0.131429,0.131429,35.0
"[35.0, 45.0)",age.in.years,1988,0.272329,1930.0,58.0,0.029175,-0.354949,0.029241,0.089385,0.049048,0.131429,45.0
"[45.0, 55.0)",age.in.years,968,0.132603,940.0,28.0,0.028926,-0.363792,0.014898,0.089385,0.008095,0.131429,55.0
"[55.0, inf)",age.in.years,592,0.081096,570.0,22.0,0.037162,-0.104711,0.000848,0.089385,0.0,0.131429,inf
special,age.in.years,0,0.0,0.0,0.0,,0.0,0.0,0.089385,0.0,0.131429,special
missing,age.in.years,0,0.0,0.0,0.0,,0.0,0.0,0.089385,0.0,0.131429,missing


## varGroupsReport

在实践中,单一的特征分析报告往往无法满足实际需要,通常还希望按不同的月份、产品、客群查看各个策略的表现,这就需要分组产生分析报告

varGroupsReport就是用来满足这种需要的

### 基本用法

In [24]:
#模拟月份(9,10,11,12月),模拟客群(3个)

X_all=X.join(y).assign(
    month=np.random.randint(9,13,y.size),
    client_group=pd.Series(np.random.randint(0,3,y.size),index=y.index).map({0:'g1',1:'g2',2:'g3'})
)

In [25]:
vtab_g=bm.varGroupsReport(bin_tree.breaks_list,columns=['month','client_group'],target=y.name,row_limit=0,
                          n_jobs=1).fit(X_all)

vtab_g中共六张报表:
+ report_all:各个组的特征报告
+ report_brief:各个组的简化版报告,只保留count,badprob,woe,total_iv,ks_max
+ report_count:各个组的简化版报告,只保留count
+ report_badprob:各个组的简化版报告,只保留badprob
+ report_iv:各个组的简化版报告,只保留total_iv
+ report_ks:各个组的简化版报告,只保留ks_max

以report_brief为例

In [26]:
vtab_g.report_dict['report_brief'][['variable','bin','10']] #只看10月份的报告

Unnamed: 0_level_0,variable,bin,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,g1,g1,g1,g1,g1,g2,g2,g2,g2,g2,g3,g3,g3,g3,g3
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,count,badprob,woe,total_iv,ks_max,count,badprob,woe,total_iv,ks_max,count,badprob,woe,total_iv,ks_max
0,age.in.years,"[-inf, 26.0)",14,0.428571,0.796944,3.569872,0.137324,12,0.500000,0.934309,0.741361,0.233766,13,0.461538,0.887303,4.534827,0.223039
1,age.in.years,"[26.0, 28.0)",11,0.000000,-21.161066,3.569872,0.137324,10,0.100000,-1.262915,0.741361,0.233766,6,0.166667,-0.567984,4.534827,0.223039
2,age.in.years,"[28.0, 30.0)",7,0.428571,0.796944,3.569872,0.137324,11,0.545455,1.116631,0.741361,0.233766,8,0.500000,1.041454,4.534827,0.223039
3,age.in.years,"[30.0, 35.0)",16,0.187500,-0.381711,3.569872,0.137324,9,0.111111,-1.145132,0.741361,0.233766,16,0.250000,-0.057158,4.534827,0.223039
4,age.in.years,"[35.0, 37.0)",5,0.200000,-0.301668,3.569872,0.137324,9,0.111111,-1.145132,0.741361,0.233766,7,0.000000,-20.752253,4.534827,0.223039
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,status.of.existing.checking.account,missing,0,,0.000000,0.876354,0.410798,0,,0.000000,0.701563,0.353896,0,,0.000000,0.200580,0.203431
116,telephone,"yes, registered under the customers name",39,0.230769,-0.119347,0.009433,0.047535,34,0.264706,-0.087342,0.005698,0.037338,42,0.261905,0.005362,0.000024,0.002451
117,telephone,none,56,0.267857,0.079104,0.009433,0.047535,44,0.295455,0.065271,0.005698,0.037338,50,0.260000,-0.004515,0.000024,0.002451
118,telephone,special,0,,0.000000,0.009433,0.047535,0,,0.000000,0.005698,0.037338,0,,0.000000,0.000024,0.002451


In [27]:
vtab_g.report_dict['report_brief'] #全部报告

Unnamed: 0_level_0,variable,bin,10,10,10,10,10,10,10,10,...,9,9,9,9,9,9,9,9,9,9
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,g1,g1,g1,g1,g1,g2,g2,g2,...,g2,g2,g2,g2,g2,g3,g3,g3,g3,g3
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,count,badprob,woe,total_iv,ks_max,count,badprob,woe,...,count,badprob,woe,total_iv,ks_max,count,badprob,woe,total_iv,ks_max
0,age.in.years,"[-inf, 26.0)",14,0.428571,0.796944,3.569872,0.137324,12,0.500000,0.934309,...,24,0.458333,0.667744,2.631772,0.232978,23,0.565217,1.098612,3.935034,0.353846
1,age.in.years,"[26.0, 28.0)",11,0.000000,-21.161066,3.569872,0.137324,10,0.100000,-1.262915,...,4,0.000000,-20.441853,2.631772,0.232978,12,0.333333,0.143101,3.935034,0.353846
2,age.in.years,"[28.0, 30.0)",7,0.428571,0.796944,3.569872,0.137324,11,0.545455,1.116631,...,2,0.000000,-19.748706,2.631772,0.232978,4,0.250000,-0.262364,3.935034,0.353846
3,age.in.years,"[30.0, 35.0)",16,0.187500,-0.381711,3.569872,0.137324,9,0.111111,-1.145132,...,10,0.100000,-1.362427,2.631772,0.232978,14,0.142857,-0.955511,3.935034,0.353846
4,age.in.years,"[35.0, 37.0)",5,0.200000,-0.301668,3.569872,0.137324,9,0.111111,-1.145132,...,5,0.400000,0.429333,2.631772,0.232978,6,0.000000,-20.723266,3.935034,0.353846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,status.of.existing.checking.account,missing,0,,0.000000,0.876354,0.410798,0,,0.000000,...,0,,0.000000,0.422172,0.278917,0,,0.000000,0.718063,0.362821
116,telephone,"yes, registered under the customers name",39,0.230769,-0.119347,0.009433,0.047535,34,0.264706,-0.087342,...,28,0.285714,-0.081493,0.003775,0.029532,33,0.363636,0.276632,0.051853,0.111538
117,telephone,none,56,0.267857,0.079104,0.009433,0.047535,44,0.295455,0.065271,...,48,0.312500,0.046340,0.003775,0.029532,53,0.264151,-0.188256,0.051853,0.111538
118,telephone,special,0,,0.000000,0.009433,0.047535,0,,0.000000,...,0,,0.000000,0.003775,0.029532,0,,0.000000,0.051853,0.111538


In [28]:
vtab_g.report_dict['report_ks']

Unnamed: 0_level_0,variable,10,10,10,11,11,11,12,12,12,9,9,9
Unnamed: 0_level_1,Unnamed: 1_level_1,g1,g2,g3,g1,g2,g3,g1,g2,g3,g1,g2,g3
Unnamed: 0_level_2,Unnamed: 1_level_2,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max
0,age.in.years,0.137324,0.233766,0.223039,0.13617,0.159151,0.127159,0.099265,0.162698,0.277931,0.27197,0.232978,0.353846
1,credit.amount,0.234742,0.267857,0.07598,0.147234,0.098806,0.257457,0.143382,0.17381,0.122759,0.138636,0.240361,0.239744
2,credit.history,0.27054,0.275974,0.301471,0.145532,0.208223,0.200157,0.283088,0.144444,0.133793,0.231818,0.154225,0.124359
3,duration.in.month,0.243545,0.256494,0.117647,0.313191,0.301061,0.141287,0.227941,0.260317,0.233793,0.112121,0.106645,0.302564
4,foreign.worker,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,housing,0.344484,0.25,0.215686,0.11234,0.058355,0.062794,0.130515,0.143651,0.101379,0.042424,0.208368,0.211538
6,installment.rate.in.percentage.of.disposable.i...,0.059859,0.13961,0.29902,0.230638,0.212202,0.094192,0.128676,0.061905,0.066207,0.1,0.086136,0.275641
7,job,0.086268,0.084416,0.095588,0.171064,0.068302,0.083203,0.09375,0.157143,0.184138,0.15303,0.164889,0.123077
8,number.of.existing.credits.at.this.bank,0.242371,0.219156,0.009804,0.003404,0.104775,0.175039,0.069853,0.053175,0.053793,0.037879,0.032814,0.111538
9,number.of.people.being.liable.to.provide.maint...,0.15493,0.141234,0.063725,0.091064,0.015915,0.169545,0.027574,0.026984,0.012414,0.044697,0.085316,0.178205


### 排序组特征水平

上述报告中,月份按字符串排序(顺序为10,11,12,9),不符合预期;若希望以指定顺序排列报告,可通过sort_columns参数设定

In [29]:
sort_columns={
    'month':['9','10','11','12'],
    'client_group':['g3','g2','g1']
} 

In [30]:
vtab_g=bm.varGroupsReport(bin_tree.breaks_list,columns=['month','client_group'],target=y.name,row_limit=0,
                          sort_columns=sort_columns,
                          n_jobs=1).fit(X_all)

In [31]:
vtab_g.report_dict['report_ks'] #排序后的数据

Unnamed: 0_level_0,variable,9,9,9,10,10,10,11,11,11,12,12,12
Unnamed: 0_level_1,Unnamed: 1_level_1,g3,g2,g1,g3,g2,g1,g3,g2,g1,g3,g2,g1
Unnamed: 0_level_2,Unnamed: 1_level_2,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max
0,age.in.years,0.353846,0.232978,0.27197,0.223039,0.233766,0.137324,0.127159,0.159151,0.13617,0.277931,0.162698,0.099265
1,credit.amount,0.239744,0.240361,0.138636,0.07598,0.267857,0.234742,0.257457,0.098806,0.147234,0.122759,0.17381,0.143382
2,credit.history,0.124359,0.154225,0.231818,0.301471,0.275974,0.27054,0.200157,0.208223,0.145532,0.133793,0.144444,0.283088
3,duration.in.month,0.302564,0.106645,0.112121,0.117647,0.256494,0.243545,0.141287,0.301061,0.313191,0.233793,0.260317,0.227941
4,foreign.worker,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,housing,0.211538,0.208368,0.042424,0.215686,0.25,0.344484,0.062794,0.058355,0.11234,0.101379,0.143651,0.130515
6,installment.rate.in.percentage.of.disposable.i...,0.275641,0.086136,0.1,0.29902,0.13961,0.059859,0.094192,0.212202,0.230638,0.066207,0.061905,0.128676
7,job,0.123077,0.164889,0.15303,0.095588,0.084416,0.086268,0.083203,0.068302,0.171064,0.184138,0.157143,0.09375
8,number.of.existing.credits.at.this.bank,0.111538,0.032814,0.037879,0.009804,0.219156,0.242371,0.175039,0.104775,0.003404,0.053793,0.053175,0.069853
9,number.of.people.being.liable.to.provide.maint...,0.178205,0.085316,0.044697,0.063725,0.141234,0.15493,0.169545,0.015915,0.091064,0.012414,0.026984,0.027574


+ 若希望client_group在第一行,month在第二行,那么只需要把columns=['month','client_group']换为columns=['client_group','month']

### 使用row_limit

有时组分得过细会导致组内样本量不足,指标可能不具备代表性,因此可通过row_limit进行限定:若某组的样本量小于row_limit,则不会统计该组的任何指标

In [32]:
vtab_g=bm.varGroupsReport(bin_tree.breaks_list,columns=['month','client_group'],target=y.name,
                          row_limit=80,
                          sort_columns=sort_columns,
                          n_jobs=1).fit(X_all)

group ('10', 'g2') has rows less than 80,output will return None
group ('11', 'g1') has rows less than 80,output will return None
group ('11', 'g3') has rows less than 80,output will return None
group ('9', 'g1') has rows less than 80,output will return None
group ('9', 'g2') has rows less than 80,output will return None


从警告信息中可以看到被剔除的组

In [33]:
vtab_g.report_dict['report_ks']

Unnamed: 0_level_0,variable,9,10,10,11,12,12,12
Unnamed: 0_level_1,Unnamed: 1_level_1,g3,g3,g1,g2,g3,g2,g1
Unnamed: 0_level_2,Unnamed: 1_level_2,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max,ks_max
0,age.in.years,0.353846,0.223039,0.137324,0.159151,0.277931,0.162698,0.099265
1,credit.amount,0.239744,0.07598,0.234742,0.098806,0.122759,0.17381,0.143382
2,credit.history,0.124359,0.301471,0.27054,0.208223,0.133793,0.144444,0.283088
3,duration.in.month,0.302564,0.117647,0.243545,0.301061,0.233793,0.260317,0.227941
4,foreign.worker,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,housing,0.211538,0.215686,0.344484,0.058355,0.101379,0.143651,0.130515
6,installment.rate.in.percentage.of.disposable.i...,0.275641,0.29902,0.059859,0.212202,0.066207,0.061905,0.128676
7,job,0.123077,0.095588,0.086268,0.068302,0.184138,0.157143,0.09375
8,number.of.existing.credits.at.this.bank,0.111538,0.009804,0.242371,0.104775,0.053793,0.053175,0.069853
9,number.of.people.being.liable.to.provide.maint...,0.178205,0.063725,0.15493,0.015915,0.012414,0.026984,0.027574


### 产生psi报告

varGroupsReport可以产生psi报告以比较各个组中各个变量的分布变动情况

In [34]:
vtabs_g=bm.varGroupsReport(bin_tree.breaks_list,columns=['client_group'],target=y.name,
                    row_limit=0,output_psi=True,n_jobs=1).fit(X_all)

In [35]:
vtabs_g.report_dict['report_psi'].head(21)

Unnamed: 0_level_0,variable,bin,g1,g2,g3
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count_distr,count_distr,count_distr
0,age.in.years,"[-inf, 26.0)",0.16763,0.207547,0.196429
1,age.in.years,"[26.0, 28.0)",0.106936,0.084906,0.110119
2,age.in.years,"[28.0, 30.0)",0.078035,0.091195,0.071429
3,age.in.years,"[30.0, 35.0)",0.16474,0.169811,0.196429
4,age.in.years,"[35.0, 37.0)",0.066474,0.081761,0.089286
5,age.in.years,"[37.0, 48.0)",0.257225,0.22956,0.178571
6,age.in.years,"[48.0, 53.0)",0.052023,0.050314,0.0625
7,age.in.years,"[53.0, inf)",0.106936,0.084906,0.095238
8,age.in.years,missing,0.0,0.0,0.0
9,age.in.years,psi,0.012766,0.008236,0.015675


每个变量的最后一行为psi行,用于显示各个组的分布与基准分布比较而得到的psi

默认情况下psi的基准分布为全量数据分布,这里也可以指定某一组的分布为基准分布

+ 参数psi_base用于指定分布基准
+ 参数psi_base会在模块内部通过X.query(psi_base)传递,其语法为pd.DataFrame.query()的语法

In [40]:
X_g_gen=X_all.groupby('client_group')

In [60]:
#选择client_group=="g1"为psi基准分布
vtabs_g=bm.varGroupsReport(bin_tree.breaks_list,columns=['client_group'],target=y.name,
                    row_limit=0,output_psi=True,psi_base='client_group=="g1"',n_jobs=1).fit(X_all)

In [None]:
vtabs_g.report_dict['report_psi'].head(20) #可以看到g1的psi为0

### 并行

varGroupsReport使用joblib对各组的报告生成进行了并行优化,

+ 当数据量非常大、列较多且组的数量较多时,可通过设定n_jobs=任务数进行并行以提高运行速度
+ 一般情况下建议n_jobs=1

In [None]:
#模拟较大数据量,3w行1000个特征,一个组特征(5个水平)
X_big=pd.DataFrame(np.random.rand(30000,1000),columns=['f'+str(i) for i in range(1000)])
y_big=pd.Series(np.random.randint(0,2,30000),name='target')

breaks_list_big={col:[0.2,0.4,0.6,0.8] for col in X_big.columns}

X_big_all=X_big.join(y_big).assign(
    client_group=pd.Series(np.random.randint(0,5,y_big.size),index=y_big.index).map({0:'g1',
                                                                                     1:'g2',
                                                                                     2:'g3',
                                                                                     3:'g4',
                                                                                     4:'g5'
                                                                                    })
)

In [None]:
X_big_all.info()

In [None]:
%%time
vtabs_g_p=bm.varGroupsReport(breaks_list_big,columns=['client_group'],target=y_big.name,
                    row_limit=0,output_psi=True,psi_base='client_group=="g1"',n_jobs=-1).fit(X_big_all)

In [None]:
%%time
vtabs_g=bm.varGroupsReport(breaks_list_big,columns=['client_group'],target=y_big.name,
                    row_limit=0,output_psi=True,psi_base='client_group=="g1"',n_jobs=1).fit(X_big_all)

### 样本权重

varGroupsReport提供了参数sample_weight,以在报告中加入样本权重,使用方法与varReport一致

In [None]:
sample_weight=pd.Series(y.map({0:10,1:1}),index=y.index) #假定数据经过抽样后好样本权重为10,坏样本权重为1

In [None]:
vtab_g_ws=bm.varGroupsReport(bin_tree.breaks_list,columns=['split'],target=y.name,
                       sample_weight=sample_weight,
                       row_limit=0,n_jobs=1).fit(X_all.assign(split=1))

In [None]:
vtab_g_ws.report_dict['report_all'].head(10)

### 导出为excel

varGroupsReport支持将组报告导出为excel,文件名为var_report.xlsx

+ 参数out_path为报告输出路径,若无此路径则模块会创建该路径
+ 参数tab_suffix代表报告名称后缀,例如tab_suffix='_group'时,输出报告名称为var_report_group.xlsx
+ report_dict中所有的报告都会被导出,并写为不同的sheet

In [None]:
vtabs_g=bm.varGroupsReport(bin_tree.breaks_list,columns=['client_group'],
                           target=y.name,row_limit=0,
                           output_psi=True,n_jobs=1,
                           out_path='report/',
                           tab_suffix='_client'
                          ).fit(X_all)