# BDMLtools制作评分卡并与scorecardpy对比  <font size=3 >by 曾珂</font>

In [1]:
import scorecardpy as sc
import BDMLtools as bm
import pandas as pd

In [2]:
bm.__version__

'0.1.7'

In [3]:
sc.__version__

'0.1.9.2'

+ load germancredit data

In [4]:
dt=sc.germancredit().copy()
dt['creditability']=dt['creditability'].map({'good':0,'bad':1})

+ 将germancredit的有序category类型转换为有序数值类型

In [5]:
da=bm.dtypeAllocator().fit(dt)
dt=da.transform(dt)

In [6]:
X=dt.drop('creditability',axis=1)
y=dt['creditability']

## 制作评分卡

### 产生特征分析报告

In [7]:
breaks_list_user={'age.in.years': [26.0, 30.0, 35.0],
 'credit.amount': [4000.0, 6200.0, 8000.0],
 'credit.history': [2.0, 3.0, 4.0],
 'duration.in.month': [8.0, 16.0, 44.0],
 'foreign.worker': [1],
 'housing': [1.0],
 'installment.rate.in.percentage.of.disposable.income': [2.0, 3.0, 4.0],
 'job': [2.0, 3.0],
 'number.of.existing.credits.at.this.bank': [2.0],
 'number.of.people.being.liable.to.provide.maintenance.for': [2.0],
 'other.debtors.or.guarantors': [2.0],
 'other.installment.plans': [2.0],
 'personal.status.and.sex': [2.0, 3.0],
 'present.employment.since': [2.0, 3.0],
 'present.residence.since': [2.0],
 'property': [1.0, 2.0, 3.0],
 'purpose': ['retraining%,%car (used)',
  'radio/television',
  'furniture/equipment%,%domestic appliances%,%business',
  'repairs%,%car (new)%,%others%,%education'],
 'savings.account.and.bonds': [1.0, 2.0, 3.0],
 'status.of.existing.checking.account': [1.0, 2.0, 3.0],
 'telephone': [1.0]}

In [8]:
%%time
bin_sc=sc.woebin(dt,y='creditability',breaks_list=breaks_list_user,no_cores=1)

[INFO] creating woe binning ...
CPU times: user 498 ms, sys: 8.66 ms, total: 506 ms
Wall time: 505 ms


In [9]:
%%time
bin_bm=bm.varReport(breaks_list_dict=breaks_list_user,n_jobs=1).fit(X,y).var_report_dict

CPU times: user 270 ms, sys: 7.22 ms, total: 277 ms
Wall time: 275 ms


### woe编码

In [10]:
%%time
dt_woe_sc = sc.woebin_ply(dt, bins=bin_sc,no_cores=1)

[INFO] converting into woe values ...
CPU times: user 308 ms, sys: 9.31 ms, total: 317 ms
Wall time: 313 ms


In [11]:
%%time
dt_woe_bm = bm.woeTransformer(varbin=bin_bm,n_jobs=1).transform(X,y)

CPU times: user 71.6 ms, sys: 2.79 ms, total: 74.4 ms
Wall time: 73 ms


In [None]:
dt_woe_bm.dtype

+ 对比两份数据的编码结果

In [12]:
dt_woe_sc_1=dt_woe_sc.loc[:,dt_woe_sc.columns.str.contains('woe')]
dt_woe_sc_1.columns=[i[:-4] for i in dt_woe_sc_1.columns]
dt_woe_sc_1=dt_woe_sc_1[dt_woe_bm.columns]

+ 两者数值上几乎无区别，低精度类型条件下数值相等

In [13]:
dt_woe_sc_1.astype('float32').equals(dt_woe_bm.astype('float32')) #结果对比

True

### 回归建模

In [14]:
from sklearn.linear_model import LogisticRegression

+ 使用lr建模

In [15]:
lr_sc = LogisticRegression(penalty='l1',C=0.9,solver='saga').fit(dt_woe_sc_1, y)

In [16]:
lr_bm = LogisticRegression(penalty='l1',C=0.9,solver='saga').fit(dt_woe_bm, y)

BDMLtools的woe值转化数据dt_woe_bm(float64)与scorecardpy的woe转化数据dt_woe_sc(float64)有精度上细微的差别,,因此回归系数在会有细微差别

In [37]:
lr_sc.coef_

array([[0.61137712, 0.88738224, 0.64828953, 0.64220188, 0.72711308,
        0.72101066, 1.65428732, 0.        , 0.        , 0.        ,
        0.96039579, 0.63858036, 0.        , 0.55408163, 0.        ,
        0.40907312, 1.01363035, 0.77097241, 0.78982356, 0.58570633]])

In [38]:
lr_bm.coef_

array([[0.61144096, 0.88730925, 0.64829691, 0.64220625, 0.72707624,
        0.72098531, 1.65427221, 0.        , 0.        , 0.        ,
        0.96034977, 0.63855477, 0.        , 0.55408591, 0.        ,
        0.40903206, 1.01362096, 0.77097447, 0.78982819, 0.58436053]])

In [19]:
lr_sc.feature_names_in_[lr_bm.coef_[0]==0]

array(['job', 'number.of.existing.credits.at.this.bank',
       'number.of.people.being.liable.to.provide.maintenance.for',
       'personal.status.and.sex', 'present.residence.since'], dtype=object)

+ 制作评分卡

    这里scorecardpy产生评分卡时会去掉系数为0的特征,而bm会保留这些特征
    + 这种区别不会对后续打分带来任何影响,因为系数为0时score_point也为0

In [20]:
card_sc = sc.scorecard(bin_sc, lr_sc, dt_woe_sc_1.columns,
                           points0=600,
                           odds0=0.05263157894736842,
                           pdo=50)

In [21]:
card_obj = bm.cardScorer(lr_bm,bin_bm,
                        odds0=0.05263157894736842,
                        pdo=50,
                        points0=600).fit(X)

#bm.cardScorer可支持sklearn的LogisticRegression,也可支持statsmodels的glm-logit回归或logit回归

card_bm = card_obj.scorecard

In [22]:
print(len(card_sc),len(card_bm))

16 21


+ BDMLtools的特征分析报告文件中,任何变量都会存在missing水平,用以标示缺失值或特殊值,

    + 数据无缺失的时missing的woe为0,因此points为0,不会对打分产生任何影响
    + 若希望赋予missing值特殊的woe(比如woe较高(风险较大)),可以通过bm.cardScorer的woe_missing参数设定,这也是BMDLtools的特色
    + 更多细节请参考帮助文档

In [23]:
pd.concat(card_bm)[['points']].head(13)

Unnamed: 0,Unnamed: 1,points
intercept,intercept,449.0
age.in.years,"[-inf, 26.0)",-23.0
age.in.years,"[26.0, 30.0)",-3.0
age.in.years,"[30.0, 35.0)",-2.0
age.in.years,"[35.0, inf)",14.0
age.in.years,special,-0.0
age.in.years,missing,-0.0
credit.amount,"[-inf, 4000.0)",13.0
credit.amount,"[4000.0, 6200.0)",-22.0
credit.amount,"[6200.0, 8000.0)",-25.0


In [24]:
pd.concat(card_sc)[['variable','bin','points']].head(9)

Unnamed: 0,Unnamed: 1,variable,bin,points
basepoints,0,basepoints,,449.0
age.in.years,6,age.in.years,"[-inf,26.0)",-23.0
age.in.years,7,age.in.years,"[26.0,30.0)",-3.0
age.in.years,8,age.in.years,"[30.0,35.0)",-2.0
age.in.years,9,age.in.years,"[35.0,inf)",14.0
credit.amount,2,credit.amount,"[-inf,4000.0)",13.0
credit.amount,3,credit.amount,"[4000.0,6200.0)",-22.0
credit.amount,4,credit.amount,"[6200.0,8000.0)",-25.0
credit.amount,5,credit.amount,"[8000.0,inf)",-65.0


+ 对比得分结果

In [25]:
%%time
dt_score_sc=sc.scorecard_ply(dt,card_sc)

CPU times: user 246 ms, sys: 7.53 ms, total: 253 ms
Wall time: 250 ms



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [26]:
%%time
dt_score_bm=card_obj.transform(X)

CPU times: user 65.7 ms, sys: 3.34 ms, total: 69 ms
Wall time: 66.7 ms


In [36]:
dt_score_sc['score'].equals(dt_score_bm['score'])

False

In [35]:
card_obj.scorecard['age.in.years'].dtypes

variable     object
points      float64
woe         float64
breaks       object
dtype: object

In [31]:
dt_score_sc['score']

0      615.0
1      375.0
2      598.0
3      429.0
4      331.0
       ...  
995    551.0
996    450.0
997    566.0
998    324.0
999    427.0
Name: score, Length: 1000, dtype: float64

In [29]:
dt_score_bm['score']

0      615.0
1      375.0
2      598.0
3      429.0
4      331.0
       ...  
995    551.0
996    450.0
997    566.0
998    324.0
999    427.0
Name: score, Length: 1000, dtype: float32