# BDMLtools制作评分卡并与scorecardpy对比  <font size=3 >by 曾珂</font>

In [1]:
import scorecardpy as sc
import BDMLtools as bm
import pandas as pd

+ load germancredit data

In [2]:
# Load the German credit sample data set provided by scorecardpy and work on a copy.
dt=sc.germancredit().copy()
# Recode the target column as 0/1: 'good' -> 0, 'bad' -> 1.
dt['creditability']=dt['creditability'].map({'good':0,'bad':1})

+ 将germancredit的有序category类型转换为有序数值类型

In [3]:
# Fit BDMLtools' dtypeAllocator on dt, then apply it to dt; used here to turn
# germancredit's ordered category columns into ordered numeric columns.
da=bm.dtypeAllocator().fit(dt)
dt=da.transform(dt)

In [4]:
# Feature matrix: every column except the target.
X=dt.drop('creditability',axis=1)
# Target vector (0/1 encoded 'creditability').
y=dt['creditability']

## 制作评分卡

### 产生特征分析报告

In [5]:
# User-specified break points, keyed by feature name. The same dict is passed to
# sc.woebin (breaks_list) and bm.binSelector (breaks_list_adj), so both libraries
# receive identical bin definitions.
# NOTE(review): for the string-valued 'purpose' feature, several levels appear to be
# merged into one bin by joining them with '%,%' — confirm against the library docs.
breaks_list_user={'age.in.years': [26.0, 30.0, 35.0],
 'credit.amount': [4000.0, 6200.0, 8000.0],
 'credit.history': [2.0, 3.0, 4.0],
 'duration.in.month': [8.0, 16.0, 44.0],
 'foreign.worker': [1],
 'housing': [1.0],
 'installment.rate.in.percentage.of.disposable.income': [2.0, 3.0, 4.0],
 'job': [2.0, 3.0],
 'number.of.existing.credits.at.this.bank': [2.0],
 'number.of.people.being.liable.to.provide.maintenance.for': [2.0],
 'other.debtors.or.guarantors': [2.0],
 'other.installment.plans': [2.0],
 'personal.status.and.sex': [2.0, 3.0],
 'present.employment.since': [2.0, 3.0],
 'present.residence.since': [2.0],
 'property': [1.0, 2.0, 3.0],
 'purpose': ['retraining%,%car (used)',
  'radio/television',
  'furniture/equipment%,%domestic appliances%,%business',
  'repairs%,%car (new)%,%others%,%education'],
 'savings.account.and.bonds': [1.0, 2.0, 3.0],
 'status.of.existing.checking.account': [1.0, 2.0, 3.0],
 'telephone': [1.0]}

In [6]:
%%time
# scorecardpy binning of dt against 'creditability' using the user-defined breaks (no_cores=1).
bin_sc=sc.woebin(dt,y='creditability',breaks_list=breaks_list_user,no_cores=1)

[INFO] creating woe binning ...
CPU times: user 545 ms, sys: 10.6 ms, total: 556 ms
Wall time: 561 ms


In [7]:
%%time
# BDMLtools counterpart: fit binSelector with the same breaks (n_jobs=1) and keep its .adjbin result.
bin_bm=bm.binSelector(breaks_list_adj=breaks_list_user,n_jobs=1).fit(X,y).adjbin

CPU times: user 315 ms, sys: 9.78 ms, total: 325 ms
Wall time: 326 ms


### woe编码

In [8]:
%%time
# Apply the scorecardpy bins to dt to obtain its woe-encoded frame (no_cores=1).
dt_woe_sc = sc.woebin_ply(dt, bins=bin_sc,no_cores=1)

[INFO] converting into woe values ...
CPU times: user 409 ms, sys: 15.1 ms, total: 424 ms
Wall time: 426 ms


In [9]:
%%time
# Apply the BDMLtools bins to X via woeTransformer (n_jobs=1) to obtain its woe-encoded frame.
dt_woe_bm = bm.woeTransformer(varbin=bin_bm,n_jobs=1).transform(X,y)

CPU times: user 76.1 ms, sys: 3.75 ms, total: 79.9 ms
Wall time: 79.4 ms


+ 对比两份数据的编码结果

In [10]:
# Keep only scorecardpy's woe-encoded columns. Anchor on the '_woe' suffix: a plain
# substring match would also select any column that merely contains "woe", and the
# suffix stripping below would then mangle its name.
woe_suffix='_woe'
# .copy() gives an independent frame, so renaming its columns never touches dt_woe_sc.
dt_woe_sc_1=dt_woe_sc.loc[:,dt_woe_sc.columns.str.endswith(woe_suffix)].copy()
# Drop the suffix so the column names match those of the BDMLtools output.
dt_woe_sc_1.columns=[i[:-len(woe_suffix)] for i in dt_woe_sc_1.columns]
# Put the columns in the same order as dt_woe_bm so the two frames can be compared directly.
dt_woe_sc_1=dt_woe_sc_1[dt_woe_bm.columns]

In [11]:
# Cast scorecardpy's woe frame to float32 before checking that it equals dt_woe_bm.
dt_woe_sc_1.astype('float32').equals(dt_woe_bm) # compare the results

True

### 回归建模

In [12]:
from sklearn.linear_model import LogisticRegression

+ 使用lr建模

In [13]:
# L1-penalised logistic regression (C=0.9, saga solver) fitted on the scorecardpy woe data.
# NOTE(review): no random_state is set; saga shuffles the data, so coefficients may not be
# bit-for-bit reproducible between runs — consider fixing random_state.
lr_sc = LogisticRegression(penalty='l1',C=0.9,solver='saga').fit(dt_woe_sc_1, y)

In [14]:
# Same model specification, fitted on the BDMLtools woe data.
# NOTE(review): no random_state is set; saga shuffles the data, so coefficients may not be
# bit-for-bit reproducible between runs — consider fixing random_state.
lr_bm = LogisticRegression(penalty='l1',C=0.9,solver='saga').fit(dt_woe_bm, y)

为节省内存,BDMLtools的woe值转化数据dt_woe_bm为float32,而scorecardpy的为float64,因此回归系数会有细微差别

In [15]:
lr_sc.coef_

array([[0.61138176, 0.88738056, 0.64828809, 0.64219757, 0.72711397,
        0.72100921, 1.65429331, 0.        , 0.        , 0.        ,
        0.96038957, 0.6385824 , 0.        , 0.55408126, 0.        ,
        0.40907374, 1.01362541, 0.77096885, 0.78982772, 0.58574398]])

In [16]:
lr_bm.coef_

array([[0.61144763, 0.88730806, 0.6482932 , 0.6422019 , 0.7270549 ,
        0.72097105, 1.6542778 , 0.        , 0.        , 0.        ,
        0.9602891 , 0.6385427 , 0.        , 0.55408764, 0.        ,
        0.40903908, 1.0136266 , 0.7709711 , 0.78982747, 0.58435893]],
      dtype=float32)

In [17]:
# Names of the features whose L1-penalised coefficient is exactly 0.
# Names and mask are taken from the same fitted model, so the selection does not
# rely on lr_sc and lr_bm having been trained with identical column order.
lr_sc.feature_names_in_[lr_sc.coef_[0]==0]

array(['job', 'number.of.existing.credits.at.this.bank',
       'number.of.people.being.liable.to.provide.maintenance.for',
       'personal.status.and.sex', 'present.residence.since'], dtype=object)

+ 制作评分卡

    这里scorecardpy产生评分卡时会去掉系数为0的特征,而bm会保留这些特征
    + 这种区别不会对后续打分带来任何影响,因为系数为0时score_point也为0

In [18]:
# Build the scorecardpy scorecard from its bins, its fitted regression and the
# woe column names. odds0=0.05263157894736842 equals 1/19.
card_sc = sc.scorecard(bin_sc, lr_sc, dt_woe_sc_1.columns,
                           points0=600,
                           odds0=0.05263157894736842,
                           pdo=50)

In [19]:
# Build the BDMLtools scorer from its fitted regression and bins, with the same
# odds0 (= 1/19), pdo and points0 values, then fit it on X.
card_obj = bm.cardScorer(lr_bm,bin_bm,
                        odds0=0.05263157894736842,
                        pdo=50,
                        points0=600).fit(X)

# bm.cardScorer supports sklearn's LogisticRegression as well as statsmodels' glm-logit or logit regression

# The generated scorecard is read from the fitted scorer's .scorecard attribute.
card_bm = card_obj.scorecard

In [20]:
print(len(card_sc),len(card_bm))

16 21


+ BDMLtools的特征分析报告文件中,任何变量都会存在missing水平,用以标示缺失值或特殊值,

    + 数据无缺失时missing的woe为0,因此points为0,不会对打分产生任何影响
    + 若希望赋予missing值特殊的woe(比如woe较高(风险较大)),可以通过bm.cardScorer的woe_missing参数设定,这也是BDMLtools的特色
    + 更多细节请参考帮助文档

In [21]:
pd.concat(card_bm)[['points']].head(11)

Unnamed: 0,Unnamed: 1,points
intercept,intercept,449.0
age.in.years,"[-inf, 26.0)",-23.0
age.in.years,"[26.0, 30.0)",-3.0
age.in.years,"[30.0, 35.0)",-2.0
age.in.years,"[35.0, inf)",14.0
age.in.years,missing,-0.0
credit.amount,"[-inf, 4000.0)",13.0
credit.amount,"[4000.0, 6200.0)",-22.0
credit.amount,"[6200.0, 8000.0)",-25.0
credit.amount,"[8000.0, inf)",-65.0


In [22]:
pd.concat(card_sc)[['variable','bin','points']].head(9)

Unnamed: 0,Unnamed: 1,variable,bin,points
basepoints,0,basepoints,,449.0
age.in.years,57,age.in.years,"[-inf,26.0)",-23.0
age.in.years,58,age.in.years,"[26.0,30.0)",-3.0
age.in.years,59,age.in.years,"[30.0,35.0)",-2.0
age.in.years,60,age.in.years,"[35.0,inf)",14.0
credit.amount,4,credit.amount,"[-inf,4000.0)",13.0
credit.amount,5,credit.amount,"[4000.0,6200.0)",-22.0
credit.amount,6,credit.amount,"[6200.0,8000.0)",-25.0
credit.amount,7,credit.amount,"[8000.0,inf)",-65.0


+ 对比得分结果

In [23]:
%%time
# Score dt with the scorecardpy scorecard.
dt_score_sc=sc.scorecard_ply(dt,card_sc)

CPU times: user 345 ms, sys: 11.4 ms, total: 357 ms
Wall time: 359 ms



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [24]:
%%time
# Score X with the fitted BDMLtools cardScorer.
dt_score_bm=card_obj.transform(X)

CPU times: user 59.6 ms, sys: 3.53 ms, total: 63.1 ms
Wall time: 61.1 ms


In [25]:
# Cast scorecardpy's 'score' column to float32, then check it equals the BDMLtools 'score' column.
dt_score_sc['score'].astype('float32').equals(dt_score_bm['score'])

True