In [1]:
from sklearn.metrics import roc_auc_score,accuracy_score,classification_report
from scipy.stats import ks_2samp
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split,cross_val_score
from datetime import datetime
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelBinarizer
from sklearn2pmml.decoration import CategoricalDomain, ContinuousDomain
from sklearn2pmml import PMMLPipeline
import time
import pandas as pd
import numpy as np
from sklearn.externals import joblib
import pickle
# from project_demo.tools.optimize import *
from project_demo.tools.evaluate import *
%matplotlib inline
from sklearn import metrics

# 本文件用于训练fasttext包含在LGBM

In [2]:
def auc_ks(model, dataframemapper, trainset, testset, varlist,train_only=False):
    if train_only == False:
        df_test=dataframemapper.transform(testset)
        predprob = pd.DataFrame(model.predict_proba(df_test[varlist])[:,1], columns = ['predprob'])
        predprob['ytrue'] = testset['perf'].values
        auc = roc_auc_score(y_score=predprob['predprob'], y_true=predprob['ytrue'])
        print('AUC On Test is: {}'.format(auc))
        print('KS On Test is: {}'.format(cal_ks_scipy(predprob['predprob'], predprob['ytrue'])))
        predprob_test = predprob
    
    df_train=dataframemapper.transform(trainset)
    predprob = pd.DataFrame(model.predict_proba(df_train[varlist])[:,1], columns = ['predprob'])
    predprob['ytrue'] = trainset['perf'].values
    auc = roc_auc_score(y_score=predprob['predprob'], y_true=predprob['ytrue'])
    print('AUC On Train is: {}'.format(auc))
    print('KS On Train is: {}'.format(cal_ks_scipy(predprob['predprob'], predprob['ytrue'])))
    return predprob['predprob'], predprob['ytrue'],predprob_test['predprob'],predprob_test['ytrue']

In [3]:
def ks(df_score, df_good,fig_dir):
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    
    df_score = pd.DataFrame(df_score)
    df_good = pd.DataFrame(df_good) 
    df_score.columns = ['score']
    df_good.columns = ['good']
    df = pd.concat([df_score,df_good],axis=1)
    
    df['bad'] = 1 - df.good
    bin = np.arange(0, 1.001, 0.05)
    df['bucket'] = pd.cut(df.score, bin)  # 根据bin来划分区间
   
    grouped = df.groupby('bucket', as_index=False) # 统计在每个区间的样本量
    agg1 = pd.DataFrame()
    agg1['min_scr'] = grouped.min().score # 取得每个区间的最小值
    agg1['max_scr'] = grouped.max().score
    agg1['bads'] = grouped.sum().bad # 计算每个区间bad的总数量
    agg1['goods'] = grouped.sum().good
    
    agg2 = (agg1.sort_values(['min_scr'])).reset_index(drop=True) # 根据区间最小值排序
    agg2['bad_cum_rate'] = np.round((agg2.bads / df.bad.sum()).cumsum(), 4) # 计算bad样本累计概率
    agg2['good_cum_rate'] = np.round((agg2.goods / df.good.sum()).cumsum(), 4) 
    agg2['ks'] = abs(np.round(((agg2.bads / df.bad.sum()).cumsum() - (agg2.goods / df.good.sum()).cumsum()), 4)) # 计算bad和good累计概率之差的绝对值
    ks = agg2.ks.max()  # 求出ks
    
    plt.figure(figsize=(8, 4))  # 创建绘图对象
    plt.plot(agg2.min_scr, agg2.bad_cum_rate, "g-", linewidth=1)  # 在当前绘图对象绘图（X轴，Y轴，蓝色虚线，线宽度）
    plt.plot(agg2.min_scr, agg2.good_cum_rate, "b-", linewidth=1)
    
    x_abline = agg2['min_scr'][agg2['ks'] == agg2['ks'].max()] # ks最大的min_scr
    y_abline1 = agg2['bad_cum_rate'][agg2['ks'] == agg2['ks'].max()] # ks最大时bad_cum_rate
    y_abline2 = agg2['good_cum_rate'][agg2['ks'] == agg2['ks'].max()]
    plt.fill_between(x_abline, y_abline1, y_abline2, color = "red",linewidth=2)    
    
    sub = "%s%s"%('ks = ',ks)
    plt.legend(title=sub,loc='lower right')
    plt.xlabel("Minimum score")  # X轴标签
    plt.ylabel("Cumulative percentage(%)")  # Y轴标签
    plt.title('KS chart')  # 图标题
    plt.savefig(fig_dir)
    plt.show()  # 显示图

In [4]:
def auc_ks_v(model, dataframemapper, trainset, testset, varlist,train_only=False):
    if train_only == False:
        df_test=dataframemapper.transform(testset)
        predprob = pd.DataFrame(model.predict_proba(df_test[varlist])[:,1], columns = ['predprob'])
        predprob['ytrue'] = testset['perf'].values
        auc = roc_auc_score(y_score=predprob['predprob'], y_true=predprob['ytrue'])
        print('AUC On valid is: {}'.format(auc))
        print('KS On valid is: {}'.format(cal_ks_scipy(predprob['predprob'], predprob['ytrue'])))
    
    df_train=dataframemapper.transform(trainset)
    predprob = pd.DataFrame(model.predict_proba(df_train[varlist])[:,1], columns = ['predprob'])
    predprob['ytrue'] = trainset['perf'].values
    auc = roc_auc_score(y_score=predprob['predprob'], y_true=predprob['ytrue'])
    print('AUC On Train is: {}'.format(auc))
    print('KS On Train is: {}'.format(cal_ks_scipy(predprob['predprob'], predprob['ytrue'])))

In [5]:
def continuous_categorical(dataset):
    '''diff continuous and categorical
        
    '''
    continuousDomain = [] # float int
    categoricalDomain = [] # object bool category

    for item in dataset.columns:
        if (dataset[item].dtypes == object)| (dataset[item].dtypes == bool):
            categoricalDomain.append(item)
            dataset[item] = dataset[item].astype(str)
        elif item!='target':
            continuousDomain.append(item)
            dataset[item] = dataset[item]
    print (categoricalDomain)
    return continuousDomain,categoricalDomain

In [6]:
def eval_model_stability(proba_train, proba_validation, segment_cnt = 10,out_path=False):
    """
    :param proba_train: 训练集预测概率
    :param proba_validation: 验证集预测概率
    :param segment_cnt:
    :param out_path:
    :return: 模型稳定性
    """
    step = 1.0/segment_cnt
    flag = 0.0
    model_stability = []
    len_train = len(proba_train)
    len_validation = len(proba_validation)

    columns = ['score_range','segment_train_percentage','segment_validation_percentage','difference',
               'variance','ln_variance','stability_index']

    while flag < 1.0:
        temp = {}

        score_range = '['+str(flag)+','+str(flag + step)+')'
        segment_train_cnt = proba_train[(proba_train >= flag) & (proba_train < flag + step)].count()
        segment_train_percentage = segment_train_cnt*1.0/len_train
        segment_validation_cnt = proba_validation[(proba_validation >= flag) & (proba_validation < flag + step)].count()
        segment_validation_percentage = segment_validation_cnt * 1.0 / len_validation
        difference = segment_validation_percentage - segment_train_percentage
        variance = float(segment_validation_percentage)/segment_train_percentage
        ln_variance = variance
        stability_index = difference * ln_variance

        temp['score_range'] = score_range
        temp['segment_train_percentage'] = segment_train_percentage[0]
        temp['segment_validation_percentage'] = segment_validation_percentage[0]
        temp['difference'] = difference[0]
        temp['variance'] = variance[0]
        temp['ln_variance'] = ln_variance[0]
        temp['stability_index'] = stability_index[0]

        model_stability.append(temp)
        flag += step

    model_stability = pd.DataFrame(model_stability,columns=columns)
    if out_path:
        file_name = out_path if isinstance(out_path, str) else None
        model_stability.to_csv(file_name, index=False)

    return model_stability

In [10]:
data = pd.read_csv("/data/work/fuxinfeng/tb_file/data/fasttext_main_data.csv") # train and test

In [16]:
pre_fasttext = pd.read_csv('/data/work/fuxinfeng/tb_file/data/pre_all_fasttext.csv') # 跨时间验证

In [17]:
pre_fasttext =pre_fasttext.rename(columns={'target_y':'perf'})

In [18]:
pre_fasttext.head(0)

Unnamed: 0,ugid,token_id_x,TB1,TB5,TB9,TB3,TB7,TB11,TB2,TB6,...,TO218,TO208,TO221,TO198,TO211,token_id,crawler_time,last_decision_tm_y,perf,prob


In [19]:
data.head(5)

Unnamed: 0,ugid,token_id_x,TO9,TO10,TO52,TO51,TO17,TO54,TO27,TO28,...,TB296,crawler_time,TO414_y,TO415_y,TO416_y,TO417_y,TO418_y,first_trans_time,perf,prob
0,00024CEF-E433-4679-B85E-2237CF46A2B2,7d4e87d01f44479296e43124b4a979461488774175767_21,3672,2149,131490,234504,1523,103014,899,161,...,13,06MAR17:12:39:58,,,,,,2017-04-22 19:18:22,0,0.296029
1,0002BE1A-3E99-48F4-9C9C-B075B9D85BA7,4e31ee0c90e34879b798d4fd69714eca1486560721411_21,2960,1381,105290,212868,1579,107578,603,171,...,7,09MAR17:23:27:01,,,,,,2017-03-21 17:09:05,0,0.415101
2,00044410-EB5A-4F23-B629-64570EE71897,5cd370d2a346471593434f67ac9a00011493171833156_21,3558,1410,142013,435017,2148,293004,739,196,...,6,26APR17:10:10:02,,,,,,2017-04-26 10:26:08,1,0.598104
3,00046111-B639-4791-8FA9-F0D1906C326A,0e8d2d819cdd4f6a82e5af468c956a361495350053263_21,603,241,11384,30644,362,19260,194,94,...,13,21MAY17:15:38:19,68.0,119.0,137.0,137.0,3.0,2017-05-21 15:41:15,1,0.052565
4,000485EE-1971-4FCB-82A4-6A9A4BCF3A29,8641267920004cc79cc715c6563baee51495098191611_21,184,98,7026,16674,86,9648,177,69,...,33,18MAY17:23:59:57,709.0,270.0,185.0,152.0,7.0,2017-05-19 00:01:31,1,0.46161


In [20]:
delete_list_2 = ['ugid','token_id_x','crawler_time','token_id_y','crawler_time_x', 'ugid_y', 'crawler_time_y', 'last_decision_tm_x', 'token_id', 'last_decision_tm_y']

In [21]:
delete_list_1 = ['ugid','token_id_x','crawler_time','token_id_y','first_trans_time']

In [22]:
# d = ['crawler_time','token_id_y']
pre_fasttext =pre_fasttext.drop(delete_list_2,axis=1)

In [23]:
data =data.drop(delete_list_1,axis=1)

In [24]:
# data =data.drop(d,axis=1)
pre_fasttext.replace(np.inf, np.nan, inplace=True)
pre_fasttext.replace(-99998,np.nan, inplace=True)
pre_fasttext.replace(-99999976,np.nan, inplace=True)
pre_fasttext.replace(-9999979,np.nan, inplace=True)
pre_fasttext.replace(-9999976,np.nan, inplace=True)
pre_fasttext.replace(-999973,np.nan, inplace=True)
pre_fasttext.replace(-999976,np.nan, inplace=True)
pre_fasttext.replace(9999,np.nan, inplace=True)

In [25]:
data.replace(np.inf, np.nan, inplace=True)
data.replace(-99998,np.nan, inplace=True)
data.replace(-99999976,np.nan, inplace=True)
data.replace(-9999979,np.nan, inplace=True)
data.replace(-9999976,np.nan, inplace=True)
data.replace(-999973,np.nan, inplace=True)
data.replace(-999976,np.nan, inplace=True)
data.replace(9999,np.nan, inplace=True)

In [26]:
pre_fasttext.head()

Unnamed: 0,TB1,TB5,TB9,TB3,TB7,TB11,TB2,TB6,TB10,TB4,...,TO201,TO214,TO205,TO218,TO208,TO221,TO198,TO211,perf,prob
0,282,210,142,0,0,0,76,38.0,25.0,33,...,82.0,82.0,64.0,68.0,55.0,54.0,44.0,46.0,1,0.679297
1,1870,1320,700,0,0,0,180,317.0,517.0,186,...,82.0,90.0,68.0,79.0,52.0,70.0,46.0,66.0,1,0.577517
2,311,518,464,0,0,2,84,143.0,97.0,107,...,52.0,65.0,42.0,56.0,31.0,49.0,26.0,44.0,0,0.386687
3,469,588,556,0,0,0,292,442.0,255.0,27,...,63.0,84.0,38.0,65.0,24.0,61.0,20.0,55.0,1,0.649922
4,1968,1501,909,0,0,0,55,28.0,273.0,4,...,73.0,90.0,51.0,75.0,41.0,65.0,36.0,53.0,1,0.662916


In [27]:
continuousDomain, categoricalDomain = continuous_categorical(pre_fasttext)

[]


In [28]:
del pre_fasttext['target_x']

In [29]:
# data[['perf']]
for i in pre_fasttext.columns:
    print(i)

TB1
TB5
TB9
TB3
TB7
TB11
TB2
TB6
TB10
TB4
TB8
TB12
TB321
TB322
TB323
TB324
TB325
TB326
TB327
TB328
TB329
TB330
TB331
TB332
TB333
TB334
TB335
TB336
TB337
TB338
TB339
TB340
TB341
TB342
TB343
TB344
TB345
TB346
TB347
TB348
TB349
TB434
TB435
TB436
TB437
TB366
TB367
TB368
TB369
TB370
TB371
TB372
TB373
TB374
TB375
TB376
TB377
TB378
TB379
TB380
TB381
TB382
TB383
TB384
TB385
TB386
TB387
TB388
TB389
TB390
TB391
TB392
TB393
TB394
TB395
TB396
TB397
TB13
TB16
TB19
TB14
TB17
TB20
TB15
TB18
TB21
TB22
TB26
TB30
TB24
TB28
TB32
TB23
TB27
TB31
TB25
TB29
TB33
TB34
TB38
TB42
TB46
TB36
TB40
TB44
TB48
TB35
TB39
TB43
TB47
TB37
TB41
TB45
TB49
TB50
TB54
TB58
TB62
TB52
TB56
TB60
TB64
TB51
TB55
TB59
TB63
TB53
TB57
TB61
TB65
TB66
TB74
TB82
TB68
TB76
TB84
TB67
TB75
TB83
TB69
TB77
TB85
TB445
TB446
TB447
TB448
TB449
TB450
TB451
TB452
TB453
TB454
TB455
TB456
TB457
TB458
TB459
TB460
TB70
TB78
TB86
TB71
TB79
TB87
TB72
TB80
TB88
TB73
TB81
TB89
TB90
TB91
TB92
TB93
TB94
TB97
TB95
TB98
TB96
TB99
TB100
TB103
TB101
TB104
TB10

In [30]:
# for j in data.columns:
#     print(j)
a = pd.merge(data,pre_fasttext)

In [31]:
a.head()

Unnamed: 0,TO9,TO10,TO52,TO51,TO17,TO54,TO27,TO28,TO29,TO34,...,TB265,TB297,TB296,TO414_y,TO415_y,TO416_y,TO417_y,TO418_y,perf,prob


In [32]:
dfm = DataFrameMapper([(c, [CategoricalDomain(invalid_value_treatment = 'as_missing',
                                              missing_value_treatment = 'as_value',
                                              missing_value_replacement = 'N/A'), LabelBinarizer()])
                       for c in categoricalDomain] 
                      + 
                     [(c, [ContinuousDomain(invalid_value_treatment = 'as_missing',
                                                     missing_value_treatment = 'as_value',
                                                     missing_value_replacement = -1)])
                      for c in continuousDomain]
                      ,df_out = True)

In [33]:
%%time
dfm.fit(data)

KeyError: 'target_x: target_x'

In [144]:
dfm.fit(pre_fasttext)

  "mean" : numpy.asarray(numpy.nanmean(X, axis = 0)),
  keepdims=keepdims)
  r = func(a, **kwargs)


DataFrameMapper(default=False, df_out=True,
        features=[('TB1', [ContinuousDomain()]), ('TB5', [ContinuousDomain()]), ('TB9', [ContinuousDomain()]), ('TB3', [ContinuousDomain()]), ('TB7', [ContinuousDomain()]), ('TB11', [ContinuousDomain()]), ('TB2', [ContinuousDomain()]), ('TB6', [ContinuousDomain()]), ('TB10', [ContinuousDomain()]), ('TB4', [...), ('TO198', [ContinuousDomain()]), ('TO211', [ContinuousDomain()]), ('perf', [ContinuousDomain()])],
        input_df=False, sparse=False)

In [167]:
data=data[data.perf.isin([0,1])]

In [None]:
pre_fasttext=pre_fasttext[pre_fasttext.perf.isin([0,1])]

In [89]:
data

Unnamed: 0,TO9,TO10,TO52,TO51,TO17,TO54,TO27,TO28,TO29,TO34,...,TB252,TB265,TB297,TB296,TO414_y,TO415_y,TO416_y,TO417_y,TO418_y,perf
0,3672,2149,131490.0,234504.0,1523,103014.0,899,161,1802,351,...,59.0,100.0,2.0,13,,,,,,0
1,2960,1381,105290.0,212868.0,1579,107578.0,603,171,1655,162,...,25.0,100.0,1.0,7,,,,,,0
2,3558,1410,142013.0,435017.0,2148,293004.0,739,196,1857,1550,...,0.0,100.0,1.0,6,,,,,,1
3,603,241,11384.0,30644.0,362,19260.0,194,94,395,179,...,41.0,59.0,1.0,13,68.0,119.0,137.0,137.0,3.0,1
4,184,98,7026.0,16674.0,86,9648.0,177,69,177,32,...,28.0,54.0,5.0,33,709.0,270.0,185.0,152.0,7.0,1
5,2073,878,91137.0,243770.0,1195,152633.0,468,143,1138,489,...,94.0,94.0,1.0,9,0.0,40.0,57.0,38.0,0.0,1
6,2525,1023,63068.0,170501.0,1502,107433.0,542,109,1453,524,...,9.0,35.0,7.0,18,,,,,,1
7,307,183,8071.0,14409.0,124,6338.0,40,13,177,79,...,52.0,52.0,4.0,18,,,,,,0
8,495,348,24499.0,34457.0,147,9958.0,187,52,401,69,...,14.0,94.0,6.0,20,,,,,,1
9,892,500,28136.0,52320.0,392,24184.0,141,20,598,318,...,16.0,85.0,1.0,25,,,,,,0


In [168]:
train, test = train_test_split(data, test_size=0.1, random_state=2018)#调用sklearn中的方法，注意指定随机种子，保证结果可以复线

# print(train.shape)
# print(test.shape)
# print(X_train.shape)
# print(train.target.value_counts())#查看训练集好坏分布
# print(test.target.value_counts())#查看测试集好坏分布

In [169]:
X_train = dfm.transform(train)

In [170]:
y_train = train['perf']

In [171]:
X_train = X_train.drop("perf",axis=1)

In [172]:
# X_train

In [173]:
print(train.shape)
print(test.shape)
print(X_train.shape)
print(train.perf.value_counts())#查看训练集好坏分布
print(test.perf.value_counts())#查看测试集好坏分布

(61822, 889)
(6870, 889)
(61822, 888)
1    32111
0    29711
Name: perf, dtype: int64
1    3600
0    3270
Name: perf, dtype: int64


In [174]:
varlist=X_train.columns
varlist

Index(['TO9', 'TO10', 'TO52', 'TO51', 'TO17', 'TO54', 'TO27', 'TO28', 'TO29',
       'TO34',
       ...
       'TB246', 'TB252', 'TB265', 'TB297', 'TB296', 'TO414_y', 'TO415_y',
       'TO416_y', 'TO417_y', 'TO418_y'],
      dtype='object', length=888)

In [175]:
model = LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.82622520164901361,
        drop_rate=0.5, is_unbalance=True,
        learning_rate=0.025, max_bin=4, max_depth=3,
        max_drop=50, min_child_samples=120, min_child_weight=3,
        min_split_gain=0.028, n_estimators=705, nthread=-1,
        num_leaves=128, objective='binary', reg_alpha=30, reg_lambda=150,
        scale_pos_weight=1, seed=27, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=0.7, subsample_for_bin=30000,
        subsample_freq=3, uniform_drop=False, xgboost_dart_mode=False)

In [176]:
model = LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.82622520164901361,
        drop_rate=0.5, is_unbalance=True,
        learning_rate=0.03, max_bin=10, max_depth=5,
        max_drop=50, min_child_samples=1200, min_child_weight=4,
        min_split_gain=0.01500339778285961, n_estimators=490, nthread=-1,
        num_leaves=128, objective='binary', reg_alpha=20, reg_lambda=100,
        scale_pos_weight=1, seed=27, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=0.7, subsample_for_bin=30000,
        subsample_freq=3, uniform_drop=False, xgboost_dart_mode=False)

In [177]:
def modelWithCv(model, x_array, y_array, cv=5):
    model.fit(x_array, y_array)
    
    dtrain_predictions = model.predict(x_array)
    dtrain_predprob = model.predict_proba(x_array)[:,1]
    
    print("--AUC Score (Train): %f" % roc_auc_score(y_array, dtrain_predprob))
    print ("class metrics:")
    print (metrics.classification_report(y_array, dtrain_predictions))
    
    cv_score = cross_val_score(model,x_array, y_array, cv=cv, scoring = 'roc_auc')
    print("--CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" %(np.mean(cv_score), np.std(cv_score), np.min(cv_score), np.max(cv_score)))

In [178]:
%%time
modelWithCv(model, X_train, y_train, 2)

  if diff:


--AUC Score (Train): 0.757755
class metrics:
             precision    recall  f1-score   support

          0       0.67      0.70      0.69     29711
          1       0.71      0.68      0.69     32111

avg / total       0.69      0.69      0.69     61822

--CV Score : Mean - 0.7223358 | Std - 0.0003219352 | Min - 0.7220138 | Max - 0.7226577
CPU times: user 5h 13min 19s, sys: 1min 12s, total: 5h 14min 32s
Wall time: 12min 24s


# IV

In [41]:
ivobj = iv_pandas()

In [42]:
columns = data.columns

In [44]:
columns = list(columns)

In [46]:
len(columns)

890

In [57]:
c = columns[0:-2]

In [58]:
c.append(columns[-1])

In [76]:
len(c)
# c

889

In [62]:
data_for_iv = data.loc[:,c]

In [204]:
# woe,iv = ivobj.cal_woe_iv(data,['prob'],'perf',nsplit=10,event=1)

In [74]:
iv_1 = sorted(iv.items(),key = lambda x:x[1],reverse=True)

In [75]:
iv_1

[('prob', 0.37136149272068164),
 ('TB327', 0.25152686180474576),
 ('TB326', 0.23806597060047413),
 ('TB377', 0.23677991742883628),
 ('TB9', 0.22955600811110682),
 ('TB325', 0.22484855803269338),
 ('TB385', 0.22255414156953857),
 ('TB5', 0.21502459901788334),
 ('TB409', 0.2134043715788797),
 ('TB546', 0.20764931845426488),
 ('TB393', 0.19766704942020222),
 ('TB376', 0.1939796901892052),
 ('TB545', 0.19362519439297898),
 ('TB384', 0.18745618345516948),
 ('TB82', 0.18557346487378712),
 ('TB408', 0.18535294758114543),
 ('TB448', 0.18498256181460107),
 ('TB440', 0.1831265328945128),
 ('TB19', 0.1802589942195432),
 ('TB439', 0.17981743364681518),
 ('TB437', 0.17611369426027365),
 ('TB425', 0.17553113388994818),
 ('TB16', 0.17527851452683646),
 ('TB381', 0.1733849057654204),
 ('TB551', 0.1708457536699436),
 ('TB46', 0.1704544277953116),
 ('TB341', 0.16829790958349772),
 ('TB296', 0.1680116079934401),
 ('TB324', 0.16537124284883398),
 ('TB436', 0.1650173959286769),
 ('TB389', 0.163417546244579

In [67]:
woe

{'prob': {1: -1.1165874075814881,
  2: -0.710843507365601,
  3: -0.46235615341916364,
  4: -0.2643578269601575,
  5: -0.031475955623983316,
  6: 0.11411691827272542,
  7: 0.2900320392824555,
  8: 0.4711773720333609,
  9: 0.7201770367681255,
  10: 1.043690972974318}}

In [57]:
df_train=dfm.transform(train)
predprob = pd.DataFrame(model.predict_proba(df_train[varlist])[:,1], columns = ['predprob'])

In [38]:
a = model.feature_importances_

In [40]:
a

array([  1,   2,   1,   7,   0,   3,  12,   2,   3,  20,   3,   0,   6,
        35,  35,   0,   1,   7,   5,   2,   1,  10,   3,   3,   3,   6,
         2,   3,   3,  17,   4,   0,   0,   1,   0,   1,   1,   0,   5,
         2,   1,   2,   2,   0,   1,   0,   1,   5,   5,   1,   4,   4,
         0,   4,   0,   6,   2,   0,   6,   1,   1,  10,   3,   0,   3,
         1,   2,   1,   2,  22,   3,   7,   3,   4,   6,   2,  59,  44,
         0,   1,   2,   3,   6,   2,   9,   1,   3,   3,   3,   4,   0,
         1,  17,   6,   0,   0,   0,   3,   2,   4,   1,   0,   0,   0,
         0,   1,   1,   1,   1,   1,   5,   3,   1,   4,   2,   0,   3,
         7,   0,   2,   0,   3,   2,   0,   4,   2,  11,  18,   2,   7,
         4,   4,   3,   6,   8,   6,  40,   0,  21,  24,   0,   1,  31,
         5,   0,   7,   6,   0,   6,   2,   2,   4,   0,   1,  17,   1,
         3,   0,   1,   2,   0,   1,   5,   4,   2,   1,   2,   0,   3,
         3,   0,   0,   0,  50,   0,   8,   0,   7,   2,   7,   

In [61]:
predprob = np.array(predprob)

In [63]:
train_label = np.array(train['b'])

In [68]:
len(predprob)==len(train_label)

True

In [70]:
predprob = np.reshape(predprob,len(predprob))

In [71]:
predprob

array([0.00108401, 0.99892657, 0.99893273, ..., 0.00106378, 0.99885563,
       0.00110127])

In [59]:
def cal_ks_scipy(y_pred,y_true):
    return ks_2samp(y_pred[y_true==1],y_pred[y_true!=1]).statistic

In [72]:
ks_train = cal_ks_scipy(predprob,train_label)

In [73]:
ks_train

1.0