In [40]:
from sklearn.metrics import roc_auc_score,accuracy_score,classification_report
from scipy.stats import ks_2samp
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split,cross_val_score
from datetime import datetime
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelBinarizer
from sklearn2pmml.decoration import CategoricalDomain, ContinuousDomain
from sklearn2pmml import PMMLPipeline
import time
import pandas as pd
import numpy as np
from sklearn.externals import joblib
import pickle
# from project_demo.tools.optimize import *
from project_demo.tools.evaluate import *
%matplotlib inline
from sklearn import metrics

# 仅LDA

In [41]:
def auc_ks(model, dataframemapper, trainset, testset, varlist,train_only=False):
    if train_only == False:
        df_test=dataframemapper.transform(testset)
        predprob = pd.DataFrame(model.predict_proba(df_test[varlist])[:,1], columns = ['predprob'])
        predprob['ytrue'] = testset['b'].values
        auc = roc_auc_score(y_score=predprob['predprob'], y_true=predprob['ytrue'])
        print('AUC On Test is: {}'.format(auc))
        print('KS On Test is: {}'.format(cal_ks_scipy(predprob['predprob'], predprob['ytrue'])))
        predprob_test = predprob
    
    df_train=dataframemapper.transform(trainset)
    predprob = pd.DataFrame(model.predict_proba(df_train[varlist])[:,1], columns = ['predprob'])
    predprob['ytrue'] = trainset['b'].values
    auc = roc_auc_score(y_score=predprob['predprob'], y_true=predprob['ytrue'])
    print('AUC On Train is: {}'.format(auc))
    print('KS On Train is: {}'.format(cal_ks_scipy(predprob['predprob'], predprob['ytrue'])))
    return predprob['predprob'], predprob['ytrue'],predprob_test['predprob'],predprob_test['ytrue']

In [42]:
def ks(df_score, df_good,fig_dir):
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    
    df_score = pd.DataFrame(df_score)
    df_good = pd.DataFrame(df_good) 
    df_score.columns = ['score']
    df_good.columns = ['good']
    df = pd.concat([df_score,df_good],axis=1)
    
    df['bad'] = 1 - df.good
    bin = np.arange(0, 1.001, 0.05)
    df['bucket'] = pd.cut(df.score, bin)  # 根据bin来划分区间
   
    grouped = df.groupby('bucket', as_index=False) # 统计在每个区间的样本量
    agg1 = pd.DataFrame()
    agg1['min_scr'] = grouped.min().score # 取得每个区间的最小值
    agg1['max_scr'] = grouped.max().score
    agg1['bads'] = grouped.sum().bad # 计算每个区间bad的总数量
    agg1['goods'] = grouped.sum().good
    
    agg2 = (agg1.sort_values(['min_scr'])).reset_index(drop=True) # 根据区间最小值排序
    agg2['bad_cum_rate'] = np.round((agg2.bads / df.bad.sum()).cumsum(), 4) # 计算bad样本累计概率
    agg2['good_cum_rate'] = np.round((agg2.goods / df.good.sum()).cumsum(), 4) 
    agg2['ks'] = abs(np.round(((agg2.bads / df.bad.sum()).cumsum() - (agg2.goods / df.good.sum()).cumsum()), 4)) # 计算bad和good累计概率之差的绝对值
    ks = agg2.ks.max()  # 求出ks
    
    plt.figure(figsize=(8, 4))  # 创建绘图对象
    plt.plot(agg2.min_scr, agg2.bad_cum_rate, "g-", linewidth=1)  # 在当前绘图对象绘图（X轴，Y轴，蓝色虚线，线宽度）
    plt.plot(agg2.min_scr, agg2.good_cum_rate, "b-", linewidth=1)
    
    x_abline = agg2['min_scr'][agg2['ks'] == agg2['ks'].max()] # ks最大的min_scr
    y_abline1 = agg2['bad_cum_rate'][agg2['ks'] == agg2['ks'].max()] # ks最大时bad_cum_rate
    y_abline2 = agg2['good_cum_rate'][agg2['ks'] == agg2['ks'].max()]
    plt.fill_between(x_abline, y_abline1, y_abline2, color = "red",linewidth=2)    
    
    sub = "%s%s"%('ks = ',ks)
    plt.legend(title=sub,loc='lower right')
    plt.xlabel("Minimum score")  # X轴标签
    plt.ylabel("Cumulative percentage(%)")  # Y轴标签
    plt.title('KS chart')  # 图标题
    plt.savefig(fig_dir)
    plt.show()  # 显示图

In [43]:
def auc_ks_v(model, dataframemapper, trainset, testset, varlist,train_only=False):
    if train_only == False:
        df_test=dataframemapper.transform(testset)
        predprob = pd.DataFrame(model.predict_proba(df_test[varlist])[:,1], columns = ['predprob'])
        predprob['ytrue'] = testset['b'].values
        auc = roc_auc_score(y_score=predprob['predprob'], y_true=predprob['ytrue'])
        print('AUC On valid is: {}'.format(auc))
        print('KS On valid is: {}'.format(cal_ks_scipy(predprob['predprob'], predprob['ytrue'])))
    
    df_train=dataframemapper.transform(trainset)
    predprob = pd.DataFrame(model.predict_proba(df_train[varlist])[:,1], columns = ['predprob'])
    predprob['ytrue'] = trainset['b'].values
    auc = roc_auc_score(y_score=predprob['predprob'], y_true=predprob['ytrue'])
    print('AUC On Train is: {}'.format(auc))
    print('KS On Train is: {}'.format(cal_ks_scipy(predprob['predprob'], predprob['ytrue'])))

In [44]:
def continuous_categorical(dataset):
    '''diff continuous and categorical
        
    '''
    continuousDomain = [] # float int
    categoricalDomain = [] # object bool category

    for item in dataset.columns:
        if (dataset[item].dtypes == object)| (dataset[item].dtypes == bool):
            categoricalDomain.append(item)
            dataset[item] = dataset[item].astype(str)
        elif item!='b':
            continuousDomain.append(item)
            dataset[item] = dataset[item]
    print (categoricalDomain)
    return continuousDomain,categoricalDomain

In [45]:
def eval_model_stability(proba_train, proba_validation, segment_cnt = 10,out_path=False):
    """
    :param proba_train: 训练集预测概率
    :param proba_validation: 验证集预测概率
    :param segment_cnt:
    :param out_path:
    :return: 模型稳定性
    """
    step = 1.0/segment_cnt
    flag = 0.0
    model_stability = []
    len_train = len(proba_train)
    len_validation = len(proba_validation)

    columns = ['score_range','segment_train_percentage','segment_validation_percentage','difference',
               'variance','ln_variance','stability_index']

    while flag < 1.0:
        temp = {}

        score_range = '['+str(flag)+','+str(flag + step)+')'
        segment_train_cnt = proba_train[(proba_train >= flag) & (proba_train < flag + step)].count()
        segment_train_percentage = segment_train_cnt*1.0/len_train
        segment_validation_cnt = proba_validation[(proba_validation >= flag) & (proba_validation < flag + step)].count()
        segment_validation_percentage = segment_validation_cnt * 1.0 / len_validation
        difference = segment_validation_percentage - segment_train_percentage
        variance = float(segment_validation_percentage)/segment_train_percentage
        ln_variance = variance
        stability_index = difference * ln_variance

        temp['score_range'] = score_range
        temp['segment_train_percentage'] = segment_train_percentage[0]
        temp['segment_validation_percentage'] = segment_validation_percentage[0]
        temp['difference'] = difference[0]
        temp['variance'] = variance[0]
        temp['ln_variance'] = ln_variance[0]
        temp['stability_index'] = stability_index[0]

        model_stability.append(temp)
        flag += step

    model_stability = pd.DataFrame(model_stability,columns=columns)
    if out_path:
        file_name = out_path if isinstance(out_path, str) else None
        model_stability.to_csv(file_name, index=False)

    return model_stability

In [46]:
data = pd.read_csv("data/all_lda.csv")

In [47]:
data = data[[ 't_01','t_02','t_03','t_04','t_05','t_06','t_07','t_08','t_09','t_10','t_11','t_12','t_13','t_14','t_15','t_16','t_17','t_18','t_19',
 't_20','t_21','t_22','t_23','t_24','t_25','t_26','t_27','t_28','t_29','t_30','t_31','t_32','t_33','t_34','t_35','t_36','t_37','t_38','t_39',
 't_40','t_41','t_42','t_43','t_44','t_45','t_46','t_47','t_48','t_49','t_50','t_51','t_52','t_53','t_54','t_55','b']]

In [48]:
# len(data)
data.head(2)

Unnamed: 0,t_01,t_02,t_03,t_04,t_05,t_06,t_07,t_08,t_09,t_10,...,t_47,t_48,t_49,t_50,t_51,t_52,t_53,t_54,t_55,b
0,0.0,0.016399,0.0,0.0,0.0,0.0,0.0,0.0,0.011921,0.053927,...,0.0,0.011697,0.0,0.022512,0.0,0.0,0.0,0.0,0.0,01. 坏
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.102615,0.014539,0.0,0.0,0.016099,0.0,0.0,01. 坏


In [49]:
pre_lda = pd.read_csv("data/pre_all_lda.csv")

In [50]:
pre_lda=pre_lda.rename(columns={'target':'b'})

In [51]:
# pre_lda['target']
pre_lda = pre_lda[[ 't_01','t_02','t_03','t_04','t_05','t_06','t_07','t_08','t_09','t_10','t_11','t_12','t_13','t_14','t_15','t_16','t_17','t_18','t_19',
 't_20','t_21','t_22','t_23','t_24','t_25','t_26','t_27','t_28','t_29','t_30','t_31','t_32','t_33','t_34','t_35','t_36','t_37','t_38','t_39',
 't_40','t_41','t_42','t_43','t_44','t_45','t_46','t_47','t_48','t_49','t_50','t_51','t_52','t_53','t_54','t_55','b']]

In [52]:
pre_lda.head(2)

Unnamed: 0,t_01,t_02,t_03,t_04,t_05,t_06,t_07,t_08,t_09,t_10,...,t_47,t_48,t_49,t_50,t_51,t_52,t_53,t_54,t_55,b
0,0.016735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052206,0.0,...,0.0,0.0,0.124379,0.031253,0.0,0.0,0.0,0.052202,0.0,1
1,0.0,0.108373,0.0,0.019228,0.0,0.0,0.0,0.023378,0.08069,0.0,...,0.0,0.0,0.034451,0.0,0.0,0.0,0.0,0.163911,0.0,1


In [53]:
# data = data.drop_duplicates(['ugid'])

In [54]:
data.head()

Unnamed: 0,t_01,t_02,t_03,t_04,t_05,t_06,t_07,t_08,t_09,t_10,...,t_47,t_48,t_49,t_50,t_51,t_52,t_53,t_54,t_55,b
0,0.0,0.016399,0.0,0.0,0.0,0.0,0.0,0.0,0.011921,0.053927,...,0.0,0.011697,0.0,0.022512,0.0,0.0,0.0,0.0,0.0,01. 坏
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.102615,0.014539,0.0,0.0,0.016099,0.0,0.0,01. 坏
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.477374,0.0,0.0,0.0,0.0,0.0,0.0,05. 好
3,0.0,0.0,0.0,0.0,0.0,0.0,0.019705,0.0,0.011198,0.0,...,0.0,0.0,0.0,0.201109,0.0,0.0,0.0,0.0,0.0,01. 坏
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.212417,0.0,0.0,0.0,0.0,0.0,0.0,0.179035,0.0,05. 好


In [55]:
# delete_list_2 = ['ugid','token_id_x','first_trans_time']

In [56]:
# d = ['crawler_time','token_id_y']

In [57]:
# d1 = ['ugid', 'token_id_x', 'crawler_time_x', 'ugid_y', 'crawler_time_y', 'last_decision_tm', 'token_id_y', 'token_id']

In [58]:
# data =data.drop(delete_list_2,axis=1)

In [59]:
# data =data.drop(d,axis=1)

In [60]:
# data = data.drop(a,axis=1)

In [61]:
# pre_lda =pre_lda.drop(d1,axis=1)
# pre_lda = pre_lda.drop(a,axis=1)

In [62]:
data.replace(np.inf, np.nan, inplace=True)
data.replace(-99998,np.nan, inplace=True)
data.replace(-99999976,np.nan, inplace=True)
data.replace(-9999979,np.nan, inplace=True)
data.replace(-9999976,np.nan, inplace=True)
data.replace(-999973,np.nan, inplace=True)
data.replace(-999976,np.nan, inplace=True)
data.replace(9999,np.nan, inplace=True)

In [63]:
pre_lda.replace(np.inf, np.nan, inplace=True)
pre_lda.replace(-99998,np.nan, inplace=True)
pre_lda.replace(-99999976,np.nan, inplace=True)
pre_lda.replace(-9999979,np.nan, inplace=True)
pre_lda.replace(-9999976,np.nan, inplace=True)
pre_lda.replace(-999973,np.nan, inplace=True)
pre_lda.replace(-999976,np.nan, inplace=True)
pre_lda.replace(9999,np.nan, inplace=True)

In [64]:
data = data.replace('01. 坏',0)
data = data.replace('05. 好',1)

In [65]:
# len(dfm)

In [66]:
continuousDomain, categoricalDomain = continuous_categorical(data)

[]


In [67]:
dfm = DataFrameMapper([(c, [CategoricalDomain(invalid_value_treatment = 'as_missing',
                                              missing_value_treatment = 'as_value',
                                              missing_value_replacement = 'N/A'), LabelBinarizer()])
                       for c in categoricalDomain] 
                      + 
                     [(c, [ContinuousDomain(invalid_value_treatment = 'as_missing',
                                                     missing_value_treatment = 'as_value',
                                                     missing_value_replacement = -1)])
                      for c in continuousDomain]
                      ,df_out = True)

In [68]:
%%time
dfm.fit(data)

CPU times: user 48.6 s, sys: 56 ms, total: 48.7 s
Wall time: 48.7 s


DataFrameMapper(default=False, df_out=True,
        features=[('t_01', [ContinuousDomain()]), ('t_02', [ContinuousDomain()]), ('t_03', [ContinuousDomain()]), ('t_04', [ContinuousDomain()]), ('t_05', [ContinuousDomain()]), ('t_06', [ContinuousDomain()]), ('t_07', [ContinuousDomain()]), ('t_08', [ContinuousDomain()]), ('t_09', [ContinuousDomain()]), ('...)]), ('t_53', [ContinuousDomain()]), ('t_54', [ContinuousDomain()]), ('t_55', [ContinuousDomain()])],
        input_df=False, sparse=False)

In [69]:
# len(data.columns)
# ==
len(pre_lda.columns)

56

In [70]:
pre_lda['token_id']=0

In [71]:
data=data[data.b.isin([0,1])]

In [72]:
data

Unnamed: 0,t_01,t_02,t_03,t_04,t_05,t_06,t_07,t_08,t_09,t_10,...,t_47,t_48,t_49,t_50,t_51,t_52,t_53,t_54,t_55,b
0,0.000000,0.016399,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.011921,0.053927,...,0.000000,0.011697,0.000000,0.022512,0.000000,0.000000,0.000000,0.000000,0.000000,0
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.102615,0.014539,0.000000,0.000000,0.016099,0.000000,0.000000,0
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.477374,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.019705,0.000000,0.011198,0.000000,...,0.000000,0.000000,0.000000,0.201109,0.000000,0.000000,0.000000,0.000000,0.000000,0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.212417,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.179035,0.000000,1
5,0.000000,0.000000,0.000000,0.000000,0.000000,0.011729,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.953817,0.000000,0.000000,1
6,0.000000,0.000000,0.010689,0.000000,0.000000,0.051723,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.057587,0.000000,0.000000,0.000000,0.000000,0.000000,1
7,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.115552,0.000000,0.104498,0.079795,0.000000,0.000000,0.000000,0.000000,0.000000,1
8,0.000000,0.000000,0.000000,0.000000,0.000000,0.089433,0.000000,0.017660,0.024981,0.000000,...,0.000000,0.011989,0.050598,0.080949,0.052759,0.026583,0.052241,0.028019,0.000000,1
9,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.012766,0.000000,0.000000,0.000000,0.032820,0.000000,0


In [73]:
train, test = train_test_split(data, test_size=0.2, random_state=2018)#调用sklearn中的方法，注意指定随机种子，保证结果可以复线

# print(train.shape)
# print(test.shape)
# print(X_train.shape)
# print(train.target.value_counts())#查看训练集好坏分布
# print(test.target.value_counts())#查看测试集好坏分布

In [74]:
X_train = dfm.transform(train)

In [75]:
X_train

Unnamed: 0,t_01,t_02,t_03,t_04,t_05,t_06,t_07,t_08,t_09,t_10,...,t_46,t_47,t_48,t_49,t_50,t_51,t_52,t_53,t_54,t_55
102505,0.000000,0.000000,0.048572,0.000000,0.000000,0.013893,0.000000,0.000000,0.036362,0.000000,...,0.000000,0.078317,0.000000,0.000000,0.010424,0.000000,0.000000,0.000000,0.050075,0.000000
79920,0.000000,0.000000,0.000000,0.000000,0.013724,0.015799,0.000000,0.013354,0.000000,0.090902,...,0.000000,0.157008,0.000000,0.000000,0.037629,0.000000,0.000000,0.000000,0.000000,0.000000
108167,0.000000,0.000000,0.000000,0.000000,0.000000,0.144018,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.025623,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.033638
111409,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
109279,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.195576,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
12055,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.029793,...,0.067639,0.010760,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
56770,0.000000,0.000000,0.021495,0.000000,0.000000,0.000000,0.000000,0.015229,0.049707,0.000000,...,0.081318,0.031192,0.000000,0.000000,0.000000,0.047787,0.019370,0.026516,0.012620,0.000000
109300,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6969,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.039052,0.000000,0.000000,0.000000,...,0.000000,0.351988,0.000000,0.019088,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3777,0.000000,0.000000,0.012286,0.000000,0.000000,0.021460,0.000000,0.000000,0.000000,0.014900,...,0.000000,0.000000,0.000000,0.000000,0.015948,0.000000,0.075442,0.000000,0.024037,0.000000


In [76]:
y_train = train['b']

In [77]:
X_train = X_train.drop("b",axis=1)

ValueError: labels ['b'] not contained in axis

In [78]:
# X_train['b']

In [79]:
print(train.shape)
print(test.shape)
print(X_train.shape)
print(train.b.value_counts())#查看训练集好坏分布
print(test.b.value_counts())#查看测试集好坏分布

(91374, 56)
(22844, 56)
(91374, 55)
1    47552
0    43822
Name: b, dtype: int64
1    11776
0    11068
Name: b, dtype: int64


In [80]:
varlist=X_train.columns
varlist

Index(['t_01', 't_02', 't_03', 't_04', 't_05', 't_06', 't_07', 't_08', 't_09',
       't_10', 't_11', 't_12', 't_13', 't_14', 't_15', 't_16', 't_17', 't_18',
       't_19', 't_20', 't_21', 't_22', 't_23', 't_24', 't_25', 't_26', 't_27',
       't_28', 't_29', 't_30', 't_31', 't_32', 't_33', 't_34', 't_35', 't_36',
       't_37', 't_38', 't_39', 't_40', 't_41', 't_42', 't_43', 't_44', 't_45',
       't_46', 't_47', 't_48', 't_49', 't_50', 't_51', 't_52', 't_53', 't_54',
       't_55'],
      dtype='object')

In [81]:
# model = LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.82622520164901361,
#         drop_rate=0.5, is_unbalance=True,
#         learning_rate=0.025, max_bin=4, max_depth=3,
#         max_drop=50, min_child_samples=120, min_child_weight=3,
#         min_split_gain=0.028, n_estimators=705, nthread=-1,
#         num_leaves=128, objective='binary', reg_alpha=30, reg_lambda=150,
#         scale_pos_weight=1, seed=27, sigmoid=1.0, silent=True,
#         skip_drop=0.5, subsample=0.7, subsample_for_bin=30000,
#         subsample_freq=3, uniform_drop=False, xgboost_dart_mode=False)

In [82]:
model = LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.82622520164901361,
        drop_rate=0.5, is_unbalance=True,
        learning_rate=0.03, max_bin=10, max_depth=5,
        max_drop=50, min_child_samples=1200, min_child_weight=4,
        min_split_gain=0.01500339778285961, n_estimators=490, nthread=-1,
        num_leaves=128, objective='binary', reg_alpha=20, reg_lambda=100,
        scale_pos_weight=1, seed=27, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=0.7, subsample_for_bin=30000,
        subsample_freq=3, uniform_drop=False, xgboost_dart_mode=False)

In [83]:
def modelWithCv(model, x_array, y_array, cv=5):
    model.fit(x_array, y_array)
    
    dtrain_predictions = model.predict(x_array)
    dtrain_predprob = model.predict_proba(x_array)[:,1]
    
    print("--AUC Score (Train): %f" % roc_auc_score(y_array, dtrain_predprob))
    print ("class metrics:")
    print (metrics.classification_report(y_array, dtrain_predictions))
    
    cv_score = cross_val_score(model,x_array, y_array, cv=cv, scoring = 'roc_auc')
    print("--CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" %(np.mean(cv_score), np.std(cv_score), np.min(cv_score), np.max(cv_score)))

In [84]:
%%time
modelWithCv(model, X_train, y_train, 2)

  if diff:


--AUC Score (Train): 0.683664
class metrics:
             precision    recall  f1-score   support

          0       0.61      0.64      0.62     43822
          1       0.65      0.62      0.64     47552

avg / total       0.63      0.63      0.63     91374

--CV Score : Mean - 0.6712516 | Std - 5.169919e-05 | Min - 0.6711999 | Max - 0.6713033
CPU times: user 4h 47min 31s, sys: 1min 9s, total: 4h 48min 41s
Wall time: 13min 53s


In [32]:
# X_train

In [86]:
pro,true,test_pro,test_true = auc_ks(model, dfm, train, test, varlist)

AUC On Test is: 0.6733794910427731
KS On Test is: 0.25109875365330525
AUC On Train is: 0.6836644798304017
KS On Train is: 0.2658459668650363


In [42]:
ivobj = iv_pandas()


In [178]:
X_train.columns

Index(['TO9', 'TO10', 'TO52', 'TO51', 'TO17', 'TO54', 'TO27', 'TO28', 'TO29',
       'TO34',
       ...
       't_46', 't_47', 't_48', 't_49', 't_50', 't_51', 't_52', 't_53', 't_54',
       't_55'],
      dtype='object', length=944)

In [180]:
list_for_iv = []
for i in X_train.columns:
    list_for_iv.append(i)

In [181]:
a = list_for_iv

In [189]:
a = [ 't_01','t_02','t_03','t_04','t_05','t_06','t_07','t_08','t_09','t_10','t_11','t_12','t_13','t_14','t_15','t_16','t_17','t_18','t_19',
 't_20','t_21','t_22','t_23','t_24','t_25','t_26','t_27','t_28','t_29','t_30','t_31','t_32','t_33','t_34','t_35','t_36','t_37','t_38','t_39',
 't_40','t_41','t_42','t_43','t_44','t_45','t_46','t_47','t_48','t_49','t_50','t_51','t_52','t_53','t_54','t_55']

In [190]:
a

['t_01',
 't_02',
 't_03',
 't_04',
 't_05',
 't_06',
 't_07',
 't_08',
 't_09',
 't_10',
 't_11',
 't_12',
 't_13',
 't_14',
 't_15',
 't_16',
 't_17',
 't_18',
 't_19',
 't_20',
 't_21',
 't_22',
 't_23',
 't_24',
 't_25',
 't_26',
 't_27',
 't_28',
 't_29',
 't_30',
 't_31',
 't_32',
 't_33',
 't_34',
 't_35',
 't_36',
 't_37',
 't_38',
 't_39',
 't_40',
 't_41',
 't_42',
 't_43',
 't_44',
 't_45',
 't_46',
 't_47',
 't_48',
 't_49',
 't_50',
 't_51',
 't_52',
 't_53',
 't_54',
 't_55']

In [197]:
woe,iv = ivobj.cal_woe_iv(pre_lda,a,'b',nsplit=10,event=1)

In [198]:
iv_1 = sorted(iv.items(),key = lambda x:x[1],reverse=True)

In [199]:
iv_1

[('t_31', 0.09742304508561597),
 ('t_14', 0.08823233601383906),
 ('t_09', 0.07731844228679052),
 ('t_02', 0.04824008313985164),
 ('t_42', 0.04744654367576008),
 ('t_30', 0.04236132194785624),
 ('t_10', 0.03512609603542958),
 ('t_29', 0.030928133185089607),
 ('t_38', 0.021569950717103575),
 ('t_12', 0.021411391333792516),
 ('t_34', 0.020900496444434924),
 ('t_26', 0.01978312097804548),
 ('t_21', 0.01969140278572819),
 ('t_48', 0.019521422590526444),
 ('t_22', 0.018542605469988694),
 ('t_54', 0.01839315581088644),
 ('t_36', 0.01791411257294495),
 ('t_28', 0.01665757208362976),
 ('t_27', 0.015062534975144502),
 ('t_49', 0.012756335852710996),
 ('t_47', 0.012119164632717224),
 ('t_46', 0.012060983108691348),
 ('t_50', 0.011756679408926888),
 ('t_44', 0.010643789440180029),
 ('t_40', 0.009130714672512944),
 ('t_45', 0.007393337866959835),
 ('t_15', 0.006861821037009678),
 ('t_32', 0.006462774169741293),
 ('t_33', 0.005518088763539035),
 ('t_23', 0.0043915884854487715),
 ('t_43', 0.003925331

In [57]:
df_train=dfm.transform(train)
predprob = pd.DataFrame(model.predict_proba(df_train[varlist])[:,1], columns = ['predprob'])

In [68]:
model.feature_importances_

array([  0,   1,   3,   0,   2,   0,   4,   2,   3,   2,   3,   1,  14,
        27,  23,   0,   0,   2,   3,   5,   2,   5,   9,  15,   1,   4,
         1,   1,   7,  32,   4,   0,   0,   5,   0,   2,   0,   0,   0,
         2,   0,   0,   0,   0,   1,   0,   1,   0,   3,   3,   1,   2,
         4,   0,   1,   4,   1,   0,   1,   3,   0,  15,   1,   0,   2,
         2,   0,   2,   0,  13,   5,   2,   4,   3,   0,   0,  71,  30,
         0,   4,   0,   4,   6,   1,   9,   7,  16,   5,   2,   3,   2,
         4,  21,   7,   0,   0,   1,   3,   0,   2,   1,   2,   1,   2,
         1,   4,   3,   0,   5,   0,   3,   5,   2,   1,   5,   0,   1,
         2,   3,   1,   3,   0,   3,   2,   2,   2,   9,  35,   2,   2,
         3,   2,   2,   7,   9,   4,  48,   1,  23,  25,   2,   2,  22,
         1,   0,   4,   2,   1,   4,   1,   0,   2,   3,   2,  17,   6,
         3,   4,   0,   0,   0,   1,  15,   0,   1,   0,   1,   1,   0,
         2,   0,   1,   2,  34,   6,  27,   0,  19,   1,   1,   

In [61]:
predprob = np.array(predprob)

In [63]:
train_label = np.array(train['b'])

In [68]:
len(predprob)==len(train_label)

True

In [70]:
predprob = np.reshape(predprob,len(predprob))

In [71]:
predprob

array([0.00108401, 0.99892657, 0.99893273, ..., 0.00106378, 0.99885563,
       0.00110127])

In [59]:
def cal_ks_scipy(y_pred,y_true):
    return ks_2samp(y_pred[y_true==1],y_pred[y_true!=1]).statistic

In [72]:
ks_train = cal_ks_scipy(predprob,train_label)

In [73]:
ks_train

1.0