In [1]:
from sklearn.metrics import roc_auc_score,accuracy_score,classification_report
from scipy.stats import ks_2samp
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split,cross_val_score
from datetime import datetime
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelBinarizer
from sklearn2pmml.decoration import CategoricalDomain, ContinuousDomain
from sklearn2pmml import PMMLPipeline
import time
import pandas as pd
import numpy as np
from sklearn.externals import joblib
import pickle
# from project_demo.tools.optimize import *
from project_demo.tools.evaluate import *
%matplotlib inline

In [2]:
def auc_ks(model, dataframemapper, trainset, testset, varlist,train_only=False):
    if train_only == False:
        df_test=dataframemapper.transform(testset)
        predprob = pd.DataFrame(model.predict_proba(df_test[varlist])[:,1], columns = ['predprob'])
        predprob['ytrue'] = testset['b'].values
        auc = roc_auc_score(y_score=predprob['predprob'], y_true=predprob['ytrue'])
        print('AUC On Test is: {}'.format(auc))
        print('KS On Test is: {}'.format(cal_ks_scipy(predprob['predprob'], predprob['ytrue'])))
        predprob_test = predprob
    
    df_train=dataframemapper.transform(trainset)
    predprob = pd.DataFrame(model.predict_proba(df_train[varlist])[:,1], columns = ['predprob'])
    predprob['ytrue'] = trainset['b'].values
    auc = roc_auc_score(y_score=predprob['predprob'], y_true=predprob['ytrue'])
    print('AUC On Train is: {}'.format(auc))
    print('KS On Train is: {}'.format(cal_ks_scipy(predprob['predprob'], predprob['ytrue'])))
    return predprob['predprob'], predprob['ytrue'],predprob_test['predprob'],predprob_test['ytrue']

In [3]:
def ks(df_score, df_good,fig_dir):
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    
    df_score = pd.DataFrame(df_score)
    df_good = pd.DataFrame(df_good) 
    df_score.columns = ['score']
    df_good.columns = ['good']
    df = pd.concat([df_score,df_good],axis=1)
    
    df['bad'] = 1 - df.good
    bin = np.arange(0, 1.001, 0.05)
    df['bucket'] = pd.cut(df.score, bin)  # 根据bin来划分区间
   
    grouped = df.groupby('bucket', as_index=False) # 统计在每个区间的样本量
    agg1 = pd.DataFrame()
    agg1['min_scr'] = grouped.min().score # 取得每个区间的最小值
    agg1['max_scr'] = grouped.max().score
    agg1['bads'] = grouped.sum().bad # 计算每个区间bad的总数量
    agg1['goods'] = grouped.sum().good
    
    agg2 = (agg1.sort_values(['min_scr'])).reset_index(drop=True) # 根据区间最小值排序
    agg2['bad_cum_rate'] = np.round((agg2.bads / df.bad.sum()).cumsum(), 4) # 计算bad样本累计概率
    agg2['good_cum_rate'] = np.round((agg2.goods / df.good.sum()).cumsum(), 4) 
    agg2['ks'] = abs(np.round(((agg2.bads / df.bad.sum()).cumsum() - (agg2.goods / df.good.sum()).cumsum()), 4)) # 计算bad和good累计概率之差的绝对值
    ks = agg2.ks.max()  # 求出ks
    
    plt.figure(figsize=(8, 4))  # 创建绘图对象
    plt.plot(agg2.min_scr, agg2.bad_cum_rate, "g-", linewidth=1)  # 在当前绘图对象绘图（X轴，Y轴，蓝色虚线，线宽度）
    plt.plot(agg2.min_scr, agg2.good_cum_rate, "b-", linewidth=1)
    
    x_abline = agg2['min_scr'][agg2['ks'] == agg2['ks'].max()] # ks最大的min_scr
    y_abline1 = agg2['bad_cum_rate'][agg2['ks'] == agg2['ks'].max()] # ks最大时bad_cum_rate
    y_abline2 = agg2['good_cum_rate'][agg2['ks'] == agg2['ks'].max()]
    plt.fill_between(x_abline, y_abline1, y_abline2, color = "red",linewidth=2)    
    
    sub = "%s%s"%('ks = ',ks)
    plt.legend(title=sub,loc='lower right')
    plt.xlabel("Minimum score")  # X轴标签
    plt.ylabel("Cumulative percentage(%)")  # Y轴标签
    plt.title('KS chart')  # 图标题
    plt.savefig(fig_dir)
    plt.show()  # 显示图

In [4]:
def auc_ks_v(model, dataframemapper, trainset, testset, varlist,train_only=False):
    if train_only == False:
        df_test=dataframemapper.transform(testset)
        predprob = pd.DataFrame(model.predict_proba(df_test[varlist])[:,1], columns = ['predprob'])
        predprob['ytrue'] = testset['b'].values
        auc = roc_auc_score(y_score=predprob['predprob'], y_true=predprob['ytrue'])
        print('AUC On valid is: {}'.format(auc))
        print('KS On valid is: {}'.format(cal_ks_scipy(predprob['predprob'], predprob['ytrue'])))
    
    df_train=dataframemapper.transform(trainset)
    predprob = pd.DataFrame(model.predict_proba(df_train[varlist])[:,1], columns = ['predprob'])
    predprob['ytrue'] = trainset['b'].values
    auc = roc_auc_score(y_score=predprob['predprob'], y_true=predprob['ytrue'])
    print('AUC On Train is: {}'.format(auc))
    print('KS On Train is: {}'.format(cal_ks_scipy(predprob['predprob'], predprob['ytrue'])))

In [5]:
def continuous_categorical(dataset):
    '''diff continuous and categorical
        
    '''
    continuousDomain = [] # float int
    categoricalDomain = [] # object bool category

    for item in dataset.columns:
        if (dataset[item].dtypes == object)| (dataset[item].dtypes == bool):
            categoricalDomain.append(item)
            dataset[item] = dataset[item].astype(str)
        elif item!='target':
            continuousDomain.append(item)
            dataset[item] = dataset[item]
    print (categoricalDomain)
    return continuousDomain,categoricalDomain

In [6]:
def eval_model_stability(proba_train, proba_validation, segment_cnt = 10,out_path=False):
    """
    :param proba_train: 训练集预测概率
    :param proba_validation: 验证集预测概率
    :param segment_cnt:
    :param out_path:
    :return: 模型稳定性
    """
    step = 1.0/segment_cnt
    flag = 0.0
    model_stability = []
    len_train = len(proba_train)
    len_validation = len(proba_validation)

    columns = ['score_range','segment_train_percentage','segment_validation_percentage','difference',
               'variance','ln_variance','stability_index']

    while flag < 1.0:
        temp = {}

        score_range = '['+str(flag)+','+str(flag + step)+')'
        segment_train_cnt = proba_train[(proba_train >= flag) & (proba_train < flag + step)].count()
        segment_train_percentage = segment_train_cnt*1.0/len_train
        segment_validation_cnt = proba_validation[(proba_validation >= flag) & (proba_validation < flag + step)].count()
        segment_validation_percentage = segment_validation_cnt * 1.0 / len_validation
        difference = segment_validation_percentage - segment_train_percentage
        variance = float(segment_validation_percentage)/segment_train_percentage
        ln_variance = variance
        stability_index = difference * ln_variance

        temp['score_range'] = score_range
        temp['segment_train_percentage'] = segment_train_percentage[0]
        temp['segment_validation_percentage'] = segment_validation_percentage[0]
        temp['difference'] = difference[0]
        temp['variance'] = variance[0]
        temp['ln_variance'] = ln_variance[0]
        temp['stability_index'] = stability_index[0]

        model_stability.append(temp)
        flag += step

    model_stability = pd.DataFrame(model_stability,columns=columns)
    if out_path:
        file_name = out_path if isinstance(out_path, str) else None
        model_stability.to_csv(file_name, index=False)

    return model_stability

In [8]:
data = pd.read_table("in_out/thetafile.txt",names=delete_list_1)

In [7]:
delete_list_1 = ['ugid','a','t_01','t_02','t_03','t_04','t_05','t_06','t_07',
                 't_08','t_09','t_10','t_11','t_12','t_13','t_14','t_15','t_16','t_17','t_18','t_19','t_20','b']

In [9]:
delete_list_2 = ['ugid','a']

In [10]:
data =data.drop(delete_list_2,axis=1)

In [11]:
continuousDomain, categoricalDomain = continuous_categorical(data)

[]


In [12]:
dfm = DataFrameMapper([(c, [CategoricalDomain(invalid_value_treatment = 'as_missing',
                                              missing_value_treatment = 'as_value',
                                              missing_value_replacement = 'N/A'), LabelBinarizer()])
                       for c in categoricalDomain] 
                      + 
                     [(c, [ContinuousDomain(invalid_value_treatment = 'as_missing',
                                                     missing_value_treatment = 'as_value',
                                                     missing_value_replacement = -1)])
                      for c in continuousDomain]
                      ,df_out = True)

In [16]:
%%time
dfm.fit(data)

CPU times: user 18.3 s, sys: 28 ms, total: 18.4 s
Wall time: 18.4 s


DataFrameMapper(default=False, df_out=True,
        features=[('t_01', [ContinuousDomain()]), ('t_02', [ContinuousDomain()]), ('t_03', [ContinuousDomain()]), ('t_04', [ContinuousDomain()]), ('t_05', [ContinuousDomain()]), ('t_06', [ContinuousDomain()]), ('t_07', [ContinuousDomain()]), ('t_08', [ContinuousDomain()]), ('t_09', [ContinuousDomain()]), ('...in()]), ('t_19', [ContinuousDomain()]), ('t_20', [ContinuousDomain()]), ('b', [ContinuousDomain()])],
        input_df=False, sparse=False)

In [13]:
for i in range(75000):
    data.iat[i,20]=1

In [14]:
for i in range(75000,len(data)):
    data.iat[i,20]=0

In [15]:
data

Unnamed: 0,t_01,t_02,t_03,t_04,t_05,t_06,t_07,t_08,t_09,t_10,...,t_12,t_13,t_14,t_15,t_16,t_17,t_18,t_19,t_20,b
0,0.000101,0.114242,0.053636,0.006162,0.000101,0.000101,0.000101,0.011212,0.003131,0.000101,...,0.000101,0.142525,0.007172,0.001111,0.010202,0.297071,0.080909,0.118283,0.000101,1.0
1,0.171151,0.010878,0.036371,0.236522,0.008102,0.000025,0.000025,0.012393,0.004821,0.055553,...,0.019460,0.002549,0.177713,0.067415,0.000025,0.000530,0.029556,0.000530,0.121429,1.0
2,0.000265,0.000265,0.000265,0.000265,0.000265,0.000265,0.000265,0.000265,0.063926,0.093103,...,0.077188,0.018833,0.063926,0.002918,0.042706,0.077188,0.530769,0.024138,0.002918,1.0
3,0.000103,0.000103,0.170359,0.084205,0.000103,0.261641,0.016513,0.000103,0.030872,0.002154,...,0.003179,0.038051,0.000103,0.044205,0.046256,0.232923,0.005231,0.047282,0.004205,1.0
4,0.000100,0.000100,0.025996,0.608665,0.000100,0.120618,0.000100,0.007072,0.166434,0.031972,...,0.000100,0.010060,0.025996,0.000100,0.000100,0.000100,0.000100,0.001096,0.000100,1.0
5,0.000062,0.099875,0.008796,0.000062,0.174735,0.131691,0.000062,0.000062,0.000062,0.101747,...,0.054336,0.050593,0.033125,0.000062,0.051840,0.064941,0.018777,0.208422,0.000686,1.0
6,0.499696,0.075247,0.000076,0.000076,0.161048,0.000076,0.000076,0.000076,0.012225,0.000076,...,0.000076,0.043356,0.000076,0.034244,0.011465,0.000076,0.030448,0.000835,0.113212,1.0
7,0.000103,0.005252,0.000103,0.000103,0.078373,0.000103,0.001133,0.125747,0.008342,0.000103,...,0.000103,0.000103,0.000103,0.087642,0.210196,0.246241,0.162822,0.073223,0.000103,1.0
8,0.073905,0.000025,0.030622,0.000025,0.082861,0.233109,0.000025,0.118682,0.063706,0.000025,...,0.046542,0.122910,0.051517,0.014701,0.000025,0.087090,0.014204,0.014950,0.012960,1.0
9,0.091225,0.242306,0.193980,0.004965,0.007274,0.038281,0.001006,0.000016,0.127181,0.013871,...,0.060218,0.009253,0.133119,0.000016,0.027231,0.000016,0.000016,0.006779,0.043229,1.0


In [None]:
%%time
dfm.fit(data)

In [None]:
data=data[data.b.isin([0,"1"])]

In [17]:
train, test = train_test_split(data, test_size=0.15, random_state=2018)#调用sklearn中的方法，注意指定随机种子，保证结果可以复线

# print(train.shape)
# print(test.shape)
# print(X_train.shape)
# print(train.target.value_counts())#查看训练集好坏分布
# print(test.target.value_counts())#查看测试集好坏分布

In [18]:
X_train = dfm.transform(train)

In [19]:
y_train = train['b']

In [20]:
y_train

101075    0.0
61702     1.0
11875     1.0
5572      1.0
19406     1.0
49390     1.0
128646    0.0
18410     1.0
3273      1.0
93548     0.0
25579     1.0
76845     0.0
91236     0.0
94928     0.0
56132     1.0
120234    0.0
34742     1.0
98116     0.0
100139    0.0
94900     0.0
47006     1.0
79338     0.0
57989     1.0
150349    0.0
126078    0.0
69554     1.0
61024     1.0
119173    0.0
132564    0.0
45328     1.0
         ... 
67149     1.0
54189     1.0
117914    0.0
67999     1.0
102869    0.0
7356      1.0
98077     0.0
46546     1.0
143739    0.0
133992    0.0
69239     1.0
133751    0.0
93910     0.0
127925    0.0
121727    0.0
53457     1.0
126762    0.0
31903     1.0
114236    0.0
7371      1.0
59823     1.0
61515     1.0
84055     0.0
10388     1.0
40092     1.0
115605    0.0
84745     0.0
80098     0.0
60006     1.0
107770    0.0
Name: b, Length: 128392, dtype: float64

In [21]:
print(train.shape)
print(test.shape)
print(X_train.shape)
print(train.b.value_counts())#查看训练集好坏分布
print(test.b.value_counts())#查看测试集好坏分布

(128392, 21)
(22658, 21)
(128392, 21)
0.0    64608
1.0    63784
Name: b, dtype: int64
0.0    11442
1.0    11216
Name: b, dtype: int64


In [22]:
varlist=data.columns
varlist

Index(['t_01', 't_02', 't_03', 't_04', 't_05', 't_06', 't_07', 't_08', 't_09',
       't_10', 't_11', 't_12', 't_13', 't_14', 't_15', 't_16', 't_17', 't_18',
       't_19', 't_20', 'b'],
      dtype='object')

In [23]:
model = LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.82622520164901361,
        drop_rate=0.5, is_unbalance=True,
        learning_rate=0.024875, max_bin=4, max_depth=3,
        max_drop=50, min_child_samples=120, min_child_weight=3,
        min_split_gain=0.02800339778285961, n_estimators=705, nthread=-1,
        num_leaves=128, objective='binary', reg_alpha=30, reg_lambda=150,
        scale_pos_weight=1, seed=27, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=0.7, subsample_for_bin=30000,
        subsample_freq=3, uniform_drop=False, xgboost_dart_mode=False)

In [24]:
def modelWithCv(model, x_array, y_array, cv=5):
    model.fit(x_array, y_array)
    
    dtrain_predictions = model.predict(x_array)
    dtrain_predprob = model.predict_proba(x_array)[:,1]
    
#     print("--AUC Score (Train): %f" % metrics.roc_auc_score(y_array, dtrain_predprob))
#     print ("class metrics:")
#     print (metrics.classification_report(y_array, dtrain_predictions))
    
    cv_score = cross_val_score(model,x_array, y_array, cv=cv, scoring = 'roc_auc')
#     print("--CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" %(np.mean(cv_score), np.std(cv_score), np.min(cv_score), np.max(cv_score)))

In [25]:
%%time
modelWithCv(model, X_train, y_train, 2)

  if diff:


CPU times: user 1h 20min 13s, sys: 16.2 s, total: 1h 20min 29s
Wall time: 3min 25s


In [26]:
pro,true,test_pro,test_true = auc_ks(model, dfm, train, test, varlist)

AUC On Test is: 1.0
KS On Test is: 1.0
AUC On Train is: 1.0
KS On Train is: 1.0


In [None]:
data