In [1]:
from sklearn.metrics import roc_auc_score,accuracy_score,classification_report
from scipy.stats import ks_2samp
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split,cross_val_score
from datetime import datetime
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelBinarizer
from sklearn2pmml.decoration import CategoricalDomain, ContinuousDomain
from sklearn2pmml import PMMLPipeline
import time
import pandas as pd
import numpy as np
from sklearn.externals import joblib
import pickle
# from project_demo.tools.optimize import *
from project_demo.tools.evaluate import *
%matplotlib inline
from sklearn import metrics

# 线性组合和 AdaBoost

In [33]:
def auc_ks(model, dataframemapper, trainset, testset, varlist,train_only=False):
    """Print AUC / KS on the train set (and optionally the test set) and return the scores.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    dataframemapper : fitted transformer; ``transform(frame)`` must return an
        object that can be indexed with ``varlist``.
    trainset, testset : frames containing the ``perf`` label column.
    varlist : feature columns handed to ``model.predict_proba``.
    train_only : when true, ``testset`` is not transformed or scored.

    Returns
    -------
    tuple
        ``(train_prob, train_true, test_prob, test_true)``. The two test
        entries are ``None`` when ``train_only`` is true (previously this path
        raised ``NameError`` because the test frame was never built).
    """
    def _score(dataset, label):
        # Transform -> predict positive-class probability -> attach labels by position.
        features = dataframemapper.transform(dataset)
        scored = pd.DataFrame(model.predict_proba(features[varlist])[:, 1], columns=['predprob'])
        scored['ytrue'] = dataset['perf'].values
        auc = roc_auc_score(y_score=scored['predprob'], y_true=scored['ytrue'])
        print('AUC On {} is: {}'.format(label, auc))
        print('KS On {} is: {}'.format(label, cal_ks_scipy(scored['predprob'], scored['ytrue'])))
        return scored

    test_prob = test_true = None
    if not train_only:
        scored_test = _score(testset, 'Test')
        test_prob, test_true = scored_test['predprob'], scored_test['ytrue']

    scored_train = _score(trainset, 'Train')
    return scored_train['predprob'], scored_train['ytrue'], test_prob, test_true

In [34]:
def ks(df_score, df_good,fig_dir):
    """Draw the KS chart for a set of scores, save it to ``fig_dir`` and return KS.

    Scores are bucketed into 0.05-wide bins over [0, 1]. The cumulative share
    of bad (``1 - good``) and good samples is accumulated bucket by bucket in
    ascending score order; KS is the largest absolute gap between the two
    cumulative curves (rounded to 4 decimals).

    Parameters
    ----------
    df_score : single-column data accepted by ``pd.DataFrame`` (the scores).
    df_good : single-column data accepted by ``pd.DataFrame`` (the "good" flag;
        expected to be 0/1 because bad is computed as ``1 - good``).
        Scores and flags are paired by index through ``pd.concat(axis=1)``.
    fig_dir : path passed to ``plt.savefig``.

    Returns
    -------
    The maximum bucket-level KS value.
    """
    # matplotlib is not imported at notebook level, so it is imported here.
    import matplotlib.pyplot as plt

    df_score = pd.DataFrame(df_score)
    df_good = pd.DataFrame(df_good)
    df_score.columns = ['score']
    df_good.columns = ['good']
    df = pd.concat([df_score, df_good], axis=1)

    df['bad'] = 1 - df.good
    edges = np.arange(0, 1.001, 0.05)
    # include_lowest=True keeps scores that are exactly 0.0 in the first bucket;
    # without it they were dropped from the buckets but still counted in the totals.
    df['bucket'] = pd.cut(df.score, edges, include_lowest=True)

    # Aggregate once per statistic instead of recomputing the groupby for each column.
    grouped = df.groupby('bucket', as_index=False)
    bucket_min = grouped.min()
    bucket_max = grouped.max()
    bucket_sum = grouped.sum()
    agg1 = pd.DataFrame()
    agg1['min_scr'] = bucket_min.score  # lowest score observed in each bucket
    agg1['max_scr'] = bucket_max.score
    agg1['bads'] = bucket_sum.bad       # number of bad samples per bucket
    agg1['goods'] = bucket_sum.good

    # Order buckets by their lowest score (empty buckets have NaN and sort last).
    agg2 = (agg1.sort_values(['min_scr'])).reset_index(drop=True)
    bad_cum = (agg2.bads / df.bad.sum()).cumsum()
    good_cum = (agg2.goods / df.good.sum()).cumsum()
    agg2['bad_cum_rate'] = np.round(bad_cum, 4)
    agg2['good_cum_rate'] = np.round(good_cum, 4)
    agg2['ks'] = abs(np.round(bad_cum - good_cum, 4))  # gap between the two cumulative curves
    ks_value = agg2.ks.max()

    plt.figure(figsize=(8, 4))
    # Labels are required for plt.legend to have entries to show.
    plt.plot(agg2.min_scr, agg2.bad_cum_rate, "g-", linewidth=1, label='bad')
    plt.plot(agg2.min_scr, agg2.good_cum_rate, "b-", linewidth=1, label='good')

    # Mark the bucket(s) where the gap is largest.
    at_max = agg2['ks'] == ks_value
    x_abline = agg2['min_scr'][at_max]
    y_abline1 = agg2['bad_cum_rate'][at_max]
    y_abline2 = agg2['good_cum_rate'][at_max]
    plt.fill_between(x_abline, y_abline1, y_abline2, color = "red",linewidth=2)

    sub = "%s%s"%('ks = ',ks_value)
    plt.legend(title=sub,loc='lower right')
    plt.xlabel("Minimum score")
    plt.ylabel("Cumulative percentage(%)")
    plt.title('KS chart')
    plt.savefig(fig_dir)
    plt.show()
    return ks_value

In [35]:
def auc_ks_v(model, dataframemapper, trainset, testset, varlist,train_only=False):
    """Print AUC and KS for the validation frame (unless ``train_only``) and the train frame.

    ``testset`` is reported under the "valid" label. Each frame is passed
    through ``dataframemapper.transform``, scored with ``model.predict_proba``
    on ``varlist`` and compared against its ``perf`` column. Nothing is returned.
    """
    # (AUC message, KS message, frame) in the order they must be reported.
    report_plan = []
    if train_only == False:
        report_plan.append(('AUC On valid is: {}', 'KS On valid is: {}', testset))
    report_plan.append(('AUC On Train is: {}', 'KS On Train is: {}', trainset))

    for auc_message, ks_message, frame in report_plan:
        mapped = dataframemapper.transform(frame)
        positive_proba = model.predict_proba(mapped[varlist])[:, 1]
        scored = pd.DataFrame(positive_proba, columns=['predprob'])
        scored['ytrue'] = frame['perf'].values
        auc_value = roc_auc_score(y_score=scored['predprob'], y_true=scored['ytrue'])
        print(auc_message.format(auc_value))
        ks_value = cal_ks_scipy(scored['predprob'], scored['ytrue'])
        print(ks_message.format(ks_value))

In [5]:
def continuous_categorical(dataset, target='target'):
    """Split the columns of ``dataset`` into continuous and categorical feature names.

    Columns whose dtype is ``object``, ``bool`` or ``category`` are treated as
    categorical; every other column except ``target`` is treated as continuous.

    Side effect: categorical columns are cast to ``str`` **in place** on
    ``dataset``.

    Parameters
    ----------
    dataset : pandas DataFrame to inspect (mutated, see above).
    target : name of the label column to leave out of the continuous list.
        Note that, as before, a label column with a categorical dtype is still
        reported as categorical.

    Returns
    -------
    (continuousDomain, categoricalDomain) : two lists of column names, in
        column order. The categorical list is also printed.
    """
    continuousDomain = []   # numeric (float / int / anything not matched below)
    categoricalDomain = []  # object / bool / category

    for item in dataset.columns:
        dtype = dataset[item].dtype
        # `category` columns were documented as categorical but previously fell
        # through to the continuous branch.
        if dtype == object or dtype == bool or dtype.name == 'category':
            categoricalDomain.append(item)
            dataset[item] = dataset[item].astype(str)
        elif item != target:
            continuousDomain.append(item)
    print(categoricalDomain)
    return continuousDomain, categoricalDomain

In [6]:
def eval_model_stability(proba_train, proba_validation, segment_cnt = 10,out_path=False):
    """Compare the score distributions of two samples segment by segment.

    The [0, 1] score range is cut into ``segment_cnt`` equal-width segments
    ``[lower, upper)``; the last segment is closed on the right so a score of
    exactly 1.0 is counted. For each segment the share of train and validation
    scores is computed, then::

        difference      = validation_share - train_share
        variance        = validation_share / train_share
        ln_variance     = ln(variance)
        stability_index = difference * ln_variance

    Summing ``stability_index`` over the rows gives the population stability
    index. A segment that is empty on either side yields ``inf`` / ``nan`` in
    the ratio-based columns.

    :param proba_train: predicted probabilities on the training set (Series,
        array-like, or a DataFrame whose first column holds the scores)
    :param proba_validation: predicted probabilities on the validation set
        (same accepted types)
    :param segment_cnt: number of equal-width segments (must be >= 1)
    :param out_path: when a string, the table is also written to that CSV path
    :return: DataFrame with one row per segment
    :raises ValueError: if ``segment_cnt`` is smaller than 1
    """
    segment_cnt = int(segment_cnt)
    if segment_cnt < 1:
        raise ValueError("segment_cnt must be >= 1")

    def _as_1d(proba):
        # Accept Series / arrays directly and single-column DataFrames via their first column.
        if isinstance(proba, pd.DataFrame):
            proba = proba.iloc[:, 0]
        return np.asarray(proba, dtype=float).ravel()

    def _share(scores, lower, upper, closed_right):
        # Fraction of all scores (NaN included in the denominator) that fall in the segment.
        below_upper = (scores <= upper) if closed_right else (scores < upper)
        return np.float64(((scores >= lower) & below_upper).sum()) / len(scores)

    train_scores = _as_1d(proba_train)
    validation_scores = _as_1d(proba_validation)

    columns = ['score_range','segment_train_percentage','segment_validation_percentage','difference',
               'variance','ln_variance','stability_index']

    # Exact edges instead of repeatedly adding `step`, which accumulated float
    # error and produced an extra segment (e.g. 11 rows for segment_cnt=10).
    edges = np.linspace(0.0, 1.0, segment_cnt + 1)
    rows = []
    with np.errstate(divide='ignore', invalid='ignore'):
        for idx in range(segment_cnt):
            lower, upper = edges[idx], edges[idx + 1]
            is_last = idx == segment_cnt - 1

            train_pct = _share(train_scores, lower, upper, is_last)
            validation_pct = _share(validation_scores, lower, upper, is_last)
            difference = validation_pct - train_pct
            variance = validation_pct / train_pct
            ln_variance = np.log(variance)  # previously the log was never taken

            rows.append({
                'score_range': '[{:.4g},{:.4g}{}'.format(lower, upper, ']' if is_last else ')'),
                'segment_train_percentage': train_pct,
                'segment_validation_percentage': validation_pct,
                'difference': difference,
                'variance': variance,
                'ln_variance': ln_variance,
                'stability_index': difference * ln_variance,
            })

    model_stability = pd.DataFrame(rows, columns=columns)
    # Only a string path triggers a write; other truthy values are ignored.
    if out_path and isinstance(out_path, str):
        model_stability.to_csv(out_path, index=False)

    return model_stability

In [457]:
data = pd.read_csv("tb_file/data/linear_ada_train.csv")

In [458]:
# NOTE(review): the directory is spelled 'tb_flie' here but 'tb_file' in the
# training-data cell above -- confirm which spelling exists on disk.
pre_fasttext = pd.read_csv('tb_flie/data/linear_ada_pre.csv')

In [459]:
# pre_fasttext =pre_fasttext.rename(columns={'target_y':'perf'})

In [460]:
pre_fasttext.head(0)

Unnamed: 0,ugid,token_id_x,TB1,TB5,TB9,TB3,TB7,TB11,TB2,TB6,...,TO218,TO208,TO221,TO198,TO211,token_id,crawler_time,last_decision_tm_y,perf,prob


In [461]:
data.head(5)

Unnamed: 0,ugid,token_id_x,TO9,TO10,TO52,TO51,TO17,TO54,TO27,TO28,...,TB296,crawler_time,TO414_y,TO415_y,TO416_y,TO417_y,TO418_y,first_trans_time,perf,prob
0,00024CEF-E433-4679-B85E-2237CF46A2B2,7d4e87d01f44479296e43124b4a979461488774175767_21,3672,2149,131490,234504,1523,103014,899,161,...,13,06MAR17:12:39:58,,,,,,2017-04-22 19:18:22,0,0.296029
1,0002BE1A-3E99-48F4-9C9C-B075B9D85BA7,4e31ee0c90e34879b798d4fd69714eca1486560721411_21,2960,1381,105290,212868,1579,107578,603,171,...,7,09MAR17:23:27:01,,,,,,2017-03-21 17:09:05,0,0.415101
2,00044410-EB5A-4F23-B629-64570EE71897,5cd370d2a346471593434f67ac9a00011493171833156_21,3558,1410,142013,435017,2148,293004,739,196,...,6,26APR17:10:10:02,,,,,,2017-04-26 10:26:08,1,0.598104
3,00046111-B639-4791-8FA9-F0D1906C326A,0e8d2d819cdd4f6a82e5af468c956a361495350053263_21,603,241,11384,30644,362,19260,194,94,...,13,21MAY17:15:38:19,68.0,119.0,137.0,137.0,3.0,2017-05-21 15:41:15,1,0.052565
4,000485EE-1971-4FCB-82A4-6A9A4BCF3A29,8641267920004cc79cc715c6563baee51495098191611_21,184,98,7026,16674,86,9648,177,69,...,33,18MAY17:23:59:57,709.0,270.0,185.0,152.0,7.0,2017-05-19 00:01:31,1,0.46161


In [464]:
delete_list_2 = ['ugid','token_id_x','crawler_time','token_id_y','crawler_time_x', 'ugid_y', 'crawler_time_y', 'last_decision_tm_x', 'token_id', 'last_decision_tm_y']

In [465]:
delete_list_1 = ['ugid','token_id_x','crawler_time','token_id_y','first_trans_time']

In [466]:
# d = ['crawler_time','token_id_y']
pre_fasttext =pre_fasttext.drop(delete_list_2,axis=1)

In [467]:
data =data.drop(delete_list_1,axis=1)

In [468]:
# data =data.drop(d,axis=1)
# Treat +inf and the numeric placeholder codes below as missing values, in a
# single pass over the frame instead of eight separate full-frame scans.
# NOTE(review): -inf is not in the list, and 9999 is replaced in every column
# (it could also be a genuine value) -- confirm both are intended.
pre_fasttext.replace(
    [np.inf, -99998, -99999976, -9999979, -9999976, -999973, -999976, 9999],
    np.nan,
    inplace=True,
)

In [470]:
# pre_fasttext.columns[]

In [471]:
# Treat +inf and the numeric placeholder codes below as missing values, in a
# single pass over the frame instead of eight separate full-frame scans.
# NOTE(review): -inf is not in the list, and 9999 is replaced in every column
# (it could also be a genuine value) -- confirm both are intended.
data.replace(
    [np.inf, -99998, -99999976, -9999979, -9999976, -999973, -999976, 9999],
    np.nan,
    inplace=True,
)

In [84]:
# data = data.replace('01. 坏',0)
# data = data.replace('05. 好',1)

In [475]:
continuousDomain, categoricalDomain = continuous_categorical(pre_fasttext)

[]


In [484]:
del pre_fasttext['target_x']

In [486]:
# data[['perf']]
for i in pre_fasttext.columns:
    print(i)

TB1
TB5
TB9
TB3
TB7
TB11
TB2
TB6
TB10
TB4
TB8
TB12
TB321
TB322
TB323
TB324
TB325
TB326
TB327
TB328
TB329
TB330
TB331
TB332
TB333
TB334
TB335
TB336
TB337
TB338
TB339
TB340
TB341
TB342
TB343
TB344
TB345
TB346
TB347
TB348
TB349
TB434
TB435
TB436
TB437
TB366
TB367
TB368
TB369
TB370
TB371
TB372
TB373
TB374
TB375
TB376
TB377
TB378
TB379
TB380
TB381
TB382
TB383
TB384
TB385
TB386
TB387
TB388
TB389
TB390
TB391
TB392
TB393
TB394
TB395
TB396
TB397
TB13
TB16
TB19
TB14
TB17
TB20
TB15
TB18
TB21
TB22
TB26
TB30
TB24
TB28
TB32
TB23
TB27
TB31
TB25
TB29
TB33
TB34
TB38
TB42
TB46
TB36
TB40
TB44
TB48
TB35
TB39
TB43
TB47
TB37
TB41
TB45
TB49
TB50
TB54
TB58
TB62
TB52
TB56
TB60
TB64
TB51
TB55
TB59
TB63
TB53
TB57
TB61
TB65
TB66
TB74
TB82
TB68
TB76
TB84
TB67
TB75
TB83
TB69
TB77
TB85
TB445
TB446
TB447
TB448
TB449
TB450
TB451
TB452
TB453
TB454
TB455
TB456
TB457
TB458
TB459
TB460
TB70
TB78
TB86
TB71
TB79
TB87
TB72
TB80
TB88
TB73
TB81
TB89
TB90
TB91
TB92
TB93
TB94
TB97
TB95
TB98
TB96
TB99
TB100
TB103
TB101
TB104
TB10

In [489]:
# for j in data.columns:
#     print(j)
# a = pd.merge(data,pre_fasttext)

In [493]:
# len(a)

0

In [165]:
dfm = DataFrameMapper([(c, [CategoricalDomain(invalid_value_treatment = 'as_missing',
                                              missing_value_treatment = 'as_value',
                                              missing_value_replacement = 'N/A'), LabelBinarizer()])
                       for c in categoricalDomain] 
                      + 
                     [(c, [ContinuousDomain(invalid_value_treatment = 'as_missing',
                                                     missing_value_treatment = 'as_value',
                                                     missing_value_replacement = -1)])
                      for c in continuousDomain]
                      ,df_out = True)

In [166]:
%%time
dfm.fit(data)

CPU times: user 5min 50s, sys: 256 ms, total: 5min 50s
Wall time: 5min 50s


DataFrameMapper(default=False, df_out=True,
        features=[('TO9', [ContinuousDomain()]), ('TO10', [ContinuousDomain()]), ('TO52', [ContinuousDomain()]), ('TO51', [ContinuousDomain()]), ('TO17', [ContinuousDomain()]), ('TO54', [ContinuousDomain()]), ('TO27', [ContinuousDomain()]), ('TO28', [ContinuousDomain()]), ('TO29', [ContinuousDomain()]), ('T...'TO417_y', [ContinuousDomain()]), ('TO418_y', [ContinuousDomain()]), ('perf', [ContinuousDomain()])],
        input_df=False, sparse=False)

In [144]:
dfm.fit(pre_fasttext)

  "mean" : numpy.asarray(numpy.nanmean(X, axis = 0)),
  keepdims=keepdims)
  r = func(a, **kwargs)


DataFrameMapper(default=False, df_out=True,
        features=[('TB1', [ContinuousDomain()]), ('TB5', [ContinuousDomain()]), ('TB9', [ContinuousDomain()]), ('TB3', [ContinuousDomain()]), ('TB7', [ContinuousDomain()]), ('TB11', [ContinuousDomain()]), ('TB2', [ContinuousDomain()]), ('TB6', [ContinuousDomain()]), ('TB10', [ContinuousDomain()]), ('TB4', [...), ('TO198', [ContinuousDomain()]), ('TO211', [ContinuousDomain()]), ('perf', [ContinuousDomain()])],
        input_df=False, sparse=False)

In [167]:
data=data[data.perf.isin([0,1])]

In [None]:
pre_fasttext=pre_fasttext[pre_fasttext.perf.isin([0,1])]

In [1]:
# data

In [168]:
train, test = train_test_split(data, test_size=0.1, random_state=2018)#调用sklearn中的方法，注意指定随机种子，保证结果可以复线

# print(train.shape)
# print(test.shape)
# print(X_train.shape)
# print(train.target.value_counts())#查看训练集好坏分布
# print(test.target.value_counts())#查看测试集好坏分布

In [169]:
X_train = dfm.transform(train)

In [170]:
y_train = train['perf']

In [171]:
X_train = X_train.drop("perf",axis=1)

In [172]:
# X_train

In [173]:
print(train.shape)
print(test.shape)
print(X_train.shape)
print(train.perf.value_counts())#查看训练集好坏分布
print(test.perf.value_counts())#查看测试集好坏分布

(61822, 889)
(6870, 889)
(61822, 888)
1    32111
0    29711
Name: perf, dtype: int64
1    3600
0    3270
Name: perf, dtype: int64


In [174]:
varlist=X_train.columns
varlist

Index(['TO9', 'TO10', 'TO52', 'TO51', 'TO17', 'TO54', 'TO27', 'TO28', 'TO29',
       'TO34',
       ...
       'TB246', 'TB252', 'TB265', 'TB297', 'TB296', 'TO414_y', 'TO415_y',
       'TO416_y', 'TO417_y', 'TO418_y'],
      dtype='object', length=888)

In [175]:
# NOTE(review): `model` is re-assigned by the very next cell before anything is
# fitted, so this configuration is never used when the notebook is run top to
# bottom. Kept as a record of an earlier hyper-parameter set.
model = LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.82622520164901361,
        drop_rate=0.5, is_unbalance=True,
        learning_rate=0.025, max_bin=4, max_depth=3,
        max_drop=50, min_child_samples=120, min_child_weight=3,
        min_split_gain=0.028, n_estimators=705, nthread=-1,
        num_leaves=128, objective='binary', reg_alpha=30, reg_lambda=150,
        scale_pos_weight=1, seed=27, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=0.7, subsample_for_bin=30000,
        subsample_freq=3, uniform_drop=False, xgboost_dart_mode=False)

In [176]:
# LightGBM classifier that is fitted by modelWithCv below.
# NOTE(review): drop_rate / max_drop / skip_drop / uniform_drop / xgboost_dart_mode
# look like DART-only options while boosting_type='gbdt' -- confirm they are
# intentionally left in.
# NOTE(review): is_unbalance=True is passed together with scale_pos_weight=1;
# the LightGBM parameter docs describe these as alternatives -- confirm which
# one is meant to apply.
model = LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.82622520164901361,
        drop_rate=0.5, is_unbalance=True,
        learning_rate=0.03, max_bin=10, max_depth=5,
        max_drop=50, min_child_samples=1200, min_child_weight=4,
        min_split_gain=0.01500339778285961, n_estimators=490, nthread=-1,
        num_leaves=128, objective='binary', reg_alpha=20, reg_lambda=100,
        scale_pos_weight=1, seed=27, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=0.7, subsample_for_bin=30000,
        subsample_freq=3, uniform_drop=False, xgboost_dart_mode=False)

In [177]:
def modelWithCv(model, x_array, y_array, cv=5):
    """Fit ``model`` on the full data, print train metrics, then print CV AUC statistics.

    The model is fitted in place on ``(x_array, y_array)``. The train AUC and
    the classification report are computed on that same data, after which
    ``cross_val_score`` (``roc_auc`` scoring, ``cv`` folds) is run and its
    mean / std / min / max are printed. Nothing is returned.
    """
    model.fit(x_array, y_array)

    # Hard labels feed the classification report, probabilities feed the AUC.
    train_labels_pred = model.predict(x_array)
    train_positive_proba = model.predict_proba(x_array)[:,1]

    train_auc = roc_auc_score(y_array, train_positive_proba)
    print("--AUC Score (Train): %f" % train_auc)
    print("class metrics:")
    report = metrics.classification_report(y_array, train_labels_pred)
    print(report)

    fold_scores = cross_val_score(model,x_array, y_array, cv=cv, scoring = 'roc_auc')
    summary = tuple(stat(fold_scores) for stat in (np.mean, np.std, np.min, np.max))
    print("--CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" % summary)

In [178]:
%%time
modelWithCv(model, X_train, y_train, 2)

  if diff:


--AUC Score (Train): 0.757755
class metrics:
             precision    recall  f1-score   support

          0       0.67      0.70      0.69     29711
          1       0.71      0.68      0.69     32111

avg / total       0.69      0.69      0.69     61822

--CV Score : Mean - 0.7223358 | Std - 0.0003219352 | Min - 0.7220138 | Max - 0.7226577
CPU times: user 5h 13min 19s, sys: 1min 12s, total: 5h 14min 32s
Wall time: 12min 24s


# fasttext_main_line

In [191]:
# X_train['prob']
df_train=dfm.transform(data)
all_prob = pd.DataFrame(model.predict_proba(df_train[varlist])[:,1], columns = ['predprob'])
all_prob['ytrue'] = data['perf'].values

In [194]:
data_0 = pd.read_csv("data/fasttext_main_data.csv")

In [195]:
all_prob['fastprob']=data_0['prob'].values

In [201]:
# pre_fasttext['perf']

In [202]:
df_pre=dfm.transform(pre_fasttext)
pre_prob = pd.DataFrame(model.predict_proba(df_pre[varlist])[:,1], columns = ['predprob'])
pre_prob['ytrue'] = pre_fasttext['perf'].values

In [206]:
# pre_prob
pre_f = pd.read_csv('data/pre_all_fasttext.csv')

In [207]:
pre_prob['fastprob']=pre_f['prob'].values

In [209]:
len(pre_f)==len(pre_prob)

True

In [211]:
pre_prob['prob']=0.0
all_prob['prob']=0.0

In [214]:
train_prob, test_prob = train_test_split(all_prob, test_size=0.1, random_state=2018)

In [226]:
pre_prob

Unnamed: 0,predprob,ytrue,fastprob,prob
0,0.610502,1,0.679297,0.0
1,0.757769,1,0.577517,0.0
2,0.581823,0,0.386687,0.0
3,0.837402,1,0.649922,0.0
4,0.795014,1,0.662916,0.0
5,0.613261,0,0.507287,0.0
6,0.335335,0,0.502666,0.0
7,0.470683,0,0.481760,0.0
8,0.408449,0,0.513140,0.0
9,0.500290,0,0.643847,0.0


In [216]:
len(test_prob)

6870

In [218]:
# True labels of the train split, taken from the `ytrue` column in one step
# (replaces a per-row `.iat` loop over the same column).
train_label = train_prob['ytrue'].tolist()

In [220]:
# True labels of the test split, taken from the `ytrue` column in one step
# (replaces a per-row `.iat` loop over the same column).
test_label = test_prob['ytrue'].tolist()

In [222]:
# True labels of the `pre` sample, taken from the `ytrue` column in one step
# (replaces a per-row `.iat` loop over the same column).
pre_label = pre_prob['ytrue'].tolist()

In [223]:
# len(test_label)
import numpy as np
train_label = np.array(train_label)
test_label = np.array(test_label)
pre_label = np.array(pre_label)

In [224]:
from scipy.stats import ks_2samp
def cal_ks_scipy(y_pred,y_true):
    """Return the two-sample KS statistic between scores of positives and non-positives.

    Parameters
    ----------
    y_pred : array-like of scores (list, ndarray or Series).
    y_true : array-like of labels, same length; entries equal to 1 are the
        positive group, everything else the other group.

    Scores and labels are paired by position. Inputs are converted to arrays
    first, so plain lists and Series with differing indexes work as well.
    """
    scores = np.asarray(y_pred)
    is_positive = np.asarray(y_true) == 1
    return ks_2samp(scores[is_positive], scores[~is_positive]).statistic

In [227]:
# Scan blend weights a = 0.00, 0.05, ..., 1.00 and report the KS of
#     a * predprob + (1 - a) * fastprob
# on the train / test / pre samples. The blend is computed on whole columns
# instead of row by row with `.iat`, which gives the same numbers.
for i in range(0,101,5):
    a = i/100
    train_prob_list = a*train_prob['predprob'].values + (1-a)*train_prob['fastprob'].values
    test_prob_list = a*test_prob['predprob'].values + (1-a)*test_prob['fastprob'].values
    pre_prob_list = a*pre_prob['predprob'].values + (1-a)*pre_prob['fastprob'].values
    print("a is",a)
    print("train ks is ",cal_ks_scipy(train_prob_list,train_label))
    print("test ks is ",cal_ks_scipy(test_prob_list,test_label))
    print("pre ks is ",cal_ks_scipy(pre_prob_list,pre_label))

a is 0.0
train ks is  0.2505774202563977
test ks is  0.24735983690112134
pre ks is  0.21895410064827364
a is 0.05
train ks is  0.26624348727345054
test ks is  0.25984709480122326
pre ks is  0.23087950396668333
a is 0.1
train ks is  0.2813224466479464
test ks is  0.2712079510703364
pre ks is  0.24346192273532796
a is 0.15
train ks is  0.295010994503295
test ks is  0.2794367991845056
pre ks is  0.25447420673380894
a is 0.2
train ks is  0.30901643458130956
test ks is  0.2899515800203874
pre ks is  0.26590868488432673
a is 0.25
train ks is  0.32222267434158725
test ks is  0.29450560652395513
pre ks is  0.2752900540068259
a is 0.3
train ks is  0.3343769481848739
test ks is  0.30529306829765546
pre ks is  0.28268776641974025
a is 0.35
train ks is  0.3445218900657505
test ks is  0.3129612640163099
pre ks is  0.2906563690967975
a is 0.4
train ks is  0.35259530093289526
test ks is  0.31472731906218143
pre ks is  0.2990452092890109
a is 0.45
train ks is  0.3613371935911517
test ks is  0.32299949

In [198]:
pro,true,test_pro,test_true = auc_ks(model, dfm, train, pre_fasttext, varlist)

AUC On Test is: 0.7092076114766503
KS On Test is: 0.30643346186005666
AUC On Train is: 0.7577553942274264
KS On Train is: 0.3814966481193178


# youhuachengdutuiduan

In [232]:
# Label 0, main model says >0.5 (wrong), fastText says <0.5 (right).
# One combined mask replaces chained boolean indexing, which triggers the
# "Boolean Series key will be reindexed" warning.
pre_mainfalse_fasttrue_yfalse = pre_prob[(pre_prob.predprob>0.5) & (pre_prob.ytrue==0) & (pre_prob.fastprob<0.5)]

  """Entry point for launching an IPython kernel.


In [234]:
# Label 1, main model says <0.5 (wrong), fastText says >0.5 (right).
# One combined mask replaces chained boolean indexing, which triggers the
# "Boolean Series key will be reindexed" warning.
pre_mainfalse_fasttrue_ytrue = pre_prob[(pre_prob.predprob<0.5) & (pre_prob.ytrue==1) & (pre_prob.fastprob>0.5)]

  """Entry point for launching an IPython kernel.


In [236]:
(len(pre_mainfalse_fasttrue_ytrue)+len(pre_mainfalse_fasttrue_yfalse))/len(pre_prob)

0.14409287502845436

In [237]:
# Label 1, main model says >0.5 (right), fastText says <0.5 (wrong).
# One combined mask replaces chained boolean indexing, which triggers the
# "Boolean Series key will be reindexed" warning.
pre_maintrue_fastfalse_ytrue = pre_prob[(pre_prob.predprob>0.5) & (pre_prob.ytrue==1) & (pre_prob.fastprob<0.5)]

  """Entry point for launching an IPython kernel.


In [238]:
# Label 0, main model says <0.5 (right), fastText says >0.5 (wrong).
# One combined mask replaces chained boolean indexing, which triggers the
# "Boolean Series key will be reindexed" warning.
pre_maintrue_fastfalse_yfalse= pre_prob[(pre_prob.predprob<0.5) & (pre_prob.ytrue==0) & (pre_prob.fastprob>0.5)]

  """Entry point for launching an IPython kernel.


In [239]:
(len(pre_maintrue_fastfalse_ytrue)+len(pre_maintrue_fastfalse_yfalse))/len(pre_prob)

0.1915319826997496

In [240]:
# Label 0, both models say <0.5 (both right).
# One combined mask replaces chained boolean indexing, which triggers the
# "Boolean Series key will be reindexed" warning.
pre_maintrue_fasttrue_yfalse= pre_prob[(pre_prob.predprob<0.5) & (pre_prob.ytrue==0) & (pre_prob.fastprob<0.5)]

  """Entry point for launching an IPython kernel.


In [241]:
# Label 1, both models say >0.5 (both right).
# One combined mask replaces chained boolean indexing, which triggers the
# "Boolean Series key will be reindexed" warning.
pre_maintrue_fasttrue_ytrue= pre_prob[(pre_prob.predprob>0.5) & (pre_prob.ytrue==1) & (pre_prob.fastprob>0.5)]

  """Entry point for launching an IPython kernel.


In [242]:
(len(pre_maintrue_fasttrue_yfalse)+len(pre_maintrue_fasttrue_ytrue))/len(pre_prob)

0.4599590257227407

In [246]:
# Label 0, both models say >0.5 (both wrong).
# One combined mask replaces chained boolean indexing, which triggers the
# "Boolean Series key will be reindexed" warning.
pre_mainfalse_fastfalse_yfalse= pre_prob[(pre_prob.predprob>0.5) & (pre_prob.ytrue==0) & (pre_prob.fastprob>0.5)]

  """Entry point for launching an IPython kernel.


In [243]:
# Label 1, both models say <0.5 (both wrong).
# One combined mask replaces chained boolean indexing, which triggers the
# "Boolean Series key will be reindexed" warning.
pre_mainfalse_fastfalse_ytrue= pre_prob[(pre_prob.predprob<0.5) & (pre_prob.ytrue==1) & (pre_prob.fastprob<0.5)]

  """Entry point for launching an IPython kernel.


In [245]:
# Share of `pre` rows where both models are wrong. The first name was
# misspelled as `pre_mainfasle_...`, which raises NameError on a fresh kernel.
(len(pre_mainfalse_fastfalse_yfalse)+len(pre_mainfalse_fastfalse_ytrue))/len(pre_prob)

0.2044161165490553

In [247]:
(len(pre_mainfalse_fasttrue_ytrue)+len(pre_mainfalse_fasttrue_yfalse))/(len(pre_mainfalse_fasttrue_ytrue)+len(pre_mainfalse_fasttrue_yfalse)+
                                                                       len(pre_mainfalse_fastfalse_ytrue)+len(pre_mainfalse_fastfalse_yfalse))

0.41345525800130634

In [494]:
(len(pre_mainfalse_fasttrue_ytrue)+len(pre_mainfalse_fasttrue_yfalse))-(len(pre_maintrue_fastfalse_yfalse)+len(pre_maintrue_fastfalse_ytrue))

-1042

In [250]:
(len(pre_maintrue_fastfalse_yfalse)+len(pre_maintrue_fastfalse_ytrue))/(
    len(pre_maintrue_fastfalse_ytrue)+len(pre_maintrue_fastfalse_yfalse)+
    len(pre_maintrue_fasttrue_ytrue)+len(pre_maintrue_fasttrue_yfalse))

0.2939902166317261

# sklearn  adaboost

In [313]:
from sklearn.model_selection import cross_val_score
# from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [314]:
clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2,min_samples_split=20,min_samples_leaf=4),algorithm="SAMME", 
                         n_estimators=200,learning_rate=0.8)

In [340]:
clf_1 = AdaBoostClassifier(n_estimators=200,learning_rate=0.9)

In [438]:
clf_2 = AdaBoostClassifier(n_estimators=700,learning_rate=0.9)

In [439]:
# score = cross_val_score(clf,data,label)

In [440]:
# model_ada = clf.fit(data,label)

In [441]:
# model_ada_1 = clf_1.fit(data,label)

In [2]:
# data

In [442]:
# NOTE(review): `label` (and `data_test` / `data_pre` used in the following
# cells) are not assigned in any visible cell, and `data` as last bound in the
# visible cells is the full frame that still contains the `perf` and `prob`
# columns -- confirm what this AdaBoost model was actually trained on before
# re-running.
model_ada_2 = clf_2.fit(data,label)

In [443]:
train_p_ada =  model_ada_2.predict_proba(data)

In [444]:
test_p_ada = model_ada_2.predict_proba(data_test)

In [445]:
pre_p_ada = model_ada_2.predict_proba(data_pre)

In [446]:
# train_ada = model_ada.predict(data)
# train_ada

In [447]:
# Positive-class probability (column 1 of predict_proba) for the train data.
# A column slice replaces the loop that rebuilt `list(train_p_ada)` on every
# iteration (quadratic in the number of rows).
train_prob_ada = np.asarray(train_p_ada)[:, 1]

In [448]:
# Positive-class probability (column 1 of predict_proba) for the test data.
# A column slice replaces the loop that rebuilt `list(test_p_ada)` on every
# iteration (quadratic in the number of rows).
test_prob_ada = np.asarray(test_p_ada)[:, 1]

In [449]:
# Positive-class probability (column 1 of predict_proba) for the `pre` data.
# A column slice replaces the loop that rebuilt `list(pre_p_ada)` on every
# iteration (quadratic in the number of rows).
pre_prob_ada = np.asarray(pre_p_ada)[:, 1]

In [450]:
train_prob_ada = np.array(train_prob_ada)

In [451]:
test_prob_ada = np.array(test_prob_ada)

In [452]:
pre_prob_ada = np.array(pre_prob_ada)

In [453]:
train_prob_ada

array([0.49974856, 0.49991385, 0.50047877, ..., 0.50011079, 0.49973949,
       0.49968061])

In [454]:
cal_ks_scipy(train_prob_ada,train_label)

0.38671678481277294

In [455]:
cal_ks_scipy(test_prob_ada,test_label)


0.3450229357798165

In [456]:
cal_ks_scipy(pre_prob_ada,pre_label)

0.3135390384376673

In [308]:
# NOTE(review): `model_ada` is only assigned in a commented-out cell, and the
# recorded output (n_estimators=100) matches none of clf / clf_1 / clf_2, so
# this cell reflects stale kernel state and fails on a fresh run.
model_ada.get_params()

{'algorithm': 'SAMME.R',
 'base_estimator': None,
 'learning_rate': 1.0,
 'n_estimators': 100,
 'random_state': None}