In [1]:
def eval_gini(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini
def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = eval_gini(labels, preds)
    return [('gini', gini_score)]
def gini_xgb_neg(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = -eval_gini(labels, preds)
    return [('gini', gini_score)]
def gini_lgb(preds, dtrain):
    y = list(dtrain.get_label())
    score = eval_gini(y, preds)
    return [('gini', score, True)]
def gini_lgb_sk(y,preds):
    score = eval_gini(y, preds)
    return [('gini', score, True)]

######################################################
### 参数解释：                                     ###
###     all_col:      训练数据的 columns           ###
###     importances:  交叉验证得到的 importances   ###
###     K:            交叉验证的折数               ###
def ave_importance(all_col, importances, K=5, isxgb = True):
    # 创造总字典
    final_importance = dict(zip(list(all_col),np.zeros(len(all_col))))                 # 字典的 key 为 train 的列，values 为 0
    
    # 各个字典除以K
    for model_index in range(K):                                                       # 对每个模型得到的 importances 进行循环
        for each_key in importances[model_index].keys():                               # 对每个 importances 结果的每个 key 进行循环
            importances[model_index][each_key] = importances[model_index][each_key]/K  # 将该 key 对应的 value 除以 K，并替换
    
    # 各个字典的 key 转换成相应的变量名
    if isxgb:
        for model_index in range(K):
            keys_index = list(importances[model_index].keys())                             # 获得该 importances 的 keys
            values = list(importances[model_index].values())                               # 获得该 importances 的 values
            for i in range(len(keys_index)):
                keys_index[i] = int(keys_index[i][1:])       # 对每个 key 取其除了首字母“f”后面的数值部分，转化为 int 型，作为 index
            importances[model_index] = dict(zip(list(all_col[keys_index]),values))         # 将 keys 替换为指标名        
    
    # 各个字典相加到总字典种
    for model_index in range(K):
        for keys, values in importances[model_index].items(): 
            final_importance[keys] += values

    return final_importance

######################################################
### 参数解释：                                     ###
###     all_col:      训练数据的 columns           ###
###     importances:  交叉验证得到的 importances   ###
###     K:            交叉验证的折数               ###
def ave_importance_lgb(all_col, importances, K=5):
    # 创造总字典
    final_importance = dict(zip(list(all_col),np.zeros(len(all_col))))                 # 字典的 key 为 train 的列，values 为 0
    
    # 各个字典除以K
    for model_index in range(K):                                                       # 对每个模型得到的 importances 进行循环
        for each_key in importances[model_index].keys():                               # 对每个 importances 结果的每个 key 进行循环
            importances[model_index][each_key] = importances[model_index][each_key]/K  # 将该 key 对应的 value 除以 K，并替换       
    
    # 各个字典相加到总字典种
    for model_index in range(K):
        for keys, values in importances[model_index].items(): 
            final_importance[keys] += values

    return final_importance

import warnings
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
from boruta import BorutaPy
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings("ignore")



In [4]:
base_path = 'D:/DataSet/Credit/'
train = pd.read_csv(base_path + 'featured/train_all_feature_log.csv')
test = pd.read_csv(base_path + 'featured/test_all_feature_log.csv')

one_hot = 0
if one_hot:
    train = pd.get_dummies(train)
    test = pd.get_dummies(test)

add_noise = 0
if add_noise:
    train['noise_flt'] = np.random.normal(size = len(train))
    train['noise_bin'] = np.random.binomial(1,0.5,size=len(train))
    test['noise_flt'] = np.random.normal(size = len(test))
    test['noise_bin'] = np.random.binomial(1,0.5,size=len(test))
    
col_to_drop = [
    # 1
    'count_house_loan_ln',# 0.97232912500359225)
    'count_house_loan',# 0.65395611105534779)
    'edu_level_other',# 0.57350198625873861)
    'count_payment_state_E_ln',# 0.51044888053676218)
    'count_attention_ln',# 0.32350891467937332)
    'count_commercial_loan',# 0.0)
    'count_sixty_ovd_dw',# 0.0)
    'count_sixty_ovd_months',# 0.0)
    'ind_sixty_max_duration',# 0.0)
    'marry_status_other',# 0.0)
    'count_study_loan_ln',# 0.0)
    'count_housing_accumulation_ln',# 0.0)
    'count_commercial_housing_ln',# 0.0)
    'count_combination_ensure_ln',# 0.0)
    'ind_other_counts_lnd',# 0.0)
    'count_combination_lnd',# 0.0)
    'count_pledge_guarantee_lnd',# 0.0)
    'count_ensure_lnd',# 0.0)
    'count_other_guarantee_lnd',# 0.0)
    'count_combination_ensure_lnd',# 0.0)
    'count_farmer_joint_lnd',# 0.0)
    'count_pledge_guarantee_lnd.1',# 0.0)
    # 10
    'ind_curr_overdue_cyc_lnd',# 9.1747017468829988)
    'ind_unact_counts_lnd',# 8.1829607049772104)
    'ind_other_counts',# 8.1190294047611253)
    'count_pledge_guarantee_ln.1',# 4.9866824922417035)
    'count_sharedebt',# 4.25470982342895)
    'marry_status_unmarried',# 3.5268450817329375)
    'count_normal_ln',# 3.1544560288384238)
    'count_farmer_joint_ln',# 2.8250789765495536)
    'not_clear_account_count',# 2.438362051462843)
    'count_car_loan_ln',# 2.0244134983035815)
    'count_payment_state_E_lnd',# 1.9511014375106948)
    # 40
    'ind_normal_counts',# 38.981578298058366)
    'count_ensure_ln',# 32.686012781155711)
    'not_logout_pre_account_count',# 29.035934862113912)
    'count_spl',# 26.621401271906876)
    'not_logout_pre_finance_org_count',# 20.88833411697636)
    'count_debit_card_ovd_dw',# 20.552771641255653)
    'ind_clear_counts_lnd',# 18.63633007759838)
    'cat_query_reason_mal',# 18.012892273855371)
    'count_other_guarantee_ln',# 15.719418555370231)
    'count_pledge_guarantee_ln',# 14.603172531948651)
    'count_standard_loancard',# 14.257967916620288)
    'count_combination_ln',# 13.225866749150008)
    'marry_status_divorced',# 12.996673186745294)
    'flt_highest_sixty_oa_per_mon',# 12.496437876453976)
    'count_farmer_loan_ln',# 12.034259912867824)
    # 60
    'curr_overdue_cyc_days',# 58.533138767951407)
    'not_logout_pre_max_credit_limit_per_org',# 55.545770066417433)
    'not_logout_pre_finance_corp_count',# 54.909995520093744)
    'has_fund',# 53.524477466077244)
    'edu_level_bachelor',# 51.681921368379456)
    'cat_query_reason_sqe',# 50.507370364531575)
    'not_clear_finance_org_count',# 50.499752342403504)
    'count_payment_state_B_ln',# 49.675949679726415)
    'count_ovd',# 47.284280675138668)
    'marry_status_married',# 47.141063732728689)
    # 100
    'count_payment_state_D_ln',# 98.582591891500016)
    'not_logout_finance_org_count',# 97.63379220401383)
    'count_credit_loan_ln',# 96.057688729626022)
    'count_payment_state_D_lnd',# 94.52677681813546)
    'all_highest_oa_per_mon',# 93.758390716396264)
    'count_consumption loan_ln',# 92.34403542313882)
    'not_logout_pre_latest_6m_used_avg_amount',# 85.357040829001591)
    'balance',# 85.186190705260103)
    'cat_query_reason_la',# 83.737758346562558)
    'ind_clear_counts',# 82.467554392775014)
    'count_unknown_ln',# 73.400641666623201)
    'not_logout_pre_min_credit_limit_per_org',# 70.752886069309909)
    'not_logout_pre_credit_limit',# 68.394379619115668)
    'not_logout_account_count',# 67.926934887483739)
    'lnd_ovd_sum_amount',# 67.598474433716945)
    'flt_highest_debit_card_oa_per_mon',# 64.9057845815354)
    'count_loan_ovd_dw',# 64.132474960621153)
    'count_operating_loan_ln',# 63.127772355688421)
    'not_clear_finance_corp_count',# 62.877948676798582)
    # 180
    'not_logout_latest_6m_used_avg_amount',# 176.32317263727683)
    'ind_loan_max_duration',# 167.56724876084013)
    'count_credit_loan_lnd',# 166.96045957856194)
    'all_max_duration',# 153.57957111737943)
    'not_clear_balance',# 153.18548993022915)
    'flt_highest_loan_oa_per_mon',# 150.5264464131252)
    'flt_sum_amount',# 149.83162201460962)
    'used_highest_amount_lnd',# 136.85598322859255)
    'scheduled_payment_amount',# 128.02030046387119)
    'latest6_month_used_avg_amount_lnd',# 124.58135686300396)
    'ind_org_counts',# 120.5490321578975)
    'ind_debit_card_max_duration',# 118.53218655429413)
    'edu_level_junior',# 117.02178169627837)
    'lnd_ovd_sum_last_months',# 114.74345491704378)
    'used_credit_limit_amount_lnd',# 114.2672848868574)
    'ave_ovd_amount',# 114.06571798355444)
    'count_other_loan',# 111.82596886962884)
    'range_lnd_ovd',# 109.13363916690426)
    'not_logout_pre_used_credit_limit',# 105.64340446411771)
    'count_sum_ovd_dw',# 104.97563024241767)
    # flt_noise
    'not_logout_max_credit_limit_per_org',# 211.80030432198566)
    # 240
    #'not_logout_finance_corp_count',# 217.97120979038462)
]

good_feature = [
    'work_province',# 30694.650976051769)
    'count_payment_state_C_lnd',# 2982.003392965335)
    'agent',# 2602.0783870214441)
    'ind_query_reason_1',# 2533.8643067204089)
    'edu_level_under',# 1859.9270182980599)
    'ind_query_reason_sum',# 1383.3172016898654)
    'salary',# 1211.8799915686996)
    'actual_payment_amount_lnd',# 998.26369029401383)
    'credit_limit_amount',# 922.24288539403824)
    'not_clear_latest_6m_used_avg_amount',# 821.60081160725144)
    'credit_limit_amount_lnd',# 772.31533574258708)
    'not_clear_credit_limit',# 665.24639282660962)
    'count_payment_state_B_lnd',# 547.27779239096174)
    'is_local',# 508.77236411435695)
    'count_sum_ovd_months',# 498.09872004178482)
    'curr_overdue_amount_lnd',# 468.78860263321621)
    'count_payment_state_C_ln',# 466.40915096322885)
    'count_payment_state_A_lnd',# 466.38884880989008)
    'actual_payment_amount',# 441.74983784890975)
    'remain_payment_cyc_days',# 426.75063460575694)
    'ind_query_reason_0',# 411.1863172129091)
    'edu_level_middle',# 409.12973816195012)
    'count_other_loan_ln',# 405.44576356536675)
    'share_credit_limit_amount_lnd',# 368.8973182499239)
    'scheduled_payment_amount_lnd',# 357.21129907441025)
    'curr_overdue_amount',# 334.78859657977409)
    'not_logout_credit_limit',# 301.16843215738373)
    'count_debit_card_ovd_months',# 296.58511155734573)
    'not_logout_used_credit_limit',# 296.47367085191996)
    'count_loancard',# 282.10276677553202)
    'count_month',# 279.2126415116054)
    'count_payment_state_A_ln',# 278.65120956624384)
    'count_loan_ovd_months',# 278.22592234161147)
    'ind_query_reason_2',# 266.19281460907439)
    'ind_normal_counts_lnd',# 247.77093500725147)
    'not_logout_min_credit_limit_per_org',# 242.36182331351182)
 ]
#train = train.drop(col_to_drop, axis=1)
#test = test.drop(col_to_drop,axis=1)

In [5]:
params = {
    'learning_rate': 0.024, 
    'max_depth': 4, 
    'lambda_l1': 15,
    'boosting': 'gbdt', 
    'objective': 'binary', 
    'metric': 'auc',
    'feature_fraction': 0.7,
    'is_training_metric': False, 
}
gini = []

features = train.columns.drop(['y','report_id'])    # 列名
X = train.drop(['y','report_id'],axis=1)            # 不包含 target 的数据集
#X = train.drop(['y','report_id'],axis=1).loc[:,feat_selector.support_]
y = train.y                                         # 目标数据
#sub=test.report_id.to_frame()                      # 用于储存结果
#sub['y']=0                                         # 初始化为 0
sub_train=0*train.y                                 # oof
models = []                                         # 用于储存 k 个模型
importances = []                                    # 用于储存 k 组重要性
nrounds=20000                                       # 最大轮数
K = 10                                             # 折数

skf = StratifiedKFold(n_splits=K, random_state=int(time.time()), shuffle = True)    # random_state=1
for i, (train_index, valid_index) in enumerate(skf.split(train, y)):
    #print(' lgb kfold: {}  of  {} : '.format(i+1, K))
    X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[valid_index,:].copy()
    y_train, y_valid = y.iloc[train_index].copy(), y.iloc[valid_index]
    X_test = test.drop(['report_id'],axis=1)

    lgb_model = lgb.train(params, 
                          lgb.Dataset(X_train, label=y_train), 
                          nrounds, 
                          lgb.Dataset(X_valid, label=y_valid), 
                          verbose_eval=3000, 
                          feval=gini_lgb, 
                          early_stopping_rounds=120)

    pred_valid = lgb_model.predict(X_valid)
    sub_train.iloc[valid_index] = pred_valid   
    #print( "  Gini = ", eval_gini(y_valid, pred_valid) )
    #sub['y'] += lgb_model.predict(X_test)/K

    importances.append(lgb_model.feature_importance(importance_type='gain'))         # 获取变量重要性
    models.append(lgb_model)                                                         # 获取每个 fold 的模型

print( "\nGini for full training set:" )
print(eval_gini(y, sub_train))
gini.append(eval_gini(y, sub_train))

Train until valid scores didn't improve in 120 rounds.
Early stopping, best iteration is:
[873]	valid_0's auc: 0.875491	valid_0's gini: 0.750981
Train until valid scores didn't improve in 120 rounds.
Early stopping, best iteration is:
[667]	valid_0's auc: 0.86843	valid_0's gini: 0.73686
Train until valid scores didn't improve in 120 rounds.
Early stopping, best iteration is:
[816]	valid_0's auc: 0.884703	valid_0's gini: 0.769406
Train until valid scores didn't improve in 120 rounds.
Early stopping, best iteration is:
[652]	valid_0's auc: 0.874394	valid_0's gini: 0.748788
Train until valid scores didn't improve in 120 rounds.
Early stopping, best iteration is:
[871]	valid_0's auc: 0.877796	valid_0's gini: 0.755591
Train until valid scores didn't improve in 120 rounds.
Early stopping, best iteration is:
[511]	valid_0's auc: 0.877447	valid_0's gini: 0.754893
Train until valid scores didn't improve in 120 rounds.
Early stopping, best iteration is:
[448]	valid_0's auc: 0.868676	valid_0's gi

In [5]:
importesces_dict = []
for i in range(5):
    importesces_dict.append(dict(zip(features,importances[i])))
final_importance = ave_importance(features, importesces_dict, K=5, isxgb=False)
for i in range(len(sorted(final_importance.items(), key=lambda d:d[1], reverse = True))):
    print(sorted(final_importance.items(), key=lambda d:d[1], reverse = True)[i])    

('work_province', 35058.022636348061)
('count_payment_state_C_lnd', 3414.9904584321675)
('agent', 3283.5049437987846)
('ind_query_reason_1', 2773.6412404622388)
('edu_level_under', 2214.9889235920209)
('ind_query_reason_sum', 1752.8734360529254)
('salary', 1257.3471306484353)
('credit_limit_amount', 1061.8608495518397)
('actual_payment_amount_lnd', 1049.5293160642425)
('credit_limit_amount_lnd', 875.510153937214)
('not_clear_latest_6m_used_avg_amount', 869.60554335850668)
('not_clear_credit_limit', 689.09851182631803)
('is_local', 640.73547701837788)
('count_sum_ovd_months', 618.50474583277787)
('curr_overdue_amount_lnd', 536.96315163728457)
('count_other_loan_ln', 516.3596885883245)
('count_payment_state_C_ln', 486.7959403540782)
('actual_payment_amount', 485.27826446911456)
('count_payment_state_B_lnd', 481.09856862385203)
('remain_payment_cyc_days', 458.54790925454012)
('edu_level_middle', 456.70096486093485)
('share_credit_limit_amount_lnd', 446.95485839533512)
('count_payment_stat

In [23]:
params = {
    'objective':'binary:logistic',
    'eta':0.01,
    'silent':True,
    'max_depth':4,
    'subsample':0.8,
    'colsample_bytree':0.8,
    'eval_metric': 'auc',
}

features = train.columns.drop(['y','report_id'])    # 列名
X = train.drop(['y','report_id'],axis=1).as_matrix()# 不包含 target 的数据集
y = train.y.as_matrix()                             # 目标数据
sub_train=0*train.y                                 # oof
sub=test.report_id.to_frame()                       # 用于储存结果
sub['y']=0                                          # 初始化为 0
models = []                                         # 用于储存 k 个模型
importances = []                                    # 用于储存 k 组重要性
nrounds=20000                                       # 最大轮数
kfold = 5                                           # 折数

skf = StratifiedKFold(n_splits=kfold, random_state=99)
for i, (train_index, valid_index) in enumerate(skf.split(X, y)):
    print(' xgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_valid = X[train_index], X[valid_index]                       # 数据分配，根据 CV 来分配训练集与验证集
    y_train, y_valid = y[train_index], y[valid_index]
    xgb_train = xgb.DMatrix(X_train, y_train)                               # 数据装载，将训练集与验证集数据，装载到 xgboost 算法中
    xgb_valid = xgb.DMatrix(X_valid, y_valid) 
    watchlist = [(xgb_train, 'train'), (xgb_valid, 'valid')]                # 观察数据
    
    xgb_model = xgb.train(params = params, 
                          dtrain = xgb_train, 
                          num_boost_round = nrounds,
                          evals = watchlist, 
                          early_stopping_rounds = 200, 
                          feval = gini_xgb, 
                          maximize = True, 
                          verbose_eval = 500)
    
    sub_train.iloc[valid_index] += xgb_model.predict(xgb_valid, ntree_limit=xgb_model.best_ntree_limit)
    #sub['y'] += xgb_model.predict(xgb.DMatrix(test[features].values), ntree_limit=xgb_model.best_ntree_limit) / (kfold)
    importances.append(xgb_model.get_score(importance_type='gain'))         # 获取变量重要性
    models.append(xgb_model)                                                # 获取每个 fold 的模型

print('oof gini is:')
print(eval_gini(train.y, sub_train))

In [24]:
# 方差分析
from scipy import stats
args = [gini_all,gini_240]
w,p = stats.levene(*args)
#levene方差齐性检验
print(w,p)
#进行方差分析
f,p = stats.f_oneway(*args)
print(f,p)