In [1]:
def eval_gini(y_true, y_prob):
    """Normalized Gini coefficient of labels ranked by predicted score.

    The samples are ordered by ``y_prob``; every (positive, negative) pair in
    which the negative is ranked above the positive counts as discordant, and
    the result is ``1 - 2 * discordant / (n_pos * n_neg)``.  A perfect ranking
    gives 1.0 and a perfectly inverted ranking gives -1.0.

    Parameters
    ----------
    y_true : array-like, 1-D
        Ground-truth labels (0/1).
    y_prob : array-like, 1-D
        Predicted scores, same length as ``y_true``.

    Returns
    -------
    float
        The Gini score, or ``nan`` when it is undefined (empty input, or the
        labels contain only one class so ``n_pos * n_neg`` is zero).

    Raises
    ------
    ValueError
        If the inputs are not 1-D sequences of equal length.

    Notes
    -----
    Tied scores are ordered by ``np.argsort``'s default sort, exactly as in the
    loop-based version this replaces.
    """
    # Accumulate in float64: labels often arrive as float32 (e.g. from a
    # booster dataset), and depending on the NumPy version the running sums
    # would otherwise be kept in float32 and lose precision on large inputs.
    y_true = np.asarray(y_true, dtype=np.float64)
    y_prob = np.asarray(y_prob)
    if y_true.ndim != 1 or y_true.shape != y_prob.shape:
        raise ValueError(
            "y_true and y_prob must be 1-D and of equal length, got shapes "
            "{} and {}".format(y_true.shape, y_prob.shape)
        )

    # Labels from the highest score down to the lowest.
    ranked = y_true[np.argsort(y_prob)][::-1]
    negatives = 1.0 - ranked
    # Exclusive cumulative sum: negatives ranked strictly above each sample.
    negatives_above = np.cumsum(negatives) - negatives
    discordant = np.dot(ranked, negatives_above)

    n_pos = ranked.sum()
    denom = n_pos * (ranked.size - n_pos)
    if denom == 0:
        # Empty input or a single class: the score is undefined.
        return float('nan')
    return 1 - 2 * discordant / denom
def gini_xgb(preds, dtrain):
    """Custom evaluation hook that reports the Gini score of ``preds``.

    The labels are read from ``dtrain`` with ``get_label()`` and scored
    against ``preds`` by ``eval_gini``.

    Returns a one-element list holding the pair ``('gini', score)``.
    """
    return [('gini', eval_gini(dtrain.get_label(), preds))]
def gini_xgb_neg(preds, dtrain):
    """Custom evaluation hook that reports the *negated* Gini score.

    Same as ``gini_xgb`` except that the score is multiplied by -1
    (presumably so it can be used where the metric is minimised -- confirm
    against the caller).

    Returns a one-element list holding the pair ``('gini', -score)``.
    """
    return [('gini', -eval_gini(dtrain.get_label(), preds))]
def gini_lgb(preds, dtrain):
    """Custom evaluation hook (passed as ``feval`` to ``lgb.train``) for Gini.

    The labels from ``dtrain.get_label()`` are materialised as a list and
    scored against ``preds`` by ``eval_gini``.

    Returns a one-element list holding ``('gini', score, True)``; the trailing
    ``True`` is the flag LightGBM reads as "higher is better".
    """
    return [('gini', eval_gini(list(dtrain.get_label()), preds), True)]
def gini_lgb_sk(y, preds):
    """Gini evaluation hook taking ``(y, preds)`` directly instead of a dataset.

    Returns a one-element list holding ``('gini', score, True)``, the same
    shape that ``gini_lgb`` produces.
    """
    return [('gini', eval_gini(y, preds), True)]


import warnings
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
from boruta import BorutaPy
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings("ignore")



In [2]:
# Load the pre-computed train/test feature tables from CSV.
# NOTE(review): `base_path` is an absolute, machine-specific path; consider a
# configurable data directory.
base_path = 'D:/DataSet/Credit/'
train = pd.read_csv(base_path + 'featured/train_all_feature.csv')
test = pd.read_csv(base_path + 'featured/test_all_feature.csv')

# Optional one-hot encoding switch (currently disabled).
# NOTE(review): get_dummies is applied to train and test independently, so the
# two frames may end up with different columns if a category is missing from
# one of them -- confirm before enabling.
one_hot = 0
if one_hot:
    train = pd.get_dummies(train)
    test = pd.get_dummies(test)

# Columns removed from both frames before modelling.
# The number trailing each entry looks like a per-feature score, and the
# `# 1`, `# 10`, `# 40`, ... markers look like score cut-offs that delimit the
# groups -- presumably feature-importance thresholds; TODO confirm.
col_to_drop = [
    # 1
    'count_house_loan_ln',# 0.97232912500359225)
    'count_house_loan',# 0.65395611105534779)
    'edu_level_other',# 0.57350198625873861)
    'count_payment_state_E_ln',# 0.51044888053676218)
    'count_attention_ln',# 0.32350891467937332)
    'count_commercial_loan',# 0.0)
    'count_sixty_ovd_dw',# 0.0)
    'count_sixty_ovd_months',# 0.0)
    'ind_sixty_max_duration',# 0.0)
    'marry_status_other',# 0.0)
    'count_study_loan_ln',# 0.0)
    'count_housing_accumulation_ln',# 0.0)
    'count_commercial_housing_ln',# 0.0)
    'count_combination_ensure_ln',# 0.0)
    'ind_other_counts_lnd',# 0.0)
    'count_combination_lnd',# 0.0)
    'count_pledge_guarantee_lnd',# 0.0)
    'count_ensure_lnd',# 0.0)
    'count_other_guarantee_lnd',# 0.0)
    'count_combination_ensure_lnd',# 0.0)
    'count_farmer_joint_lnd',# 0.0)
    'count_pledge_guarantee_bail_lnd',# 0.0)
    # 10
    'ind_curr_overdue_cyc_lnd',# 9.1747017468829988)
    'ind_unact_counts_lnd',# 8.1829607049772104)
    'ind_other_counts',# 8.1190294047611253)
    'count_pledge_guarantee_bail_ln',# 4.9866824922417035)
    'count_sharedebt',# 4.25470982342895)
    'marry_status_unmarried',# 3.5268450817329375)
    'count_normal_ln',# 3.1544560288384238)
    'count_farmer_joint_ln',# 2.8250789765495536)
    'not_clear_account_count',# 2.438362051462843)
    'count_car_loan_ln',# 2.0244134983035815)
    'count_payment_state_E_lnd',# 1.9511014375106948)
    # 40
    'ind_normal_counts',# 38.981578298058366)
    'count_ensure_ln',# 32.686012781155711)
    'not_logout_pre_account_count',# 29.035934862113912)
    'count_spl',# 26.621401271906876)
    'not_logout_pre_finance_org_count',# 20.88833411697636)
    'count_debit_card_ovd_dw',# 20.552771641255653)
    'ind_clear_counts_lnd',# 18.63633007759838)
    'cat_query_reason_mal',# 18.012892273855371)
    'count_other_guarantee_ln',# 15.719418555370231)
    'count_pledge_guarantee_ln',# 14.603172531948651)
    'count_standard_loancard',# 14.257967916620288)
    'count_combination_ln',# 13.225866749150008)
    'marry_status_divorced',# 12.996673186745294)
    'flt_highest_sixty_oa_per_mon',# 12.496437876453976)
    'count_farmer_loan_ln',# 12.034259912867824)
    # 60
    'curr_overdue_cyc_days',# 58.533138767951407)
    'not_logout_pre_max_credit_limit_per_org',# 55.545770066417433)
    'not_logout_pre_finance_corp_count',# 54.909995520093744)
    'has_fund',# 53.524477466077244)
    'edu_level_bachelor',# 51.681921368379456)
    'cat_query_reason_sqe',# 50.507370364531575)
    'not_clear_finance_org_count',# 50.499752342403504)
    'count_payment_state_B_ln',# 49.675949679726415)
    'count_ovd',# 47.284280675138668)
    'marry_status_married',# 47.141063732728689)
    # 100
    'count_payment_state_D_ln',# 98.582591891500016)
    'not_logout_finance_org_count',# 97.63379220401383)
    'count_credit_loan_ln',# 96.057688729626022)
    'count_payment_state_D_lnd',# 94.52677681813546)
    'all_highest_oa_per_mon',# 93.758390716396264)
    'count_consumption loan_ln',# 92.34403542313882)
    'not_logout_pre_latest_6m_used_avg_amount',# 85.357040829001591)
    'balance',# 85.186190705260103)
    'cat_query_reason_la',# 83.737758346562558)
    'ind_clear_counts',# 82.467554392775014)
    'count_unknown_ln',# 73.400641666623201)
    'not_logout_pre_min_credit_limit_per_org',# 70.752886069309909)
    'not_logout_pre_credit_limit',# 68.394379619115668)
    'not_logout_account_count',# 67.926934887483739)
    'lnd_ovd_sum_amount',# 67.598474433716945)
    'flt_highest_debit_card_oa_per_mon',# 64.9057845815354)
    'count_loan_ovd_dw',# 64.132474960621153)
    'count_operating_loan_ln',# 63.127772355688421)
    'not_clear_finance_corp_count',# 62.877948676798582)
    # 180
    'not_logout_latest_6m_used_avg_amount',# 176.32317263727683)
    'ind_loan_max_duration',# 167.56724876084013)
    'count_credit_loan_lnd',# 166.96045957856194)
    'all_max_duration',# 153.57957111737943)
    'not_clear_balance',# 153.18548993022915)
    'flt_highest_loan_oa_per_mon',# 150.5264464131252)
    'flt_sum_amount',# 149.83162201460962)
    'used_highest_amount_lnd',# 136.85598322859255)
    'scheduled_payment_amount',# 128.02030046387119)
    'latest6_month_used_avg_amount_lnd',# 124.58135686300396)
    'ind_org_counts',# 120.5490321578975)
    'ind_debit_card_max_duration',# 118.53218655429413)
    'edu_level_junior',# 117.02178169627837)
    'lnd_ovd_sum_last_months',# 114.74345491704378)
    'used_credit_limit_amount_lnd',# 114.2672848868574)
    'ave_ovd_amount',# 114.06571798355444)
    'count_other_loan',# 111.82596886962884)
    'range_lnd_ovd',# 109.13363916690426)
    'not_logout_pre_used_credit_limit',# 105.64340446411771)
    'count_sum_ovd_dw',# 104.97563024241767)
    # flt_noise
    'not_logout_max_credit_limit_per_org',# 211.80030432198566)
    # 240
    #'not_logout_finance_corp_count',# 217.97120979038462)
]

# Remove the listed columns from both frames; with pandas' default
# errors='raise' this fails with KeyError if a listed column is absent.
train = train.drop(col_to_drop, axis=1)
test = test.drop(col_to_drop,axis=1)

In [None]:
# Alternative data-loading cell: reads the `_log` variant of the feature tables
# and rebinds `train` / `test`, replacing whatever an earlier cell loaded.
# NOTE(review): this cell carries no execution count in the saved notebook --
# presumably it was run in a different session to produce the "log" results;
# confirm which loading cell the reported scores belong to.
# NOTE(review): `base_path` is an absolute, machine-specific path.
base_path = 'D:/DataSet/Credit/'
train = pd.read_csv(base_path + 'featured/train_all_feature_log.csv')
test = pd.read_csv(base_path + 'featured/test_all_feature_log.csv')

# Optional one-hot encoding switch (currently disabled).
# NOTE(review): get_dummies is applied to train and test independently, so the
# two frames may end up with different columns -- confirm before enabling.
one_hot = 0
if one_hot:
    train = pd.get_dummies(train)
    test = pd.get_dummies(test)

# Columns removed from both frames before modelling (no scores are recorded
# for this list).
col_to_drop = [ 
'ind_normal_counts_lnd', 
'count_month', 
'count_loan_ovd_months',
'not_logout_used_credit_limit', 
'latest6_month_used_avg_amount_lnd', 
'scheduled_payment_amount', 
'not_logout_max_credit_limit_per_org', 
'flt_highest_loan_oa_per_mon', 
'not_clear_balance', 
'ind_loan_max_duration', 
'used_credit_limit_amount_lnd', 
'not_logout_finance_corp_count', 
'used_highest_amount_lnd', 
'all_max_duration', 
'count_credit_loan_lnd', 
'not_logout_latest_6m_used_avg_amount', 
'ind_org_counts', 
'count_sum_ovd_dw', 
'all_highest_oa_per_mon', 
'flt_sum_amount', 
'count_credit_loan_ln', 
'ind_debit_card_max_duration', 
'count_payment_state_D_ln', 
'edu_level_junior', 
'not_logout_finance_org_count', 
'lnd_ovd_sum_last_months', 
'balance', 
'count_operating_loan_ln', 
'range_lnd_ovd', 
'count_payment_state_D_lnd', 
'ave_ovd_amount', 
'count_other_loan', 
'not_logout_pre_credit_limit', 
'edu_level_bachelor', 
'not_clear_finance_org_count', 
'not_logout_pre_used_credit_limit', 
'cat_query_reason_la', 
'curr_overdue_cyc_days', 
'ind_clear_counts', 
'lnd_ovd_sum_amount', 
'not_logout_pre_min_credit_limit_per_org', 
'count_loan_ovd_dw', 
'not_logout_pre_finance_corp_count', 
'count_consumption loan_ln', 
'count_unknown_ln', 
'not_clear_finance_corp_count', 
'marry_status_married', 
'not_logout_account_count', 
'flt_highest_debit_card_oa_per_mon', 
'cat_query_reason_sqe', 
'count_payment_state_B_ln', 
'not_logout_pre_latest_6m_used_avg_amount', 
'has_fund', 
'count_spl', 
'count_pledge_guarantee_ln', 
'count_ovd', 
'not_logout_pre_max_credit_limit_per_org', 
'ind_normal_counts', 
'ind_clear_counts_lnd', 
'count_ensure_ln', 
'flt_highest_sixty_oa_per_mon', 
'not_logout_pre_finance_org_count', 
'not_logout_pre_account_count', 
'count_debit_card_ovd_dw',
'count_standard_loancard', 
'marry_status_divorced', 
'edu_level_other', 
'count_farmer_loan_ln', 
'ind_other_counts', 
'count_sharedebt', 
'count_normal_ln',
'count_combination_ln',
'marry_status_unmarried',
'cat_query_reason_mal',
'not_clear_account_count',
'ind_curr_overdue_cyc_lnd',
'ind_unact_counts_lnd',
'count_attention_ln',
'count_pledge_guarantee_bail_ln',
'count_payment_state_E_ln',
'count_other_guarantee_ln',
'count_payment_state_E_lnd',
'count_car_loan_ln', 
'count_house_loan_ln', 
'count_house_loan', 
'count_commercial_loan', 
'count_study_loan_ln', 
'count_housing_accumulation_ln', 
'count_commercial_housing_ln', 
'count_combination_ensure_ln', 
'count_farmer_joint_ln', 
'ind_other_counts_lnd', 
'count_combination_lnd', 
'count_pledge_guarantee_lnd', 
'count_ensure_lnd', 
'count_other_guarantee_lnd', 
'count_combination_ensure_lnd', 
'count_farmer_joint_lnd', 
'count_pledge_guarantee_bail_lnd', 
'count_sixty_ovd_dw', 
'count_sixty_ovd_months', 
'ind_sixty_max_duration', 
'marry_status_other', 
]

# Remove the listed columns from both frames; with pandas' default
# errors='raise' this fails with KeyError if a listed column is absent.
train = train.drop(col_to_drop, axis=1)
test = test.drop(col_to_drop,axis=1)

In [3]:
# Repeated stratified K-fold training of a LightGBM binary classifier.
# For every repeat, out-of-fold predictions are collected for the whole
# training set and scored with eval_gini; test-set predictions are averaged
# over every fold of every repeat.

# LightGBM parameters shared by all folds.
params = {
    'learning_rate': 0.024,
    'max_depth': 4,
    'lambda_l1': 15,
    'boosting': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'feature_fraction': 0.7,
    'is_training_metric': False,
}

features = train.columns.drop(['y','report_id'])
X = train.drop(['y','report_id'],axis=1)
y = train.y
# The test matrix does not change between folds, so build it once.
X_test = test.drop(['report_id'],axis=1)

# Float initial values so that later float assignments keep a single dtype.
sub = test.report_id.to_frame()
sub['pred'] = 0.0
sub_train = train.report_id.to_frame()
sub_train['y'] = train.y
sub_train['pred'] = 0.0

GINI = []
K = 5                  # number of CV folds
N_REPEATS = 20         # number of times the K-fold procedure is repeated
SEED = 1               # base seed; repeat r shuffles with SEED + r so runs are reproducible
nrounds = 20000000     # upper bound on boosting rounds; early stopping ends training
for haha in range(N_REPEATS):
    skf = StratifiedKFold(n_splits=K, random_state=SEED + haha, shuffle=True)
    # Out-of-fold predictions for this repeat.  They are gathered in a plain
    # array because `sub_train['pred'].iloc[idx] = ...` is chained assignment,
    # which does not write back to the frame under pandas Copy-on-Write.
    oof_pred = np.zeros(len(train))
    for i, (train_index, valid_index) in enumerate(skf.split(train, y)):
        X_train, X_valid = X.iloc[train_index, :], X.iloc[valid_index, :]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        lgb_model = lgb.train(params,
                              lgb.Dataset(X_train, label=y_train),
                              nrounds,
                              lgb.Dataset(X_valid, label=y_valid),
                              verbose_eval=3000,
                              feval=gini_lgb,
                              early_stopping_rounds=120)

        # Predict with the early-stopped best iteration explicitly rather than
        # relying on the library default for `num_iteration`.
        best_iter = lgb_model.best_iteration
        oof_pred[valid_index] = lgb_model.predict(X_valid, num_iteration=best_iter)
        # Average over all K * N_REPEATS models; dividing by K alone left the
        # test predictions summed across repeats (N_REPEATS times too large).
        sub['pred'] += lgb_model.predict(X_test, num_iteration=best_iter) / (K * N_REPEATS)

    sub_train['pred'] = oof_pred
    repeat_gini = eval_gini(y, sub_train.pred)
    print( "\nGini for full training set:" )
    print(repeat_gini)
    GINI.append(repeat_gini)

Train until valid scores didn't improve in 120 rounds.
Early stopping, best iteration is:
[602]	valid_0's auc: 0.865588	valid_0's gini: 0.731176
Train until valid scores didn't improve in 120 rounds.
Early stopping, best iteration is:
[695]	valid_0's auc: 0.878074	valid_0's gini: 0.756148
Train until valid scores didn't improve in 120 rounds.
Early stopping, best iteration is:
[748]	valid_0's auc: 0.878442	valid_0's gini: 0.756883
Train until valid scores didn't improve in 120 rounds.
Early stopping, best iteration is:
[715]	valid_0's auc: 0.88136	valid_0's gini: 0.76272
Train until valid scores didn't improve in 120 rounds.
Early stopping, best iteration is:
[712]	valid_0's auc: 0.874601	valid_0's gini: 0.749202

Gini for full training set:
0.751201185185
Train until valid scores didn't improve in 120 rounds.
Early stopping, best iteration is:
[541]	valid_0's auc: 0.882983	valid_0's gini: 0.765966
Train until valid scores didn't improve in 120 rounds.
Early stopping, best iteration is

Train until valid scores didn't improve in 120 rounds.
Early stopping, best iteration is:
[663]	valid_0's auc: 0.885175	valid_0's gini: 0.770351

Gini for full training set:
0.750317207704
Train until valid scores didn't improve in 120 rounds.
Early stopping, best iteration is:
[541]	valid_0's auc: 0.874604	valid_0's gini: 0.749208
Train until valid scores didn't improve in 120 rounds.
Early stopping, best iteration is:
[597]	valid_0's auc: 0.893687	valid_0's gini: 0.787375
Train until valid scores didn't improve in 120 rounds.
Early stopping, best iteration is:
[794]	valid_0's auc: 0.865593	valid_0's gini: 0.731186
Train until valid scores didn't improve in 120 rounds.
Early stopping, best iteration is:
[678]	valid_0's auc: 0.865134	valid_0's gini: 0.730267
Train until valid scores didn't improve in 120 rounds.
Early stopping, best iteration is:
[777]	valid_0's auc: 0.875087	valid_0's gini: 0.750173

Gini for full training set:
0.748988207407
Train until valid scores didn't improve in

In [8]:
# Write `sub_train` and `sub` to CSV files (without the index) under `base_path`.
# NOTE(review): the file names say "xgboost", but the only training cell
# visible in this notebook fits LightGBM models -- confirm the names are intended.
sub_train.to_csv(base_path+'result/xgboost_train.csv', index=False)
sub.to_csv(base_path+'result/xgboost_test.csv', index=False)

In [5]:
# Mean of the per-repeat Gini scores collected in GINI.
np.mean(GINI)

0.74963507200000001

In [18]:
# Keep an independent (shallow) copy of the current GINI list under another name,
# so that a later rebinding or in-place edit of GINI does not affect it.
gini_log = list(GINI)

In [4]:
# Display the list of Gini scores, one entry per repeat of the CV loop.
GINI

[0.75120118518518519,
 0.75114664770370365,
 0.74995766992592594,
 0.75118514251851853,
 0.74984495407407414,
 0.74977232592592591,
 0.75092406044444449,
 0.75085947259259256,
 0.75035005155555556,
 0.75163831940740744,
 0.75031720770370369,
 0.74898820740740746,
 0.74959138133333325,
 0.75295010133333329,
 0.75161783940740734,
 0.74984813985185184,
 0.75138812207407413,
 0.75076268562962967,
 0.75325548088888894,
 0.75124589985185186]

In [23]:
# Analysis of variance: compare the two collections of Gini scores.
# NOTE(review): `gini` is not defined in any cell visible here -- presumably a
# copy of GINI saved from another run; confirm, otherwise this cell fails on a
# fresh kernel.
from scipy import stats
args = [gini,gini_log]
w,p = stats.levene(*args)
# Levene's test for homogeneity of variances
print(w,p)
# One-way ANOVA (note: `p` is rebound here, replacing the Levene p-value)
f,p = stats.f_oneway(*args)
print(f,p)

4.94078894126 0.0322552357304
0.317354311449 0.576511297689
