## 1、导入库、数据

In [3]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import roc_auc_score, auc, roc_curve, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import pickle
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import os
from sklearn.preprocessing import StandardScaler, QuantileTransformer, KBinsDiscretizer, LabelEncoder, MinMaxScaler, PowerTransformer

print(xgb.__version__)
print(lgb.__version__)

1.5.1
3.2.1


In [4]:
logging.info('data loading...')
train = pd.read_csv('../xfdata/车辆贷款违约预测数据集/train.csv')
test = pd.read_csv('../xfdata/车辆贷款违约预测数据集/test.csv')

2021-12-08 14:36:50,611 : INFO : data loading...


## 2、特征工程

In [5]:

def gen_new_feats(train, test):
    '''生成新特征：如年利率/分箱等特征'''
    # Step 1: 合并训练集和测试集
    data = pd.concat([train, test])

    # Step 2: 具体特征工程
    # 计算二级账户的年利率
    data['sub_Rate'] = (data['sub_account_monthly_payment'] * data['sub_account_tenure'] - data[
        'sub_account_sanction_loan']) / data['sub_account_sanction_loan']

    # 计算主账户的年利率
    data['main_Rate'] = (data['main_account_monthly_payment'] * data['main_account_tenure'] - data[
        'main_account_sanction_loan']) / data['main_account_sanction_loan']

    # 对部分特征进行分箱操作
    # 等宽分箱
    loan_to_asset_ratio_labels = [i for i in range(10)]
    data['loan_to_asset_ratio_bin'] = pd.cut(data["loan_to_asset_ratio"], 10, labels=loan_to_asset_ratio_labels)
    # 等频分箱
    data['asset_cost_bin'] = pd.qcut(data['asset_cost'], 10, labels=loan_to_asset_ratio_labels)
    # 自定义分箱
    amount_cols = [
                   'total_monthly_payment',
                   'main_account_sanction_loan',
                   'main_account_disbursed_loan',
                   'sub_account_sanction_loan',
                   'sub_account_disbursed_loan',
                   'main_account_monthly_payment',
                   'sub_account_monthly_payment',
                   'total_sanction_loan'
                ]
    amount_labels = [i for i in range(10)]
    for col in amount_cols:
        total_monthly_payment_bin = [-1, 5000, 10000, 30000, 50000, 100000, 300000, 500000, 1000000, 3000000, data[col].max()]
        data[col + '_bin'] = pd.cut(data[col], total_monthly_payment_bin, labels=amount_labels).astype(int)

    # Step 3: 返回包含新特征的训练集 & 测试集
    return data[data['loan_default'].notnull()], data[data['loan_default'].isnull()]

## 3、target 编码

In [6]:
def gen_target_encoding_feats(train, test, encode_cols, target_col, n_fold=10):
    '''生成target encoding特征'''
    # for training set - cv
    tg_feats = np.zeros((train.shape[0], len(encode_cols)))
    kfold = StratifiedKFold(n_splits=n_fold, random_state=1024, shuffle=True)
    for _, (train_index, val_index) in enumerate(kfold.split(train[encode_cols], train[target_col])):
        df_train, df_val = train.iloc[train_index], train.iloc[val_index]
        for idx, col in enumerate(encode_cols):
            target_mean_dict = df_train.groupby(col)[target_col].mean()
            df_val[f'{col}_mean_target'] = df_val[col].map(target_mean_dict)
            tg_feats[val_index, idx] = df_val[f'{col}_mean_target'].values

    for idx, encode_col in enumerate(encode_cols):
        train[f'{encode_col}_mean_target'] = tg_feats[:, idx]

    # for testing set
    for col in encode_cols:
        target_mean_dict = train.groupby(col)[target_col].mean()
        test[f'{col}_mean_target'] = test[col].map(target_mean_dict)

    return train, test

## 4、根据id，取近邻的结果概率

In [7]:
def gen_neighbor_feats(train, test):
    '''产生近邻欺诈特征'''
    if not os.path.exists('../user_data/neighbor_default_probs.pkl'):
        # 该特征需要跑的时间较久，因此将其存成了pkl文件
        neighbor_default_probs = []
        for i in tqdm(range(train.customer_id.max())):
            if i >= 10 and i < 199706:
                customer_id_neighbors = list(range(i - 10, i)) + list(range(i + 1, i + 10))
            elif i < 199706:
                customer_id_neighbors = list(range(0, i)) + list(range(i + 1, i + 10))
            else:
                customer_id_neighbors = list(range(i - 10, i)) + list(range(i + 1, 199706))

            customer_id_neighbors = [customer_id_neighbor for customer_id_neighbor in customer_id_neighbors if
                                     customer_id_neighbor in train.customer_id.values.tolist()]
            neighbor_default_prob = train.set_index('customer_id').loc[customer_id_neighbors].loan_default.mean()
            neighbor_default_probs.append(neighbor_default_prob)

        df_neighbor_default_prob = pd.DataFrame({'customer_id': range(0, train.customer_id.max()),
                                                 'neighbor_default_prob': neighbor_default_probs})
        save_pkl(df_neighbor_default_prob, '../user_data/neighbor_default_probs.pkl')
    else:
        df_neighbor_default_prob = load_pkl('../user_data/neighbor_default_probs.pkl')
    train = pd.merge(left=train, right=df_neighbor_default_prob, on='customer_id', how='left')
    test = pd.merge(left=test, right=df_neighbor_default_prob, on='customer_id', how='left')

    return train, test

In [8]:
def save_pkl(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def load_pkl(path):
    with open(path, 'rb') as f:
        obj = pickle.load(f)
    return obj

In [9]:
TARGET_ENCODING_FETAS = [
                            'employment_type',
                             'branch_id',
                             'supplier_id',
                             'manufacturer_id',
                             'area_id',
                             'employee_code_id',
                             'asset_cost_bin'
                         ]


# 特征工程
logging.info('feature generating...')
train, test = gen_new_feats(train, test)
train, test = gen_target_encoding_feats(train, test, TARGET_ENCODING_FETAS, target_col='loan_default', n_fold=10)
train, test = gen_neighbor_feats(train, test)

2021-12-08 14:36:51,259 : INFO : feature generating...
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val[f'{col}_mean_target'] = df_val[col].map(target_mean_dict)


In [10]:
SAVE_FEATS = [
                 'customer_id',
                 'neighbor_default_prob',
                 'disbursed_amount',
                 'asset_cost',
                 'branch_id',
                 'supplier_id',
                 'manufacturer_id',
                 'area_id',
                 'employee_code_id',
                 'credit_score',
                 'loan_to_asset_ratio',
                 'year_of_birth',
                 'age',
                 'sub_Rate',
                 'main_Rate',
                 'loan_to_asset_ratio_bin',
                 'asset_cost_bin',
                 'employment_type_mean_target',
                 'branch_id_mean_target',
                 'supplier_id_mean_target',
                 'manufacturer_id_mean_target',
                 'area_id_mean_target',
                 'employee_code_id_mean_target',
                 'asset_cost_bin_mean_target',
                 'credit_history',
                 'average_age',
                 'total_disbursed_loan',
                 'main_account_disbursed_loan',
                 'total_sanction_loan',
                 'main_account_sanction_loan',
                 'active_to_inactive_act_ratio',
                 'total_outstanding_loan',
                 'main_account_outstanding_loan',
                 'Credit_level',
                 'outstanding_disburse_ratio',
                 'total_account_loan_no',
                 'main_account_tenure',
                 'main_account_loan_no',
                 'main_account_monthly_payment',
                 'total_monthly_payment',
                 'main_account_active_loan_no',
                 'main_account_inactive_loan_no',
                 'sub_account_inactive_loan_no',
                 'enquirie_no',
                 'main_account_overdue_no',
                 'total_overdue_no',
                 'last_six_month_defaulted_no'
            ]


# 特征工程 一些后处理
for col in ['sub_Rate', 'main_Rate', 'outstanding_disburse_ratio']:
     train[col] = train[col].apply(lambda x: 1 if x > 1 else x)
     test[col] = test[col].apply(lambda x: 1 if x > 1 else x)
train['asset_cost_bin'] = train['asset_cost_bin'].astype(int)
test['asset_cost_bin'] = test['asset_cost_bin'].astype(int)
train['loan_to_asset_ratio_bin'] = train['loan_to_asset_ratio_bin'].astype(int)
test['loan_to_asset_ratio_bin'] = test['loan_to_asset_ratio_bin'].astype(int)

# 存储包含新特征的数据集
logging.info('new data saving...')
cols = SAVE_FEATS + ['loan_default', ]
train[cols].to_csv('./train_final.csv', index=False)
test[cols].to_csv('./test_final.csv', index=False)

2021-12-08 14:36:52,398 : INFO : new data saving...


## 5、模型训练-交叉验证

In [14]:
def train_lgb_kfold(X_train, y_train, X_test, n_fold=5):
    '''train lightgbm with k-fold split'''
    gbms = []
    kfold = StratifiedKFold(n_splits=n_fold, random_state=1024, shuffle=True)
    oof_preds = np.zeros((X_train.shape[0],))
    test_preds = np.zeros((X_test.shape[0],))

    for fold, (train_index, val_index) in enumerate(kfold.split(X_train, y_train)):
        logging.info(f'############ fold {fold} ###########')
        X_tr, X_val, y_tr, y_val = X_train.iloc[train_index], X_train.iloc[val_index], y_train[train_index], y_train[val_index]
        dtrain = lgb.Dataset(X_tr, y_tr)
        dvalid = lgb.Dataset(X_val, y_val, reference=dtrain)

        params = {
            'objective': 'binary',
            'metric': 'auc',
            'num_leaves': 64,
            'learning_rate': 0.02,
            'min_data_in_leaf': 150,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.7,
            'n_jobs': -1,
            'seed': 1024
        }

        gbm = lgb.train(params,
                        dtrain,
                        num_boost_round=100,
                        valid_sets=[dtrain, dvalid],
                        verbose_eval=50,
                        early_stopping_rounds=20)

        oof_preds[val_index] = gbm.predict(X_val, num_iteration=gbm.best_iteration)
        test_preds += gbm.predict(X_test, num_iteration=gbm.best_iteration) / kfold.n_splits
        gbms.append(gbm)

    return gbms, oof_preds, test_preds



def train_xgb_kfold(X_train, y_train, X_test, n_fold=5):
    '''train xgboost with k-fold split'''
    gbms = []
    kfold = StratifiedKFold(n_splits=10, random_state=1024, shuffle=True)
    oof_preds = np.zeros((X_train.shape[0],))
    test_preds = np.zeros((X_test.shape[0],))

    for fold, (train_index, val_index) in enumerate(kfold.split(X_train, y_train)):
        logging.info(f'############ fold {fold} ###########')
        X_tr, X_val, y_tr, y_val = X_train.iloc[train_index], X_train.iloc[val_index], y_train[train_index], y_train[val_index]
        dtrain = xgb.DMatrix(X_tr, y_tr)
        dvalid = xgb.DMatrix(X_val, y_val)
        dtest = xgb.DMatrix(X_test)

        params={
            'booster':'gbtree',
            'objective': 'binary:logistic',
            'eval_metric': ['logloss', 'auc'],
            'max_depth': 8,
            'subsample':0.9,
            'min_child_weight': 10,
            'colsample_bytree':0.85,
            'lambda': 10,
            'eta': 0.02,
            'seed': 1024
        }

        watchlist = [(dtrain, 'train'), (dvalid, 'test')]

        gbm = xgb.train(params,
                        dtrain,
                        num_boost_round=100,
                        evals=watchlist,
                        verbose_eval=50,
                        early_stopping_rounds=20)

        oof_preds[val_index] = gbm.predict(dvalid, iteration_range=(0, gbm.best_iteration))
        test_preds += gbm.predict(dtest, iteration_range=(0, gbm.best_iteration)) / kfold.n_splits
        gbms.append(gbm)

    return gbms, oof_preds, test_preds

## 6、模型训练-结果输出

In [15]:
def train_xgb(train, test, feat_cols, label_col, n_fold=10):
    '''训练xgboost'''
    for col in ['sub_Rate', 'main_Rate', 'outstanding_disburse_ratio']:
        train[col] = train[col].apply(lambda x: 1 if x > 1 else x)
        test[col] = test[col].apply(lambda x: 1 if x > 1 else x)

    X_train = train[feat_cols]
    y_train = train[label_col]
    X_test = test[feat_cols]
    gbms_xgb, oof_preds_xgb, test_preds_xgb = train_xgb_kfold(X_train, y_train, X_test, n_fold=n_fold)

    if not os.path.exists('../user_data/gbms_xgb.pkl'):
        save_pkl(gbms_xgb, '../user_data/gbms_xgb.pkl')

    return gbms_xgb, oof_preds_xgb, test_preds_xgb


def train_lgb(train, test, feat_cols, label_col, n_fold=10):
    '''训练lightgbm'''
    X_train = train[feat_cols]
    y_train = train[label_col]
    X_test = test[feat_cols]
    gbms_lgb, oof_preds_lgb, test_preds_lgb = train_lgb_kfold(X_train, y_train, X_test, n_fold=n_fold)

    if not os.path.exists('../user_data/gbms_lgb.pkl'):
        save_pkl(gbms_lgb, '../user_data/gbms_lgb.pkl')

    return gbms_lgb, oof_preds_lgb, test_preds_lgb

In [16]:
# 读取原始数据集
logging.info('data loading...')
train = pd.read_csv('../xfdata/车辆贷款违约预测数据集/train.csv')
test = pd.read_csv('../xfdata/车辆贷款违约预测数据集/test.csv')

# 特征工程
logging.info('feature generating...')
train, test = gen_new_feats(train, test)
train, test = gen_target_encoding_feats(train, test, TARGET_ENCODING_FETAS, target_col='loan_default', n_fold=10)
train, test = gen_neighbor_feats(train, test)

train['asset_cost_bin'] = train['asset_cost_bin'].astype(int)
test['asset_cost_bin'] = test['asset_cost_bin'].astype(int)
train['loan_to_asset_ratio_bin'] = train['loan_to_asset_ratio_bin'].astype(int)
test['loan_to_asset_ratio_bin'] = test['loan_to_asset_ratio_bin'].astype(int)
train['asset_cost_bin_mean_target'] = train['asset_cost_bin_mean_target'].astype(float)
test['asset_cost_bin_mean_target'] = test['asset_cost_bin_mean_target'].astype(float)

# 模型训练：linux和mac的xgboost结果会有些许不同，以模型文件结果为主
gbms_xgb, oof_preds_xgb, test_preds_xgb = train_xgb(train.copy(), test.copy(),
                                                    feat_cols=SAVE_FEATS,
                                                    label_col='loan_default')
gbms_lgb, oof_preds_lgb, test_preds_lgb = train_lgb(train, test,
                                                    feat_cols=SAVE_FEATS,
                                                    label_col='loan_default')

2021-12-08 14:37:37,711 : INFO : data loading...
2021-12-08 14:37:38,197 : INFO : feature generating...
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val[f'{col}_mean_target'] = df_val[col].map(target_mean_dict)
2021-12-08 14:37:39,540 : INFO : ############ fold 0 ###########


[0]	train-logloss:0.68452	train-auc:0.63921	test-logloss:0.68456	test-auc:0.62855
[50]	train-logloss:0.48456	train-auc:0.69437	test-logloss:0.48960	test-auc:0.65455
[99]	train-logloss:0.44098	train-auc:0.71345	test-logloss:0.45205	test-auc:0.66009


2021-12-08 14:37:48,938 : INFO : ############ fold 1 ###########


[0]	train-logloss:0.68450	train-auc:0.63775	test-logloss:0.68462	test-auc:0.62587
[50]	train-logloss:0.48443	train-auc:0.69538	test-logloss:0.49083	test-auc:0.65386
[99]	train-logloss:0.44087	train-auc:0.71447	test-logloss:0.45330	test-auc:0.65865


2021-12-08 14:37:58,284 : INFO : ############ fold 2 ###########


[0]	train-logloss:0.68451	train-auc:0.64048	test-logloss:0.68455	test-auc:0.62541
[50]	train-logloss:0.48439	train-auc:0.69509	test-logloss:0.49013	test-auc:0.65240
[99]	train-logloss:0.44086	train-auc:0.71396	test-logloss:0.45280	test-auc:0.65747


2021-12-08 14:38:07,753 : INFO : ############ fold 3 ###########


[0]	train-logloss:0.68452	train-auc:0.63414	test-logloss:0.68465	test-auc:0.62270
[50]	train-logloss:0.48450	train-auc:0.69570	test-logloss:0.48949	test-auc:0.66355
[99]	train-logloss:0.44089	train-auc:0.71486	test-logloss:0.45175	test-auc:0.66649


2021-12-08 14:38:17,367 : INFO : ############ fold 4 ###########


[0]	train-logloss:0.68452	train-auc:0.63459	test-logloss:0.68455	test-auc:0.63408
[50]	train-logloss:0.48469	train-auc:0.69490	test-logloss:0.48868	test-auc:0.66670
[99]	train-logloss:0.44128	train-auc:0.71371	test-logloss:0.45032	test-auc:0.67131


2021-12-08 14:38:26,926 : INFO : ############ fold 5 ###########


[0]	train-logloss:0.68450	train-auc:0.64065	test-logloss:0.68462	test-auc:0.61957
[50]	train-logloss:0.48435	train-auc:0.69548	test-logloss:0.49044	test-auc:0.65277
[99]	train-logloss:0.44071	train-auc:0.71474	test-logloss:0.45295	test-auc:0.65920


2021-12-08 14:38:36,512 : INFO : ############ fold 6 ###########


[0]	train-logloss:0.68448	train-auc:0.63988	test-logloss:0.68464	test-auc:0.61795
[50]	train-logloss:0.48424	train-auc:0.69561	test-logloss:0.49100	test-auc:0.64600
[99]	train-logloss:0.44052	train-auc:0.71490	test-logloss:0.45404	test-auc:0.65147


2021-12-08 14:38:46,023 : INFO : ############ fold 7 ###########


[0]	train-logloss:0.68453	train-auc:0.63641	test-logloss:0.68457	test-auc:0.63026
[50]	train-logloss:0.48457	train-auc:0.69379	test-logloss:0.48922	test-auc:0.65895
[99]	train-logloss:0.44103	train-auc:0.71357	test-logloss:0.45125	test-auc:0.66454


2021-12-08 14:38:55,479 : INFO : ############ fold 8 ###########


[0]	train-logloss:0.68449	train-auc:0.63646	test-logloss:0.68453	test-auc:0.62626
[50]	train-logloss:0.48458	train-auc:0.69478	test-logloss:0.48932	test-auc:0.65600
[99]	train-logloss:0.44093	train-auc:0.71402	test-logloss:0.45145	test-auc:0.66282


2021-12-08 14:39:05,081 : INFO : ############ fold 9 ###########


[0]	train-logloss:0.68452	train-auc:0.63518	test-logloss:0.68459	test-auc:0.62040
[50]	train-logloss:0.48459	train-auc:0.69417	test-logloss:0.49000	test-auc:0.65480
[99]	train-logloss:0.44099	train-auc:0.71341	test-logloss:0.45241	test-auc:0.66107


2021-12-08 14:39:15,159 : INFO : ############ fold 0 ###########


[LightGBM] [Info] Number of positive: 23891, number of negative: 111109
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7066
[LightGBM] [Info] Number of data points in the train set: 135000, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.176970 -> initscore=-1.537010
[LightGBM] [Info] Start training from score -1.537010
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.680728	valid_1's auc: 0.657064


2021-12-08 14:39:17,237 : INFO : ############ fold 1 ###########


[100]	training's auc: 0.695359	valid_1's auc: 0.663392
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.695359	valid_1's auc: 0.663392
[LightGBM] [Info] Number of positive: 23891, number of negative: 111109
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7059
[LightGBM] [Info] Number of data points in the train set: 135000, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.176970 -> initscore=-1.537010
[LightGBM] [Info] Start training from score -1.537010
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.681283	valid_1's auc: 0.653969


2021-12-08 14:39:19,523 : INFO : ############ fold 2 ###########


[100]	training's auc: 0.696122	valid_1's auc: 0.659682
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.696122	valid_1's auc: 0.659682
[LightGBM] [Info] Number of positive: 23891, number of negative: 111109
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7071
[LightGBM] [Info] Number of data points in the train set: 135000, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.176970 -> initscore=-1.537010
[LightGBM] [Info] Start training from score -1.537010
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.681236	valid_1's auc: 0.65142


2021-12-08 14:39:21,286 : INFO : ############ fold 3 ###########


[100]	training's auc: 0.695948	valid_1's auc: 0.658272
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.695948	valid_1's auc: 0.658272
[LightGBM] [Info] Number of positive: 23891, number of negative: 111109
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7065
[LightGBM] [Info] Number of data points in the train set: 135000, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.176970 -> initscore=-1.537010
[LightGBM] [Info] Start training from score -1.537010
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.680678	valid_1's auc: 0.66442


2021-12-08 14:39:23,257 : INFO : ############ fold 4 ###########


[100]	training's auc: 0.695676	valid_1's auc: 0.668496
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.695676	valid_1's auc: 0.668496
[LightGBM] [Info] Number of positive: 23891, number of negative: 111109
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7072
[LightGBM] [Info] Number of data points in the train set: 135000, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.176970 -> initscore=-1.537010
[LightGBM] [Info] Start training from score -1.537010
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.680246	valid_1's auc: 0.6666


2021-12-08 14:39:25,504 : INFO : ############ fold 5 ###########


[100]	training's auc: 0.695177	valid_1's auc: 0.672279
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.695177	valid_1's auc: 0.672279
[LightGBM] [Info] Number of positive: 23890, number of negative: 111110
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7068
[LightGBM] [Info] Number of data points in the train set: 135000, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.176963 -> initscore=-1.537061
[LightGBM] [Info] Start training from score -1.537061
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.680389	valid_1's auc: 0.654878


2021-12-08 14:39:27,457 : INFO : ############ fold 6 ###########


[100]	training's auc: 0.695173	valid_1's auc: 0.662115
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.695173	valid_1's auc: 0.662115
[LightGBM] [Info] Number of positive: 23890, number of negative: 111110
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7066
[LightGBM] [Info] Number of data points in the train set: 135000, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.176963 -> initscore=-1.537061
[LightGBM] [Info] Start training from score -1.537061
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.682069	valid_1's auc: 0.647487


2021-12-08 14:39:29,387 : INFO : ############ fold 7 ###########


[100]	training's auc: 0.696339	valid_1's auc: 0.653793
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.696339	valid_1's auc: 0.653793
[LightGBM] [Info] Number of positive: 23890, number of negative: 111110
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7071
[LightGBM] [Info] Number of data points in the train set: 135000, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.176963 -> initscore=-1.537061
[LightGBM] [Info] Start training from score -1.537061
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.680425	valid_1's auc: 0.661555


2021-12-08 14:39:31,464 : INFO : ############ fold 8 ###########


[100]	training's auc: 0.695188	valid_1's auc: 0.667369
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.695188	valid_1's auc: 0.667369
[LightGBM] [Info] Number of positive: 23890, number of negative: 111110
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7073
[LightGBM] [Info] Number of data points in the train set: 135000, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.176963 -> initscore=-1.537061
[LightGBM] [Info] Start training from score -1.537061
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.6806	valid_1's auc: 0.658909


2021-12-08 14:39:33,538 : INFO : ############ fold 9 ###########


[100]	training's auc: 0.695282	valid_1's auc: 0.665429
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.695282	valid_1's auc: 0.665429
[LightGBM] [Info] Number of positive: 23890, number of negative: 111110
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7070
[LightGBM] [Info] Number of data points in the train set: 135000, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.176963 -> initscore=-1.537061
[LightGBM] [Info] Start training from score -1.537061
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.681306	valid_1's auc: 0.653703
[100]	training's auc: 0.695862	valid_1's auc: 0.659826
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.695862	valid_1's auc: 0.659826


## 7、预测结果阈值划分

In [22]:
def gen_thres_new(df_train, oof_preds):
    df_train['oof_preds'] = oof_preds
    quantile_point = df_train['loan_default'].mean() # 可看作训练集取到loan_default=1的概率
    thres = df_train['oof_preds'].quantile(1 - quantile_point) # 比如 0,1,1,1 mean=0.75 1-mean=0.25,也就是25%分位数取值为0

    _thresh = []
    for thres_item in np.arange(thres - 0.2, thres + 0.2, 0.01): #  按照理论阈值的上下0.2范围，0.01步长，找到最佳阈值，f1分数最高对应的阈值即为最佳阈值
        _thresh.append(
            [thres_item, f1_score(df_train['loan_default'], np.where(oof_preds > thres_item, 1, 0), average='macro')])

    _thresh = np.array(_thresh)
    best_id = _thresh[:, 1].argmax() # 找到f1最高对应的行
    best_thresh = _thresh[best_id][0] # 取出最佳阈值

    print("阈值: {}\n训练集的f1: {}".format(best_thresh, _thresh[best_id][1]))
    return best_thresh

In [30]:
xgb_thres = gen_thres_new(train, oof_preds_xgb)
lgb_thres =  gen_thres_new(train, oof_preds_lgb)


# 结果聚合
df_oof_res = pd.DataFrame({'customer_id': train['customer_id'],
                            'loan_default':train['loan_default'],
                            'oof_preds_xgb': oof_preds_xgb,
                            'oof_preds_lgb': oof_preds_lgb})

# 模型融合
df_oof_res['xgb_rank'] = df_oof_res['oof_preds_xgb'].rank(pct=True) # percentile rank,返回的是排序后的分位数
df_oof_res['lgb_rank'] = df_oof_res['oof_preds_lgb'].rank(pct=True)

df_oof_res['preds'] = 0.31 * df_oof_res['xgb_rank'] + 0.69 * df_oof_res['lgb_rank']


thres = gen_thres_new(df_oof_res, df_oof_res['preds'])

阈值: 0.2728475863923628
训练集的f1: 0.5831012392678149
阈值: 0.2277117872193553
训练集的f1: 0.5841566537815013
阈值: 0.8110024552466668
训练集的f1: 0.5849155835510944


## 8、测试集结果阈值划分，输出最终预测结果

In [31]:

def gen_submit_file(df_test, test_preds, thres, save_path):
    df_test['test_preds_binary'] = np.where(test_preds > thres, 1, 0)  # 按最终模型融合后的阈值进行划分
    df_test_submit = df_test[['customer_id', 'test_preds_binary']]
    df_test_submit.columns = ['customer_id', 'loan_default']
    print(f'saving result to: {save_path}')
    df_test_submit.to_csv(save_path, index=False)
    print('done!')
    return df_test_submit



df_test_res = pd.DataFrame({'customer_id': test['customer_id'],
                                'test_preds_xgb': test_preds_xgb,
                                'test_preds_lgb': test_preds_lgb})

df_test_res['xgb_rank'] = df_test_res['test_preds_xgb'].rank(pct=True)
df_test_res['lgb_rank'] = df_test_res['test_preds_lgb'].rank(pct=True)
df_test_res['preds'] = 0.31 * df_test_res['xgb_rank'] + 0.69 * df_test_res['lgb_rank']

# 结果产出
df_submit = gen_submit_file(df_test_res, df_test_res['preds'], thres,
                            save_path='../prediction_result/result.csv')

saving result to: ../prediction_result/result.csv
done!
