In [7]:
import pandas as pd
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
import numpy as np
from glob2 import glob
import warnings
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample

warnings.filterwarnings("ignore", module="lightgbm")

import lightgbm as lgbm
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import pickle
import re
import gc
import config

In [8]:
train = pd.read_pickle(config.TRAIN_BINARY_FILE)
valid = pd.read_pickle(config.DEV_BINARY_FILE)

In [9]:
# 调参
def objective(trial):
#     train_resample = resample(train, replace=False, n_samples=1000000, stratify=train['下单用户'])
    train_x, train_y = train.drop(columns='下单用户'), train['下单用户']
    valid_x, valid_y = valid.drop(columns='下单用户'), valid['下单用户']
    dtrain = lgbm.Dataset(train_x, label=train_y)
    dvalid = lgbm.Dataset(valid_x, label=valid_y)
    

    param = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "num_iterations":trial.suggest_int('num_iterations',50,2000,log=True),
        "learning_rate":trial.suggest_float("learning_rate",0.001,1,log=True),
        "num_leaves": trial.suggest_int("num_leaves", 10,400, log=True),
#         "min_data_in_leaf":trial.suggest_int("min_data_in_leaf", 20,30,step=2, log=False),
        "neg_bagging_fraction":trial.suggest_float("neg_bagging_fraction", 0.025, 0.1, log=True),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-9, 10, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-9, 10, log=True),
        "bagging_freq": trial.suggest_int("bagging_freq", 10, 100, log=True),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 200, log=True),
        "max_bin":trial.suggest_int("max_bin",200,300,log=True),
        "min_data_in_bin":trial.suggest_int("min_data_in_bin",5,30,step=1),
        "feature_fraction":trial.suggest_float("feature_fraction",0.8,1,log=True),
        'num_threads':20,
        'pos_bagging_fraction':1,
        'early_stopping_round':10,
    }
    
    # Add a callback for pruning.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "auc")
    gbm = lgbm.train(
        param, dtrain, valid_sets=[dvalid], verbose_eval=False, callbacks=[pruning_callback],
        categorical_feature=['gender','new_customer','bank_processed','pos_merchant','cluster_n_3','line_key','是否需要贷款',
                            '学历','是否拥有信用卡','有多少张信用卡','职业']
    )

    preds = gbm.predict(valid_x)
    value = roc_auc_score(valid_y, preds)
    return value


if __name__ == "__main__":
    study = optuna.create_study(
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=10), direction="maximize"
    )
    study.optimize(objective, n_trials=10)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[32m[I 2023-03-14 17:52:00,618][0m A new study created in memory with name: no-name-cd7560f7-3405-4554-ab10-4f0cd814f273[0m
[32m[I 2023-03-14 17:52:31,958][0m Trial 0 finished with value: 0.7749954567713652 and parameters: {'num_iterations': 847, 'learning_rate': 0.2388572891279491, 'num_leaves': 147, 'neg_bagging_fraction': 0.048090749665005546, 'lambda_l1': 1.1062842754741027e-06, 'lambda_l2': 4.6630125594034854e-07, 'bagging_freq': 50, 'min_child_samples': 186, 'max_bin': 265, 'min_data_in_bin': 6, 'feature_fraction': 0.8899531006564518}. Best is trial 0 with value: 0.7749954567713652.[0m
[32m[I 2023-03-14 17:52:55,687][0m Trial 1 finished with value: 0.7652044573633094 and parameters: {'num_iterations': 731, 'learning_rate': 0.500133624132809, 'num_leaves': 69, 'neg_bagging_fraction': 0.026034634612764642, 'lambda_l1': 6.333851122462012e-07, 'lambda_l2': 0.877976475690294, 'bagging_freq': 12, 'min_child_samples': 10, 'max_bin': 247, 'min_data_in_bin': 13, 'feature_fraction'

Number of finished trials: 10
Best trial:
  Value: 0.7824380415904969
  Params: 
    num_iterations: 433
    learning_rate: 0.007561527927323003
    num_leaves: 300
    neg_bagging_fraction: 0.08589325343712412
    lambda_l1: 8.17378444852004e-06
    lambda_l2: 0.0006853216782322677
    bagging_freq: 24
    min_child_samples: 7
    max_bin: 265
    min_data_in_bin: 21
    feature_fraction: 0.9090504944807012


In [10]:
study_df = study.trials_dataframe()
complete_trial = study_df.query('state == "COMPLETE"').sort_values('value', ascending=False).reset_index()
complete_trial.rename(columns={'params_bagging_freq':'bagging_freq', 'params_feature_fraction':'feature_fraction',
                              'params_lambda_l1':'lambda_l1','params_lambda_l2':'lambda_l2','params_learning_rate':'learning_rate',
                              'params_max_bin':'max_bin','params_min_child_samples':'min_child_samples', 'params_min_data_in_bin':'min_data_in_bin',
                              'params_neg_bagging_fraction':'neg_bagging_fraction','params_num_iterations':'num_iterations','params_num_leaves':'num_leaves'},
                    inplace=True)

In [11]:
complete_trial

Unnamed: 0,index,number,value,datetime_start,datetime_complete,duration,bagging_freq,feature_fraction,lambda_l1,lambda_l2,learning_rate,max_bin,min_child_samples,min_data_in_bin,neg_bagging_fraction,num_iterations,num_leaves,state
0,2,2,0.782438,2023-03-14 17:52:55.688687,2023-03-14 17:54:17.039270,0 days 00:01:21.350583,24,0.90905,8.173784e-06,0.0006853217,0.007562,265,7,21,0.085893,433,300,COMPLETE
1,6,6,0.7811,2023-03-14 17:57:10.937064,2023-03-14 17:57:44.537093,0 days 00:00:33.600029,15,0.820636,0.0004447878,0.3468654,0.086703,255,17,15,0.083752,50,215,COMPLETE
2,4,4,0.780769,2023-03-14 17:54:41.615252,2023-03-14 17:56:50.679351,0 days 00:02:09.064099,10,0.955939,0.0208292,0.005999611,0.006044,232,118,22,0.097157,825,16,COMPLETE
3,7,7,0.78061,2023-03-14 17:57:44.538479,2023-03-14 17:58:27.861360,0 days 00:00:43.322881,28,0.899536,0.0195807,2.653106e-08,0.005985,210,67,8,0.049128,101,392,COMPLETE
4,0,0,0.774995,2023-03-14 17:52:00.620059,2023-03-14 17:52:31.958403,0 days 00:00:31.338344,50,0.889953,1.106284e-06,4.663013e-07,0.238857,265,186,6,0.048091,847,147,COMPLETE
5,3,3,0.770369,2023-03-14 17:54:17.040795,2023-03-14 17:54:41.613935,0 days 00:00:24.573140,37,0.991974,1.063451e-05,1.987321e-05,0.258545,203,63,12,0.035278,215,372,COMPLETE
6,1,1,0.765204,2023-03-14 17:52:31.959571,2023-03-14 17:52:55.687388,0 days 00:00:23.727817,12,0.985841,6.333851e-07,0.8779765,0.500134,247,10,13,0.026035,731,69,COMPLETE


In [14]:
def train_model(train, dev, param):
    train_x, train_y = train.drop(columns='下单用户'), train['下单用户']
    valid_x, valid_y = dev.drop(columns='下单用户'), dev['下单用户']
    dtrain = lgbm.Dataset(train_x, label=train_y)
    dvalid = lgbm.Dataset(valid_x, label=valid_y)

    gbm = lgbm.train(param, dtrain, valid_sets=[dvalid], verbose_eval=False)
    
    preds = gbm.predict(valid_x)
    auc_score = roc_auc_score(valid_y, preds)
    print("auc score: ", auc_score)
    
    predictions = dev.copy()
    predictions['pred_score'] = preds
    predictions['pred_labels'] = list(map(lambda x: 1 if x > 0.5 else 0, preds))
    predictions = predictions[['下单用户','pred_score','pred_labels']]
    return gbm, predictions

# 使用前两个模型做ensemble
gbms = []
predictions = []
for i in range(0,complete_trial.shape[0]):
    if i > 1:
        break
    params = dict(complete_trial.iloc[i,6:-1])
    params.update({"objective": "binary",
             "boosting_type": "gbdt",
             'metric':"auc",
             'pos_bagging_fraction':1,
            'early_stopping_round':10})
    model, pred = train_model(train, valid, params)
    pred['model'] = f"gbm{i}"
    # pickle.dump(model, open(f"{config.MODEL_GBM}/gbm%d.pkl" % (i), "wb"))
    pred['id'] = pred.index.tolist()
    gbms.append(model)
    predictions.append(pred)

[LightGBM] [Info] Number of positive: 225426, number of negative: 8422633
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1377
[LightGBM] [Info] Number of data points in the train set: 8648059, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.026067 -> initscore=-3.620686
[LightGBM] [Info] Start training from score -3.620686
auc score:  0.7825896598459836
[LightGBM] [Info] Number of positive: 225426, number of negative: 8422633
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1383
[LightGBM] [Info] Number of data points in the train set: 8648059, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.026067 -> initscore=-3.620686
[LightGBM] [Info] Start training from score -3.620686
auc score:  0.7814828893641237


In [15]:
predictions = pd.concat(predictions)

In [16]:
predictions['percentile'] = predictions.groupby(['model'])['pred_score'].rank(pct=True, ascending=False)
predictions['percentile'] = predictions['percentile'] * 100

In [17]:
# 评估用ensemble投票的precision和recall
percentile = list(range(0, 101, 10))
for p in percentile:
    if p == 0:
        continue
    over_threshold = predictions.query(f'percentile < {p}').drop_duplicates(['id'])
    precision = over_threshold['下单用户'].sum() / over_threshold.shape[0]
    recall = over_threshold['下单用户'].sum() / (predictions['下单用户'].sum() / 2)
    print(f'top {p}%, precision {precision:.4f}, recall {recall:.4f}')

top 10%, precision 0.1003, recall 0.4751
top 20%, precision 0.0686, recall 0.6357
top 30%, precision 0.0532, recall 0.7311
top 40%, precision 0.0440, recall 0.8007
top 50%, precision 0.0378, recall 0.8543
top 60%, precision 0.0333, recall 0.8986
top 70%, precision 0.0299, recall 0.9351
top 80%, precision 0.0273, recall 0.9660
top 90%, precision 0.0250, recall 0.9877
top 100%, precision 0.0232, recall 1.0000
