In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier

# 读取训练数据和测试数据
df_train = pd.read_excel('train.xlsx')
df_test = pd.read_excel('test_A榜.xlsx')

# 打印数据的尺寸, 一共有49个特征
print(f"训练数据的数据大小为：{df_train.shape}")
print(f"测试A榜的数据大小为：{df_test.shape}")

control_ratio = 1

def count_q_and_nan(df,feature,ratio=0.5):
    """
    ratio是？和nan数据加起来数量与总数量比例，控制特征的噪音不能太多
    越小越严格，剔除的特征越多
    """
    total_invalid_num = (df.shape[0]-sum(df[feature].value_counts()))+df[feature].value_counts()["?"]
    if total_invalid_num/df.shape[0] > ratio:
        print(f"Remove feature {feature}!")
        df.drop([feature],inplace=True,axis = 1)
        return feature


# 构造 
# 'MON_12_EXT_SAM_TRSF_IN_AMT' - 'MON_12_EXT_SAM_TRSF_OUT_AMT'
# 'CUR_MON_EXT_SAM_CUST_TRSF_IN_AMT' - 'CUR_MON_EXT_SAM_CUST_TRSF_OUT_AMT'
# 'MON_12_ACT_IN_50_UP_CNT_PTY_QTY' - 'MON_12_ACT_OUT_50_UP_CNT_PTY_QTY'


# CNT代表计数
# MON代表月
# COUNTER：柜台

feature_info = {
    'AGN_CNT_RCT_12_MON':'float',
    'ICO_CUR_MON_ACM_TRX_TM':'float',
    'NB_RCT_3_MON_LGN_TMS_AGV':'float',
    'AGN_CUR_YEAR_AMT':'float',
    'AGN_CUR_YEAR_WAG_AMT':'float',
    'AGN_AGR_LATEST_AGN_AMT':'float',
    'ICO_CUR_MON_ACM_TRX_AMT':'float',
    'COUNTER_CUR_YEAR_CNT_AMT':'float',
    'PUB_TO_PRV_TRX_AMT_CUR_YEAR':'float',
    'MON_12_EXT_SAM_TRSF_IN_AMT':'float',
    'MON_12_EXT_SAM_TRSF_OUT_AMT':'float',
    'MON_12_EXT_SAM_NM_TRSF_OUT_CNT':'float',
    'MON_12_EXT_SAM_AMT':'float',
    'CUR_MON_EXT_SAM_CUST_TRSF_IN_AMT':'float',
    'CUR_MON_EXT_SAM_CUST_TRSF_OUT_AMT':'float',
    'MON_12_CUST_CNT_PTY_ID':'str',
    'MON_12_TRX_AMT_MAX_AMT_PCTT':'float',
    'CUR_YEAR_MON_AGV_TRX_CNT':'float',
    'MON_12_AGV_TRX_CNT':'float',
    'MON_12_ACM_ENTR_ACT_CNT':'float',
    'MON_12_AGV_ENTR_ACT_CNT':'float',
    'MON_12_ACM_LVE_ACT_CNT':'float',
    'MON_12_AGV_LVE_ACT_CNT':'float',
    'CUR_YEAR_PUB_TO_PRV_TRX_PTY_CNT':'float',
    'MON_6_50_UP_ENTR_ACT_CNT':'float',
    'MON_6_50_UP_LVE_ACT_CNT':'float',
    'CUR_YEAR_COUNTER_ENCASH_CNT':'float',
    'MON_12_ACT_OUT_50_UP_CNT_PTY_QTY':'float',
    'MON_12_ACT_IN_50_UP_CNT_PTY_QTY':'float',
    'LAST_12_MON_COR_DPS_TM_PNT_BAL_PEAK_VAL':'float',
    'LAST_12_MON_COR_DPS_DAY_AVG_BAL':'float',
    'CUR_MON_COR_DPS_MON_DAY_AVG_BAL':'float',
    'CUR_YEAR_COR_DMND_DPS_DAY_AVG_BAL':'float',
    'CUR_YEAR_COR_DPS_YEAR_DAY_AVG_INCR':'float',
    'LAST_12_MON_DIF_NM_MON_AVG_TRX_AMT_NAV':'float',
    'LAST_12_MON_MON_AVG_TRX_AMT_NAV':'float',
    'COR_KEY_PROD_HLD_NBR':'float',
    'CUR_YEAR_MID_BUS_INC':'float',
    'AI_STAR_SCO':'float',
    'WTHR_OPN_ONL_ICO':'str',
    'EMP_NBR':'float',
    'REG_CPT':'float',
    'SHH_BCK':'float',
    'HLD_DMS_CCY_ACT_NBR':'float',
    'REG_DT':'float',
    'LGP_HLD_CARD_LVL':'str',
    'OPN_TM':'float',
    'NB_CTC_HLD_IDV_AIO_CARD_SITU':'str',
    'HLD_FGN_CCY_ACT_NBR':'float',
}

removed_features = []
for f in feature_info.keys():
    removed_features.append(count_q_and_nan(df_train,f,ratio=control_ratio))

removed_features = [i for i in removed_features if i != None]
for rf in removed_features:
    del feature_info[rf]

# 发现有4个类别特征
class_feature_name = []
value_feature_name = []
for feature in feature_info.keys():
    if feature_info[feature] =='float':
        value_feature_name.append(feature)
    else:
        class_feature_name.append(feature)


# 数据处理阶段
# 不知道用平均数好还是中位数好，这里先用平均数
def replace_q_with_average(df, feature):
    if '?' not in list(df[feature]):
        # print(f"{feature} do not contain ?")
        return df
    else:  
        values = [i for i in df[feature] if i != '?']
        df[feature].replace("?", sum(values)/len(values), inplace = True)
        
        
def replace_q_with_nan(df, feature):
    if '?' not in list(df[feature]):
        # print(f"{feature} do not contain ?")
        return df
    else:  
        values = [i for i in df[feature] if i != '?']
        df[feature].replace("?", np.nan, inplace = True)
        
        

# 处理类别型状特征
def replace_q_with_G(df, feature):
    if '?' not in list(df[feature]):
        # print(f"{feature} do not contain ?")
        return df
    else:
        df[feature].replace("?", "G", inplace = True)
        
def replace_nan_with_N(df, feature):
    df[feature].replace(np.nan, "N", inplace = True)
    

for feature, kind in feature_info.items():
    if kind =='float':
        replace_q_with_nan(df_train, feature = feature)
    else:
        replace_q_with_G(df_train, feature = feature)
        replace_nan_with_N(df_train, feature = feature)

# 将类型特征独热编码

def change_to_onehot(df, feature):
    if feature not in df.columns:
        return df
    df_onehot = pd.get_dummies(df[feature])
    new_columns = [feature+'_'+i for i in df_onehot.columns]
    df_onehot.columns = new_columns
    df_contact = pd.concat([df,df_onehot],axis=1)
    df_contact.drop([feature], axis = 1, inplace = True)
    return df_contact


def remove_features(df, feature):
    if feature in class_feature_name:
        class_feature_name.remove(feature)
    else:
        value_feature_name.remove(feature)
    df.drop([feature], axis = 1, inplace = True)

# remove_features(df_train, 'MON_12_CUST_CNT_PTY_ID')
    

for f in class_feature_name:
    df_train = change_to_onehot(df_train, f)
    
print(f"数值型特征有{len(value_feature_name)}个， 类别型特征有{len(class_feature_name)}个")

from itertools import combinations, permutations

def add_feature(df, feature_list = [], oprater_list = ['+','-','*','/']):
    for i in feature_list:
        if i not in df.columns:
            print(f"{i} is not in feature!!!")
            return
    combination = list(combinations(feature_list, 2)) # 组合，适用于加、减、乘操作
    permutation = list(permutations(feature_list, 2)) # 排列，适用于除法操作
#     if '+' in oprater_list:
#         for f1, f2 in combination:
#             df[f1+'+'+f2] = df[f1]+df[f2]
#     if '-' in oprater_list:
#         for f1, f2 in combination:
#             df[f1+'-'+f2] = df[f1]-df[f2]
    if '*' in oprater_list:
        for f1, f2 in combination:
            df[f1+'*'+f2] = df[f1]*df[f2]
#     if '/' in oprater_list:
#         for f1, f2 in permutation:
#             df[f1+'/'+f2] = df[f1]/df[f2]

# 处理test数据
for feature, kind in feature_info.items():
    if kind =='float':
        replace_q_with_nan(df_test, feature = feature)
    else:
        replace_q_with_G(df_test, feature = feature)
        replace_nan_with_N(df_test, feature = feature)

def change_to_onehot(df, feature):
    df_onehot = pd.get_dummies(df[feature])
    new_columns = [feature+'_'+i for i in df_onehot.columns]
    df_onehot.columns = new_columns
    df_contact = pd.concat([df,df_onehot],axis=1)
    df_contact.drop([feature], axis = 1, inplace = True)
    return df_contact

for f in class_feature_name:
    df_test = change_to_onehot(df_test, f)    



# 制造净流入新特征

f1 = 'MON_12_EXT_SAM_TRSF_IN_AMT'
f2 = 'MON_12_EXT_SAM_TRSF_OUT_AMT'
df_train[f1+'-'+f2] = df_train[f1]-df_train[f2]
df_test[f1+'-'+f2] = df_test[f1]-df_test[f2]

f3 = 'CUR_MON_EXT_SAM_CUST_TRSF_IN_AMT'
f4 = 'CUR_MON_EXT_SAM_CUST_TRSF_OUT_AMT'
df_train[f3+'-'+f4] = df_train[f3]-df_train[f4]
df_test[f3+'-'+f4] = df_test[f3]-df_test[f4]

f5 = 'MON_12_ACT_IN_50_UP_CNT_PTY_QTY'
f6 = 'MON_12_ACT_OUT_50_UP_CNT_PTY_QTY'
df_train[f5+'-'+f6] = df_train[f5]-df_train[f6]
df_test[f5+'-'+f6] = df_test[f5]-df_test[f6]

# # 增加特征
add_feature(df_train, feature_list = ['CUR_YEAR_COR_DMND_DPS_DAY_AVG_BAL','CUR_MON_COR_DPS_MON_DAY_AVG_BAL','LAST_12_MON_COR_DPS_DAY_AVG_BAL'])
add_feature(df_test, feature_list = ['CUR_YEAR_COR_DMND_DPS_DAY_AVG_BAL','CUR_MON_COR_DPS_MON_DAY_AVG_BAL','LAST_12_MON_COR_DPS_DAY_AVG_BAL'])


# # 去掉xgboost给出的破特征
# df_train.drop(['MON_12_ACT_IN_50_UP_CNT_PTY_QTY'],axis = 1)
# df_test.drop(['MON_12_ACT_IN_50_UP_CNT_PTY_QTY'],axis = 1)

训练数据的数据大小为：(40000, 51)
测试A榜的数据大小为：(12000, 50)
数值型特征有45个， 类别型特征有4个


In [5]:

# ! pip install optuna
import lightgbm as lgbm
import optuna  # pip install optuna
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from optuna.integration import LightGBMPruningCallback

def objective(trial, X, y):
    # 参数网格
    param_grid = {
        "n_estimators": trial.suggest_int("n_estimators", 50,100,20),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.95, step=0.1),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.95, step=0.1),
        "random_state": 2022,
    }
    # 5折交叉验证
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1121218)

    cv_scores = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # LGBM建模
        model = lgbm.LGBMClassifier(objective="binary", **param_grid)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="binary_logloss",
            early_stopping_rounds=100,
            callbacks=[
                LightGBMPruningCallback(trial, "binary_logloss")
            ],
        )
        # 模型预测
        preds = model.predict_proba(X_test)
        # 优化指标logloss最小
        cv_scores[idx] = log_loss(y_test, preds)

    return np.mean(cv_scores)

In [6]:
import warnings
warnings.filterwarnings("ignore")



x_train, y_train = df_train.drop(['CUST_UID','LABEL'],axis=1), df_train['LABEL']

study = optuna.create_study(direction="minimize", study_name="LGBM Classifier")
func = lambda trial: objective(trial, x_train, y_train)
study.optimize(func, n_trials=50)

[32m[I 2022-05-07 23:10:56,764][0m A new study created in memory with name: LGBM Classifier[0m




[32m[I 2022-05-07 23:10:59,394][0m Trial 0 finished with value: 0.2827465496559936 and parameters: {'n_estimators': 90, 'learning_rate': 0.04813768727851033, 'num_leaves': 1220, 'max_depth': 4, 'min_data_in_leaf': 1100, 'lambda_l1': 90, 'lambda_l2': 45, 'min_gain_to_split': 11.628202121959257, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 0 with value: 0.2827465496559936.[0m




[32m[I 2022-05-07 23:11:01,195][0m Trial 1 finished with value: 0.2993786695517694 and parameters: {'n_estimators': 50, 'learning_rate': 0.17998518620554038, 'num_leaves': 640, 'max_depth': 12, 'min_data_in_leaf': 9500, 'lambda_l1': 65, 'lambda_l2': 90, 'min_gain_to_split': 4.268687795904335, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.9}. Best is trial 0 with value: 0.2827465496559936.[0m




[32m[I 2022-05-07 23:11:03,557][0m Trial 2 finished with value: 0.34491503920308164 and parameters: {'n_estimators': 70, 'learning_rate': 0.024102702802304938, 'num_leaves': 660, 'max_depth': 12, 'min_data_in_leaf': 3400, 'lambda_l1': 70, 'lambda_l2': 35, 'min_gain_to_split': 5.17563767199811, 'bagging_fraction': 0.30000000000000004, 'bagging_freq': 1, 'feature_fraction': 0.8}. Best is trial 0 with value: 0.2827465496559936.[0m




[32m[I 2022-05-07 23:11:06,261][0m Trial 3 finished with value: 0.24980783014123 and parameters: {'n_estimators': 90, 'learning_rate': 0.19782532377614848, 'num_leaves': 640, 'max_depth': 6, 'min_data_in_leaf': 200, 'lambda_l1': 10, 'lambda_l2': 65, 'min_gain_to_split': 3.493862555634621, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 3 with value: 0.24980783014123.[0m




[32m[I 2022-05-07 23:11:08,563][0m Trial 4 finished with value: 0.26533150227336744 and parameters: {'n_estimators': 50, 'learning_rate': 0.13125081478789, 'num_leaves': 580, 'max_depth': 6, 'min_data_in_leaf': 300, 'lambda_l1': 90, 'lambda_l2': 95, 'min_gain_to_split': 6.116084450432898, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.4}. Best is trial 3 with value: 0.24980783014123.[0m
[32m[I 2022-05-07 23:11:08,792][0m Trial 5 pruned. Trial was pruned at iteration 0.[0m




[32m[I 2022-05-07 23:11:09,091][0m Trial 6 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-05-07 23:11:09,373][0m Trial 7 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-05-07 23:11:09,651][0m Trial 8 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-05-07 23:11:09,982][0m Trial 9 pruned. Trial was pruned at iteration 0.[0m




[32m[I 2022-05-07 23:11:13,356][0m Trial 10 finished with value: 0.26679160062664536 and parameters: {'n_estimators': 70, 'learning_rate': 0.2957118786526114, 'num_leaves': 120, 'max_depth': 6, 'min_data_in_leaf': 3200, 'lambda_l1': 0, 'lambda_l2': 10, 'min_gain_to_split': 8.959101274064603, 'bagging_fraction': 0.5, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 3 with value: 0.24980783014123.[0m
[32m[I 2022-05-07 23:11:13,618][0m Trial 11 pruned. Trial was pruned at iteration 1.[0m




[32m[I 2022-05-07 23:11:16,338][0m Trial 12 finished with value: 0.26446558823945976 and parameters: {'n_estimators': 70, 'learning_rate': 0.24026301618232354, 'num_leaves': 1660, 'max_depth': 5, 'min_data_in_leaf': 2000, 'lambda_l1': 20, 'lambda_l2': 65, 'min_gain_to_split': 8.769633226723595, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.5}. Best is trial 3 with value: 0.24980783014123.[0m




[32m[I 2022-05-07 23:11:16,740][0m Trial 13 pruned. Trial was pruned at iteration 26.[0m
[32m[I 2022-05-07 23:11:17,052][0m Trial 14 pruned. Trial was pruned at iteration 19.[0m




[32m[I 2022-05-07 23:11:19,873][0m Trial 15 finished with value: 0.25810936762201886 and parameters: {'n_estimators': 70, 'learning_rate': 0.19964064958037922, 'num_leaves': 1140, 'max_depth': 8, 'min_data_in_leaf': 1600, 'lambda_l1': 0, 'lambda_l2': 80, 'min_gain_to_split': 2.833253363119459, 'bagging_fraction': 0.4, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 3 with value: 0.24980783014123.[0m




[32m[I 2022-05-07 23:11:20,293][0m Trial 16 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-05-07 23:11:20,712][0m Trial 17 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-05-07 23:11:21,045][0m Trial 18 pruned. Trial was pruned at iteration 0.[0m




[32m[I 2022-05-07 23:11:23,992][0m Trial 19 finished with value: 0.2609383590507719 and parameters: {'n_estimators': 90, 'learning_rate': 0.29784876498020213, 'num_leaves': 1360, 'max_depth': 7, 'min_data_in_leaf': 1600, 'lambda_l1': 55, 'lambda_l2': 80, 'min_gain_to_split': 0.5620228315569875, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.8}. Best is trial 3 with value: 0.24980783014123.[0m
[32m[I 2022-05-07 23:11:24,313][0m Trial 20 pruned. Trial was pruned at iteration 0.[0m




[32m[I 2022-05-07 23:11:27,166][0m Trial 21 finished with value: 0.2596812969001842 and parameters: {'n_estimators': 90, 'learning_rate': 0.29423986544238717, 'num_leaves': 1460, 'max_depth': 7, 'min_data_in_leaf': 1500, 'lambda_l1': 55, 'lambda_l2': 80, 'min_gain_to_split': 0.3585172034319921, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.8}. Best is trial 3 with value: 0.24980783014123.[0m




[32m[I 2022-05-07 23:11:27,538][0m Trial 22 pruned. Trial was pruned at iteration 19.[0m
[32m[I 2022-05-07 23:11:27,811][0m Trial 23 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-05-07 23:11:28,098][0m Trial 24 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-05-07 23:11:28,433][0m Trial 25 pruned. Trial was pruned at iteration 11.[0m
[32m[I 2022-05-07 23:11:28,887][0m Trial 26 pruned. Trial was pruned at iteration 30.[0m
[32m[I 2022-05-07 23:11:29,227][0m Trial 27 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-05-07 23:11:29,745][0m Trial 28 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-05-07 23:11:30,029][0m Trial 29 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-05-07 23:11:30,394][0m Trial 30 pruned. Trial was pruned at iteration 0.[0m




[32m[I 2022-05-07 23:11:33,270][0m Trial 31 finished with value: 0.26177991139487616 and parameters: {'n_estimators': 90, 'learning_rate': 0.29131379152406367, 'num_leaves': 1340, 'max_depth': 7, 'min_data_in_leaf': 1500, 'lambda_l1': 55, 'lambda_l2': 80, 'min_gain_to_split': 0.7995535383194698, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.8}. Best is trial 3 with value: 0.24980783014123.[0m




[32m[I 2022-05-07 23:11:36,285][0m Trial 32 finished with value: 0.2586400002564433 and parameters: {'n_estimators': 90, 'learning_rate': 0.27584779461025716, 'num_leaves': 1620, 'max_depth': 7, 'min_data_in_leaf': 1900, 'lambda_l1': 45, 'lambda_l2': 85, 'min_gain_to_split': 0.337156842490927, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.9}. Best is trial 3 with value: 0.24980783014123.[0m




[32m[I 2022-05-07 23:11:36,685][0m Trial 33 pruned. Trial was pruned at iteration 22.[0m
[32m[I 2022-05-07 23:11:37,001][0m Trial 34 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-05-07 23:11:37,647][0m Trial 35 pruned. Trial was pruned at iteration 50.[0m
[32m[I 2022-05-07 23:11:38,015][0m Trial 36 pruned. Trial was pruned at iteration 0.[0m




[32m[I 2022-05-07 23:11:40,492][0m Trial 37 finished with value: 0.2594138216152845 and parameters: {'n_estimators': 70, 'learning_rate': 0.2540300865563293, 'num_leaves': 1140, 'max_depth': 3, 'min_data_in_leaf': 1000, 'lambda_l1': 40, 'lambda_l2': 60, 'min_gain_to_split': 1.3897702377231715, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 3 with value: 0.24980783014123.[0m




[32m[I 2022-05-07 23:11:40,824][0m Trial 38 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-05-07 23:11:41,154][0m Trial 39 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-05-07 23:11:41,799][0m Trial 40 pruned. Trial was pruned at iteration 50.[0m




[32m[I 2022-05-07 23:11:44,425][0m Trial 41 finished with value: 0.25883896173737814 and parameters: {'n_estimators': 70, 'learning_rate': 0.27406809804360055, 'num_leaves': 1040, 'max_depth': 6, 'min_data_in_leaf': 1200, 'lambda_l1': 45, 'lambda_l2': 75, 'min_gain_to_split': 0.8971327442501764, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.8}. Best is trial 3 with value: 0.24980783014123.[0m




[32m[I 2022-05-07 23:11:47,043][0m Trial 42 finished with value: 0.25640695719418394 and parameters: {'n_estimators': 70, 'learning_rate': 0.2817719948672518, 'num_leaves': 1040, 'max_depth': 6, 'min_data_in_leaf': 800, 'lambda_l1': 45, 'lambda_l2': 65, 'min_gain_to_split': 1.065614322192542, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.9}. Best is trial 3 with value: 0.24980783014123.[0m
[32m[I 2022-05-07 23:11:47,326][0m Trial 43 pruned. Trial was pruned at iteration 7.[0m




[32m[I 2022-05-07 23:11:50,464][0m Trial 44 finished with value: 0.25227537577512366 and parameters: {'n_estimators': 70, 'learning_rate': 0.2786126800103613, 'num_leaves': 760, 'max_depth': 6, 'min_data_in_leaf': 600, 'lambda_l1': 35, 'lambda_l2': 65, 'min_gain_to_split': 1.026112985420929, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.9}. Best is trial 3 with value: 0.24980783014123.[0m
[32m[I 2022-05-07 23:11:50,749][0m Trial 45 pruned. Trial was pruned at iteration 0.[0m




[32m[I 2022-05-07 23:11:51,094][0m Trial 46 pruned. Trial was pruned at iteration 22.[0m
[32m[I 2022-05-07 23:11:51,356][0m Trial 47 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-05-07 23:11:51,712][0m Trial 48 pruned. Trial was pruned at iteration 2.[0m
[32m[I 2022-05-07 23:11:52,026][0m Trial 49 pruned. Trial was pruned at iteration 0.[0m


In [7]:
print(f"\tBest value (rmse): {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

	Best value (rmse): 0.24981
	Best params:
		n_estimators: 90
		learning_rate: 0.19782532377614848
		num_leaves: 640
		max_depth: 6
		min_data_in_leaf: 200
		lambda_l1: 10
		lambda_l2: 65
		min_gain_to_split: 3.493862555634621
		bagging_fraction: 0.6000000000000001
		bagging_freq: 1
		feature_fraction: 0.7


In [126]:
# 网格搜索的时候一下子最多用2个参数，每个参数4个选项
import pandas as pd
from sklearn.model_selection import train_test_split,StratifiedKFold,GridSearchCV,cross_val_score
x_train, y_train = df_train.drop(['CUST_UID','LABEL'],axis=1), df_train['LABEL']

random_seed = 10

parameters = {
#     'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
    
#     'subsample': [0.6, 0.7, 0.8, 0.9],
#     'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
    
#     'reg_alpha': [0.05, 0.1, 1, 2, 3],
#     'reg_lambda': [0.05, 0.1, 1, 2, 3],
    
}
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_seed)
xgb = XGBClassifier(
    learning_rate=0.1,
    silent=True, 
    objective='binary:logistic',
    use_label_encoder=False,
    
    n_estimators = 90,
    max_depth = 6,
    min_child_weight = 5,
    gamma = 0.2,
    colsample_bytree=0.9,
    subsample=0.9,
    
)


gsearch = GridSearchCV(xgb, param_grid=parameters, scoring='roc_auc', cv=kf, n_jobs=-1)
gsearch.fit(x_train, y_train)
print('参数的最佳取值:{0}'.format(gsearch.best_params_))
print('最佳模型得分:{0}'.format(gsearch.best_score_))
# print(gsearch.cv_results_['mean_test_score'])

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


参数的最佳取值:{}
最佳模型得分:0.9497585166666667


In [127]:
model_best = gsearch.best_estimator_
model_best.fit(x_train, y_train)

feature_importande = dict()
for i,j in zip(list(x_train.columns),model_best.feature_importances_):
    feature_importande[i] = j
feature_importande = {k: v for k, v in sorted(feature_importande.items(), key=lambda item: item[1], reverse=True)}
# print(feature_importande)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [128]:
cnt = 1
for i,j in feature_importande.items():
    print(i,j)
    cnt += 1

CUR_YEAR_COR_DMND_DPS_DAY_AVG_BAL 0.23133892
CUR_MON_COR_DPS_MON_DAY_AVG_BAL*LAST_12_MON_COR_DPS_DAY_AVG_BAL 0.11783886
CUR_MON_COR_DPS_MON_DAY_AVG_BAL 0.05715703
CUR_YEAR_COR_DMND_DPS_DAY_AVG_BAL*LAST_12_MON_COR_DPS_DAY_AVG_BAL 0.034619216
CUR_YEAR_COR_DMND_DPS_DAY_AVG_BAL*CUR_MON_COR_DPS_MON_DAY_AVG_BAL 0.033719636
WTHR_OPN_ONL_ICO_B 0.020237219
LAST_12_MON_COR_DPS_DAY_AVG_BAL 0.020010164
MON_12_AGV_TRX_CNT 0.016041365
HLD_DMS_CCY_ACT_NBR 0.015786393
CUR_YEAR_COR_DPS_YEAR_DAY_AVG_INCR 0.014753099
COR_KEY_PROD_HLD_NBR 0.013643141
CUR_MON_EXT_SAM_CUST_TRSF_IN_AMT 0.01173348
MON_12_CUST_CNT_PTY_ID_Y 0.011626406
LAST_12_MON_MON_AVG_TRX_AMT_NAV 0.0115253795
EMP_NBR 0.011187685
ICO_CUR_MON_ACM_TRX_TM 0.011106304
MON_12_ACM_ENTR_ACT_CNT 0.01097935
MON_12_CUST_CNT_PTY_ID_G 0.010673995
ICO_CUR_MON_ACM_TRX_AMT 0.010670558
LAST_12_MON_COR_DPS_TM_PNT_BAL_PEAK_VAL 0.010626054
HLD_FGN_CCY_ACT_NBR 0.010363478
NB_CTC_HLD_IDV_AIO_CARD_SITU_N 0.010259248
LGP_HLD_CARD_LVL_B 0.00933109
MON_6_50_UP_ENTR_

In [129]:
# import shap
# import matplotlib.pyplot as plt
# shap.initjs()
# plt.figure(figsize=(6,6))

# X = x_train
# explainer = shap.TreeExplainer(model_best)
# shap_values = explainer.shap_values(X) 
# shap.summary_plot(shap_values, X,show=False,layered_violin_max_num_bins=200)

In [130]:
# parameters = {
# #     'n_estimators': [200, 300, 400, 500, 600, 700, 800],

# #     'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
#     'min_child_weight': [1, 2, 3, 4, 5, 6],

#     'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],

#     'subsample': [0.6, 0.7, 0.8, 0.9],
#     'colsample_bytree': [0.6, 0.7, 0.8, 0.9],

#     'reg_alpha': [0.05, 0.1, 1, 2, 3],
#     'reg_lambda': [0.05, 0.1, 1, 2, 3],

# #     'learning_rate': [0.01, 0.05, 0.07, 0.1, 0.2],

# }

In [131]:
model_best = gsearch.best_estimator_

x_test = df_test.drop(['CUST_UID'],axis=1)

# 预测并且输出结果，给出是1的概率
pre_res = model_best.predict_proba(x_test)[:,1]
pre_res = np.around(pre_res,10)
uid = np.array(df_test['CUST_UID'])
final_res = np.vstack((uid,pre_res)).T
np.savetxt('res.txt',  final_res,encoding='UTF-8',fmt = '%s',delimiter=' ')