In [143]:
import os
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from lightgbm import early_stopping
from lightgbm import log_evaluation
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and performance warnings from pandas/LightGBM.
warnings.simplefilter('ignore')

In [144]:
# Load the training tables: account, behaviour, customer and label files.
acc_train, bhv_train, cust_train, train_label = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/赛题B_预赛数据/训练集/acct_train.csv',
        '../input/赛题B_预赛数据/训练集/bhv_train.csv',
        '../input/赛题B_预赛数据/训练集/cust_train.csv',
        '../input/赛题B_预赛数据/训练集/train_label.csv',
    )
)

In [145]:
# Load the test tables: account, behaviour and customer files (no labels).
acct_test, bhv_test, cust_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/赛题B_预赛数据/测试集/acct_test.csv',
        '../input/赛题B_预赛数据/测试集/bhv_test.csv',
        '../input/赛题B_预赛数据/测试集/cust_test.csv',
    )
)

In [146]:
# Left-join the per-id tables onto the account table; the training side
# additionally receives the label table.
train_data = (
    acc_train
    .merge(bhv_train, on='id', how='left')
    .merge(cust_train, on='id', how='left')
    .merge(train_label, on='id', how='left')
)
test_data = (
    acct_test
    .merge(bhv_test, on='id', how='left')
    .merge(cust_test, on='id', how='left')
)

In [147]:
# Stack train and test so feature engineering is applied to both at once.
# Test rows have no 'label' value, which is how they are separated again later.
data = pd.concat(
    [train_data, test_data],
)

In [None]:
# Preview the combined train/test frame.
data

In [149]:
# Categorical features: replace missing values with the string 'nan', then
# map each distinct value to an integer code.
cat_f = ['b2', 'b3', 'b28']
for f in cat_f:
    le = LabelEncoder()
    filled = data[f].fillna('nan')
    le.fit(filled)
    data[f] = le.transform(filled)

In [None]:
# Brute-force pairwise interaction features over b10..b27.
# range(10, 28) stops at b27; b28 is label-encoded as a categorical feature
# in an earlier cell and is not part of these arithmetic combinations.
loc_f = [f'b{i}' for i in range(10, 28)]
print(loc_f)

# Build every derived column first and attach them with a single concat.
# Inserting hundreds of columns one by one fragments the DataFrame and is slow.
derived = {}
for i in tqdm(range(len(loc_f))):
    left = data[loc_f[i]]
    for j in range(i + 1, len(loc_f)):
        right = data[loc_f[j]]
        derived[f'{loc_f[i]}+{loc_f[j]}'] = (left + right).to_numpy()
        derived[f'{loc_f[i]}-{loc_f[j]}'] = (left - right).to_numpy()
        derived[f'{loc_f[i]}*{loc_f[j]}'] = (left * right).to_numpy()
        # The small constant in the denominator guards against division by zero.
        derived[f'{loc_f[i]}/{loc_f[j]}'] = (left / (right + 1e-3)).to_numpy()
derived = pd.DataFrame(derived, index=data.index)

# If the cell is re-run, replace the previously derived columns instead of
# creating duplicate column names.
stale = data.columns.intersection(derived.columns)
if len(stale) > 0:
    data = data.drop(columns=stale)
data = pd.concat([data, derived], axis=1)

In [151]:
# Differences between neighbouring columns b5, b6, b7.
# NOTE(review): presumably these are time-like values — confirm against the data dictionary.
for new_col, (minuend, subtrahend) in {'6-5': ('b6', 'b5'), '7-6': ('b7', 'b6')}.items():
    data[new_col] = data[minuend] - data[subtrahend]

In [152]:
# Columns with fewer than two distinct non-null values carry no signal.
drop_cols = [col for col in data.columns if data[col].nunique() < 2]

In [None]:
# Show the columns that have fewer than two distinct values.
drop_cols

In [None]:
# Model inputs: every column except the identifier, the target, the
# hand-excluded columns b30 and b4, and the constant columns in `drop_cols`
# (previously computed but never applied).
excluded_cols = {'id', 'label', 'b30', 'b4', *drop_cols}
feature_names = [i for i in data.columns if i not in excluded_cols]
print(feature_names)

In [155]:
# Rows with a label are training rows; rows without one came from the test set.
has_label = data['label'].notna()
train = data[has_label].reset_index(drop=True)
test = data[~has_label].reset_index(drop=True)

In [156]:
def lgb_model(train, target, test, k, seed):
    """Train LightGBM binary classifiers with stratified k-fold cross-validation.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows. Columns 'id', 'label', 'b30' and 'b4' are ignored if
        present; every other column is used as a feature.
    target : pandas.Series
        Labels, positionally aligned with ``train`` (accessed via ``.iloc``).
    test : pandas.DataFrame
        Rows to predict; must contain the same feature columns as ``train``.
    k : int
        Number of stratified folds.
    seed : int
        Value of the LightGBM ``seed`` parameter. The fold split uses its own
        hard-coded seed list and is not affected by this argument.

    Returns
    -------
    tuple
        ``(output_preds, oof_probs, mean_score, feature_importance_df)``:
        test predictions averaged over folds and split seeds, out-of-fold
        predictions for ``train``, the mean of the per-fold best validation
        ``binary_logloss``, and the per-fold gain importances.
    """
    feats = [f for f in train.columns if f not in ['id', 'label', 'b30', 'b4']]
    print('Current num of features:', len(feats))

    oof_probs = np.zeros((train.shape[0],))
    output_preds = 0
    offline_score = []
    fold_importances = []
    # NOTE(review): 'nthread' and 'n_jobs' are both aliases of LightGBM's
    # num_threads and disagree here (32 vs 8); confirm which one is intended.
    parameters = {
         'boosting_type': 'gbdt',
            'objective': 'binary',
            'tree_learner':'serial',
            # 'metric': 'auc',
            'min_child_weight': 4,
            'num_leaves': 64,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 4,
            'learning_rate': 0.01,
            'seed': seed,
            'nthread': 32,
            'n_jobs':8,
            'silent': True,
            'verbose': -1,
    }

    train_features = train[feats]
    test_features = test[feats]

    # Seeds for the fold split only; named separately so the `seed` argument
    # is not shadowed by the loop variable.
    split_seeds = [2222]
    for split_seed in split_seeds:
        folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=split_seed)
        for i, (train_index, valid_index) in enumerate(folds.split(train, target)):
            train_y, valid_y = target.iloc[train_index], target.iloc[valid_index]
            train_X = train_features.iloc[train_index, :]
            valid_X = train_features.iloc[valid_index, :]

            dtrain = lgb.Dataset(train_X,
                                 label=train_y)
            dval = lgb.Dataset(valid_X,
                               label=valid_y)

            booster = lgb.train(
                parameters,
                dtrain,
                num_boost_round=8000,
                valid_sets=[dval],
                callbacks=[early_stopping(200), log_evaluation(200)],
            )

            # Accumulate (not overwrite) so averaging stays correct when more
            # than one split seed is configured.
            oof_probs[valid_index] += booster.predict(
                valid_X, num_iteration=booster.best_iteration) / len(split_seeds)

            # No explicit metric is set, so the validation score is read from
            # the 'binary_logloss' entry.
            offline_score.append(booster.best_score['valid_0']['binary_logloss'])
            output_preds += booster.predict(
                test_features,
                num_iteration=booster.best_iteration) / folds.n_splits / len(split_seeds)
            print(offline_score)

            # Gain-based feature importance for this fold.
            fold_importances.append(pd.DataFrame({
                "feature": feats,
                "importance": booster.feature_importance(importance_type='gain'),
                "fold": i + 1,
            }))

    feature_importance_df = pd.concat(fold_importances, axis=0)
    # The collected scores are log-loss values, so label them as such.
    print('OOF-MEAN-LOGLOSS:%.6f, OOF-STD-LOGLOSS:%.6f' % (np.mean(offline_score), np.std(offline_score)))
    print('feature importance:')
    print(feature_importance_df.groupby(['feature'])['importance'].mean().sort_values(ascending=False).head(50))

    return output_preds, oof_probs, np.mean(offline_score), feature_importance_df

In [None]:
# Run 5-fold training on the selected features and keep the test predictions,
# out-of-fold predictions, mean validation score and feature importances.
print('开始模型训练train')
training_kwargs = dict(
    train=train[feature_names],
    target=train['label'],
    test=test[feature_names],
    k=5,
    seed=22222,
)
lgb_preds, lgb_oof, lgb_score, feature_importance_df = lgb_model(**training_kwargs)

In [None]:
# Scan decision thresholds on the out-of-fold predictions for the best F1.
from sklearn.metrics import f1_score

val_pred = lgb_oof.copy()
y_true = train['label']
t0 = 0.1
v = 0.001
best_t = t0
best_f1 = 0
# Predictions are probabilities, so thresholds above 1.0 label every row 0
# (F1 = 0) and can never improve the result; stop the scan at 1.0.
n_steps = int(round((1.0 - t0) / v)) + 1
for step in range(n_steps):
    curr_t = t0 + step * v
    val_y = (val_pred >= curr_t).astype(int)
    curr_f1 = f1_score(y_true, val_y)
    if curr_f1 > best_f1:
        best_t = curr_t
        best_f1 = curr_f1
        print('step: {}   best threshold: {}   best f1: {}'.format(step, best_t, best_f1))
print('search finish.')

In [None]:
from sklearn.metrics import roc_auc_score

# Offline score: 0.3 * out-of-fold AUC + 0.7 * best out-of-fold F1.
auc_p = roc_auc_score(train['label'], lgb_oof)
blended_score = 0.3 * auc_p + 0.7 * best_f1
print(blended_score)

In [162]:
# Turn test probabilities into 0/1 labels with a fixed threshold.
# NOTE(review): 0.185 is hard-coded — presumably taken from an earlier
# threshold search; confirm it still matches the current best threshold.
label_pre = [int(prob >= 0.185) for prob in lgb_preds]

In [163]:
# Submission table: id, predicted probability and predicted label.
sub = pd.DataFrame({
    'id': test['id'],
    'pred_prob': lgb_preds,
    'pred_label': label_pre,
})

In [None]:
# Preview the submission table.
sub

In [165]:
# Save the submission under ../output/GL, creating the folder first so that
# to_csv does not fail when it is missing.
os.makedirs('../output/GL', exist_ok=True)
sub.to_csv('../output/GL/baseline.csv', index=False)

In [None]:
# 提交
!castlecli --third local --token token(使用自己的token) --source ../output/GL/bs.csv