In [1]:
import warnings
import lightgbm as lgb
from lightgbm import early_stopping
from lightgbm import log_evaluation
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
warnings.simplefilter('ignore')

In [2]:
# Read the four input CSV files first, then give every frame its working column names.
jy_df = pd.read_csv('data/账户交易信息.csv')        # account transaction records
jt_df = pd.read_csv('data/账户静态信息.csv')        # static account attributes
train_label = pd.read_csv('data/训练集标签.csv')    # training-set labels
test_df = pd.read_csv('data/test_dataset.csv')     # account ids to predict

# Assigning `.columns` raises if the number of names does not match the file.
jy_df.columns = [
    '交易流水序号', '账户代号', '对方账号', '借贷标志',
    '交易金额', '交易余额', '对方行号', '交易日期',
    '交易时间', '交易渠道', '摘要代号', '对方名称长度',
]
jt_df.columns = [
    '账户代号', '开户日期', '开户行代号', '客户性别', '年龄',
]
train_label.columns = ['账户代号', 'label']
test_df.columns = ['账户代号']

In [3]:
# Left-join the labels onto the static account table; accounts that have no
# entry in train_label keep a missing label.
data = jt_df.merge(train_label, on='账户代号', how='left')
data.head()

Unnamed: 0,账户代号,开户日期,开户行代号,客户性别,年龄,label
0,DDF394282B1E1508,2018-04-13,577BCC91,1,25,
1,CAE68290A37CC77D,2016-04-02,34ED066D,1,27,
2,41E4A8AECE47E5F3,2014-09-28,30BB3825,1,44,
3,163C42F2A3FD518E,2010-06-11,34ED066D,1,55,0.0
4,6FBFEB03252FDB9F,2015-08-20,D64A340B,0,44,


In [4]:
# Total transaction amount per account, split by the 借贷标志 flag.
# The native groupby `.sum()` replaces `.agg(sum)`: passing the Python builtin
# is deprecated in recent pandas and emits a FutureWarning.

# Flag == 1 -> stored as 收入金额.
df_1 = (
    jy_df[jy_df['借贷标志'] == 1]
    .groupby('账户代号')['交易金额']
    .sum()
    .reset_index()
)
df_1.columns = ['账户代号', '收入金额']

# Flag == 0 -> stored as 支出金额.
df_0 = (
    jy_df[jy_df['借贷标志'] == 0]
    .groupby('账户代号')['交易金额']
    .sum()
    .reset_index()
)
df_0.columns = ['账户代号', '支出金额']
df_0.shape

(5997, 2)

In [5]:
# Attach the per-account totals (支出金额 first, then 收入金额); accounts
# without a matching row keep NaN because both joins are left joins.
data = (
    data
    .merge(df_0, on='账户代号', how='left')
    .merge(df_1, on='账户代号', how='left')
)
data.head()

Unnamed: 0,账户代号,开户日期,开户行代号,客户性别,年龄,label,支出金额,收入金额
0,DDF394282B1E1508,2018-04-13,577BCC91,1,25,,15972.4,15977.95
1,CAE68290A37CC77D,2016-04-02,34ED066D,1,27,,37164.58,37315.66
2,41E4A8AECE47E5F3,2014-09-28,30BB3825,1,44,,386217.53,386225.41
3,163C42F2A3FD518E,2010-06-11,34ED066D,1,55,0.0,1453202.58,7057520.0
4,6FBFEB03252FDB9F,2015-08-20,D64A340B,0,44,,258455.87,299540.0


In [6]:
# Ratio and difference between the two per-account totals.
data['收支比'] = data['收入金额'].div(data['支出金额'])
data['收支差'] = data['收入金额'].sub(data['支出金额'])

In [7]:
# Parse the transaction time and extract its clock components.
# The vectorised `.dt` accessor replaces the per-row `map(lambda ...)` calls:
# same values, no Python-level loop over every transaction.
jy_df['交易时间'] = pd.to_datetime(jy_df['交易时间'])
jy_df['jy_hour'] = jy_df['交易时间'].dt.hour
jy_df['jy_min'] = jy_df['交易时间'].dt.minute
jy_df['jy_sec'] = jy_df['交易时间'].dt.second

In [8]:
# Parse the transaction date and extract its calendar components.
# The vectorised `.dt` accessor replaces the per-row `map(lambda ...)` calls.
jy_df['交易日期'] = pd.to_datetime(jy_df['交易日期'])
jy_df['jy_month'] = jy_df['交易日期'].dt.month
jy_df['jy_day'] = jy_df['交易日期'].dt.day
jy_df['jy_dayofweek'] = jy_df['交易日期'].dt.dayofweek

In [9]:
# Replace each listed transaction column with integer codes from a freshly
# fitted LabelEncoder (one encoder per column).
# NOTE: the columns are overwritten in place, so this cell is not idempotent:
# 交易日期 / 交易时间 stop being datetimes here, and the date/time-part cells
# above must not be re-run afterwards without reloading jy_df.
cat_f = ['对方账号', '对方行号', '摘要代号','交易日期','交易时间']
for f in cat_f:
    le = LabelEncoder()
    jy_df[f] = le.fit_transform(jy_df[f])

In [10]:
# Values of the 交易渠道 column that get their own per-account count feature.
JYQD = [
    'E96ED478', '621461AF', '091D584F',
    '757B505C', 'F47D0AD3', '892C91E0',
    '854D6FAE', '274AD478', 'B6A1085A',
    '38B3EFF8', 'B706835D', 'F57A2F55',
    'C8FBBC86', '7EABE3A1', '3B8A6142',
    'AA169B49', '6974CE5A', '6F3EF77A',
    'EAE27D77', 'E205EE2A', '979D472A',
]

In [11]:
# Build one row of aggregate transaction features per account.
df = pd.DataFrame({'账户代号': data['账户代号'].unique()})

cat_fea = ['对方账号', '对方行号', '摘要代号']
num_fea = ['交易金额', '交易余额','对方名称长度', 'jy_hour', 'jy_min',
       'jy_sec', 'jy_month', 'jy_day', 'jy_dayofweek','交易日期' ,'交易时间']

# 1) Six summary statistics for every numeric / encoded column; output columns
#    are named df_<column>_<statistic>.
stat_names = ['mean', 'std', 'median', 'nunique', 'min', 'max']
for fea in num_fea + cat_fea:
    print(fea)
    named_aggs = {'df_{}_{}'.format(fea, stat): stat for stat in stat_names}
    temp = jy_df.groupby('账户代号')[fea].agg(**named_aggs).reset_index()
    df = df.merge(temp, on='账户代号', how='left')

# 2) Number of transactions per account for each channel code in JYQD.
#    Each code becomes an int8 indicator column on jy_df, which is then summed.
dfs = []
for c in JYQD:
    jy_df[c] = (jy_df.交易渠道 == c).astype('int8')
    tmp = jy_df.groupby(['账户代号'])[c].agg('sum').rename(c + '_sum')
    dfs.append(tmp)
t = pd.concat(dfs, axis=1)
df = df.merge(t, on='账户代号', how='left')

# 3) Total number of transaction rows per account.
temp = (
    jy_df.groupby('账户代号')['交易流水序号']
    .agg(**{'df_交易流水序号_count': 'count'})
    .reset_index()
)
df = df.merge(temp, on='账户代号', how='left')

交易金额
交易余额
对方名称长度
jy_hour
jy_min
jy_sec
jy_month
jy_day
jy_dayofweek
交易日期
交易时间
对方账号
对方行号
摘要代号


In [13]:
# Join the per-account transaction features onto the account table.
data = data.merge(df, on='账户代号', how='left')
data.head()

Unnamed: 0,账户代号,开户日期,开户行代号,客户性别,年龄,label,支出金额,收入金额,收支比,收支差,...,C8FBBC86_sum,7EABE3A1_sum,3B8A6142_sum,AA169B49_sum,6974CE5A_sum,6F3EF77A_sum,EAE27D77_sum,E205EE2A_sum,979D472A_sum,df_交易流水序号_count
0,DDF394282B1E1508,2018-04-13,577BCC91,1,25,,15972.4,15977.95,1.000347,5.55,...,0,0,0,0,0,0,0,0,0,221
1,CAE68290A37CC77D,2016-04-02,34ED066D,1,27,,37164.58,37315.66,1.004065,151.08,...,0,0,0,0,0,0,0,0,0,67
2,41E4A8AECE47E5F3,2014-09-28,30BB3825,1,44,,386217.53,386225.41,1.00002,7.88,...,0,0,0,0,0,0,0,0,0,74
3,163C42F2A3FD518E,2010-06-11,34ED066D,1,55,0.0,1453202.58,7057520.0,4.856529,5604317.42,...,0,0,0,0,0,0,0,0,0,45
4,6FBFEB03252FDB9F,2015-08-20,D64A340B,0,44,,258455.87,299540.0,1.15896,41084.13,...,0,0,0,0,0,0,0,0,0,81


In [14]:
# Inspect the column index of the merged feature table.
data.columns

Index(['账户代号', '开户日期', '开户行代号', '客户性别', '年龄', 'label', '支出金额', '收入金额', '收支比',
       '收支差',
       ...
       'C8FBBC86_sum', '7EABE3A1_sum', '3B8A6142_sum', 'AA169B49_sum',
       '6974CE5A_sum', '6F3EF77A_sum', 'EAE27D77_sum', 'E205EE2A_sum',
       '979D472A_sum', 'df_交易流水序号_count'],
      dtype='object', length=116)

In [15]:
# Parse the account-opening date and extract its calendar components.
# The vectorised `.dt` accessor replaces the per-row `map(lambda ...)` calls.
data['开户日期'] = pd.to_datetime(data['开户日期'])
data['ku_month'] = data['开户日期'].dt.month
data['ku_day'] = data['开户日期'].dt.day
data['ku_dayofweek'] = data['开户日期'].dt.dayofweek

In [16]:
# Replace the opening date and the opening-branch code with integer codes from
# a freshly fitted LabelEncoder (one encoder per column).
# NOTE: this reuses the name `cat_f` from the transaction-encoding cell and
# overwrites the columns in place, so 开户日期 is no longer a datetime afterwards.
cat_f = ['开户日期','开户行代号']

for f in cat_f:
    le = LabelEncoder()
    data[f] = le.fit_transform(data[f])

In [17]:
# Every column except the account id and the target is used as a model feature.
feature_names = [col for col in data.columns if col not in ('账户代号', 'label')]
len(feature_names)

117

In [18]:
# Rows with a label form the training set; rows without one are scored later.
train = data.loc[data['label'].notna()].reset_index(drop=True)
test = data.loc[data['label'].isna()].reset_index(drop=True)

In [19]:
def lgb_model(train, target, test, k, seed):
    """Train LightGBM binary classifiers with stratified K-fold CV.

    For every fold a booster is trained with early stopping on the held-out
    part, the held-out rows receive out-of-fold (OOF) predictions, and the
    prediction for ``test`` is averaged over all folds (and split seeds).

    Parameters
    ----------
    train : pandas.DataFrame
        Training features. Columns named 'zhdh' or 'black_flag' are ignored.
    target : pandas.Series
        Labels, aligned positionally with ``train`` (accessed through ``.iloc``).
    test : pandas.DataFrame
        Rows to predict; must contain every feature column of ``train``.
    k : int
        Number of stratified folds.
    seed : int
        Value of LightGBM's ``seed`` parameter. The fold shuffling does NOT use
        it; it uses the fixed ``seeds`` list defined in the body.

    Returns
    -------
    tuple
        ``(output_preds, oof_probs, mean_fold_auc, feature_importance_df)``
        where ``feature_importance_df`` has one row per (feature, fold) with
        gain importance.
    """
    feats = [f for f in train.columns if f not in ['zhdh', 'black_flag']]
    print('Current num of features:', len(feats))

    oof_probs = np.zeros((train.shape[0],))
    output_preds = 0
    offline_score = []
    fold_importances = []  # one frame per fold, concatenated once at the end
    # NOTE(review): 'nthread' and 'n_jobs' both look like thread-count settings
    # with different values -- confirm which one LightGBM actually applies.
    parameters = {
         'boosting_type': 'gbdt',
            'objective': 'binary',
            'tree_learner':'serial',
            'metric': 'auc',
            'min_child_weight': 4,
            'num_leaves': 64,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 4,
            'learning_rate': 0.02,
            'seed': seed,
            'nthread': 32,
            'n_jobs':8,
            'silent': True,
            'verbose': -1,
    }

    train_feats = train[feats]  # loop-invariant column selection

    seeds = [2]
    # The loop variable is deliberately not called `seed`, so that the function
    # parameter is not shadowed.
    for split_seed in seeds:
        folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=split_seed)
        for i, (train_index, test_index) in enumerate(folds.split(train, target)):
            train_y, test_y = target.iloc[train_index], target.iloc[test_index]
            train_X, test_X = train_feats.iloc[train_index, :], train_feats.iloc[test_index, :]

            dtrain = lgb.Dataset(train_X,
                                 label=train_y)
            dval = lgb.Dataset(test_X,
                               label=test_y)

            # Named `booster` so that the enclosing function name is not shadowed.
            booster = lgb.train(
                parameters,
                dtrain,
                num_boost_round=8000,
                valid_sets=[dval],
                callbacks=[early_stopping(100), log_evaluation(100)],
            )

            # Accumulate (+=) so that the division by len(seeds) yields a true
            # average when more than one split seed is configured; plain
            # assignment would keep only the last seed's scaled prediction.
            oof_probs[test_index] += booster.predict(
                test_X, num_iteration=booster.best_iteration) / len(seeds)

            offline_score.append(booster.best_score['valid_0']['auc'])
            output_preds += booster.predict(test[feats],
                                            num_iteration=booster.best_iteration) / folds.n_splits / len(seeds)
            print(offline_score)
            # feature importance
            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = feats
            fold_importance_df["importance"] = booster.feature_importance(importance_type='gain')
            fold_importance_df["fold"] = i + 1
            fold_importances.append(fold_importance_df)

    feature_importance_df = pd.concat(fold_importances, axis=0)
    # The reported figures are the mean / std of the per-fold validation AUCs.
    print('OOF-MEAN-AUC:%.6f, OOF-STD-AUC:%.6f' % (np.mean(offline_score), np.std(offline_score)))
    print('feature importance:')
    print(feature_importance_df.groupby(['feature'])['importance'].mean().sort_values(ascending=False).head(50))

    return output_preds, oof_probs, np.mean(offline_score), feature_importance_df

In [20]:
# Run 5-fold training on the labelled rows and score the unlabelled rows.
print('开始模型训练train')
lgb_preds, lgb_oof, lgb_score, feature_importance_df = lgb_model(
    train=train[feature_names],
    target=train['label'],
    test=test[feature_names],
    k=5,
    seed=2020,
)

开始模型训练train
Current num of features: 117
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.942593
[200]	valid_0's auc: 0.948333
[300]	valid_0's auc: 0.950648
Early stopping, best iteration is:
[253]	valid_0's auc: 0.951204
[0.9512037037037037]
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.956111
[200]	valid_0's auc: 0.958056
Early stopping, best iteration is:
[184]	valid_0's auc: 0.959167
[0.9512037037037037, 0.9591666666666666]
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.93963
[200]	valid_0's auc: 0.950833
[300]	valid_0's auc: 0.953333
[400]	valid_0's auc: 0.954352
[500]	valid_0's auc: 0.954537
Early stopping, best iteration is:
[447]	valid_0's auc: 0.955648
[0.9512037037037037, 0.9591666666666666, 0.9556481481481481]
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.933241
[200]	valid_0's auc: 0.934815
Early stopping, best iteration is:

In [21]:
from sklearn.metrics import f1_score

# Grid-search the probability cut-off that maximises F1 on the out-of-fold
# predictions. `best_t` / `best_f1` are reused by later cells.
val_pred = lgb_oof.copy()
t0 = 0.2     # first threshold tried
v = 0.002    # step between consecutive thresholds
best_t = t0
best_f1 = 0
# The model uses the 'binary' objective, so predictions are probabilities.
# A cut-off above 1.0 predicts no positives at all (F1 = 0) and can never
# improve on best_f1, so the grid stops at 1.0 instead of running 1000 steps.
n_steps = int(round((1.0 - t0) / v)) + 1
for step in range(n_steps):
    curr_t = t0 + step * v
    val_y = (val_pred >= curr_t).astype(int)  # vectorised thresholding
    curr_f1 = f1_score(train['label'], val_y)
    if curr_f1 > best_f1:
        best_t = curr_t
        best_f1 = curr_f1
        # The score is an F1 value; the message used to label it "auc".
        print('step: {}   best threshold: {}   best f1: {}'.format(step, best_t, best_f1))
print('search finish.')

step: 0   best threshold: 0.2   best auc: 0.8148148148148148
step: 5   best threshold: 0.21000000000000002   best auc: 0.8155038759689922
step: 6   best threshold: 0.21200000000000002   best auc: 0.8161993769470405
step: 7   best threshold: 0.21400000000000002   best auc: 0.8174726989079563
step: 8   best threshold: 0.21600000000000003   best auc: 0.81875
step: 10   best threshold: 0.22   best auc: 0.8200312989045384
step: 11   best threshold: 0.222   best auc: 0.8213166144200627
step: 17   best threshold: 0.234   best auc: 0.8220472440944883
step: 22   best threshold: 0.244   best auc: 0.8233438485804416
step: 28   best threshold: 0.256   best auc: 0.8253968253968254
step: 31   best threshold: 0.262   best auc: 0.8256000000000001
step: 35   best threshold: 0.27   best auc: 0.8276972624798712
step: 71   best threshold: 0.342   best auc: 0.8291032148900169
step: 89   best threshold: 0.378   best auc: 0.8296041308089501
step: 91   best threshold: 0.382   best auc: 0.8310344827586208
step

In [22]:
# Show the threshold selected by the search above.
best_t

0.41800000000000004

In [44]:
# Best score noted from an earlier run -- presumably F1, since the threshold search scores with f1_score: 0.8596491228070176

In [23]:
# Binarise the averaged test probabilities with the tuned threshold
# (1 when the probability is at least best_t, otherwise 0).
label = (np.asarray(lgb_preds) >= best_t).astype(int).tolist()

In [24]:
# Number of test rows predicted as positive (label == 1).
sum(label)

1086

In [25]:
# Load the sample submission to inspect the expected layout (zhdh, black_flag).
sub=pd.read_csv('data/submit_example.csv')
sub

Unnamed: 0,zhdh,black_flag
0,B6751CD225DD4886,1
1,8265CA869E4AFF16,1
2,06DD2A17463919FC,0
3,F36469EA4C5BD7FC,1
4,A18F7ACD7A3853D1,1
...,...,...
4795,5E1C4461BC135745,1
4796,0507C0E6649E637C,1
4797,2CB5C631D0BE5241,0
4798,193FE069CEA9AD06,1


In [26]:
# Build the submission: keep the id order of the sample file, attach the
# predicted flag by account id, and write it with the expected column names.
sub = pd.read_csv('data/submit_example.csv').drop(columns=['black_flag'])
sub.columns = ['账户代号']

test['black_flag'] = label
sub = sub.merge(test, on='账户代号', how='left')

sub1 = sub[['账户代号', 'black_flag']].rename(columns={'账户代号': 'zhdh'})
# The best out-of-fold F1 is embedded in the file name.
sub1[['zhdh', 'black_flag']].to_csv('baseline_submission_{}.csv'.format(best_f1), index=False)

In [27]:
# Class balance of the submitted predictions.
sub1['black_flag'].value_counts()

0    3714
1    1086
Name: black_flag, dtype: int64

# 祝大家2023上分嘎嘎快！！