In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from matplotlib.pyplot import plot, show
from sklearn.metrics import roc_auc_score
import json
from gensim.models.word2vec import Word2Vec

In [2]:
# Load the raw train / test tables (tab-separated) and the submission template.
df_train = pd.read_csv('data/train_dataset.csv', sep='\t')
df_test = pd.read_csv('data/test_dataset.csv', sep='\t')
sub = pd.read_csv('data/submit_example.csv')

# The ids are attached to the test rows purely by row position, so the two
# files must have the same number of rows for the pairing to make sense.
assert len(sub) == len(df_test), (
    'submit_example.csv has %d rows but test_dataset.csv has %d' % (len(sub), len(df_test))
)
df_test['id'] = sub['id']

# Stack train and test so feature engineering sees both. ignore_index=True gives
# the combined frame a clean 0..n-1 index instead of repeating each file's own
# row labels, which would make any later index-aligned operation ambiguous.
df = pd.concat([df_train, df_test], ignore_index=True)

# **细化特征**

In [3]:
# 'location' holds a JSON object per row; decode every row once and reuse the
# parsed objects for the three levels (instead of re-parsing the string per level).
location_parsed = df['location'].astype(str).map(json.loads)
for level in ('first_lvl', 'sec_lvl', 'third_lvl'):
    df['location_' + level] = location_parsed.map(lambda loc, key=level: loc[key])

# Raw columns used as model features; the cells below append derived ones.
feats = ['user_name', 'action', 'auth_type', 'ip_location_type_keyword', 'ip_risk_level', 'ip', 'location',
         'device_model', 'os_type', 'os_version', 'browser_type', 'browser_version', 'bus_system_code', 'op_target',
         'location_first_lvl', 'location_sec_lvl', 'location_third_lvl',
         ]
# Extra columns to label-encode in addition to the fixed list (none at the moment).
cat = []


# Target column; rows where it is NaN are treated as the test set later on.
LABEL = 'risk_label'

# Two characters taken from a fixed position near the end of session_id, read as an
# integer and placed on a 60-step circle via sin/cos.
# NOTE(review): presumably a seconds field embedded in the session id -- confirm.
df['sec'] = df['session_id'].apply(lambda x: int(x[-7:-5]))
df['sec_sin'] = np.sin(df['sec'] / 60 * 2 * np.pi)
df['sec_cos'] = np.cos(df['sec'] / 60 * 2 * np.pi)

# Calendar parts of the operation timestamp.
df['op_date'] = pd.to_datetime(df['op_date'])
df['hour'] = df['op_date'].dt.hour
df['weekday'] = df['op_date'].dt.weekday
df['year'] = df['op_date'].dt.year
df['month'] = df['op_date'].dt.month
df['day'] = df['op_date'].dt.day

# Integer timestamp: the int64 view of op_date floor-divided by 1e9
# (i.e. seconds when op_date is stored at nanosecond resolution).
df['op_ts'] = df["op_date"].values.astype(np.int64) // 10 ** 9

# Order each user's operations chronologically. This reorders ALL rows (train and
# test alike) and resets the index, so row positions no longer match the input files.
df = df.sort_values(by=['user_name', 'op_ts']).reset_index(drop=True)

# Timestamps of the same user's previous and second-previous operation, and the
# elapsed time since each (NaN for a user's first one / two rows).
user_op_ts = df.groupby(['user_name'])['op_ts']
df['last_ts'] = user_op_ts.shift(1)
df['last_ts2'] = user_op_ts.shift(2)
df['ts_diff'] = df['op_ts'] - df['last_ts']
df['ts_diff2'] = df['op_ts'] - df['last_ts2']
feats += ['sec',
          'sec_sin', 'sec_cos',
          'op_ts', 'last_ts', 'ts_diff',
          'last_ts2',
          'ts_diff2',
          ]

# **特征构造（衍生特征）**

In [4]:
# --- Word2Vec embeddings -------------------------------------------------------
# Each (user, calendar day) group forms one "sentence" whose tokens are the
# stringified values of the column; every row then receives the embedding of its
# own token as EMB_DIM extra numeric features.
EMB_DIM = 6
for col in ['auth_type']:
    token_col = col + '_fillna'
    df[token_col] = df[col].astype('str')
    daily_sequences = df.groupby(['user_name', 'year', 'month', 'day'])[token_col].agg(list).values

    w2v_model = Word2Vec(sentences=daily_sequences, vector_size=EMB_DIM, window=12, min_count=1, workers=1)
    emb = pd.DataFrame([w2v_model.wv[token] for token in df[token_col]])
    emb.columns = ['_'.join([col, 'emb', str(i)]) for i in range(EMB_DIM)]
    # Column-wise concat aligns on the index; both frames carry 0..n-1 here.
    df = pd.concat([df, emb], axis=1)
    feats += list(emb.columns)

# --- Group-level statistics of the inter-operation gap --------------------------
# For each statistic and each grouping key, broadcast the group's ts_diff statistic
# back onto every row of that group.
for stat in ['mean', 'std', 'max', 'min', 'median', 'skew']:
    for key in ['user_name', 'bus_system_code', 'auth_type', 'action',
                ]:  # 'op_target'
        stat_col = key + '_ts_diff_' + stat
        df[stat_col] = df.groupby([key])['ts_diff'].transform(stat)
        feats.append(stat_col)

# Boolean flag: does the raw location string equal this one specific location?
# Must be computed before 'location' is label-encoded below.
df['if_out'] = (df['location'] == '{"first_lvl":"成都分公司","sec_lvl":"9楼","third_lvl":"销售部"}')
feats.append('if_out')

# --- Integer-encode the categorical columns --------------------------------------
# A fresh LabelEncoder is fitted per column on the combined train + test values.
categorical_cols = ['user_name', 'action', 'auth_type', 'ip', 'ip_location_type_keyword', 'ip_risk_level',
                    'location',
                    'device_model', 'os_type', 'os_version', 'browser_type', 'browser_version',
                    'bus_system_code',
                    'op_target',
                    'location_first_lvl', 'location_sec_lvl', 'location_third_lvl',
                    ] + cat
for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col])
    # df[name] = df[name].astype('category')

# 分离训练集和测试集，并对模型调参

In [5]:
# Rows with a known label form the training set; rows whose label is missing
# form the test set. Both get a fresh 0..n-1 index.
has_label = df[LABEL].notna()
df_train = df[has_label].reset_index(drop=True)
df_test = df[~has_label].reset_index(drop=True)

# LightGBM settings: binary classification evaluated by AUC, silent logging,
# fixed seed, all available cores.
params = dict(
    learning_rate=0.06,
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    verbose=-1,
    seed=2222,
    n_jobs=-1,
)
# Display the final feature list.
feats

['user_name',
 'action',
 'auth_type',
 'ip_location_type_keyword',
 'ip_risk_level',
 'ip',
 'location',
 'device_model',
 'os_type',
 'os_version',
 'browser_type',
 'browser_version',
 'bus_system_code',
 'op_target',
 'location_first_lvl',
 'location_sec_lvl',
 'location_third_lvl',
 'sec',
 'sec_sin',
 'sec_cos',
 'op_ts',
 'last_ts',
 'ts_diff',
 'last_ts2',
 'ts_diff2',
 'auth_type_emb_0',
 'auth_type_emb_1',
 'auth_type_emb_2',
 'auth_type_emb_3',
 'auth_type_emb_4',
 'auth_type_emb_5',
 'user_name_ts_diff_mean',
 'bus_system_code_ts_diff_mean',
 'auth_type_ts_diff_mean',
 'action_ts_diff_mean',
 'user_name_ts_diff_std',
 'bus_system_code_ts_diff_std',
 'auth_type_ts_diff_std',
 'action_ts_diff_std',
 'user_name_ts_diff_max',
 'bus_system_code_ts_diff_max',
 'auth_type_ts_diff_max',
 'action_ts_diff_max',
 'user_name_ts_diff_min',
 'bus_system_code_ts_diff_min',
 'auth_type_ts_diff_min',
 'action_ts_diff_min',
 'user_name_ts_diff_median',
 'bus_system_code_ts_diff_median',
 'au

# 使用n折交叉验证划分数据集（此处n取10）训练模型并进行预测

In [6]:
# Stratified K-fold training, repeated once per entry in `seeds`.
# Produces:
#   oof        - out-of-fold prediction for every training row (averaged over seeds)
#   pred_y     - one column of test-set predictions per (fold, seed) model
#   importance - gain-based feature importance averaged over every trained model
#   score      - AUC of the out-of-fold predictions
seeds = [2022]
fold_num = 10
oof = np.zeros(len(df_train))
importance = 0
pred_y = pd.DataFrame()

# lgb.train() no longer accepts `early_stopping_rounds=` / `verbose_eval=` as of
# LightGBM 4.0; the callback API is the supported way to get the same behaviour.
# Releases older than 3.3 expose the logging callback as `print_evaluation`.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

for seed in seeds:
    print('############################', seed)
    kf = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=seed)
    for fold, (train_idx, val_idx) in enumerate(kf.split(df_train[feats], df_train[LABEL])):
        print('-----------', fold)
        train = lgb.Dataset(df_train.loc[train_idx, feats],
                            df_train.loc[train_idx, LABEL])
        val = lgb.Dataset(df_train.loc[val_idx, feats],
                          df_train.loc[val_idx, LABEL])
        # Stop once validation AUC has not improved for 100 rounds; log every 100 rounds.
        model = lgb.train(params, train, valid_sets=[val], num_boost_round=10000,
                          callbacks=[lgb.early_stopping(stopping_rounds=100),
                                     log_evaluation(period=100)])

        # Every row is in exactly one validation fold per seed, so dividing by the
        # number of seeds turns the accumulated sum into a per-row average.
        oof[val_idx] += model.predict(df_train.loc[val_idx, feats]) / len(seeds)
        pred_y['fold_%d_seed_%d' % (fold, seed)] = model.predict(df_test[feats])
        # One model is trained per (seed, fold) pair, so average over all of them
        # (dividing by fold_num alone over-counts when several seeds are used).
        importance += model.feature_importance(importance_type='gain') / (fold_num * len(seeds))

df_train['oof'] = oof
score = roc_auc_score(df_train[LABEL], df_train['oof'])
score

############################ 2022
----------- 0
Training until validation scores don't improve for 100 rounds




[100]	valid_0's auc: 0.518595
Early stopping, best iteration is:
[19]	valid_0's auc: 0.544581
----------- 1
Training until validation scores don't improve for 100 rounds




[100]	valid_0's auc: 0.506482
Early stopping, best iteration is:
[61]	valid_0's auc: 0.512482
----------- 2




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.541011
Early stopping, best iteration is:
[12]	valid_0's auc: 0.549965
----------- 3




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.498036
Early stopping, best iteration is:
[9]	valid_0's auc: 0.51458
----------- 4




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.560416
[200]	valid_0's auc: 0.559718
Early stopping, best iteration is:
[169]	valid_0's auc: 0.567298
----------- 5




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.492348
Early stopping, best iteration is:
[4]	valid_0's auc: 0.504666
----------- 6




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.496424
Early stopping, best iteration is:
[85]	valid_0's auc: 0.501196
----------- 7
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.5144




Early stopping, best iteration is:
[36]	valid_0's auc: 0.525018
----------- 8
Training until validation scores don't improve for 100 rounds




[100]	valid_0's auc: 0.498189
Early stopping, best iteration is:
[5]	valid_0's auc: 0.50327
----------- 9
Training until validation scores don't improve for 100 rounds




[100]	valid_0's auc: 0.507881
Early stopping, best iteration is:
[6]	valid_0's auc: 0.525141


0.5279398137194593

# **获取特征重要性**

In [7]:
# Pair every feature name with its averaged gain importance and print the ten
# most important features.
feats_importance = pd.DataFrame({'name': feats, 'importance': importance})
top_feats = feats_importance.sort_values('importance', ascending=False).head(10)
print(top_feats)

                      name  importance
24                ts_diff2  774.989928
22                 ts_diff  696.405009
20                   op_ts  635.318907
19                 sec_cos  430.459783
18                 sec_sin  426.589768
17                     sec  408.136335
21                 last_ts  322.308488
23                last_ts2  228.533105
51  user_name_ts_diff_skew  175.454799
12         bus_system_code  148.798247


# 保存结果

In [8]:
# Build the submission file from the averaged test predictions.
sub = pd.read_csv('data/submit_example.csv')

# Average over all fold / seed models. A new name is used (rather than overwriting
# pred_y) so this cell can be re-run without breaking on the second execution.
test_pred = pred_y.mean(axis=1)

# The combined frame was sorted by (user_name, op_ts) during feature engineering,
# so df_test -- and therefore test_pred -- is NOT in submit_example.csv row order.
# Assigning by position would attach predictions to the wrong ids; look each
# prediction up by the 'id' that was attached to the test rows before sorting.
test_ids = df_test['id'].astype(sub['id'].dtype).to_numpy()
pred_by_id = pd.Series(test_pred.to_numpy(), index=test_ids)
assert pred_by_id.index.is_unique, 'duplicate ids in the test set'

sub['ret'] = sub['id'].map(pred_by_id)
assert sub['ret'].notna().all(), 'some submission ids have no prediction'

sub[['id', 'ret']].to_csv('ans/lgb_5272.csv', index=False)