In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from matplotlib.pyplot import plot, show
from sklearn.metrics import roc_auc_score
import json
from gensim.models.word2vec import Word2Vec

# 1.导入数据

In [2]:
# Read the sample submission first: its `id` column is attached to the test rows.
sub = pd.read_csv('data/submit_example.csv')
# Train and test tables are tab-separated.
df_train = pd.read_csv('data/train_dataset.csv', sep='\t')
# `assign` aligns on the row index, exactly like a plain column assignment.
df_test = pd.read_csv('data/test_dataset.csv', sep='\t').assign(id=sub['id'])
# Stack train and test so the feature engineering below runs on both at once.
df = pd.concat((df_train, df_test), axis=0)

# 2.细化特征

In [3]:
# Split the JSON-encoded `location` column into its three levels.
# Each string is parsed once and the parsed dicts are reused for every level,
# instead of running json.loads three times per row.
location_parsed = df['location'].astype(str).apply(json.loads)
for level in ('first_lvl', 'sec_lvl', 'third_lvl'):
    # `key=level` binds the current level at definition time; a missing key
    # still raises KeyError, as a direct dict lookup would.
    df['location_' + level] = location_parsed.apply(lambda loc, key=level: loc[key])

# Build the base list of training features: the raw columns first, followed by
# the three location-level columns.
raw_feature_cols = [
    'user_name',
    'action',
    'auth_type',
    'ip_location_type_keyword',
    'ip_risk_level',
    'ip',
    'location',
    'device_model',
    'os_type',
    'os_version',
    'browser_type',
    'browser_version',
    'bus_system_code',
    'op_target',
]
location_level_cols = ['location_first_lvl', 'location_sec_lvl', 'location_third_lvl']
feats = raw_feature_cols + location_level_cols

# Name of the target column.
LABEL = 'risk_label'

# Fine-grained time features.
# Characters [-7:-5] of session_id are parsed as an integer and stored as `sec`.
# NOTE(review): presumably the seconds field of a timestamp embedded in the id -- confirm.
df['sec'] = df['session_id'].map(lambda sid: int(sid[-7:-5]))
# Cyclical encoding of `sec` on a 60-unit circle.
sec_angle = df['sec'] / 60 * 2 * np.pi
df['sec_sin'] = np.sin(sec_angle)
df['sec_cos'] = np.cos(sec_angle)

# Calendar components of the operation timestamp.
df['op_date'] = pd.to_datetime(df['op_date'])
for part in ('hour', 'weekday', 'year', 'month', 'day'):
    df[part] = getattr(df['op_date'].dt, part)

# Integer view of the datetimes floor-divided by 10**9 (epoch seconds when the
# datetimes are stored with nanosecond resolution).
df['op_ts'] = df["op_date"].values.astype(np.int64) // 10 ** 9

# Order rows per user chronologically so that shift() yields that user's
# previous operations.
df = df.sort_values(by=['user_name', 'op_ts']).reset_index(drop=True)
user_ts = df.groupby(['user_name'])['op_ts']
prev_ts, prev_ts2 = user_ts.shift(1), user_ts.shift(2)
df['last_ts'] = prev_ts
df['last_ts2'] = prev_ts2
# Time elapsed since the user's previous / second-previous operation.
df['ts_diff'] = df['op_ts'] - df['last_ts']
df['ts_diff2'] = df['op_ts'] - df['last_ts2']

# 'last_ts2' is deliberately left out of the feature list.
feats.extend([
    'sec',
    'sec_sin', 'sec_cos',
    'op_ts', 'last_ts', 'ts_diff',
    'ts_diff2',
])

# 3.词嵌入特征

In [6]:
# Word-embedding features.
# For each listed column: build one "sentence" per (user, year, month, day)
# from the column's values, train a small Word2Vec model on those sentences,
# and attach each row's token vector as `<column>_emb_<i>` features.
vec_size = 6
for name in ['auth_type']:
    token_col = name + '_fillna'
    # astype('str') turns missing values into the literal token 'nan'.
    df[token_col] = df[name].astype('str')
    sent = df.groupby(['user_name', 'year', 'month', 'day'])[token_col].agg(list).values

    w2v_model = Word2Vec(sentences=sent, vector_size=vec_size, window=12, min_count=1, workers=1)

    emb_cols = ['_'.join([name, 'emb', str(i)]) for i in range(vec_size)]
    # Build the embedding frame on df's own index so the column-wise concat
    # cannot misalign rows, whatever index df currently carries.
    emb_frame = pd.DataFrame(
        [w2v_model.wv[token] for token in df[token_col]],
        index=df.index,
        columns=emb_cols,
    )
    # Drop embedding columns left over from an earlier execution of this cell,
    # and only register feature names once, so re-running the cell does not
    # produce duplicate columns or duplicate entries in `feats`.
    df = pd.concat([df.drop(columns=emb_cols, errors='ignore'), emb_frame], axis=1)
    feats += [col for col in emb_cols if col not in feats]

# Print the learned vector of every vocabulary token (from the last model trained above).
for w in w2v_model.wv.key_to_index:
    print(w, w2v_model.wv[w])

# Feature construction.
# Broadcast group-level summary statistics of `ts_diff` back onto every row,
# for each statistic and each grouping key.
ts_diff_stats = ('mean', 'std', 'max', 'min', 'median', 'skew')
ts_diff_group_keys = ('user_name', 'bus_system_code', 'auth_type', 'action')  # 'op_target' is left out
for stat in ts_diff_stats:
    for key in ts_diff_group_keys:
        stat_col = key + '_ts_diff_' + stat
        df[stat_col] = df.groupby([key])['ts_diff'].transform(stat)
        feats.append(stat_col)

# Boolean flag: the raw location string equals this one specific value.
out_location = '{"first_lvl":"成都分公司","sec_lvl":"9楼","third_lvl":"销售部"}'
df['if_out'] = df['location'].eq(out_location)
feats.append('if_out')

nan [-0.13903026  0.03005757  0.25148448  0.9644866  -0.28805935 -0.67375904]
pwd [ 0.03973053  0.1891723   0.13216272  1.0011268  -0.16735148 -0.7792582 ]
sms [-0.12732314  0.05897151  0.15256187  1.0156738  -0.19783318 -0.69883054]
qr [-0.15083304 -0.18690944  0.32928893  1.0530428  -0.14471556 -0.6634975 ]
otp [ 0.10661024 -0.01189896  0.15297233  0.95404774 -0.40966824 -0.6917451 ]


# 4.标签编码

In [4]:
# Label encoding: replace each listed column with the integer codes of a
# LabelEncoder fitted on that column of the combined train+test frame.
label_encoded_cols = [
    'user_name', 'action', 'auth_type', 'ip', 'ip_location_type_keyword',
    'ip_risk_level', 'location', 'device_model', 'os_type', 'os_version',
    'browser_type', 'browser_version', 'bus_system_code', 'op_target',
    'location_first_lvl', 'location_sec_lvl', 'location_third_lvl',
]
for col in label_encoded_cols:
    # A fresh encoder per column; `le` ends up holding the last one fitted.
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# 5.分离训练集和测试集

In [5]:
# Rows with a label form the training set; rows without one form the test set.
has_label = df[LABEL].notna()
df_train = df[has_label].reset_index(drop=True)
df_test = df[~has_label].reset_index(drop=True)

# 6.确定模型参数

In [6]:
# LightGBM training parameters: binary objective evaluated with AUC.
params = dict(
    learning_rate=0.08,
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    verbose=-1,
    seed=2222,
    n_jobs=-1,
)
# Display the feature list that will be fed to the model.
feats

['user_name',
 'action',
 'auth_type',
 'ip_location_type_keyword',
 'ip_risk_level',
 'ip',
 'location',
 'device_model',
 'os_type',
 'os_version',
 'browser_type',
 'browser_version',
 'bus_system_code',
 'op_target',
 'location_first_lvl',
 'location_sec_lvl',
 'location_third_lvl',
 'sec',
 'sec_sin',
 'sec_cos',
 'op_ts',
 'last_ts',
 'ts_diff',
 'ts_diff2']

# 7.使用n折交叉验证划分数据集（此处n取10）训练模型并进行预测

In [8]:
# Stratified K-fold training, repeated once per seed.
#   oof        -- out-of-fold predictions for the training rows, averaged over seeds
#   pred_y     -- one column of test-set predictions per (fold, seed) model
#   importance -- gain importance averaged over every trained model
seeds = [2022]
fold_num = 10
# Every (fold, seed) model contributes equally to the averaged importance, so
# normalise by the total model count rather than by fold_num alone (the two
# only coincide when there is a single seed).
n_models = fold_num * len(seeds)

# Initialise the out-of-fold prediction array.
oof = np.zeros(len(df_train))
importance = 0
pred_y = pd.DataFrame()

# NOTE(review): the `early_stopping_rounds` / `verbose_eval` keywords of
# lgb.train were removed in LightGBM 4.0 in favour of callbacks. Use the
# callbacks when the installed version provides them and fall back to the
# legacy keywords otherwise -- confirm against the installed version.
use_callbacks = hasattr(lgb, 'early_stopping') and hasattr(lgb, 'log_evaluation')

for seed in seeds:
    print('############################', seed)

    # Split the training data into `fold_num` stratified folds.
    kf = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=seed)
    for fold, (train_idx, val_idx) in enumerate(kf.split(df_train[feats], df_train[LABEL])):
        print('-----------', fold)
        train = lgb.Dataset(df_train.loc[train_idx, feats],
                            df_train.loc[train_idx, LABEL])
        val = lgb.Dataset(df_train.loc[val_idx, feats],
                          df_train.loc[val_idx, LABEL])

        if use_callbacks:
            # Fresh callback objects per fold so no early-stopping state is
            # carried over from the previous model.
            stop_kwargs = {'callbacks': [lgb.early_stopping(100), lgb.log_evaluation(100)]}
        else:
            stop_kwargs = {'early_stopping_rounds': 100, 'verbose_eval': 100}
        model = lgb.train(params, train, valid_sets=[val], num_boost_round=10000,
                          **stop_kwargs)

        oof[val_idx] += model.predict(df_train.loc[val_idx, feats]) / len(seeds)
        pred_y['fold_%d_seed_%d' % (fold, seed)] = model.predict(df_test[feats])
        importance += model.feature_importance(importance_type='gain') / n_models

# Overall AUC of the out-of-fold predictions.
df_train['oof'] = oof
score = roc_auc_score(df_train[LABEL], df_train['oof'])
score

############################ 2022
----------- 0
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.515185
Early stopping, best iteration is:
[10]	valid_0's auc: 0.551171
----------- 1
Training until validation scores don't improve for 100 rounds




[100]	valid_0's auc: 0.508991
Early stopping, best iteration is:
[79]	valid_0's auc: 0.517923
----------- 2
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.510712




Early stopping, best iteration is:
[4]	valid_0's auc: 0.515377
----------- 3
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.510535
[200]	valid_0's auc: 0.519352
Early stopping, best iteration is:
[198]	valid_0's auc: 0.520555
----------- 4
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.541088




[200]	valid_0's auc: 0.547385
Early stopping, best iteration is:
[118]	valid_0's auc: 0.549228
----------- 5
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.499745
Early stopping, best iteration is:
[2]	valid_0's auc: 0.516372
----------- 6
Training until validation scores don't improve for 100 rounds




[100]	valid_0's auc: 0.480316
Early stopping, best iteration is:
[2]	valid_0's auc: 0.503763
----------- 7
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.533863
Early stopping, best iteration is:
[6]	valid_0's auc: 0.547277
----------- 8
Training until validation scores don't improve for 100 rounds




[100]	valid_0's auc: 0.487605
Early stopping, best iteration is:
[2]	valid_0's auc: 0.498464
----------- 9
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.497651
Early stopping, best iteration is:
[13]	valid_0's auc: 0.539979




0.5253528672151205

# 8.获取特征重要性

In [9]:
# Pair every feature name with its accumulated gain importance and show the
# ten most important features.
feats_importance = pd.DataFrame({'name': feats, 'importance': importance})
feats_importance.sort_values('importance', ascending=False).head(10)

Unnamed: 0,name,importance
23,ts_diff2,835.041332
20,op_ts,728.293245
22,ts_diff,702.748968
21,last_ts,431.427922
17,sec,409.421355
19,sec_cos,404.674026
18,sec_sin,389.103014
12,bus_system_code,260.092006
0,user_name,253.200059
2,auth_type,201.037838


# 9.保存结果

In [12]:
# Write the submission file: one averaged prediction per id.
sub = pd.read_csv('data/submit_example.csv')
# df was re-sorted by (user_name, op_ts) during feature engineering, so the
# rows of df_test -- and therefore of pred_y -- are no longer in the order of
# the submission file. Key the model-averaged prediction by id and look it up
# for each submission row instead of assigning by position.
# pred_y itself is left untouched so this cell can be re-run safely.
test_pred = pd.Series(
    pred_y.mean(axis=1).values,
    # Cast to the submission's id dtype: ids may have been upcast when the
    # train and test frames were concatenated.
    index=df_test['id'].astype(sub['id'].dtype).values,
)
sub['ret'] = sub['id'].map(test_pred)
assert sub['ret'].notna().all(), 'some submission ids have no prediction'
sub[['id', 'ret']].to_csv('ans/lgb_5285.csv', index=False)