In [50]:
!pip install pandas numpy lightgbm sklearn matplotlib gensim

[0m

In [51]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from matplotlib.pyplot import plot, show
from sklearn.metrics import roc_auc_score
import json
from gensim.models.word2vec import Word2Vec
import os 
import warnings
warnings.filterwarnings('ignore')

In [52]:
# Load the sample submission plus the tab-separated train/test tables.
sub = pd.read_csv('data/submit_example.csv')
df_train = pd.read_csv('data/train_dataset.csv', sep='\t')
df_test = pd.read_csv('data/test_dataset.csv', sep='\t')

# Attach the submission ids to the test rows (index-aligned), then stack
# train on top of test so features can be engineered on both at once.
df_test = df_test.assign(id=sub['id'])
df = pd.concat(objs=[df_train, df_test], axis=0)

In [53]:
# Preview the first rows of the combined train+test frame.
df.head()

Unnamed: 0,session_id,op_date,user_name,action,auth_type,ip,ip_location_type_keyword,ip_risk_level,location,client_type,browser_source,device_model,os_type,os_version,browser_type,browser_version,bus_system_code,op_target,risk_label,id
0,access:test_d:20180101111639:bBp1,2018/1/1 11:16,test_d,login,otp,192.168.100.101,内网,1级,"{""first_lvl"":""成都分公司"",""sec_lvl"":""9楼"",""third_lvl...",web,desktop,think_pad_e460,windows,windows 10,chrome,chrome 90,coremail,management,0.0,
1,access:test_d:20180101121524:OBSg,2018/1/1 12:15,test_d,login,qr,192.168.100.101,内网,1级,"{""first_lvl"":""成都分公司"",""sec_lvl"":""9楼"",""third_lvl...",web,desktop,think_pad_e460,windows,windows 10,edge,edge 93,order-mgnt,sales,0.0,
2,access:test_d:20180101151333:BpQN,2018/1/1 15:13,test_d,login,qr,192.168.100.101,内网,1级,"{""first_lvl"":""成都分公司"",""sec_lvl"":""9楼"",""third_lvl...",web,desktop,think_pad_e460,windows,windows 10,chrome,chrome 90,order-mgnt,sales,0.0,
3,access:test_d:20180101124502:hYQm,2018/1/1 12:45,test_d,sso,,192.168.100.101,内网,1级,"{""first_lvl"":""成都分公司"",""sec_lvl"":""9楼"",""third_lvl...",web,desktop,think_pad_e460,windows,windows 10,edge,edge 93,oa,management,0.0,
4,access:test_d:20180101202749:FkDK,2018/1/1 20:27,test_d,sso,,192.168.100.101,内网,1级,"{""first_lvl"":""成都分公司"",""sec_lvl"":""9楼"",""third_lvl...",web,desktop,think_pad_e460,windows,windows 10,edge,edge 93,order-mgnt,sales,0.0,


In [54]:
# Parse the fields out of the JSON-encoded `location` column.
# Each row is decoded once and the resulting dict is reused for all three
# levels (the previous version re-parsed the same JSON string three times).
location_parsed = df['location'].astype(str).map(json.loads)

for level in ('first_lvl', 'sec_lvl', 'third_lvl'):
    # `key=level` binds the current level at definition time.
    df['location_' + level] = location_parsed.map(lambda rec, key=level: rec[key])

In [55]:
# Column count and column names after adding the parsed location fields.
df.shape[1], df.columns

(23,
 Index(['session_id', 'op_date', 'user_name', 'action', 'auth_type', 'ip',
        'ip_location_type_keyword', 'ip_risk_level', 'location', 'client_type',
        'browser_source', 'device_model', 'os_type', 'os_version',
        'browser_type', 'browser_version', 'bus_system_code', 'op_target',
        'risk_label', 'id', 'location_first_lvl', 'location_sec_lvl',
        'location_third_lvl'],
       dtype='object'))

In [56]:
# Base feature list: the raw categorical columns plus the parsed location levels.
# Later cells append engineered features to this same list.
feats = (
    ['user_name', 'action', 'auth_type']
    + ['ip_location_type_keyword', 'ip_risk_level', 'ip', 'location']
    + ['device_model', 'os_type', 'os_version']
    + ['browser_type', 'browser_version']
    + ['bus_system_code', 'op_target']
    + ['location_first_lvl', 'location_sec_lvl', 'location_third_lvl']
)

len(feats)

17

In [57]:
# Name of the target column; rows where it is missing are treated as test rows later on.
LABEL = 'risk_label'

# Extra columns to label-encode on top of the fixed list (none at the moment).
cat = list()

In [58]:
# Feature engineering

# Two digits taken from session_id (they look like the seconds field of the
# embedded timestamp, judging by the sample rows — TODO confirm), plus a
# cyclical sin/cos encoding over a 60-unit period.
df['sec'] = df['session_id'].map(lambda sid: int(sid[-7:-5]))
for trig_col, trig_fn in (('sec_sin', np.sin), ('sec_cos', np.cos)):
    df[trig_col] = trig_fn(df['sec']/60*2*np.pi)

# Calendar components of the operation time.
df['op_date'] = pd.to_datetime(df['op_date'])
for part in ('hour', 'weekday', 'year', 'month', 'day'):
    df[part] = getattr(df['op_date'].dt, part)

# Convert to a Unix timestamp in seconds
# (assumes the datetime values are stored at nanosecond resolution — TODO confirm).
df['op_ts'] = df["op_date"].values.astype(np.int64) // 10 ** 9

# Sort by user and timestamp so that shift() looks at each user's previous events.
df = df.sort_values(by=['user_name', 'op_ts']).reset_index(drop=True)

# Timestamps of the previous and second-previous event of the same user ...
for lag, suffix in ((1, ''), (2, '2')):
    df['last_ts' + suffix] = df.groupby(['user_name'])['op_ts'].shift(lag)
# ... and the elapsed time since each of them.
for suffix in ('', '2'):
    df['ts_diff' + suffix] = df['op_ts'] - df['last_ts' + suffix]

In [59]:
# Register the time-derived columns as model features.
feats.extend([
    'sec', 'sec_sin', 'sec_cos',
    'op_ts', 'last_ts', 'ts_diff',
    'last_ts2', 'ts_diff2',
])

In [60]:
# Distinct auth_type values (missing values included).
df['auth_type'].drop_duplicates()

0     otp
1     pwd
2      qr
3     NaN
16    sms
Name: auth_type, dtype: object

In [61]:
for name in ['auth_type']:
    # String copy of the column, so missing values become the token 'nan'.
    filled_col = name + '_fillna'
    df[filled_col] = df[name].astype('str')

    # One "sentence" per (user, calendar day): the list of that user's
    # auth_type tokens on that day (a user can have many events per day).
    daily_groups = df.groupby(['user_name', 'year', 'month', 'day'])[filled_col]
    sentences = daily_groups.agg(list).values
    print(sentences[0])

    # Train word2vec on those sentences and look up each row's token vector.
    vec_size = 6
    w2v_model = Word2Vec(sentences=sentences, vector_size=vec_size, window=12,
                         min_count=1, workers=1)
    row_vectors = df[filled_col].map(lambda token: w2v_model.wv[token])

    # Expand the vectors into one column per embedding dimension.
    emb = pd.DataFrame(list(row_vectors))
    emb.columns = [f'{name}_emb_{i}' for i in range(vec_size)]

    # Column-wise concat; relies on df having a 0..n-1 index at this point.
    df = pd.concat([df, emb], axis=1)
    feats.extend(emb.columns)


['otp', 'pwd', 'qr', 'nan', 'otp', 'nan', 'pwd', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'pwd', 'otp', 'nan', 'sms', 'pwd', 'nan', 'nan', 'nan', 'nan', 'sms', 'otp', 'pwd', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'qr', 'pwd', 'nan', 'nan', 'nan', 'nan', 'qr', 'pwd', 'nan', 'nan', 'sms', 'nan', 'nan', 'otp', 'nan', 'nan', 'nan', 'pwd', 'qr', 'nan', 'nan', 'pwd', 'otp', 'nan', 'pwd', 'qr', 'pwd', 'nan', 'nan', 'nan', 'sms']


In [62]:
# Summary statistics of the inter-event time gap (ts_diff), computed per value
# of each grouping column and broadcast back onto every row.

for stat in ['mean', 'std', 'max', 'min', 'median', 'skew']:
    for key in ['user_name', 'bus_system_code', 'auth_type', 'action']:
        stat_col = f'{key}_ts_diff{stat}'
        df[stat_col] = df.groupby([key])['ts_diff'].transform(stat)
        feats.append(stat_col)

        

In [63]:
# Author's note: within this class the positive-to-negative sample ratio is 4:1.
# Flag rows whose raw location string is exactly this one value.

df['if_out'] = df['location'].eq('{"first_lvl":"成都分公司","sec_lvl":"9楼","third_lvl":"销售部"}')
feats += ['if_out']

In [64]:
# Encode categorical features as integer codes (a fresh encoder per column).

categorical_cols = [
    'user_name', 'action', 'auth_type', 'ip', 'ip_location_type_keyword',
    'ip_risk_level', 'location',
    'device_model', 'os_type', 'os_version', 'browser_type', 'browser_version',
    'bus_system_code', 'op_target',
    'location_first_lvl', 'location_sec_lvl', 'location_third_lvl',
] + cat

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])


In [65]:
# Rows with a label are training data; rows without one are the test set.
is_labeled = df[LABEL].notna()
df_train = df[is_labeled].reset_index(drop=True)
df_test = df[~is_labeled].reset_index(drop=True)

In [66]:
# LightGBM training parameters passed to lgb.train in the cross-validation cell.
params = dict(
    learning_rate=0.06,
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    verbose=-1,
    seed=2222,
    n_jobs=-1,
)


# Show the final feature list and the (rows, features) shape of each split.
print(feats)
print(*(frame[feats].shape for frame in (df_train, df_test)))

['user_name', 'action', 'auth_type', 'ip_location_type_keyword', 'ip_risk_level', 'ip', 'location', 'device_model', 'os_type', 'os_version', 'browser_type', 'browser_version', 'bus_system_code', 'op_target', 'location_first_lvl', 'location_sec_lvl', 'location_third_lvl', 'sec', 'sec_sin', 'sec_cos', 'op_ts', 'last_ts', 'ts_diff', 'last_ts2', 'ts_diff2', 'auth_type_emb_0', 'auth_type_emb_1', 'auth_type_emb_2', 'auth_type_emb_3', 'auth_type_emb_4', 'auth_type_emb_5', 'user_name_ts_diffmean', 'bus_system_code_ts_diffmean', 'auth_type_ts_diffmean', 'action_ts_diffmean', 'user_name_ts_diffstd', 'bus_system_code_ts_diffstd', 'auth_type_ts_diffstd', 'action_ts_diffstd', 'user_name_ts_diffmax', 'bus_system_code_ts_diffmax', 'auth_type_ts_diffmax', 'action_ts_diffmax', 'user_name_ts_diffmin', 'bus_system_code_ts_diffmin', 'auth_type_ts_diffmin', 'action_ts_diffmin', 'user_name_ts_diffmedian', 'bus_system_code_ts_diffmedian', 'auth_type_ts_diffmedian', 'action_ts_diffmedian', 'user_name_ts_diffs

In [67]:
seeds = [2021]
oof = np.zeros(len(df_train))   # out-of-fold predictions, averaged over seeds
importance = 0                  # gain importance, averaged over every trained model
fold_num = 10
pred_y = pd.DataFrame()         # one column of test predictions per (fold, seed)

# Cross-validation: one stratified K-fold pass per seed.
for seed in seeds:
    print('############################', seed)
    kf = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=seed)
    for fold, (train_idx, val_idx) in enumerate(kf.split(df_train[feats], df_train[LABEL])):
        print('-----------', fold)
        train = lgb.Dataset(df_train.loc[train_idx, feats],
                            df_train.loc[train_idx, LABEL])
        val = lgb.Dataset(df_train.loc[val_idx, feats],
                          df_train.loc[val_idx, LABEL])
        # Early stopping and periodic logging are passed as callbacks: the
        # `early_stopping_rounds` / `verbose_eval` keyword arguments are no
        # longer accepted by lgb.train in LightGBM 4.x.
        model = lgb.train(params, train, valid_sets=[val], num_boost_round=10000,
                          callbacks=[lgb.early_stopping(stopping_rounds=100),
                                     lgb.log_evaluation(period=100)])

        oof[val_idx] += model.predict(df_train.loc[val_idx, feats]) / len(seeds)
        pred_y['fold_%d_seed_%d'%(fold, seed)] = model.predict(df_test[feats])
        # Divide by the total number of models so the average stays correct
        # when more than one seed is used.
        importance += model.feature_importance(importance_type='gain') / (fold_num * len(seeds))

############################ 2021
----------- 0
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.539889
[200]	valid_0's auc: 0.534839
Early stopping, best iteration is:
[112]	valid_0's auc: 0.543007
----------- 1
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.532442
Early stopping, best iteration is:
[60]	valid_0's auc: 0.538341
----------- 2
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.508189
Early stopping, best iteration is:
[11]	valid_0's auc: 0.542748
----------- 3
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.506773
[200]	valid_0's auc: 0.508354
Early stopping, best iteration is:
[132]	valid_0's auc: 0.514964
----------- 4
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.499504
Early stopping, best iteration is:
[1]	valid_0's auc: 0.534653
----------- 5
Training until validation scores don't im

In [68]:
# Per-fold test-set predictions: one column per (fold, seed) model.
pred_y

Unnamed: 0,fold_0_seed_2021,fold_1_seed_2021,fold_2_seed_2021,fold_3_seed_2021,fold_4_seed_2021,fold_5_seed_2021,fold_6_seed_2021,fold_7_seed_2021,fold_8_seed_2021,fold_9_seed_2021
0,0.214038,0.128706,0.187883,0.197143,0.195161,0.125202,0.212547,0.138900,0.160376,0.194740
1,0.185360,0.245288,0.186160,0.315464,0.195161,0.320479,0.194946,0.222511,0.280848,0.194740
2,0.212769,0.197814,0.192663,0.156851,0.196120,0.103962,0.197862,0.179805,0.148155,0.196693
3,0.320877,0.212664,0.197902,0.345416,0.195161,0.279643,0.212547,0.226014,0.302799,0.194740
4,0.248650,0.163068,0.187824,0.193006,0.195161,0.250472,0.194946,0.205050,0.258180,0.194740
...,...,...,...,...,...,...,...,...,...,...
9995,0.191282,0.202966,0.215640,0.190242,0.200812,0.272564,0.197862,0.201215,0.237589,0.202658
9996,0.287578,0.218022,0.216185,0.193700,0.195161,0.298699,0.212547,0.275335,0.156865,0.194740
9997,0.324800,0.226623,0.221910,0.271162,0.200812,0.283602,0.197862,0.263231,0.284173,0.209497
9998,0.276765,0.205150,0.221910,0.215708,0.195161,0.326603,0.194946,0.276279,0.277403,0.194740


In [69]:
# Out-of-fold ROC AUC of the model
df_train['oof'] = oof
score = roc_auc_score(df_train[LABEL], df_train['oof'])
print(score)

# Feature importance (top 10 by average gain)
feats_importance = pd.DataFrame()
feats_importance['name'] = feats
feats_importance['importance'] = importance
print(feats_importance.sort_values('importance', ascending=False)[:10])

sub = pd.read_csv('data/submit_example.csv')

# Average the per-fold predictions
pred_y = pred_y.mean(axis=1)

# Align predictions with the submission by `id`, not by row position: the
# combined frame was re-sorted by user and time earlier in the notebook, so
# df_test (and therefore pred_y) is no longer in the submission file's row order.
# Assumes ids are unique; Series.map raises on a duplicated lookup index.
test_ids = df_test['id'].astype(sub['id'].dtype)
ret_by_id = pd.Series(pred_y.to_numpy(), index=test_ids.to_numpy())
sub['ret'] = sub['id'].map(ret_by_id)
if sub['ret'].isna().any():
    raise ValueError('some submission ids have no matching test prediction')

# Make sure the output directory exists before writing the submission.
os.makedirs('./results', exist_ok=True)
sub[['id', 'ret']].to_csv('./results/lgb_5272.csv', index=False)

0.5259922841279606
                     name   importance
24               ts_diff2  1252.395027
22                ts_diff  1151.849817
20                  op_ts   945.572552
19                sec_cos   668.319933
17                    sec   663.585258
18                sec_sin   661.357806
21                last_ts   530.540512
23               last_ts2   411.898174
51  user_name_ts_diffskew   295.957002
0               user_name   244.324481
