In [2]:
import matplotlib_inline.backend_inline
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
import joblib
import lightgbm as lgb
import warnings
import json
from IPython import display

# 构建训练函数

In [3]:
def lgb_model(data_, test_, y_):
    
    # 定义重要性数组，oof数组与预测数组   
    df_importance_list = []
    oof_preds = np.zeros(data_.shape[0])
    sub_preds = np.zeros(test_.shape[0])
    feature_importance_df = pd.DataFrame()
    
    # 进行n折交叉验证划分数据集    
    folds_ = StratifiedKFold(n_splits=20, shuffle=True, random_state=1983)
    for n_fold, (trn_idx, val_idx) in enumerate(folds_.split(data_, y_)):
        trn_x, trn_y = data_.iloc[trn_idx], y_.iloc[trn_idx]
        val_x, val_y = data_.iloc[val_idx], y_.iloc[val_idx]
        
        # 分类特征        
        cat_feats = ['auth_type', 'bus_system_code', 'op_target', 'browser_type', 'action']
        clf = lgb.LGBMClassifier(objective='binary',
                                 boosting_type='gbdt',
                                 tree_learner='serial',
                                 num_leaves=2 ** 8,
                                 max_depth=16,
                                 learning_rate=0.2,
                                 n_estimators=10000,
                                 subsample=0.75,
                                 feature_fraction=0.55,
                                 reg_alpha=0.2,
                                 reg_lambda=0.2,
                                 random_state=1983,
                                 is_unbalance=True,
                                 metric='auc',
                                 device='gpu',
                                 gpu_platform_id=0,
                                 gpu_device_id=0,
                                 )
        
        # 训练模型
        clf.fit(trn_x, trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)], categorical_feature=cat_feats,
                eval_metric='auc', verbose=100, early_stopping_rounds=40  # 30
                )

        #特征缩放
        vfunc = np.vectorize(lambda x: (x - minmin) / (maxmax - minmin))

        # 记录训练结果
        oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
        minmin = min(oof_preds[val_idx])
        maxmax = max(oof_preds[val_idx])
        oof_preds[val_idx] = vfunc(oof_preds[val_idx])
        
        # 预测
        sub_preds += clf.predict_proba(test_, num_iteration=clf.best_iteration_)[:, 1] / folds_.n_splits
        minmin = min(sub_preds)
        maxmax = max(sub_preds)
        sub_preds = vfunc(sub_preds)
        
        # 计算特征重要性
        df_importance = pd.DataFrame({
            'column': feature_names,
            'importance': clf.feature_importances_,
        })
        df_importance_list.append(df_importance)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
        
    score = roc_auc_score(y_, oof_preds)
    print('Full AUC score %.6f' % score)

    df_importance = pd.concat(df_importance_list)
    df_importance = df_importance.groupby(['column'])['importance'].agg(
        'mean').sort_values(ascending=False).reset_index()
    print(df_importance)

    return oof_preds, sub_preds

# 导入数据

In [4]:
train = pd.read_csv('data/train_dataset.csv', sep='\t')
print(train.shape)
test = pd.read_csv('data/test_dataset.csv', sep='\t')
print(test.shape)
data = pd.concat([train, test])
print(data.shape)

(15016, 19)
(10000, 18)
(25016, 19)


# 细分特征

In [5]:
# 细化location
data['location_first_lvl'] = data['location'].astype(str).apply(lambda x: json.loads(x)['first_lvl'])
data['location_sec_lvl'] = data['location'].astype(str).apply(lambda x: json.loads(x)['sec_lvl'])
data['location_third_lvl'] = data['location'].astype(str).apply(lambda x: json.loads(x)['third_lvl'])

# 丢弃两个特征
data.drop(['client_type', 'browser_source'], axis=1, inplace=True)
# 填空白
data['auth_type'].fillna('__NaN__', inplace=True)

for col in tqdm(['user_name', 'action', 'auth_type', 'ip',
                 'ip_location_type_keyword', 'ip_risk_level', 'location', 'device_model',
                 'os_type', 'os_version', 'browser_type', 'browser_version',
                 'bus_system_code', 'op_target', 'location_first_lvl', 'location_sec_lvl',
                 'location_third_lvl']):
    lbl = LabelEncoder()
    data[col] = lbl.fit_transform(data[col])

data['op_date'] = pd.to_datetime(data['op_date'])
data['op_ts'] = data["op_date"].values.astype(np.int64) // 10 ** 9
data = data.sort_values(by=['user_name', 'op_ts']).reset_index(drop=True)
# 每个人本次与上次操作时间差
data['last_ts'] = data.groupby(['user_name'])['op_ts'].shift(1)
data['ts_diff1'] = data['op_ts'] - data['last_ts']

# 生成user_ip_unique\user_location_unique....特征构建
for f in ['ip', 'location', 'device_model', 'os_version', 'browser_version']:
    data[f'user_{f}_nunique'] = data.groupby(['user_name'])[f].transform('nunique')

for method in ['mean', 'max', 'min', 'std', 'sum', 'median', 'prod']:
    for col in ['user_name', 'ip', 'location', 'device_model', 'os_version', 'browser_version']:
        data[f'ts_diff1_{method}_' + str(col)] = data.groupby(col)['ts_diff1'].transform(method)

# 通过标签是否为空将训练集与样本集分开
train = data[data['risk_label'].notna()]
test = data[data['risk_label'].isna()]

print(train.shape, test.shape)
# 'last_ts'   ts_diff1_std_os_version
ycol = 'risk_label'

# 构件训练特征
feature_names = ['last_ts', 'op_ts', 'ts_diff1', 'ts_diff1_mean_user_name', 'ts_diff1_sum_user_name',
                 'browser_version', 'ts_diff1_max_user_name', 'ts_diff1_max_browser_version',
                 'ts_diff1_mean_browser_version', 'ts_diff1_sum_browser_version', 'user_name',
                 'ts_diff1_std_browser_version', 'ts_diff1_std_user_name', 'bus_system_code', 'ts_diff1_mean_ip',
                 'auth_type', 'location', 'ip', 'action', 'op_target', 'device_model', 'browser_type']

  0%|          | 0/17 [00:00<?, ?it/s]

(15016, 70) (10000, 70)


# 分离训练集和测试集

In [6]:
x_train = train[feature_names]
y_train = train['risk_label']
x_test = test[feature_names]

# 调用训练函数进行模型的训练

In [7]:
lgb_train, lgb_test = lgb_model(x_train, x_test, y_train)



New categorical_feature is ['action', 'auth_type', 'browser_type', 'bus_system_code', 'op_target']


Fold  1 AUC : 0.540310


New categorical_feature is ['action', 'auth_type', 'browser_type', 'bus_system_code', 'op_target']


Fold  2 AUC : 0.518781


New categorical_feature is ['action', 'auth_type', 'browser_type', 'bus_system_code', 'op_target']


Fold  3 AUC : 0.508109


New categorical_feature is ['action', 'auth_type', 'browser_type', 'bus_system_code', 'op_target']


Fold  4 AUC : 0.514360


New categorical_feature is ['action', 'auth_type', 'browser_type', 'bus_system_code', 'op_target']


Fold  5 AUC : 0.556646


New categorical_feature is ['action', 'auth_type', 'browser_type', 'bus_system_code', 'op_target']


Fold  6 AUC : 0.513273


New categorical_feature is ['action', 'auth_type', 'browser_type', 'bus_system_code', 'op_target']


Fold  7 AUC : 0.499803


New categorical_feature is ['action', 'auth_type', 'browser_type', 'bus_system_code', 'op_target']


Fold  8 AUC : 0.519946


New categorical_feature is ['action', 'auth_type', 'browser_type', 'bus_system_code', 'op_target']


[100]	training's auc: 0.978503	valid_1's auc: 0.495923
Fold  9 AUC : 0.499944


New categorical_feature is ['action', 'auth_type', 'browser_type', 'bus_system_code', 'op_target']


Fold 10 AUC : 0.497939


New categorical_feature is ['action', 'auth_type', 'browser_type', 'bus_system_code', 'op_target']


Fold 11 AUC : 0.521686


New categorical_feature is ['action', 'auth_type', 'browser_type', 'bus_system_code', 'op_target']


Fold 12 AUC : 0.542348


New categorical_feature is ['action', 'auth_type', 'browser_type', 'bus_system_code', 'op_target']


Fold 13 AUC : 0.528067


New categorical_feature is ['action', 'auth_type', 'browser_type', 'bus_system_code', 'op_target']


Fold 14 AUC : 0.574233


New categorical_feature is ['action', 'auth_type', 'browser_type', 'bus_system_code', 'op_target']


Fold 15 AUC : 0.530747


New categorical_feature is ['action', 'auth_type', 'browser_type', 'bus_system_code', 'op_target']


Fold 16 AUC : 0.536007


New categorical_feature is ['action', 'auth_type', 'browser_type', 'bus_system_code', 'op_target']


[100]	training's auc: 0.983199	valid_1's auc: 0.524893
Fold 17 AUC : 0.528841


New categorical_feature is ['action', 'auth_type', 'browser_type', 'bus_system_code', 'op_target']


Fold 18 AUC : 0.531628


New categorical_feature is ['action', 'auth_type', 'browser_type', 'bus_system_code', 'op_target']


Fold 19 AUC : 0.527747


New categorical_feature is ['action', 'auth_type', 'browser_type', 'bus_system_code', 'op_target']


Fold 20 AUC : 0.509403
Full AUC score 0.523531
                           column  importance
0                         last_ts     1459.35
1                           op_ts     1232.30
2                        ts_diff1      847.80
3         ts_diff1_mean_user_name      197.10
4          ts_diff1_sum_user_name      178.55
5    ts_diff1_sum_browser_version      163.15
6          ts_diff1_max_user_name      153.15
7                       user_name      127.75
8                 browser_version      121.00
9    ts_diff1_std_browser_version      105.90
10   ts_diff1_max_browser_version       98.45
11  ts_diff1_mean_browser_version       94.80
12                       location       72.85
13                bus_system_code       71.85
14         ts_diff1_std_user_name       69.35
15               ts_diff1_mean_ip       66.00
16                      auth_type       55.80
17                             ip       54.65
18                         action       51.70
19                   device_model

# 将结果保存到文件中

In [8]:
submit = pd.DataFrame([])
submit['id'] = range(1, 10001)
submit['ret'] = lgb_test
submit.to_csv('ans/submit28.csv', index=False)