In [1]:
# 3.3 Baseline model
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Load the training and test CSV files from the local dataset directory.
path = './dataset/'
train, test = (
    pd.read_csv(path + csv_name)
    for csv_name in ('security_train.csv', 'security_test.csv')
)

In [3]:
# 3.3.2 Feature engineering
# Count / nunique features: how often each file calls api, tid and index values.
def simple_sts_features(df):
    """Build per-``file_id`` count and distinct-count features.

    For each of the columns ``api``, ``tid`` and ``index`` the number of
    non-null values (``count``) and the number of distinct values
    (``nunique``) is computed within every ``file_id`` group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``file_id``, ``api``, ``tid`` and ``index``.

    Returns
    -------
    pandas.DataFrame
        One row per ``file_id``, sorted by ``file_id``, with the columns
        ``file_id``, ``file_id_api_count``, ``file_id_api_nunique``,
        ``file_id_tid_count``, ``file_id_tid_nunique``,
        ``file_id_index_count`` and ``file_id_index_nunique``.
        Rows whose ``file_id`` is missing are not part of any group and are
        therefore excluded.
    """
    # A single aggregation keeps every statistic aligned with its group key,
    # instead of pasting positional arrays onto a separately sorted frame.
    stats = df.groupby('file_id').agg(
        file_id_api_count=('api', 'count'),
        file_id_api_nunique=('api', 'nunique'),
        file_id_tid_count=('tid', 'count'),
        file_id_tid_nunique=('tid', 'nunique'),
        file_id_index_count=('index', 'count'),
        file_id_index_nunique=('index', 'nunique'),
    )
    # Move the group key back into a regular column and give the result a
    # clean 0..n-1 index.
    return stats.reset_index()

In [5]:
# Mean / min / std / max features
def simple_numerical_sts_features(df):
    """Build per-``file_id`` mean, min, std and max of ``tid`` and ``index``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``file_id``, ``tid`` and ``index``.

    Returns
    -------
    pandas.DataFrame
        One row per ``file_id``, sorted by ``file_id``, with the columns
        ``file_id``, ``file_id_tid_mean``, ``file_id_tid_min``,
        ``file_id_tid_std``, ``file_id_tid_max``, ``file_id_index_mean``,
        ``file_id_index_min``, ``file_id_index_std`` and
        ``file_id_index_max``. The standard deviation is the pandas default
        (sample std), so a group with a single row yields NaN there.
        Rows whose ``file_id`` is missing are not part of any group and are
        therefore excluded.
    """
    # A single aggregation keeps every statistic aligned with its group key,
    # instead of pasting positional arrays onto a separately sorted frame.
    stats = df.groupby('file_id').agg(
        file_id_tid_mean=('tid', 'mean'),
        file_id_tid_min=('tid', 'min'),
        file_id_tid_std=('tid', 'std'),
        file_id_tid_max=('tid', 'max'),
        file_id_index_mean=('index', 'mean'),
        file_id_index_min=('index', 'min'),
        file_id_index_std=('index', 'std'),
        file_id_index_max=('index', 'max'),
    )
    # Move the group key back into a regular column and give the result a
    # clean 0..n-1 index.
    return stats.reset_index()

In [8]:
# Apply the count/nunique feature function (api, tid, index) to the training
# set first and then to the test set.
simple_train_fea1, simple_test_fea1 = map(simple_sts_features, (train, test))

In [7]:
# Apply the mean/min/std/max feature function to the training set first and
# then to the test set.
simple_train_fea2, simple_test_fea2 = map(simple_numerical_sts_features, (train, test))

In [9]:
# 3.3.3 Baseline construction: collect the labels.
# Keep the distinct (file_id, label) pairs of the training set and the distinct
# file_id values of the test set; deduplicating on all selected columns and
# keeping the first occurrence is the drop_duplicates default.
train_label = train[['file_id', 'label']].drop_duplicates()
test_submit = test[['file_id']].drop_duplicates()

In [10]:
# Assemble the modelling tables by left-joining both feature frames on file_id.
train_data = (
    train_label
    .merge(simple_train_fea1, on='file_id', how='left')
    .merge(simple_train_fea2, on='file_id', how='left')
)

# NOTE: test_submit is rebuilt from itself, so re-running this cell without
# first re-creating test_submit merges the feature columns a second time.
test_submit = (
    test_submit
    .merge(simple_test_fea1, on='file_id', how='left')
    .merge(simple_test_fea2, on='file_id', how='left')
)

In [12]:
# Competition evaluation metric
def lgb_logloss(preds, data):
    """Custom LightGBM ``feval``: mean over samples of the summed one-vs-rest log loss.

    For every sample the loss adds ``log(p)`` for the class equal to the label
    and ``log(1 - p)`` for every other class; the result is negated and
    averaged over the samples.

    Parameters
    ----------
    preds : array-like
        Predicted class probabilities. Either a flat 1-D array grouped by
        class first (``preds[j * n_samples + i]`` is sample ``i`` / class
        ``j``), or a 2-D array of shape ``(n_samples, n_classes)``.
    data : object
        Evaluation dataset; only ``data.get_label()`` is used.

    Returns
    -------
    tuple
        ``('loss is: ', loss_value, False)`` -- metric name, value, and the
        "higher is better" flag.

    Raises
    ------
    ValueError
        If the dataset has no labels, or a flat ``preds`` cannot be split
        evenly into per-class chunks.
    """
    labels_ = np.asarray(data.get_label())
    n_samples = labels_.shape[0]
    if n_samples == 0:
        raise ValueError('lgb_logloss needs at least one labelled sample')

    preds = np.asarray(preds, dtype=np.float64)
    if preds.ndim == 2:
        # (n_samples, n_classes) -> (n_classes, n_samples)
        preds_prob_ = preds.T
    else:
        # The class count comes from the prediction size, not from the labels
        # present in this dataset: a validation fold that lacks some class
        # still receives a probability chunk for it.
        n_classes = preds.size // n_samples
        preds_prob_ = preds.reshape(n_classes, n_samples)

    # Keep probabilities strictly inside (0, 1) so log() never returns -inf.
    eps = 1e-15
    preds_prob_ = np.clip(preds_prob_, eps, 1 - eps)

    # is_true_class[j, i] is True where class j is the label of sample i.
    class_ids = np.arange(preds_prob_.shape[0])[:, None]
    is_true_class = class_ids == labels_[None, :]
    log_terms = np.where(is_true_class,
                         np.log(preds_prob_),
                         np.log(1 - preds_prob_))
    loss = -log_terms.sum() / n_samples
    return 'loss is: ', float(loss), False

In [13]:
# Every column of train_data except the target and the file key is a feature.
train_features = train_data.columns.drop(['label', 'file_id']).tolist()
# NOTE: from here on `train_label` is the name of the target column (a string),
# no longer the label DataFrame that was merged into train_data; re-running
# the merge cell after this one would fail.
train_label = 'label'

In [None]:
# 5-fold cross-validation with a LightGBM multiclass model
from sklearn.model_selection import StratifiedKFold, KFold
params = {
    'task': 'train',
    'objective': 'multiclass',
    'num_leaves': 255,
    'num_class':8,
    'min_data_in_leaf': 50,
    'learning_rate': 0.05,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq':5,
    'max_bin': 128,
    'random_state': 100
}

# Plain (non-stratified) shuffled K-fold with a fixed seed.
folds = KFold(n_splits=5, shuffle=True, random_state=15)

# Not read anywhere in the visible cells; kept in case another cell uses it.
predict_res = 0
# One booster per fold; later cells average their predictions.
models = []

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_data)):
    print('fold n {}'.format(fold_))
    trn_part = train_data.iloc[trn_idx]
    val_part = train_data.iloc[val_idx]
    # At this point `train_label` is the target column name.
    trn_data = lgb.Dataset(trn_part[train_features],
                           label=trn_part[train_label].values)
    val_data = lgb.Dataset(val_part[train_features],
                           label=val_part[train_label].values)

    # Early stopping is requested through the callback API: the
    # `early_stopping_rounds` keyword of lgb.train was removed in LightGBM 4,
    # while the callback is accepted by older releases as well.
    clf = lgb.train(params,
                    trn_data,
                    num_boost_round=2000,
                    valid_sets=[trn_data, val_data],
                    callbacks=[lgb.early_stopping(stopping_rounds=100)],
                    feval=lgb_logloss)
    models.append(clf)

In [None]:
# 3.3.4 Feature importance analysis
# Average the importance over every fold's booster in `models`, so the chart
# describes the ensemble used for prediction rather than only the model the
# training loop variable happened to end on.
feature_importance = pd.DataFrame()
feature_importance['fea_name'] = train_features
feature_importance['fea_imp'] = np.mean(
    [fold_model.feature_importance() for fold_model in models], axis=0)
feature_importance = feature_importance.sort_values('fea_imp', ascending=False)

fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x=feature_importance['fea_name'],
            y=feature_importance['fea_imp'],
            ax=ax)
ax.set_title('Mean feature importance across folds')
ax.set_xlabel('feature')
ax.set_ylabel('importance')
# The feature names are long; rotate the tick labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

In [None]:
# 3.3.5 Model testing
# Average the class probabilities of all fold models and write the submission.
fold = len(models)  # number of trained boosters, not a hard-coded constant
if fold == 0:
    raise ValueError('no trained models available; run the training cell first')

test_features = test_submit[train_features]
pred_res = 0
for model in models:
    pred_res += model.predict(test_features) * 1.0 / fold

# One probability column per predicted class: prob0, prob1, ...
prob_cols = ['prob{}'.format(class_id) for class_id in range(pred_res.shape[1])]
# Assign column by column so that every probability column is created
# explicitly instead of relying on a block assignment to missing columns.
for class_id, prob_col in enumerate(prob_cols):
    test_submit[prob_col] = pred_res[:, class_id]

test_submit[['file_id'] + prob_cols].to_csv('baseline.csv', index=False)