# 任务目标--构建baseline

# 调包区


In [1]:
from tqdm.notebook import tqdm
import random
import pandas as pd
import numpy as np
import math
from matplotlib import pyplot as plt
import gc

# 警告忽略
import warnings
warnings.filterwarnings("ignore")

# matplotlib字体设置
plt.rcParams["font.family"] = "Songti SC"
plt.rcParams["axes.unicode_minus"] = False

# matplotlib警告忽略
pd.plotting.register_matplotlib_converters()


# 观看Dataframe长度
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)
# 浮点数位长度
pd.set_option('display.precision',5)

# 显示多个结果
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all' # ['all', 'last', 'last_expr', 'none', 'last_expr_or_assign']

In [2]:
from sklearn.preprocessing import StandardScaler,LabelEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold,KFold
from sklearn.naive_bayes import BernoulliNB,GaussianNB,MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from lightgbm.sklearn import LGBMClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score,roc_auc_score,auc,f1_score
from sklearn.metrics import mean_squared_error,mean_absolute_error,mean_squared_log_error
from sklearn.metrics import classification_report

In [3]:
import keras
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D , GlobalAveragePooling1D,MaxPool1D
from keras import Model
from keras.models import Sequential
from keras.optimizers import Adam,SGD # 优化方法
from keras.callbacks import EarlyStopping,ModelCheckpoint,RemoteMonitor,CSVLogger
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [4]:
def lgb_logloss(preds,data):
    labels_ = data.get_label()             
    classes_ = np.unique(labels_) 
    preds_prob = []
    for i in range(len(classes_)):
        preds_prob.append(preds[i*len(labels_):(i+1) * len(labels_)] )
        
    preds_prob_ = np.vstack(preds_prob) 
    
    loss = []
    for i in range(preds_prob_.shape[1]):     # 样本个数
        sum_ = 0
        for j in range(preds_prob_.shape[0]): #类别个数
            pred = preds_prob_[j,i]           # 第i个样本预测为第j类的概率
            if  j == labels_[i]:
                sum_ += np.log(pred)
            else:
                sum_ += np.log(1 - pred)
        loss.append(sum_)       
    return 'loss is: ',-1 * (np.sum(loss) / preds_prob_.shape[1]),False

# 打开文件

In [5]:
#path = './sampledata' # 打开样本数据
path = './fulldata' # 打开全量数据

train_data = pd.read_csv(f'{path}/security_train.csv')
test_data = pd.read_csv(f'{path}/security_test.csv')
# train_data = pd.read_pickle(f'{path}/security_train.pkl')
# test_data = pd.read_pickle(f'{path}/security_test.pkl')

# 数据处理

## 进行标签编码

In [6]:
%%time
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df_all = pd.concat([train_data,test_data])
df_all['api'] = le.fit_transform(df_all['api'])

CPU times: user 33.5 s, sys: 14 s, total: 47.4 s
Wall time: 51.4 s


In [7]:
%%time
# 设置训练集api
train_data['api']  = df_all[df_all['label'].notnull()]['api']
# 设置测试集api
test_data['api']  = df_all[df_all['label'].isnull()]['api']

del df_all
gc.collect()

CPU times: user 13 s, sys: 32.5 s, total: 45.5 s
Wall time: 56.7 s


40

## 提取特征

In [8]:
# def get_feature(df):
#     # 按照fild_id进行分组，提取特征
#     df_file = df.groupby('file_id')
#     if 'label' in df.columns:
#         # 需要保留两列
#         df1 = df.drop_duplicates(subset=['file_id','label'],keep='first')
#         df1 = df1.sort_values('file_id')
#     else:
#         df1 = df.drop_duplicates(subset=['file_id'],keep='first')
    
#     # 提取多个特征，统计特征
#     features = ['api','tid','index']
    
#     for f in features:
#         df1[f+'_count'] = df_file[f].count().values
#         df1[f+'_nunique'] = df_file[f].nunique().values
#         df1[f+'_min'] = df_file[f].min().values
#         df1[f+'_max'] = df_file[f].max().values
#         df1[f+'_mean'] = df_file[f].mean().values
#         df1[f+'_median'] = df_file[f].median().values
#         df1[f+'_std'] = df_file[f].std().values
        
#     return df1

In [9]:
def get_feature(df):
    # 按照fild_id进行分组，提取特征
    df_file = df.groupby('file_id')
    if 'label' in df.columns:
        # 需要保留两列
        df1 = df.drop_duplicates(subset=['file_id','label'],keep='first')
        df1 = df1.sort_values('file_id')
    else:
        df1 = df.drop_duplicates(subset=['file_id'],keep='first')
    
    # 提取多个特征，统计特征
    stats_dict = {
                'api': ['count', 'nunique', 'min', 'max', 'mean', 'median','std'],
                'tid': ['count', 'nunique', 'min', 'max', 'mean', 'median','std'],
                'index': ['count', 'nunique', 'min', 'max', 'mean', 'median','std']
    }
        
  
    stats_df = df_file.agg(stats_dict)
    stats_df.columns = ["_".join(tup) for tup in stats_df.columns]
    stats_df = stats_df.reset_index()
    
    df1 = df1.merge(stats_df,on='file_id',how='left')
        
    return df1

In [10]:
%%time
# 对训练集 构建新特征
df_train = get_feature(train_data)
df_test = get_feature(test_data)

CPU times: user 1min 24s, sys: 29.3 s, total: 1min 53s
Wall time: 1min 57s


# 建模

## 使用LGB进行建模

In [11]:
import lightgbm as lgb

clf = lgb.LGBMClassifier(
            num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='multiclass',
            max_depth=-1, learning_rate=0.005, min_child_samples=3, random_state=2021,
            n_estimators=2000, subsample=1, colsample_bytree=1)



# 模型训练
clf.fit(df_train.drop('label',axis=1),df_train['label'])




LGBMClassifier(colsample_bytree=1, learning_rate=0.005, min_child_samples=3,
               n_estimators=2000, objective='multiclass', random_state=2021,
               reg_alpha=0.25, reg_lambda=0.25, subsample=1)

In [12]:
# 模型预测
result = clf.predict_proba(df_test)

result = pd.DataFrame(result,columns=['prob0', 'prob1', 'prob2', 'prob3', 'prob4', 'prob5', 'prob6', 'prob7'])

result['file_id'] = df_test['file_id'].values

columns = ['file_id','prob0', 'prob1', 'prob2', 'prob3', 'prob4', 'prob5', 'prob6', 'prob7']
result.to_csv('./outdata/lgb_t.csv',index=False,columns=columns)


## 使用XGB进行建模

In [13]:
import xgboost as xgb

clf = xgb.XGBClassifier(
            max_depth=9, learning_rate=0.005, n_estimators=2000, 
            objective='multi:softprob', 
            #tree_method='gpu_hist', 
            subsample=0.8, colsample_bytree=0.8, 
            min_child_samples=3, eval_metric='logloss', reg_lambda=0.5)


# 模型训练
clf.fit(df_train.drop('label',axis=1),df_train['label'])



Parameters: { min_child_samples } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.005, max_delta_step=0,
              max_depth=9, min_child_samples=3, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=2000, n_jobs=0,
              num_parallel_tree=1, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=0.5, scale_pos_weight=None, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [14]:
# 模型预测
result = clf.predict_proba(df_test)

result = pd.DataFrame(result,columns=['prob0', 'prob1', 'prob2', 'prob3', 'prob4', 'prob5', 'prob6', 'prob7'])

result['file_id'] = df_test['file_id'].values

columns = ['file_id','prob0', 'prob1', 'prob2', 'prob3', 'prob4', 'prob5', 'prob6', 'prob7']
result.to_csv('./outdata/xgb_t.csv',index=False,columns=columns)