# 任务目标--构建baseline

# 调包区


In [1]:
from tqdm.notebook import tqdm
import random
import pandas as pd
import numpy as np
import math
from matplotlib import pyplot as plt
import gc

# 警告忽略
import warnings
warnings.filterwarnings("ignore")

# matplotlib字体设置
plt.rcParams["font.family"] = "Songti SC"
plt.rcParams["axes.unicode_minus"] = False

# matplotlib警告忽略
pd.plotting.register_matplotlib_converters()


# 观看Dataframe长度
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)
# 浮点数位长度
pd.set_option('display.precision',5)

# 显示多个结果
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all' # ['all', 'last', 'last_expr', 'none', 'last_expr_or_assign']

In [2]:
from sklearn.preprocessing import StandardScaler,LabelEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold,KFold
from sklearn.naive_bayes import BernoulliNB,GaussianNB,MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from lightgbm.sklearn import LGBMClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score,roc_auc_score,auc,f1_score
from sklearn.metrics import mean_squared_error,mean_absolute_error,mean_squared_log_error
from sklearn.metrics import classification_report

In [3]:
import keras
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D , GlobalAveragePooling1D,MaxPool1D
from keras import Model
from keras.models import Sequential
from keras.optimizers import Adam,SGD # 优化方法
from keras.callbacks import EarlyStopping,ModelCheckpoint,RemoteMonitor,CSVLogger
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [4]:
def lgb_logloss(preds,data):
    labels_ = data.get_label()             
    classes_ = np.unique(labels_) 
    preds_prob = []
    for i in range(len(classes_)):
        preds_prob.append(preds[i*len(labels_):(i+1) * len(labels_)] )
        
    preds_prob_ = np.vstack(preds_prob) 
    
    loss = []
    for i in range(preds_prob_.shape[1]):     # 样本个数
        sum_ = 0
        for j in range(preds_prob_.shape[0]): #类别个数
            pred = preds_prob_[j,i]           # 第i个样本预测为第j类的概率
            if  j == labels_[i]:
                sum_ += np.log(pred)
            else:
                sum_ += np.log(1 - pred)
        loss.append(sum_)       
    return 'loss is: ',-1 * (np.sum(loss) / preds_prob_.shape[1]),False

# 打开文件

In [5]:
#path = './sampledata' # 打开样本数据
path = './fulldata' # 打开全量数据

train_data = pd.read_csv(f'{path}/security_train.csv')
test_data = pd.read_csv(f'{path}/security_test.csv')


# 数据处理

## 进行标签编码

In [6]:
# 数据集合并
data = pd.concat([train_data,test_data],axis=0)

# 进行标签编码
le = LabelEncoder()
data['api'] = le.fit_transform(data['api'])

# 数据集拆分
train_data = data[data['label'].notnull()]
test_data = data[data['label'].isnull()]

## 生成特征文档

In [7]:
basic_feature = ['file_id','label','api','tid','index']

train_fe = train_data[basic_feature].drop_duplicates(['file_id','label'])
test_fe = test_data[basic_feature].drop_duplicates(['file_id','label'])
train_fe

Unnamed: 0,file_id,label,api,tid,index
0,1,5.0,135,2488,0
6786,2,2.0,95,2320,0
7602,3,0.0,151,2208,0
8065,4,0.0,95,2284,0
10111,5,0.0,249,2500,0
...,...,...,...,...,...
89620181,13883,2.0,95,100,0
89798402,13884,5.0,95,2592,0
89799721,13885,0.0,151,2240,0
89800754,13886,1.0,95,2324,0


## 构建统计值特征（count,nunique,min、max、sum、mean、median）

In [8]:
# 得到统计值特征（count,nunique,min、max、sum、mean、median）
def creat_stats_feature(tem_df):
    # 以file_id 为单位聚合，统计每一个file_id下的情况
    group_df = tem_df.groupby('file_id')

    # 得到 count,nunique,min、max、sum、mean、median的统计值
    stats_dict = {
        "api": ['count', 'nunique', 'min', 'max', 'mean', 'median','std'], 
        "tid":['count', 'nunique', 'min', 'max', 'mean', 'median','std'], 
        "index":['count', 'nunique', 'min', 'max', 'mean', 'median','std']}

    stats_df = group_df.agg(stats_dict)
    stats_df.columns = ["_".join(tup) for tup in stats_df.columns]
    stats_df = stats_df.reset_index()
    return(stats_df)



# 构建训练集的特征
stats_df = creat_stats_feature(train_data)
train_fe = train_fe.merge(stats_df,on='file_id',how='left')

# 构建测试集的特征
stats_df = creat_stats_feature(test_data)
test_fe = test_fe.merge(stats_df,on='file_id',how='left')

# 回收内存
del stats_df
gc.collect()

36

## 进行标准化

In [9]:
# 数据集合并
data = pd.concat([train_fe,test_fe],axis=0)


# 进行标准化
de_ss_list = ['file_id','label']
ss_list = [i for i in data.columns if i not in de_ss_list]

ss = StandardScaler()
data[ss_list] = ss.fit_transform(data[ss_list])


# 数据集拆分
train_fe = data[data['label'].notnull()]
test_fe = data[data['label'].isnull()]

# 特征工程保存

In [10]:
path = "./temdata"
train_fe.to_pickle(f'{path}/train_fe_baseline.pkl')
test_fe.to_pickle(f'{path}/test_fe_baseline.pkl')

In [11]:
path = "./temdata"
train_fe = pd.read_pickle(f'{path}/train_fe_baseline.pkl')
test_fe = pd.read_pickle(f'{path}/test_fe_baseline.pkl')

# 数据切分

In [12]:
# # 进行数据切分
# file_id_list = train_fe['file_id'].unique()

# train_id,vaild_id = train_test_split(file_id_list,test_size=0.2)
# train_df = train_fe[train_fe['file_id'].isin(train_id)]
# vaild_df = train_fe[train_fe['file_id'].isin(vaild_id)]


# # 得到训练数据集与提交数据集
# train_x,train_y = train_df.drop('label',axis=1),train_df['label']
# vaild_x,vaild_y = vaild_df.drop('label',axis=1),vaild_df['label']
# sub_x,sub_y = test_fe.drop('label',axis=1),test_fe['label']


In [13]:
# 数据划分
raw_x,raw_y = train_fe.drop('label',axis=1),train_fe['label']
sub_x,sub_y = test_fe.drop('label',axis=1),test_fe['label']

# 进行数据切分
train_x,vaild_x,train_y,vaild_y = train_test_split(raw_x,raw_y,test_size=0.2)


# 建模尝试

## 使用lgb进行建模预测

In [14]:
# 参数设置
gpu=False

#############################################

if gpu==True:
    params = {
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0}
else:
    params = {}




#############################################

model = lgb.LGBMClassifier(
            num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='multiclass',
            max_depth=-1, learning_rate=0.005, min_child_samples=3, random_state=2021,
            n_estimators=2000, subsample=1, colsample_bytree=1,
    **params)


#############################################
# 模型训练
# model.fit(
#     train_x, train_y,
#     eval_metric='logloss', eval_set=[(train_x, train_y), (vaild_x, vaild_y)],
#     verbose=100,
#     #早停法，如果auc在10epoch没有进步就stop
#     early_stopping_rounds=1000 )

model.fit(
    raw_x, raw_y)



# 预测
pred_prob = model.predict_proba(sub_x)


#############################################
# 转化为提交文件
# 转化为DataFrame
sub_df = pd.DataFrame(pred_prob)
sub_df.insert(0,"file_id",sub_x['file_id'].reset_index(drop=True))

# 更改列名
sub_df.columns = list(map(lambda x:f"prob{x}" if x!='file_id' else x,sub_df.columns))
sub_df = sub_df.astype('double')
sub_df['file_id'] = sub_df['file_id'].astype(int)

# 保存在本地
path = './outdata'
sub_df.to_csv(f'{path}/lgb_baseline.csv',index=None)

#############################################
# 保存模型
import joblib
# 模型存储
joblib.dump(model, './modelfile/lgb_baseline.pkl')
# 模型加载
# model = joblib.load('lgb.pkl')
print('模型保存完毕！')

LGBMClassifier(colsample_bytree=1, learning_rate=0.005, min_child_samples=3,
               n_estimators=2000, objective='multiclass', random_state=2021,
               reg_alpha=0.25, reg_lambda=0.25, subsample=1)

['./modelfile/lgb_baseline.pkl']

模型保存完毕！


## 使用xgb进行建模预测

In [15]:
# 参数设置
gpu=False

#############################################

if gpu==True:
    params = {'tree_method': 'gpu_hist'}

else:
    params = {}



#############################################

model = xgb.XGBClassifier(
            max_depth=9, learning_rate=0.005, n_estimators=2000, 
            objective='multi:softprob', 
            subsample=0.8, colsample_bytree=0.8, 
            min_child_samples=3, eval_metric='mlogloss', reg_lambda=0.5,**params)


#############################################
# 模型训练
# model.fit(
#     train_x, train_y,
#     eval_metric='mlogloss', eval_set=[(train_x, train_y), (vaild_x, vaild_y)],
#     verbose=100,
#     #早停法，如果auc在10epoch没有进步就stop
#     early_stopping_rounds=1000 )

model.fit(raw_x, raw_y)



# 预测
pred_prob = model.predict_proba(sub_x)


#############################################
# 转化为提交文件
# 转化为DataFrame
sub_df = pd.DataFrame(pred_prob)
sub_df.insert(0,"file_id",sub_x['file_id'].reset_index(drop=True))

# 更改列名
sub_df.columns = list(map(lambda x:f"prob{x}" if x!='file_id' else x,sub_df.columns))

# 保存在本地
path = './outdata'
sub_df = sub_df.astype('double')
sub_df['file_id'] = sub_df['file_id'].astype(int)
sub_df.to_csv(f'{path}/xgb_baseline.csv',index=None)


#############################################
# 保存模型
import joblib
# 模型存储
joblib.dump(model, './modelfile/xgb_baseline.pkl')
# 模型加载
# model = joblib.load('xgb.pkl')
print('模型保存完毕！')

Parameters: { min_child_samples } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.005, max_delta_step=0,
              max_depth=9, min_child_samples=3, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=2000, n_jobs=0,
              num_parallel_tree=1, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=0.5, scale_pos_weight=None, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

['./modelfile/xgb_baseline.pkl']

模型保存完毕！


## 使用神经网络进行建模预测

In [16]:
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import keras
from keras.callbacks import EarlyStopping,ModelCheckpoint,RemoteMonitor,CSVLogger

#############################################
# 数据格式转化

cate_train_y = keras.utils.to_categorical(train_y,8)
cate_vaild_y = keras.utils.to_categorical(vaild_y,8)





#############################################

# 搭建模型
model = keras.Sequential([
    keras.layers.Dense(200, activation='relu', input_shape=[len(train_x.columns)]),
    keras.layers.Dense(200, activation='relu'),
    keras.layers.Dense(200, activation='relu'), 
    keras.layers.Dense(8, activation='softmax') # 需要改写成sigmoid
])


#############################################
# 配置损失函数，评估指标，优化器
model.compile(loss='categorical_crossentropy', metrics=['categorical_crossentropy'], optimizer='adam')


# 配置backcall
callback = ModelCheckpoint(filepath="modelfile/MLP.ckpt",
                           monitor="val_categorical_crossentropy",
                           verbose=1,save_best_only=True,
                           save_weights_only=False)


In [17]:

#############################################
# 训练
model.fit(train_x, cate_train_y, 
          validation_data=(vaild_x, cate_vaild_y), 
          batch_size=100, epochs=5,
          callbacks=callback)


Epoch 1/5
Epoch 00001: val_categorical_crossentropy improved from inf to 1.64191, saving model to modelfile/MLP.ckpt
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: modelfile/MLP.ckpt/assets
Epoch 2/5
Epoch 00002: val_categorical_crossentropy improved from 1.64191 to 1.63694, saving model to modelfile/MLP.ckpt
INFO:tensorflow:Assets written to: modelfile/MLP.ckpt/assets
Epoch 3/5
Epoch 00003: val_categorical_crossentropy did not improve from 1.63694
Epoch 4/5
Epoch 00004: val_categorical_crossentropy did not improve from 1.63694
Epoch 5/5
Epoch 00005: val_categorical_crossentropy did not improve from 1.63694


<tensorflow.python.keras.callbacks.History at 0x7fe59acf4890>

In [18]:
#############################################
# 进行预测
pred_prob = model.predict(sub_x)

#############################################
# 转化为提交文件
# 转化为DataFrame
sub_df = pd.DataFrame(pred_prob)
sub_df.insert(0,"file_id",sub_x['file_id'].reset_index(drop=True))

# 更改列名
sub_df.columns = list(map(lambda x:f"prob{x}" if x!='file_id' else x,sub_df.columns))
sub_df = sub_df.astype('double')
sub_df['file_id'] = sub_df['file_id'].astype(int)

# 保存在本地
path = './outdata'
sub_df.to_csv(f'{path}/mlp_baseline.csv',index=None)



# 进行模型融合

## 普通的加权融合

In [19]:
# 打开之前的预测结果proba
path = './outdata'
lgb_pred = pd.read_csv(f'{path}/lgb.csv').set_index('file_id')
xgb_pred = pd.read_csv(f'{path}/xgb.csv').set_index('file_id')
mlp_pred = pd.read_csv(f'{path}/mlp.csv').set_index('file_id')

# 设置权重
lgb_weight = 0.2
xgb_weight = 0.7
mlp_weight = 0.1

# 进行加权融合
mix_pred = (lgb_weight*lgb_pred)+(xgb_weight*xgb_pred)+(mlp_weight*mlp_pred)

# 保存加权融合后的结果
mix_pred.to_csv(f'{path}/mix_baseline.csv')

In [20]:
sub

NameError: name 'sub' is not defined