# 任务目标--TFIDF+完整二阶统计

# 调包区


In [1]:
from tqdm.notebook import tqdm
import random
import pandas as pd
import numpy as np
import math
from matplotlib import pyplot as plt
import gc

# 警告忽略
import warnings
warnings.filterwarnings("ignore")

# matplotlib字体设置
plt.rcParams["font.family"] = "Songti SC"
plt.rcParams["axes.unicode_minus"] = False

# matplotlib警告忽略
pd.plotting.register_matplotlib_converters()


# 观看Dataframe长度
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)
# 浮点数位长度
pd.set_option('display.precision',5)

# 显示多个结果
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all' # ['all', 'last', 'last_expr', 'none', 'last_expr_or_assign']

In [2]:
from sklearn.preprocessing import StandardScaler,LabelEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold,KFold
from sklearn.naive_bayes import BernoulliNB,GaussianNB,MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from lightgbm.sklearn import LGBMClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score,roc_auc_score,auc,f1_score
from sklearn.metrics import mean_squared_error,mean_absolute_error,mean_squared_log_error
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
import keras
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D , GlobalAveragePooling1D,MaxPool1D
from keras import Model
from keras.models import Sequential
from keras.optimizers import Adam,SGD # 优化方法
from keras.callbacks import EarlyStopping,ModelCheckpoint,RemoteMonitor,CSVLogger
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [4]:
import networkx as nx
import node2vec
from node2vec import Node2Vec
from gensim.models import word2vec

# 打开文件

## 进行标签编码

In [5]:
#path = './sampledata' # 打开样本数据
path = './fulldata' # 打开全量数据

train_data = pd.read_csv(f'{path}/security_train.csv')
test_data = pd.read_csv(f'{path}/security_test.csv')

test_data['label']=np.nan

verson='v2.0.2'

# 特征工程（全量级特征）

# 特征工程（聚合级特征）

## 构建特征表

In [6]:
basic_feature = ['file_id','label','api','tid','index']

train_fe = train_data[basic_feature].drop_duplicates(['file_id','label'])
test_fe = test_data[basic_feature].drop_duplicates(['file_id','label'])

## 构建统计值特征（count,nunique,min、max、sum、mean、median）

In [7]:
# 得到统计值特征（count,nunique,min、max、sum、mean、median）
def creat_stats_feature(tem_df):
    # 以file_id 为单位聚合，统计每一个file_id下的情况
    group_df = tem_df.groupby('file_id')

    # 得到 count,nunique,min、max、sum、mean、median的统计值
    stats_dict = {
                'api': ['count', 'nunique'],
                'tid': ['count', 'nunique', 'min', 'max', 'mean', 'median','std'],
                'index': ['count', 'nunique', 'min', 'max', 'mean', 'median','std'],
    }
        

    stats_df = group_df.agg(stats_dict)
    stats_df.columns = ["_".join(tup) for tup in stats_df.columns]
    stats_df = stats_df.reset_index()
    return(stats_df)



# 构建训练集的特征
stats_df = creat_stats_feature(train_data)
train_fe = train_fe.merge(stats_df,on='file_id',how='left')

# 构建测试集的特征
stats_df = creat_stats_feature(test_data)
test_fe = test_fe.merge(stats_df,on='file_id',how='left')


# 回收内存
del stats_df
gc.collect()

3

## 构建一阶统计值特征（count,nunique,min、max、sum、mean、median）

In [8]:
# 得到统计值特征（count,nunique,min、max、sum、mean、median）
def creat_stats_feature(tem_df):
    # 以file_id 为单位聚合，统计每一个file_id下的情况
    group_df = tem_df.groupby('file_id')

    # 得到 count,nunique,min、max、sum、mean、median的统计值
    stats_dict = {
        "api":['count', 'nunique'],
        "tid":['count', 'nunique', 'min', 'max', 'mean', 'median','std'], 
        "index":['count', 'nunique', 'min', 'max', 'mean', 'median','std'],}

    stats_df = group_df.agg(stats_dict)
    stats_df.columns = ["file_"+"_".join(tup) for tup in stats_df.columns]
    stats_df = stats_df.reset_index()
    return(stats_df)



# 构建训练集的特征
stats_df = creat_stats_feature(train_data)
train_fe = train_fe.merge(stats_df,on='file_id',how='left')

# 构建测试集的特征
stats_df = creat_stats_feature(test_data)
test_fe = test_fe.merge(stats_df,on='file_id',how='left')


# 回收内存
del stats_df
gc.collect()


0

## 构建二阶段特征 -- 聚合file_id，api，统计tid，index

In [9]:
def get_pivot_count_feature(tem_df):
    stats_parmas = {'tid':['count','nunique'],
                    'index':['count','nunique','max','min','median','std'],
                   }
    tem_df = tem_df.groupby(['file_id','api']).agg(stats_parmas)
    tem_df.columns = [f'file_api_{i}_{b}' for i,b in tem_df.columns]

    df_list = []
    for columns in tem_df.columns:
        pivot_df = pd.pivot_table(data=tem_df,index='file_id',columns='api',values=columns,fill_value=0)
        pivot_df.columns = [f'{columns}_{feature}' for feature in pivot_df.columns]
        df_list.append(pivot_df)

    pivot_df = pd.concat(df_list,axis=1)

    return pivot_df



# 构建训练集的特征
pivot_df = get_pivot_count_feature(train_data)
train_fe = train_fe.merge(pivot_df,on='file_id',how='left')

# 构建测试集的特征
pivot_df = get_pivot_count_feature(test_data)
test_fe = test_fe.merge(pivot_df,on='file_id',how='left')


# 回收内存
del pivot_df

## 构建TFIDF特征

In [10]:
############################################################################################
# 构建api句子

def get_apis(df):
    # 以file_id进行聚合
    group_df = df.groupby('file_id')
    
    # 统计每一个file_id下的api的情况
    file_api = {}
    for file_id,file_group in group_df:
        # 以tid、index从小达到的排序，生成每一个file_id下的api句子
        result = file_group.sort_values(['tid','index'],ascending=True)
        api_sequence = ' '.join(result['api'])
        file_api[file_id] = api_sequence
    return(file_api)
        

## 构建TFIDF特征
def get_tfidf(tem_df):
    tfidf = TfidfVectorizer(ngram_range=(1,3),min_df=0.1,max_df=0.8)
    api_feature = tfidf.fit_transform(tem_df['apis'])

    # 转化为DataFrame
    df_apis = pd.DataFrame(api_feature.toarray(),columns=tfidf.get_feature_names()).reset_index(drop=True)
    return(df_apis)

############################################################################################

#### 构建api句子
def creat_tfidf_feature(train_data,test_data,train_fe,test_fe,cache=False):
    # 判断是否需要重新构建特征
    if cache==True:
        path = './temdata'
        df_fe = pd.read_pickle(f'{path}/tfidf_feature.pkl')
        
    
    else:
    ##########################    
        # 数据格式转化
        train_data['api'] = train_data['api'].astype(str)
        test_data['api'] = test_data['api'].astype(str)
    
        # 构建训练集
        train_api = get_apis(df = train_data)
        temp = pd.DataFrame.from_dict(train_api,orient='index',columns=['apis'])
        temp = temp.reset_index().rename(columns={'index':'file_id'})
        train_fe = train_fe.merge(temp,on='file_id',how='left')

        # 构建测试集
        test_api = get_apis(df = test_data)
        temp = pd.DataFrame.from_dict(test_api,orient='index',columns=['apis'])
        temp = temp.reset_index().rename(columns={'index':'file_id'})
        test_fe = test_fe.merge(temp,on='file_id',how='left')

        #### 构建tfdif特征
        df_fe = pd.concat([train_fe,test_fe],axis=0).reset_index(drop=True)

        # 得到tfidf特征
        tfidf_feature = get_tfidf(df_fe)


        # 合并tfidf特征
        df_fe = pd.concat([df_fe,tfidf_feature],axis=1).drop(['apis'],axis=1)
        
        # 保存tfidf特征
        path = './temdata'
        df_fe.to_pickle(f'{path}/tfidf_feature.pkl')
        
    
    ##########################


    # 划分训练集
    train_fe = df_fe[df_fe['label'].notnull()]

    # 划分测试集
    test_fe = df_fe[df_fe['label'].isnull()]


    return(train_fe,test_fe)


############################################################################################
train_fe,test_fe = creat_tfidf_feature(train_data,test_data,train_fe,test_fe,cache=False)


## 补全columns

In [11]:
fe_data = pd.concat([train_fe,test_fe],axis=0)
train_fe = fe_data[fe_data['label'].notnull()].fillna(0)
test_fe = fe_data[fe_data['label'].isnull()].fillna(0)                                      
test_fe['label']=np.nan

del fe_data
gc.collect()

train_fe.shape
test_fe.shape

0

(13887, 3937)

(12955, 3937)

## 删除无用特征

In [12]:
del_list = ['api']
train_fe = train_fe.drop(del_list,axis=1)
test_fe = test_fe.drop(del_list,axis=1)

# 特征工程保存

In [13]:
path = "./temdata"
train_fe.to_pickle(f'{path}/train_fe_{verson}.pkl')
test_fe.to_pickle(f'{path}/test_fe_{verson}.pkl')

In [14]:
path = "./temdata"
train_fe = pd.read_pickle(f'{path}/train_fe_{verson}.pkl')
test_fe = pd.read_pickle(f'{path}/test_fe_{verson}.pkl')

# 数据切分

In [15]:
# 数据划分
raw_x,raw_y = train_fe.drop('label',axis=1),train_fe['label']
sub_x,sub_y = test_fe.drop('label',axis=1),test_fe['label']

# 进行数据切分
train_x,vaild_x,train_y,vaild_y = train_test_split(raw_x,raw_y,test_size=0.2)

# 建模尝试

## 使用lgb进行建模预测

In [16]:
# 参数设置
gpu=False

#############################################

if gpu==True:
    params = {
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0}
else:
    params = {}




#############################################

model = lgb.LGBMClassifier(
            num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='multiclass',
            max_depth=-1, learning_rate=0.005, min_child_samples=3, random_state=2021,
            n_estimators=2000, subsample=1, colsample_bytree=1,**params)


#############################################
# 模型训练
# model.fit(
#     train_x, train_y,
#     eval_metric='logloss', eval_set=[(train_x, train_y), (vaild_x, vaild_y)],
#     verbose=100,
#     #早停法，如果auc在10epoch没有进步就stop
#     early_stopping_rounds=1000 )

model.fit(raw_x,raw_y)


# 预测
pred_prob = model.predict_proba(sub_x)


#############################################
# 转化为提交文件
# 转化为DataFrame
sub_df = pd.DataFrame(pred_prob)
sub_df.insert(0,"file_id",sub_x['file_id'].reset_index(drop=True))

# 更改列名
sub_df.columns = list(map(lambda x:f"prob{x}" if x!='file_id' else x,sub_df.columns))
sub_df = sub_df.astype('double')
sub_df['file_id'] = sub_df['file_id'].astype(int)

# 保存在本地
path = './outdata'
sub_df.to_csv(f'{path}/lgb_{verson}.csv',index=None)


#############################################
# 观察特征重要性
feature_list = train_x.columns
value = model.feature_importances_.flatten()
print(pd.Series(value,index=feature_list).sort_values(ascending=False).to_string())


#############################################
# 保存模型
import joblib
# 模型存储
joblib.dump(model, f'./modelfile/lgb_{verson}.pkl')
# 模型加载
# model = joblib.load('lgb.pkl')
print('模型保存完毕！')

LGBMClassifier(colsample_bytree=1, learning_rate=0.005, min_child_samples=3,
               n_estimators=2000, objective='multiclass', random_state=2021,
               reg_alpha=0.25, reg_lambda=0.25, subsample=1)

file_id                                                                             18454
tid                                                                                 10034
tid_median                                                                           9890
tid_mean                                                                             8807
tid_max                                                                              8393
tid_std                                                                              8235
ldrgetprocedureaddress ldrgetdllhandle ntclose                                       3569
file_api_index_min_NtClose                                                           3552
api_count                                                                            2848
file_api_index_min_LdrLoadDll                                                        2323
file_api_tid_count_LdrGetProcedureAddress                                            2115
setunhandl

['./modelfile/lgb_v2.0.2.pkl']

模型保存完毕！


## 使用xgb进行建模预测

In [17]:
# 参数设置
gpu=False

#############################################

if gpu==True:
    params = {'tree_method': 'gpu_hist'}

else:
    params = {}



#############################################

model = xgb.XGBClassifier(
            max_depth=9, learning_rate=0.005, n_estimators=2000, 
            objective='multi:softprob', 
            subsample=0.8, colsample_bytree=0.8, 
            min_child_samples=3, eval_metric='mlogloss', reg_lambda=0.5,**params)


#############################################
# 模型训练
# model.fit(
#     train_x, train_y,
#     eval_metric='mlogloss', eval_set=[(train_x, train_y), (vaild_x, vaild_y)],
#     verbose=100,
#     #早停法，如果auc在10epoch没有进步就stop
#     early_stopping_rounds=1000 )


model.fit(raw_x,raw_y)

# 预测
pred_prob = model.predict_proba(sub_x)


#############################################
# 转化为提交文件
# 转化为DataFrame
sub_df = pd.DataFrame(pred_prob)
sub_df.insert(0,"file_id",sub_x['file_id'].reset_index(drop=True))

# 更改列名
sub_df.columns = list(map(lambda x:f"prob{x}" if x!='file_id' else x,sub_df.columns))

# 保存在本地
path = './outdata'
sub_df = sub_df.astype('double')
sub_df['file_id'] = sub_df['file_id'].astype(int)
sub_df.to_csv(f'{path}/xgb_{verson}.csv',index=None)


#############################################
# 观察特征重要性
feature_list = train_x.columns
value = model.feature_importances_.flatten()
print(pd.Series(value,index=feature_list).sort_values(ascending=False).to_string())


#############################################
# 保存模型
import joblib
# 模型存储
joblib.dump(model, f'./modelfile/xgb_{verson}.pkl')
# 模型加载
# model = joblib.load('xgb.pkl')
print('模型保存完毕！')

Parameters: { min_child_samples } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.005, max_delta_step=0,
              max_depth=9, min_child_samples=3, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=2000, n_jobs=0,
              num_parallel_tree=1, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=0.5, scale_pos_weight=None, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

ntclose ntclose setstdhandle                                                        2.53757e-02
sendnotifymessagew couninitialize                                                   1.94172e-02
ntqueryattributesfile ldrunloaddll                                                  1.73412e-02
file_api_index_max_CreateRemoteThread                                               1.64163e-02
file_api_tid_nunique_NtTerminateProcess                                             1.41093e-02
ntwritefile ntclose createprocessinternalw                                          1.25786e-02
file_api_index_median_CreateRemoteThread                                            1.02414e-02
thread32next ntclose __exception__                                                  9.85945e-03
file_api_index_min_LdrGetProcedureAddress                                           9.33077e-03
copyfilea openscmanagera                                                            7.95936e-03
ntunmapviewofsection ntclose ntunmapview

['./modelfile/xgb_v2.0.2.pkl']

模型保存完毕！


# 进行模型融合

## 普通的加权融合

In [18]:
# 打开之前的预测结果proba
path = './outdata'
lgb_pred = pd.read_csv(f'{path}/lgb_{verson}.csv').set_index('file_id')
xgb_pred = pd.read_csv(f'{path}/xgb_{verson}.csv').set_index('file_id')

# 设置权重
lgb_weight = 0.4
xgb_weight = 0.6
mlp_weight = 0.1

# 进行加权融合
mix_pred = (lgb_weight*lgb_pred)+(xgb_weight*xgb_pred)

# 保存加权融合后的结果
mix_pred.to_csv(f'{path}/mix_{verson}.csv')