# 任务目标--使用w2v提取特征

# 调包区


In [1]:
from tqdm.notebook import tqdm
import random
import pandas as pd
import numpy as np
import math
from matplotlib import pyplot as plt
import gc

from pandarallel import pandarallel
pandarallel.initialize() # 初始化该这个b...并行库

# 警告忽略
import warnings
warnings.filterwarnings("ignore")

# matplotlib字体设置
plt.rcParams["font.family"] = "Songti SC"
plt.rcParams["axes.unicode_minus"] = False

# matplotlib警告忽略
pd.plotting.register_matplotlib_converters()


# 观看Dataframe长度
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)
# 浮点数位长度
pd.set_option('display.precision',5)

# 显示多个结果
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all' # ['all', 'last', 'last_expr', 'none', 'last_expr_or_assign']

INFO: Pandarallel will run on 64 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
from sklearn.preprocessing import StandardScaler,LabelEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold,KFold
from sklearn.naive_bayes import BernoulliNB,GaussianNB,MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from lightgbm.sklearn import LGBMClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score,roc_auc_score,auc,f1_score
from sklearn.metrics import mean_squared_error,mean_absolute_error,mean_squared_log_error
from sklearn.metrics import classification_report

In [3]:
import keras
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D , GlobalAveragePooling1D,MaxPool1D
from keras import Model
from keras.models import Sequential
from keras.optimizers import Adam,SGD # 优化方法
from keras.callbacks import EarlyStopping,ModelCheckpoint,RemoteMonitor,CSVLogger
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [4]:
import networkx as nx
import node2vec
from node2vec import Node2Vec
from gensim.models import word2vec

# 打开文件

In [5]:
#path = './sampledata' # 打开样本数据
path = './fulldata' # 打开全量数据

train_data = pd.read_csv(f'{path}/security_train.csv')
test_data = pd.read_csv(f'{path}/security_test.csv')


# 特征工程

## 进行标签编码

In [6]:
# 数据集合并
data = pd.concat([train_data,test_data],axis=0)

# 进行标签编码
le = LabelEncoder()
data['api'] = le.fit_transform(data['api'])

# 数据集拆分
train_data = data[data['label'].notnull()]
test_data = data[data['label'].isnull()]

## 构建w2v特征

In [7]:
%%time
def creat_w2v_feature(tem_df,cache=False):
    if cache==True:
        path = './temdata'
        w2v_df = pd.read_pickle(f'{path}/w2v_feature.pkl')

    else:
        #################################################
        # 对api进行排序
        tem_df = tem_df.sort_values(['api','tid','index'])
        tem_df['api'] = tem_df['api'].astype(str) # 转化为str
    #     tem_df['api'] = le.inverse_transform(tem_df['api']) # 转化回文字

        # 设置进度条
        from tqdm.notebook import tqdm
        tqdm.pandas(desc="句子转化进度：")

        # 生成api句子(去重复)
        file_api_sequence = tem_df.groupby('file_id')['api'].apply(lambda x:x.drop_duplicates().tolist())

        #################################################
        # 进行w2v的训练
        model = word2vec.Word2Vec(sentences=file_api_sequence, size=5, window=3, min_count=1)

        # 将api信息转化为embedding
        tqdm.pandas(desc="embedding转化进度：")
        tem_df = tem_df['api'].drop_duplicates().reset_index(drop=True).to_frame() # 去重复
        w2v_df = tem_df['api'].progress_apply(lambda x:model.wv[x])
        w2v_df = pd.DataFrame(w2v_df.tolist())
        w2v_df.columns = [f'api_embedding_{i}' for i in w2v_df.columns]

        #################################################
        # 合并表格
        tem_df['api'] = tem_df['api'].astype(int) # 转化回INT
        w2v_df = pd.concat([tem_df,w2v_df],axis=1) 
        
        # 保存特征工程文档
        path = './temdata'
        w2v_df.to_pickle(f'{path}/w2v_feature.pkl')    
    
    return(w2v_df)



################################################################################################################
# 数据集合并
data = pd.concat([train_data,test_data],axis=0)

# 进行标准化
w2v_df = creat_w2v_feature(data)

train_data = train_data.merge(w2v_df,on='api',how='left')
test_data = test_data.merge(w2v_df,on='api',how='left')


# 回收内存
del w2v_df,data
gc.collect()

HBox(children=(FloatProgress(value=0.0, description='embedding转化进度：', max=301.0, style=ProgressStyle(descripti…




25

## 构建病毒api类型与用户当前api的匹配程度

In [8]:
# 可以优化的地方
# embedding名称获取
# 进行加权修正


# 构建病毒api类型与用户当前api的匹配程度
def create_api_virus_sim(train_data,cache=False):
    if cache==True:
        path = './temdata'
        api_virus_df = pd.read_pickle(f'{path}/create_api_virus_sim.pkl')
        
        
    else:
    ################################################################
        tem_df = train_data
        # 提取每一个病毒的embedding偏好
        virus_eb = tem_df.groupby(['label'])['api_embedding_0','api_embedding_1','api_embedding_2','api_embedding_3','api_embedding_4'].mean()
        # 更改名称
        virus_eb.columns = [f'virus_{i}' for i in virus_eb.columns]

        # 提取file所有api的embedding
        user_eb_feature = ['file_id','api','api_embedding_0',
               'api_embedding_1', 'api_embedding_2', 'api_embedding_3',
               'api_embedding_4']
        user_eb = tem_df[user_eb_feature].set_index(['file_id','api'])

        # 病毒0-7的匹配度
        api_virus_list = []
        for label in range(virus_eb.shape[0]):
            virus_embedding = virus_eb.iloc[label].to_frame().T

            # 进行余弦相似度的比较
            from sklearn.metrics.pairwise import cosine_similarity

            # 生成相似矩阵
            similary_array = cosine_similarity(user_eb,virus_embedding)

            # 生成相似DataFrame,熟悉的中文化
            columns_list = index_list = user_eb.index
            similary_df = pd.DataFrame(similary_array,index=index_list,columns=[f'api_virus_sim_eb{label}'])

            api_virus_list.append(similary_df)

        api_virus_df = pd.concat(api_virus_list,axis=1)

        # 去重，只输出api与病毒的相似度
        api_virus_df = api_virus_df.reset_index().drop('file_id',axis=1).drop_duplicates().reset_index(drop=True)
    
        # 保存特征文档
        path = './temdata'
        api_virus_df.to_pickle(f'{path}/create_api_virus_sim.pkl')
        
    ################################################################
    
    return api_virus_df

################################################################################################################

# 通过训练集构建病毒、api的相似程度
api_virus_df = create_api_virus_sim(train_data)

# 与训练集拼接
train_data = train_data.merge(api_virus_df,on='api',how='left')

# 与测试集拼接
test_data = test_data.merge(api_virus_df,on='api',how='left')


# 回收内存
del api_virus_df
gc.collect()

0

## 生成特征文档

In [10]:
# train_fe = train_data[['file_id','label']].drop_duplicates()
# test_fe = test_data[['file_id','label']].drop_duplicates()

basic_feature = ['file_id','label','api','tid','index']

train_fe = train_data[basic_feature].drop_duplicates(['file_id','label'])
test_fe = test_data[basic_feature].drop_duplicates(['file_id','label'])
train_fe

Unnamed: 0,file_id,label,api,tid,index
0,1,5.0,135,2488,0
6786,2,2.0,95,2320,0
7602,3,0.0,151,2208,0
8065,4,0.0,95,2284,0
10111,5,0.0,249,2500,0
...,...,...,...,...,...
89620181,13883,2.0,95,100,0
89798402,13884,5.0,95,2592,0
89799721,13885,0.0,151,2240,0
89800754,13886,1.0,95,2324,0


## 构建文件与病毒的匹配程度特征(mean)

In [12]:
# 构建病毒各自的类别（mean）
def create_file_virus_sim(tem_df):    
    global virus_eb,api_embedding_list    
    ################################################################
    # 一、构建病毒类型
    
    # 截取api_embedding的列表名称
    if tem_df['label'].isnull().sum()>0:
        virus_eb = virus_eb
        
    else:
        api_embedding_list = [i for i in tem_df.columns if 'api_embedding' in i]
        # 提取每一个病毒的embedding偏好
        virus_eb = tem_df.groupby(['label'])[api_embedding_list].mean()
        # 更改名称
        virus_eb.columns = [f'virus_{i}' for i in virus_eb.columns]

    
    ###############################################################
    # 二、构建file的类型

    # 构建file的api类别（mean）
    file_eb = tem_df.groupby('file_id')[api_embedding_list].mean()
    # 更改名称
    file_eb.columns = [f'file_{i}' for i in virus_eb.columns]


    ###############################################################
    # 病毒0-7的匹配度
    file_virus_list = []
    for label in range(virus_eb.shape[0]):
        virus_embedding = virus_eb.iloc[label].to_frame().T

        # 进行余弦相似度的比较
        from sklearn.metrics.pairwise import cosine_similarity

        # 生成相似矩阵
        similary_array = cosine_similarity(file_eb,virus_embedding)

        # 生成相似DataFrame,熟悉的中文化
        columns_list = index_list = file_eb.index
        similary_df = pd.DataFrame(similary_array,index=index_list,columns=[f'file_virus_mean_sim_eb{label}'])

        file_virus_list.append(similary_df)
        
    file_virus_df = pd.concat(file_virus_list,axis=1)
    return file_virus_df


def _create_file_virus_sim(train_data,test_data,train_fe,test_fe,cache=False):
    if cache==True:
        path = './temdata'
        train_fe = pd.read_pickle(f'{path}/create_file_virus_mean_sim_train.pkl')
        test_fe = pd.read_pickle(f'{path}/create_file_virus_mean_sim_test.pkl')
    
    else:
    ###############################################################   
        # 给训练集构建匹配特征
        file_virus_df = create_file_virus_sim(train_data)
        train_fe = train_fe.merge(file_virus_df,on='file_id',how='left')
        # 给测试集构建匹配特征
        file_virus_df = create_file_virus_sim(test_data)
        test_fe = test_fe.merge(file_virus_df,on='file_id',how='left')
        
        # 保存特征
        path = './temdata'
        train_fe.to_pickle(f'{path}/create_file_virus_mean_sim_train.pkl')
        test_fe.to_pickle(f'{path}/create_file_virus_mean_sim_test.pkl')
    
    ###############################################################
        
    return (train_fe,test_fe)


################################################################################################################
train_fe,test_fe = _create_file_virus_sim(train_data,test_data,train_fe,test_fe,cache=False)



## 构建文件与病毒的匹配程度特征(sum)

In [13]:
# 构建病毒各自的类别（mean）
def create_file_virus_sim(tem_df):    
    global virus_eb,api_embedding_list    
    ################################################################
    # 一、构建病毒类型
    
    # 截取api_embedding的列表名称
    if tem_df['label'].isnull().sum()>0:
        virus_eb = virus_eb
        
    else:
        api_embedding_list = [i for i in tem_df.columns if 'api_embedding' in i]
        # 提取每一个病毒的embedding api总和
        virus_eb = tem_df.groupby(['label'])[api_embedding_list].sum()
        # 更改名称
        virus_eb.columns = [f'virus_{i}' for i in virus_eb.columns]

    
    ###############################################################
    # 二、构建file的类型

    # 构建file的api 总和（sum）
    file_eb = tem_df.groupby('file_id')[api_embedding_list].sum()
    # 更改名称
    file_eb.columns = [f'file_{i}' for i in virus_eb.columns]


    ###############################################################
    # 病毒0-7的匹配度
    file_virus_list = []
    for label in range(virus_eb.shape[0]):
        virus_embedding = virus_eb.iloc[label].to_frame().T

        # 进行余弦相似度的比较
        from sklearn.metrics.pairwise import cosine_similarity

        # 生成相似矩阵
        similary_array = cosine_similarity(file_eb,virus_embedding)

        # 生成相似DataFrame,熟悉的中文化
        columns_list = index_list = file_eb.index
        similary_df = pd.DataFrame(similary_array,index=index_list,columns=[f'file_virus_mean_sim_eb{label}'])

        file_virus_list.append(similary_df)
        
    file_virus_df = pd.concat(file_virus_list,axis=1)
    return file_virus_df


def _create_file_virus_sim(train_data,test_data,train_fe,test_fe,cache=False):
    if cache==True:
        path = './temdata'
        train_fe = pd.read_pickle(f'{path}/create_file_virus_sum_sim_train.pkl')
        test_fe = pd.read_pickle(f'{path}/create_file_virus_sum_sim_test.pkl')
    
    else:
    ###############################################################   
        # 给训练集构建匹配特征
        file_virus_df = create_file_virus_sim(train_data)
        train_fe = train_fe.merge(file_virus_df,on='file_id',how='left')
        # 给测试集构建匹配特征
        file_virus_df = create_file_virus_sim(test_data)
        test_fe = test_fe.merge(file_virus_df,on='file_id',how='left')
        
        # 保存特征
        path = './temdata'
        train_fe.to_pickle(f'{path}/create_file_virus_sum_sim_train.pkl')
        test_fe.to_pickle(f'{path}/create_file_virus_sum_sim_test.pkl')
    
    ###############################################################
        
    return (train_fe,test_fe)


################################################################################################################
train_fe,test_fe = _create_file_virus_sim(train_data,test_data,train_fe,test_fe,cache=False)



## 构建统计值特征（count,nunique,min、max、sum、mean、median）

In [14]:
# 得到统计值特征（count,nunique,min、max、sum、mean、median）
def create_stats_feature(tem_df):
    # 以file_id 为单位聚合，统计每一个file_id下的情况
    group_df = tem_df.groupby('file_id')

    # 得到 count,nunique,min、max、sum、mean、median的统计值
    stats_dict = {
                'api': ['count', 'nunique', 'min', 'max', 'mean', 'median','std'],
                'tid': ['count', 'nunique', 'min', 'max', 'mean', 'median','std'],
                'index': ['count', 'nunique', 'min', 'max', 'mean', 'median','std'],
                'api_embedding_0': ['count', 'nunique', 'min', 'max', 'mean', 'median','std'],
                'api_embedding_1': ['count', 'nunique', 'min', 'max', 'mean', 'median','std'],
                'api_embedding_2': ['count', 'nunique', 'min', 'max', 'mean', 'median','std'],
                'api_embedding_3': ['count', 'nunique', 'min', 'max', 'mean', 'median','std'],
                'api_embedding_4': ['count', 'nunique', 'min', 'max', 'mean', 'median','std'],
                'api_virus_sim_eb1': ['count', 'nunique', 'min', 'max', 'mean', 'median','std'],
                'api_virus_sim_eb2': ['count', 'nunique', 'min', 'max', 'mean', 'median','std'],
                'api_virus_sim_eb3': ['count', 'nunique', 'min', 'max', 'mean', 'median','std'],
                'api_virus_sim_eb4': ['count', 'nunique', 'min', 'max', 'mean', 'median','std'],
                'api_virus_sim_eb5': ['count', 'nunique', 'min', 'max', 'mean', 'median','std'],
                'api_virus_sim_eb6': ['count', 'nunique', 'min', 'max', 'mean', 'median','std'],
                'api_virus_sim_eb7': ['count', 'nunique', 'min', 'max', 'mean', 'median','std']
    }
        
  
    stats_df = group_df.agg(stats_dict)
    stats_df.columns = ["_".join(tup) for tup in stats_df.columns]
    stats_df = stats_df.reset_index()
    return(stats_df)



# 构建训练集的特征
stats_df = create_stats_feature(train_data)
train_fe = train_fe.merge(stats_df,on='file_id',how='left')

# 构建测试集的特征
stats_df = create_stats_feature(test_data)
test_fe = test_fe.merge(stats_df,on='file_id',how='left')


# 回收内存
del stats_df
gc.collect()

0

## 进行标准化

In [15]:
# 数据集合并
data = pd.concat([train_fe,test_fe],axis=0)


# 进行标准化
de_ss_list = ['file_id','label']
ss_list = [i for i in data.columns if i not in de_ss_list]

ss = StandardScaler()
data[ss_list] = ss.fit_transform(data[ss_list])


# 数据集拆分
train_fe = data[data['label'].notnull()]
test_fe = data[data['label'].isnull()]

# 特征工程保存

In [16]:
path = "./temdata"
train_fe.to_pickle(f'{path}/train_fe_v4.pkl')
test_fe.to_pickle(f'{path}/test_fe_v4.pkl')

In [17]:
path = "./temdata"
train_fe = pd.read_pickle(f'{path}/train_fe_v4.pkl')
test_fe = pd.read_pickle(f'{path}/test_fe_v4.pkl')

# 数据切分

In [18]:
# # 进行数据切分
# file_id_list = train_fe['file_id'].unique()

# train_id,vaild_id = train_test_split(file_id_list,test_size=0.2)
# train_df = train_fe[train_fe['file_id'].isin(train_id)]
# vaild_df = train_fe[train_fe['file_id'].isin(vaild_id)]


# # 得到训练数据集与提交数据集
# train_x,train_y = train_df.drop('label',axis=1),train_df['label']
# vaild_x,vaild_y = vaild_df.drop('label',axis=1),vaild_df['label']
# sub_x,sub_y = test_fe.drop('label',axis=1),test_fe['label']


In [19]:
# 数据划分
raw_x,raw_y = train_fe.drop('label',axis=1),train_fe['label']
sub_x,sub_y = test_fe.drop('label',axis=1),test_fe['label']

# 进行数据切分
train_x,vaild_x,train_y,vaild_y = train_test_split(raw_x,raw_y,test_size=0.2)



In [20]:
# columns_list = ['api_count', 'api_nunique', 'api_mean', 'api_sum',
#        'api_max', 'api_min', 'api_median', 'tid_count', 'tid_nunique',
#        'tid_mean', 'tid_sum', 'tid_max', 'tid_min', 'tid_median',
#        'index_count', 'index_nunique', 'index_mean', 'index_sum', 'index_max',
#        'index_min', 'index_median']
# train_x = train_x[columns_list]
# vaild_x = vaild_x[columns_list]
# sub_x = sub_x[columns_list]

# 建模尝试

## 使用lgb进行建模预测

In [21]:
# 参数设置
gpu=False

#############################################

if gpu==True:
    params = {
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0}
else:
    params = {}




#############################################

model = lgb.LGBMClassifier(
            num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='multiclass',
            max_depth=-1, learning_rate=0.005, min_child_samples=3, random_state=2021,
            n_estimators=2000, subsample=1, colsample_bytree=1,**params)


#############################################
# 模型训练
# model.fit(
#     train_x, train_y,
#     eval_metric='logloss', eval_set=[(train_x, train_y), (vaild_x, vaild_y)],
#     verbose=100,
#     #早停法，如果auc在10epoch没有进步就stop
#     early_stopping_rounds=1000 )


model.fit(raw_x,raw_y)

# 预测
pred_prob = model.predict_proba(sub_x)


#############################################
# 转化为提交文件
# 转化为DataFrame
sub_df = pd.DataFrame(pred_prob)
sub_df.insert(0,"file_id",sub_x['file_id'].reset_index(drop=True))

# 更改列名
sub_df.columns = list(map(lambda x:f"prob{x}" if x!='file_id' else x,sub_df.columns))
sub_df = sub_df.astype('double')
sub_df['file_id'] = sub_df['file_id'].astype(int)

# 保存在本地
path = './outdata'
sub_df.to_csv(f'{path}/lgb_v4.csv',index=None)

#############################################
# 保存模型
import joblib
# 模型存储
joblib.dump(model, './modelfile/lgb_v4.pkl')
# 模型加载
# model = joblib.load('lgb.pkl')
print('模型保存完毕！')

LGBMClassifier(colsample_bytree=1, learning_rate=0.005, min_child_samples=3,
               n_estimators=2000, objective='multiclass', random_state=2021,
               reg_alpha=0.25, reg_lambda=0.25, subsample=1)

['./modelfile/lgb_v4.pkl']

模型保存完毕！


## 使用xgb进行建模预测

In [22]:
# 参数设置
gpu=False

#############################################

if gpu==True:
    params = {'tree_method': 'gpu_hist'}

else:
    params = {}



#############################################

model = xgb.XGBClassifier(
            max_depth=9, learning_rate=0.005, n_estimators=2000, 
            objective='multi:softprob', 
            subsample=0.8, colsample_bytree=0.8, 
            min_child_samples=3, eval_metric='mlogloss', reg_lambda=0.5,**params)


#############################################
# 模型训练
# model.fit(
#     train_x, train_y,
#     eval_metric='mlogloss', eval_set=[(train_x, train_y), (vaild_x, vaild_y)],
#     verbose=100,
#     #早停法，如果auc在10epoch没有进步就stop
#     early_stopping_rounds=1000 )


model.fit(raw_x,raw_y)

# 预测
pred_prob = model.predict_proba(sub_x)


#############################################
# 转化为提交文件
# 转化为DataFrame
sub_df = pd.DataFrame(pred_prob)
sub_df.insert(0,"file_id",sub_x['file_id'].reset_index(drop=True))

# 更改列名
sub_df.columns = list(map(lambda x:f"prob{x}" if x!='file_id' else x,sub_df.columns))

# 保存在本地
path = './outdata'
sub_df = sub_df.astype('double')
sub_df['file_id'] = sub_df['file_id'].astype(int)
sub_df.to_csv(f'{path}/xgb_v4.csv',index=None)


#############################################
# 保存模型
import joblib
# 模型存储
joblib.dump(model, './modelfile/xgb_v4.pkl')
# 模型加载
# model = joblib.load('xgb.pkl')
print('模型保存完毕！')

Parameters: { min_child_samples } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.005, max_delta_step=0,
              max_depth=9, min_child_samples=3, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=2000, n_jobs=0,
              num_parallel_tree=1, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=0.5, scale_pos_weight=None, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

['./modelfile/xgb_v4.pkl']

模型保存完毕！


## 使用神经网络进行建模预测

In [23]:
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import keras
from keras.callbacks import EarlyStopping,ModelCheckpoint,RemoteMonitor,CSVLogger

#############################################
# 数据格式转化

cate_train_y = keras.utils.to_categorical(train_y,8)
cate_vaild_y = keras.utils.to_categorical(vaild_y,8)





#############################################

# 搭建模型
model = keras.Sequential([
    keras.layers.Dense(200, activation='relu', input_shape=[len(train_x.columns)]),
    keras.layers.Dense(200, activation='relu'),
    keras.layers.Dense(200, activation='relu'), 
    keras.layers.Dense(8, activation='softmax') # 需要改写成sigmoid
])


#############################################
# 配置损失函数，评估指标，优化器
model.compile(loss='categorical_crossentropy', metrics=['categorical_crossentropy'], optimizer='adam')


# 配置backcall
callback = ModelCheckpoint(filepath="modelfile/MLP_v4.ckpt",
                           monitor="val_categorical_crossentropy",
                           verbose=1,save_best_only=True,
                           save_weights_only=False)


In [24]:

#############################################
# 训练
model.fit(train_x, cate_train_y, 
          validation_data=(vaild_x, cate_vaild_y), 
          batch_size=100, epochs=5,
          callbacks=callback)


Epoch 1/5
Epoch 00001: val_categorical_crossentropy improved from inf to 1.62144, saving model to modelfile/MLP_v4.ckpt
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: modelfile/MLP_v4.ckpt/assets
Epoch 2/5
Epoch 00002: val_categorical_crossentropy improved from 1.62144 to 1.61115, saving model to modelfile/MLP_v4.ckpt
INFO:tensorflow:Assets written to: modelfile/MLP_v4.ckpt/assets
Epoch 3/5
Epoch 00003: val_categorical_crossentropy did not improve from 1.61115
Epoch 4/5
Epoch 00004: val_categorical_crossentropy improved from 1.61115 to 1.61051, saving model to modelfile/MLP_v4.ckpt
INFO:tensorflow:Assets written to: modelfile/MLP_v4.ckpt/assets
Epoch 5/5
Epoch 00005: val_categorical_crossentropy did not improve from 1.61051


<tensorflow.python.keras.callbacks.History at 0x7fb621a69c90>

In [25]:
#############################################
# 进行预测
pred_prob = model.predict(sub_x)

#############################################
# 转化为提交文件
# 转化为DataFrame
sub_df = pd.DataFrame(pred_prob)
sub_df.insert(0,"file_id",sub_x['file_id'].reset_index(drop=True))

# 更改列名
sub_df.columns = list(map(lambda x:f"prob{x}" if x!='file_id' else x,sub_df.columns))
sub_df = sub_df.astype('double')
sub_df['file_id'] = sub_df['file_id'].astype(int)

# 保存在本地
path = './outdata'
sub_df.to_csv(f'{path}/mlp_v3.csv',index=None)



# 进行模型融合

## 普通的加权融合

In [26]:
# 打开之前的预测结果proba
path = './outdata'
lgb_pred = pd.read_csv(f'{path}/lgb_v4.csv').set_index('file_id')
xgb_pred = pd.read_csv(f'{path}/xgb_v4.csv').set_index('file_id')
mlp_pred = pd.read_csv(f'{path}/mlp_v4.csv').set_index('file_id')

# 设置权重
lgb_weight = 0.2
xgb_weight = 0.7
mlp_weight = 0.1

# 进行加权融合
mix_pred = (lgb_weight*lgb_pred)+(xgb_weight*xgb_pred)+(mlp_weight*mlp_pred)

# 保存加权融合后的结果
mix_pred.to_csv(f'{path}/mix_v2.csv')

FileNotFoundError: [Errno 2] File ./outdata/mlp_v4.csv does not exist: './outdata/mlp_v4.csv'