## 第六节：优化技巧与解决方案升级

### 工具包导入

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelBinarizer,LabelEncoder

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

### 内存工具包

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm  

class _Data_Preprocess:
    def __init__(self):
        self.int8_max = np.iinfo(np.int8).max
        self.int8_min = np.iinfo(np.int8).min

        self.int16_max = np.iinfo(np.int16).max
        self.int16_min = np.iinfo(np.int16).min

        self.int32_max = np.iinfo(np.int32).max
        self.int32_min = np.iinfo(np.int32).min

        self.int64_max = np.iinfo(np.int64).max
        self.int64_min = np.iinfo(np.int64).min

        self.float16_max = np.finfo(np.float16).max
        self.float16_min = np.finfo(np.float16).min

        self.float32_max = np.finfo(np.float32).max
        self.float32_min = np.finfo(np.float32).min

        self.float64_max = np.finfo(np.float64).max
        self.float64_min = np.finfo(np.float64).min

    def _get_type(self, min_val, max_val, types):
        if types == 'int':
            if max_val <= self.int8_max and min_val >= self.int8_min:
                return np.int8
            elif max_val <= self.int16_max <= max_val and min_val >= self.int16_min:
                return np.int16
            elif max_val <= self.int32_max and min_val >= self.int32_min:
                return np.int32
            return None

        elif types == 'float':
            if max_val <= self.float16_max and min_val >= self.float16_min:
                return np.float16
            if max_val <= self.float32_max and min_val >= self.float32_min:
                return np.float32
            if max_val <= self.float64_max and min_val >= self.float64_min:
                return np.float64
            return None

    def _memory_process(self, df):
        init_memory = df.memory_usage().sum() / 1024 ** 2 / 1024
        print('Original data occupies {} GB memory.'.format(init_memory))
        df_cols = df.columns

          
        for col in tqdm_notebook(df_cols):
            try:
                if 'float' in str(df[col].dtypes):
                    max_val = df[col].max()
                    min_val = df[col].min()
                    trans_types = self._get_type(min_val, max_val, 'float')
                    if trans_types is not None:
                        df[col] = df[col].astype(trans_types)
                elif 'int' in str(df[col].dtypes):
                    max_val = df[col].max()
                    min_val = df[col].min()
                    trans_types = self._get_type(min_val, max_val, 'int')
                    if trans_types is not None:
                        df[col] = df[col].astype(trans_types)
            except:
                print(' Can not do any process for column, {}.'.format(col)) 
        afterprocess_memory = df.memory_usage().sum() / 1024 ** 2 / 1024
        print('After processing, the data occupies {} GB memory.'.format(afterprocess_memory))
        return df

In [3]:
# Instantiate the dtype-downcasting helper. It is not called again in the
# cells visible here; apply it with `memory_process._memory_process(df)`.
memory_process = _Data_Preprocess()

### 数据读取

In [4]:
# Load the raw train / test tables from the data directory (relative to the
# notebook). read_csv is called without index_col, so both frames get the
# default positional index, which later cells rely on.
path  = '../security_data/'
train = pd.read_csv(path + 'security_train.csv')
test  = pd.read_csv(path + 'security_test.csv')

In [5]:
train.head()

Unnamed: 0,file_id,label,api,tid,index
0,1,5,LdrLoadDll,2488,0
1,1,5,LdrGetProcedureAddress,2488,1
2,1,5,LdrGetProcedureAddress,2488,2
3,1,5,LdrGetProcedureAddress,2488,3
4,1,5,LdrGetProcedureAddress,2488,4


### 数据预处理（字符串转化为数字）

In [6]:
# Vocabulary of API names, taken from the training set only (APIs that occur
# only in the test set are therefore not part of it).
unique_api = train['api'].unique()

In [7]:
unique_api.shape

(295,)

In [8]:
# Two-way lookup tables between API names and integer ids. Ids start at 1,
# leaving 0 unused by the vocabulary.
index2api = dict(enumerate(unique_api, start=1))
api2index = {api_name: api_id for api_id, api_name in index2api.items()}

In [9]:
# Encode every API call as its integer id.
train['api_idx'] = train['api'].map(api2index)
# The vocabulary is built from the training set only, so an API that appears
# solely in the test set has no entry and Series.map yields NaN for it. That
# would turn the column into float and push NaN into the padded integer
# sequences. Map such unknown APIs to 0, which api2index never assigns
# (its ids start at 1).
test['api_idx']  = test['api'].map(api2index).fillna(0).astype('int64')

In [10]:
# Collect the API-id sequence belonging to each file
def get_sequence(df,period_idx):
    """Split the ``api_idx`` column of ``df`` into one array per file.

    Args:
        df: frame with an ``api_idx`` column; rows of one file are expected
            to be stored contiguously.
        period_idx: positional row offsets at which each file starts, in
            ascending order.

    Returns:
        A list with one NumPy array per entry of ``period_idx``; segment k
        runs from ``period_idx[k]`` up to (not including) ``period_idx[k+1]``
        and the last segment runs to the end of the frame. An empty
        ``period_idx`` yields an empty list.
    """
    # Pull the column out once instead of re-slicing the frame for each file.
    api_values = df['api_idx'].values
    # Append the frame length so every segment has an explicit end offset;
    # this also makes the empty case fall out naturally (no pairs to zip).
    bounds = list(period_idx) + [len(api_values)]
    return [api_values[begin:end] for begin, end in zip(bounds[:-1], bounds[1:])]

In [11]:
# Index label of the first row of every file_id. get_sequence feeds these
# values to positional slicing (iloc), so they are only correct while the
# frames keep the default 0..n-1 index produced by read_csv.
# NOTE(review): also assumes all rows of a file are contiguous -- confirm.
train_period_idx = train.file_id.drop_duplicates(keep='first').index.values
test_period_idx  = test.file_id.drop_duplicates(keep='first').index.values

In [13]:
# One row per distinct (file_id, label) pair for train, and per file_id for test.
# NOTE(review): presumably each file_id has exactly one label, so train_df
# has one row per file -- the 'seq' assignment that follows depends on it.
train_df = train[['file_id','label']].drop_duplicates(keep='first')
test_df  = test[['file_id']].drop_duplicates(keep='first')

In [14]:
# Attach to every file its sequence of API ids. get_sequence returns one
# array per file start offset, so the list length must equal the number of
# rows in the per-file frame.
train_df['seq'] = get_sequence(train,train_period_idx)
test_df['seq']  = get_sequence(test,test_period_idx)

### TextCNN构建

In [16]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Lambda, Embedding, Dropout, Activation,GRU,Bidirectional
from keras.layers import Conv1D,Conv2D,MaxPooling2D,GlobalAveragePooling1D,GlobalMaxPooling1D, MaxPooling1D, Flatten
from keras.layers import CuDNNGRU, CuDNNLSTM, SpatialDropout1D
from keras.layers.merge import concatenate, Concatenate, Average, Dot, Maximum, Multiply, Subtract, average
from keras.models import Model
from keras.optimizers import RMSprop,Adam
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import SGD
from keras import backend as K
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation
from keras.layers import SpatialDropout1D
from keras.layers.wrappers import Bidirectional

Using TensorFlow backend.


### TextCNN的网络结构

In [17]:
def TextCNN(max_len,max_cnt,embed_size, num_filters,kernel_size,conv_action, mask_zero):
    """Build and compile a multi-kernel 1-D convolutional text classifier.

    Args:
        max_len: length of the (padded) integer input sequences.
        max_cnt: vocabulary size handed to the Embedding layer (input_dim).
        embed_size: dimensionality of the learned embeddings.
        num_filters: number of filters in each convolution branch.
        kernel_size: iterable of kernel widths; one Conv1D branch per width.
        conv_action: activation used by the convolution layers.
        mask_zero: forwarded to the Embedding layer's ``mask_zero`` argument.

    Returns:
        A compiled Keras ``Model`` with an 8-way softmax output, trained with
        categorical cross-entropy and the Adam optimizer.
    """
    token_ids = Input(shape=(max_len,), dtype='int32')
    embedded = Embedding(max_cnt, embed_size, input_length=max_len, mask_zero=mask_zero)(token_ids)
    embedded = SpatialDropout1D(0.15)(embedded)

    # One branch per kernel width: convolve, then keep the strongest
    # response of every filter over the whole sequence.
    pooled_branches = []
    for width in kernel_size:
        branch = Conv1D(filters=num_filters, kernel_size=width, activation=conv_action)(embedded)
        branch = GlobalMaxPooling1D()(branch)
        pooled_branches.append(branch)

    # Classification head on top of the concatenated branch features.
    hidden = concatenate(pooled_branches)
    hidden = Dropout(0.5)(hidden)
    hidden = Dense(256, activation='relu')(hidden)
    hidden = Dropout(0.25)(hidden)
    preds = Dense(8, activation = 'softmax')(hidden)

    model = Model(inputs=token_ids, outputs=preds)
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

In [18]:
# One-hot encode the labels: one column per distinct label value present in
# train_df. The model's output layer has 8 units, so 8 distinct labels are
# expected here.
train_labels = pd.get_dummies(train_df.label).values
# Bring every sequence to a fixed length of 6000 (must equal `max_len` in the
# hyper-parameter cell). Shorter sequences are zero-padded, longer ones cut.
# NOTE(review): with pad_sequences defaults both happen at the front -- confirm.
train_seq    = pad_sequences(train_df.seq.values, maxlen = 6000)
test_seq     = pad_sequences(test_df.seq.values, maxlen = 6000)

### 模型训练&预测

In [19]:
from sklearn.model_selection import StratifiedKFold,KFold 
# 5-fold shuffled cross-validation. A fixed random_state makes the fold
# assignment -- and therefore the out-of-fold predictions -- reproducible
# across kernel restarts. Despite its name, `skf` is a plain KFold: the
# training loop calls `skf.split(train_labels)` without a separate `y`,
# which a stratified splitter would require.
skf = KFold(n_splits=5, shuffle=True, random_state=42)

In [20]:
# Sequence length fed to the network; must equal the `maxlen` used when the
# sequences were padded.
max_len     = 6000
# Embedding vocabulary size (input_dim). API ids run from 1 to
# len(api2index) and 0 is used for padding, so the layer needs
# len(api2index) + 1 rows. The previous hard-coded 295 left the highest id
# out of range.
max_cnt     = len(api2index) + 1
embed_size  = 256
num_filters = 64
# One convolution branch per kernel width.
kernel_size = [2,4,6,8,10,12,14]
conv_action = 'relu'
mask_zero   = False
# When False the training loop skips fitting and only loads saved weights.
TRAIN       = True

In [21]:
import os
# Restrict the process to GPUs 0 and 1.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
# ModelCheckpoint writes into ./NN; create it up front so the first save
# does not fail on a fresh checkout.
os.makedirs('./NN', exist_ok=True)

# Out-of-fold predictions for the training set and fold-averaged predictions
# for the test set (8 class probabilities each).
meta_train = np.zeros(shape = (len(train_seq),8))
meta_test = np.zeros(shape = (len(test_seq),8))
FLAG = True
# Take the divisor from the splitter instead of hard-coding 5.
n_folds = skf.get_n_splits()
i = 0
for tr_ind,te_ind in skf.split(train_labels):
    i +=1
    # The original format string had no placeholder, so the fold number was
    # never printed.
    print('FOLD: {}'.format(i))
    print(len(te_ind),len(tr_ind)) 
    model_name = 'benchmark_textcnn_fold_'+str(i)
    X_train,X_train_label = train_seq[tr_ind],train_labels[tr_ind]
    X_val,X_val_label     = train_seq[te_ind],train_labels[te_ind]
    
    # A fresh model per fold so no weights leak between folds.
    model = TextCNN(max_len,max_cnt,embed_size,num_filters,kernel_size,conv_action,mask_zero)
    
    model_save_path = './NN/%s_%s.hdf5'%(model_name,embed_size)
    early_stopping =EarlyStopping(monitor='val_loss', patience=3)
    model_checkpoint = ModelCheckpoint(model_save_path, save_best_only=True, save_weights_only=True)
    if TRAIN and FLAG:
        model.fit(X_train,X_train_label,validation_data=(X_val,X_val_label),epochs=100,batch_size=64,shuffle=True,callbacks=[early_stopping,model_checkpoint] )
    
    # Reload the checkpointed (best) weights rather than the last epoch's.
    model.load_weights(model_save_path)
    pred_val = model.predict(X_val,batch_size=128,verbose=1)
    pred_test = model.predict(test_seq,batch_size=128,verbose=1)
    
    meta_train[te_ind] = pred_val
    meta_test += pred_test
    # Release the graph / session before building the next fold's model.
    K.clear_session()
# Average the per-fold test predictions.
meta_test /= n_folds

FOLD: 
2778 11109
Train on 11109 samples, validate on 2778 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
FOLD: 
2778 11109
Train on 11109 samples, validate on 2778 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
FOLD: 
2777 11110
Train on 11110 samples, validate on 2777 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
FOLD: 
2777 11110
Train on 11110 samples, validate on 2777 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
FOLD: 
2777 11110
Train on 11110 samples, validate on 2777 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epo

### 五折结果提交

In [22]:
# Write the fold-averaged class probabilities as the submission file.
prob_cols = ['prob{}'.format(class_id) for class_id in range(8)]

# Create the probability columns first, then fill them in one assignment.
for prob_col in prob_cols:
    test_df[prob_col] = 0

test_df[prob_cols] = meta_test
submission_cols = ['file_id'] + prob_cols
test_df[submission_cols].to_csv('nn_baseline_5fold.csv',index = None)