In [8]:
import keras
import keras.backend as K

import tensorflow as tf
import keras.backend.tensorflow_backend as KTF

import math, os
import pandas as pd
import pickle
import numpy as np
#from data.vocab import TextEncoder
#from transformer.embedding import Embedding
from keras.layers import *
#from transformer.layers import MultiHeadAttention, Gelu, LayerNormalization

# https://github.com/Separius/BERT-keras/blob/master/transformer/

In [12]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras_preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from keras.models import Sequential, Model
from keras import optimizers
from keras import regularizers
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
from keras.utils import to_categorical
import time
from scipy import interp
from sklearn import metrics
from keras.models import load_model
import csv
from sklearn.model_selection import StratifiedKFold

In [2]:
#指定第一块GPU可用 
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
config = tf.ConfigProto() 
#不全部占满显存, 按需分配
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

KTF.set_session(sess)

In [3]:
def shape_list(x):
    if K.backend() != 'theano':
        tmp = K.int_shape(x)
    else:
        tmp = x.shape
    tmp = list(tmp)
    tmp[0] = -1
    return tmp


def split_heads(x, n: int, k: bool = False):  # B, L, C
    x_shape = shape_list(x)
    m = x_shape[-1]
    new_x_shape = x_shape[:-1] + [n, m // n]
    new_x = K.reshape(x, new_x_shape)
    return K.permute_dimensions(new_x, [0, 2, 3, 1] if k else [0, 2, 1, 3])


def merge_heads(x):
    new_x = K.permute_dimensions(x, [0, 2, 1, 3])
    x_shape = shape_list(new_x)
    new_x_shape = x_shape[:-2] + [np.prod(x_shape[-2:])]
    return K.reshape(new_x, new_x_shape)


# q and v are B, H, L, C//H ; k is B, H, C//H, L ; mask is B, 1, L, L
def scaled_dot_product_attention_tf(q, k, v, attn_mask, attention_dropout: float, neg_inf: float):
    w = K.batch_dot(q, k)  # w is B, H, L, L
    w = w / K.sqrt(K.cast(shape_list(v)[-1], K.floatx()))
    if attn_mask is not None:
        w = attn_mask * w + (1.0 - attn_mask) * neg_inf
    w = K.softmax(w)
    w = Dropout(attention_dropout)(w)
    return K.batch_dot(w, v)  # it is B, H, L, C//H [like v]


def scaled_dot_product_attention_th(q, k, v, attn_mask, attention_dropout: float, neg_inf: float):
    w = theano_matmul(q, k)
    w = w / K.sqrt(K.cast(shape_list(v)[-1], K.floatx()))
    if attn_mask is not None:
        attn_mask = K.repeat_elements(attn_mask, shape_list(v)[1], 1)
        w = attn_mask * w + (1.0 - attn_mask) * neg_inf
    w = K.T.exp(w - w.max()) / K.T.exp(w - w.max()).sum(axis=-1, keepdims=True)
    w = Dropout(attention_dropout)(w)
    return theano_matmul(w, v)


def multihead_attention(x, attn_mask, n_head: int, n_state: int, attention_dropout: float, neg_inf: float):
    _q, _k, _v = x[:, :, :n_state], x[:, :, n_state:2 * n_state], x[:, :, -n_state:]
    q = split_heads(_q, n_head)  # B, H, L, C//H
    k = split_heads(_k, n_head, k=True)  # B, H, C//H, L
    v = split_heads(_v, n_head)  # B, H, L, C//H
    if K.backend() == 'tensorflow':
        a = scaled_dot_product_attention_tf(q, k, v, attn_mask, attention_dropout, neg_inf)
    else:
        a = scaled_dot_product_attention_th(q, k, v, attn_mask, attention_dropout, neg_inf)
    return merge_heads(a)


def gelu(x):
    return 0.5 * x * (1 + K.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * K.pow(x, 3))))


# https://stackoverflow.com/a/42194662/2796084
def theano_matmul(a, b, _left=False):
    assert a.ndim == b.ndim
    ndim = a.ndim
    assert ndim >= 2
    if _left:
        b, a = a, b
    if ndim == 2:
        return K.T.dot(a, b)
    else:
        # If a is broadcastable but b is not.
        if a.broadcastable[0] and not b.broadcastable[0]:
            # Scan b, but hold a steady.
            # Because b will be passed in as a, we need to left multiply to maintain
            #  matrix orientation.
            output, _ = K.theano.scan(theano_matmul, sequences=[b], non_sequences=[a[0], 1])
        # If b is broadcastable but a is not.
        elif b.broadcastable[0] and not a.broadcastable[0]:
            # Scan a, but hold b steady.
            output, _ = K.theano.scan(theano_matmul, sequences=[a], non_sequences=[b[0]])
        # If neither dimension is broadcastable or they both are.
        else:
            # Scan through the sequences, assuming the shape for this dimension is equal.
            output, _ = K.theano.scan(theano_matmul, sequences=[a, b])
        return output

In [4]:
class TextEncoder:
    PAD_OFFSET = 0
    MSK_OFFSET = 1
    BOS_OFFSET = 2
    DEL_OFFSET = 3  # delimiter
    EOS_OFFSET = 4
    SPECIAL_COUNT = 5
    NUM_SEGMENTS = 2
    BERT_UNUSED_COUNT = 99  # bert pretrained models
    BERT_SPECIAL_COUNT = 4  # they don't have DEL

    def __init__(self, vocab_size: int):
        # NOTE you MUST always put unk at 0, then regular vocab, then special tokens, and then pos
        self.vocab_size = vocab_size
        self.unk_id = 0
        self.pad_id = vocab_size + self.PAD_OFFSET
        self.msk_id = vocab_size + self.MSK_OFFSET
        self.bos_id = vocab_size + self.BOS_OFFSET
        self.del_id = vocab_size + self.DEL_OFFSET
        self.eos_id = vocab_size + self.EOS_OFFSET

    def __len__(self) -> int:
        return self.vocab_size

    def encode(self, sent: str) -> List[int]:
        raise NotImplementedError()


class SentencePieceTextEncoder(TextEncoder):
    def __init__(self, text_corpus_address: Optional[str], model_name: str = 'spm',
                 vocab_size: int = 30000, spm_model_type: str = 'unigram') -> None:
        super().__init__(vocab_size)
        if not os.path.exists('{}.model'.format(model_name)):
            if spm_model_type.lower() not in ('unigram', 'bpe', 'char', 'word'):
                raise ValueError(
                    '{} is not a valid model_type for sentence piece, '
                    'valid options are: unigram, bpe, char, word'.format(spm_model_type))
            spm.SentencePieceTrainer.Train(
                '--input={input} --model_prefix={model_name} --vocab_size={vocab_size} '
                '--character_coverage={coverage} --model_type={model_type} '
                '--pad_id=-1 --unk_id=0 --bos_id=-1 --eos_id=-1 --input_sentence_size=100000000 '.format(
                    input=text_corpus_address, model_name=model_name, vocab_size=vocab_size, coverage=1,
                    model_type=spm_model_type.lower()))
        self.sp = spm.SentencePieceProcessor()
        self.sp.load('{}.model'.format(model_name))

    def encode(self, sent: str) -> List[int]:
        return self.sp.encode_as_ids(sent)


class OpenAITextEncoder(TextEncoder):
    def __init__(self, encoder_path: str = './openai/model/encoder_bpe_40000.json',
                 bpe_path: str = './openai/model/vocab_40000.bpe') -> None:
        self.encoder = _OpenAITextEncoder(encoder_path, bpe_path)
        super().__init__(len(self.encoder.encoder))

    def encode(self, sent: str) -> List[int]:
        return self.encoder.encode([sent], verbose=False)[0]


class BERTTextEncoder(TextEncoder):
    def __init__(self, vocab_file: str, do_lower_case: bool = True) -> None:
        self.tokenizer = FullTokenizer(vocab_file, do_lower_case)
        super().__init__(len(self.tokenizer.vocab))
        self.bert_unk_id = self.tokenizer.vocab['[UNK]']
        self.bert_msk_id = self.tokenizer.vocab['[MASK]']

    def standardize_ids(self, ids: List[int]) -> List[int]:
        for i in range(len(ids)):
            if ids[i] == self.bert_unk_id:  # UNK
                ids[i] = 0
            else:  # VOCAB
                ids[i] -= self.bert_msk_id
        return ids

    def encode(self, sent: str) -> List[int]:
        return self.standardize_ids(self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(sent)))

NameError: name 'List' is not defined

In [None]:
def _get_pos_encoding_matrix(max_len: int, d_emb: int) -> np.array:
    pos_enc = np.array(
        [[pos / np.power(10000, 2 * (j // 2) / d_emb) for j in range(d_emb)] if pos != 0 else np.zeros(d_emb) for pos in
         range(max_len)], dtype=np.float32)
    pos_enc[1:, 0::2] = np.sin(pos_enc[1:, 0::2])  # dim 2i
    pos_enc[1:, 1::2] = np.cos(pos_enc[1:, 1::2])  # dim 2i+1
    return pos_enc


# NOTE that for vocab_size you should also add special_count
class Embedding(keras.layers.Layer):
    def __init__(self, output_dim: int = 768, dropout: float = 0.1, vocab_size: int = 30000 + TextEncoder.SPECIAL_COUNT,
                 max_len: int = 512, trainable_pos_embedding: bool = True, use_one_dropout: bool = False,
                 use_embedding_layer_norm: bool = False, layer_norm_epsilon: float = 1e-5, **kwargs):
        super().__init__(**kwargs)
        self.max_len = max_len
        self.use_one_dropout = use_one_dropout
        self.output_dim = output_dim
        self.dropout = dropout
        self.vocab_size = vocab_size
        self.trainable_pos_embedding = trainable_pos_embedding

        self.segment_emb = keras.layers.Embedding(TextEncoder.NUM_SEGMENTS, output_dim, input_length=max_len,
                                                  name='SegmentEmbedding')
        if not trainable_pos_embedding:
            self.pos_emb = keras.layers.Embedding(max_len, output_dim, trainable=False, input_length=max_len,
                                                  name='PositionEmbedding',
                                                  weights=[_get_pos_encoding_matrix(max_len, output_dim)])
        else:
            self.pos_emb = keras.layers.Embedding(max_len, output_dim, input_length=max_len, name='PositionEmbedding')
        self.token_emb = keras.layers.Embedding(vocab_size, output_dim, input_length=max_len, name='TokenEmbedding')
        self.embedding_dropout = keras.layers.Dropout(dropout, name='EmbeddingDropOut')
        self.add_embeddings = keras.layers.Add(name='AddEmbeddings')
        self.use_embedding_layer_norm = use_embedding_layer_norm
        if self.use_embedding_layer_norm:
            self.embedding_layer_norm = LayerNormalization(layer_norm_epsilon)
        else:
            self.embedding_layer_norm = None
        self.layer_norm_epsilon = layer_norm_epsilon

    def compute_output_shape(self, input_shape):
        return input_shape[0][0], input_shape[0][1], self.output_dim

    def get_config(self):
        config = {
            'max_len': self.max_len,
            'use_one_dropout': self.use_one_dropout,
            'output_dim': self.output_dim,
            'dropout': self.dropout,
            'vocab_size': self.vocab_size,
            'trainable_pos_embedding': self.trainable_pos_embedding,
            'embedding_layer_norm': self.use_embedding_layer_norm,
            'layer_norm_epsilon': self.layer_norm_epsilon
        }
        base_config = super().get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def __call__(self, inputs, **kwargs):
        tokens, segment_ids, pos_ids = inputs
        segment_embedding = self.segment_emb(segment_ids)
        pos_embedding = self.pos_emb(pos_ids)
        token_embedding = self.token_emb(tokens)
        if self.use_one_dropout:
            summation = self.add_embeddings([segment_embedding, pos_embedding, token_embedding])
            if self.embedding_layer_norm:
                summation = self.embedding_layer_norm(summation)
            return self.embedding_dropout(summation)
        summation = self.add_embeddings(
            [self.embedding_dropout(segment_embedding), self.embedding_dropout(pos_embedding),
             self.embedding_dropout(token_embedding)])
        if self.embedding_layer_norm:
            summation = self.embedding_layer_norm(summation)
        return summation

In [5]:
class MultiHeadAttention(Layer):
    def __init__(self, n_head: int, n_state: int, attention_dropout: float, use_attn_mask: bool, neg_inf: float,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        self.n_head = n_head
        self.n_state = n_state
        self.attention_dropout = attention_dropout
        self.use_attn_mask = use_attn_mask
        self.neg_inf = neg_inf

    def compute_output_shape(self, input_shape):
        x = input_shape[0] if self.use_attn_mask else input_shape
        return x[0], x[1], x[2] // 3

    def call(self, inputs, **kwargs):
        x = inputs[0] if self.use_attn_mask else inputs
        attn_mask = inputs[1] if self.use_attn_mask else None
        return multihead_attention(x, attn_mask, self.n_head, self.n_state, self.attention_dropout, self.neg_inf)

    def get_config(self):
        config = {
            'n_head': self.n_head,
            'n_state': self.n_state,
            'attention_dropout': self.attention_dropout,
            'use_attn_mask': self.use_attn_mask,
            'neg_inf': self.neg_inf,
        }
        base_config = super().get_config()
        return dict(list(base_config.items()) + list(config.items()))


class LayerNormalization(Layer):
    def __init__(self, eps: float = 1e-5, **kwargs) -> None:
        self.eps = eps
        super().__init__(**kwargs)

    def build(self, input_shape):
        self.gamma = self.add_weight(name='gamma', shape=input_shape[-1:], initializer=Ones(), trainable=True)
        self.beta = self.add_weight(name='beta', shape=input_shape[-1:], initializer=Zeros(), trainable=True)
        super().build(input_shape)

    def call(self, x, **kwargs):
        u = K.mean(x, axis=-1, keepdims=True)
        s = K.mean(K.square(x - u), axis=-1, keepdims=True)
        z = (x - u) / K.sqrt(s + self.eps)
        return self.gamma * z + self.beta

    def compute_output_shape(self, input_shape):
        return input_shape

    def get_config(self):
        config = {
            'eps': self.eps,
        }
        base_config = super().get_config()
        return dict(list(base_config.items()) + list(config.items()))


class Gelu(Layer):
    def __init__(self, accurate: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.accurate = accurate

    def call(self, inputs, **kwargs):
        if not self.accurate:
            return gelu(inputs)
        if K.backend() == 'tensorflow':
            erf = K.tf.erf
        else:
            erf = K.T.erf
        return inputs * 0.5 * (1.0 + erf(inputs / math.sqrt(2.0)))

    def compute_output_shape(self, input_shape):
        return input_shape

    def get_config(self):
        config = {
            'accurate': self.accurate,
        }
        base_config = super().get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [6]:
class MultiHeadSelfAttention:
    def __init__(self, n_state: int, n_head: int, attention_dropout: float,
                 use_attn_mask: bool, layer_id: int, neg_inf: float) -> None:
        assert n_state % n_head == 0
        self.c_attn = Conv1D(3 * n_state, 1, name='layer_{}/c_attn'.format(layer_id))
        self.attn = MultiHeadAttention(n_head, n_state, attention_dropout, use_attn_mask,
                                       neg_inf, name='layer_{}/self_attention'.format(layer_id))
        self.c_attn_proj = Conv1D(n_state, 1, name='layer_{}/c_attn_proj'.format(layer_id))

    def __call__(self, x, mask):
        output = self.c_attn(x)
        output = self.attn(output) if mask is None else self.attn([output, mask])
        return self.c_attn_proj(output)


class PositionWiseFF:
    def __init__(self, n_state: int, d_hid: int, layer_id: int, accurate_gelu: bool) -> None:
        self.c_fc = Conv1D(d_hid, 1, name='layer_{}/c_fc'.format(layer_id))
        self.activation = Gelu(accurate=accurate_gelu, name='layer_{}/gelu'.format(layer_id))
        self.c_ffn_proj = Conv1D(n_state, 1, name='layer_{}/c_ffn_proj'.format(layer_id))

    def __call__(self, x):
        output = self.activation(self.c_fc(x))
        return self.c_ffn_proj(output)


class EncoderLayer:
    def __init__(self, n_state: int, n_head: int, d_hid: int, residual_dropout: float, attention_dropout: float,
                 use_attn_mask: bool, layer_id: int, neg_inf: float, ln_epsilon: float, accurate_gelu: bool) -> None:
        self.attention = MultiHeadSelfAttention(n_state, n_head, attention_dropout, use_attn_mask, layer_id, neg_inf)
        self.drop1 = Dropout(residual_dropout, name='layer_{}/ln_1_drop'.format(layer_id))
        self.add1 = Add(name='layer_{}/ln_1_add'.format(layer_id))
        self.ln1 = LayerNormalization(ln_epsilon, name='layer_{}/ln_1'.format(layer_id))
        self.ffn = PositionWiseFF(n_state, d_hid, layer_id, accurate_gelu)
        self.drop2 = Dropout(residual_dropout, name='layer_{}/ln_2_drop'.format(layer_id))
        self.add2 = Add(name='layer_{}/ln_2_add'.format(layer_id))
        self.ln2 = LayerNormalization(ln_epsilon, name='layer_{}/ln_2'.format(layer_id))

    def __call__(self, x, mask):
        a = self.attention(x, mask)
        n = self.ln1(self.add1([x, self.drop1(a)]))
        f = self.ffn(n)
        return self.ln2(self.add2([n, self.drop2(f)]))


def create_transformer(embedding_dim: int = 768, embedding_dropout: float = 0.1, vocab_size: int = 30000,
                       max_len: int = 512, trainable_pos_embedding: bool = True, num_heads: int = 12,
                       num_layers: int = 12, attention_dropout: float = 0.1, use_one_embedding_dropout: bool = False,
                       d_hid: int = 768 * 4, residual_dropout: float = 0.1, use_attn_mask: bool = True,
                       embedding_layer_norm: bool = False, neg_inf: float = -1e9, layer_norm_epsilon: float = 1e-5,
                       accurate_gelu: bool = False) -> keras.Model:
    vocab_size += TextEncoder.SPECIAL_COUNT
    tokens = Input(batch_shape=(None, max_len), name='token_input', dtype='int32')
    segment_ids = Input(batch_shape=(None, max_len), name='segment_input', dtype='int32')
    pos_ids = Input(batch_shape=(None, max_len), name='position_input', dtype='int32')
    attn_mask = Input(batch_shape=(None, 1, max_len, max_len), name='attention_mask_input',
                      dtype=K.floatx()) if use_attn_mask else None
    inputs = [tokens, segment_ids, pos_ids]
    embedding_layer = Embedding(embedding_dim, embedding_dropout, vocab_size, max_len, trainable_pos_embedding,
                                use_one_embedding_dropout, embedding_layer_norm, layer_norm_epsilon)
    x = embedding_layer(inputs)
    for i in range(num_layers):
        x = EncoderLayer(embedding_dim, num_heads, d_hid, residual_dropout,
                         attention_dropout, use_attn_mask, i, neg_inf, layer_norm_epsilon, accurate_gelu)(x, attn_mask)
    if use_attn_mask:
        inputs.append(attn_mask)
    return keras.Model(inputs=inputs, outputs=[x], name='Transformer')

In [19]:
from typing import List, Optional

class TextEncoder:
    PAD_OFFSET = 0
    MSK_OFFSET = 1
    BOS_OFFSET = 2
    DEL_OFFSET = 3  # delimiter
    EOS_OFFSET = 4
    SPECIAL_COUNT = 5
    NUM_SEGMENTS = 2
    BERT_UNUSED_COUNT = 99  # bert pretrained models
    BERT_SPECIAL_COUNT = 4  # they don't have DEL

    def __init__(self, vocab_size: int):
        # NOTE you MUST always put unk at 0, then regular vocab, then special tokens, and then pos
        self.vocab_size = vocab_size
        self.unk_id = 0
        self.pad_id = vocab_size + self.PAD_OFFSET
        self.msk_id = vocab_size + self.MSK_OFFSET
        self.bos_id = vocab_size + self.BOS_OFFSET
        self.del_id = vocab_size + self.DEL_OFFSET
        self.eos_id = vocab_size + self.EOS_OFFSET

    def __len__(self) -> int:
        return self.vocab_size

    def encode(self, sent: str) -> List[int]:
        raise NotImplementedError()


In [14]:
import time

Fname = 'malware_'
Time = Fname + str(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()))
tensorboard = TensorBoard(log_dir='./Logs/' + Time, histogram_freq=0, write_graph=True, write_images=False,
                          embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None)

with open("security_test.csv.pkl", "rb") as f:
    file_names = pickle.load(f)
    outfiles = pickle.load(f)
with open("security_train.csv.pkl", "rb") as f:
    labels_d = pickle.load(f)
with open("security_train.csv.pkl", "rb") as f:
    labels = pickle.load(f)
    files = pickle.load(f)
maxlen = 10000



labels = np.asarray(labels)

labels = to_categorical(labels, num_classes=8)
tokenizer = Tokenizer(num_words=None,
                      filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
                      split=' ',
                      char_level=False,
                      oov_token=None)
tokenizer.fit_on_texts(files)
tokenizer.fit_on_texts(outfiles)

# with open("wordsdic.pkl", 'wb') as f:
#     pickle.dump(tokenizer, f)

vocab = tokenizer.word_index
print(tokenizer.word_index)

x_train_word_ids = tokenizer.texts_to_sequences(files)
x_out_word_ids = tokenizer.texts_to_sequences(outfiles)

x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=maxlen)

x_out_padded_seqs = pad_sequences(x_out_word_ids, maxlen=maxlen)

max_features = 10000
meta_train = np.zeros(shape=(len(x_train_padded_seqs), 8))
meta_test = np.zeros(shape=(len(x_out_padded_seqs), 8))


{'ldrgetprocedureaddress': 1, 'ntclose': 2, 'regqueryvalueexw': 3, 'thread32next': 4, 'ntdelayexecution': 5, 'regopenkeyexw': 6, 'getsystemmetrics': 7, 'regclosekey': 8, 'getcursorpos': 9, 'ntreadfile': 10, 'ntallocatevirtualmemory': 11, 'getkeystate': 12, 'ntqueryvaluekey': 13, 'process32nextw': 14, 'ntwritefile': 15, 'ldrloaddll': 16, 'getforegroundwindow': 17, 'ntquerydirectoryfile': 18, 'getsystemtimeasfiletime': 19, 'ldrgetdllhandle': 20, 'ntquerykey': 21, 'cryptdecodeobjectex': 22, 'ntopenkey': 23, 'findfirstfileexw': 24, 'ntcreatefile': 25, 'loadstringw': 26, 'ntopenkeyex': 27, 'ntprotectvirtualmemory': 28, 'ntfreevirtualmemory': 29, 'loadresource': 30, 'regqueryvalueexa': 31, 'setfilepointer': 32, 'timegettime': 33, 'readprocessmemory': 34, 'regopenkeyexa': 35, 'findresourceexw': 36, 'getfileattributesw': 37, 'seterrormode': 38, 'ntmapviewofsection': 39, 'getfiletype': 40, 'loadstringa': 41, 'ntcreatesection': 42, 'ntunmapviewofsection': 43, 'regenumkeyexw': 44, 'ntopenfile': 4

In [31]:
len(x_train_word_ids[8])

18028