In [1]:
from keras import objectives, backend as K
from keras.layers import Bidirectional, Dense, Embedding, Input, Lambda, LSTM, RepeatVector, TimeDistributed
from keras.models import Model
import keras
from keras.layers import Input, Dense, Lambda, Layer
import numpy as np
import uuid
from nltk.tokenize import sent_tokenize
import pickle
import itertools
import mycallback

Using TensorFlow backend.


In [2]:


class VAE(object):
    
    def _build_decoder(self, encoded, vocab_size, max_length):
        repeated_context = RepeatVector(max_length)(encoded)

        #h = LSTM(1000, return_sequences=True, name='dec_lstm_1')(repeated_context)
        h = LSTM(1000, return_sequences=True, name='dec_lstm_2')(repeated_context)

        decoded = TimeDistributed(Dense(vocab_size, activation='softmax'), name='decoded_mean')(h)

        return decoded
    def _build_encoder(self, x, latent_rep_size=100, max_length=300, epsilon_std=0.01):
        #h = Bidirectional(LSTM(1000, return_sequences=True, name='lstm_1'), merge_mode='concat')(x)
        h = Bidirectional(LSTM(1000, return_sequences=False, name='lstm_2'), merge_mode='concat')(x)
        h = Dense(435, activation='tanh', name='dense_1')(h)

        def sampling(args):
            z_mean_, z_log_var_ = args
            batch_size = K.shape(z_mean_)[0]
            epsilon = K.random_normal(shape=(batch_size, latent_rep_size), mean=0., stddev=epsilon_std)
            return z_mean_ + K.exp(z_log_var_ / 2) * epsilon

        z_mean = Dense(latent_rep_size, name='z_mean', activation='linear')(h)
        z_log_var = Dense(latent_rep_size, name='z_log_var', activation='linear')(h)

        def vae_loss(x, x_decoded_mean):
            x = K.flatten(x)
            x_decoded_mean = K.flatten(x_decoded_mean)
            xent_loss = max_length * objectives.binary_crossentropy(x, x_decoded_mean)
            kl_loss = - 0.5 * K.mean(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
            return xent_loss + kl_loss

        return (vae_loss, Lambda(sampling, output_shape=(latent_rep_size,), name='lambda')([z_mean, z_log_var]))


    def create(self, vocab_size=1000, max_length=50, latent_rep_size=100):
        self.encoder = None
        self.decoder = None
        self.autoencoder = None

        x = Input(shape=(max_length,))
        x_embed = Embedding(vocab_size, 100, input_length=max_length)(x)

        vae_loss, encoded = self._build_encoder(x_embed, latent_rep_size=latent_rep_size, max_length=max_length)
        self.encoder = Model(inputs=x, outputs=encoded)

        encoded_input = Input(shape=(latent_rep_size,))


        decoded = self._build_decoder(encoded_input, vocab_size, max_length)
        self.decoder = Model(encoded_input, decoded)

        self.autoencoder = Model(inputs=x, outputs=[self._build_decoder(encoded, vocab_size, max_length)])
        adam=keras.optimizers.Adam(lr=.001)
        self.autoencoder.compile(optimizer=adam,
                                 loss=[vae_loss],
                                 metrics=['accuracy'])
        


        

In [3]:
from keras.callbacks import ModelCheckpoint
from keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

NUM_WORDS=500
MAX_LENGTH=15
VALIDATION_SPLIT =.3
_EOS = "endofsent"

def sent_parse(sentences,tokenizer=None,build_indices=True):
    if build_indices:
        tokenizer = Tokenizer(nb_words=NUM_WORDS)
        tokenizer.fit_on_texts(sentences)
        sequences = tokenizer.texts_to_sequences(sentences)
        word_index = tokenizer.word_index
        print('Found %s unique tokens.' % len(word_index))
        data = pad_sequences(sequences, maxlen=MAX_LENGTH)
        indices = np.arange(data.shape[0])
        np.random.shuffle(indices)
        data = data[indices]
    else:
        sequences = tokenizer.texts_to_sequences(sentences)
        data = pad_sequences(sequences, maxlen=MAX_LENGTH)
        indices = np.arange(data.shape[0])
        np.random.shuffle(indices)
        data = data[indices]
    return tokenizer,data


    

def find_similar_encoding(sent_vect):
    all_cosine = []
    for sent in sent_encoded:
        result = 1 - spatial.distance.cosine(sent_vect, sent)
        all_cosine.append(result)
    data_array = np.array(all_cosine)
    maximum = data_array.argsort()[-3:][::-1][1]
    new_vec = sent_encoded[maximum]
    return new_vec


def interpolate_b_points(point_one, point_two, num,useSperical=True):
    dist_vec = point_two - point_one
    sample = np.linspace(0, 1, num, endpoint = False)
    hom_sample = []
    for s in sample:
        if useSperical:
            hom_sample.append(slerp(s,point_one,point_two))
        else:
            hom_sample.append(point_one + s * dist_vec)
    return hom_sample


def sent_2_sent(sent1,sent2, model,tokenizer=None):
    _,a = sent_parse([sent1],tokenizer,build_indices=False)
    _,b = sent_parse([sent2],tokenizer,build_indices=False)
    encode_a = model.encoder.predict(a)
    encode_b = model.encoder.predict(b)
    test_hom = interpolate_b_points(encode_a, encode_b, 5,False)
    index_word = {v: k for k, v in tokenizer.word_index.items()}

    for point in test_hom:
        words=[]
        deco=model.decoder.predict(point)
        #print(deco)
        for seq in deco[0]:
            words.append(index_word[np.argmax(seq)])
            words.append(' ')
        print(''.join(words))
        
       
def slerp(val, low, high):
    """Spherical interpolation. val has a range of 0 to 1."""
    if val <= 0:
        return low
    elif val >= 1:
        return high
    omega = np.arccos(np.dot(low/np.linalg.norm(low), high/np.linalg.norm(high)))
    so = np.sin(omega)
    return np.sin((1.0-val)*omega) / so * low + np.sin(val*omega)/so * high
    


In [4]:
import nltk
from nltk.corpus import brown
def split_into_sent (text):
    strg = ''
    for word in text:
        strg += word
        strg += ' '
    strg_cleaned = strg.lower()
    for x in ['\n','"',"!", '#','$','%','&','(',')','*','+',',','-','/',':',';','<','=','>','?','@','[','^',']','_','`','{','|','}','~','\t']:
        strg_cleaned = strg_cleaned.replace(x, '')
    sentences = sent_tokenize(strg_cleaned)
    return sentences

fiction_text=brown.words(categories=['fiction','humor', 'learned', 'lore', 'mystery', 'news'])
print(len(fiction_text))
sents=sent_tokenize(' '.join(fiction_text))[1:10000]




540093


In [5]:
###### APT Text#############

import nltk
import codecs
with codecs.open('/home/vmangipudi/APT_reports_combined.txt',"r",encoding="utf-8") as f:
    fl=f.readlines()


from nltk.tokenize import sent_tokenize
text=''.join(fl).replace('\n',' ')
sents = sent_tokenize(text)



tokenizer,data=sent_parse(sents)

nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

X_train = data[:-nb_validation_samples]
X_test = data[-nb_validation_samples:]





print("Training data")
print(X_train.shape)

print("Number of words:")
print(len(np.unique(np.hstack(X_train))))



temp = np.zeros((X_train.shape[0], MAX_LENGTH, NUM_WORDS))
temp[np.expand_dims(np.arange(X_train.shape[0]), axis=0).reshape(X_train.shape[0], 1), np.repeat(np.array([np.arange(MAX_LENGTH)]), X_train.shape[0], axis=0), X_train] = 1

X_train_one_hot = temp

temp = np.zeros((X_test.shape[0], MAX_LENGTH, NUM_WORDS))
temp[np.expand_dims(np.arange(X_test.shape[0]), axis=0).reshape(X_test.shape[0], 1), np.repeat(np.array([np.arange(MAX_LENGTH)]), X_test.shape[0], axis=0), X_test] = 1

x_test_one_hot = temp


    

def create_model_checkpoint(dir, model_name):
    filepath = dir + '/' + \
                model_name
               #model_name + "-{epoch:02d}-{val_decoded_mean_acc:.2f}-{val_pred_loss:.2f}.h5"
    directory = os.path.dirname(filepath)
    
    try:
        os.stat(directory)
    except:
        os.mkdir(directory)

    checkpointer = ModelCheckpoint(filepath=filepath,
                                   verbose=1,
                                   save_best_only=False)

    return checkpointer



Found 121030 unique tokens.
Training data
(63904, 15)
Number of words:
500


In [6]:
from keras.callbacks import TensorBoard
from time import time
def train():
    model = VAE()
    model.create(vocab_size=NUM_WORDS, max_length=MAX_LENGTH)
    
    tensorboard = TensorBoard(log_dir="logs/{}".format(time()))
    checkpointer = create_model_checkpoint('models', 'rnn_ae')
    cb=mycallback.validate_after_epoch(model,tokenizer,sents)
    
    
    model.autoencoder.fit(x=X_train, y={'decoded_mean': X_train_one_hot},
                          batch_size=500, epochs=150, callbacks=[checkpointer,tensorboard,cb],
                          validation_data=(X_test, {'decoded_mean': x_test_one_hot}),shuffle=True)
    return model

In [7]:
model=train()

Train on 63904 samples, validate on 27387 samples
Epoch 1/150
function function function function function function function function function function function function function function function 
function function function function function function function function function function function function function function function 
function function function function function function function function function function function function function function function 
function function function function function function function function function function function function function function function 
function function function function function function function function function function function function function function function 
Epoch 2/150
function function function function function function function function function function function function function function function 
function function function function function function function function function function function function fu

may may so so so so so so so so so so so so so 
service service service service service service so so so so so so so so so 
back back back back service service service service service service service service service service service 
back back back back back back back back back back back back back back back 
back back back back back back back back back back sample sample sample sample sample 
Epoch 11/150
include actor actor actor actor actor actor actor actor actor actor actor actor actor actor 
back details details details details actor actor actor actor actor actor actor actor actor actor 
back back back back back back back back back actor actor actor actor actor actor 
back back back back back back back dll dll dll dll dll dll dll dll 
back back dll dll dll dll dll dll dll dll dll dll dll dll dll 
Epoch 12/150
module module module being being module module module module out out out out out out 
module module module module module module out out out out out out out out out 
05 05 05 0

KeyError: 0

In [22]:
sent_2_sent(sents[210],sents[215],model,tokenizer=tokenizer)

ValueError: shapes (1,100) and (1,100) not aligned: 100 (dim 1) != 1 (dim 0)

In [11]:
sents[152]

u'It was found to be backdoored by Gh0st RAT  and exfiltrated data to an IP in China.'

In [7]:
keras.models.load_model('/home/vmangipudi/SentenceVAE/models/rnn_ae')

ValueError: Unknown loss function:vae_loss

In [24]:
deco=model.decoder.predict(z)

In [27]:
np.argmax(deco[0][1])

414