In [104]:
import tensorflow as tf
import numpy as np
import uuid
from nltk.tokenize import sent_tokenize
from gensim.models import KeyedVectors
import pickle
import itertools

batch_size = 200
original_dim = 1000
latent_dim = 100
intermediate_dim = 500
epochs = 100
epsilon_std = 1.0

def samp(z_mean,z_log_var):
    epsilon = tf.random_normal(shape=(batch_size, latent_dim), mean=0.,
                              stddev=epsilon_std)
    return z_mean + tf.exp(z_log_var / 2) * epsilon

def total_loss(x, y,z_mean,z_log_var):
    ent_loss =   tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=x, logits=y),1)
    
    kl_loss = - 0.5 * tf.reduce_sum(1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var),1)
    print(kl_loss.get_shape())
    #recon_loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=X), 1)
    # D_KL(Q(z|X) || P(z)); calculate in closed form as both dist. are Gaussian
    #kl_loss = 0.5 * tf.reduce_sum(tf.exp(z_logvar) + z_mu**2 - 1. - z_logvar, 1)
    return [ent_loss,kl_loss]

def split_into_sent (text):
    strg = ''
    for word in text:
        strg += word
        strg += ' '
    strg_cleaned = strg.lower()
    for x in ['\n','"',"!", '#','$','%','&','(',')','*','+',',','-','/',':',';','<','=','>','?','@','[','^',']','_','`','{','|','}','~','\t']:
        strg_cleaned = strg_cleaned.replace(x, '')
    sentences = sent_tokenize(strg_cleaned)
    return sentences

def vectorize_sentences(sentences):
    vectorized = []
    for sentence in sentences:
        byword = sentence.split()
        concat_vector = []
        for word in byword:
            try:
                concat_vector.append(w2v[word])
            except:
                pass
        vectorized.append(concat_vector)
    return vectorized

def batch_generator(sources,batch_size=50):
    idxs = np.random.permutation(np.arange(len(sources)))
    n_batches = len(idxs) // batch_size
    for batch_i in range(n_batches):
        this_idxs = idxs[batch_i * batch_size:(batch_i + 1) * batch_size]
        this_sources = sources[this_idxs, :]
        yield (this_sources)

def sent_parse(sentence, mat_shape):
    data_concat = []
    word_vecs = vectorize_sentences(sentence)
    print("wordvec length:",len(word_vecs))
    for x in word_vecs:
        data_concat.append(list(itertools.chain.from_iterable(x)))
    zero_matr = np.zeros(mat_shape)
    print(mat_shape)
    print(len(data_concat))
    zero_matr[0] = np.array(data_concat)
    return zero_matr


def print_sentence_with_w2v(sent_vect):
    word_sent = ''
    tocut = sent_vect
    for i in range (int(len(sent_vect)/100)):
        word_sent += w2v.most_similar(positive=[tocut[:100]], topn=1)[0][0]
        word_sent += ' '
        tocut = tocut[100:]
    print(word_sent)
    

def find_similar_encoding(sent_vect):
    all_cosine = []
    for sent in sent_encoded:
        result = 1 - spatial.distance.cosine(sent_vect, sent)
        all_cosine.append(result)
    data_array = np.array(all_cosine)
    maximum = data_array.argsort()[-3:][::-1][1]
    new_vec = sent_encoded[maximum]
    return new_vec


def interpolate_b_points(point_one, point_two, num):
    dist_vec = point_two - point_one
    sample = np.linspace(0, 1, num, endpoint = True)
    hom_sample = []
    for s in sample:
        hom_sample.append(point_one + s * dist_vec)
    return hom_sample


def sent_2_sent(sess,sent1,sent2, batch, dim,z_mean,x_decoded):
    a = sent_parse([sent1], (batch,dim))
   
    b = sent_parse([sent2], (batch,dim))
    encode_a = sess.run(z_mean,feed_dict={x:a})
    
    encode_b = sess.run(z_mean,feed_dict={x:b})
    intermediate_points = interpolate_b_points(encode_a[0], encode_b[0], 5)
    
    for point in intermediate_points:
        
        zero_matr = np.zeros((batch,latent_dim))
        zero_matr[0] = np.array(point)
        p = sess.run(x_decoded,feed_dict={z:zero_matr})[0]
        print_sentence_with_w2v(p)


    

In [105]:
w2v = KeyedVectors.load_word2vec_format('apt_vectors.vec')
data_concat = []


with open('APT_sanitized.txt',"r",encoding='utf-8') as f:
    text=f.readlines()
vect = vectorize_sentences(text)
data = [x for x in vect  if len(x) == 10]
for x in data:
    data_concat.append(list(itertools.chain.from_iterable(x)))
    

data_array = np.array(data_concat)
np.random.shuffle(data_array)

train = data_array[:60000]
test = data_array[60000:80000]


In [109]:
tf.reset_default_graph()
x = tf.placeholder(shape=[batch_size, original_dim], dtype=tf.float32)
h = tf.layers.dense(x,intermediate_dim , activation=tf.nn.relu)
z_mean=tf.layers.dense(h,latent_dim,name="encoder")
z_log_var =tf.layers.dense(h,latent_dim)
z=samp(z_mean,z_log_var)
                             
h_decoded = tf.layers.dense(z,intermediate_dim, activation=tf.nn.relu)
x_decoded = tf.layers.dense(h_decoded,original_dim, activation=tf.nn.sigmoid,name="decoder")
print(x.get_shape())
print(x_decoded.get_shape())


xent_loss,kl_loss=total_loss(x,x_decoded,z_mean,z_log_var)
cost = tf.reduce_mean(xent_loss + kl_loss)
optimizer = tf.train.AdamOptimizer(0.5).minimize(cost)

init=tf.global_variables_initializer()
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
epoch_i=0
with tf.Session() as sess:
    sess.run(init)
    try:
        while not coord.should_stop():
            for it_i, (this_sources) in enumerate(batch_generator(train,batch_size)):
                l,s,kl = sess.run([cost,xent_loss,kl_loss],feed_dict={x:this_sources})
                
                
            if epoch_i==10:
                coord.request_stop()
            print("Epoch:",epoch_i)                    
            epoch_i=epoch_i+1
            print("Loss ",l)
            #print("sigmoid",s)
            #print("kl",kl)
    except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
    finally:
            # One of the threads has issued an exception.  So let's tell all the
            # threads to shutdown.
            coord.request_stop()

        # Wait until all threads have finished.
    coord.join(threads)
    sent_2_sent(sess,'already legitimately used in an Internet Explorer plugin open source','depending on Windows version . Returns failure or success with',batch=200,dim=1000,z_mean=z_mean,x_decoded=x_decoded)

(200, 1000)
(200, 1000)
(200,)
Epoch: 0
Loss  1068.5624
Epoch: 1
Loss  1070.268
Epoch: 2
Loss  1066.4949
Epoch: 3
Loss  1064.9714
Epoch: 4
Loss  1067.8148
Epoch: 5
Loss  1058.9308
Epoch: 6
Loss  1063.102
Epoch: 7
Loss  1060.4624
Epoch: 8
Loss  1061.5067
Epoch: 9
Loss  1061.3883
Epoch: 10
Loss  1067.2474
wordvec length: 1
(200, 1000)
1
wordvec length: 1
(200, 1000)
1
utilise utilise attack3 pilfered utilise attack3 darkhotel â€œuse 6aeb71d05a2f9b7c52ec06d65d838e82 Aurora8 
utilise attack3 attack3 pilfered utilise pilfered darkhotel darkhotel 6aeb71d05a2f9b7c52ec06d65d838e82 Aurora8 
utilise attack3 attack3 pilfered utilise pilfered darkhotel darkhotel 6aeb71d05a2f9b7c52ec06d65d838e82 darkhotel 
utilise attack3 attack3 pilfered utilise pilfered darkhotel darkhotel 6aeb71d05a2f9b7c52ec06d65d838e82 darkhotel 
6aeb71d05a2f9b7c52ec06d65d838e82 attack3 attack3 darkhotel utilise pilfered darkhotel darkhotel attack3 darkhotel 
