In [89]:
#https://stackoverflow.com/questions/51235118/how-to-get-word-vectors-from-keras-embedding-layer

import tensorflow as tf
import keras
from keras_preprocessing import sequence
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, normalize

from spam_dataset import SpamDataset
from layers import *

In [2]:
def pad_numpify_input(X, maxlen=None):
    """Pad/truncate token-id sequences to a fixed length and return float32.

    Sequences shorter than ``maxlen`` are right-padded with 0; longer ones
    keep their first ``maxlen`` entries (``padding='post'``,
    ``truncating='post'``).

    Args:
        X: iterable of integer sequences (e.g. tokenizer output).
        maxlen: target length. Defaults to the notebook-level ``max_len``
            constant, which keeps existing ``pad_numpify_input(X)`` calls
            working unchanged.

    Returns:
        2-D float32 numpy array with one row per input sequence.
    """
    if maxlen is None:
        # Resolved at call time, so the config cell only has to run before
        # the first call, not before this definition.
        maxlen = max_len
    padded = sequence.pad_sequences(
        X, maxlen=maxlen, truncating='post', padding='post', value=0
    )
    # One vectorised cast instead of converting row by row in Python.
    return np.asarray(padded, dtype='float32')

def one_hot_decode(Y):
    """Convert one-hot / probability rows into "Yes" / "No" labels.

    A row decodes to "Yes" when its largest entry sits in column 1 and to
    "No" otherwise. Ties resolve to the lowest column, as ``argmax`` does.

    Args:
        Y: 2-D array-like with at least two columns, one row per sample.

    Returns:
        1-D numpy array of "Yes" / "No" strings, one per row of ``Y``.

    Raises:
        IndexError: if ``Y`` is not 2-D or has fewer than two columns.
    """
    Y = np.asarray(Y)
    if Y.ndim != 2 or Y.shape[1] < 2:
        raise IndexError(
            "one_hot_decode expects a 2-D array with at least 2 columns, "
            "got shape {}".format(Y.shape)
        )
    # argmax == 1 is exactly the condition the one-hot round trip tested,
    # without building an identity matrix or printing the intermediate.
    return np.where(Y.argmax(axis=1) == 1, "Yes", "No")

In [21]:
n_words = 10000  # passed to the Tokenizer as num_words (vocabulary cap)
max_len = 50  # sequence length: every message is padded / cut to its first 50 tokens

embed_dim = 32
num_heads = 1

SEED = 42  # fixed so the train/test split below is reproducible

tokenizer = keras.preprocessing.text.Tokenizer(num_words=n_words, split=' ')

raw_texts, _y = SpamDataset.get_data()

# One-hot encode the labels. The encoder's default (sparse) output is
# densified explicitly, which avoids the `sparse=` keyword that newer
# scikit-learn releases renamed to `sparse_output=`.
one_hot_encoder = OneHotEncoder()
_y = np.array(_y).reshape(-1, 1)
y = one_hot_encoder.fit_transform(_y).toarray().astype('float32')

# Tokenize, then pad to a fixed-length float32 matrix
tokenizer.fit_on_texts(raw_texts)
X = pad_numpify_input(tokenizer.texts_to_sequences(raw_texts))

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED)

In [22]:
# Sanity check: shapes of the train/test features and labels, one per line
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(4296, 50)
(4296, 2)
(1432, 50)
(1432, 2)


In [59]:
# Token ids -> trainable embedding vectors
l_input = layers.Input(shape=(max_len,))
embedding_layer = layers.Embedding(input_dim=n_words, output_dim=embed_dim)
l_embedding = embedding_layer(l_input)

# SelfAttention over the embedded sequence, averaged over token positions,
# then a 2-way softmax matching the one-hot labels
l_sa = SelfAttention(embed_dim)(l_embedding)
l_pool = layers.GlobalAveragePooling1D()(l_sa)
l_output = layers.Dense(units=2, activation="softmax")(l_pool)

model = keras.Model(inputs=l_input, outputs=l_output)
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

model.fit(x=X_train, y=y_train, epochs=4)

# Last expression of the cell: shows [loss, accuracy] on the held-out split
model.evaluate(x=X_test, y=y_test)

Model: "functional_39"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 50, 32)            320000    
_________________________________________________________________
self_attention_4 (SelfAttent (None, 50, 32)            3168      
_________________________________________________________________
global_average_pooling1d_4 ( (None, 32)                0         
_________________________________________________________________
dense_24 (Dense)             (None, 2)                 66        
Total params: 323,234
Trainable params: 323,234
Non-trainable params: 0
_________________________________________________________________
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


[0.04098685458302498, 0.9839385747909546]

In [62]:
### DEVELOPMENT AREA ###

# Two hand-written messages used to probe the trained model
x_pred_texts = [
    "Hi john, this is tyler. I am emailing you to talk about the amount of ideas I have for",
    "Nigerian prince, this is callum the devil calling to come and kill you (over the phone)",
]
# Reuse the training-time helper so the probe input gets the same padding,
# truncation and float32 dtype as X_train / X_test.
x_pred = pad_numpify_input(tokenizer.texts_to_sequences(x_pred_texts))

# Intermediate models sharing the trained layers:
#   model_emb : input -> embedding output
#   model_satt: input -> self-attention output
model_emb = keras.Model(inputs=l_input, outputs=l_embedding)
model_satt = keras.Model(inputs=l_input, outputs=l_sa)

x_pred_embedding = model_emb.predict(x_pred)
x_pred_satt = model_satt.predict(x_pred)



In [68]:
def reverse_token_emb(X, embedding_weights=None):
    """Map embedding vectors back to the row indices (token ids) they came from.

    Each vector in ``X`` is matched by exact value against the rows of the
    embedding matrix. The matrix is indexed once into a dict, so every lookup
    is a hash probe rather than a scan over the whole vocabulary.

    Args:
        X: array-like of shape (n_sequences, sequence_length, embed_dim)
            holding embedding vectors.
        embedding_weights: optional 2-D array (vocab_size, embed_dim) to match
            against. Defaults to the weights of the notebook-level
            ``embedding_layer``.

    Returns:
        Integer numpy array of shape (n_sequences, sequence_length) with the
        matching row index for every vector. If several rows are identical
        the lowest index is used. A vector that matches no row yields ``-1``,
        so every output row keeps the same length as its input sequence.
    """
    if embedding_weights is None:
        embedding_weights = embedding_layer.get_weights()[0]
    embedding_weights = np.asarray(embedding_weights)

    # tolist() turns every element into an exact Python float, so tuples built
    # from the weights and from X compare by value regardless of dtype.
    index_by_vector = {}
    for idx, row in enumerate(embedding_weights.tolist()):
        # setdefault keeps the first (lowest) index when rows are duplicated.
        index_by_vector.setdefault(tuple(row), idx)

    texts = [
        [index_by_vector.get(tuple(vector), -1) for vector in seq_vectors]
        for seq_vectors in np.asarray(X).tolist()
    ]
    return np.array(texts)

# print(tokenizer.sequences_to_texts(reverse_token_emb(out_pred)))

In [157]:
# Change introduced by self-attention: attention output minus the raw token
# embedding it was computed from.
att_layer_values = np.array(model_satt(X_test))
emb_layer_values = np.array(model_emb(X_test))
att_scores = att_layer_values - emb_layer_values

# Average (not sum) over the embedding axis -> one score per token position
att_scores_per_token = att_scores.mean(axis=-1)
# Normalize over rows (sklearn.preprocessing.normalize)
att_scores_per_token = normalize(att_scores_per_token)

# Token ids recovered from the embedding vectors
tokens = reverse_token_emb(emb_layer_values)

reverse_word_index = {index: word for word, index in tokenizer.word_index.items()}

# Pair every token id with its score; the last axis is (token_id, score)
token_att_scores = np.stack([tokens, att_scores_per_token], axis=-1)
print(token_att_scores.shape)

# Ids that are not in the tokenizer's word index are skipped
# (presumably this covers the padding value 0 -- TODO confirm).
word_list = [
    (reverse_word_index[token_id], score)
    for token_id, score in token_att_scores.reshape(-1, 2)
    if token_id in reverse_word_index
]

# The full ranking has one entry per non-skipped token of every test message;
# keep it in a variable and print only the head.
TOP_N = 50
word_list_sorted = sorted(word_list, key=lambda pair: pair[1], reverse=True)
print(word_list_sorted[:TOP_N])

(1432, 50, 2)
[('cheap', 0.692121684551239), ('http', 0.637756884098053), ('cum', 0.48434343934059143), ('low', 0.4415138363838196), ('low', 0.4415138363838196), ('3107', 0.43945592641830444), ('3107', 0.43945592641830444), ('http', 0.4393363893032074), ('http', 0.4393363893032074), ('your', 0.4388566017150879), ('your', 0.4343631863594055), ('software', 0.43430349230766296), ('software', 0.43430349230766296), ('a', 0.4320116341114044), ('a', 0.4320116341114044), ('not', 0.42938682436943054), ('delivered', 0.40020838379859924), ('only', 0.3856702744960785), ('get', 0.3824688196182251), ('software', 0.37961244583129883), ('most', 0.37346941232681274), ('every', 0.3722641170024872), ('nothing', 0.36764204502105713), ('cd', 0.36630088090896606), ('http', 0.36083003878593445), ('3107', 0.35858261585235596), ('mail', 0.3552185297012329), ('delivery', 0.35423436760902405), ('your', 0.3505861759185791), ('2004', 0.34810370206832886), ('a', 0.3477551341056824), ('recipients', 0.346208482980728