In [20]:
#https://stackoverflow.com/questions/51235118/how-to-get-word-vectors-from-keras-embedding-layer

import tensorflow as tf
import keras
from keras_preprocessing import sequence
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from spam_dataset import SpamDataset
from layers import *

In [2]:
def pad_numpify_input(X, maxlen=None):
    """Pad/truncate token-id sequences to a fixed length and return them as float32.

    Parameters
    ----------
    X : iterable of sequences
        Tokenised messages (e.g. the output of ``tokenizer.texts_to_sequences``).
    maxlen : int, optional
        Target sequence length. When omitted, the notebook-level ``max_len``
        is used, which is what the original one-argument call did.

    Returns
    -------
    numpy.ndarray
        ``float32`` array with one row per input sequence. Sequences longer
        than ``maxlen`` lose their tail (``truncating='post'``); shorter ones
        are right-padded with 0 (``padding='post'``).
    """
    if maxlen is None:
        # Looked up at call time so the helper can be defined before the config cell runs.
        maxlen = max_len
    padded = sequence.pad_sequences(X, maxlen=maxlen, truncating='post', padding='post', value=0)
    # pad_sequences already yields a single 2-D array, so cast it once instead of
    # rebuilding it row by row; this also keeps the (0, maxlen) shape for empty input.
    return np.asarray(padded).astype('float32')

def one_hot_decode(Y):
    """Turn rows of class scores / one-hot vectors into "Yes"/"No" labels.

    Parameters
    ----------
    Y : array-like, 2-D
        One row per sample, one column per class (e.g. softmax output).

    Returns
    -------
    numpy.ndarray
        1-D array of strings: "Yes" where the highest-scoring column of the
        row is column 1, "No" otherwise. Ties resolve to the lowest column
        index, as ``argmax`` does.

    Raises
    ------
    IndexError
        If ``Y`` has no second axis (``numpy`` raises ``AxisError``, an
        ``IndexError`` subclass).
    """
    scores = np.asarray(Y)
    # Index of the winning class per row; comparing it with 1 is equivalent to
    # building the one-hot matrix and testing its column 1.
    winners = scores.argmax(axis=1)
    return np.where(winners == 1, "Yes", "No")

In [21]:
# --- Configuration ---
n_words = 10000  # vocabulary cap: used by the tokenizer and as the embedding input_dim
max_len = 50     # each message is padded/truncated to its first 50 tokens

embed_dim = 32
num_heads = 1

# Fixed seed so the train/test split and the training run are repeatable.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

tokenizer = keras.preprocessing.text.Tokenizer(num_words=n_words, split=' ')

# --- Load ---
texts, _y = SpamDataset.get_data()

# --- Labels: one-hot encode ---
# scikit-learn renamed `sparse` to `sparse_output` (old name later removed);
# try the new keyword first and fall back so either version works.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)
_y = np.array(_y).reshape(-1, 1)  # the encoder expects a 2-D column
y = one_hot_encoder.fit_transform(_y).astype('float32')

# --- Features: tokenize, then pad to a fixed length ---
tokenizer.fit_on_texts(texts)
X = pad_numpify_input(tokenizer.texts_to_sequences(texts))

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED)

In [22]:
# Report the shape of each split, in the order: train X, train y, test X, test y.
for split_array in (X_train, y_train, X_test, y_test):
    print(split_array.shape)

(4296, 50)
(4296, 2)
(1432, 50)
(1432, 2)


In [24]:
# Network: token ids -> embedding -> SelfAttention (project layer) -> average pool -> 2-way softmax.
l_input = layers.Input(shape=(max_len,))

# The embedding layer keeps its own name because later cells reuse it.
embedding_layer = layers.Embedding(input_dim=n_words, output_dim=embed_dim)
l_embedding = embedding_layer(l_input)

self_attention_layer = SelfAttention(embed_dim)
l_mha = self_attention_layer(l_embedding)

pooling_layer = layers.GlobalAveragePooling1D()
l_pool = pooling_layer(l_mha)

output_layer = layers.Dense(2, activation="softmax")
l_output = output_layer(l_pool)

model = keras.Model(inputs=l_input, outputs=l_output)
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

model.fit(x=X_train, y=y_train, epochs=4)

# Last expression of the cell, so the [loss, accuracy] pair is displayed.
model.evaluate(x=X_test, y=y_test)

Model: "functional_27"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 50, 32)            320000    
_________________________________________________________________
self_attention_3 (SelfAttent (None, 50, 32)            3168      
_________________________________________________________________
global_average_pooling1d_3 ( (None, 32)                0         
_________________________________________________________________
dense_19 (Dense)             (None, 2)                 66        
Total params: 323,234
Trainable params: 323,234
Non-trainable params: 0
_________________________________________________________________
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


[0.04066591337323189, 0.9839385747909546]

In [25]:
### DEVELOPMENT AREA ###

# Intermediate model reusing the layers above: token ids in, embedding-layer output out.
model_emb = keras.Model(inputs=l_input, outputs=l_embedding)

# One hand-written message, tokenised and padded with the same settings as the training data.
sample_texts = ["Hi john, this is tyler. I am emailing you to talk about the amount of ideas I have for"]
x_pred = sequence.pad_sequences(
    tokenizer.texts_to_sequences(sample_texts),
    maxlen=max_len,
    truncating='post',
    padding='post',
    value=0,
)

out_pred = model_emb.predict(x_pred)
print(out_pred.shape)

(1, 50, 32)


In [19]:
# `embedding_layer` is created as a plain `layers.Embedding`, which has no
# `token_emb` sub-layer; read the weight matrix from the layer itself.
# Its first (only) weight is the lookup table, one row per token id.
token_emb_weights = embedding_layer.get_weights()[0]

print(token_emb_weights)

[[-0.03690045  0.01559929  0.02776337 ... -0.01968681 -0.01474567
   0.0296577 ]
 [ 0.04355304  0.02628536 -0.01919303 ... -0.08933982  0.03083475
   0.10062259]
 [ 0.06479541  0.02936449 -0.05001809 ... -0.04095309  0.01373926
  -0.0189221 ]
 ...
 [ 0.0024655   0.0108035   0.03648119 ...  0.01778117 -0.0332409
   0.04605789]
 [-0.00102471 -0.01526888  0.0136312  ...  0.0252532  -0.02494303
  -0.04669187]
 [ 0.00992849 -0.01687845 -0.01907049 ...  0.02761716 -0.01135495
  -0.02291597]]
