In [1]:
#https://stackoverflow.com/questions/51235118/how-to-get-word-vectors-from-keras-embedding-layer

import tensorflow as tf
import keras
from keras_preprocessing import sequence
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from spam_dataset import SpamDataset
from layers import *

In [2]:
def pad_numpify_input(X, maxlen=None):
    """Pad/truncate token-id sequences to a fixed length and return them as float32.

    Args:
        X: iterable of integer token-id sequences, e.g. the output of
            ``tokenizer.texts_to_sequences``.
        maxlen: target length of every sequence. When omitted, the notebook-level
            ``max_len`` is used, so existing one-argument calls behave as before.

    Returns:
        A float32 ``np.ndarray``. Sequences longer than ``maxlen`` are cut at the
        end (``truncating='post'``); shorter ones are right-padded with 0.
    """
    if maxlen is None:
        maxlen = max_len  # notebook-level default, resolved at call time
    padded = sequence.pad_sequences(X, maxlen=maxlen, truncating='post', padding='post', value=0)
    # pad_sequences already hands back one rectangular array, so a single cast
    # replaces the per-row Python loop and keeps shape/dtype even for empty input.
    return np.asarray(padded, dtype='float32')

def one_hot_decode(Y):
    """Turn per-class scores into "Yes"/"No" labels.

    Each row of ``Y`` is hardened to a one-hot vector at its arg-max column
    (ties resolve to the first maximum). The hardened matrix is printed, then
    every row is labelled "Yes" when column 1 is the hot one and "No" otherwise.

    Args:
        Y: 2-D array-like of scores, one row per sample.

    Returns:
        1-D ``np.ndarray`` of "Yes"/"No" strings, one per row of ``Y``.
    """
    scores = np.array(Y)
    winners = scores.argmax(1)
    # Row k of the identity matrix is the one-hot vector for class k.
    hardened = np.eye(scores.shape[1])[winners]
    print(hardened)

    labels = []
    for row in hardened:
        if row[1] == 1:
            labels.append("Yes")
        else:
            labels.append("No")
    return np.array(labels)

In [3]:
n_words = 10000  # vocabulary cap handed to the tokenizer (num_words)
max_len = 50  # every message is truncated/padded to 50 token ids

embed_dim = 32
num_heads = 1

split_seed = 42  # fixed so the train/test split is identical on every run

tokenizer = keras.preprocessing.text.Tokenizer(num_words=n_words, split=' ')

X, _y = SpamDataset.get_data()

# one-hot encode the labels: reshape to a single column, then fit + transform
# NOTE(review): newer scikit-learn releases spell this argument `sparse_output`;
# confirm against the installed version before upgrading.
one_hot_encoder = OneHotEncoder(sparse=False)
_y = np.array(_y).reshape(len(_y), 1)
y = one_hot_encoder.fit_transform(_y)

# tokenize: build the vocabulary from the texts, then map each text to token ids
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)

# pad sequences to max_len and convert features/labels to float32 arrays
X = pad_numpify_input(X)
y = np.array(y).astype('float32')

# Seeded split: without random_state the partition (and every metric computed
# from it) changes on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=split_seed)

In [4]:
# Sanity-check the split: one shape per line, train pair first, then test pair.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

(4296, 50)
(4296, 2)
(1432, 50)
(1432, 2)


In [5]:
# Input: one padded sequence of max_len token ids per sample.
l_input = layers.Input(shape=(max_len,))
# Token + position embedding of the ids (project layer from layers.py).
l_embedding = TokenAndPositionEmbedding(max_len, n_words, embed_dim)(l_input)

# Self-attention over the embedded sequence, averaged over the sequence axis,
# then a 2-way softmax matching the one-hot labels.
l_mha = SelfAttention(embed_dim)(l_embedding)
l_pool = layers.GlobalAveragePooling1D()(l_mha)
l_output = layers.Dense(units=2, activation="softmax")(l_pool)

model = keras.Model(inputs=l_input, outputs=l_output)
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

model.fit(x=X_train, y=y_train, epochs=4)

# Last expression of the cell: the evaluate() result on the held-out split
# is what the cell displays.
model.evaluate(x=X_test, y=y_test)

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
token_and_position_embedding (None, 50, 32)            321600    
_________________________________________________________________
self_attention (SelfAttentio (None, 50, 32)            3168      
_________________________________________________________________
global_average_pooling1d (Gl (None, 32)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 66        
Total params: 324,834
Trainable params: 324,834
Non-trainable params: 0
_________________________________________________________________
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


[0.042269740253686905, 0.9853351712226868]

In [10]:
### DEVELOPMENT AREA ###

# intermediate model (input -> embedding) for inspecting the learned embeddings
model_emb = keras.Model(inputs=l_input, outputs=l_embedding)

# hand-written test message, tokenized and padded the same way as the training data
x_pred = tokenizer.texts_to_sequences(["Hi john, this is tyler. I am emailing you to talk about the amount of ideas I have for"])
x_pred = sequence.pad_sequences(x_pred, maxlen=max_len, truncating='post', padding='post', value=0)

# Alternative: score the hand-written message. x_pred is already 2-D after
# pad_sequences, so it is passed as-is (wrapping it in np.array([...]) would add
# an extra leading dimension).
# out_pred = model.predict(x_pred)
x_sample = X_train[0:1]
out_pred = model.predict(x_sample)
print(
    one_hot_decode(out_pred)
)
# Show the text behind the predicted sample. The original line stopped at
# `tokenizer.sequences_`, which is not a Tokenizer attribute and raised
# AttributeError. Ids are cast back to plain ints for the vocabulary lookup.
print(
    tokenizer.sequences_to_texts(x_sample.astype('int32').tolist())
)

[[1. 0.]]
['No']
