In [29]:
#https://stackoverflow.com/questions/51235118/how-to-get-word-vectors-from-keras-embedding-layer

import tensorflow as tf
import keras
from keras_preprocessing import sequence
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from spam_dataset import SpamDataset
from layers import *

In [64]:
def pad_numpify_input(X):
    interm = sequence.pad_sequences(X, maxlen=max_len, truncating='post', padding='post', value=0)
    return np.array([np.array(i).astype('float32') for i in interm])

def one_hot_decode(Y):
    Y = np.eye(Y.shape[1])[Y.argmax(2)]
    return np.array(["Yes" if y[1] == 1 else "No" for y in Y])


In [53]:
n_words = 10000
max_len = 50 #only take top 50 words

embed_dim = 32
num_heads = 1

tokenizer = keras.preprocessing.text.Tokenizer(num_words=n_words, split=' ')

X, _y = SpamDataset.get_data()

# binary encode
one_hot_encoder = OneHotEncoder(sparse=False)
_y = np.array(_y).reshape(len(_y), 1)
y = one_hot_encoder.fit_transform(_y)

#tokenize
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)

#pad sequences
X = pad_numpify_input(X)
y = np.array(y).astype('float32')

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [54]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(4296, 50)
(4296, 2)
(1432, 50)
(1432, 2)


In [55]:
l_input = layers.Input(shape=(max_len,))
l_embedding = TokenAndPositionEmbedding(max_len, n_words, embed_dim)(l_input)

l_mha = SelfAttention(embed_dim)(l_embedding)
l_pool = layers.GlobalAveragePooling1D()(l_mha)
l_output = layers.Dense(2, activation="softmax")(l_pool)

model = keras.Model(inputs=l_input, outputs=l_output)
model.compile("adam", "categorical_crossentropy", metrics=["accuracy"])
model.summary()

model.fit(X_train, y_train, epochs=4)

model.evaluate(X_test, y_test)

Model: "functional_35"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
token_and_position_embedding (None, 50, 32)            321600    
_________________________________________________________________
self_attention_5 (SelfAttent (None, 50, 32)            3168      
_________________________________________________________________
global_average_pooling1d_5 ( (None, 32)                0         
_________________________________________________________________
dense_29 (Dense)             (None, 2)                 66        
Total params: 324,834
Trainable params: 324,834
Non-trainable params: 0
_________________________________________________________________
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


[0.03275293484330177, 0.9895251393318176]

In [65]:
### DEVELOPMENT AREA ###

#define intermediate model (model that does input -> embedding)
model_emb = keras.Model(inputs=l_input, outputs=l_embedding)

#test inputs
x_pred = tokenizer.texts_to_sequences(["this is a test of the email classification system. I am a sentence that is long enough to be an email."]);
x_pred = sequence.pad_sequences(x_pred, maxlen=max_len, truncating='post', padding='post', value=0)
print(
    X_train[0]
)

out_pred = model.predict(np.array([np.array(X_train[0])]))
print(
    out_pred
)
out_pred_argmax = np.argmax(out_pred)
print(
    one_hot_decode([out_pred])
)

[9.400e+03 1.600e+01 4.116e+03 1.800e+01 5.000e+00 1.610e+03 5.700e+01
 1.600e+01 4.118e+03 1.180e+03 3.000e+00 6.000e+00 5.800e+02 2.500e+01
 1.930e+02 5.439e+03 1.060e+02 2.200e+01 6.000e+00 1.180e+03 6.040e+02
 2.000e+00 9.400e+03 1.600e+01 4.116e+03 1.800e+01 5.000e+00 1.610e+03
 3.561e+03 1.230e+02 1.200e+01 1.000e+00 1.450e+02 2.570e+02 2.000e+00
 2.760e+02 4.800e+01 5.300e+01 6.420e+02 3.206e+03 4.221e+03 2.135e+03
 1.500e+01 1.200e+01 1.672e+03 2.000e+00 6.740e+02 1.600e+01 5.089e+03
 1.598e+03]
[[4.5251436e-06 9.9999547e-01]]


AttributeError: 'list' object has no attribute 'shape'

1.0


0.0


array([[4.5251436e-06, 9.9999547e-01]], dtype=float32)

array([0., 1.])