In [48]:
import argparse
import gensim.downloader as api
import numpy as np
import os
import shutil
import tensorflow as tf
from sklearn.metrics import accuracy_score,confusion_matrix

In [49]:
#https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip
def read_local_file():
    labels,texts = [],[]
    with open('SMSSpamCollection',"r") as fin:
        for line in fin:
            label,text = line.strip().split("\t")
            labels.append(1 if label == "spam" else 0)
            texts.append(text)
    return texts,labels
# DATASET_URL = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
texts,labels = read_local_file()

In [50]:
texts[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [51]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(texts)
text_sequences = tokenizer.texts_to_sequences(texts)
text_sequences=tf.keras.preprocessing.sequence.pad_sequences(text_sequences)

In [52]:
num_records,max_seqlen=len(text_sequences),len(text_sequences[0])

In [53]:
NUM_CLASSES = 2
cat_labels = tf.keras.utils.to_categorical(labels,num_classes=NUM_CLASSES)

In [54]:
cat_labels

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [55]:
word2idx = tokenizer.word_index
idx2word = {v:k for k,v in word2idx.items()}
word2idx["PAD"] = 0
idx2word[0]="PAD"
vocab_size = len(word2idx)
vocab_size

9013

In [56]:
dataset = tf.data.Dataset.from_tensor_slices((text_sequences,cat_labels)).batch(32).prefetch(tf.data.AUTOTUNE)
dataset = dataset.shuffle(10000)
test_size = num_records//4
val_size = (num_records - test_size) // 10
test_dataset = dataset.take(test_size)
val_dataset = dataset.skip(test_size).take(val_size)
train_dataset = dataset.skip(test_size+val_size)
BATCH_SIZE = 128
test_dataset=test_dataset.batch(BATCH_SIZE,drop_remainder=True)
val_dataset = val_dataset.batch(BATCH_SIZE,drop_remainder=True)
train_dataset = train_dataset.batch(BATCH_SIZE,drop_remainder=True)

In [57]:
import gensim.downloader as api
api.info()['models'].keys()

dict_keys(['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'])

In [58]:
def build_embedding_matrix(sequences,word2idx,embedding_dim,embedding_file):
    if os.path.exists(embedding_file):
        E = np.load(embedding_file)
    else:
        vocab_size = len(word2idx)
        E = np.zeros((vocab_size,embedding_dim))
        word_vectors = api.load(EMBEDDING_MODEL)
        for word,idx in word2idx.items():
            try:
                E[idx] = word_vectors.word_vec(word)
            except KeyError:
                pass
        np.save(embedding_file,E)
    return E

EMBEDDING_DIM = 300
DATA_DIR = "data"
EMBEDDING_NUMPY_FILE = os.path.join(DATA_DIR,"E.npy")
EMBEDDING_MODEL = 'glove-wiki-gigaword-300'
E = build_embedding_matrix(text_sequences,word2idx,EMBEDDING_DIM,EMBEDDING_NUMPY_FILE)
print("Embedding matrix : ",E.shape)

Embedding matrix :  (9013, 300)


In [59]:
class SpamClassifierModel(tf.keras.Model):
    def __init__(self,vocab_sz,embed_sz,input_length,num_fiilters,kernel_sz,output_sz,run_mode,embedding_weights,**kwargs):
        super(SpamClassifierModel,self).__init__(**kwargs)
        if run_mode=="scratch":
            self.embedding=tf.keras.layers.Embedding(vocab_sz,embed_sz,input_length=input_length,trainable=True)
        elif run_mode=="vectorizer":
            self.embedding=tf.keras.layers.Embedding(vocab_sz,embed_sz,input_length=input_length,weights=[embedding_weights],trainable=False)

        else:
            self.embedding=tf.keras.layers.Embedding(vocab_sz,embed_sz,input_length=input_length,weights=[embedding_weights],trainable=True)

        self.conv = tf.keras.layers.Conv1D(filters=num_fiilters,kernel_size=kernel_sz,activation="relu")
        self.dropout = tf.keras.layers.SpatialDropout1D(0.2)
        self.pool = tf.keras.layers.GlobalMaxPooling1D()
        self.dense = tf.keras.layers.Dense(output_sz,activation="softmax")

    def call(self,x):
        x = self.embedding(x)
        x = self.conv(x)
        x = self.dropout(x)
        x = self.pool(x)
        x = self.dense(x)
        return x

conv_num_filters = 256
conv_kernal_size = 3

model = SpamClassifierModel(vocab_size,EMBEDDING_DIM,max_seqlen,conv_num_filters,conv_kernal_size,NUM_CLASSES,"finetune",E)
model.build(input_shape=(None,max_seqlen))

In [60]:
model.compile(optimizer=tf.keras.optimizers.Adam(),loss = "categorical_crossentropy",metrics=["accuracy"])

In [61]:
NUM_EPOCHS = 3
# CLASS_WEIGHTS = {0:1,1:8}
#,class_weight=CLASS_WEIGHTS
model.fit(train_dataset,epochs=NUM_EPOCHS,validation_data=val_dataset)

labels,predictions = [],[]

for Xtest , Ytest in test_dataset:
    Ytest_ = model.predict_on_batch(Xtest)
    ytest = np.argmax(Ytest,axis=1)
    ytest_ = np.argmax(Ytest_,axis=1)
    labels.extend(ytest.tolist())
    predictions.extend(ytest.tolist())

print(accuracy_score(labels,predictions))
print(confusion_matrix(labels,predictions))

Epoch 1/3


ValueError: Unexpected result of `train_function` (Empty logs). Please use `Model.compile(..., run_eagerly=True)`, or `tf.config.run_functions_eagerly(True)` for more information of where went wrong, or file a issue/bug to `tf.keras`.