In [1]:
# Inject a CSS rule that sets the `.container` element to 100% width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` via `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import os
import re
import math
import time
import string
import numpy as np
import pandas as pd
import tensorflow as tf

In [3]:
# Directory locations, relative to the notebook's working directory.
# DATA_PATH is where the input dataset is read from.
DATA_PATH = "../data/"
# NOTE(review): SERVING_PATH and MODEL_PATH are not referenced in the cells
# visible here; presumably export / weight directories — confirm against usage.
SERVING_PATH = "../servings/"
MODEL_PATH = "../weights/"

In [4]:
# Load the tab-separated SMS dataset. The column names are supplied explicitly
# via `names`, so the first line of the file is read as data.
# os.path.join is used because DATA_PATH already ends with "/"; formatting it
# as f"{DATA_PATH}/..." produced a doubled separator in the path.
df = pd.read_csv(
    os.path.join(DATA_PATH, "smsspam.tsv"),
    sep="\t",
    names=["category", "statement"],
)
df.head()

Unnamed: 0,category,statement
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Encode the text labels as integers: "ham" -> 0, "spam" -> 1.
# Assigning through .loc keeps the cell safe to re-run: rows that are already
# encoded simply no longer match either label.
label_ids = {"ham": 0, "spam": 1}
for label, label_id in label_ids.items():
    df.loc[df["category"] == label, "category"] = label_id

# Give the column a real integer dtype instead of leaving Python ints inside an
# object column. This raises if any value other than the two labels remains,
# which surfaces unexpected categories here rather than during training.
df["category"] = df["category"].astype("int64")

df.head()

Unnamed: 0,category,statement
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Mean number of whitespace-separated tokens per statement, rounded up when shown.
tokens_per_statement = [len(text.split()) for text in df.statement]
avg_size = sum(tokens_per_statement) / len(tokens_per_statement)
print(f"Average statement size is {math.ceil(avg_size)}")

Average statement size is 16


In [7]:
# Inputs are the raw statement strings; targets are one-hot rows built from the
# integer-coded category column.
X = df["statement"].values
# num_classes is pinned to 2 (ham / spam) rather than inferred from the largest
# label present, so Y always has two columns to match the 2-unit softmax output
# layer, even if the loaded data happened to contain a single class.
Y = tf.keras.utils.to_categorical(df["category"].values, num_classes=2)

print(X[:3])
print(Y[:3])

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
 'Ok lar... Joking wif u oni...'
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]
[[1. 0.]
 [1. 0.]
 [0. 1.]]


In [8]:
def preprocessing(raw_text):
    """Standardize text by lower-casing it and blanking markup and punctuation.

    Steps, in order:
      1. lower-case the input with ``tf.strings.lower``;
      2. replace every literal ``<br />`` with a single space;
      3. replace each character listed in ``string.punctuation`` with a
         single space.

    Args:
        raw_text: String input accepted by ``tf.strings.lower``.

    Returns:
        The result of the final ``tf.strings.regex_replace`` call.
    """
    # Character class matching any single punctuation mark; re.escape keeps
    # characters such as "]" and "\" from being read as regex syntax.
    punctuation_class = '[%s]' % re.escape(string.punctuation)

    text = tf.strings.lower(raw_text)
    text = tf.strings.regex_replace(text, "<br />", " ")
    return tf.strings.regex_replace(text, punctuation_class, ' ')

In [9]:
# Settings consumed by the TextVectorization layer built in the next cell.
max_vocab = 5000  # passed as max_tokens (vocabulary size cap)
max_len = 16      # passed as output_sequence_length
                  # presumably chosen to match the average statement size
                  # computed earlier — confirm if the dataset changes
text_dataset = df.statement.values  # corpus handed to adapt()

In [10]:
# Text-to-integer layer: standardizes each string with `preprocessing`, splits
# on whitespace, and emits `max_len` integer token ids per input.
vectorize_layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=max_vocab,
    standardize=preprocessing,
    split="whitespace",
    output_mode="int",
    output_sequence_length=max_len,
)

# Feed the corpus to the layer so it can build its vocabulary and the
# word -> index mapping.
vectorize_layer.adapt(text_dataset)

# Fetch the learned vocabulary once and report on it.
vocabulary = vectorize_layer.get_vocabulary()

print(f"Top 10 words {vocabulary[:10]}")
print(f"Bottom 10 words {vocabulary[-10:]} \n")

# Vocabulary size
print(f"Total words are {len(vocabulary)}")

Top 10 words ['', '[UNK]', 'i', 'to', 'you', 'a', 'the', 'u', 'and', 'in']
Bottom 10 words ['sympathetic', 'syllabus', 'syd', 'swollen', 'swimsuit', 'swhrt', 'swell', 'sweatter', 'sweater', 'swear'] 

Total words are 5000


In [11]:
# Preview what the vectorizer produces by wrapping it in a minimal model that
# takes one string per sample and returns the layer's integer ids.
model_custom = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(1,), dtype=tf.string),
    vectorize_layer,
])

sample_sentences = [
    ["i have learn about the byte pair encoding"],
    ["data science is everywhere"],
]
model_custom.predict(sample_sentences)

array([[   2,   18, 1392,   84,    6,    1,    1,    1,    0,    0,    0,
           0,    0,    0,    0,    0],
       [   1, 2455,   10, 3838,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0]])

In [12]:
# Build the complete classifier in a single expression so this cell is safe to
# re-run: appending with .add() a second time would stack another
# Embedding/LSTM/Dense group on top of the existing model.
# Pipeline: raw string -> token ids -> 50-d embeddings -> LSTM(16) -> 2-way softmax.
model_custom = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(1,), dtype=tf.string),
    vectorize_layer,
    tf.keras.layers.Embedding(max_vocab, 50),
    tf.keras.layers.LSTM(16),
    tf.keras.layers.Dense(2, activation="softmax"),
])

# `metrics` is given as a list, the form documented for Model.compile.
model_custom.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [13]:
# Train on the raw strings and one-hot targets, holding out 10% for validation.
training_options = {"epochs": 10, "batch_size": 64, "validation_split": 0.1}
model_custom.fit(X, Y, **training_options)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7ff75c70a4f0>

In [14]:
# Score one unseen message; each output row holds the two class probabilities.
sample_messages = [["free entry for 2 people in next FIFA match"]]
prediction = model_custom.predict(sample_messages)
prediction

array([[0.00287533, 0.9971246 ]], dtype=float32)

Because we mapped "ham" (not spam) to 0 and "spam" to 1, the index of the largest probability in `prediction` is the predicted class. Taking the argmax of `prediction` gives 1 here, so the model classifies this message as "spam".