In [1]:
# Widen the notebook container so wide outputs use the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import os
import re
import math
import time
import string
import numpy as np
import pandas as pd
import tensorflow as tf

In [3]:
# Base directories used by the notebook. Every value ends with "/", so file
# names are appended directly (e.g. f"{DATA_PATH}glove.6B.50d.txt").
DATA_PATH = "../data/"  # input datasets and the GloVe vector file
MODEL_PATH = "../weights/"  # not referenced in the visible cells; presumably for saved weights
SERVING_PATH = "../servings/"  # not referenced in the visible cells; presumably for exported serving models

In [4]:
# Load the SMS spam dataset: a tab-separated file without a header row, read
# into two columns named "category" (the label) and "statement" (the message).
# DATA_PATH already ends with "/", so the file name is appended directly;
# adding another "/" would produce "../data//smsspam.tsv".
df = pd.read_csv(f"{DATA_PATH}smsspam.tsv", sep="\t", names=["category", "statement"])
df.head()

Unnamed: 0,category,statement
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Encode the text labels as integers: "ham" (not spam) -> 0, "spam" -> 1.
# Values that are already encoded pass through the mapping unchanged, so the
# cell can be re-run safely. The final cast gives the column a real integer
# dtype (instead of `object`) and raises if an unexpected label is present.
label_to_id = {"ham": 0, "spam": 1}
df["category"] = df["category"].map(lambda label: label_to_id.get(label, label)).astype("int64")

df.head()

Unnamed: 0,category,statement
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Mean number of whitespace-separated tokens per message (rounded up when printed)
token_counts = [len(message.split()) for message in df.statement]
avg_size = sum(token_counts) / len(df.statement)
print(f"Average statement size is {math.ceil(avg_size)}")

Average statement size is 16


In [7]:
# Model inputs: the raw message strings
X = df["statement"].values
# Model targets: the integer labels expanded to one-hot rows
# (label 0 / ham -> [1, 0], label 1 / spam -> [0, 1])
Y = tf.keras.utils.to_categorical(df["category"].values)

# Peek at the first three inputs and their encoded targets
for preview in (X, Y):
    print(preview[:3])

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
 'Ok lar... Joking wif u oni...'
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]
[[1. 0.]
 [1. 0.]
 [0. 1.]]


In [8]:
def preprocessing(raw_text):
    """Standardize raw text before it is tokenized.

    Lower-cases the input, replaces every literal "<br />" with a space and
    then replaces each character of ``string.punctuation`` with a space.

    Args:
        raw_text: Text to clean, in a form accepted by the ``tf.strings`` ops
            (presumably a string tensor supplied by the vectorization layer).

    Returns:
        The result of the final ``tf.strings.regex_replace`` call.
    """
    # Character class matching any single ASCII punctuation character
    punctuation_class = '[%s]' % re.escape(string.punctuation)

    cleaned = tf.strings.lower(raw_text)
    cleaned = tf.strings.regex_replace(cleaned, "<br />", " ")
    cleaned = tf.strings.regex_replace(cleaned, punctuation_class, ' ')
    return cleaned

In [9]:
# TextVectorization hyperparameters
max_vocab = 400002  # 400,000 GloVe words + 2 reserved entries ('' for padding and '[UNK]')
max_len = 16  # token ids produced per message; equals the average statement size printed above

In [10]:
# Build the vocabulary and the matching embedding matrix from the GloVe file
# (one line per word: the token followed by its vector components).
glove_vocab_size = 400000  # number of words expected in glove.6B.50d.txt
glove_dim = 50             # components per GloVe vector
reserved_rows = 2          # TextVectorization puts '' (padding) and '[UNK]' first

words = []
# Rows 0 and 1 stay all-zero for the two reserved tokens, so the GloVe word
# read at position i is stored in row i + 2, the same index it receives in
# the TextVectorization vocabulary.
vectors = np.zeros((glove_vocab_size + reserved_rows, glove_dim))

# Iterate over the file lazily instead of materializing every line with
# readlines(), and read it explicitly as UTF-8 so the result does not depend
# on the platform's default encoding.
with open(f"{DATA_PATH}glove.6B.50d.txt", encoding="utf-8") as f:
    for idx, line in enumerate(f):
        token, *coefficients = line.split()
        words.append(token)
        vectors[idx + reserved_rows] = np.asarray(coefficients, dtype=vectors.dtype)

# Every preallocated row must have been filled; otherwise the vocabulary and
# the embedding matrix would silently disagree.
assert len(words) + reserved_rows == vectors.shape[0], (
    "GloVe file does not contain the expected number of words"
)

print(f"Number of words are {len(words)}")
print(f"Shape of vector is {vectors.shape}")

Number of words are 400000
Shape of vector is (400002, 50)


In [11]:
# Text -> token-id layer: standardizes with `preprocessing`, splits on
# whitespace and emits integer sequences of length `max_len`
vectorize_layer_glove = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=max_vocab,
    standardize=preprocessing,
    split="whitespace",
    output_mode="int",
    output_sequence_length=max_len,
)

# Use the GloVe word list as the layer's vocabulary
vectorize_layer_glove.set_vocabulary(words)

# Fetch the vocabulary once and reuse it for all three summaries
glove_vocabulary = vectorize_layer_glove.get_vocabulary()
print(f"Top 10 words {glove_vocabulary[:10]}")
print(f"Bottom 10 words {glove_vocabulary[-10:]} \n")

# Number of words
print(f"Total words are {len(glove_vocabulary)}")

Top 10 words ['', '[UNK]', 'the', ',', '.', 'of', 'to', 'and', 'in', 'a']
Bottom 10 words ['sigarms', 'katuna', 'aqm', '1.3775', 'corythosaurus', 'chanty', 'kronik', 'rolonda', 'zsombor', 'sandberger'] 

Total words are 400002


In [12]:
# Minimal model (string input -> vectorization layer) used to inspect the
# token ids the layer produces
model_glove = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(1,), dtype=tf.string),
    vectorize_layer_glove,
])

sample_sentences = [
    ["i have learn about the byte pair encoding for XLNET"],
    ["data science is everywhere"],
]
model_glove.predict(sample_sentences)

array([[   43,    35,  2370,    61,     2, 37360,  2571, 23623,    12,
            1,     0,     0,     0,     0,     0,     0],
       [  935,  1123,    16,  5339,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0]])

In [13]:
# Classifier on top of the vectorization layer:
# frozen GloVe embeddings -> LSTM -> 2-way softmax (index 0 = ham, index 1 = spam).
# NOTE: this cell appends to the existing `model_glove`; re-running it without
# re-creating the model first stacks the layers a second time.
model_glove.add(
    tf.keras.layers.Embedding(
        max_vocab,
        vectors.shape[1],  # embedding width taken from the GloVe matrix itself
        weights=[vectors],
        trainable=False,   # keep the pre-trained vectors fixed during training
    )
)
# NOTE(review): padding id 0 is not masked, so the LSTM also consumes the
# padded positions; consider `mask_zero=True` on the Embedding — TODO evaluate.
model_glove.add(tf.keras.layers.LSTM(16))
model_glove.add(tf.keras.layers.Dense(2, activation="softmax"))

# `metrics` is passed as a list, the form documented for `Model.compile`
model_glove.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [14]:
# Train for 10 epochs in batches of 64, holding out 10% of the samples for validation.
# NOTE(review): no random seed is set in the cells above, so results may differ between runs.
model_glove.fit(X, Y, epochs=10, batch_size=64, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fdd6842b070>

In [15]:
# Score one unseen message; the result holds one probability per class
sample_message = "free entry for 2 people in next FIFA match"
prediction = model_glove.predict([[sample_message]])
prediction

array([[0.30511713, 0.69488287]], dtype=float32)

We assigned the value 0 to "ham" (not spam) and the value 1 to "spam". Taking the argmax of `prediction` therefore gives 1, which means the message is classified as "spam".