In [1]:
import os
import re
import string

import pandas as pd
import tensorflow as tf

In [2]:
# Directory the input dataset is read from (relative to the working directory).
DATA_PATH = "../data/"
# Presumably the directory for saved model weights; not referenced in the visible cells.
MODEL_PATH = "../weights/"

In [3]:
# Build the file path with os.path.join so it stays well-formed whether or not
# DATA_PATH ends with a separator (plain string formatting produced "../data//smsspam.tsv").
sms_path = os.path.join(DATA_PATH, "smsspam.tsv")

# Tab-separated file; column names are supplied explicitly through `names`.
df = pd.read_csv(sms_path, sep="\t", names=["category", "statement"])
df.head()

Unnamed: 0,category,statement
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
def preprocessing(raw_text):
    """Standardize raw text for the vectorization layer.

    Steps, in order:
      1. lowercase the text,
      2. replace every literal "<br />" with a single space,
      3. replace every character in ``string.punctuation`` with a single space.

    Args:
        raw_text: string input accepted by ``tf.strings`` ops.

    Returns:
        The result of the final ``tf.strings.regex_replace`` call.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    text = tf.strings.lower(raw_text)
    text = tf.strings.regex_replace(text, "<br />", " ")
    text = tf.strings.regex_replace(text, punctuation_pattern, ' ')
    return text

In [5]:
# Hyperparameters for the TextVectorization layer
max_vocab = 5000  # used as max_tokens: upper bound on the vocabulary size
max_len = 16      # used as output_sequence_length: fixed length of every output sequence

# Raw statements the layer is adapted on
text_dataset = df["statement"].values

In [6]:
# Build a TextVectorization layer that performs our basic preprocessing itself.
# Newer TensorFlow releases expose the layer at tf.keras.layers.TextVectorization and
# no longer ship the `experimental.preprocessing` alias, so prefer the stable location
# and fall back to the experimental path only on older installs.
try:
    text_vectorization_cls = tf.keras.layers.TextVectorization
except AttributeError:
    text_vectorization_cls = tf.keras.layers.experimental.preprocessing.TextVectorization

vectorize_layer = text_vectorization_cls(
    max_tokens=max_vocab,
    standardize=preprocessing,
    split="whitespace",
    output_mode="int",
    output_sequence_length=max_len,
)

# Consume text_dataset to build the vocabulary and its token-to-index mapping
vectorize_layer.adapt(text_dataset)

# Vocabulary generated by the layer (fetched once and reused by every print below)
vocabulary = vectorize_layer.get_vocabulary()
print(f"Top 10 words {vocabulary[:10]}")
print(f"Bottom 10 words {vocabulary[-10:]} \n")

# Number of words
print(f"Total words are {len(vocabulary)}")

Top 10 words ['', '[UNK]', 'i', 'to', 'you', 'a', 'the', 'u', 'and', 'in']
Bottom 10 words ['sympathetic', 'syllabus', 'syd', 'swollen', 'swimsuit', 'swhrt', 'swell', 'sweatter', 'sweater', 'swear'] 

Total words are 5000


In [7]:
# Wrap the layer in a tiny model so we can inspect what it outputs
model_custom = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(1,), dtype=tf.string),  # one raw string per sample
    vectorize_layer,
])

# Two sample sentences, each as a single-element row to match the (1,) input shape
sample_sentences = [
    ["i have learn about the byte pair encoding"],
    ["data science is everywhere"],
]
model_custom.predict(sample_sentences)

array([[   2,   18, 1392,   84,    6,    1,    1,    1,    0,    0,    0,
           0,    0,    0,    0,    0],
       [   1, 2455,   10, 3838,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0]])