In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import re
import string
import nltk

import tensorflow as tf

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
FILE_PATH = "../../data/spam.csv"

df = pd.read_csv(FILE_PATH, encoding="latin-1")

df = df.dropna(how="any", axis=1)

df = df.rename(columns={ "v1": "target", "v2": "message" })

df.head()

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

def process_df(df):
    df['processed'] = df['message'].apply(clean_text)
    return df

df = process_df(df)
df.head()

Unnamed: 0,target,message,processed
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


In [4]:
stop_words = stopwords.words('english')
more_stopwords = ['u', 'im', 'c']
stop_words = stop_words + more_stopwords

def remove_stopwords(text):
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    return text
    
df['processed'] = df['processed'].apply(remove_stopwords)
df.head()


Unnamed: 0,target,message,processed
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts m...
3,ham,U dun say so early hor... U c already then say...,dun say early hor already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though


In [5]:
stemmer = nltk.SnowballStemmer("english")

def stemm_text(text):
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text

df['processed'] = df['processed'].apply(stemm_text)
df.head()

Unnamed: 0,target,message,processed
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkts m...
3,ham,U dun say so early hor... U c already then say...,dun say earli hor alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though


In [7]:
df['target'] = df['target_encoded'].apply(lambda x: 1 if x == "spam" else 0)
df.head()

# To be replaced with spam 1, ham 0

Unnamed: 0,target,message,processed,target_encoded
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...,0
1,0,Ok lar... Joking wif u oni...,ok lar joke wif oni,0
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkts m...,1
3,0,U dun say so early hor... U c already then say...,dun say earli hor alreadi say,0
4,0,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though,0


In [8]:
texts = df['processed']
target = df['target_encoded']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    texts, 
    target, 
    test_size=0.25,
    random_state=42,
)

In [40]:
len(X_train)

4179

In [26]:
word_tokenizer = tf.keras.preprocessing.text.Tokenizer()
word_tokenizer.fit_on_texts(X_train)

vocab_length = len(word_tokenizer.word_index) + 1
vocab_length

5746

In [27]:
max_length = 80

In [28]:
metadata = {}
metadata["vocabulary_size"] = vocab_length
metadata["max_length"] = max_length

In [29]:
embeddings_dictionary = dict()
embedding_dim = 100

# Load GloVe 100D embeddings
with open('./glove.6B.100d.txt') as fp:
    for line in fp.readlines():
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary [word] = vector_dimensions

embedding_matrix = np.zeros((vocab_length, embedding_dim))

for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
        
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.57832998, -0.0036551 ,  0.34658   , ...,  0.070204  ,
         0.44509   ,  0.24147999],
       [-0.078894  ,  0.46160001,  0.57779002, ...,  0.26352   ,
         0.59397   ,  0.26741001],
       ...,
       [-0.53434002,  0.096645  ,  0.13808   , ...,  0.12309   ,
         0.69825   ,  0.88173997],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [30]:
embedding_matrix.shape

(5746, 100)

In [35]:
len(X_train_indices)

76

In [37]:
len(X_train[0].split(" "))

16

In [39]:
def embed(corpus): 
    return word_tokenizer.texts_to_sequences(corpus)

train_padded_sentences = tf.keras.preprocessing.sequence.pad_sequences(
    embed(X_train), 
    max_length, 
    padding='post'
)

train_padded_sentences.shape

(4179, 80)

In [44]:
import json
word_index = {}
for word, index in word_tokenizer.word_index.items():
    word_index[word] = index

metadata["word_index"] = word_index

with open("metadata.json", "w") as f:
    json.dump(metadata, f)

In [58]:
def glove_lstm(max_length):
    model = tf.keras.models.Sequential()
    
    model.add(tf.keras.layers.Input((max_length,)))
    
    # model.add(tf.keras.layers.Embedding(
    #     input_dim=embedding_matrix.shape[0], 
    #     output_dim=embedding_matrix.shape[1], 
    #     weights = [embedding_matrix], 
    #     input_length=length_long_sentence
    # ))
    
    model.add(tf.keras.layers.Embedding(
        input_dim=vocab_length,
        output_dim=embedding_dim,
        input_length=max_length,
        weights = [embedding_matrix],
        trainable=False
    ))
    
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(
        max_length, 
        return_sequences = True, 
        recurrent_dropout=0.2
    )))
    
    model.add(tf.keras.layers.GlobalMaxPool1D())
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Dropout(0.5))
    model.add(tf.keras.layers.Dense(max_length, activation = "relu"))
    model.add(tf.keras.layers.Dropout(0.5))
    model.add(tf.keras.layers.Dense(max_length, activation = "relu"))
    model.add(tf.keras.layers.Dropout(0.5))
    model.add(tf.keras.layers.Dense(1, activation = 'sigmoid'))
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

model = glove_lstm(max_length)
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 80, 100)           574600    
                                                                 
 bidirectional_4 (Bidirectio  (None, 80, 160)          115840    
 nal)                                                            
                                                                 
 global_max_pooling1d_4 (Glo  (None, 160)              0         
 balMaxPooling1D)                                                
                                                                 
 batch_normalization_4 (Batc  (None, 160)              640       
 hNormalization)                                                 
                                                                 
 dropout_12 (Dropout)        (None, 160)               0         
                                                      

In [59]:
test_padded_sentences = tf.keras.preprocessing.sequence.pad_sequences(
    embed(X_test), 
    max_length, 
    padding='post'
)

In [60]:
history = model.fit(
    train_padded_sentences, 
    y_train, 
    epochs = 7,
    batch_size = 32,
    validation_data = (test_padded_sentences, y_test),
    verbose = 1,
)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [62]:
!mkdir -p saved_model
model.save('lstm_glove')

INFO:tensorflow:Assets written to: lstm_glove/assets


In [70]:
!tensorflowjs_converter --input_format=tf_saved_model --output_format tfjs_graph_model --control_flow_v2=true ./lstm_glove ./tfjs_lstm_glove

2022-06-14 15:43:10.019478: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-14 15:43:10.020736: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-06-14 15:43:13.781874: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-06-14 15:43:13.782011: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-06-14 15:43:13.782131: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ada8cfd194fc): /proc/driver/nvidia/version does not exist
2022-06-14 15:43:13.782510: I tensorflow/core/platform/cpu_featu

In [68]:
test = np.zeros((1,80))
test[0][0] = 1000
test

array([[1000.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.]])

In [69]:
model.predict(test)



array([[1.8707024e-05]], dtype=float32)