# Bert Spam Detection

Source: https://www.analyticsvidhya.com/blog/2021/09/performing-email-spam-detection-using-bert-in-python/

# Setup

In [29]:
!pip install tensorflow-text
!pip install wandb
!pip install gradio

Collecting gradio
  Downloading gradio-2.3.9-py3-none-any.whl (3.6 MB)
[K     |████████████████████████████████| 3.6 MB 5.1 MB/s 
Collecting Flask-Cors>=3.0.8
  Downloading Flask_Cors-3.0.10-py2.py3-none-any.whl (14 kB)
Collecting pycryptodome
  Downloading pycryptodome-3.11.0-cp35-abi3-manylinux2010_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 65.8 MB/s 
[?25hCollecting paramiko
  Downloading paramiko-2.8.0-py2.py3-none-any.whl (206 kB)
[K     |████████████████████████████████| 206 kB 38.5 MB/s 
[?25hCollecting flask-cachebuster
  Downloading Flask-CacheBuster-1.0.0.tar.gz (3.1 kB)
Collecting ffmpy
  Downloading ffmpy-0.3.0.tar.gz (4.8 kB)
Collecting analytics-python
  Downloading analytics_python-1.4.0-py2.py3-none-any.whl (15 kB)
Collecting markdown2
  Downloading markdown2-2.4.1-py2.py3-none-any.whl (34 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting Flask-Login
  Downloading Flask_Login-0.5.0-py2.py3-none-any.whl

In [2]:
import tensorflow_hub as hub
import pandas as pd
import tensorflow_text as text
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np

In [3]:
# Flexible integration for any Python script
import wandb
from wandb.keras import WandbCallback

# 1. Start a W&B run
wandb.init(project='spam_detection', entity='wasaequreshi')

# 2. Save model inputs and hyperparameters
config = wandb.config
config.learning_rate = 0.01    

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# Data Loading

In [4]:
!gdown --id 1JoOoXyJwqGI33MChbS3SYaVbibEGwxIB

Downloading...
From: https://drive.google.com/uc?id=1JoOoXyJwqGI33MChbS3SYaVbibEGwxIB
To: /content/spam_data.csv
  0% 0.00/486k [00:00<?, ?B/s]100% 486k/486k [00:00<00:00, 64.5MB/s]


In [5]:
# load data
df = pd.read_csv('/content/spam_data.csv')

In [6]:
# check count and unique and top values and their frequency
df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [7]:
# check percentage of data - states how much data needs to be balanced
print(str(round(747/4825,2))+'%')

0.15%


In [8]:
# creating 2 new dataframe as df_ham , df_spam
df_spam = df[df['Category']=='spam']
df_ham = df[df['Category']=='ham']

print("Ham Dataset Shape:", df_ham.shape)
print("Spam Dataset Shape:", df_spam.shape)

Ham Dataset Shape: (4825, 2)
Spam Dataset Shape: (747, 2)


In [9]:
# downsampling ham dataset - take only random 747 example
# will use df_spam.shape[0] - 747
df_ham_downsampled = df_ham.sample(df_spam.shape[0])
df_ham_downsampled.shape

(747, 2)

In [10]:
# concating both dataset - df_spam and df_ham_balanced to create df_balanced dataset
df_balanced = pd.concat([df_spam , df_ham_downsampled])


In [11]:
df_balanced['Category'].value_counts()

spam    747
ham     747
Name: Category, dtype: int64

In [12]:
df_balanced.sample(10)

Unnamed: 0,Category,Message
1637,ham,"No shit, but I wasn't that surprised, so I wen..."
4176,ham,How are you doing. How's the queen. Are you go...
1904,spam,Free entry in 2 a weekly comp for a chance to ...
5295,ham,Alex says he's not ok with you not being ok wi...
2089,spam,Well done ENGLAND! Get the official poly ringt...
1377,ham,Auntie huai juan never pick up her phone
2918,ham,Yes. that will be fine. Love you. Be safe.
350,ham,Just checking in on you. Really do miss seeing...
3014,spam,FREE UNLIMITED HARDCORE PORN direct 2 your mob...
1509,ham,Sounds like something that someone testing me ...


# Preprocessing Spam Data

In [13]:
# creating numerical repersentation of category - one hot encoding
df_balanced['spam'] = df_balanced['Category'].apply(lambda x:1 if x=='spam' else 0)

In [14]:
# displaying data - spam -1 , ham-0
df_balanced.sample(4)

Unnamed: 0,Category,Message,spam
703,ham,What is important is that you prevent dehydrat...,0
2539,ham,The monthly amount is not that terrible and yo...,0
3425,spam,Am new 2 club & dont fink we met yet Will B gr...,1
1765,spam,Hi 07734396839 IBH Customer Loyalty Offer: The...,1


# Train/Test Split

In [15]:
# loading train test split
from sklearn.model_selection import train_test_split
X_train, X_test , y_train, y_test = train_test_split(df_balanced['Message'], df_balanced['spam'],
                                                    stratify = df_balanced['spam'])

# Model

In [16]:
# downloading preprocessing files and model
bert_preprocessor = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
bert_encoder = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4')

In [17]:
text_input = tf.keras.layers.Input(shape = (), dtype = tf.string, name = 'Inputs')
preprocessed_text = bert_preprocessor(text_input)
embeed = bert_encoder(preprocessed_text)
dropout = tf.keras.layers.Dropout(0.1, name = 'Dropout')(embeed['pooled_output'])
outputs = tf.keras.layers.Dense(1, activation = 'sigmoid', name = 'Dense')(dropout)

In [18]:
# creating final model
model = tf.keras.Model(inputs = [text_input], outputs = [outputs])

In [19]:
# check the summary of the model
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Inputs (InputLayer)             [(None,)]            0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        {'input_word_ids': ( 0           Inputs[0][0]                     
__________________________________________________________________________________________________
keras_layer_1 (KerasLayer)      {'sequence_output':  109482241   keras_layer[0][0]                
                                                                 keras_layer[0][1]                
                                                                 keras_layer[0][2]                
______________________________________________________________________________________________

# Compiling/Training

In [20]:
Metrics = [tf.keras.metrics.BinaryAccuracy(name = 'accuracy'),
           tf.keras.metrics.Precision(name = 'precision'),
           tf.keras.metrics.Recall(name = 'recall')
           ]

In [21]:
# compiling our model
model.compile(optimizer ='adam',
               loss = 'binary_crossentropy',
               metrics = Metrics)

In [22]:
history = model.fit(X_train, y_train, epochs = 3, callbacks=[WandbCallback(monitor="val_loss", verbose=0, mode="auto", save_weights_only=(False),    log_weights=(False), log_gradients=(False), save_model=(True),    training_data=None, validation_data=None, labels=[], data_type=None,    predictions=36, generator=None, input_type=None, output_type=None,    log_evaluation=(False), validation_steps=None, class_colors=None,    log_batch_frequency=None, log_best_prefix="best_", save_graph=(True),    validation_indexes=None, validation_row_processor=None,    prediction_row_processor=None, infer_missing_processors=(True),    log_evaluation_frequency=0)])

Epoch 1/3
Epoch 2/3
Epoch 3/3


# Model Evaluation

In [23]:
# Evaluating performance
model.evaluate(X_test,y_test)



[0.3969670236110687,
 0.8957219123840332,
 0.8700000047683716,
 0.9304812550544739]

# Interactive

In [36]:
import gradio as gr

def greet(input_sentence):
  test_results = model.predict([input_sentence])
  return np.where(test_results>0.5,'spam', 'ham')[0][0]


iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()

Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
This share link will expire in 72 hours. If you need a permanent link, visit: https://gradio.app/introducing-hosted
Running on External URL: https://34644.gradio.app


(<Flask 'gradio.networking'>,
 'http://127.0.0.1:7862/',
 'https://34644.gradio.app')