# Email Spam Classification using BERT

In [1]:
import pandas as pd
import numpy as np
# NOTE(review): tensorflow_text is never referenced by name in this notebook;
# presumably it is imported for its side effect of registering the TF ops that
# the TF Hub BERT preprocessing model needs — confirm before removing.
import tensorflow_text
import tensorflow_hub as hub

In [2]:
# Location of the spam CSV.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
DATA_PATH = r"D:\Data Science\DataSets\Email\spam.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Summary statistics per column (count / unique / top / freq).
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [4]:
# Count missing values per column.
df.isna().sum()

Category    0
Message     0
dtype: int64

In [5]:
# Class distribution of the raw data (ham vs. spam).
df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

## The dataset is imbalanced, so we downsample the majority (ham) class

In [6]:
# Keep only the rows labelled 'ham'.
is_ham = df['Category'] == 'ham'
df_ham = df[is_ham]
df_ham.shape

(4825, 2)

In [7]:
# Keep only the rows labelled 'spam'.
is_spam = df['Category'] == 'spam'
df_spam = df[is_spam]
df_spam.shape

(747, 2)

In [8]:
# Draw as many ham rows as there are spam rows so both classes are the same
# size. random_state is fixed so the same rows are drawn on every run;
# without it the balanced set (and every metric below) changes per execution.
df_downsampled = df_ham.sample(n=df_spam.shape[0], random_state=42)
df_downsampled.shape

(747, 2)

In [9]:
# Stack the spam rows and the downsampled ham rows into one balanced frame.
balanced_parts = [df_spam, df_downsampled]
df_balance = pd.concat(balanced_parts)

In [10]:
# Row/column count of the balanced frame.
df_balance.shape

(1494, 2)

In [11]:
# Confirm that both classes now have the same number of rows.
df_balance['Category'].value_counts()

spam    747
ham     747
Name: Category, dtype: int64

In [12]:
# Binary target: 1 for spam, 0 for ham. Derived from df_balance's own
# Category column; the previous version read the labels from the full `df`
# and only lined up with df_balance through implicit index alignment.
df_balance['Spam'] = df_balance['Category'].eq('spam').astype('int64')

In [13]:
# Spot-check five random rows of the balanced frame.
df_balance.sample(5)

Unnamed: 0,Category,Message,Spam
4310,ham,He dint tell anything. He is angry on me that ...,0
851,ham,U reach orchard already? U wan 2 go buy ticket...,0
2300,spam,Congrats! 1 year special cinema pass for 2 is ...,1
2850,spam,YOUR CHANCE TO BE ON A REALITY FANTASY SHOW ca...,1
4079,ham,Gam gone after outstanding innings.,0


In [14]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the label so both sets keep the class balance.
# random_state is fixed so the split (and the reported metrics) can be
# reproduced; previously every run produced a different split.
X_train, X_test, y_train, y_test = train_test_split(
    df_balance['Message'],
    df_balance['Spam'],
    test_size=0.2,
    shuffle=True,
    stratify=df_balance['Spam'],
    random_state=42,
)


In [15]:
# Number of training messages.
X_train.shape

(1195,)

In [16]:
# Number of held-out test messages.
X_test.shape

(299,)

In [17]:
# TF Hub handles for the BERT preprocessing model and the BERT encoder.
BERT_PREPROCESS_URL = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
BERT_ENCODER_URL = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'

bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)

In [18]:
def get_sentence_embeding(sentance):
    """Return the BERT ``pooled_output`` embedding for the given text input.

    The input is run through ``bert_preprocess`` and the result is fed to
    ``bert_encoder``; the ``'pooled_output'`` entry of the encoder's output
    is returned.
    """
    encoder_outputs = bert_encoder(bert_preprocess(sentance))
    return encoder_outputs['pooled_output']

In [19]:
# Embed two sample sentences as a quick check of the helper.
sample_sentences = [
    "500$ discount. hurry up",
    "Bhavin, are you up for a volleybal game tomorrow?",
]
get_sentence_embeding(sample_sentences)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.84351707, -0.5132727 , -0.8884575 , ..., -0.747489  ,
        -0.7531475 ,  0.91964495],
       [-0.8720837 , -0.5054398 , -0.94446695, ..., -0.8584751 ,
        -0.71745366,  0.88082975]], dtype=float32)>

## Building the Model

In [20]:
import tensorflow as tf
from keras.models import Model
from keras.layers import Input,Dropout,Dense

In [21]:
# Functional-API model:
# raw string -> BERT preprocess -> BERT encoder -> dropout -> sigmoid unit.
input_text = Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(input_text)
output = bert_encoder(preprocessed_text)

# Classification head on top of the pooled sentence embedding.
dropped = Dropout(0.2)(output['pooled_output'])
l = Dense(1, activation='sigmoid')(dropped)

model = Model(inputs=input_text, outputs=l)


In [22]:
# Print the layer-by-layer architecture of the model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

In [23]:
# Metrics reported (after the loss) by fit/evaluate, in this order.
keras_metrics = tf.keras.metrics
matrix = [
    keras_metrics.BinaryAccuracy(),
    keras_metrics.Precision(),
    keras_metrics.Recall(),
]

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=matrix,
)


In [24]:
# Write training logs to ./logs for TensorBoard.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs")

# Train for 10 epochs on the training split.
model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    callbacks=[tensorboard_callback],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x24b1fd53100>

In [25]:
# Evaluate on the held-out split. The returned list is the loss followed by
# the compiled metrics in order: [loss, binary accuracy, precision, recall].
model.evaluate(X_test,y_test)



[0.26515647768974304,
 0.8963210582733154,
 0.8650306463241577,
 0.9399999976158142]

In [26]:
# Model outputs for the test split, flattened to a 1-D array.
pred = model.predict(X_test).flatten()



In [27]:
import numpy as np

# Turn the sigmoid outputs into hard 0/1 labels: 1 when strictly above 0.5.
above_threshold = pred > 0.5
pred = np.where(above_threshold, 1, 0)
pred

array([0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1])

In [28]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test split.
report = classification_report(y_test, pred)
print(report)


              precision    recall  f1-score   support

           0       0.93      0.85      0.89       149
           1       0.87      0.94      0.90       150

    accuracy                           0.90       299
   macro avg       0.90      0.90      0.90       299
weighted avg       0.90      0.90      0.90       299

