# **INFT1204: Special Topics in Cybersecurity - Assignment 3**

**Group Members: Hammed Tijani (ID: 100902204), Tehara Moonemalle (ID: 100903984), Steffannie Egbuziem (ID: 100896975), Romoyne Watson (ID: 100895321).**

Code attribution:

*   https://www.analyticsvidhya.com/blog/2021/09/performing-email-spam-detection-using-bert-in-python/
*   https://github.com/prateekjoshi565/Fine-Tuning-BERT/blob/master/Fine_Tuning_BERT_for_Spam_Classification.ipynb






# **Install Packages**

In [None]:
!pip install -U "tensorflow==2.8.*"
!pip install -U "tensorflow-text==2.8.*"
!pip install transformers
!pip install -U tensorflow-text
!pip install transformers[torch]
!pip install accelerate -U
!pip install gradio
!pip install fsspec==2022.10.0

# **Import Packages**

In [8]:
import tensorflow_hub as hub
import pandas as pd
import gradio as gr
import tensorflow_text as text
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np
import transformers
from transformers import AutoModel, BertTokenizerFast, AutoModelForSequenceClassification, TrainingArguments
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs on a machine or runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# **Load the Dataset**

In [None]:
# Read the labelled message CSV into a DataFrame and preview the first rows
spam_csv_path = '/content/Data/spam.csv'
df = pd.read_csv(spam_csv_path)
df.head()

In [None]:
# Count how many rows carry each label in the Category column
category_counts = df['Category'].value_counts()
category_counts

In [None]:
# Class-imbalance check: the smaller class expressed as a percentage of the
# larger one, i.e. how much balancing the dataset needs.
# 747 is the phishing row count; 4825 is presumably the not-phishing row count
# read off the value_counts() output - TODO confirm against the dataset.
# The ratio is multiplied by 100 so the printed number really is a percentage.
phishing_to_not_phishing_pct = round(747 / 4825 * 100, 2)
print(str(phishing_to_not_phishing_pct) + '%')

# **Creating a new dataset with equal data for each Category**

In [None]:
# Partition the raw frame by label into df_phishing and df_not_phishing.
# Each subset is selected by an explicit equality test on Category, so rows
# with any other label end up in neither frame.
phishing_mask = df['Category'] == 'phishing'
not_phishing_mask = df['Category'] == 'not phishing'

df_phishing = df[phishing_mask]
df_not_phishing = df[not_phishing_mask]

# Report (rows, columns) of each subset
print("Not Phishing Dataset Shape:", df_not_phishing.shape)
print("Phishing Dataset Shape:", df_phishing.shape)

In [None]:
# Downsample the not-phishing rows to the same number of rows as df_phishing
# (747 examples) so that both classes are equally represented.
# random_state pins the random draw, so Restart & Run All picks the same rows
# and the downstream split and metrics are reproducible.
df_nphishing_downsampled = df_not_phishing.sample(n=df_phishing.shape[0], random_state=42)
df_nphishing_downsampled.shape

In [None]:
# Stack the phishing rows and the downsampled not-phishing rows into one
# balanced frame, df_balanced
balanced_parts = [df_phishing, df_nphishing_downsampled]
df_balanced = pd.concat(balanced_parts)

# Show the per-label row counts of the combined frame
df_balanced['Category'].value_counts()

In [None]:
# Preview 10 randomly drawn rows of the balanced dataset
df_balanced.sample(10)

In [None]:
# Binary label encoding of Category into a new Type column:
# 'phishing' -> 1, every other value -> 0
is_phishing = df_balanced['Category'].eq('phishing')
df_balanced['Type'] = is_phishing.astype('int64')

# Spot-check 4 random rows to confirm the new Type column
df_balanced.sample(n=4)

# **Training and Testing Dataset**

In [18]:
# Hold out a test set, stratified on Type so both classes keep the same
# proportion in the train and test splits. No test_size is passed, so
# scikit-learn's default split size applies.
# random_state fixes the shuffle so the split is identical on every run.
# (train_test_split is already imported in the imports cell at the top.)
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['Message'],
    df_balanced['Type'],
    stratify=df_balanced['Type'],
    random_state=42,
)

# **Download BERT**

In [19]:
# TF Hub handles for the BERT preprocessing layer and the BERT encoder
bert_preprocess_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
bert_encoder_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'

# Wrap each hub module as a Keras layer
bert_preprocessor = hub.KerasLayer(bert_preprocess_url)
bert_encoder = hub.KerasLayer(bert_encoder_url)

# **Training using Keras API**

In [20]:
# Assemble the classifier with the Keras functional API:
# raw string -> BERT preprocessing -> BERT encoder -> dropout -> sigmoid unit
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='Inputs')
preprocessed_text = bert_preprocessor(text_input)
embeed = bert_encoder(preprocessed_text)

# Only the encoder's 'pooled_output' entry feeds the classification head
dropout_layer = tf.keras.layers.Dropout(0.1, name='Dropout')
dropout = dropout_layer(embeed['pooled_output'])
dense_layer = tf.keras.layers.Dense(1, activation='sigmoid', name='Dense')
outputs = dense_layer(dropout)

# Wrap the graph from the text input to the sigmoid output as the final model
model = tf.keras.Model(inputs=[text_input], outputs=[outputs])

In [None]:
# Print the summary of the assembled model
model.summary()

In [22]:
# Metrics reported during training and evaluation
Metrics = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]

# Compile with the Adam optimizer and binary cross-entropy loss,
# matching the single sigmoid output unit
compile_options = dict(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=Metrics,
)
model.compile(**compile_options)

In [None]:
# Train for 10 epochs on the training split; keep the returned history object
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
)

In [None]:
# Score the trained model on the held-out test split
model.evaluate(x=X_test, y=y_test)

# **Gradio Interface**

In [None]:
def spam_filter(email):
    """Classify a single email text as 'phishing' or 'not phishing'.

    Args:
        email: The email text to classify. ``None`` or a blank / whitespace-only
            value is treated as having no content.

    Returns:
        str: 'phishing' when the model's predicted score is strictly greater
        than 0.5, otherwise 'not phishing'. Blank input returns 'not phishing'
        without calling the model.
    """
    # Nothing to classify: skip the model call instead of feeding it None or an
    # empty string.
    if email is None or not str(email).strip():
        return 'not phishing'

    # The model is called with a one-element batch; the first value of the
    # returned array is the score for that single email.
    test_results = model.predict([email])
    phishing_score = float(np.asarray(test_results).reshape(-1)[0])

    # Return a plain Python str rather than a NumPy string scalar.
    return 'phishing' if phishing_score > 0.5 else 'not phishing'

# Build the Gradio UI: a two-line text box for the email in,
# a text box labelled "output" for the predicted label out
email_box = gr.Textbox(lines=2, placeholder="Email Here...")
result_box = gr.Textbox(label="output")
demo = gr.Interface(fn=spam_filter, inputs=email_box, outputs=[result_box])

# Start the app; share=True asks Gradio to also create a public share link
demo.launch(share=True)