### Install All the Required Libraries

In [2]:
# Install the third-party packages for this notebook into the kernel's own environment
# (%pip targets the running kernel, unlike a bare shell pip).
# NOTE(review): pytorch_lightning and pypdf are installed here but are not imported by the
# import cell of this notebook (PDF reading goes through PyPDF2) - confirm they are needed.
# NOTE(review): versions are unpinned, so a fresh install may resolve to different releases.
%pip install pytorch_lightning
%pip install transformers
%pip install pypdf
%pip install gradio
%pip install fpdf
%pip install pypdf2

Collecting pytorch_lightning
  Using cached pytorch_lightning-2.2.1-py3-none-any.whl.metadata (21 kB)
Collecting torch>=1.13.0 (from pytorch_lightning)
  Using cached torch-2.2.1-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting torchmetrics>=0.7.0 (from pytorch_lightning)
  Using cached torchmetrics-1.3.1-py3-none-any.whl.metadata (19 kB)
Collecting typing-extensions>=4.4.0 (from pytorch_lightning)
  Using cached typing_extensions-4.10.0-py3-none-any.whl.metadata (3.0 kB)
Collecting lightning-utilities>=0.8.0 (from pytorch_lightning)
  Using cached lightning_utilities-0.10.1-py3-none-any.whl.metadata (4.8 kB)
Collecting aiohttp (from fsspec[http]>=2022.5.0->pytorch_lightning)
  Using cached aiohttp-3.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->pytorch_lightning)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidi

### Import All the Required Libraries

In [3]:
import os
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from fpdf import FPDF
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import DataLoader,Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score
import PyPDF2
import gradio as gr

### Load and Create Dataset Sample for Training

In [4]:
# Load the full labelled e-mail corpus from the CSV next to the notebook.
# Later cells read its 'Email Text' and 'Email Type' columns.
dataset = pd.read_csv("./Phishing_Email.csv")

In [5]:
# Draw the 500-row training sample.
# - Rows with a missing text or label are dropped first: the following cells wrap every
#   value in str(), which would otherwise turn a missing e-mail into the literal text "nan".
# - A fixed seed makes the sample (and everything trained on it) reproducible on
#   Restart & Run All.
dataset_sample = (
    dataset
    .dropna(subset=['Email Text', 'Email Type'])
    .sample(500, random_state=42)
)

In [6]:
# E-mail bodies of the training sample as whitespace-trimmed strings.
texts = [str(body).strip() for body in dataset_sample['Email Text']]

In [7]:
# Raw (string) category of every sampled e-mail, trimmed of surrounding whitespace.
labels = [str(kind).strip() for kind in dataset_sample['Email Type']]

### Create Sample Emails for Testing the Application

In [14]:
# Pick five demo e-mails for the application.
# - Rows already drawn into the training sample are excluded, so the demo really runs on
#   e-mails the model has not been trained on.
# - Rows without text are skipped (they would become the string "nan" in the PDF).
# - The seed keeps the demo set stable across reruns.
test_data = (
    dataset
    .drop(index=dataset_sample.index)
    .dropna(subset=['Email Text'])
    .sample(5, random_state=42)
)

In [15]:
# Preview the demo rows (all five of them) as the cell's rich output.
test_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
11205,11206,15 firee piills and firee shiipping for ciaiis...,Phishing Email
6710,6711,re : natural gas nomination for 09 / 00 ok ! !...,Safe Email
1308,1308,winning notification dalobica lotto bv . inter...,Phishing Email
9219,9220,URL: http://boingboing.net/#85516100\nDate: No...,Safe Email
2146,2146,04 counterparties louise indicated that credit...,Safe Email


In [16]:
# Trimmed text of each demo e-mail, in the same order as test_data.
test_email = [str(body).strip() for body in test_data['Email Text']]

In [17]:
# Write every demo e-mail to its own PDF (email1.pdf, email2.pdf, ...) so it can be
# uploaded to the application later.
output_dir = "Testing_Emails"
# pdf.output() does not create missing folders, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

for number, body in enumerate(test_email, start=1):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    # The built-in Arial core font is limited to Latin-1; characters outside that range
    # make the PDF writer raise an encoding error, so they are replaced with '?'.
    printable_body = body.encode("latin-1", "replace").decode("latin-1")
    pdf.write(5, printable_body)
    pdf.output(f"{output_dir}/email{number}.pdf")

### Label Encoding and Splitting of Training Data

In [18]:
# Turn the string categories in 'Email Type' into integer class ids.
# NOTE(review): LabelEncoder numbers classes in sorted order, so with the two categories
# visible in this notebook ('Phishing Email', 'Safe Email') phishing should be 0 and
# safe 1 - predict() relies on that; confirm with label_encoder.classes_.
label_encoder = LabelEncoder()
encoded_types = label_encoder.fit_transform(dataset_sample['Email Type'])
dataset_sample['Email_Type'] = encoded_types

# Plain Python list of class ids, aligned row-for-row with `texts`.
label = dataset_sample['Email_Type'].tolist()

In [19]:
# 80/20 train/test split.
# - random_state: the split must be identical on every run; otherwise re-evaluating a
#   checkpoint saved by an earlier run would test on rows that checkpoint was trained on.
# - stratify: keeps the phishing/safe ratio the same in both parts of this small sample.
train_text, test_text, train_label, test_label = train_test_split(
    texts,
    label,
    test_size=0.2,
    random_state=42,
    stratify=label,
)

### Create a Custom Dataset Class

In [44]:
class CustomDataset(Dataset):
    """Torch dataset that tokenizes one e-mail per item for sequence classification.

    Args:
        texts: Sequence of e-mail bodies; each item is converted with ``str``.
        labels: Sequence of integer-convertible class ids, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (in this notebook a ``BertTokenizer``).
        max_length: Token length every item is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of e-mails in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (1-D tensors, batch axis
            removed) and ``labels`` (0-D ``torch.long`` tensor).
        """
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            # Explicit replacements for the deprecated `pad_to_max_length=True`:
            # pad short e-mails and cut long ones so every item has the same length
            # and can be stacked into a batch.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # squeeze(0) removes only the batch axis added by return_tensors='pt';
            # a bare squeeze() would also collapse a length-1 sequence axis.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long),
        }

### Get Model and Tokenizer

In [39]:
# Load the pretrained 'bert-base-uncased' tokenizer and the matching BERT encoder with a
# two-class classification head (num_labels=2).
# The load message shown below this cell reports the classifier weights as newly
# initialized, i.e. the model has to be fine-tuned before its predictions are meaningful.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Create Training and Testing DataLoaders

In [40]:
# Wrap both splits in the tokenizing dataset (default max_length).
train_dataset = CustomDataset(texts=train_text, labels=train_label, tokenizer=tokenizer)
test_dataset = CustomDataset(texts=test_text, labels=test_label, tokenizer=tokenizer)

# Batches of 8; only the training data is reshuffled each epoch.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=8, shuffle=False)

### Create Optimizer and Set Criterion

In [41]:
# Use PyTorch's own AdamW: the AdamW class exported by `transformers` is deprecated in
# favour of it. eps and weight_decay are set explicitly to the values the transformers
# implementation used by default, so the optimisation itself is unchanged.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-6,
    weight_decay=0.0,
)

# NOTE(review): the training loop in this notebook takes its loss from `outputs.loss`
# and never references `criterion`; it is kept here only for compatibility.
criterion = torch.nn.CrossEntropyLoss()



### Set the Number of Epochs and Device To Train the Model

In [42]:
# Number of full passes over the training loader.
num_epochs = 5

# Train on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Move the model's parameters onto the chosen device (the returned model is displayed).
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Fine-tune the model and write one checkpoint per epoch (model_0.pth, model_1.pth, ...).
save_dir = "./Model"
# Create the checkpoint folder up front: torch.save() does not create it, and failing
# only after a whole epoch of training would throw that work away.
os.makedirs(save_dir, exist_ok=True)

for epoch in range(num_epochs):
    model.train()  # enable dropout for training
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named batch_labels so the notebook-level `labels` list is not overwritten.
        batch_labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # Passing labels makes the model compute the classification loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    # Mean per-batch loss for this epoch.
    average_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch +1}/{num_epochs}, Loss:{average_loss:.4f}')

    # File names use the zero-based epoch index.
    model_state_dict = model.state_dict()
    model_filename = os.path.join(save_dir, f'model_{epoch}.pth')
    torch.save(model_state_dict, model_filename)

### Evaluate the Model to get the Accuracy

In [27]:
# Reload the checkpoint written after the final training epoch.
# Checkpoints are named with the zero-based epoch index, so the last one is
# model_{num_epochs - 1}.pth (model_4.pth for the default of 5 epochs); deriving the name
# keeps this cell correct when num_epochs is changed.
load_dir = './Model'
load_filename = os.path.join(load_dir, f'model_{num_epochs - 1}.pth')
# map_location lets a checkpoint saved on a GPU be loaded on whatever device is in use.
model_state_dict = torch.load(load_filename, map_location=device)
model.load_state_dict(model_state_dict)

<All keys matched successfully>

In [28]:
# Evaluation mode: dropout off.
model.eval()
all_preds, all_labels = [], []

# Pure inference, so gradient tracking is disabled.
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Highest-scoring class per e-mail, brought back to the CPU as a NumPy array.
        preds = logits.argmax(dim=1).cpu().numpy()
        all_preds += list(preds)
        all_labels += list(labels.cpu().numpy())

# Share of test e-mails whose predicted class equals the true class.
accuracy = accuracy_score(y_true=all_labels, y_pred=all_preds)
print(f"validation Accuracy: {accuracy:.2f}")

validation Accuracy: 0.91


### Prediction Function for Classifying Unseen Emails

In [45]:
def predict(model, tokenizer, email_text, device='cuda',
            checkpoint_path='./Model/model_4.pth'):
    """
    Classify a single e-mail as phishing or safe.

    The fine-tuned weights are loaded from ``checkpoint_path`` into ``model`` on every
    call, so ``model`` is modified in place.

    Args:
        model (BertForSequenceClassification): Model whose weights are replaced by the
            checkpoint and which is then used for inference.
        tokenizer (BertTokenizer): The tokenizer used to encode the email text.
        email_text (str): The email text to classify.
        device (str or torch.device, optional): Device to run on. Defaults to 'cuda';
            falls back to the CPU when CUDA is requested but not available.
        checkpoint_path (str, optional): State-dict file to load. Defaults to the
            checkpoint of the fifth training epoch, './Model/model_4.pth'.

    Returns:
        str: "The email is phishing email." for class 0, otherwise
        "The email is safe email."
    """
    # Resolve the device first so the checkpoint can be mapped straight onto it.
    target_device = torch.device(device)
    if target_device.type == 'cuda' and not torch.cuda.is_available():
        target_device = torch.device('cpu')

    # map_location makes a checkpoint saved on a GPU loadable on a CPU-only machine.
    model_state_dict = torch.load(checkpoint_path, map_location=target_device)
    model.load_state_dict(model_state_dict)
    # Model and inputs must live on the same device.
    model.to(target_device)
    model.eval()

    # Tokenize the email text. truncation=True lets the tokenizer cut long e-mails to
    # 512 tokens itself, which keeps the closing special token; slicing the tensor
    # afterwards would drop it.
    encoding = tokenizer.encode_plus(
        email_text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        return_token_type_ids=False,
        padding=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(target_device)
    attention_mask = encoding['attention_mask'].to(target_device)

    # Make a prediction
    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)
        logits = output.logits
        predicted_class = torch.argmax(logits, dim=1).item()

    # Class 0 is treated as phishing; every other class id is reported as safe.
    if predicted_class == 0:
        return "The email is phishing email."
    else:
        return "The email is safe email."

### Function to Extract the Text from the PDF File

In [46]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        str: The pages' extracted text joined without a separator; pages that yield
        no text contribute an empty string.
    """
    with open(file_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` guards against a page whose extraction yields None (e.g. no text
        # layer), which would otherwise break the string concatenation.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    return "".join(page_texts)

### Final Gradio Application (Phishing Email Detection Model)

In [47]:
def process_pdf(pdf_file):
    """Gradio callback: classify the e-mail contained in an uploaded PDF.

    Args:
        pdf_file: Path of the uploaded PDF, or None when nothing was uploaded.

    Returns:
        str: The verdict produced by ``predict``, or a short notice when there is
        nothing to classify.
    """
    # The callback receives None when the form is submitted without a file.
    if pdf_file is None:
        return "Please upload a PDF file."

    text = extract_text_from_pdf(pdf_file)
    # A PDF without extractable text would otherwise be classified on empty input
    # and produce a meaningless verdict.
    if not text.strip():
        return "No extractable text was found in the PDF."

    return predict(model,tokenizer,text,device)

# Web front end: a single PDF upload wired to process_pdf, with the verdict shown as text.
pdf_upload = gr.File(type="filepath", label="Upload a PDF File")

interface = gr.Interface(
    fn=process_pdf,
    inputs=pdf_upload,
    outputs="text",
    title="PDF Processing",
    description="Phishing Emails Detection Model",
    elem_id="my-interface",  # Optional, assign a unique ID for styling
)

# share=True also publishes a temporary public URL in addition to the local one.
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7863
Running on public URL: https://36ca4605b66a0b3aa6.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




