### Install Libraries

In [None]:
!pip install --upgrade transformers gradio==3.48.0 sentencepiece opendatasets pandas gdown

Collecting pandas
  Downloading pandas-2.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Collecting tzdata>=2022.1 (from pandas)
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m341.8/341.8 kB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tzdata, pandas, gdown
  Attempting uninstall: pandas
    Found existing installation: pandas 1.5.3
    Uninstalling pandas-1.5.3:
      Successfully uninstalled pandas-1.5.3
  Attempting uninstall: gdown
    Found existing installation: gdown 4.6.6
    Uninstalling gdown-4.6.6:
      Successfully uninstalled gdown-4.6.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behavi

### Import Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import torch
import opendatasets as od
import pandas

### Import Data

In [None]:
!gdown --id 1cOohptk4-83tBadQvdkjGE8AZ9Tp3yuW -O documents.zip
!unzip -q documents.zip -d documents

Downloading...
From: https://drive.google.com/uc?id=1cOohptk4-83tBadQvdkjGE8AZ9Tp3yuW
To: /content/documents.zip
100% 19.0M/19.0M [00:00<00:00, 103MB/s] 


In [None]:
# Import the dataset from the directory the archive was extracted into.
# The path is relative, matching the relative `-d documents` unzip target, so the
# notebook does not depend on the working directory being /content.
df = pd.read_csv("documents/Phishing_Email.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [None]:
# Report how many values are missing in each column
print(df.isna().sum())
# Discard every row that has at least one missing value
df = df.dropna(axis=0, how="any")
# Report again to confirm that no missing values remain
print(df.isna().sum())

Unnamed: 0     0
Email Text    16
Email Type     0
dtype: int64
Unnamed: 0    0
Email Text    0
Email Type    0
dtype: int64


In [None]:
# Remove the leftover 'Unnamed: 0' column that came in with the CSV.
# errors='ignore' keeps the cell re-runnable: a second run finds the column already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Display the cleaned DataFrame
df

Unnamed: 0,Email Text,Email Type
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,the other side of * galicismos * * galicismo *...,Safe Email
2,re : equistar deal tickets are you still avail...,Safe Email
3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,software at incredibly low prices ( 86 % lower...,Phishing Email
...,...,...
18645,date a lonely housewife always wanted to date ...,Phishing Email
18646,request submitted : access request for anita ....,Safe Email
18647,"re : important - prc mtg hi dorn & john , as y...",Safe Email
18648,press clippings - letter on californian utilit...,Safe Email


### Split data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the shuffle repeatable
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Display the training split
train_data

Unnamed: 0,Email Text,Email Type
5033,"\n""Yes, and Eliza and I composed a precocious ...",Safe Email
3808,sum : linguistics and imperialism some two wea...,Safe Email
4092,URL: http://boingboing.net/#85482924\nDate: No...,Safe Email
1477,"re : worrks wonder hello , donor convey a sing...",Phishing Email
3614,"brandywine meter # : 981225 ; march , 2000 act...",Safe Email
...,...,...
11298,empty,Phishing Email
11979,URL: http://www.mozillazine.org/weblogs/hyatt/...,Safe Email
5398,sum : ref . on formal models of discourse cont...,Safe Email
862,underpriced issue with high return on equity s...,Phishing Email


In [None]:
# Define a custom dataset class
class EmailDataset(Dataset):
    """Map-style dataset that tokenizes emails for binary phishing classification.

    Each item is a dict with:
        * ``input_ids`` / ``attention_mask`` -- the tokenizer's tensors, flattened to 1-D
        * ``labels`` -- scalar ``torch.long`` tensor, 1 for ``'Phishing Email'``, 0 otherwise

    Args:
        texts: Email bodies, as a pandas Series or any positionally indexable
            sequence (list, tuple, ...).
        labels: Label strings aligned position-by-position with ``texts``.
        tokenizer: Callable tokenizer accepting the ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` keyword arguments.
        max_length: Length passed to the tokenizer for padding and truncation.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    # Label string encoded as class 1; every other label becomes class 0.
    POSITIVE_LABEL = 'Phishing Email'

    def __init__(self, texts, labels, tokenizer, max_length=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _at(values, idx):
        """Return the element at position ``idx`` of a pandas object or a plain sequence."""
        # pandas objects need .iloc: their [] operator looks up index *labels*,
        # which no longer run 0..n-1 after a shuffled train/test split.
        return values.iloc[idx] if hasattr(values, 'iloc') else values[idx]

    def __getitem__(self, idx):
        text = str(self._at(self.texts, idx))
        label = int(self._at(self.labels, idx) == self.POSITIVE_LABEL)
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        return {
            # flatten() drops the leading batch dimension added by return_tensors='pt'
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

### Load Base LLM model & tokenizer

In [None]:
# Load the pretrained T5 checkpoint: its tokenizer plus a sequence-classification model
checkpoint = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,  # binary classification
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at t5-small and are newly initialized: ['classification_head.out_proj.bias', 'classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Training (Fine-tuning) Process

In [None]:
# Wrap the text and label columns of each split in an EmailDataset
train_dataset, test_dataset = (
    EmailDataset(split['Email Text'], split['Email Type'], tokenizer)
    for split in (train_data, test_data)
)


In [None]:
# Batch both splits in groups of 8; only the training loader shuffles
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=8,
    shuffle=False,
)

In [None]:
# Set up optimizer and training parameters.
# PyTorch's own AdamW replaces the deprecated transformers.AdamW. eps and weight_decay
# are spelled out with the values transformers.AdamW used by default, because
# torch.optim.AdamW defaults differ (eps=1e-8, weight_decay=1e-2).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 1



In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU, and move the model there.
# The trailing ';' stops the notebook from printing the entire module tree as the cell output.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device);


T5ForSequenceClassification(
  (transformer): T5Model(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=512, bias=False)
                (k): Linear(in_features=512, out_features=512, bias=False)
                (v): Linear(in_features=512, out_features=512, bias=False)
                (o): Linear(in_features=512, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 8)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=512, out_features=2048, bias=False)
                (wo): Linear(in_featu

In [None]:
# Fine-tune the model for `num_epochs` passes over the training batches
for epoch in range(num_epochs):
    model.train()  # training mode (dropout layers active)
    total_loss = 0

    progress = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}')
    for batch in progress:
        # The batch keys (input_ids, attention_mask, labels) match the model's
        # keyword arguments, so the batch can be moved and unpacked as a whole.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses over the epoch
    average_loss = total_loss / len(train_dataloader)
    print(f'Average training loss for Epoch {epoch + 1}: {average_loss}')


Epoch 1: 100%|██████████| 1864/1864 [04:29<00:00,  6.91it/s]


Average training loss for Epoch 1: 0.338367285560157


### Evaluation of the fine-tuned model

In [None]:
# Evaluation: gather gold labels and argmax predictions over the whole test set
model.eval()
all_labels, all_predictions = [], []

with torch.no_grad():  # gradients are not needed for inference
    for batch in tqdm(test_dataloader, desc='Evaluating'):
        labels = batch['labels'].to(device)

        logits = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits
        # Index of the larger logit is the predicted class id
        predictions = logits.argmax(dim=1).cpu().numpy()

        all_predictions.extend(predictions)
        all_labels.extend(labels.cpu().numpy())

Evaluating: 100%|██████████| 466/466 [00:23<00:00, 20.26it/s]


In [None]:
from sklearn.metrics import accuracy_score, classification_report
# Calculate accuracy and other metrics.
# Class ids follow EmailDataset's encoding: 0 = 'Safe Email', 1 = 'Phishing Email'.
# Passing labels + target_names makes the report rows readable instead of bare 0/1.
accuracy = accuracy_score(all_labels, all_predictions)
report = classification_report(
    all_labels,
    all_predictions,
    labels=[0, 1],
    target_names=['Safe Email', 'Phishing Email'],
)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)

Accuracy: 0.9428494767909847
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.95      0.95      2209
           1       0.93      0.93      0.93      1518

    accuracy                           0.94      3727
   macro avg       0.94      0.94      0.94      3727
weighted avg       0.94      0.94      0.94      3727



### Save the fine-tuned model

In [None]:
# Write the fine-tuned weights and the tokenizer files into one directory
output_dir = 'fine_tuned_t5_model'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('fine_tuned_t5_model/tokenizer_config.json',
 'fine_tuned_t5_model/special_tokens_map.json',
 'fine_tuned_t5_model/spiece.model',
 'fine_tuned_t5_model/added_tokens.json')

### Test the fine-tuned model on custom input

In [None]:
# Reload the model and tokenizer from the directory they were saved to
saved_dir = 'fine_tuned_t5_model'
loaded_model = T5ForSequenceClassification.from_pretrained(saved_dir)
loaded_tokenizer = T5Tokenizer.from_pretrained(saved_dir)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Example usage for prediction
sample_input = """
URL: http://www.newsisfree.com/click/-5,8304313,1717/
Date: 2002-09-27T08:51:29+01:00[IMG: http://www.newsisfree.com/Images/fark/cbc.ca.gif ([CBC])]

"""

def predict_email_type(email_text, model, tokenizer):
    """Classify a single email as phishing or safe.

    Args:
        email_text: Raw email text to classify.
        model: Sequence-classification model whose output exposes ``.logits``.
            It is switched to eval mode as a side effect.
        tokenizer: Tokenizer matching ``model``; called with ``return_tensors="pt"``.

    Returns:
        ``"Phishing Email"`` when the highest-scoring class is 1, else ``"Safe Email"``.
    """
    # Inference must run in eval mode, otherwise dropout makes predictions random.
    model.eval()
    # Run on whatever device the model lives on, so a model already moved to
    # the GPU works as well as a freshly loaded CPU one.
    device = next(model.parameters()).device

    inputs = tokenizer(email_text, return_tensors="pt", truncation=True, padding=True)
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()

    return "Phishing Email" if predicted_class == 1 else "Safe Email"

predicted_type = predict_email_type(sample_input, loaded_model, loaded_tokenizer)
print(f"Predicted Email Type: {predicted_type}")

Predicted Email Type: Safe Email


### Front-end using Gradio (optional)

In [None]:
# Define the prediction function
import gradio as gr

def classify_email_text(email_text):
    """Gradio callback: classify the textbox content with the reloaded model.

    Uses a name of its own and delegates to the three-argument
    ``predict_email_type`` defined earlier, instead of redefining that function
    with a different signature (which would break re-running the earlier cell).

    Args:
        email_text: Current content of the input textbox.

    Returns:
        ``"Phishing Email"`` or ``"Safe Email"``; an empty string for blank input.
    """
    # With live=True the callback fires on every edit, including clearing the
    # box, and blank text has nothing to classify.
    if not email_text or not email_text.strip():
        return ""
    return predict_email_type(email_text, loaded_model, loaded_tokenizer)

# Create Gradio Interface
iface = gr.Interface(
    fn=classify_email_text,
    inputs=gr.Textbox(),
    outputs=gr.Textbox(),
    live=True,
    title="Email Type Predictor",
    description="Enter an email text and get the predicted email type.",
)

# Launch the interface
iface.launch()

ImportError: ignored