In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# Load the unseen balanced dataset
df_unseen = pd.read_csv('../data/unseen/client_ceas_unseen_balanced.csv')

# Prepare the text data (combining subject and body)
df_unseen['text'] = df_unseen['subject'].fillna('') + ' ' + df_unseen['body'].fillna('')

# Convert the DataFrame to a HuggingFace Dataset
dataset = Dataset.from_pandas(df_unseen)

# Load the tokenizer for TinyBERT
MODEL_NAME = "huawei-noah/TinyBERT_General_4L_312D"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenize function
def tokenize_fn(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

# Apply the tokenization to the dataset
tokenized_dataset = dataset.map(tokenize_fn, batched=True)

# Remove the 'text' column, as it's no longer needed
tokenized_dataset = tokenized_dataset.remove_columns(["text"])

# If 'label' is present, rename to 'labels'
if "label" in tokenized_dataset.column_names:
    tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

# Set the dataset format to torch
tokenized_dataset.set_format("torch")

# Split the dataset into train and test sets
dataset_dict = tokenized_dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
train_dataset = dataset_dict["train"]
val_dataset = dataset_dict["test"]

print("Train size:", len(train_dataset))
print("Validation size:", len(val_dataset))

# You can now proceed with your model training or evaluation using `train_dataset` and `val_dataset`


Map:   0%|          | 0/1666 [00:00<?, ? examples/s]

Train size: 1332
Validation size: 334


In [2]:
# Print the number of columns in the tokenized dataset
print(f"Number of columns in tokenized dataset: {len(tokenized_dataset.column_names)}")

# Print the column names to inspect the tokenized dataset
print(f"Column names in tokenized dataset: {tokenized_dataset.column_names}")

# Print a preview of the tokenized dataset (first few rows)
print(tokenized_dataset[:5])  # Prints the first 5 entries of the dataset

# If you want to save the tokenized dataset to disk as CSV (or another format), you can do it like this:

# Convert the tokenized dataset to a pandas DataFrame for saving
tokenized_df = pd.DataFrame(tokenized_dataset)

# Save to CSV (you can change the path)
save_path = '../data/unseen/tokenized_client_ceas_unseen.csv'
tokenized_df.to_csv(save_path, index=False)
print(f"✅ Tokenized dataset saved to {save_path}")


Number of columns in tokenized dataset: 10
Column names in tokenized dataset: ['sender', 'receiver', 'date', 'subject', 'body', 'labels', 'urls', 'input_ids', 'token_type_ids', 'attention_mask']
{'sender': ['CNN Alerts <frakte_1973@emmeffe.net>', 'CNN Alerts <steffi-egnalnek@sorrentolactalis.com>', 'SpamExperts via Twitter <uiaregi@twitter.com>', '"Astrology.com Daily Horoscope" <dailyhoroscope@astrology.com>', 'CNN Alerts <lessomed@4wcz.tk>'], 'receiver': ['email151@gvc.ceas-challenge.cc', 'email775@gvc.ceas-challenge.cc', 'user2.1@gvc.ceas-challenge.cc', 'gvcormac@gvc.ceas-challenge.cc', 'user4@gvc.ceas-challenge.cc'], 'date': ['Fri, 08 Aug 2008 10:06:51 -0400', 'Fri, 08 Aug 2008 10:04:01 -0400', 'Fri, 08 Aug 2008 12:28:09 +0000', 'Fri, 08 Aug 2008 06:35:35 -0700', 'Fri, 08 Aug 2008 07:36:51 -0400'], 'subject': ['CNN Alerts: My Custom Alert', 'CNN Alerts: My Custom Alert', 'Direct message from SpamExperts via web', 'Astrology.com: Daily Horoscope', 'CNN Alerts: My Custom Alert'], 'bo

In [5]:

# Define the columns to keep
columns_to_keep = ['labels', 'input_ids', 'token_type_ids', 'attention_mask']

# Remove columns that are not needed
df_cleaned = tokenized_dataset.remove_columns([col for col in tokenized_dataset.column_names if col not in columns_to_keep])

# Save the cleaned dataset to a CSV
cleaned_save_path = "../data/unseen/cleaned_tokenized_client_ceas_unseen.csv"
df_cleaned.to_csv(cleaned_save_path, index=False)

print(f"✅ Cleaned and saved tokenized dataset to {cleaned_save_path}")


Creating CSV from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

✅ Cleaned and saved tokenized dataset to ../data/unseen/cleaned_tokenized_client_ceas_unseen.csv


In [10]:
print(df_cleaned[:5])


{'labels': tensor([1, 1, 0, 0, 1]), 'input_ids': tensor([[  101, 13229,  9499,  2015,  1024,  2026,  7661,  9499, 13229,  9499,
          2015,  1024,  2026,  7661,  9499,  9499,  2171,  1024,  2026,  7661,
          9499, 16880, 11449,  2055,  2402,  3057,  1999,  1996,  2149,  2651,
          1012, 10424,  2072,  1010,  1022, 15476,  2263,  2385,  1024,  2861,
          1024,  4229,  1009,  6021,  8889,  2440,  2466,  2017,  2031,  3530,
          2000,  4374,  2023, 10373,  2013, 13229,  1012,  4012,  2004,  1037,
          2765,  1997,  2115, 13229,  1012,  4012, 12157, 10906,  1012,  2000,
          6133,  2115, 10906, 11562,  2182,  1012,  2000, 11477,  2115,  9499,
          9181,  2030,  6075,  2030,  2000,  4895,  6342,  5910, 26775, 20755,
          2013,  4909,  7661, 10373,  9499,  2015,  1010, 11562,  2182,  1012,
          5830,  2739,  2897,  1012,  2028, 13229,  2415,  1010,  5865,  1010,
          4108, 19988,  2692,  2509,  1075,  2263,  5830,  2739,  2897,  1012,
   

In [12]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
import torch
import numpy as np

# Define the model and tokenizer
MODEL_NAME = "huawei-noah/TinyBERT_General_4L_312D"  # TinyBERT model
num_labels = 2  # binary classification (phishing vs legit)

# Load the pre-trained TinyBERT model
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./tinybert_output",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_steps=10  # Logs every 10 steps
)


# Define the metric function for accuracy
from sklearn.metrics import accuracy_score

def compute_metrics(p):
    logits, labels = p
    preds = np.argmax(logits, axis=1)
    return {"accuracy": accuracy_score(labels, preds)}

# Create the Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Training dataset (already tokenized)
    eval_dataset=val_dataset,  # Validation dataset (already tokenized)
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()

# Print the evaluation results (accuracy)
print(f"Evaluation Results: {eval_results}")

# Generate probabilities on the validation dataset
# Set the model to evaluation mode
model.eval()

# Get predictions and probabilities
import torch

def get_probabilities(dataset):
    # Get the logits (raw predictions) from the model
    logits = trainer.predict(dataset).predictions
    # Convert logits to probabilities using softmax
    probabilities = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
    return probabilities

# Get probabilities for the validation set
probabilities = get_probabilities(val_dataset)

# Save the probabilities to a CSV file
probabilities_df = pd.DataFrame(probabilities, columns=["prob_class_0", "prob_class_1"])
probabilities_df["labels"] = val_dataset["labels"]

# Save the probabilities to CSV
probabilities_save_path = "../data/unseen/tinybert_probabilities.csv"
probabilities_df.to_csv(probabilities_save_path, index=False)

print(f"✅ Probabilities saved to {probabilities_save_path}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6417,0.593911,0.952096
2,0.2999,0.229219,0.958084
3,0.1012,0.092338,0.979042


Evaluation Results: {'eval_loss': 0.09233815968036652, 'eval_accuracy': 0.9790419161676647, 'eval_runtime': 0.6168, 'eval_samples_per_second': 541.526, 'eval_steps_per_second': 9.728, 'epoch': 3.0}
✅ Probabilities saved to ../data/unseen/tinybert_probabilities.csv
