In [37]:
import os
import pandas as pd
import numpy as np
import re
import string
import time
import yaml # For config simulation
import shutil # For cleaning up directories
import joblib # For saving sklearn models


In [38]:
# Force reinstall transformers to try and fix environment issues
print("Force reinstalling transformers==4.39.3...")
!pip uninstall transformers -y --quiet
!pip install transformers==4.39.3 --upgrade --no-deps --quiet
print("Reinstall complete.")

# Verify version immediately after reinstall
import transformers
print(f"Version after reinstall: {transformers.__version__}")

# Explicitly import and inspect TrainingArguments
from transformers import TrainingArguments
import inspect
print("\nInspecting imported TrainingArguments signature:")
try:
    sig = inspect.signature(TrainingArguments.__init__)
    print(sig)
except Exception as e:
    print(f"Could not inspect signature: {e}")

Force reinstalling transformers==4.39.3...
Reinstall complete.
Version after reinstall: 4.51.1

Inspecting imported TrainingArguments signature:


In [39]:
# NLTK Imports
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


In [40]:
# Scikit-learn Imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight

In [41]:
# Transformers (Hugging Face) Imports 
import torch
from torch import nn
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer, # Use Auto* classes for flexibility
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)

In [42]:
# Disable W&B logging if not used/configured
os.environ["WANDB_DISABLED"] = "true"
# Set TORCH_USE_CUDA_DSA to avoid potential warning/error on some setups
os.environ["TORCH_USE_CUDA_DSA"] = "1"


In [43]:
#  Configuration (Simulating config.yaml)
CONFIG = {
    'data': {
        'base_path': "/kaggle/input/nlp-data", # Adjust if your dataset name is different
        'enron_path': "Enron.csv",
        'nigerian_path': "Nigerian_Fraud.csv",
        'spamassassin_path': "SpamAssasin.csv",
        'processed_dir': "/kaggle/working/processed_data/",
        'combined_file': "combined_emails.parquet",
        'text_col': 'text', # Final preprocessed text column name
        'label_col': 'label',
        'phishing_label': 1,
        'legitimate_label': 0,
    },
    'preprocessing': {
        'max_seq_length_bert': 256, # Sequence length for BERT
    },
    'feature_engineering': {
        'tfidf': {
            'max_features': 5000,
            'ngram_range': [1, 2], # Uni-grams and Bi-grams
        },
    },
    'training': {
        'test_size': 0.2,
        'random_state': 42,
        'traditional_models': {
            # Defining models and their parameters for sklearn
            'Logistic Regression': {'max_iter': 500, 'solver': 'liblinear'}, # liblinear often good for text
            'Naive Bayes': {},
            'Random Forest': {'n_estimators': 100, 'random_state': 42, 'n_jobs': -1}, # Use all cores
            'SVM (Linear)': {'C': 1.0, 'max_iter': 1500, 'dual': False}, # dual=False when n_samples > n_features
        },
        'bert': {
            # Choose a model: 'bert-base-uncased', 'distilbert-base-uncased' (faster, slightly less accurate)
            'model_name': 'distilbert-base-uncased',
            'output_dir': '/kaggle/working/bert_training_output', # Checkpoints during training
            'log_dir': '/kaggle/working/bert_logs',
            'model_save_dir': '/kaggle/working/bert_model_final', # Final best model
            'num_epochs': 3, # Start with 3, can increase if needed based on eval logs
            'batch_size': 16, # Adjust based on GPU memory
            'learning_rate': 5e-5, # Common starting point for BERT fine-tuning
            'weight_decay': 0.01,
            'eval_strategy': "steps", # Evaluate during training
            'eval_steps': 250, # How often to evaluate (adjust based on data size/epochs)
            'save_steps': 250, # How often to save checkpoints
            'logging_steps': 50, # How often to log training loss
            'save_total_limit': 2, # Keep only the best and the last checkpoint
            'load_best_model_at_end': True, # Crucial: load the best model found during training
            'metric_for_best_model': 'f1', # Use F1-score on eval set to determine the best model
            'fp16': torch.cuda.is_available(), # Enable mixed-precision if GPU is available
        }
    },
    'deployment': {
        # Directory to save artifacts specifically for Streamlit app
        'export_dir': '/kaggle/working/streamlit_deploy/'
    }
}


In [44]:
# Create Necessary Directories
os.makedirs(CONFIG['data']['processed_dir'], exist_ok=True)
os.makedirs(CONFIG['training']['bert']['output_dir'], exist_ok=True)
os.makedirs(CONFIG['training']['bert']['log_dir'], exist_ok=True)
os.makedirs(CONFIG['training']['bert']['model_save_dir'], exist_ok=True)
os.makedirs(CONFIG['deployment']['export_dir'], exist_ok=True)
print("Output directories created/ensured.")


Output directories created/ensured.


In [45]:
import nltk # Ensure nltk is imported

print("Downloading NLTK resources...")
try:
    # Check if the resource exists
    nltk.data.find('corpora/wordnet.zip') # Check for the zip file existence which is more reliable
except LookupError:
    # If it doesn't exist, download it
    print("Downloading 'wordnet'...")
    nltk.download('wordnet', quiet=True)
try:
    nltk.data.find('corpora/stopwords.zip')
except LookupError:
    print("Downloading 'stopwords'...")
    nltk.download('stopwords', quiet=True)
try:
    nltk.data.find('tokenizers/punkt.zip')
except LookupError:
    print("Downloading 'punkt'...")
    nltk.download('punkt', quiet=True)

print("NLTK resources downloaded/verified.")

# These lines should now work fine after the downloads complete (if they were needed)
stop_words = set(nltk.corpus.stopwords.words('english'))
lemmatizer = nltk.stem.WordNetLemmatizer()
print("Stop words and lemmatizer initialized.")

# Verify lemmatizer works (requires wordnet)
try:
    print("Testing lemmatizer:", lemmatizer.lemmatize("cats"))
except Exception as e:
    print(f"Error initializing/testing lemmatizer even after download attempt: {e}")

Downloading NLTK resources...
NLTK resources downloaded/verified.
Stop words and lemmatizer initialized.
Testing lemmatizer: cat


In [46]:
# Preprocessing Functions
def clean_text(text):
    """Cleans text data: lowercase, remove URLs, HTML, emails, punctuation, numbers."""
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", ' ', text) # Remove URLs -> space
    text = re.sub(r'<.*?>', ' ', text) # Remove HTML tags -> space
    text = re.sub(r'\S+@\S+', ' ', text) # Remove emails -> space
    text = re.sub(f"[{re.escape(string.punctuation)}]", ' ', text) # Remove punctuation -> space
    text = re.sub(r'\b\d+\b', ' ', text) # Remove standalone numbers -> space
    text = re.sub(r'\s+', ' ', text).strip() # Normalize whitespace
    return text

def lemmatize_text(text):
    """Tokenizes and lemmatizes text, removing stopwords and short words."""
    tokens = word_tokenize(text)
    lemmatized = [lemmatizer.lemmatize(w) for w in tokens if w.isalpha() and w not in stop_words and len(w) > 1]
    return ' '.join(lemmatized)

def preprocess_dataframe(df, label, text_cols=['subject', 'body']):
    """Standardizes column names, combines text, cleans, and lemmatizes."""
    df = df.copy()
    # Ensure required columns exist, filling with empty strings if not
    for col in text_cols:
        if col not in df.columns:
            df[col] = ''
        else:
            df[col] = df[col].fillna('') # Handle existing NaNs

    # Special handling for DataFrames where the first column might be the body if 'body' is missing
    if 'body' not in df.columns and df.shape[1] > 0:
        potential_body_col = df.columns[0]
        # Heuristic: assume it's the body if it's not 'subject' or another common ID column
        if potential_body_col not in ['subject', 'id', 'label', 'file']:
             print(f"Warning: 'body' column missing. Using first column ('{potential_body_col}') as body.")
             df['body'] = df[potential_body_col].fillna('')
             if 'body' not in text_cols: text_cols.append('body')

    # Combine specified text columns
    df['combined_text'] = df[text_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

    df[CONFIG['data']['label_col']] = label
    df['clean_text'] = df['combined_text'].apply(clean_text)
    # Apply lemmatization to the cleaned text
    df[CONFIG['data']['text_col']] = df['clean_text'].apply(lemmatize_text)

    # Select and return only the essential final columns
    df_final = df[[CONFIG['data']['text_col'], CONFIG['data']['label_col']]]
    return df_final


In [47]:
# Data Loading and Caching Function
def load_and_process_data(config):
    """Loads raw data, processes it, combines, shuffles, and caches the result."""
    processed_file_path = os.path.join(config['data']['processed_dir'], config['data']['combined_file'])

    if os.path.exists(processed_file_path):
        print(f"Loading cached processed data from: {processed_file_path}")
        try:
            return pd.read_parquet(processed_file_path)
        except Exception as e:
            print(f"Error loading cached file: {e}. Reprocessing...")

    print("Processing data from scratch...")
    datasets_info = [
        (config['data']['enron_path'], config['data']['legitimate_label']),
        (config['data']['nigerian_path'], config['data']['phishing_label']),
        (config['data']['spamassassin_path'], config['data']['phishing_label']),
    ]

    all_dfs = []
    base_path = config['data']['base_path']
    for file_name, label in datasets_info:
        path = os.path.join(base_path, file_name)
        print(f"--> Processing {file_name}...")
        try:
            # Try reading with utf-8 first, fallback to latin-1
            try:
                 raw_df = pd.read_csv(path, on_bad_lines='warn') # 'warn' helps identify issues
            except UnicodeDecodeError:
                 print(f"    UTF-8 failed for {file_name}, trying latin-1...")
                 raw_df = pd.read_csv(path, on_bad_lines='warn', encoding='latin-1')

            if raw_df.empty:
                print(f"    Warning: {file_name} loaded as empty.")
                continue

            processed_df = preprocess_dataframe(raw_df, label)
            all_dfs.append(processed_df)
            print(f"    Processed {len(processed_df)} rows.")
        except FileNotFoundError:
            print(f"    Error: File not found at {path}")
        except Exception as e:
            print(f"    Error processing {file_name}: {e}")

    if not all_dfs:
        raise ValueError("FATAL: No data could be loaded or processed.")

    print("Combining processed datasets...")
    combined_df = pd.concat(all_dfs, ignore_index=True)
    # Drop rows where the processed text or label is missing/empty AFTER processing
    combined_df.dropna(subset=[config['data']['text_col'], config['data']['label_col']], inplace=True)
    combined_df = combined_df[combined_df[config['data']['text_col']].str.strip().astype(bool)] # Ensure text isn't just whitespace

    print(f"Combined data shape before shuffling: {combined_df.shape}")
    combined_df = combined_df.sample(frac=1, random_state=config['training']['random_state']).reset_index(drop=True)
    print(f"Combined data shape after shuffling: {combined_df.shape}")


    print(f"Saving combined processed data to: {processed_file_path}")
    combined_df.to_parquet(processed_file_path, index=False)

    return combined_df


In [48]:
# Execute Data Loading and Preprocessing
print("="*20 + " Step 1: Load and Preprocess Data " + "="*20)
combined_df = load_and_process_data(CONFIG)
print(f"\nFinal data shape: {combined_df.shape}")
print("\nLabel distribution:")
print(combined_df[CONFIG['data']['label_col']].value_counts(normalize=True))
print("\nSample data (processed text):")
print(combined_df[[CONFIG['data']['text_col'], CONFIG['data']['label_col']]].head())

Loading cached processed data from: /kaggle/working/processed_data/combined_emails.parquet

Final data shape: (38905, 2)

Label distribution:
label
0    0.765069
1    0.234931
Name: proportion, dtype: float64

Sample data (processed text):
                                                text  label
0  get discount vlagra without prescription bodyt...      0
1  satalk two rule suggestion fri jul justin maso...      1
2  title dear mr kaminski hope got title right lo...      0
3  continental power officialisation update curre...      0
4  capture seventy natural solution fuller firmer...      0


In [49]:
#  Train-Test Split 
print("\n" + "="*20 + " Step 2: Train-Test Split " + "="*20)
texts = combined_df[CONFIG['data']['text_col']].tolist()
labels = combined_df[CONFIG['data']['label_col']].astype(int).tolist() # Ensure labels are integers

train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels,
    test_size=CONFIG['training']['test_size'],
    random_state=CONFIG['training']['random_state'],
    stratify=labels # Maintain class proportion in train/test sets
)
print(f"Train size: {len(train_texts)}, Test size: {len(test_texts)}")
print(f"Train label distribution: {np.mean(train_labels):.2%}") # Rough check for stratification
print(f"Test label distribution: {np.mean(test_labels):.2%}")



Train size: 31124, Test size: 7781
Train label distribution: 23.49%
Test label distribution: 23.49%


In [50]:
# Feature Engineering (TF-IDF) 
print("\n" + "="*20 + " Step 3: TF-IDF Feature Engineering " + "="*20)

print("Creating TF-IDF features...")
tfidf_vectorizer = TfidfVectorizer(
    max_features=CONFIG['feature_engineering']['tfidf']['max_features'],
    ngram_range=tuple(CONFIG['feature_engineering']['tfidf']['ngram_range'])
)

# Fit on training data ONLY, transform both train and test
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
X_test_tfidf = tfidf_vectorizer.transform(test_texts)

print(f"TF-IDF Train Matrix shape: {X_train_tfidf.shape}")
print(f"TF-IDF Test Matrix shape: {X_test_tfidf.shape}")

# Save the TF-IDF Vectorizer for deployment
tfidf_save_path = os.path.join(CONFIG['deployment']['export_dir'], 'tfidf_vectorizer.joblib')
joblib.dump(tfidf_vectorizer, tfidf_save_path)
print(f"\nTF-IDF Vectorizer saved to: {tfidf_save_path}")



Creating TF-IDF features...
TF-IDF Train Matrix shape: (31124, 5000)
TF-IDF Test Matrix shape: (7781, 5000)

TF-IDF Vectorizer saved to: /kaggle/working/streamlit_deploy/tfidf_vectorizer.joblib


In [51]:
# Traditional Model Training 
print("\n" + "="*20 + " Step 4: Train Traditional Models " + "="*20)

traditional_results = {}
traditional_models = {}

for name, params in CONFIG['training']['traditional_models'].items():
    print(f"\n--- Training: {name} ---")
    start_time = time.time()

    # Select model based on name
    if name == "Logistic Regression":
        model = LogisticRegression(**params, random_state=CONFIG['training']['random_state'])
    elif name == "Naive Bayes":
        model = MultinomialNB(**params)
    elif name == "Random Forest":
        model = RandomForestClassifier(**params)
    elif name == "SVM (Linear)":
        # LinearSVC is often faster for text classification than SVC(kernel='linear')
        model = LinearSVC(**params, random_state=CONFIG['training']['random_state'])
    else:
        print(f"Warning: Unknown model type '{name}' defined in config. Skipping.")
        continue

    # Train the model
    model.fit(X_train_tfidf, train_labels)

    # Predict on the test set
    y_pred = model.predict(X_test_tfidf)
    end_time = time.time()

    # Evaluate
    acc = accuracy_score(test_labels, y_pred)
    report_dict = classification_report(test_labels, y_pred, output_dict=True, zero_division=0)
    report_str = classification_report(test_labels, y_pred, zero_division=0)
    cm = confusion_matrix(test_labels, y_pred)

    print(f"Accuracy: {acc:.4f}")
    print("Classification Report:")
    print(report_str)
    print("Confusion Matrix:")
    print(cm)
    print(f"Training/Prediction Time: {end_time - start_time:.2f} seconds")

    # Store results and the trained model
    traditional_results[name] = {
        'accuracy': acc,
        'precision': report_dict['weighted avg']['precision'],
        'recall': report_dict['weighted avg']['recall'],
        'f1-score': report_dict['weighted avg']['f1-score'],
        'report_dict': report_dict,
        'cm': cm,
        'time': end_time - start_time
    }
    traditional_models[name] = model

# Find and Save the Best Traditional Model
if traditional_results:
    best_trad_model_name = max(traditional_results, key=lambda k: traditional_results[k]['f1-score'])
    best_trad_model = traditional_models[best_trad_model_name]
    print(f"\nBest Traditional Model (by F1-score): {best_trad_model_name}")

    trad_model_save_path = os.path.join(CONFIG['deployment']['export_dir'], f'traditional_model_{best_trad_model_name.replace(" ", "_").replace("(","_").replace(")","").replace(".","")}.joblib')
    joblib.dump(best_trad_model, trad_model_save_path)
    print(f"Best Traditional Model saved to: {trad_model_save_path}")
else:
    print("\nNo traditional models were trained successfully.")




--- Training: Logistic Regression ---
Accuracy: 0.9477
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97      5953
           1       0.89      0.88      0.89      1828

    accuracy                           0.95      7781
   macro avg       0.93      0.93      0.93      7781
weighted avg       0.95      0.95      0.95      7781

Confusion Matrix:
[[5758  195]
 [ 212 1616]]
Training/Prediction Time: 0.41 seconds

--- Training: Naive Bayes ---
Accuracy: 0.9239
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      5953
           1       0.87      0.79      0.83      1828

    accuracy                           0.92      7781
   macro avg       0.90      0.88      0.89      7781
weighted avg       0.92      0.92      0.92      7781

Confusion Matrix:
[[5736  217]
 [ 375 1453]]
Training/Prediction Time: 0.03 seconds

--- Training: Random Fores

In [64]:
# ## Step 5: BERT Fine-tuning (Manual PyTorch Loop)
#
# This section implements the fine-tuning process using a standard PyTorch loop,
# bypassing the Hugging Face `Trainer`'s problematic arguments in this environment.
# This allows for evaluation during training, early stopping, and saving the best model based on F1-score.

# 5.1 Imports for Manual Loop
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW  # ← Correct: Import AdamW from torch.optim
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import numpy as np
import os
import time
import shutil

In [65]:
# 5.2 Manual Loop Configuration
# Using values from the CONFIG dictionary where applicable
bert_config = CONFIG['training']['bert']
data_config = CONFIG['data']
preproc_config = CONFIG['preprocessing']
deploy_config = CONFIG['deployment']

MODEL_NAME = bert_config['model_name']
MAX_LEN = preproc_config['max_seq_length_bert']
BATCH_SIZE = bert_config['batch_size']
EPOCHS = bert_config['num_epochs']
LEARNING_RATE = bert_config['learning_rate']

# Path to save the best performing model during training
BEST_MODEL_SAVE_PATH = "/kaggle/working/best_bert_model.bin"
# Directory to save the final best model for deployment
FINAL_MODEL_SAVE_DIR = deploy_config['export_dir'] # Reuse the deployment dir

# Early stopping parameters
EARLY_STOPPING_PATIENCE = 3 # Stop after 3 epochs with no improvement
MIN_DELTA = 0.001 # Minimum change in F1-score to qualify as improvement


In [66]:
# 5.3 Load Tokenizer and Model
print(f"Loading Tokenizer and Model: {MODEL_NAME}")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Ensure num_labels is correct for your phishing task (Binary: 0 or 1)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
except Exception as e:
    print(f"Error loading model/tokenizer: {e}")
    raise e

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")


Loading Tokenizer and Model: distilbert-base-uncased


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda


In [67]:
# 5.4 Create Dataset and DataLoaders
# Reusing the EmailDataset class defined earlier
class EmailDataset(Dataset):
    """PyTorch Dataset for BERT-like models."""
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False, # Not needed for BERT/DistilBERT classification
            padding='max_length',        # Pad to max_length
            truncation=True,             # Truncate to max_length
            return_attention_mask=True,
            return_tensors='pt',         # Return PyTorch tensors
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

print("Creating datasets...")
train_dataset = EmailDataset(train_texts, train_labels, tokenizer, MAX_LEN)
test_dataset = EmailDataset(test_texts, test_labels, tokenizer, MAX_LEN)

print("Creating dataloaders...")
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2) # Use num_workers if available
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE * 2, shuffle=False, num_workers=2) # Larger batch size for eval


Creating datasets...
Creating dataloaders...


In [68]:
# 5.5 Optimizer and Scheduler
print("Setting up optimizer and learning rate scheduler...")
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8)

# Calculate total training steps
total_steps = len(train_loader) * EPOCHS

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 100, # Number of steps for warmup phase
                                            num_training_steps = total_steps)


Setting up optimizer and learning rate scheduler...


In [70]:
# 5.6 Training and Evaluation Loop

print("\n" + "="*20 + " Starting Training and Evaluation Loop " + "="*20)

best_eval_f1 = -1.0 # Initialize best F1 score
epochs_no_improve = 0 # Counter for early stopping

for epoch in range(EPOCHS):
    start_time_epoch = time.time()
    print(f"\n Epoch {epoch + 1}/{EPOCHS} ")

    # --- Training Phase ---
    model.train() # Set model to training mode
    total_train_loss = 0

    for batch_num, batch in enumerate(train_loader):
        # Move batch to GPU
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear previous gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        # Get loss
        loss = outputs.loss
        total_train_loss += loss.item()

        # Backward pass
        loss.backward()

        # Clip gradients to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters
        optimizer.step()
        # Update learning rate schedule
        scheduler.step()

        # Print progress (optional)
        if (batch_num + 1) % 100 == 0:
             print(f'  Batch {batch_num + 1}/{len(train_loader)} processed. Current Loss: {loss.item():.4f}')

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"\n  Average Training Loss: {avg_train_loss:.4f}")

    # --- Evaluation Phase ---
    model.eval() # Set model to evaluation mode
    total_eval_loss = 0
    all_preds = []
    all_labels = []

    print("  Starting evaluation on test set...")
    with torch.no_grad(): # Disable gradient calculations for evaluation
        for batch in test_loader:
            # Move batch to GPU
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            # Accumulate validation loss
            total_eval_loss += outputs.loss.item()

            # Get predictions (logits)
            logits = outputs.logits
            # Move logits and labels to CPU and convert to numpy
            preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
            labels_cpu = labels.detach().cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels_cpu)

    avg_eval_loss = total_eval_loss / len(test_loader)

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    # Use weighted F1 for potential class imbalance
    eval_f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)
    report = classification_report(all_labels, all_preds, zero_division=0)
    cm = confusion_matrix(all_labels, all_preds)

    epoch_time = time.time() - start_time_epoch
    print(f"  Average Validation Loss: {avg_eval_loss:.4f}")
    print(f"  Validation Accuracy: {accuracy:.4f}")
    print(f"  Validation F1-score (Weighted): {eval_f1:.4f}")
    print(f"  Epoch completed in: {epoch_time:.2f} seconds")
    # print("\n  Classification Report (Test Set):") # Optional: print full report each epoch
    # print(report)

    # --- Early Stopping & Best Model Check ---
    if eval_f1 > best_eval_f1 + MIN_DELTA:
        print(f"  Validation F1 improved ({best_eval_f1:.4f} --> {eval_f1:.4f}). Saving model...")
        torch.save(model.state_dict(), BEST_MODEL_SAVE_PATH)
        best_eval_f1 = eval_f1
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        print(f"  Validation F1 did not improve substantially. ({epochs_no_improve}/{EARLY_STOPPING_PATIENCE})")

    if epochs_no_improve >= EARLY_STOPPING_PATIENCE:
        print(f"\nEarly stopping triggered after {epoch + 1} epochs.")
        break




 Epoch 1/3 
  Batch 100/1946 processed. Current Loss: 0.2140
  Batch 200/1946 processed. Current Loss: 0.2192
  Batch 300/1946 processed. Current Loss: 0.0372
  Batch 400/1946 processed. Current Loss: 0.0381
  Batch 500/1946 processed. Current Loss: 0.2723
  Batch 600/1946 processed. Current Loss: 0.0147
  Batch 700/1946 processed. Current Loss: 0.0093
  Batch 800/1946 processed. Current Loss: 0.0190
  Batch 900/1946 processed. Current Loss: 0.1283
  Batch 1000/1946 processed. Current Loss: 0.0101
  Batch 1100/1946 processed. Current Loss: 0.0576
  Batch 1200/1946 processed. Current Loss: 0.1115
  Batch 1300/1946 processed. Current Loss: 0.1612
  Batch 1400/1946 processed. Current Loss: 0.1296
  Batch 1500/1946 processed. Current Loss: 0.1234
  Batch 1600/1946 processed. Current Loss: 0.2653
  Batch 1700/1946 processed. Current Loss: 0.0455
  Batch 1800/1946 processed. Current Loss: 0.0246
  Batch 1900/1946 processed. Current Loss: 0.0063

  Average Training Loss: 0.1550
  Starting e

In [71]:
# 5.7 Load the Best Model
# Load the state dict of the best performing model saved during training
print(f"\nLoading best model state from: {BEST_MODEL_SAVE_PATH}")
try:
    model.load_state_dict(torch.load(BEST_MODEL_SAVE_PATH))
    print("Best model loaded successfully.")
except FileNotFoundError:
    print("Warning: Best model file not found. Using the model state from the last epoch.")
except Exception as e:
    print(f"Error loading best model state: {e}. Using the model state from the last epoch.")

# %% [code]
# 5.8 Final Evaluation of the Best Model
# Run evaluation one last time with the loaded best model to get final metrics

print("\n" + "="*10 + " Final Evaluation of Best Model " + "="*10)

model.eval() # Ensure model is in evaluation mode
final_preds = []
final_labels = []
final_eval_loss = 0

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        final_eval_loss += outputs.loss.item()
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
        labels_cpu = labels.detach().cpu().numpy()

        final_preds.extend(preds)
        final_labels.extend(labels_cpu)

# Calculate final metrics
final_avg_loss = final_eval_loss / len(test_loader)
final_accuracy = accuracy_score(final_labels, final_preds)
final_f1 = f1_score(final_labels, final_preds, average='weighted', zero_division=0)
final_report = classification_report(final_labels, final_preds, zero_division=0, target_names=['Legitimate (0)', 'Phishing (1)']) # Adjust target names if needed
final_cm = confusion_matrix(final_labels, final_preds)

print(f"\nFinal Evaluation Metrics (Best Model on Test Set):")
print(f"  Average Loss: {final_avg_loss:.4f}")
print(f"  Accuracy: {final_accuracy:.4f}")
print(f"  F1-score (Weighted): {final_f1:.4f}")
print("\nClassification Report (Best Model):")
print(final_report)
print("\nConfusion Matrix (Best Model):")
print(final_cm)



Loading best model state from: /kaggle/working/best_bert_model.bin
Best model loaded successfully.



  model.load_state_dict(torch.load(BEST_MODEL_SAVE_PATH))



Final Evaluation Metrics (Best Model on Test Set):
  Average Loss: 0.1285
  Accuracy: 0.9641
  F1-score (Weighted): 0.9646

Classification Report (Best Model):
                precision    recall  f1-score   support

Legitimate (0)       0.99      0.97      0.98      5953
  Phishing (1)       0.89      0.96      0.93      1828

      accuracy                           0.96      7781
     macro avg       0.94      0.96      0.95      7781
  weighted avg       0.97      0.96      0.96      7781


Confusion Matrix (Best Model):
[[5747  206]
 [  73 1755]]


In [72]:
# 5.9 Save Final BERT Model and Tokenizer for Deployment
# Save the BEST model (currently loaded) and the tokenizer

bert_deploy_path = os.path.join(FINAL_MODEL_SAVE_DIR, 'bert_model')
print(f"\nSaving final best BERT model and tokenizer for deployment to: {bert_deploy_path}")

try:
    # Ensure the target directory exists and is empty for a clean save
    if os.path.exists(bert_deploy_path):
        print(f"  Removing existing deployment directory: {bert_deploy_path}")
        shutil.rmtree(bert_deploy_path)
    os.makedirs(bert_deploy_path, exist_ok=True)

    # Use Hugging Face's save_pretrained method for both model and tokenizer
    model.save_pretrained(bert_deploy_path)
    tokenizer.save_pretrained(bert_deploy_path)
    print("Best model and tokenizer saved successfully for deployment.")

except Exception as e:
    print(f"\nError saving final model/tokenizer for deployment: {e}")
    import traceback
    traceback.print_exc()



Saving final best BERT model and tokenizer for deployment to: /kaggle/working/streamlit_deploy/bert_model
Best model and tokenizer saved successfully for deployment.


In [74]:
import os
import shutil
import time

print("\n" + "="*20 + " Archiving /kaggle/working/ " + "="*20)

# Define the directory to archive and the name of the output zip file
source_dir = "/kaggle/working/"
# Place the archive file directly inside /kaggle/working/ for easy UI access
output_filename = os.path.join(source_dir, "kaggle_working_archive") # shutil adds .zip automatically

try:
    print(f"Attempting to create archive: {output_filename}.zip")
    # Create the zip archive. shutil.make_archive handles finding files within source_dir.
    # We specify source_dir as the root_dir to get the contents directly,
    # rather than a folder named 'working' inside the zip.
    shutil.make_archive(base_name=output_filename,
                        format='zip',
                        root_dir=source_dir)

    print(f"\nSuccessfully created archive: {output_filename}.zip")
    print("\n--- DOWNLOAD INSTRUCTIONS ---")
    print("1. Go to the 'Data' tab in the right-hand panel of this notebook.")
    print("2. Navigate to the 'Output' section, then '/kaggle/working/'.")
    print(f"3. Find the file named 'kaggle_working_archive.zip'.")
    print("4. Click the three dots (...) next to the file and select 'Download'.")
    print("5. Unzip this file on your local machine to get all contents, including the 'streamlit_deploy' folder.")
    print("-----------------------------")

except Exception as e:
    print(f"\n--- ERROR creating archive ---")
    print(f"An error occurred: {e}")
    import traceback
    traceback.print_exc()
    print("-----------------------------")
    print("Please try downloading the 'streamlit_deploy' folder manually from the Output section.")


Attempting to create archive: /kaggle/working/kaggle_working_archive.zip

--- ERROR creating archive ---
An error occurred: [Errno 28] No space left on device
-----------------------------
Please try downloading the 'streamlit_deploy' folder manually from the Output section.


Traceback (most recent call last):
  File "/usr/lib/python3.11/zipfile.py", line 1815, in write
    shutil.copyfileobj(src, dest, 1024*8)
  File "/usr/lib/python3.11/shutil.py", line 200, in copyfileobj
    fdst_write(buf)
  File "/usr/lib/python3.11/zipfile.py", line 1180, in write
    self._fileobj.write(data)
OSError: [Errno 28] No space left on device

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.11/shutil.py", line 1046, in _make_zipfile
    zf.write(path, arcname)
  File "/usr/lib/python3.11/zipfile.py", line 1814, in write
    with open(filename, "rb") as src, self.open(zinfo, 'w') as dest:
  File "/usr/lib/python3.11/zipfile.py", line 1201, in close
    raise RuntimeError("File size too large, try using force_zip64")
RuntimeError: File size too large, try using force_zip64

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/py

In [75]:
# --- Cleanup Large Non-Essential Directories and Failed Archives ---
import os
import shutil
import glob # To find zip files

print("Starting cleanup of /kaggle/working/ ...")

# Directories to remove (Modify if your paths were different)
dirs_to_remove = [
    "/kaggle/working/bert_training_output/",  # Intermediate training checkpoints
    "/kaggle/working/processed_data/",       # Processed data cache
    "/kaggle/working/bert_logs/",             # Training logs
    "/kaggle/working/bert_model_final/"       # Final model before copy (assuming streamlit_deploy/bert_model is complete)
]

# Files/Patterns to remove (Failed archives)
files_to_remove_patterns = [
    "/kaggle/working/*.zip"                   # Any zip file in /kaggle/working/
]

# --- Remove Directories ---
print("\nAttempting to remove directories:")
for dir_path in dirs_to_remove:
    if os.path.exists(dir_path) and os.path.isdir(dir_path):
        try:
            print(f"  Removing directory: {dir_path}")
            shutil.rmtree(dir_path)
            print(f"  Successfully removed.")
        except Exception as e:
            print(f"  ERROR removing {dir_path}: {e}")
            print("    Might need manual removal via UI if possible, or ignore if download works.")
    else:
        print(f"  Directory not found, skipping: {dir_path}")

# --- Remove Files matching patterns ---
print("\nAttempting to remove files:")
for pattern in files_to_remove_patterns:
    found_files = glob.glob(pattern)
    if not found_files:
        print(f"  No files found matching pattern: {pattern}")
        continue
    for file_path in found_files:
        if os.path.exists(file_path) and os.path.isfile(file_path):
            try:
                print(f"  Removing file: {file_path}")
                file_size = os.path.getsize(file_path) / (1024**2) # Size in MiB
                os.remove(file_path)
                print(f"  Successfully removed (approx {file_size:.1f} MiB).")
            except Exception as e:
                print(f"  ERROR removing {file_path}: {e}")
        else:
             print(f"  File not found or is not a file, skipping: {file_path}")


print("\nCleanup attempt finished.")
print("Please verify the contents of '/kaggle/working/streamlit_deploy/' via the UI.")
print("Proceed with MANUAL download of the required files from '/kaggle/working/streamlit_deploy/'.")

Starting cleanup of /kaggle/working/ ...

Attempting to remove directories:
  Removing directory: /kaggle/working/bert_training_output/
  Successfully removed.
  Removing directory: /kaggle/working/processed_data/
  Successfully removed.
  Removing directory: /kaggle/working/bert_logs/
  Successfully removed.
  Removing directory: /kaggle/working/bert_model_final/
  Successfully removed.

Attempting to remove files:
  Removing file: /kaggle/working/kaggle_working_archive.zip
  Successfully removed (approx 19408.5 MiB).

Cleanup attempt finished.
Please verify the contents of '/kaggle/working/streamlit_deploy/' via the UI.
Proceed with MANUAL download of the required files from '/kaggle/working/streamlit_deploy/'.
