In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import joblib
from lime.lime_text import LimeTextExplainer

In [None]:
# Parquet dataset handed to load_and_prep_data(); that function reads its
# 'text' and 'label' columns.
DATA_PATH = './email_dataset_final.parquet'  # Replace with your actual path
# Destination file for the trained pipeline written by joblib.dump() in the main cell.
MODEL_PATH = 'phishing_model.pkl'

In [None]:
##################################################
## @brief   Loads the dataset and performs a stratified train/test split so
##          that both partitions keep the class ratios of the full data.
##          Rows whose text or label is missing are dropped before the split;
##          otherwise they only fail later, when the vectorizer is fitted.
## @in string           path            Parquet file to read
## @in string           text_col        column holding the message text (default 'text')
## @in string           label_col       column holding the class label (default 'label')
## @in float            test_size       fraction of rows held out for testing (default 0.2)
## @in int              random_state    seed for the shuffled split (default 42)
## @out X_train, X_test, y_train, y_test
## @raises KeyError     when text_col or label_col is not a column of the file
##################################################
def load_and_prep_data(path, text_col='text', label_col='label',
                       test_size=0.2, random_state=42):
    print("[...] Loading Data")
    df = pd.read_parquet(path)

    # Fail early with a message that names the file and the absent column(s).
    missing = [col for col in (text_col, label_col) if col not in df.columns]
    if missing:
        raise KeyError(f"Dataset at {path!r} is missing required column(s): {missing}")

    # Null documents cannot be vectorized and null labels cannot be stratified on.
    usable = df.dropna(subset=[text_col, label_col])
    dropped = len(df) - len(usable)
    if dropped:
        print(f"[!] Dropped {dropped} row(s) with missing text or label")

    X = usable[text_col]
    y = usable[label_col]

    return train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

In [None]:
##################################################
## @brief   Assembles the (unfitted) classification pipeline: a TF-IDF
##          vectorizer step named 'tfidf' followed by a Random Forest
##          step named 'rf'
## @in none
## @out Pipeline object   pipeline
##################################################
def build_pipeline():
    print("[...] Building Pipeline")

    # Text -> features: English stop-word list, 1- and 2-grams, max_features=5000.
    vectorizer = TfidfVectorizer(
        stop_words='english',
        max_features=5000,
        ngram_range=(1, 2),
    )

    # Features -> label: 100 estimators, 'balanced' class weights, fixed seed.
    forest = RandomForestClassifier(
        n_estimators=100,
        class_weight='balanced',
        n_jobs=-1,
        random_state=42,
    )

    steps = [('tfidf', vectorizer), ('rf', forest)]
    return Pipeline(steps)

In [None]:
##################################################
## @brief     Fits the pipeline on the training split, predicts the test
##            split, and prints a classification report and confusion matrix
## @in Pipeline object    pipeline            object providing fit() and predict()
## @in array-likes        X_train, X_test     inputs for fit() / predict()
## @in array-likes        y_train, y_test     labels for fit() / the metrics
## @out Trained Pipeline object     pipeline (the same object that was passed in)
##################################################
def train_and_evaluate(pipeline, X_train, X_test, y_train, y_test):
    print("[...] Training Model")
    pipeline.fit(X_train, y_train)

    print("[...] Evaluating Model")
    predictions = pipeline.predict(X_test)

    # Each metric is computed lazily, only after its heading has been printed.
    sections = (
        ("\n--- Classification Report ---",
         lambda: classification_report(y_test, predictions, target_names=['Ham', 'Phishing'])),
        ("\n--- Confusion Matrix ---",
         lambda: confusion_matrix(y_test, predictions)),
    )
    for heading, compute in sections:
        print(heading)
        print(compute())

    return pipeline

In [None]:
##################################################
## @brief     Uses LIME to explain why a specific email was flagged and
##            prints each reported token with its weight.
## @in Pipeline object   pipeline         model exposing predict_proba
## @in string            text_instance    email text to explain
## @in int               num_features     number of tokens to report (default 6)
## @in int               random_state     seed for the LIME explainer (default 42)
## @out void
##################################################
def explain_prediction(pipeline, text_instance, num_features=6, random_state=42):
    print(f"\n[...] Explaining prediction for: '{text_instance[:50]}...'")

    # LIME samples random perturbations of the text; an unseeded explainer
    # therefore prints different weights on every run. Seeding it keeps the
    # notebook reproducible under Restart & Run All.
    # NOTE(review): class_names assumes the model's class order is
    # (ham, phishing) -- confirm against pipeline.classes_.
    explainer = LimeTextExplainer(
        class_names=['Ham', 'Phishing'],
        random_state=random_state,
    )

    exp = explainer.explain_instance(
        text_instance,
        pipeline.predict_proba,
        num_features=num_features
    )

    print("Feature Importance (Positive = Phishing, Negative = Safe):")
    for feature, weight in exp.as_list():
        print(f"  {feature:<20} : {weight:.4f}")

In [None]:
# --- MAIN EXECUTION ---
# Runs the whole workflow in order: load/split, build, train/evaluate, save, explain.
if __name__ == "__main__":
    # 1. Prepare: read the dataset at DATA_PATH and split it into train/test sets
    X_train, X_test, y_train, y_test = load_and_prep_data(DATA_PATH)
    
    # 2. Build: create the TF-IDF + Random Forest pipeline (not yet fitted)
    model_pipeline = build_pipeline()
    
    # 3. Train: fit on the training split and print the test-set metrics;
    #    `model` refers to the same pipeline object that was passed in
    model = train_and_evaluate(model_pipeline, X_train, X_test, y_train, y_test)
    
    # 4. Save (Crucial for moving to API later)
    #    NOTE(review): the .pkl is presumably a pickle-based file -- only load it
    #    back from a trusted location.
    joblib.dump(model, MODEL_PATH)
    print(f"\n[+] Model saved to {MODEL_PATH}")
    
    # 5. Explainability Test: run the LIME explanation on a hand-written sample message
    test_email = "URGENT: Verify your bank account details now to avoid suspension."
    explain_prediction(model, test_email)