In [None]:
import time
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import (
    average_precision_score, f1_score, recall_score, confusion_matrix
)

# ==========================================
# USER CONFIGURATION
# ==========================================
# Please update these paths before running.
# Every entry is a CSV file loaded with pd.read_csv by the matching task
# function: T1_* by run_task_t1, T2_* by run_task_t2, T3_* by run_task_t3.
PATHS = {
    "T1_TRAIN": "/path/to/train_phish_email_list.csv",
    "T1_TEST":  "/path/to/test_phish_email_list.csv",
    "T2_TRAIN": "/path/to/train_malicious_phish.csv",
    "T2_TEST":  "/path/to/test_malicious_phish.csv",
    "T3_TRAIN": "/path/to/train_malicious_phish_multi.csv",
    "T3_TEST":  "/path/to/test_malicious_phish_multi.csv"
}

# One training run is performed per seed in each task. The seed is passed as
# random_state to both the 80% stratified subsample and LogisticRegression.
SEEDS = [0, 1, 2]

# ==========================================
# UTILITY FUNCTIONS
# ==========================================
def print_latex_stats(results_dict, metric_names):
    """Print the mean and standard deviation of each metric as LaTeX math.

    Args:
        results_dict: Mapping of metric name to the list of per-run values.
            It is converted to a DataFrame, so the lists must be equally long.
        metric_names: Keys of ``results_dict`` to report, in output order.
            Keys not listed here are ignored.
    """
    rule = "-" * 60
    print(rule)
    print("=== Final Aggregated Results (LaTeX Format) ===")

    runs = pd.DataFrame(results_dict)
    for name in metric_names:
        scores = runs[name]
        # Series.std() is the sample standard deviation (pandas default ddof=1).
        avg, spread = scores.mean(), scores.std()
        print(f"{name:<15}: ${avg:.4f} \\pm {spread:.4f}$")

    print(rule + "\n")

# ==========================================
# TASK T1: Phishing Email (Binary)
# ==========================================
def run_task_t1():
    """Train and evaluate the T1 phishing-email binary baseline.

    Loads the CSVs at ``PATHS["T1_TRAIN"]`` / ``PATHS["T1_TEST"]``, builds
    word-level TF-IDF features from the ``data`` column and, for every seed in
    ``SEEDS``, fits a liblinear ``LogisticRegression`` on a stratified 80%
    subsample of the training rows. Per-seed confusion matrix, latency, AUPRC,
    F1 and recall are printed, followed by the aggregated summary produced by
    ``print_latex_stats``.

    Raises:
        FileNotFoundError: If either CSV path does not exist.
        KeyError: If the ``data`` or ``label`` column is missing.
    """
    print(f"\n{'='*20} STARTING TASK T1 (Phishing Email) {'='*20}")

    # 1. Load Data
    print("Loading T1 data...")
    df_train = pd.read_csv(PATHS["T1_TRAIN"])
    df_test = pd.read_csv(PATHS["T1_TEST"])

    # Missing texts become empty strings. The result of fillna() is bound
    # directly rather than calling fillna(inplace=True) on a column selection:
    # that chained form acts on an intermediate object (pandas warns about it
    # and it has no effect under Copy-on-Write), which would leave NaN values
    # for the vectorizer to choke on.
    X_train_raw = df_train['data'].fillna('')
    y_train = df_train['label']
    X_test_raw = df_test['data'].fillna('')
    y_test = df_test['label'].values

    # 2. Vectorization (Word-level)
    print("Vectorizing (Word TF-IDF)...")
    vectorizer = TfidfVectorizer(
        max_features=15000, stop_words='english',
        ngram_range=(1, 2), sublinear_tf=True
    )
    # The vocabulary is learned from the training texts only.
    X_train_tfidf = vectorizer.fit_transform(X_train_raw)
    X_test_tfidf = vectorizer.transform(X_test_raw)

    # 3. Training Loop
    # 'Latency (ms)' is recorded per seed but is not part of the LaTeX summary.
    history = {'AUPRC': [], 'F1-score': [], 'Recall': [], 'Latency (ms)': []}

    for seed in SEEDS:
        print(f"Training Seed {seed} (Subsampling 80%)...")
        # Subsample training data to introduce variance
        X_sub, _, y_sub, _ = train_test_split(
            X_train_tfidf, y_train, train_size=0.8, random_state=seed, stratify=y_train
        )

        model = LogisticRegression(solver='liblinear', random_state=seed, max_iter=1000)
        model.fit(X_sub, y_sub)

        # perf_counter() is a monotonic, high-resolution clock; time.time() can
        # be too coarse (or jump) when timing a short inference pass.
        t_start = time.perf_counter()
        # assumes labels are encoded 0/1 with 1 as the positive class — TODO confirm
        y_prob = model.predict_proba(X_test_tfidf)[:, 1]
        y_pred = (y_prob >= 0.5).astype(np.int32)
        t_end = time.perf_counter()

        total_infer_time = t_end - t_start
        latency_ms_per_sample = (total_infer_time * 1000) / X_test_tfidf.shape[0]
        history['Latency (ms)'].append(latency_ms_per_sample)

        auprc = average_precision_score(y_test, y_prob)
        f1 = f1_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)

        history['AUPRC'].append(auprc)
        history['F1-score'].append(f1)
        history['Recall'].append(rec)

        cm = confusion_matrix(y_test, y_pred)
        print(f"  CM: {cm.tolist()}")
        print(f"  Latency: {latency_ms_per_sample:.4f} ms/sample")
        print(f"  [Seed {seed}] AUPRC: {auprc:.4f} | F1-score: {f1:.4f} | Recall: {rec:.4f}")

    print_latex_stats(history, ['AUPRC', 'F1-score', 'Recall'])

# ==========================================
# TASK T2: Malicious URL (Binary)
# ==========================================
def run_task_t2():
    """Train and evaluate the T2 malicious-URL binary baseline.

    Loads the CSVs at ``PATHS["T2_TRAIN"]`` / ``PATHS["T2_TEST"]``, builds
    character n-gram TF-IDF features from the ``url`` column and, for every
    seed in ``SEEDS``, fits a liblinear ``LogisticRegression`` on a stratified
    80% subsample of the training rows. Per-seed confusion matrix, latency,
    AUPRC, F1 and recall are printed, followed by the aggregated summary
    produced by ``print_latex_stats``.

    Raises:
        FileNotFoundError: If either CSV path does not exist.
        KeyError: If the ``url`` or ``label`` column is missing.
    """
    print(f"\n{'='*20} STARTING TASK T2 (Malicious URL - Binary) {'='*20}")

    # 1. Load Data
    print("Loading T2 data...")
    df_train = pd.read_csv(PATHS["T2_TRAIN"])
    df_test = pd.read_csv(PATHS["T2_TEST"])

    # Missing URLs become empty strings. fillna() is applied as a plain
    # expression instead of fillna(inplace=True) on a column selection, which
    # is chained assignment: pandas warns about it and it has no effect under
    # Copy-on-Write, so NaN values would reach the vectorizer.
    X_train_raw = df_train['url'].fillna('')
    y_train = df_train['label']
    X_test_raw = df_test['url'].fillna('')
    y_test = df_test['label'].values

    # 2. Vectorization (Char-level)
    print("Vectorizing (Char TF-IDF)...")
    vectorizer = TfidfVectorizer(
        analyzer='char', ngram_range=(3, 7),
        max_features=30000, min_df=3
    )
    # The vocabulary is learned from the training URLs only.
    X_train_tfidf = vectorizer.fit_transform(X_train_raw)
    X_test_tfidf = vectorizer.transform(X_test_raw)

    # 3. Training Loop
    # 'Latency (ms)' is recorded per seed but is not part of the LaTeX summary.
    history = {'AUPRC': [], 'F1-score': [], 'Recall': [], 'Latency (ms)': []}

    for seed in SEEDS:
        print(f"Training Seed {seed} (Subsampling 80%)...")
        # Subsample training data to introduce variance between seeds.
        X_sub, _, y_sub, _ = train_test_split(
            X_train_tfidf, y_train, train_size=0.8, random_state=seed, stratify=y_train
        )

        model = LogisticRegression(solver='liblinear', random_state=seed, max_iter=1000)
        model.fit(X_sub, y_sub)

        # perf_counter() is a monotonic, high-resolution clock; time.time() can
        # be too coarse (or jump) when timing a short inference pass.
        t_start = time.perf_counter()
        # assumes labels are encoded 0/1 with 1 as the positive class — TODO confirm
        y_prob = model.predict_proba(X_test_tfidf)[:, 1]
        y_pred = (y_prob >= 0.5).astype(np.int32)
        t_end = time.perf_counter()

        total_infer_time = t_end - t_start
        latency_ms_per_sample = (total_infer_time * 1000) / X_test_tfidf.shape[0]
        history['Latency (ms)'].append(latency_ms_per_sample)

        auprc = average_precision_score(y_test, y_prob)
        f1 = f1_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)

        history['AUPRC'].append(auprc)
        history['F1-score'].append(f1)
        history['Recall'].append(rec)

        cm = confusion_matrix(y_test, y_pred)
        print(f"  CM: {cm.tolist()}")
        print(f"  Latency: {latency_ms_per_sample:.4f} ms/sample")
        print(f"  [Seed {seed}] AUPRC: {auprc:.4f} | F1-score: {f1:.4f} | Recall: {rec:.4f}")

    print_latex_stats(history, ['AUPRC', 'F1-score', 'Recall'])

# ==========================================
# TASK T3: Malicious URL (Multiclass)
# ==========================================
def run_task_t3():
    """Train and evaluate the T3 malicious-URL multiclass baseline.

    Loads the CSVs at ``PATHS["T3_TRAIN"]`` / ``PATHS["T3_TEST"]``, builds
    character n-gram TF-IDF features from the ``url`` column and, for every
    seed in ``SEEDS``, fits a liblinear ``LogisticRegression`` on a stratified
    80% subsample of the training rows. Per-seed confusion matrix, latency,
    macro AUPRC, macro F1 and macro recall are printed, followed by the
    aggregated summary produced by ``print_latex_stats``.

    Raises:
        FileNotFoundError: If either CSV path does not exist.
        KeyError: If the ``url`` or ``label`` column is missing.
    """
    print(f"\n{'='*20} STARTING TASK T3 (Malicious URL - Multiclass) {'='*20}")

    # 1. Load Data
    print("Loading T3 data...")
    df_train = pd.read_csv(PATHS["T3_TRAIN"])
    df_test = pd.read_csv(PATHS["T3_TEST"])

    # Missing URLs become empty strings. fillna() is applied as a plain
    # expression instead of fillna(inplace=True) on a column selection, which
    # is chained assignment: pandas warns about it and it has no effect under
    # Copy-on-Write, so NaN values would reach the vectorizer.
    X_train_raw = df_train['url'].fillna('')
    y_train = df_train['label']
    X_test_raw = df_test['url'].fillna('')
    y_test = df_test['label'].values

    # 2. Vectorization (Char-level)
    print("Vectorizing (Char TF-IDF)...")
    vectorizer = TfidfVectorizer(
        analyzer='char', ngram_range=(3, 7),
        max_features=30000, min_df=3
    )
    # The vocabulary is learned from the training URLs only.
    X_train_tfidf = vectorizer.fit_transform(X_train_raw)
    X_test_tfidf = vectorizer.transform(X_test_raw)

    # Prepare labels for Macro AUPRC.
    # The binarizer is fit on the TRAINING labels so that its columns follow
    # the same sorted class order as model.classes_ / predict_proba. Fitting it
    # on y_test would drop any class absent from the test set and misalign (or
    # mis-shape) the indicator matrix relative to the probability matrix.
    lb = LabelBinarizer()
    lb.fit(y_train)
    y_test_bin = lb.transform(y_test)

    # With exactly two classes LabelBinarizer yields a single column; expand it
    # to one column per class to match the two-column predict_proba output.
    if len(lb.classes_) == 2:
        y_test_bin = np.hstack((1 - y_test_bin, y_test_bin))

    # 3. Training Loop
    # 'Latency (ms)' is recorded per seed but is not part of the LaTeX summary.
    history = {'Macro AUPRC': [], 'Macro F1-score': [], 'Macro Recall': [], 'Latency (ms)': []}

    for seed in SEEDS:
        print(f"Training Seed {seed} (Subsampling 80%)...")
        # Subsample training data to introduce variance between seeds.
        X_sub, _, y_sub, _ = train_test_split(
            X_train_tfidf, y_train, train_size=0.8, random_state=seed, stratify=y_train
        )

        # NOTE(review): liblinear handles multiclass targets one-vs-rest, and
        # recent scikit-learn releases warn about that usage — confirm against
        # the installed version.
        model = LogisticRegression(solver='liblinear', random_state=seed, max_iter=1000)
        model.fit(X_sub, y_sub)

        # Time a single inference pass. Running predict() and predict_proba()
        # back to back would score the test set twice and inflate the latency;
        # the predicted label is the class with the highest probability.
        # perf_counter() is the monotonic, high-resolution clock for intervals.
        t_start = time.perf_counter()
        y_prob = model.predict_proba(X_test_tfidf)
        y_pred = model.classes_[np.argmax(y_prob, axis=1)]
        t_end = time.perf_counter()

        total_infer_time = t_end - t_start
        latency_ms_per_sample = (total_infer_time * 1000) / X_test_tfidf.shape[0]
        history['Latency (ms)'].append(latency_ms_per_sample)

        auprc = average_precision_score(y_test_bin, y_prob, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')
        rec = recall_score(y_test, y_pred, average='macro')

        history['Macro AUPRC'].append(auprc)
        history['Macro F1-score'].append(f1)
        history['Macro Recall'].append(rec)

        cm = confusion_matrix(y_test, y_pred)
        print(f"  CM: {cm.tolist()}")
        print(f"  Latency: {latency_ms_per_sample:.4f} ms/sample")
        print(f"  [Seed {seed}] AUPRC: {auprc:.4f} | F1-score: {f1:.4f} | Recall: {rec:.4f}")

    print_latex_stats(history, ['Macro AUPRC', 'Macro F1-score', 'Macro Recall'])

# ==========================================
# MAIN EXECUTION
# ==========================================
if __name__ == "__main__":
    # Run the three text/URL tasks in order. The first failure stops the run
    # and is reported on stdout instead of propagating.
    try:
        for task in (run_task_t1, run_task_t2, run_task_t3):
            task()
    except FileNotFoundError as err:
        print(f"\n[ERROR] File not found. Check 'PATHS' configuration.\n{err}")
    except Exception as err:
        print(f"\n[ERROR] An error occurred:\n{err}")

In [None]:
import time
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, LabelEncoder, label_binarize
from sklearn.metrics import (
    recall_score, f1_score, confusion_matrix, average_precision_score
)

# ==========================================
# USER CONFIGURATION
# ==========================================
# Please update these paths before running.
# T4_* are CSV files read by run_task_t4; UNSW_* are parquet files read by
# both run_task_t5 (binary 'label' target) and run_task_t6 ('attack_cat').
# NOTE: this rebinds the PATHS and SEEDS names also defined in the earlier
# cell; if both cells share one namespace, the earlier T1-T3 keys are gone
# once this cell has run.
PATHS = {
    "T4_TRAIN": "/path/to/train_creditcard_timesplit.csv",
    "T4_TEST":  "/path/to/test_creditcard_timesplit.csv",
    "UNSW_TRAIN": "/path/to/UNSW_NB15_training-set.parquet",
    "UNSW_TEST":  "/path/to/UNSW_NB15_testing-set.parquet"
}

# One training run is performed per seed in each task; the seed is passed to
# XGBClassifier as random_state.
SEEDS = [0, 1, 2]

# ==========================================
# UTILITY FUNCTIONS
# ==========================================
def print_latex_stats(results_dict, metric_names):
    """Report each requested metric as ``$mean \\pm std$`` on stdout.

    Args:
        results_dict: Mapping of metric name to the list of per-run values.
            It is converted to a DataFrame, so the lists must be equally long.
        metric_names: Keys of ``results_dict`` to report, in output order.
            Keys not listed here are ignored.
    """
    divider = "-" * 60
    print(divider)
    print("=== Final Aggregated Results (LaTeX Format) ===")

    table = pd.DataFrame(results_dict)
    for metric in metric_names:
        column = table[metric]
        # Series.std() is the sample standard deviation (pandas default ddof=1).
        center = column.mean()
        spread = column.std()
        print(f"{metric:<15}: ${center:.4f} \\pm {spread:.4f}$")

    print(divider + "\n")

# ==========================================
# TASK T4: Credit Card Fraud (Binary)
# ==========================================
def run_task_t4():
    """Train and evaluate the T4 credit-card-fraud binary baseline.

    Loads the CSVs at ``PATHS["T4_TRAIN"]`` / ``PATHS["T4_TEST"]``, uses every
    column except ``Class`` as a feature, standardizes the features and, for
    every seed in ``SEEDS``, fits an ``XGBClassifier``. Per-seed confusion
    matrix, latency, AUPRC, F1 and recall are printed, followed by the
    aggregated summary produced by ``print_latex_stats``.

    Raises:
        FileNotFoundError: If either CSV path does not exist.
        KeyError: If the ``Class`` column is missing.
    """
    print(f"\n{'='*20} STARTING TASK T4 (Credit Card) {'='*20}")

    # 1. Load Data
    print("Loading T4 data...")
    df_train = pd.read_csv(PATHS["T4_TRAIN"])
    df_test = pd.read_csv(PATHS["T4_TEST"])

    # 2. Preprocess
    X_train = df_train.drop('Class', axis=1)
    y_train = df_train['Class']
    X_test = df_test.drop('Class', axis=1)
    y_test = df_test['Class']

    print("Scaling features...")
    # Scaling statistics come from the training split only; results are
    # re-wrapped as DataFrames so the column names are kept.
    scaler = StandardScaler()
    X_train_s = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
    X_test_s = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

    # 3. Training Loop
    # 'Latency (ms)' is recorded per seed but is not part of the LaTeX summary.
    history = {'AUPRC': [], 'F1-score': [], 'Recall': [], 'Latency (ms)': []}

    for seed in SEEDS:
        print(f"Training Seed {seed}...")
        # subsample=0.8 makes the boosting rounds depend on the seed.
        clf = xgb.XGBClassifier(random_state=seed, subsample=0.8, n_jobs=-1)
        clf.fit(X_train_s, y_train)

        # perf_counter() is a monotonic, high-resolution clock; time.time() can
        # be too coarse (or jump) when timing a short inference pass.
        t_start = time.perf_counter()
        y_prob = clf.predict_proba(X_test_s)[:, 1]
        y_pred = (y_prob >= 0.5).astype(int)
        t_end = time.perf_counter()

        total_infer_time = t_end - t_start
        latency_ms_per_sample = (total_infer_time * 1000) / X_test_s.shape[0]
        history['Latency (ms)'].append(latency_ms_per_sample)

        auprc = average_precision_score(y_test, y_prob)
        f1 = f1_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)

        history['AUPRC'].append(auprc)
        history['F1-score'].append(f1)
        history['Recall'].append(rec)

        cm = confusion_matrix(y_test, y_pred)
        print(f"  CM: {cm.tolist()}")
        print(f"  Latency: {latency_ms_per_sample:.4f} ms/sample")
        print(f"  [Seed {seed}] AUPRC: {auprc:.4f} | F1-score: {f1:.4f} | Recall: {rec:.4f}")

    print_latex_stats(history, ['AUPRC', 'F1-score', 'Recall'])

# ==========================================
# TASK T5: UNSW-NB15 (Binary)
# ==========================================
def run_task_t5():
    """Train and evaluate the T5 UNSW-NB15 binary baseline.

    Loads the parquet files at ``PATHS["UNSW_TRAIN"]`` / ``PATHS["UNSW_TEST"]``,
    predicts the ``label`` column from all remaining columns except
    ``attack_cat``, one-hot encodes and standardizes the features and, for
    every seed in ``SEEDS``, fits an ``XGBClassifier``. Per-seed confusion
    matrix, latency, AUPRC, F1 and recall are printed, followed by the
    aggregated summary produced by ``print_latex_stats``.

    Raises:
        FileNotFoundError: If either parquet path does not exist.
        KeyError: If the ``label`` or ``attack_cat`` column is missing.
    """
    print(f"\n{'='*20} STARTING TASK T5 (UNSW Binary) {'='*20}")

    # 1. Load Data
    print("Loading T5 data...")
    df_train = pd.read_parquet(PATHS["UNSW_TRAIN"])
    df_test = pd.read_parquet(PATHS["UNSW_TEST"])

    target_col = 'label'
    drop_cols = ['attack_cat'] # Drop multiclass target

    X_train = df_train.drop(columns=[target_col] + drop_cols)
    y_train = df_train[target_col]
    X_test = df_test.drop(columns=[target_col] + drop_cols)
    y_test = df_test[target_col]

    # 2. Preprocess (OHE + Scaling)
    print("Encoding & Scaling...")
    X_train = pd.get_dummies(X_train)
    X_test = pd.get_dummies(X_test)
    # Align the test columns to the training layout: dummy columns missing from
    # the test split are filled with 0 and test-only columns are dropped.
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    # Scaling statistics come from the training split only.
    scaler = StandardScaler()
    X_train_s = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
    X_test_s = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

    # 3. Training Loop
    # 'Latency (ms)' is recorded per seed but is not part of the LaTeX summary.
    history = {'AUPRC': [], 'F1-score': [], 'Recall': [], 'Latency (ms)': []}

    for seed in SEEDS:
        print(f"Training Seed {seed}...")
        # subsample=0.8 makes the boosting rounds depend on the seed.
        clf = xgb.XGBClassifier(random_state=seed, subsample=0.8, n_jobs=-1)
        clf.fit(X_train_s, y_train)

        # perf_counter() is a monotonic, high-resolution clock; time.time() can
        # be too coarse (or jump) when timing a short inference pass.
        t_start = time.perf_counter()
        y_prob = clf.predict_proba(X_test_s)[:, 1]
        y_pred = (y_prob >= 0.5).astype(int)
        t_end = time.perf_counter()

        total_infer_time = t_end - t_start
        latency_ms_per_sample = (total_infer_time * 1000) / X_test_s.shape[0]
        history['Latency (ms)'].append(latency_ms_per_sample)

        auprc = average_precision_score(y_test, y_prob)
        f1 = f1_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)

        history['AUPRC'].append(auprc)
        history['F1-score'].append(f1)
        history['Recall'].append(rec)

        cm = confusion_matrix(y_test, y_pred)
        print(f"  CM: {cm.tolist()}")
        print(f"  Latency: {latency_ms_per_sample:.4f} ms/sample")
        print(f"  [Seed {seed}] AUPRC: {auprc:.4f} | F1-score: {f1:.4f} | Recall: {rec:.4f}")

    print_latex_stats(history, ['AUPRC', 'F1-score', 'Recall'])

# ==========================================
# TASK T6: UNSW-NB15 (Multiclass)
# ==========================================
def run_task_t6():
    """Train and evaluate the T6 UNSW-NB15 multiclass baseline.

    Loads the parquet files at ``PATHS["UNSW_TRAIN"]`` / ``PATHS["UNSW_TEST"]``,
    predicts the ``attack_cat`` column from all remaining columns except
    ``label``, integer-encodes the target, one-hot encodes and standardizes
    the features and, for every seed in ``SEEDS``, fits an ``XGBClassifier``.
    Per-seed confusion matrix, latency, macro AUPRC, macro F1 and macro recall
    are printed, followed by the aggregated summary produced by
    ``print_latex_stats``.

    Raises:
        FileNotFoundError: If either parquet path does not exist.
        KeyError: If the ``attack_cat`` or ``label`` column is missing.
        ValueError: If the test split contains an ``attack_cat`` value that
            the encoder did not see in the training split.
    """
    print(f"\n{'='*20} STARTING TASK T6 (UNSW Multiclass) {'='*20}")

    # 1. Load Data
    print("Loading T6 data...")
    df_train = pd.read_parquet(PATHS["UNSW_TRAIN"])
    df_test = pd.read_parquet(PATHS["UNSW_TEST"])

    target_col = 'attack_cat'
    drop_cols = ['label'] # Drop binary target

    X_train = df_train.drop(columns=[target_col] + drop_cols)
    y_train = df_train[target_col]
    X_test = df_test.drop(columns=[target_col] + drop_cols)
    y_test = df_test[target_col]

    # 2. Preprocess
    print("Encoding Targets & Features...")
    # Class names are mapped to integers 0..n_classes-1 using the training
    # split; the test split reuses the same mapping.
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)
    classes = le.classes_

    X_train = pd.get_dummies(X_train)
    X_test = pd.get_dummies(X_test)
    # Align the test columns to the training layout: dummy columns missing from
    # the test split are filled with 0 and test-only columns are dropped.
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    print("Scaling...")
    # Scaling statistics come from the training split only.
    scaler = StandardScaler()
    X_train_s = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
    X_test_s = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

    # One indicator column per encoded class, in the same 0..n_classes-1 order
    # as the predict_proba columns. It does not depend on the seed, so it is
    # built once here rather than on every loop iteration.
    y_test_bin = label_binarize(y_test, classes=np.arange(len(classes)))

    # 3. Training Loop
    # 'Latency (ms)' is recorded per seed but is not part of the LaTeX summary.
    history = {'Macro AUPRC': [], 'Macro F1-score': [], 'Macro Recall': [], 'Latency (ms)': []}

    for seed in SEEDS:
        print(f"Training Seed {seed}...")
        # subsample=0.8 makes the boosting rounds depend on the seed.
        clf = xgb.XGBClassifier(random_state=seed, subsample=0.8, n_jobs=-1)
        clf.fit(X_train_s, y_train)

        # perf_counter() is a monotonic, high-resolution clock; time.time() can
        # be too coarse (or jump) when timing a short inference pass.
        t_start = time.perf_counter()
        y_prob = clf.predict_proba(X_test_s)
        # Predicted class = index of the highest probability, which is already
        # the encoded label.
        y_pred = np.argmax(y_prob, axis=1)
        t_end = time.perf_counter()

        total_infer_time = t_end - t_start
        latency_ms_per_sample = (total_infer_time * 1000) / X_test_s.shape[0]
        history['Latency (ms)'].append(latency_ms_per_sample)

        auprc = average_precision_score(y_test_bin, y_prob, average='macro')
        # zero_division=0 scores a class with no predicted/true samples as 0
        # instead of emitting a warning.
        f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
        rec = recall_score(y_test, y_pred, average='macro', zero_division=0)

        history['Macro AUPRC'].append(auprc)
        history['Macro F1-score'].append(f1)
        history['Macro Recall'].append(rec)

        cm = confusion_matrix(y_test, y_pred)
        print(f"  CM: {cm.tolist()}")
        print(f"  Latency: {latency_ms_per_sample:.4f} ms/sample")
        print(f"  [Seed {seed}] AUPRC: {auprc:.4f} | F1-score: {f1:.4f} | Recall: {rec:.4f}")

    print_latex_stats(history, ['Macro AUPRC', 'Macro F1-score', 'Macro Recall'])

# ==========================================
# MAIN EXECUTION
# ==========================================
if __name__ == "__main__":
    # Run the three tabular tasks in order. The first failure stops the run
    # and is reported on stdout instead of propagating.
    try:
        for task in (run_task_t4, run_task_t5, run_task_t6):
            task()
    except FileNotFoundError as err:
        print(f"\n[ERROR] File not found. Please check paths in 'PATHS' dict.\n{err}")
    except Exception as err:
        print(f"\n[ERROR] An error occurred:\n{err}")