In [1]:
# ------------------------------------------------------------------------------
# Universal Random Forest Evaluation for Quantum Circuit Trojan Detection
# ------------------------------------------------------------------------------

# This notebook trains and evaluates Random Forest (RF) models to classify 
# clean vs. malicious quantum circuits using structural and behavioral features.

# Algorithms:
# - Deutsch-Jozsa, QAOA, QFT, Shor, Grover, BV, and Universal combined

# Features:
# - Gate counts, depth, entropy, success rate, output variation, unique states, etc.

# Outputs (for each algorithm):
# - Classification report (Accuracy, ROC AUC)
# - Confusion matrix and ROC curve plots
# - Feature importance bar graph
# - Summary table of all results (final)

# Tools:
# - scikit-learn (RandomForestClassifier, metrics, preprocessing)
# - matplotlib + seaborn
# - pandas

# Author: Zeeshan Ajmal


In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, classification_report,
    confusion_matrix, roc_auc_score, roc_curve
)

sns.set(style="whitegrid")


In [19]:
import pandas as pd
import os

# List of all your RF dataset CSVs
csv_files = [
    "dj_full_dataset.csv",
    "qaoa_full_dataset.csv",
    "qft_full_dataset.csv",
    "shor_full_dataset.csv",
    "grover_full_dataset.csv",
    "bv_full_dataset.csv"
]

# Initialize empty list to collect aligned DataFrames
aligned_dfs = []

# First pass to collect all unique feature columns
all_columns = set()
for file in csv_files:
    if os.path.exists(file):
        df = pd.read_csv(file)
        all_columns.update(df.columns)

# Convert set to sorted list (optional: consistent order)
all_columns = sorted(all_columns)

# Second pass: align each dataframe to all columns
for file in csv_files:
    if os.path.exists(file):
        df = pd.read_csv(file)
        df = df.reindex(columns=all_columns, fill_value=0)  # fill missing cols with 0
        aligned_dfs.append(df)

# Combine into universal dataset
universal_df = pd.concat(aligned_dfs, ignore_index=True)
universal_df.to_csv("rf_universal_dataset.csv", index=False)
print("✅ Universal rf_dataset created: universal_dataset.csv")


✅ Universal rf_dataset created: universal_dataset.csv


In [20]:
def train_rf_on_dataset(csv_file, algo_name, save_folder):
    os.makedirs(save_folder, exist_ok=True)

    # Load dataset
    df = pd.read_csv(csv_file)

    # Convert columns and drop NaNs
    for col in df.columns:
        if col not in ['name', 'label']:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)

    if df.empty:
        print(f"❌ Skipped {algo_name}: No usable data in {csv_file}")
        return "Error", "Error"

    X = df.drop(columns=["name", "label"])
    y = df["label"]

    # Split safely
    if len(df) < 4:
        print(f"⚠️ Not enough samples in {csv_file} to split (needs at least 4)")
        return "Too Small", "Too Small"

    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=42
        )
    except ValueError as e:
        print(f"❌ Skipping {algo_name}: {str(e)}")
        return "Split Error", "Split Error"

    # Feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train Random Forest
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_proba)
    report = classification_report(y_test, y_pred, target_names=["Clean", "Malicious"], zero_division=0)

    # Save classification report
    with open(f"{save_folder}/classification_report.txt", "w") as f:
        f.write(f"{algo_name} – Random Forest\n")
        f.write(f"Accuracy: {acc:.2f}\nROC AUC: {roc:.2f}\n\n")
        f.write(report)

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=["Clean", "Malicious"],
                yticklabels=["Clean", "Malicious"])
    plt.title(f"Confusion Matrix – {algo_name}")
    plt.tight_layout()
    plt.savefig(f"{save_folder}/confusion_matrix.png", dpi=300)
    plt.close()

    # ROC Curve
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    plt.figure()
    plt.plot(fpr, tpr, label=f"AUC = {roc:.2f}")
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(f"ROC Curve – {algo_name}")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f"{save_folder}/roc_curve.png", dpi=300)
    plt.close()

    # Feature Importance
    importances = model.feature_importances_
    feat_names = X.columns
    feat_df = pd.DataFrame({
        "Feature": feat_names,
        "Importance": importances
    }).sort_values(by="Importance", ascending=False)

    plt.figure(figsize=(8, 5))
    sns.barplot(x="Importance", y="Feature", data=feat_df, palette="viridis")
    plt.title(f"Feature Importance – {algo_name} (RF)")
    plt.tight_layout()
    plt.savefig(f"{save_folder}/feature_importance.png", dpi=300)
    plt.close()

    # Save raw feature importance to txt
    feat_df.to_string(buf=open(f"{save_folder}/feature_importance.txt", "w"))

    print(f"✅ {algo_name}: Accuracy={acc:.2f}, ROC AUC={roc:.2f}")
    return acc, roc


In [21]:
# --------------------------------------------------
# Run RF Model on All Quantum Algorithm Datasets
# --------------------------------------------------

rf_datasets = {
    "dj_full_dataset.csv": ("Deutsch-Jozsa", "rf_outputs/dj"),
    "qaoa_full_dataset.csv": ("QAOA", "rf_outputs/qaoa"),
    "qft_full_dataset.csv": ("QFT", "rf_outputs/qft"),
    "shor_full_dataset.csv": ("Shor", "rf_outputs/shor"),
    "grover_full_dataset.csv": ("Grover", "rf_outputs/grover"),
    "bv_full_dataset.csv": ("BV", "rf_outputs/bv"),
    "rf_universal_dataset.csv": ("Universal", "rf_outputs/universal")
}

rf_results = []

for file, (label, folder) in rf_datasets.items():
    if os.path.exists(file):
        df_check = pd.read_csv(file)
        if df_check.shape[0] == 0:
            print(f"⚠️ {label}: Dataset is empty. Skipping.")
            rf_results.append([label, "Empty", "Empty"])
            continue

        acc, roc = train_rf_on_dataset(file, label, folder)
        rf_results.append([label, acc, roc])
    else:
        print(f"❌ {label}: File not found.")
        rf_results.append([label, "Missing", "Missing"])



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x="Importance", y="Feature", data=feat_df, palette="viridis")


✅ Deutsch-Jozsa: Accuracy=0.75, ROC AUC=1.00



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x="Importance", y="Feature", data=feat_df, palette="viridis")


✅ QAOA: Accuracy=1.00, ROC AUC=1.00



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x="Importance", y="Feature", data=feat_df, palette="viridis")


✅ QFT: Accuracy=1.00, ROC AUC=1.00



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x="Importance", y="Feature", data=feat_df, palette="viridis")


✅ Shor: Accuracy=1.00, ROC AUC=1.00



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x="Importance", y="Feature", data=feat_df, palette="viridis")


✅ Grover: Accuracy=1.00, ROC AUC=1.00



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x="Importance", y="Feature", data=feat_df, palette="viridis")


✅ BV: Accuracy=1.00, ROC AUC=1.00



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x="Importance", y="Feature", data=feat_df, palette="viridis")


✅ Universal: Accuracy=0.96, ROC AUC=1.00
