In [None]:
# Config & Repro
import os, random, json
import numpy as np
import torch
import json
# Every experiment setting is read from config.json so a run is driven by a single file.
with open("config.json") as f:
    C = json.load(f)

# Reproducibility and optimisation settings.
SEED, BATCH_SIZE, LR, EPOCHS = (C[key] for key in ("seed", "batch_size", "lr", "epochs"))

# Class selection uses the original CIFAR-10 label ids:
#   0 airplane, 1 automobile, 2 bird, 3 cat, 4 deer, 5 dog, 6 frog, 7 horse, 8 ship, 9 truck
# The trailing notes record the values the author expected to find in config.json.
BASE_CLASSES, FORGET_CLASS = C["base_classes"], C["forget_class"]  # (bird, cat, dog, truck) / airplane
PER_BASE, FORGET_N = C["per_base"], C["forget_n"]  # ~1000 per base class / ~1000 airplanes

def set_seed(seed=SEED):
    """Seed every RNG this notebook uses and switch PyTorch to deterministic mode.

    Args:
        seed: Integer applied to Python's `random`, NumPy, and PyTorch (CPU and
            all CUDA devices). Defaults to the configured SEED.

    Side effects:
        Sets the PYTHONHASHSEED environment variable, sets
        CUBLAS_WORKSPACE_CONFIG if it is not already set, and changes global
        cuDNN / deterministic-algorithm flags.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    # Deterministic cuBLAS kernels need a fixed workspace configuration. Without
    # it, CUDA matmuls stay nondeterministic and (because warn_only=True below)
    # only emit a warning. setdefault keeps any value the user already exported.
    os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Trade cuDNN autotuning speed for repeatable kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # warn_only=True: ops without a deterministic implementation warn instead of raising.
    torch.use_deterministic_algorithms(True, warn_only=True)

def seed_worker(worker_id):
    """DataLoader `worker_init_fn` hook: re-seed NumPy and `random` in the current process.

    The seed is `torch.initial_seed()` reduced modulo 2**32. `worker_id` is
    accepted only to match the hook signature; it is not used.
    """
    derived_seed = torch.initial_seed() % (1 << 32)
    random.seed(derived_seed)
    np.random.seed(derived_seed)

# Dedicated RNG that is later handed to the shuffling DataLoaders.
g = torch.Generator().manual_seed(SEED)

# Seed the global RNGs before any dataset or model is created.
set_seed(SEED)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

In [None]:
# Core imports
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Preprocessing shared by every phase: convert to tensor, then normalise each
# channel with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,) * 3, std=(0.5,) * 3),
])

# CIFAR-10 train and test splits, stored under ./data (download=True).
train_dataset, test_dataset = (
    torchvision.datasets.CIFAR10(root="./data", train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)

# Label names as exposed by the dataset object.
class_names = train_dataset.classes
class_names

In [None]:
from collections import defaultdict
from torch.utils.data import Subset, ConcatDataset, DataLoader

# Build class->indices map for train.
# Labels are read from `train_dataset.targets` rather than by iterating the
# dataset: iteration would decode and transform every image only to discard it.
# The datasets are created without a target_transform, so the labels are the same.
cls_to_idxs = defaultdict(list)
for i, y in enumerate(train_dataset.targets):
    cls_to_idxs[int(y)].append(i)

# Deterministic shuffle: a private Random(SEED) so the selection does not depend
# on how much of the global `random` state has been consumed elsewhere.
rng = random.Random(SEED)
for c in cls_to_idxs:
    rng.shuffle(cls_to_idxs[c])

# Select fixed indices: the first PER_BASE shuffled samples of each base class,
# and the first FORGET_N shuffled samples of the forget class.
base_indices = []
for c in BASE_CLASSES:
    base_indices += cls_to_idxs[c][:PER_BASE]
forget_indices = cls_to_idxs[FORGET_CLASS][:FORGET_N]

# Record the selected train indices (sorted) with the settings that produced
# them. The file is rewritten on every run of this cell.
splits = dict(
    seed=SEED,
    base_classes=BASE_CLASSES,
    forget_class=FORGET_CLASS,
    base_indices=sorted(base_indices),
    forget_indices=sorted(forget_indices),
)
with open("splits_train.json", "w") as f:
    json.dump(splits, f)

# Train subsets: base classes only, forget class only, and both concatenated
# (base samples first, then the forget samples).
dataset_base, dataset_forget = (
    Subset(train_dataset, splits[key]) for key in ("base_indices", "forget_indices")
)
dataset_full = ConcatDataset([dataset_base, dataset_forget])

# Build fixed test subsets.
# Labels come from `test_dataset.targets`, so no image is decoded or transformed
# just to read its class id (the dataset is created without a target_transform).
# NOTE(review): the retain split keeps every non-forget class, including classes
# that are not in BASE_CLASSES and therefore appear in no training subset —
# confirm this is the intended retain metric.
test_forget_indices = [i for i, y in enumerate(test_dataset.targets) if int(y) == FORGET_CLASS]
test_retain_indices = [i for i, y in enumerate(test_dataset.targets) if int(y) != FORGET_CLASS]

# Persist the test split next to the train split.
with open("splits_test.json", "w") as f:
    json.dump({
        "forget_test_indices": test_forget_indices,
        "retain_test_indices": test_retain_indices
    }, f)

test_forget_ds = Subset(test_dataset, test_forget_indices)
test_retain_ds = Subset(test_dataset, test_retain_indices)

# Shuffling train loaders. All three draw their permutations from the shared
# generator `g`. Data is loaded in the main process (num_workers=0).
loader_base, loader_forget, loader_full = (
    DataLoader(subset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0,
               worker_init_fn=seed_worker, generator=g)
    for subset in (dataset_base, dataset_forget, dataset_full)
)

# Fixed-order evaluation loaders: whole test set, forget class only, everything else.
loader_test_overall, loader_test_forget, loader_test_retain = (
    DataLoader(subset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
    for subset in (test_dataset, test_forget_ds, test_retain_ds)
)

# Subset sizes, shown as the cell output.
tuple(map(len, (dataset_base, dataset_forget, dataset_full, test_forget_ds, test_retain_ds)))

In [None]:
class SimpleCNN(nn.Module):
    """Four conv + ReLU + max-pool stages followed by a two-layer classifier head.

    Sized for 3x32x32 images: each 2x2 pool halves the spatial size
    (32 -> 16 -> 8 -> 4 -> 2), so the head receives 256 * 2 * 2 features.

    Args:
        num_classes: Number of output logits (default 10).
    """

    def __init__(self, num_classes=10):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)  # applied four times: 32→16→8→4→2
        self.fc1 = nn.Linear(256 * 2 * 2, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return class logits for a batch of 3x32x32 images.

        Raises:
            ValueError: if the feature map after the last pool is not 256x2x2,
                i.e. the input was not 32x32 spatially.
        """
        x = self.pool(F.relu(self.conv1(x)))  # [32,16,16]
        x = self.pool(F.relu(self.conv2(x)))  # [64,8,8]
        x = self.pool(F.relu(self.conv3(x)))  # [128,4,4]
        x = self.pool(F.relu(self.conv4(x)))  # [256,2,2]
        # view(-1, 1024) would silently fold extra spatial positions into the
        # batch dimension for other input sizes (e.g. 48x48 turns 4 samples into
        # 9 rows), so fail loudly instead.
        if tuple(x.shape[-3:]) != (256, 2, 2):
            raise ValueError(
                f"SimpleCNN expects 3x32x32 inputs; got feature map of shape {tuple(x.shape)} after pooling"
            )
        x = x.view(-1, self.fc1.in_features)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

# One cross-entropy criterion shared by training, evaluation and unlearning.
criterion = nn.CrossEntropyLoss()

# Three separately initialised networks, created in the order BASE, FULL, RETRAIN_BASE.
model_base, model_full, model_retrain = (
    SimpleCNN(num_classes=10).to(device) for _ in range(3)
)

# One Adam optimiser per network, all using the configured learning rate.
opt_base, opt_full, opt_retrain = (
    optim.Adam(net.parameters(), lr=LR)
    for net in (model_base, model_full, model_retrain)
)

In [None]:
@torch.no_grad()
def evaluate(model, loader, device, criterion):
    """Compute sample-weighted loss and accuracy of `model` over `loader`.

    The model is switched to eval mode and left there. Each batch loss is
    weighted by its batch size before averaging.

    Args:
        model: Network mapping an input batch to logits.
        loader: Iterable of (inputs, labels) batches.
        device: Device the batches are moved to.
        criterion: Callable (logits, labels) -> scalar loss tensor.

    Returns:
        {"loss": float, "acc": float}; both are 0.0 when `loader` yields nothing.
    """
    model.eval()
    n_seen = 0
    n_correct = 0
    weighted_loss = 0.0
    for inputs, targets in loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        scores = model(inputs)
        batch_n = targets.size(0)
        weighted_loss += criterion(scores, targets).item() * batch_n
        n_correct += (scores.argmax(1) == targets).sum().item()
        n_seen += batch_n
    denom = max(1, n_seen)  # avoid dividing by zero on an empty loader
    return {"loss": weighted_loss / denom, "acc": n_correct / denom}

def train_model(model, dataloader, optimizer, criterion, device, num_epochs=EPOCHS, phase_name=""):
    """Train `model` in place for `num_epochs` passes over `dataloader`.

    Prints sample-weighted loss and accuracy after each epoch, then a completion
    line. Labels are used as provided by the loader (no remapping).

    Args:
        model: Network to optimise; put into train mode once before the loop.
        dataloader: Iterable of (inputs, labels) batches.
        optimizer: Optimiser bound to `model`'s parameters.
        criterion: Callable (logits, labels) -> scalar loss tensor.
        device: Device the batches are moved to.
        num_epochs: Number of passes over `dataloader`.
        phase_name: Tag used in the progress messages.
    """
    model.train()
    for epoch in range(num_epochs):
        seen = hits = 0
        loss_total = 0.0
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Running statistics use the pre-step forward pass of this batch.
            batch_n = labels.size(0)
            seen += batch_n
            hits += (outputs.max(1).indices == labels).sum().item()
            loss_total += loss.item() * batch_n

        denom = max(1, seen)
        train_loss = loss_total / denom
        train_acc = hits / denom
        print(f"[{phase_name}] Epoch {epoch+1}: Loss={train_loss:.3f} | Acc={100*train_acc:.2f}%")

    print(f"{phase_name} training complete")

def report_all(name, model):
    """Evaluate `model` on the overall / forget / retain test loaders and print the results.

    Uses the notebook-level `evaluate`, `device`, `criterion` and test loaders.
    Nothing is stored or returned.
    """
    res_overall, res_forget, res_retain = (
        evaluate(model, split_loader, device, criterion)
        for split_loader in (loader_test_overall, loader_test_forget, loader_test_retain)
    )
    print(f"\n== {name} Test ==")
    for label, res in (
        ("Overall", res_overall),
        ("Forget (airplane)", res_forget),
        ("Retain (non-airplane)", res_retain),
    ):
        print(f"{label}: acc={100*res['acc']:.2f}%, loss={res['loss']:.3f}")

In [None]:
# Unlearning metrics: capture + distance/score
import math

# Registry of evaluation results keyed by model/variant name; filled by
# eval_and_store and read by unlearning_distance and the summary cell.
# Re-running this cell rebinds it to an empty dict, discarding stored results.
RUN_RESULTS = {}  # e.g. {"BASE": {...}, "FULL": {...}}

def eval_and_store(name, model):
    """Evaluate `model` on the three test loaders, record the result, and print it.

    The metrics are saved as RUN_RESULTS[name] with keys "overall", "forget"
    and "retain", replacing any earlier entry under that name.

    Returns:
        The dict stored in RUN_RESULTS[name].
    """
    overall, forget, retain = (
        evaluate(model, split_loader, device, criterion)
        for split_loader in (loader_test_overall, loader_test_forget, loader_test_retain)
    )
    RUN_RESULTS[name] = {"overall": overall, "forget": forget, "retain": retain}

    print(f"\n== {name} Test ==")
    for label, res in (
        ("Overall", overall),
        ("Forget (airplane)", forget),
        ("Retain (non-airplane)", retain),
    ):
        print(f"{label}: acc={100*res['acc']:.2f}%, loss={res['loss']:.3f}")
    return RUN_RESULTS[name]

def unlearning_distance(candidate_name, full_name="FULL", retrain_name="RETRAIN_BASE", alpha=0.5):
    """
    Score an unlearned model against the two reference models in RUN_RESULTS.

    - forget_gap = |candidate forget acc - retrain forget acc|  (want -> 0)
    - retain_gap = |candidate retain acc - full retain acc|     (want -> 0)
    - score      = 1 - (alpha * forget_gap + (1 - alpha) * retain_gap), clamped to [0, 1]

    alpha weighs the forget gap and (1 - alpha) the retain gap; higher score is better.
    All three names must already be present in RUN_RESULTS (asserted).
    """
    assert candidate_name in RUN_RESULTS, "Candidate not evaluated yet."
    assert full_name in RUN_RESULTS and retrain_name in RUN_RESULTS, "Run FULL and RETRAIN_BASE first (and eval/store)."

    candidate = RUN_RESULTS[candidate_name]
    reference_full = RUN_RESULTS[full_name]
    reference_retrain = RUN_RESULTS[retrain_name]

    # Accuracies are fractions in [0, 1].
    cand_forget = candidate["forget"]["acc"]
    cand_retain = candidate["retain"]["acc"]
    full_retain = reference_full["retain"]["acc"]
    retrain_forget = reference_retrain["forget"]["acc"]

    forget_gap = abs(cand_forget - retrain_forget)
    retain_gap = abs(cand_retain - full_retain)

    raw_score = 1.0 - (alpha * forget_gap + (1.0 - alpha) * retain_gap)
    # alpha outside [0, 1] can push the raw value out of range, hence the clamp.
    clamped_score = max(0.0, min(1.0, raw_score))

    return dict(
        candidate=candidate_name,
        reference_full=full_name,
        reference_retrain=retrain_name,
        forget_gap=forget_gap,
        retain_gap=retain_gap,
        alpha=alpha,
        score=clamped_score,
    )

def print_unlearning_distance(stats):
    """Pretty-print the dict returned by `unlearning_distance`."""
    report_lines = (
        f"\n== Unlearning Distance: {stats['candidate']} ==",
        f"forget_gap (→ retrain): {stats['forget_gap']:.4f}",
        f"retain_gap (→ full)   : {stats['retain_gap']:.4f}",
        f"alpha (forget weight) : {stats['alpha']:.2f}",
        f"UNLEARNING SCORE      : {stats['score']:.4f}  (1.0 is best)",
    )
    for line in report_lines:
        print(line)

In [None]:
# Each phase below follows the same three steps: train, checkpoint the model
# and optimiser state with the run configuration, then evaluate on the
# overall / forget / retain test loaders and store the result in RUN_RESULTS.

# Phase A: BASE (train on base classes only)
train_model(model_base, loader_base, opt_base, criterion, device, num_epochs=EPOCHS, phase_name="BASE")
# Only this checkpoint also records the class-selection settings.
torch.save({"model_state": model_base.state_dict(),
            "optimizer_state": opt_base.state_dict(),
            "config": {"seed": SEED, "lr": LR, "epochs": EPOCHS, "phase": "BASE",
                       "base_classes": BASE_CLASSES, "forget_class": FORGET_CLASS,
                       "per_base": PER_BASE, "forget_n": FORGET_N}}, "model_base.pt")
eval_and_store("BASE", model_base)

# Phase B: FULL (add airplane)
# Trains a separate network on base + forget samples (loader_full).
train_model(model_full, loader_full, opt_full, criterion, device, num_epochs=EPOCHS, phase_name="FULL")
torch.save({"model_state": model_full.state_dict(),
            "optimizer_state": opt_full.state_dict(),
            "config": {"seed": SEED, "lr": LR, "epochs": EPOCHS, "phase": "FULL"}}, "model_full.pt")
eval_and_store("FULL", model_full)

# Phase C: RETRAIN_BASE (scratch baseline without airplane)
# Uses the same loader as Phase A but a different, separately initialised network.
train_model(model_retrain, loader_base, opt_retrain, criterion, device, num_epochs=EPOCHS, phase_name="RETRAIN_BASE")
torch.save({"model_state": model_retrain.state_dict(),
            "optimizer_state": opt_retrain.state_dict(),
            "config": {"seed": SEED, "lr": LR, "epochs": EPOCHS, "phase": "RETRAIN_BASE"}}, "model_retrain.pt")
# Last expression of the cell: its return value is shown as the cell output.
eval_and_store("RETRAIN_BASE", model_retrain)

In [None]:
# Unified unlearning runner and checkpoint/metrics save helpers
import os, json
from copy import deepcopy
from itertools import cycle
import torch.nn.functional as F

def uniform_kl(logits):
    """KL divergence from the uniform distribution to softmax(logits), batch-averaged.

    Computes KL(uniform || softmax(logits)) over dim 1 with reduction="batchmean".
    The value is zero when the predicted distribution is exactly uniform.
    """
    num_classes = logits.size(1)
    uniform_target = torch.full_like(logits, fill_value=1.0 / num_classes)
    log_probs = F.log_softmax(logits, dim=1)
    return F.kl_div(log_probs, uniform_target, reduction="batchmean")

def ewc_penalty(model, fisher_diag, theta_ref, scale=1.0):
    """Fisher-weighted quadratic penalty on the distance from reference weights.

    Returns scale * sum_n sum(fisher_diag[n] * (p_n - theta_ref[n])**2) over the
    trainable parameters whose name appears in `fisher_diag`.

    Returns the plain float 0.0 when `fisher_diag` or `theta_ref` is None or
    `scale` is 0; otherwise a tensor (or `scale * 0.0` if no parameter matched).
    """
    if fisher_diag is None or theta_ref is None or scale == 0.0:
        return 0.0

    total = 0.0
    for param_name, param in model.named_parameters():
        if not param.requires_grad or param_name not in fisher_diag:
            continue
        delta = param - theta_ref[param_name]
        total += (fisher_diag[param_name] * delta * delta).sum()
    return scale * total

# Knowledge distillation with one teacher class masked out
def kd_loss_masked(student_logits, teacher_logits, mask_class, T=2.0):
    """Temperature-scaled KD loss against a teacher whose `mask_class` logit is suppressed.

    A copy of the teacher logits gets column `mask_class` set to -1e9 before the
    softmax, so the teacher assigns that class essentially zero probability. The
    caller's `teacher_logits` tensor is not modified.

    Returns:
        KL(teacher || student) at temperature T with reduction="batchmean",
        multiplied by T*T.
    """
    masked_teacher = teacher_logits.clone()
    masked_teacher[:, mask_class] = -1e9
    teacher_probs = F.softmax(masked_teacher / T, dim=1)
    student_log_probs = F.log_softmax(student_logits / T, dim=1)
    divergence = F.kl_div(student_log_probs, teacher_probs, reduction="batchmean")
    return divergence * (T * T)

def build_variant_name(base="GAR", use_ewc=False, use_kd=False, kd_masked=True, langevin_sigma=0.0):
    """Compose a run name such as "GAR_EWC_KDm_L1e-04" from the enabled options.

    Tokens are joined with "_" in the order: base, "EWC", "KDm"/"KD",
    "L<sigma>". The sigma token appears only for a positive `langevin_sigma`.
    """
    tokens = [base]
    if use_ewc:
        tokens += ["EWC"]
    if use_kd:
        tokens += ["KDm"] if kd_masked else ["KD"]
    if langevin_sigma and langevin_sigma > 0:
        tokens += [f"L{langevin_sigma:.0e}"]
    return "_".join(tokens)

def save_checkpoint_and_metrics(name, model, metrics, outdir="results"):
    """Write `model`'s weights and its metrics dict into `outdir`.

    Creates `outdir` if needed, then saves
      - model_<name>.pt     : {"state_dict": ..., "meta": {"variant": name}}
      - metrics_<name>.json : `metrics` as JSON with 2-space indentation
    and prints a confirmation line.
    """
    os.makedirs(outdir, exist_ok=True)
    ckpt_path = os.path.join(outdir, f"model_{name}.pt")
    metrics_path = os.path.join(outdir, f"metrics_{name}.json")

    payload = {"state_dict": model.state_dict(), "meta": {"variant": name}}
    torch.save(payload, ckpt_path)

    with open(metrics_path, "w") as handle:
        json.dump(metrics, handle, indent=2)

    print(f"[saved] {ckpt_path} and metrics_{name}.json")

def _repeat_loader(loader):
    """Yield batches from `loader` endlessly, re-iterating it after every full pass.

    Unlike `itertools.cycle`, nothing is cached: each pass calls `iter(loader)`
    again, so a shuffling loader draws a fresh order and batches from earlier
    passes are not kept in memory. Stops if a pass yields no batch at all.
    """
    while True:
        yielded_any = False
        for batch in loader:
            yielded_any = True
            yield batch
        if not yielded_any:
            return


def run_unlearn_variant(
    name=None,
    base_loader=None, forget_loader=None, model_start=None, device=None,
    epochs=5, lr=1e-3,
    lambda_retain=1.0, lambda_forget=0.8,
    use_ewc=False, fisher=None, theta_ref=None, lambda_ewc=0.0,
    use_kd=False, lambda_kd=0.0, T_kd=2.0, kd_masked=True,
    langevin_sigma=0.0, grad_clip=5.0, print_every=200,
    teacher_full=None, forget_class=None
):
    """Fine-tune a copy of `model_start` to forget one class while keeping the rest.

    Each step draws one retain batch (xb, yb) and one forget batch (xf, _) and
    minimises

        lambda_retain * criterion(model(xb), yb) + lambda_forget * uniform_kl(model(xf))

    plus, when enabled, an EWC penalty and a distillation term on the retain
    batch. `criterion` is the notebook-level loss. `model_start` is deep-copied
    and never modified.

    Args:
        name: Run label for log lines; defaults to `build_variant_name(...)`.
        base_loader, forget_loader: Non-empty loaders of (inputs, labels) for
            the retain and forget data. Each epoch runs
            min(len(base_loader), len(forget_loader)) steps.
        model_start: Model to copy and fine-tune.
        device: Device for the model copy and the batches.
        epochs, lr: Number of epochs and Adam learning rate.
        lambda_retain, lambda_forget: Weights of the two base loss terms.
        use_ewc, fisher, theta_ref, lambda_ewc: EWC switch, its Fisher diagonal
            and reference weights, and the penalty scale (active only if > 0).
        use_kd, lambda_kd, T_kd, kd_masked: Distillation switch, weight
            (active only if > 0), temperature, and whether the teacher's
            `forget_class` logit is masked.
        langevin_sigma: If > 0, std of Gaussian noise added to the gradients
            after clipping and before the optimiser step.
        grad_clip: Max gradient norm, or None to disable clipping.
        print_every: Log every this many steps.
        teacher_full: Teacher model for distillation; KD is skipped if None.
        forget_class: Class index masked in the teacher when `kd_masked`.

    Returns:
        The fine-tuned model copy, left in train mode.

    Raises:
        ValueError: if masked KD is active but `forget_class` is None.
    """
    assert base_loader and forget_loader and model_start is not None
    kd_active = use_kd and lambda_kd > 0.0 and teacher_full is not None
    if kd_active and kd_masked and forget_class is None:
        # Indexing the teacher logits with None would overwrite every class,
        # silently distilling towards a uniform teacher.
        raise ValueError("kd_masked=True requires forget_class to be set.")

    model = deepcopy(model_start).to(device)
    model.train()
    opt = optim.Adam(model.parameters(), lr=lr)
    # Endless batch streams that restart (and reshuffle) the loader on each pass.
    loop_retain = _repeat_loader(base_loader)
    loop_forget = _repeat_loader(forget_loader)
    step = 0
    steps = min(len(base_loader), len(forget_loader))  # steps per epoch

    # default name
    auto_name = build_variant_name("GAR", use_ewc, use_kd, kd_masked, langevin_sigma)
    name = name or auto_name
    print(f"[run] {name}")

    for ep in range(epochs):
        for _ in range(steps):
            xb, yb = next(loop_retain)
            xf, yf = next(loop_forget)
            xb, yb, xf, yf = xb.to(device), yb.to(device), xf.to(device), yf.to(device)

            opt.zero_grad(set_to_none=True)
            # retain task
            logits_b = model(xb)
            lb = criterion(logits_b, yb)

            # forget task (bounded): push forget-batch predictions towards uniform
            logits_f = model(xf)
            lf = uniform_kl(logits_f)

            loss = lambda_retain*lb + lambda_forget*lf

            # EWC
            if use_ewc and lambda_ewc > 0.0:
                lewc = ewc_penalty(model, fisher, theta_ref, scale=lambda_ewc)
                loss = loss + lewc
            else:
                lewc = torch.tensor(0.0, device=device)

            # KD (retain only)
            if kd_active:
                with torch.no_grad():
                    t_logits_b = teacher_full(xb)
                if kd_masked:
                    lkd = kd_loss_masked(logits_b, t_logits_b, mask_class=forget_class, T=T_kd)
                else:
                    # unmasked KD (not recommended for forgetting)
                    t_p    = torch.softmax(t_logits_b / T_kd, dim=1)
                    s_logp = F.log_softmax(logits_b / T_kd, dim=1)
                    lkd = F.kl_div(s_logp, t_p, reduction="batchmean") * (T_kd*T_kd)
                loss = loss + lambda_kd*lkd
            else:
                lkd = torch.tensor(0.0, device=device)

            loss.backward()
            if grad_clip is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            if langevin_sigma and langevin_sigma > 0:
                # Noise is injected after clipping, so it is not itself clipped.
                for p in model.parameters():
                    if p.grad is not None:
                        p.grad.add_(langevin_sigma * torch.randn_like(p.grad))
            opt.step()

            step += 1
            if step % print_every == 0:
                # float() accepts both tensors and the plain 0.0 ewc_penalty may return.
                print(f"[{name} ep {ep+1}] step {step} "
                      f"CE={lb.item():.4f} UKL={lf.item():.4f} "
                      f"EWC={float(lewc):.4f} KD={float(lkd):.4f}")

        print(f"Epoch {ep+1} done.")

    return model

In [None]:
# enable EWC & KD variants

from copy import deepcopy

# Fisher-information diagonal for EWC
def compute_fisher_diag(model, dataloader, device, criterion, n_batches=None):
    """
    Quick empirical Fisher estimate: mean over batches of the squared gradient
    of `criterion(model(x), y)` with respect to each trainable parameter.

    NOTE(review): the gradient of the batch-level loss is squared, not each
    per-sample gradient. If `criterion` averages over the batch, this is an
    approximation of E[(∇_θ log p(y|x;θ))²] rather than the exact per-sample
    value — confirm this is acceptable.

    Args:
        model: Network whose parameters are scored. It is evaluated in eval mode;
            its previous train/eval mode is restored and its gradients are
            cleared before returning.
        dataloader: Iterable of (inputs, labels) batches.
        device: Device for the batches and the returned tensors.
        criterion: Callable (logits, labels) -> scalar loss tensor.
        n_batches: Optional cap on the number of batches used (None or 0 = all).

    Returns:
        dict name → tensor (same shape as each trainable parameter). All zeros
        if no batch was processed.
    """
    was_training = model.training
    model.eval()
    fish = {n: torch.zeros_like(p, device=device)
            for n, p in model.named_parameters() if p.requires_grad}
    batches_seen = 0

    try:
        for i, (x, y) in enumerate(dataloader):
            if n_batches and i >= n_batches:
                break
            x, y = x.to(device), y.to(device)

            model.zero_grad(set_to_none=True)
            logits = model(x)
            loss = criterion(logits, y)  # NLL <- Cross-Entropy
            loss.backward()

            for n, p in model.named_parameters():
                # `n in fish` skips frozen parameters that carry a stale grad.
                if p.grad is not None and n in fish:
                    fish[n] += p.grad.detach() ** 2
            batches_seen += 1
    finally:
        # Do not leak side effects into the caller's model: stale grads would be
        # copied by any later deepcopy, and the mode switch would be silent.
        model.zero_grad(set_to_none=True)
        model.train(was_training)

    for n in fish:
        fish[n] /= max(1, batches_seen)  # mean over batches
    return fish

# (a) EWC anchor θ*: a detached snapshot of FULL's trainable weights.
theta_full_ref = {
    param_name: param.detach().clone()
    for param_name, param in model_full.named_parameters()
    if param.requires_grad
}

# (b) Diagonal Fisher of FULL measured on the retain/base loader only, so the
#     forget samples have no influence on it. Capped at 100 mini-batches.
fisher_full_base = compute_fisher_diag(
    model_full, loader_base, device, criterion, n_batches=100
)

# Knowledge-distillation teacher: an eval-mode copy of FULL with gradients disabled.
teacher_full = deepcopy(model_full).eval().to(device)
teacher_full.requires_grad_(False)

print("EWC + KD assets ready:",
      f"|Fisher keys|={len(fisher_full_base)}; teacher params frozen.")

In [None]:
# choose variants to run
# - EWC needs: theta_full_ref, fisher_full_base
# - KD  needs: teacher_full

# Each entry is one unlearning run; keys left out fall back to the defaults
# applied where the entry is consumed.
variants = [
    # 1) GAR-only (baseline)
    {
        "name": "GAR_ONLY",
        "use_ewc": False,
        "use_kd": False,
        "langevin_sigma": 0.0,
        "lambda_retain": 1.0,
        "lambda_forget": 0.8,
    },

    # 2) GAR + EWC
    {
        "name": "GAR_EWC",
        "use_ewc": True,
        "lambda_ewc": 100.0,
        "use_kd": False,
        "langevin_sigma": 0.0,
        "lambda_retain": 1.0,
        "lambda_forget": 0.8,
    },

    # Disabled configurations, kept for reference:

    # 2b) GAR + EWC + Langevin
    # dict(name="GAR_EWC_L1e-4", use_ewc=True, lambda_ewc=100.0, use_kd=False, langevin_sigma=1e-4,
    #      lambda_retain=1.0, lambda_forget=0.8),

    # 3) GAR + KD masked
    # dict(name="GAR_KD_m", use_ewc=False, use_kd=True, kd_masked=True, lambda_kd=0.25, T_kd=2.0,
    #      langevin_sigma=0.0, lambda_retain=1.0, lambda_forget=1.0),

    # 4) GAR + EWC + KD masked
    # dict(name="GAR_EWC_KDm_L1e-4", use_ewc=True, lambda_ewc=100.0, use_kd=True, kd_masked=True,
    #      lambda_kd=0.15, T_kd=3.0, langevin_sigma=1e-4, lambda_retain=1.0, lambda_forget=1.0),
]

# Run and save
for cfg in variants:
    # Resolve the name first so a malformed entry fails before any training.
    variant_name = cfg["name"]

    # The EWC/KD assets come from an optional setup cell and are looked up
    # softly so variants that do not need them still run. A variant that does
    # need them must not fall back to None: it would train as plain GAR while
    # being saved under an EWC/KD name.
    needs_ewc = cfg.get("use_ewc", False) and cfg.get("lambda_ewc", 0.0) > 0.0
    needs_kd = cfg.get("use_kd", False) and cfg.get("lambda_kd", 0.0) > 0.0
    required_assets = (
        ("fisher_full_base", needs_ewc),
        ("theta_full_ref", needs_ewc),
        ("teacher_full", needs_kd),
    )
    missing_assets = [asset for asset, needed in required_assets
                      if needed and globals().get(asset) is None]
    if missing_assets:
        raise RuntimeError(
            f"Variant {variant_name!r} needs {', '.join(missing_assets)}; "
            "run the EWC/KD setup cell first."
        )

    m = run_unlearn_variant(
        name=variant_name,
        base_loader=loader_base, forget_loader=loader_forget, model_start=model_full, device=device,
        epochs=EPOCHS, lr=LR,
        lambda_retain=cfg.get("lambda_retain", 1.0),
        lambda_forget=cfg.get("lambda_forget", 0.8),
        use_ewc=cfg.get("use_ewc", False),
        fisher=globals().get("fisher_full_base"),
        theta_ref=globals().get("theta_full_ref"),
        lambda_ewc=cfg.get("lambda_ewc", 0.0),
        use_kd=cfg.get("use_kd", False),
        lambda_kd=cfg.get("lambda_kd", 0.0),
        T_kd=cfg.get("T_kd", 2.0),
        kd_masked=cfg.get("kd_masked", True),
        langevin_sigma=cfg.get("langevin_sigma", 0.0),
        teacher_full=globals().get("teacher_full"),
        forget_class=FORGET_CLASS,
        grad_clip=5.0, print_every=100
    )
    eval_and_store(variant_name, m)
    # save compact metrics snapshot alongside checkpoint
    save_checkpoint_and_metrics(variant_name, m, RUN_RESULTS[variant_name], outdir="results")

In [None]:
# Summary table, CSV (US + DE), and plots
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def _summary_row(model_name, result, score):
    """Flatten one RUN_RESULTS entry into a summary-table row."""
    row = {"model": model_name}
    for metric in ("acc", "loss"):
        for split in ("forget", "retain", "overall"):
            row[f"{metric}_{split}"] = result[split][metric]
    row["score"] = score
    return row


rows = []
order = []

# The two reference models come first (when evaluated); they have no score.
for base_name in ["FULL", "RETRAIN_BASE"]:
    if base_name not in RUN_RESULTS:
        continue
    r = RUN_RESULTS[base_name]
    rows.append(_summary_row(base_name, r, np.nan))
    order.append(base_name)

# Every other stored run except BASE is an unlearning variant, scored against
# FULL and RETRAIN_BASE.
for name, r in RUN_RESULTS.items():
    if name in ["FULL", "RETRAIN_BASE", "BASE"]:
        continue
    s = unlearning_distance(name, full_name="FULL", retrain_name="RETRAIN_BASE", alpha=0.5)
    rows.append(_summary_row(name, r, s["score"]))
    order.append(name)

df = pd.DataFrame(rows)

# Row order: FULL, RETRAIN_BASE, then the remaining models alphabetically.
is_full = df["model"] == "FULL"
is_retrain = df["model"] == "RETRAIN_BASE"
df_sorted = pd.concat(
    [df[is_full], df[is_retrain], df[~(is_full | is_retrain)].sort_values("model")],
    ignore_index=True,
)

# Percentage versions of the headline columns, rounded to two decimals for display.
for column in ("acc_forget", "acc_retain", "score"):
    df_sorted[f"{column}_pct"] = (df_sorted[column] * 100).round(2)

print(df_sorted.to_string(index=False))

# Export the table twice: US style (comma separator, dot decimal) and
# DE style (semicolon separator, comma decimal). Floats use six decimals in both.
os.makedirs("results", exist_ok=True)
csv_us = "results/summary.csv"
csv_de = "results/summary_de.csv"

for csv_path, dialect in ((csv_us, {}), (csv_de, {"sep": ";", "decimal": ","})):
    df_sorted.to_csv(csv_path, index=False, float_format="%.6f", **dialect)

for csv_path in (csv_us, csv_de):
    print(f"[saved] {csv_path}")

# Plots
labels = df_sorted["model"].tolist()
x = np.arange(len(labels))
width = 0.38

# 1) Grouped bars: forget vs retain accuracy (percent) for every model in the table.
fig_acc, ax_acc = plt.subplots(figsize=(9, 5))
ax_acc.bar(x - width/2, df_sorted["acc_forget_pct"], width, label="Forget (airplane)")
ax_acc.bar(x + width/2, df_sorted["acc_retain_pct"], width, label="Retain (non-airplane)")
ax_acc.set_xticks(x)
ax_acc.set_xticklabels(labels, rotation=30, ha="right")
ax_acc.set_ylabel("Accuracy (%)")
ax_acc.set_title("Unlearning: Forget vs Retain Accuracy")
ax_acc.legend()
fig_acc.tight_layout()
acc_plot_path = "results/acc_bars.png"
fig_acc.savefig(acc_plot_path, dpi=150)
plt.show()
print(f"[saved] {acc_plot_path}")

# 2) Unlearning score, only for rows that have one (the reference models do not).
df_score = df_sorted[df_sorted["score"].notna()].copy()
if df_score.empty:
    print("[info] No variant scores to plot.")
else:
    x2 = np.arange(len(df_score))
    fig_score, ax_score = plt.subplots(figsize=(7, 4.5))
    ax_score.bar(x2, df_score["score_pct"], width=0.5)
    ax_score.set_xticks(x2)
    ax_score.set_xticklabels(df_score["model"].tolist(), rotation=30, ha="right")
    ax_score.set_ylabel("Unlearning Score (%)")
    ax_score.set_title("Unlearning Score vs Baselines")
    fig_score.tight_layout()
    score_plot_path = "results/score_bars.png"
    fig_score.savefig(score_plot_path, dpi=150)
    plt.show()
    print(f"[saved] {score_plot_path}")