In [None]:
# Mount Google Drive at /content/drive; later cells read and write
# checkpoints and metric reports under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from pathlib import Path

def get_weights_file_path(config, epoch: str):
    """Return the Drive path of the checkpoint file for ``epoch``.

    Args:
        config: accepted for interface compatibility; not consulted here.
        epoch: label used as the file stem (e.g. ``"04"``).

    Returns:
        The checkpoint path as a plain string ending in ``<epoch>.pt``.
    """
    checkpoint_dir = "/content/drive/MyDrive/Model Training/"
    return f"{checkpoint_dir}{epoch}.pt"


In [None]:
import datasets
import time
import os

In [None]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Downloading torchmetrics-1.8.2-py3-none-any.whl (983 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.15.2-py3-none-any.whl (29 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.15.2 torchmetrics-1.8.2


In [None]:
import torch
from torch.utils.data import Dataset

class AG_NewsDataset(Dataset):
    """Map-style dataset producing fixed-length token-id tensors with labels.

    Each row's text is tokenized, truncated to ``seq_len`` ids and right-padded
    with the tokenizer's ``[PAD]`` id (0 if the tokenizer has no such token).
    """

    def __init__(self, ds, tokenizer, seq_len):
        """
        ds: indexable collection of rows exposing 'text' and 'label' keys
            (e.g., a Hugging Face AG News split or a list of dicts)
        tokenizer: tokenizer with encode() and token_to_id() methods
        seq_len: max sequence length
        """
        super().__init__()
        self.ds = ds
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        pad_id = tokenizer.token_to_id("[PAD]")
        self.pad_id = 0 if pad_id is None else pad_id

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'label' tensors."""
        # Fetch the row once; indexing the underlying dataset can be costly.
        row = self.ds[idx]

        ids = list(self.tokenizer.encode(row['text']).ids[:self.seq_len])
        n_real = len(ids)
        pad_len = self.seq_len - n_real

        # The mask is built from the number of real tokens rather than by
        # comparing ids with pad_id, so a genuine token that happens to share
        # the pad id (e.g. id 0 when the fallback is used) is not masked out.
        attention_mask = [1] * n_real + [0] * pad_len
        ids = ids + [self.pad_id] * pad_len

        return {
            "input_ids": torch.tensor(ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "label": torch.tensor(row['label'], dtype=torch.long)
        }

In [None]:
import warnings
from collections import defaultdict
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torchmetrics
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import Dataset, DataLoader, random_split, Subset
from tqdm import tqdm

In [None]:
from torchmetrics.classification import MulticlassAccuracy

In [None]:
def evaluate_per_class(config, model, dataloader, device="cuda"):
    """Compute per-class and overall accuracy of ``model`` over ``dataloader``.

    Args:
        config: mapping providing 'num_classes', the length of the returned array.
        model: callable as ``model(input_ids, attention_mask)`` returning logits
            whose dim 1 indexes classes.
        dataloader: iterable of dict batches holding 'input_ids',
            'attention_mask' and a label tensor under one of
            'label', 'labels', 'target', 'targets' or 'y'.
        device: device to evaluate on. A CUDA device is replaced by the CPU
            when CUDA is not available.

    Returns:
        ``(per_class_acc, total_acc, array)`` where ``per_class_acc`` maps each
        class seen in the data to its accuracy, ``total_acc`` is the overall
        accuracy (0.0 when the loader yields no samples) and ``array`` is a
        NumPy vector of length ``config['num_classes']`` holding the same
        per-class values (0 for classes that never appear).

    Raises:
        KeyError: if a batch lacks a label key, 'input_ids' or 'attention_mask'.
    """
    # The default is "cuda"; fall back so the function also runs on CPU-only hosts.
    if str(device).startswith("cuda") and not torch.cuda.is_available():
        device = "cpu"

    model.eval()
    model.to(device)

    possible_label_keys = ("label", "labels", "target", "targets", "y")

    correct_per_class = defaultdict(int)
    total_per_class = defaultdict(int)

    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for batch in dataloader:
            # ---- validate batch layout ----
            label_key = next((k for k in possible_label_keys if k in batch), None)
            if label_key is None:
                raise KeyError(f"No label key found. Keys = {batch.keys()}")
            for required in ("input_ids", "attention_mask"):
                if required not in batch:
                    raise KeyError(f"No {required} in batch. Keys = {batch.keys()}")

            labels = batch[label_key].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # ---- forward pass ----
            logits = model(input_ids, attention_mask)
            preds = torch.argmax(logits, dim=1)

            # ---- per-class tracking ----
            # Convert to Python lists once per batch instead of calling
            # .item() per element, which forces a device sync every time.
            label_list = labels.tolist()
            for p, y in zip(preds.tolist(), label_list):
                total_per_class[y] += 1
                if p == y:
                    correct_per_class[y] += 1
                    total_correct += 1
            total_samples += len(label_list)

    array = np.zeros(config['num_classes'])
    per_class_acc = {}

    for c in sorted(total_per_class):
        acc = correct_per_class[c] / total_per_class[c]
        per_class_acc[c] = acc
        array[c] = acc

    # An empty loader has no accuracy to report; avoid dividing by zero.
    total_acc = total_correct / total_samples if total_samples else 0.0

    return (per_class_acc, total_acc, array)


In [None]:
def get_or_build_tokenizer(ds):
    """Return the word-level tokenizer, training and caching it if necessary.

    If ``imdb_tokenizer.json`` exists in the working directory it is loaded
    as-is and ``ds`` is not read. Otherwise a whitespace-split WordLevel
    tokenizer is trained on the ``"text"`` field of every item in ``ds``
    (tokens seen fewer than twice are dropped), saved to that file and returned.
    """
    tokenizer_path = Path("imdb_tokenizer.json")

    # Cached tokenizer available: reuse it.
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.train_from_iterator(
        (row["text"] for row in ds),
        trainer=WordLevelTrainer(
            special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
            min_frequency=2,
        ),
    )
    tokenizer.save(str(tokenizer_path))
    return tokenizer

class TransformerClassifier(nn.Module):
    """Transformer encoder with a learned CLS token and a linear classification head.

    Args:
        vocab_size: number of rows in the token embedding table.
        d_model: embedding / encoder width.
        nhead: attention heads per encoder layer.
        num_layers: number of stacked encoder layers.
        num_classes: size of the output logit vector.
        seq_len: stored on the module; not used by ``forward``.
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers, num_classes, seq_len):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.seq_len = seq_len
        self.cls_token = nn.Parameter(torch.randn(1, 1, d_model))
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, input_ids, attention_mask=None):
        """Return class logits of shape (B, num_classes).

        Args:
            input_ids: (B, seq) tensor of token ids.
            attention_mask: optional (B, seq) tensor, non-zero for real tokens
                and 0 for padding. Padded positions are excluded as attention
                keys. When ``None`` every position is attended to.
        """
        x = self.embed(input_ids)  # (B, seq, d_model)
        B = x.size(0)
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat([cls_tokens, x], dim=1)  # (B, seq+1, d_model)

        # Build the key-padding mask (True = ignore). The prepended CLS slot is
        # always visible, so no row can end up fully masked. This adds no
        # parameters, so existing state_dicts still load unchanged.
        key_padding_mask = None
        if attention_mask is not None:
            cls_visible = torch.zeros(B, 1, dtype=torch.bool, device=x.device)
            is_pad = attention_mask.to(x.device) == 0
            key_padding_mask = torch.cat([cls_visible, is_pad], dim=1)  # (B, seq+1)

        # transformer expects (seq, batch, d_model)
        x = x.transpose(0, 1)
        x = self.encoder(x, src_key_padding_mask=key_padding_mask)  # (seq+1, B, d_model)
        x = x[0]  # take CLS token
        logits = self.fc(x)
        return logits

def train_imdb(config, test=()):
    """Train the AG News classifier, or fine-tune a checkpoint without one class.

    Despite its name this function works on the ``ag_news`` train split.

    Args:
        config: dict with 'seq_len', 'batch_size', 'num_epochs', 'd_model',
            'nhead', 'num_layers' and 'lr'; 'num_classes' is optional
            (defaults to 4, the number of AG News classes).
        test: empty for a normal training run. Otherwise a sequence
            ``(class_to_forget, checkpoint_path, output_path)``: the checkpoint
            is loaded, fine-tuned on every example whose label differs from
            ``class_to_forget`` and its weights are written to ``output_path``.

    Returns:
        ``(model, loader)``. For a normal run ``loader`` is the shuffled
        training loader (90% split, seed 42); in unlearning mode it is the
        shuffled loader over the retained classes.
    """
    warnings.filterwarnings("ignore")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    num_classes = config.get('num_classes', 4)

    ds_raw = load_dataset("ag_news", split="train")
    tokenizer = get_or_build_tokenizer(ds_raw)

    if test:
        class_to_forget, source_ckpt, target_path = test[0], test[1], test[2]

        # Keep only rows of the retained classes; filter() avoids materialising
        # every row as a Python dict in memory.
        retain_rows = ds_raw.filter(lambda ex: ex['label'] != class_to_forget)
        retain_ds = AG_NewsDataset(retain_rows, tokenizer, config['seq_len'])
        filtered_loader = DataLoader(retain_ds, batch_size=config['batch_size'], shuffle=True)

        unlearn_model = _build_classifier(config, tokenizer, num_classes, device)
        # map_location lets a GPU-trained checkpoint load on a CPU-only runtime.
        ckpt = torch.load(source_ckpt, map_location=device)
        unlearn_model.load_state_dict(ckpt["model_state_dict"])
        unlearn_model.train()

        # Fine-tune on dataset without class
        optimizer = torch.optim.Adam(unlearn_model.parameters(), lr=1e-4)
        loss_fn = nn.CrossEntropyLoss()

        for epoch in range(config['num_epochs']):
            for data in filtered_loader:
                input_ids = data['input_ids'].to(device)
                attention_mask = data['attention_mask'].to(device)
                labels = data['label'].to(device)

                optimizer.zero_grad()
                logits = unlearn_model(input_ids, attention_mask)
                loss = loss_fn(logits, labels)  # train normally
                loss.backward()
                optimizer.step()

        # NOTE: this file holds the bare state_dict, unlike the per-epoch
        # checkpoints below, which wrap it under 'model_state_dict'.
        torch.save(unlearn_model.state_dict(), target_path)
        # Report the path that was written, not the checkpoint that was read.
        print(f'Unlearned model saved to {target_path}')

        return unlearn_model, filtered_loader

    # Use datasets' own train_test_split for robust splitting
    split_dataset = ds_raw.train_test_split(test_size=0.1, seed=42)  # seed for reproducibility
    train_ds = AG_NewsDataset(split_dataset['train'], tokenizer, config['seq_len'])
    val_ds = AG_NewsDataset(split_dataset['test'], tokenizer, config['seq_len'])
    train_loader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=config['batch_size'], shuffle=False)

    model = _build_classifier(config, tokenizer, num_classes, device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])
    loss_fn = nn.CrossEntropyLoss()

    metric_acc = MulticlassAccuracy(num_classes=num_classes).to(device)

    for epoch in range(config['num_epochs']):
        model.train()
        loop = tqdm(train_loader, desc=f"Epoch {epoch:02d}")
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids, attention_mask)
            loss = loss_fn(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loop.set_postfix(loss=loss.item())

        # Accumulate the metric over the whole validation set and compute it
        # once, instead of averaging per-batch values (which over-weights a
        # short final batch). Reset so epochs do not leak into each other.
        model.eval()
        metric_acc.reset()
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)
                logits = model(input_ids, attention_mask)
                metric_acc.update(logits, labels)
        val_acc = metric_acc.compute().item()
        print(f"Epoch {epoch:02d} Validation Accuracy: {val_acc:.4f}")

        model_filename = get_weights_file_path(config, f"{epoch:02d}")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, model_filename)

    return model, train_loader


def _build_classifier(config, tokenizer, num_classes, device):
    """Instantiate a TransformerClassifier from ``config`` and move it to ``device``."""
    return TransformerClassifier(
        vocab_size=tokenizer.get_vocab_size(),
        d_model=config['d_model'],
        nhead=config['nhead'],
        num_layers=config['num_layers'],
        num_classes=num_classes,
        seq_len=config['seq_len']
    ).to(device)

In [None]:
# Hyperparameters for the baseline run; train_imdb reads the model/optimizer
# settings and evaluate_per_class reads 'num_classes'.
config = {
  'seq_len': 128,
  'batch_size': 64,
  'num_epochs': 5,
  'd_model': 128,
  'nhead': 4,
  'num_layers': 2,
  'lr': 1e-3,
  'num_classes': 4,
  'experiment_name': "AG-News_transformer"
}

# Time the full training call: perf_counter measures wall-clock time,
# process_time measures CPU time of this process. The start/stop values are
# read again by the reporting cells further down.
t1_start = time.perf_counter()
t2_start = time.process_time()
model, loader = train_imdb(config)
t1_stop = time.perf_counter()
t2_stop = time.process_time()

Using device: cuda


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Epoch 00: 100%|██████████| 1688/1688 [00:51<00:00, 32.59it/s, loss=0.363]


Epoch 00 Validation Accuracy: 0.8903


Epoch 01: 100%|██████████| 1688/1688 [00:50<00:00, 33.39it/s, loss=0.338]


Epoch 01 Validation Accuracy: 0.8994


Epoch 02: 100%|██████████| 1688/1688 [00:50<00:00, 33.24it/s, loss=0.229]


Epoch 02 Validation Accuracy: 0.9120


Epoch 03: 100%|██████████| 1688/1688 [00:51<00:00, 32.65it/s, loss=0.199]


Epoch 03 Validation Accuracy: 0.9117


Epoch 04: 100%|██████████| 1688/1688 [00:50<00:00, 33.20it/s, loss=0.157]


Epoch 04 Validation Accuracy: 0.9137


In [None]:
def train_relearn(config, train_loader, val_loader):
    """Train a fresh classifier from scratch and time the run.

    Relies on the module-level ``tokenizer`` for the vocabulary size, so the
    cell that builds it must have been executed first.

    Args:
        config: dict with 'd_model', 'nhead', 'num_layers', 'num_classes',
            'seq_len' and 'num_epochs'.
        train_loader: batches with 'input_ids', 'attention_mask' and 'label'.
        val_loader: loader passed to ``evaluate_per_class`` after every epoch.

    Returns:
        ``(model, acc_metrics, wall_seconds, cpu_seconds)`` where
        ``acc_metrics`` is the ``evaluate_per_class`` result of the last epoch
        (``None`` when ``config['num_epochs']`` is 0). Both timings cover
        training plus the per-epoch evaluation.
    """
    # `device` only exists as a local inside train_imdb, so resolve it here
    # instead of depending on a global that is never defined.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = TransformerClassifier(
        vocab_size=tokenizer.get_vocab_size(),
        d_model=config['d_model'],
        nhead=config['nhead'],
        num_layers=config['num_layers'],
        num_classes=config['num_classes'],
        seq_len=config['seq_len']
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    loss_fn = nn.CrossEntropyLoss()

    # Stays None if the epoch loop never runs.
    acc_metrics = None

    t1_start = time.perf_counter()
    t2_start = time.process_time()

    for epoch in range(config['num_epochs']):
        model.train()
        loop = tqdm(train_loader, desc=f"Epoch {epoch:02d}")
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids, attention_mask)
            loss = loss_fn(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loop.set_postfix(loss=loss.item())

        # Evaluate on the device the model was trained on. No checkpoint is
        # written here: the per-epoch file names would collide with the
        # baseline checkpoints produced by train_imdb.
        acc_metrics = evaluate_per_class(config, model, val_loader, device=device)

    t1_stop = time.perf_counter()
    t2_stop = time.process_time()

    return model, acc_metrics, t1_stop - t1_start, t2_stop - t2_start

In [None]:
# Sweep over forget-set sizes: for each fraction, repeat n times
#   1. carve a random forget/retain split out of the training data,
#   2. retrain from scratch on the retain set (train_relearn),
#   3. unlearn from the baseline checkpoint (train_unlearn),
# then compare the run-averaged per-class accuracies of the two models.
#
# NOTE: this cell needs MODEL_PATH, save_accuracy_report and save_mse_report,
# which are defined in cells further down, and train_unlearn, which is not
# defined in this part of the notebook — run those definitions first.
config = {
  'seq_len': 128,
  'batch_size': 64,
  'num_epochs': 5,
  'd_model': 128,
  'nhead': 4,
  'num_layers': 2,
  'lr': 1e-3,
  'num_classes': 4,
  'experiment_name': "AG-News_transformer"
}

n = 3  # repetitions per forget fraction

METRICS_DIR = "/content/drive/MyDrive/Model Training/Metrics"
os.makedirs(METRICS_DIR, exist_ok=True)  # open(..., "w") does not create directories

ds_raw = load_dataset("ag_news", split="train")
tokenizer = get_or_build_tokenizer(ds_raw)
split_dataset = ds_raw.train_test_split(test_size=0.1, seed=42)
train_ds_raw = split_dataset['train']
val_ds_raw = split_dataset['test']

val_loader = DataLoader(
    AG_NewsDataset(val_ds_raw, tokenizer, config['seq_len']),
    batch_size=config['batch_size'],
    shuffle=False
)

percent_list = [0.05, 0.125, .20, .30, .50]

for percent in percent_list:
    trained_accuracies = np.zeros(config['num_classes'])
    unlearned_accuracies = np.zeros(config['num_classes'])
    for i in range(n):
        # Split the training portion only, so validation rows never leak into
        # training; seeding with the repetition index makes reruns repeatable.
        new_split = train_ds_raw.train_test_split(test_size=percent, seed=i)

        forget_loader = DataLoader(
            AG_NewsDataset(new_split['test'], tokenizer, config['seq_len']),
            batch_size=config['batch_size'],
            shuffle=False
        )
        # The retain set is trained on, so it is shuffled.
        filtered_loader = DataLoader(
            AG_NewsDataset(new_split['train'], tokenizer, config['seq_len']),
            batch_size=config['batch_size'],
            shuffle=True
        )

        # Loop-local names are distinct from `model`, `t1_stop` and `t2_stop`,
        # which hold the baseline run's results and are read by later cells.
        relearn_model, acc_metrics, relearn_wall, relearn_cpu = train_relearn(
            config, filtered_loader, val_loader)
        sweep_unlearn_model, unlearn_acc_metrics, unlearn_wall, unlearn_cpu = train_unlearn(
            config, MODEL_PATH, filtered_loader, val_loader)

        # np.add returns a new array; accumulate in place instead.
        trained_accuracies += acc_metrics[2]
        unlearned_accuracies += unlearn_acc_metrics[2]

        # save_accuracy_report takes the per-class dict and the total
        # accuracy separately, not the whole metrics tuple.
        save_accuracy_report(f"{METRICS_DIR}/{percent}+{i}+relearn",
                             acc_metrics[0], acc_metrics[1], relearn_wall, relearn_cpu)
        save_accuracy_report(f"{METRICS_DIR}/{percent}+{i}+unlearn",
                             unlearn_acc_metrics[0], unlearn_acc_metrics[1], unlearn_wall, unlearn_cpu)

    trained_accuracies = np.divide(trained_accuracies, n)
    unlearned_accuracies = np.divide(unlearned_accuracies, n)
    # sqrt(diff ** 2) is the absolute per-class gap between the two averages.
    mse_per_class = np.sqrt(np.subtract(trained_accuracies, unlearned_accuracies) ** 2)

    save_mse_report(f"{METRICS_DIR}/{percent}+mse_data", trained_accuracies, unlearned_accuracies, mse_per_class)


In [None]:
def save_mse_report(filename, trained_acc, unlearned_acc, mse_per_class):
    """Write a per-class comparison report to ``filename``.

    One line per class: ``<index> <trained> <unlearned> <error>``, each value
    with six decimals, followed by a final line holding the mean of
    ``mse_per_class``. An existing file is overwritten.
    """
    rows = zip(trained_acc, unlearned_acc, mse_per_class)
    with open(filename, "w") as out:
        out.writelines(
            f"{idx} {trained:.6f} {unlearned:.6f} {err:.6f}\n"
            for idx, (trained, unlearned, err) in enumerate(rows)
        )
        # final line = mean mse
        out.write(f"{np.mean(mse_per_class):.6f}\n")

In [None]:
# Checkpoint to start from, output file for the fine-tuned weights, and the
# label whose examples are excluded from fine-tuning.
MODEL_PATH = '/content/drive/MyDrive/Model Training/04.pt'
UNLEARN_PATH = '/content/drive/MyDrive/Model Training/Unlearning/next.pt'
CLASS_TO_FORGET = 2

# train_imdb indexes this tuple as (class to drop, checkpoint to load, path to save to).
test = (CLASS_TO_FORGET, MODEL_PATH, UNLEARN_PATH)

# Time the unlearning run (wall clock and CPU time); the values are read by
# the reporting cells below.
t3_start = time.perf_counter()
t4_start = time.process_time()
unlearn_model, _ = train_imdb(config, test)
t3_stop = time.perf_counter()
t4_stop = time.process_time()

Using device: cuda
Unlearned model saved to /content/drive/MyDrive/Model Training/04.pt


In [None]:
def save_accuracy_report(filename, per_class, total, time, cpu_time):
    """Write an accuracy/timing report to ``filename`` (overwriting it).

    File layout, one value per line: the dataset tag ``ag_news``, the device
    name (GPU name when CUDA is available, otherwise ``CPU``), the number of
    classes, each per-class accuracy (4 decimals, in ``per_class`` order), the
    total accuracy (4 decimals), then ``time`` and ``cpu_time`` as given.

    Args:
        filename: destination path.
        per_class: mapping of class -> accuracy.
        total: overall accuracy.
        time: elapsed time value to record.
        cpu_time: CPU time value to record.
    """
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        print("CUDA is available. GPU is enabled.")
    device_name = torch.cuda.get_device_name(0) if cuda_ready else 'CPU'

    with open(filename, "w") as out:
        for header_line in ("ag_news", f"{device_name}", f"{len(per_class)}"):
            out.write(header_line + "\n")

        for acc in per_class.values():
            out.write(f"{acc:.4f}\n")

        for trailer_line in (f"{total:.4f}", f"{time}", f"{cpu_time}"):
            out.write(trailer_line + "\n")

In [None]:
# Evaluate the baseline and the unlearned model on `loader`, the loader
# returned by train_imdb(config) — i.e. on training data, not the held-out split.
# evaluate_per_class takes `config` first and returns three values
# (per-class dict, total accuracy, per-class array); the array is not needed here.
per_class, total, _ = evaluate_per_class(config, model, loader)

print("Per-class accuracy:")
for c, acc in per_class.items():
    print(f"Class {c}: {acc:.4f}")

print(f"\nTotal accuracy: {total:.4f}\n")
print(f"Execution Time:\n{t1_stop-t1_start}\n")
print(f"CPU Time:\n{t2_stop-t2_start}\n")

wo_class, nu_total, _ = evaluate_per_class(config, unlearn_model, loader)

print("Per-class accuracy:")
for c, acc in wo_class.items():
    print(f"Class {c}: {acc:.4f}")

print(f"\nTotal accuracy: {nu_total:.4f}")
print(f"Execution Time:\n{t3_stop-t3_start}\n")
print(f"CPU Time:\n{t4_stop-t4_start}\n")

Per-class accuracy:
Class 0: 0.9691
Class 1: 0.9903
Class 2: 0.9482
Class 3: 0.9546

Total accuracy: 0.9655

Execution Time:
522.1479296289999

CPU Time:
467.727784973

Per-class accuracy:
Class 0: 0.9915
Class 1: 0.9995
Class 2: 0.0355
Class 3: 0.9969

Total accuracy: 0.7550
Execution Time:
233.81614492100005

CPU Time:
223.81399843699995



In [None]:
# Save the pre-/post-unlearning reports under the first unused index.
# NOTE: the cell defining save_accuracy_report must have been run first.
REPORT_DIR = "/content/drive/MyDrive/Model Training"


def _next_report_path(directory, prefix):
    """Return '<directory>/<prefix>_<i>.txt' for the smallest i >= 1 not yet on disk."""
    existing = set(os.listdir(directory))
    index = 1
    # Compare full file names including the '.txt' suffix; without it the
    # check never matches and report 1 would be overwritten on every run.
    while f"{prefix}_{index}.txt" in existing:
        index += 1
    return os.path.join(directory, f"{prefix}_{index}.txt")


save_accuracy_report(_next_report_path(REPORT_DIR, "pre_unlearning"),
                     per_class, total,
                     t1_stop-t1_start, t2_stop-t2_start)
save_accuracy_report(_next_report_path(REPORT_DIR, "post_unlearning"),
                     wo_class, nu_total,
                     t3_stop-t3_start, t4_stop-t4_start)

NameError: name 'save_accuracy_report' is not defined