<a href="https://colab.research.google.com/github/vignesh-0510/MembershipInferenceAttack/blob/main/MembershipInferenceAttack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [104]:
import numpy as np
import pandas as pd
from datasets import Dataset
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, get_linear_schedule_with_warmup
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

device = "cuda" if torch.cuda.is_available() else "cpu"

# Loading Data and Model

In [2]:
BASE_DIR = '/content/drive/MyDrive/PrivacyAwareComputing/ProjectMIALM'
model_path = os.path.join(BASE_DIR, 'victim_model_distilbert_agnews')


def load_victim(model_dir="victim_model"):
    model_path = os.path.join(BASE_DIR, model_dir)
    tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
    model = DistilBertForSequenceClassification.from_pretrained(model_path).to(device)
    model.eval()
    return model, tokenizer

model, tokenizer = load_victim('victim_model_distilbert_agnews')

In [3]:
df = pd.read_csv(os.path.join(BASE_DIR, 'validation_samples.csv'))  # your uploaded file
raw_ds = Dataset.from_pandas(df.rename(columns={"label": "labels"})[["text", "labels"]])

def tokenize_batch(batch):
    return tokenizer(batch["text"],truncation=True,padding="max_length",max_length=256)

shadow_source_ds = raw_ds.map(tokenize_batch, batched=True)
shadow_source_ds = shadow_source_ds.remove_columns(["text"])  # only model inputs

membership_data = np.loadtxt(os.path.join(BASE_DIR, 'validation_results.txt'), dtype=int)
membership_data = membership_data[membership_data[:,0].argsort()]  # sort by index

membership_labels = membership_data[:,1]  # array of 0/1
print("Loaded membership labels:", len(membership_labels))
assert len(membership_labels) == len(raw_ds)

member_idx = np.where(membership_labels == 1)[0].tolist()
nonmember_idx = np.where(membership_labels == 0)[0].tolist()

target_member_ds = shadow_source_ds.select(member_idx)
target_nonmember_ds = shadow_source_ds.select(nonmember_idx)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Loaded membership labels: 1000


In [4]:
def make_shadow_splits(dataset, num_shadows=5, shadow_train_frac=0.5, seed=0):
    """
    dataset: Tokenized Dataset
    Returns: list of (train_ds, out_ds) for each shadow
    """
    rng = np.random.default_rng(seed)
    n = len(dataset)
    indices = np.arange(n)

    shadow_pairs = []
    for s in range(num_shadows):
        rng.shuffle(indices)
        cut = int(shadow_train_frac * n)
        train_idx = indices[:cut]
        out_idx   = indices[cut:]

        train_ds = dataset.select(train_idx.tolist())
        out_ds   = dataset.select(out_idx.tolist())
        shadow_pairs.append((train_ds, out_ds))

    return shadow_pairs

In [5]:
def collate_fn(batch):
    input_ids = torch.tensor([x["input_ids"] for x in batch], dtype=torch.long)
    attention_mask = torch.tensor([x["attention_mask"] for x in batch], dtype=torch.long)
    labels = torch.tensor([x["labels"] for x in batch], dtype=torch.long)
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

def make_loader(ds, batch_size=16, shuffle=True):
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

In [78]:
class NoisyLabelCrossEntropy(nn.Module):
    def __init__(self, noise_rate, num_classes):
        super().__init__()
        self.η = noise_rate
        self.C = num_classes

    def forward(self, logits, targets):
        log_probs = F.log_softmax(logits, dim=-1)

        with torch.no_grad():
            # construct noise-aware target distribution
            true_dist = torch.zeros_like(log_probs)
            true_dist.fill_(self.η / (self.C - 1))
            true_dist.scatter_(1, targets.unsqueeze(1), 1 - self.η)

        return torch.mean(torch.sum(-true_dist * log_probs, dim=-1))

In [79]:
def train_shadow_model(victim_dir, train_ds, val_ds=None,
                       epochs=2, lr=2e-5, batch_size=16, noise_rate=0.2):
    """
    Each shadow shares same architecture & init as victim.
    """
    model = DistilBertForSequenceClassification.from_pretrained(os.path.join(BASE_DIR, victim_dir)).to(device)
    model.train()

    train_loader = make_loader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = make_loader(val_ds, batch_size=batch_size, shuffle=False) if val_ds else None

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    total_steps = epochs * len(train_loader)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps
    )

    # loss_fn = nn.CrossEntropyLoss(reduction="mean")
    loss_fn = NoisyLabelCrossEntropy(noise_rate=noise_rate, num_classes=4)

    for ep in range(epochs):
        running = 0.0
        for batch in train_loader:
            batch = {k:v.to(device) for k,v in batch.items()}
            out = model(input_ids=batch["input_ids"],
                        attention_mask=batch["attention_mask"])
            loss = loss_fn(out.logits, batch["labels"])

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            running += loss.item()

        print(f"[shadow] epoch {ep+1}/{epochs} loss={running/len(train_loader):.4f}")

    model.eval()
    return model

In [94]:
import torch.nn.functional as F

@torch.no_grad()
def extract_features(model, ds, batch_size=32):
    loader = DataLoader(ds, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    all_features = []

    loss_fn = nn.CrossEntropyLoss(reduction="none")

    for batch in loader:
        input_ids = batch["input_ids"].to(device)
        attn = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attn, output_hidden_states=True)
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1)

        hidden = outputs.hidden_states[-1]  # last layer hidden state
        cls_embed = hidden[:,0,:]

        # 1. Loss
        losses = loss_fn(logits, labels)

        # 2. Confidence
        conf = probs.max(dim=-1)[0]

        # 3. Margin
        top2 = torch.topk(probs, k=2, dim=-1).values
        margin = top2[:, 0] - top2[:, 1]

        # 4. Entropy
        entr = -(probs * probs.log()).sum(dim=-1)

        # 5. Correctness
        preds = probs.argmax(dim=-1)
        correct = (preds == labels).float()

        # 6. Logit margin
        top2 = torch.topk(probs, k=2, dim=-1).values
        logit_margin = top2[:, 0] - top2[:, 1]

        # 7. Logit norm
        logit_norm = logits.norm(dim=-1)

        # 8. Class norm

        cls_norm = cls_embed.norm(dim=1)

        feats = torch.stack([
            losses,
            conf,
            # margin,
            entr,
            correct,
            logit_margin,
            # logit_norm,
            # p_true,
            # wrong_conf,
            # cls_norm
        ], dim=1)


        # collect
        all_features.append(feats.cpu().numpy())

    # return shape (N, 5)
    return np.concatenate(all_features, axis=0)

In [95]:
def build_attack_dataset(victim_dir, shadow_pairs,
                         shadow_epochs=2, batch_size=16, shadow_noise_rate=0.2, shadow_lr=2e-5):
    """
    For each shadow:
      - train on shadow train split (members)
      - compute losses on members and non-members
    Returns: X (losses), y (membership labels)
    """
    X_list, y_list = [], []

    for i, (in_ds, out_ds) in enumerate(shadow_pairs):
        print(f"\n=== Training shadow {i+1}/{len(shadow_pairs)} ===")
        shadow_model = train_shadow_model(
            victim_dir, in_ds, epochs=shadow_epochs, batch_size=batch_size, noise_rate=shadow_noise_rate, lr=shadow_lr
        )

        in_features  = extract_features(shadow_model, in_ds, batch_size=batch_size)
        out_features = extract_features(shadow_model, out_ds, batch_size=batch_size)

        X_list.append(in_features)
        X_list.append(out_features)

        y_list.append(np.ones(len(in_features)))      # members = 1
        y_list.append(np.zeros(len(out_features)))      # members = 1

    X = np.concatenate(X_list, axis=0).reshape(-1, in_features.shape[1])  # (M,1)
    y = np.concatenate(y_list, axis=0).astype(np.int64)

    return X, y

In [106]:
class AttackMLP(nn.Module):
    def __init__(self, in_dim=5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 64),
            nn.LayerNorm(64),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.2),

            nn.Linear(64, 32),
            nn.LayerNorm(32),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.2),

            nn.Linear(32, 16),
            nn.LeakyReLU(0.1),

            nn.Linear(16, 1)
        )


    def forward(self, x):
        return self.net(x)

# def train_attack_model(X, y, epochs=20, lr=1e-4, batch_size=128):
#     X_t = torch.tensor(X, dtype=torch.float32)
#     y_t = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

#     ds = torch.utils.data.TensorDataset(X_t, y_t)
#     dl = DataLoader(ds, batch_size=batch_size, shuffle=True)

#     attack = AttackMLP(in_dim = X.shape[1]).to(device)
#     opt = torch.optim.Adam(attack.parameters(), lr=lr)
#     bce = nn.BCEWithLogitsLoss()
#     print(f"\n=== Training attack ===")
#     for ep in range(epochs):
#         running = 0.0
#         for xb, yb in dl:
#             xb, yb = xb.to(device), yb.to(device)
#             logits = attack(xb)
#             loss = bce(logits, yb)

#             opt.zero_grad()
#             loss.backward()
#             opt.step()
#             running += loss.item()
#         if ep%(epochs//10) == 0:
#           print(f"[attack] epoch {ep+1}/{epochs} loss={running/len(dl):.4f}")

#     attack.eval()
#     return attack

In [116]:
def train_attack_model(
        X, y,
        epochs=20,
        lr=1e-4,
        batch_size=128,
        lbfgs_steps=10
    ):
    X_t = torch.tensor(X, dtype=torch.float32)
    y_t = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

    ds = torch.utils.data.TensorDataset(X_t, y_t)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=True)

    attack = AttackMLP(in_dim=X.shape[1]).to(device)

    # ---- Adam optimizer ----
    opt = torch.optim.Adam(attack.parameters(), lr=lr)

    # ---- LR Scheduler ----
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        opt,
        mode="min",
        factor=0.5,
        patience=2,
        threshold=1e-3
    )

    bce = nn.BCEWithLogitsLoss()

    print(f"\n=== Training attack model (Adam + LBFGS) ===")

    for ep in range(epochs):

        attack.train()
        running = 0.0

        # ----------------------------
        #   PHASE 1 — Adam optimizer
        # ----------------------------
        for xb, yb in dl:
            xb, yb = xb.to(device), yb.to(device)

            logits = attack(xb)
            loss = bce(logits, yb)

            opt.zero_grad()
            loss.backward()
            opt.step()

            running += loss.item()

        # LR scheduler step
        scheduler.step(running / len(dl))

        # Print every few epochs
        if ep % max(1, epochs // 10) == 0:
            print(f"[Adam {ep+1}/{epochs}] loss = {running/len(dl):.4f}")

    # =====================================
    #   PHASE 2 — LBFGS refinement
    # =====================================

    print("\n=== Refining with LBFGS ===")

    # Define LBFGS optimizer (must use FULL batch!)
    lbfgs = torch.optim.LBFGS(
        attack.parameters(),
        lr=0.1,               # LBFGS uses a different internal rule
        max_iter=20,
        history_size=50,
        line_search_fn="strong_wolfe"
    )

    full_x = X_t.to(device)
    full_y = y_t.to(device)

    def closure():
        lbfgs.zero_grad()
        logits = attack(full_x)
        loss = bce(logits, full_y)
        loss.backward()
        return loss

    # Perform LBFGS refinement steps
    for i in range(lbfgs_steps):
        loss_val = lbfgs.step(closure)
        print(f"  LBFGS step {i+1}/{lbfgs_steps}: loss={loss_val.item():.6f}")

    attack.eval()
    return attack

In [118]:
@torch.no_grad()
def attack_victim(attack_model, victim_model, member_ds, nonmember_ds, batch_size=32):
    m_features = extract_features(victim_model, member_ds, batch_size=batch_size)
    nm_features = extract_features(victim_model, nonmember_ds, batch_size=batch_size)

    X_test = np.concatenate([m_features, nm_features], axis=0)
    y_true = np.concatenate([
        np.ones(len(m_features)),
        np.zeros(len(nm_features))
    ])

    X_test_t = torch.tensor(X_test, dtype=torch.float32).to(device)
    probs = torch.sigmoid(attack_model(X_test_t)).cpu().numpy().squeeze()

    y_pred = (probs > 0.5).astype(np.int64)
    results = {}
    results['accuracy'] = accuracy_score(y_true, y_pred)
    results['roc_auc'] = float(roc_auc_score(y_true, probs))
    results['precision'] = precision_score(y_true, y_pred)
    results['recall'] = recall_score(y_true, y_pred)
    results['f1'] = f1_score(y_true, y_pred)
    results['confusion_matrix'] = confusion_matrix(y_true, y_pred)
    # results['balanced_accuracy'] = balanced_accuracy_score(y_true, y_pred)

    print(f"\nVictim attack accuracy: {results['accuracy']*100:.2f}% ROC AUC: {results['roc_auc']*100:.2f}%")
    return results, probs, y_true

In [119]:
def run_loss_based_mia(
    victim_dir,
    shadow_source_ds,      # fake data pool for shadows
    target_member_ds,      # fake members for evaluation
    target_nonmember_ds,   # fake non-members for evaluation
    num_shadows=5,
    shadow_train_frac=0.5,
    shadow_noise_rate=0.2,
    shadow_epochs=10,
    batch_size=16,
    attack_model_epochs=20,
    attack_lr=1e-4,
    mode='validation'
):

    victim_model, tokenizer = load_victim(victim_dir)

    shadow_pairs = make_shadow_splits(
        shadow_source_ds,
        num_shadows=num_shadows,
        shadow_train_frac=shadow_train_frac
    )

    X_attack, y_attack = build_attack_dataset(
        victim_dir, shadow_pairs,
        shadow_epochs=shadow_epochs,
        batch_size=batch_size,
        shadow_noise_rate=shadow_noise_rate,
        shadow_lr=1e-4
    )

    attack_model = train_attack_model(X_attack, y_attack, epochs=attack_model_epochs, lr=attack_lr)
    if mode == 'validation':
      results, probs, y_true = attack_victim(
          attack_model, victim_model,
          target_member_ds, target_nonmember_ds,
          batch_size=batch_size
      )
      return attack_model, results
    else:
      return attack_model

In [122]:
mode = 'inference'
attack_model, results = run_loss_based_mia(
    victim_dir="victim_model_distilbert_agnews",
    shadow_source_ds=shadow_source_ds,
    target_member_ds=target_member_ds,
    target_nonmember_ds=target_nonmember_ds,
    num_shadows=5,
    shadow_train_frac=0.5,
    shadow_noise_rate=0.02,
    shadow_epochs=5,
    batch_size=32,
    attack_model_epochs=500,
    attack_lr=1e-1,
    mode=mode
)


=== Training shadow 1/5 ===
[shadow] epoch 1/5 loss=0.3809
[shadow] epoch 2/5 loss=0.2907
[shadow] epoch 3/5 loss=0.2314
[shadow] epoch 4/5 loss=0.2035
[shadow] epoch 5/5 loss=0.1755

=== Training shadow 2/5 ===
[shadow] epoch 1/5 loss=0.4507
[shadow] epoch 2/5 loss=0.3685
[shadow] epoch 3/5 loss=0.2820
[shadow] epoch 4/5 loss=0.2412
[shadow] epoch 5/5 loss=0.1899

=== Training shadow 3/5 ===
[shadow] epoch 1/5 loss=0.4511
[shadow] epoch 2/5 loss=0.3517
[shadow] epoch 3/5 loss=0.2858
[shadow] epoch 4/5 loss=0.2225
[shadow] epoch 5/5 loss=0.2025

=== Training shadow 4/5 ===
[shadow] epoch 1/5 loss=0.4762
[shadow] epoch 2/5 loss=0.3395
[shadow] epoch 3/5 loss=0.2594
[shadow] epoch 4/5 loss=0.2195
[shadow] epoch 5/5 loss=0.1801

=== Training shadow 5/5 ===
[shadow] epoch 1/5 loss=0.4593
[shadow] epoch 2/5 loss=0.3155
[shadow] epoch 3/5 loss=0.2551
[shadow] epoch 4/5 loss=0.2250
[shadow] epoch 5/5 loss=0.2087

=== Training attack model (Adam + LBFGS) ===
[Adam 1/1000] loss = 0.7002
[Adam 

In [123]:
results

{'accuracy': 0.394,
 'roc_auc': 0.5212104218234324,
 'precision': 0.19736842105263158,
 'recall': 0.703125,
 'f1': 0.3082191780821918,
 'confusion_matrix': array([[259, 549],
        [ 57, 135]])}

# Draw Inference

In [124]:
def mia_predict_from_df(df, tokenizer, victim_model, attack_model, batch_size=32):

    # Step 1 — Convert DataFrame to HuggingFace Dataset
    ds = Dataset.from_pandas(df[["text", "label"]].rename(columns={"label": "labels"}))

    # Step 2 — Tokenize
    ds = ds.map(lambda batch: tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    ), batched=True)

    ds.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "labels"]
    )

    # Step 3 — Extract multi-feature MIA vectors
    feats = extract_features(victim_model, ds, batch_size=batch_size)
    # feats.shape = (N, feature_dim)

    # Step 4 — Convert features to tensor
    X = torch.tensor(feats, dtype=torch.float32).to(device)

    # Step 5 — Predict membership probability
    attack_model.eval()
    probs = torch.sigmoid(attack_model(X)).cpu().numpy().squeeze()

    # Step 6 — Convert to binary membership prediction
    preds = (probs > 0.5).astype(int)

    return probs, preds

In [126]:
test_df = pd.read_csv(os.path.join(BASE_DIR, 'sampled.csv'))

In [128]:
_, preds = mia_predict_from_df(test_df, tokenizer, model, attack_model, batch_size=32)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

TypeError: only integer tensors of a single element can be converted to an index