<a href="https://colab.research.google.com/github/vignesh-0510/MembershipInferenceAttack/blob/main/MembershipInferenceAttack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import numpy as np
import pandas as pd
from datasets import Dataset
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, get_linear_schedule_with_warmup
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Prefer the GPU when PyTorch reports CUDA as available; otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [46]:
# Project root that the visible cells join checkpoint and data file names onto.
# NOTE(review): the path looks like a mounted Google Drive folder in Colab, but no
# visible cell mounts the drive — confirm it is mounted before running.
BASE_DIR = '/content/drive/MyDrive/PrivacyAwareComputing/ProjectMIALM'
# Directory of the victim checkpoint. load_victim builds its own path from BASE_DIR,
# so this global is not read by the other visible cells.
model_path = os.path.join(BASE_DIR, 'victim_model_distilbert_agnews')


def load_victim(model_dir="victim_model"):
    """
    Load the victim classifier and its tokenizer from BASE_DIR/model_dir.

    model_dir: checkpoint directory name, resolved relative to BASE_DIR
    Returns: (model, tokenizer) with the model moved to `device` and set to eval mode
    """
    checkpoint = os.path.join(BASE_DIR, model_dir)
    victim_tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
    # .to() and .eval() both return the module, so the calls can be chained.
    victim = DistilBertForSequenceClassification.from_pretrained(checkpoint).to(device).eval()
    return victim, victim_tokenizer

# Load the victim checkpoint once; `model` and `tokenizer` are read as globals by later cells.
model, tokenizer = load_victim('victim_model_distilbert_agnews')

In [7]:
import torch

# Quick sanity check: classify one hand-written sentence with the victim model.
text = "This is a test sentence.sport basketball"

# load_victim moved the model to `device`, so the encoded inputs must be moved there
# too; otherwise the forward pass fails with a device mismatch when running on a GPU.
inputs = tokenizer(text, return_tensors="pt").to(device)

# Inference only: skip building the autograd graph.
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
pred = torch.argmax(logits, dim=-1)

print("Predicted label:", pred.item())

Predicted label: 1


In [8]:
def make_shadow_splits(dataset, num_shadows=5, shadow_train_frac=0.5, seed=0):
    """
    Draw `num_shadows` random member / non-member partitions of `dataset`.

    dataset: object supporting len() and .select(list_of_indices)
             (an HF Dataset, already tokenized)
    num_shadows: how many (train, out) pairs to produce
    shadow_train_frac: fraction of the rows placed in each shadow's train split
    seed: seed for the NumPy generator, making the splits reproducible
    Returns: list of (train_ds, out_ds), one pair per shadow
    """
    generator = np.random.default_rng(seed)
    total = len(dataset)
    order = np.arange(total)
    boundary = int(shadow_train_frac * total)

    def _draw_pair():
        # The same index array is reshuffled in place for every shadow, so each
        # partition depends on the generator state left by the previous one.
        generator.shuffle(order)
        member_rows = order[:boundary].tolist()
        held_out_rows = order[boundary:].tolist()
        return dataset.select(member_rows), dataset.select(held_out_rows)

    return [_draw_pair() for _ in range(num_shadows)]

In [9]:
def collate_fn(batch):
    """
    Stack a list of example dicts into one dict of int64 tensors.

    batch: list of dicts, each holding "input_ids", "attention_mask" and "labels"
    Returns: dict with the same three keys, each a torch.long tensor whose
             leading dimension is the batch size
    """
    def _stack(field):
        return torch.tensor([example[field] for example in batch], dtype=torch.long)

    return {field: _stack(field) for field in ("input_ids", "attention_mask", "labels")}

def make_loader(ds, batch_size=16, shuffle=True):
    """Wrap `ds` in a DataLoader that batches examples with `collate_fn`."""
    loader_options = dict(batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)
    return DataLoader(ds, **loader_options)

In [50]:
def _mean_batch_loss(model, loader, loss_fn):
    """Return the mean of the per-batch losses of `model` over `loader`, without gradients."""
    model.eval()
    total = 0.0
    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(input_ids=batch["input_ids"],
                           attention_mask=batch["attention_mask"]).logits
            total += loss_fn(logits, batch["labels"]).item()
    return total / len(loader)


def train_shadow_model(victim_dir, train_ds, val_ds=None,
                       epochs=2, lr=2e-5, batch_size=16):
    """
    Fine-tune one shadow model on `train_ds` and return it in eval mode.

    The shadow is loaded from the checkpoint at BASE_DIR/victim_dir, so it has the
    victim's architecture and starts from whatever weights that checkpoint stores.
    NOTE(review): if that checkpoint is the already fine-tuned victim, the shadows
    start from weights that have seen the victim's training data — confirm this is
    the intended initialisation.

    victim_dir: checkpoint directory name under BASE_DIR
    train_ds: tokenized dataset the shadow is trained on (its "members")
    val_ds: optional tokenized dataset; when given and non-empty, its mean batch
            loss is reported after every epoch
    epochs: number of passes over train_ds
    lr: AdamW learning rate (linear schedule, 10% warmup)
    batch_size: batch size for both loaders
    Returns: the trained DistilBertForSequenceClassification, in eval mode
    """
    model = DistilBertForSequenceClassification.from_pretrained(os.path.join(BASE_DIR, victim_dir)).to(device)

    train_loader = make_loader(train_ds, batch_size=batch_size, shuffle=True)
    # Same truthiness as the original `if val_ds`, spelled out: an empty validation
    # set is treated as "no validation set".
    has_val = val_ds is not None and len(val_ds) > 0
    val_loader = make_loader(val_ds, batch_size=batch_size, shuffle=False) if has_val else None

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    total_steps = epochs * len(train_loader)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps
    )

    loss_fn = nn.CrossEntropyLoss(reduction="mean")

    for ep in range(epochs):
        # Re-enter train mode every epoch because validation switches the model to eval.
        model.train()
        running = 0.0
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            out = model(input_ids=batch["input_ids"],
                        attention_mask=batch["attention_mask"])
            loss = loss_fn(out.logits, batch["labels"])

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            running += loss.item()

        message = f"[shadow] epoch {ep+1}/{epochs} loss={running/len(train_loader):.4f}"
        if val_loader is not None:
            # Previously val_ds was accepted but silently ignored.
            message += f" val_loss={_mean_batch_loss(model, val_loader, loss_fn):.4f}"
        print(message)

    model.eval()
    return model

In [10]:
def per_sample_losses(model, ds, batch_size=32):
    """
    Compute the cross-entropy loss of `model` on every example of `ds`.

    model: classifier whose forward output exposes `.logits`
    ds: tokenized dataset accepted by make_loader
    batch_size: evaluation batch size (examples are not shuffled)
    Returns: NumPy array of shape (N,), one loss per example in dataset order
    """
    chunks = []
    with torch.no_grad():
        for batch in make_loader(ds, batch_size=batch_size, shuffle=False):
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            targets = batch["labels"].to(device)

            logits = model(input_ids=ids, attention_mask=mask).logits
            # reduction="none" keeps one loss value per example, shape (B,)
            chunks.append(F.cross_entropy(logits, targets, reduction="none").cpu())

    return torch.cat(chunks, dim=0).numpy()

In [11]:
def build_attack_dataset(victim_dir, shadow_pairs,
                         shadow_epochs=2, batch_size=16):
    """
    Turn shadow-model losses into a labelled training set for the attack model.

    For every (members, non-members) pair a shadow model is trained on the members,
    then its per-sample loss is recorded on both halves.

    victim_dir: checkpoint directory name passed through to train_shadow_model
    shadow_pairs: list of (in_ds, out_ds) as produced by make_shadow_splits
    shadow_epochs: training epochs per shadow model
    batch_size: batch size for shadow training and loss evaluation
    Returns: X of shape (M, 1) holding losses, y of shape (M,) holding int64
             membership labels (1 = member, 0 = non-member)
    """
    features, targets = [], []
    n_shadows = len(shadow_pairs)

    for number, (member_ds, nonmember_ds) in enumerate(shadow_pairs, start=1):
        print(f"\n=== Training shadow {number}/{n_shadows} ===")
        shadow = train_shadow_model(
            victim_dir, member_ds, epochs=shadow_epochs, batch_size=batch_size
        )

        # Members first, then non-members, so features and targets stay aligned.
        for split, membership in ((member_ds, 1), (nonmember_ds, 0)):
            losses = per_sample_losses(shadow, split, batch_size=batch_size)
            features.append(losses)
            targets.append(np.full_like(losses, membership))

    X = np.concatenate(features, axis=0).reshape(-1, 1)  # (M,1)
    y = np.concatenate(targets, axis=0).astype(np.int64)
    return X, y

In [None]:
class AttackMLP(nn.Module):
    """
    Small two-layer MLP that maps attack features to one membership logit.

    in_features: number of input features per sample (default 1: the loss value)
    hidden_dim: width of the hidden layer (default 32)

    The defaults reproduce the original fixed 1 -> 32 -> 1 network, and the
    state_dict keys ("net.0.*", "net.2.*") are unchanged.
    """

    def __init__(self, in_features=1, hidden_dim=32):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, x):
        """Return raw logits of shape (N, 1) for input `x` of shape (N, in_features)."""
        return self.net(x)

def train_attack_model(X, y, epochs=20, lr=1e-3, batch_size=128):
    """
    Fit an AttackMLP to predict membership from the features in X.

    X: array-like of shape (M, 1) with per-sample losses
    y: array-like of shape (M,) with 0/1 membership labels
    epochs: passes over the data
    lr: Adam learning rate
    batch_size: mini-batch size (batches are shuffled)
    Returns: the trained AttackMLP on `device`, in eval mode
    """
    features = torch.tensor(X, dtype=torch.float32)
    # BCEWithLogitsLoss needs targets shaped like the (B, 1) logits.
    targets = torch.tensor(y, dtype=torch.float32).unsqueeze(1)
    loader = DataLoader(
        torch.utils.data.TensorDataset(features, targets),
        batch_size=batch_size,
        shuffle=True,
    )

    attack = AttackMLP().to(device)
    optimizer = torch.optim.Adam(attack.parameters(), lr=lr)
    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(1, epochs + 1):
        epoch_loss = 0.0
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            loss = criterion(attack(inputs), labels)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        print(f"[attack] epoch {epoch}/{epochs} loss={epoch_loss/len(loader):.4f}")

    attack.eval()
    return attack

In [15]:
def attack_victim(attack_model, victim_model, member_ds, nonmember_ds, batch_size=32):
    """
    Run the trained attack model against the victim and score its predictions.

    attack_model: model mapping an (N, 1) loss tensor to membership logits
    victim_model: classifier whose per-sample losses are attacked
    member_ds: tokenized examples labelled as members (label 1)
    nonmember_ds: tokenized examples labelled as non-members (label 0)
    batch_size: batch size used when computing victim losses
    Returns: (results, probs, y_true) where results holds accuracy, roc_auc,
             precision, recall, f1 and confusion_matrix; probs are the predicted
             membership probabilities; y_true are the 0/1 labels, members first
    """
    member_losses = per_sample_losses(victim_model, member_ds, batch_size=batch_size)
    nonmember_losses = per_sample_losses(victim_model, nonmember_ds, batch_size=batch_size)

    features = np.concatenate([member_losses, nonmember_losses], axis=0).reshape(-1,1)
    y_true = np.concatenate([np.ones_like(member_losses), np.zeros_like(nonmember_losses)])

    with torch.no_grad():
        attack_logits = attack_model(torch.tensor(features, dtype=torch.float32).to(device))
        probs = torch.sigmoid(attack_logits).cpu().numpy().squeeze()

    # A sample is predicted to be a member when its probability exceeds 0.5.
    y_pred = (probs > 0.5).astype(np.int64)
    results = {
        'accuracy': accuracy_score(y_true, y_pred),
        'roc_auc': roc_auc_score(y_true, probs),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
        'confusion_matrix': confusion_matrix(y_true, y_pred),
    }
    # results['balanced_accuracy'] = balanced_accuracy_score(y_true, y_pred)

    print(f"\nVictim attack accuracy: {results['accuracy']*100:.2f}%")
    return results, probs, y_true

In [51]:
def run_loss_based_mia(
    victim_dir,
    shadow_source_ds,
    target_member_ds,
    target_nonmember_ds,
    num_shadows=5,
    shadow_train_frac=0.5,
    shadow_epochs=2,
    batch_size=16,
):
    """
    Run the whole loss-based membership inference pipeline.

    Steps: load the victim, split the shadow pool, train the shadow models and
    collect their losses, fit the attack model, then evaluate it on the victim.

    victim_dir: checkpoint directory name under BASE_DIR
    shadow_source_ds: tokenized data pool the shadow splits are drawn from
    target_member_ds: tokenized examples treated as members during evaluation
    target_nonmember_ds: tokenized examples treated as non-members during evaluation
    num_shadows: number of shadow models
    shadow_train_frac: fraction of the pool used to train each shadow
    shadow_epochs: training epochs per shadow
    batch_size: batch size for shadow training and loss evaluation
    Returns: (attack_model, results) where results is the metrics dict from attack_victim
    """
    # The tokenizer returned alongside the victim is not needed here.
    victim_model, _tokenizer = load_victim(victim_dir)

    splits = make_shadow_splits(
        shadow_source_ds,
        num_shadows=num_shadows,
        shadow_train_frac=shadow_train_frac,
    )
    attack_inputs = build_attack_dataset(
        victim_dir, splits,
        shadow_epochs=shadow_epochs,
        batch_size=batch_size,
    )
    attack_model = train_attack_model(*attack_inputs)

    # attack_victim also returns probabilities and labels; only the metrics are kept.
    results = attack_victim(
        attack_model, victim_model,
        target_member_ds, target_nonmember_ds,
        batch_size=batch_size,
    )[0]

    return attack_model, results

In [52]:
# MODE = 'validation'
# Read the labelled samples; the CSV has a "text" column and an integer "label" column.
df = pd.read_csv(os.path.join(BASE_DIR, 'validation_samples.csv'))  # file stored under BASE_DIR
# Keep only text + label, renaming "label" to "labels" — the key collate_fn reads.
raw_ds = Dataset.from_pandas(df.rename(columns={"label": "labels"})[["text", "labels"]])

print(df.head())

                                                text  label
0  Boeing Eyes In-Flight Live TV Connexion broadb...      3
1  Oil Climbs to Two-Week High (Reuters) Reuters ...      2
2  MSN sets movie times on smart watches Microsof...      3
3  More DS Units for Japan Nintendo will ship 400...      3
4  Russia to release massacre files Russia agrees...      0


In [53]:
def tokenize_batch(batch):
    """
    Tokenize the "text" field of a batch with the global victim tokenizer.

    Every sequence is truncated or padded to exactly 256 tokens.
    Returns: the tokenizer output for the batch
    """
    encode_options = {"truncation": True, "padding": "max_length", "max_length": 256}
    return tokenizer(batch["text"], **encode_options)

In [54]:
# Tokenize every row, then drop the raw "text" column so only model inputs remain.
shadow_source_ds = raw_ds.map(tokenize_batch, batched=True).remove_columns(["text"])

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [55]:

# Each row of validation_results.txt is read as two integers: column 0 is used as
# the sample index and column 1 as the 0/1 membership flag.
membership_data = np.loadtxt(os.path.join(BASE_DIR, 'validation_results.txt'), dtype=int)
membership_data = membership_data[membership_data[:,0].argsort()]  # sort by index

membership_labels = membership_data[:,1]  # array of 0/1
print("Loaded membership labels:", len(membership_labels))
assert len(membership_labels) == len(raw_ds)
assert len(membership_labels) == len(shadow_source_ds)

member_idx = np.where(membership_labels == 1)[0].tolist()
nonmember_idx = np.where(membership_labels == 0)[0].tolist()

# Select from the tokenized dataset, not raw_ds: raw_ds only has "text" and "labels",
# so collate_fn would raise KeyError('input_ids') when the victim is attacked.
target_member_ds = shadow_source_ds.select(member_idx)
target_nonmember_ds = shadow_source_ds.select(nonmember_idx)

Loaded membership labels: 1000


In [None]:
# Run the full attack: 5 shadow models, each trained for 2 epochs on a random half of
# shadow_source_ds, then evaluate the resulting attack model on the target splits.
attack_model, results = run_loss_based_mia(
    victim_dir="victim_model_distilbert_agnews",
    shadow_source_ds=shadow_source_ds,
    target_member_ds=target_member_ds,
    target_nonmember_ds=target_nonmember_ds,
    num_shadows=5,
    shadow_train_frac=0.5,
    shadow_epochs=2,
    batch_size=16
)


=== Training shadow 1/5 ===


In [36]:
hf auth login

SyntaxError: invalid syntax (ipython-input-2988405326.py, line 1)