# Membership Inference Competition (MICO) @ IEEE SatML 2023: SST-2

Welcome to the MICO competition!

This notebook will walk you through the process of creating and packaging a submission to one of the challenges.

Let's start by downloading and extracting the archives for the SST-2 challenge.
We split the challenge data into three archives, one per scenario (~80GiB each).
Downloading, verifying, and extracting them can take a while, so you may want to run the cell below only once.

**NOTE**: Public anonymous access to the competition data is disabled.
Upon registering for the competition, you will be shown URLs with embedded bearer tokens. Use them to replace the `FILL_IN_WITH_CORRECT` placeholder URLs in the cell below.

In [None]:
import os
import urllib
import urllib.error

from torchvision.datasets.utils import download_and_extract_archive

# Value that marks a URL which has not been filled in yet.
URL_PLACEHOLDER = 'FILL_IN_WITH_CORRECT'

# One entry per scenario archive: local file name, download URL and expected MD5 checksum.
# Replace every 'url' with the URL you were shown after registering.
files = [
    {
        'filename' : 'sst2_lo.tar.gz',
        'url': 'FILL_IN_WITH_CORRECT',
        'md5': '205414dd0217065dcebe2d8adc1794a3'
    },
    {
        'filename' : 'sst2_hi.tar.gz',
        'url': 'FILL_IN_WITH_CORRECT',
        'md5': 'd285958529fcad486994347478feccd2'
    },
    {
        'filename' : 'sst2_inf.tar.gz',
        'url': 'FILL_IN_WITH_CORRECT',
        'md5': '7dca44b191a0055edbde5c486a8fc671'
    }
]

# WARNING: this will download and extract three ~80GiB files, if not already present. Please save the files and avoid re-downloading them.
try:
    for f in files:
        url, filename, md5 = f['url'], f['filename'], f['md5']
        # A placeholder is not a valid URL (it has no scheme), so the download would fail with a
        # ValueError rather than the HTTPError handled below, and the hint would never be shown.
        # An archive that is already on disk is still passed on, so it can be reused without a URL.
        archive_on_disk = os.path.isfile(os.path.join(os.curdir, filename))
        if url == URL_PLACEHOLDER and not archive_on_disk:
            print(f"Skipping {filename}: its URL is still the placeholder '{URL_PLACEHOLDER}'.")
            print("Have you replaced the URLs above with the one you got after registering?")
            continue
        print(f"Downloading and extracting {filename}...")
        download_and_extract_archive(url=url, download_root=os.curdir, extract_root=None, filename=filename, md5=md5, remove_finished=False)
except urllib.error.HTTPError as e:
    # Typically raised when the server rejects the request, e.g. a missing or invalid token.
    print(e)
    print("Have you replaced the URLs above with the one you got after registering?")

In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import csv
import os
import pandas as pd
import datasets

from tqdm.notebook import tqdm, trange
from mico_competition import ChallengeDataset, load_sst2, load_model
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
import matplotlib.pyplot as plt
import torch.nn.functional as F

assert torch.cuda.is_available(), "CUDA is not available; the below would only work with CUDA"

In [None]:
def preprocess_text(D, tokenizer, max_sequence_length):
    """Tokenize the "sentence" column of `D` and drop the raw text.

    Args:
        D: object exposing `map(fn, batched=True)` whose result exposes
            `remove_columns(...)` (the notebook passes a `datasets.DatasetDict`).
        tokenizer: callable applied to each batch of sentences with
            `padding="max_length"` and `max_length=max_sequence_length`.
        max_sequence_length: length every tokenized sequence is padded to.
            No truncation is requested.

    Returns:
        The mapped data with the "sentence" column removed.
    """
    def _tokenize_batch(batch):
        # Called once per batch because `map` is invoked with batched=True.
        return tokenizer(batch["sentence"], padding="max_length", max_length=max_sequence_length)

    tokenized = D.map(_tokenize_batch, batched=True)
    return tokenized.remove_columns(["sentence"])

In [None]:
def train(rest_points):
    """Fine-tune a fresh `roberta-base` classifier on `rest_points` and return it.

    Args:
        rest_points: records accepted by `pd.DataFrame.from_records`. They must
            provide an "idx" column (dropped before training) and a "sentence"
            column (tokenized and then dropped). Presumably they also carry a
            "label" column used as the training target; confirm against
            `ChallengeDataset.rest`.

    Returns:
        The fine-tuned two-label sequence-classification model, on the CUDA
        device and switched to evaluation mode.
    """
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'

    checkpoint = 'roberta-base'
    classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    classifier = classifier.to('cuda')
    roberta_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # Wrap the records in a single-split DatasetDict, drop the index column,
    # then tokenize with padding to 67 tokens.
    records_frame = pd.DataFrame.from_records(rest_points)
    splits = datasets.DatasetDict({'train': datasets.Dataset.from_pandas(records_frame)})
    splits = splits.remove_columns("idx")
    splits = preprocess_text(splits, roberta_tokenizer, 67)
    classifier.train()

    hyperparameters = dict(
        output_dir='/tmp',
        lr_scheduler_type='constant',
        learning_rate=5e-5,
        num_train_epochs=3,
        logging_steps=10,
        save_strategy='no',
        dataloader_num_workers=8,
        per_device_train_batch_size=96,
        gradient_accumulation_steps=1,
    )
    fine_tuner = Trainer(
        model=classifier,
        args=TrainingArguments(**hyperparameters),
        train_dataset=splits['train'],
        tokenizer=roberta_tokenizer,
    )
    fine_tuner.train()

    classifier.eval()
    return classifier

In [None]:
# Challenge name (also the root directory of the extracted data) and dataset sizes.
CHALLENGE = "sst2"
LEN_TRAINING = 67349
LEN_CHALLENGE = 100

scenarios = os.listdir(CHALLENGE)
dataset = load_sst2()
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# The rest of the notebook works on this single scenario.
scenario = 'sst2_hi'
train_path = os.path.join(CHALLENGE, scenario, 'train')


def _model_index(folder_name):
    """Return the second '_'-separated field of a model folder name as an int."""
    return int(folder_name.split('_')[1])


# Only the first model folder (in numeric order) is processed: the loop exits after
# one iteration, leaving `data_path`, `model_folder`, `challenge_dataset`, `rest_points`
# and `ref_model` defined for the cells that follow.
train_model_folders = sorted(os.listdir(train_path), key=_model_index)
for m, model_folder in enumerate(tqdm(train_model_folders, desc="model")):
    data_path = os.path.join(train_path, model_folder)
    challenge_dataset = ChallengeDataset.from_path(data_path, dataset=dataset, len_training=LEN_TRAINING)
    rest_points = challenge_dataset.rest
    ref_model = train(rest_points)
    break

In [None]:
# Challenge points of the selected training model, served in batches of 10.
challenge_points = challenge_dataset.get_challenges()
challenge_dataloader = DataLoader(challenge_points, batch_size=10)

# Ground truth for these challenge points, one value per point.
solution_file = os.path.join(data_path, "solution.csv")
y = np.loadtxt(solution_file, delimiter=",")
model_path = os.path.join(train_path, model_folder)

cuda = torch.device('cuda')
with torch.no_grad():
    model = load_model('sst2', model_path).eval().cuda()
    preds, ref_preds = [], []
    for batch in challenge_dataloader:
        labels = batch['label'].to(cuda)
        encoded = tokenizer(batch['sentence'], return_tensors="pt", padding="max_length", max_length=67).to(cuda)

        # Query the challenge model first, then the reference model, and record the
        # softmax probability each one assigns to the true label of every point.
        for queried_model, collected in ((model, preds), (ref_model, ref_preds)):
            logits = queried_model(**encoded).logits
            row_index = torch.arange(logits.shape[0])
            true_label_probs = F.softmax(logits, dim=1)[row_index, labels]
            collected.extend(true_label_probs.cpu().numpy())

In [None]:
# Attack score: difference between the values collected in `preds` and `ref_preds`.
scores = np.array(preds) - np.array(ref_preds)

# Min-max rescale to [0, 1]. When every score is identical the range is zero and the
# division would yield NaNs, so fall back to a constant, uninformative 0.5 instead.
score_range = scores.max() - scores.min()
if score_range == 0:
    scores = np.full_like(scores, 0.5)
else:
    scores = (scores - scores.min()) / score_range

In [None]:
# Overlay the score distributions of the two ground-truth classes read from solution.csv.
fig, ax = plt.subplots()
ax.hist(scores[y==0], bins=30, color='blue', alpha=0.5, label='y = 0')
ax.hist(scores[y==1], bins=30, color='red', alpha=0.5, label='y = 1')
ax.set(
    title='Attack score distribution by ground-truth label',
    xlabel='normalized score',
    ylabel='number of challenge points',
)
ax.legend();

In [None]:
from mico_competition.scoring import score

# Compare the attack scores with the ground truth `y` loaded from solution.csv.
# As the cell's last expression, the value returned by `score` is displayed as output.
# NOTE(review): presumably this returns the competition metrics; confirm in mico_competition.scoring.
score(y, scores)

In [None]:
# For every model of the 'dev' phase: train a reference model on the known training points,
# score each challenge point, and write the scores to `prediction.csv` in the model folder.
dev_path = os.path.join(CHALLENGE, scenario, 'dev')
for m, model_folder in enumerate(tqdm(sorted(os.listdir(dev_path), key=lambda d: int(d.split('_')[1])), desc="model")):
    data_path = os.path.join(dev_path, model_folder)
    challenge_dataset = ChallengeDataset.from_path(data_path, dataset=dataset, len_training=LEN_TRAINING)
    challenge_points = challenge_dataset.get_challenges()
    challenge_dataloader = torch.utils.data.DataLoader(challenge_points, batch_size=10)

    # A fresh reference model is trained for each challenge model.
    rest_points = challenge_dataset.rest
    ref_model = train(rest_points)

    with torch.no_grad():
        model = load_model('sst2', data_path).eval().cuda()
        preds = []
        ref_preds = []
        for batch in challenge_dataloader:
            labels = batch['label'].to(torch.device('cuda'))
            tokenizedSequences = tokenizer(batch['sentence'], return_tensors="pt", padding="max_length", max_length=67)
            tokenizedSequences = tokenizedSequences.to(torch.device('cuda'))

            # query model: softmax probability assigned to the true label of each point
            output = model(**tokenizedSequences)
            batch_predictions = F.softmax(output.logits, dim=1)[torch.arange(output.logits.shape[0]), labels].cpu().numpy()
            preds.extend(batch_predictions)

            # ref model: same quantity from the reference model
            output = ref_model(**tokenizedSequences)
            batch_predictions = F.softmax(output.logits, dim=1)[torch.arange(output.logits.shape[0]), labels].cpu().numpy()
            ref_preds.extend(batch_predictions)

    scores = np.array(preds) - np.array(ref_preds)
    # Min-max rescale to [0, 1]; a zero range (all scores equal) would divide by zero and
    # write NaNs to the submission, so emit a constant 0.5 in that case.
    score_range = scores.max() - scores.min()
    if score_range == 0:
        scores = np.full_like(scores, 0.5)
    else:
        scores = (scores - scores.min()) / score_range

    # newline="" lets the csv module control line endings itself (avoids "\r\r\n" on Windows).
    with open(os.path.join(data_path, "prediction.csv"), "w", newline="") as f:
        csv.writer(f).writerow(list(scores))

In [None]:
# For every model of the 'final' phase: train a reference model on the known training points,
# score each challenge point, and write the scores to `prediction.csv` in the model folder.
final_path = os.path.join(CHALLENGE, scenario, 'final')
for m, model_folder in enumerate(tqdm(sorted(os.listdir(final_path), key=lambda d: int(d.split('_')[1])), desc="model")):
    data_path = os.path.join(final_path, model_folder)
    challenge_dataset = ChallengeDataset.from_path(data_path, dataset=dataset, len_training=LEN_TRAINING)
    challenge_points = challenge_dataset.get_challenges()
    challenge_dataloader = torch.utils.data.DataLoader(challenge_points, batch_size=10)

    # A fresh reference model is trained for each challenge model.
    rest_points = challenge_dataset.rest
    ref_model = train(rest_points)

    with torch.no_grad():
        model = load_model('sst2', data_path).eval().cuda()
        preds = []
        ref_preds = []
        for batch in challenge_dataloader:
            labels = batch['label'].to(torch.device('cuda'))
            tokenizedSequences = tokenizer(batch['sentence'], return_tensors="pt", padding="max_length", max_length=67)
            tokenizedSequences = tokenizedSequences.to(torch.device('cuda'))

            # query model: softmax probability assigned to the true label of each point
            output = model(**tokenizedSequences)
            batch_predictions = F.softmax(output.logits, dim=1)[torch.arange(output.logits.shape[0]), labels].cpu().numpy()
            preds.extend(batch_predictions)

            # ref model: same quantity from the reference model
            output = ref_model(**tokenizedSequences)
            batch_predictions = F.softmax(output.logits, dim=1)[torch.arange(output.logits.shape[0]), labels].cpu().numpy()
            ref_preds.extend(batch_predictions)

    scores = np.array(preds) - np.array(ref_preds)
    # Min-max rescale to [0, 1]; a zero range (all scores equal) would divide by zero and
    # write NaNs to the submission, so emit a constant 0.5 in that case.
    score_range = scores.max() - scores.min()
    if score_range == 0:
        scores = np.full_like(scores, 0.5)
    else:
        scores = (scores - scores.min()) / score_range

    # newline="" lets the csv module control line endings itself (avoids "\r\r\n" on Windows).
    with open(os.path.join(data_path, "prediction.csv"), "w", newline="") as f:
        csv.writer(f).writerow(list(scores))

In [None]:
import zipfile

# Phases and scenarios whose predictions go into the submission archive.
phases = ['dev', 'final']
scenarios = ['sst2_inf', 'sst2_lo', 'sst2_hi']

# Collect every model folder's `prediction.csv` into one zip file, stored under its
# relative path. Packaging stops at the first model folder without a prediction file.
with zipfile.ZipFile("predictions_sst2.zip", 'w') as zipf:
    for scenario in tqdm(scenarios, desc="scenario"):
        for phase in tqdm(phases, desc="phase"):
            root = os.path.join(CHALLENGE, scenario, phase)
            # Visit model folders in numeric order of the index after the underscore.
            model_folders = sorted(os.listdir(root), key=lambda name: int(name.split('_')[1]))
            for model_folder in tqdm(model_folders, desc="model"):
                path = os.path.join(root, model_folder)
                file = os.path.join(path, "prediction.csv")
                if not os.path.exists(file):
                    raise FileNotFoundError(f"`prediction.csv` not found in {path}. You need to provide predictions for all challenges")
                zipf.write(file)