In [1]:
import os
import torch
import numpy as np
import pickle as pkl
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import defaultdict
from transformers import AutoTokenizer
from sklearn.metrics import auc, roc_curve


tok = AutoTokenizer.from_pretrained("hf_models/pythia-4.9B")

In [2]:
def sweep(x, score):
    fpr, tpr, _ = roc_curve(x, -score)
    acc = np.max(1-(fpr+(1-tpr))/2)
    return fpr, tpr, auc(fpr, tpr), acc


def evaluate(es, fpr_threshold=0.05):
    answers = []
    metric2predictions = defaultdict(list)
    for e in es:
        answers.append(e["label"])
        for metric in e["pred"].keys():
            metric2predictions[metric].append(e["pred"][metric])

    for metric, predictions in metric2predictions.items():
        fpr, tpr,  auc, acc = sweep(np.array(answers, dtype=bool), np.array(predictions))
        low = tpr[np.where(fpr < fpr_threshold)[0][-1]]
        print("Attack %s AUC %.4f, Accuracy %.4f, TPR@5FPR of %.4f\n" %(metric, auc, acc, low))

In [3]:
data_dir = "output/13_0.8"
data_file = "pythia-6-9B"

## original prediction

In [4]:
data_path = os.path.join(data_dir, f"{data_file}.pkl")
with open(data_path, "rb") as f:
    data = pkl.load(f)

In [7]:
print(data[0]["input_ids"])
print(len(data[0]["prob_dis"]), data[0]["prob_dis"])
print(data[0]["text"])

[10404, 187, 187, 1145, 10404, 13, 3690, 15, 310, 247, 4156, 1345, 2567, 1754, 275, 2615, 790, 6121, 13, 11637, 15, 733, 24357, 285, 10169, 10796, 73, 656, 982, 11369, 9864, 3694, 323, 1885, 2216, 13, 5175, 285, 4254, 15, 743, 10404, 369, 11420, 275, 10333, 407, 2516, 4235, 29963, 15, 4235, 29963, 4211, 521, 1600, 275, 253, 2567, 281, 18875, 5347, 1346, 275, 9725, 15, 743, 10404, 2427, 1345, 327, 13170, 4877, 50, 275, 8441, 15, 496, 253, 5307, 84, 13, 743, 10404, 1160, 7418, 45587, 273, 643, 11369, 2216, 4413, 13, 28635, 3081, 4302, 323, 6514, 8062, 13, 23598, 2216, 13, 285, 643, 12057, 1783, 15, 743, 10404, 3395, 247, 4445, 273, 253, 13170, 4877, 50, 14, 2313, 3605, 327, 4565, 3495, 13, 6247, 15, 187, 187, 9873, 1831, 366, 2892, 187, 187, 3980, 304, 968, 187, 510, 2934, 323, 743, 10404, 369, 806, 20913, 407, 2516, 4235, 29963, 1223, 2444, 387, 253, 4255, 272, 5967, 22042, 34322, 18196, 275, 253, 11994, 84, 15, 2058, 253, 673, 13, 19414, 2684, 6486, 3284, 1783, 313, 12297, 34, 10, 407,

## PPL and Min-k Prob

In [8]:
examples = []
for d in tqdm(data):
    e = {}
    pred = {}
    tar_ppl = np.exp(-np.mean(d["prob_dis"].numpy()))
    pred["ppl"] = tar_ppl # larger for nonmember

    k = int(len(d["input_ids"]) * 0.2)
    min_k_pro = np.sort(d["prob_dis"].numpy())[:k]
    pred[f"min_20% prob"] = -np.mean(min_k_pro).item()  # larger for nonmember
    
    e["pred"] = pred
    e["label"] = d["label"]
    examples.append(e)
    
evaluate(examples)

100%|████████████████████████████████████| 2000/2000 [00:00<00:00, 21576.36it/s]

Attack ppl AUC 0.5208, Accuracy 0.5290, TPR@5FPR of 0.0420

Attack min_20% prob AUC 0.5257, Accuracy 0.5325, TPR@5FPR of 0.0440






## prediction conditioned on 1-length prefix

In [9]:
data_path = os.path.join(data_dir, f"{data_file}-1.pkl")
with open(data_path, "rb") as f:
    data_1 = pkl.load(f)
    
for d, d_1 in zip(data, data_1):
    pred = {}
    diff = d_1["prob_dis"][:, -1] / d["prob_dis"][0:]
    pred["pre_len_1"] = torch.mean(diff).item()
    probs_dedup = [d["prob_dis"][0]]
    probs_1_dedup = [d_1["prob_dis"][0, -1]]
    for i, input_id in enumerate(d["input_ids"][1:]):
        if input_id not in d["input_ids"][:i]:
            probs_dedup.append(d["prob_dis"][i+1])
            probs_1_dedup.append(d_1["prob_dis"][i+1, -1])
    diff_dedup = -(torch.tensor(probs_dedup)-torch.tensor(probs_1_dedup) / torch.tensor(probs_dedup))
    pred["pre_len_1_dedup"] = torch.mean(diff_dedup).item()
    d["pred"] = pred
    
evaluate(data)

Attack pre_len_1 AUC 0.5606, Accuracy 0.5545, TPR@5FPR of 0.0590

Attack pre_len_1_dedup AUC 0.5618, Accuracy 0.5530, TPR@5FPR of 0.0460



## prediction conditioned on 2-length prefix

In [10]:
data_path = os.path.join(data_dir, f"{data_file}-2.pkl")
with open(data_path, "rb") as f:
    data_2 = pkl.load(f)
    
for d, d_2 in zip(data, data_2):
    pred = {}
    diff = d_2["prob_dis"][:, -1] / d["prob_dis"][1:]
    pred["pre_len_2"] = torch.mean(diff).item()
    probs_dedup = [d["prob_dis"][1]]
    probs_2_dedup = [d_2["prob_dis"][0, -1]]
    for i, input_id in enumerate(d["input_ids"][2:]):
        if input_id not in d["input_ids"][:i-1]:
            probs_dedup.append(d["prob_dis"][i+1])
            probs_2_dedup.append(d_2["prob_dis"][i+1, -1])
    diff_dedup = torch.tensor(probs_2_dedup) / torch.tensor(probs_dedup)
    pred["pre_len_2_dedup"] = torch.mean(diff_dedup).item()
    d["pred"] = pred
    
evaluate(data)

Attack pre_len_2 AUC 0.5664, Accuracy 0.5580, TPR@5FPR of 0.0640

Attack pre_len_2_dedup AUC 0.5227, Accuracy 0.5255, TPR@5FPR of 0.0490



## prediction conditioned on 3-length prefix

In [11]:
data_path = os.path.join(data_dir, f"{data_file}-3.pkl")
with open(data_path, "rb") as f:
    data_3 = pkl.load(f)
    
for d, d_3 in zip(data, data_3):
    pred = {}
    diff = d_3["prob_dis"][:, -1] / d["prob_dis"][2:]
    pred["pre_len_3"] = torch.mean(diff).item()
    probs_dedup = [d["prob_dis"][2]]
    probs_3_dedup = [d_3["prob_dis"][0, -1]]
    for i, input_id in enumerate(d["input_ids"][3:]):
        if input_id not in d["input_ids"][:i-2]:
            probs_dedup.append(d["prob_dis"][i+1])
            probs_3_dedup.append(d_3["prob_dis"][i+1, -1])
    diff_dedup = torch.tensor(probs_3_dedup) / torch.tensor(probs_dedup)
    pred["pre_len_3_dedup"] = torch.mean(diff_dedup).item()
    d["pred"] = pred
    
evaluate(data)

Attack pre_len_3 AUC 0.5685, Accuracy 0.5565, TPR@5FPR of 0.0750

Attack pre_len_3_dedup AUC 0.5225, Accuracy 0.5260, TPR@5FPR of 0.0610



## prediction conditioned on 4-length prefix

In [12]:
data_path = os.path.join(data_dir, f"{data_file}-4.pkl")
with open(data_path, "rb") as f:
    data_4 = pkl.load(f)
    
for d, d_4 in zip(data, data_4):
    pred = {}
    diff = d_4["prob_dis"][:, -1] / d["prob_dis"][3:]
    pred["pre_len_4"] = torch.mean(diff).item()
    probs_dedup = [d["prob_dis"][3]]
    probs_4_dedup = [d_4["prob_dis"][0, -1]]
    for i, input_id in enumerate(d["input_ids"][4:]):
        if input_id not in d["input_ids"][:i-3]:
            probs_dedup.append(d["prob_dis"][i+1])
            probs_4_dedup.append(d_4["prob_dis"][i+1, -1])
    diff_dedup = torch.tensor(probs_4_dedup) / torch.tensor(probs_dedup)
    pred["pre_len_4_dedup"] = torch.mean(diff_dedup).item()
    d["pred"] = pred
    
evaluate(data)

Attack pre_len_4 AUC 0.5708, Accuracy 0.5585, TPR@5FPR of 0.0920

Attack pre_len_4_dedup AUC 0.5369, Accuracy 0.5305, TPR@5FPR of 0.0720



## prediction conditioned on 5-length prefix

In [13]:
data_path = os.path.join(data_dir, f"{data_file}-5.pkl")
with open(data_path, "rb") as f:
    data_5 = pkl.load(f)
    
for d, d_5 in zip(data, data_5):
    pred = {}
    diff = d_5["prob_dis"][:, -1] / d["prob_dis"][4:]
    pred["pre_len_5"] = torch.mean(diff).item()
    probs_dedup = [d["prob_dis"][4]]
    probs_5_dedup = [d_5["prob_dis"][0, -1]]
    for i, input_id in enumerate(d["input_ids"][5:]):
        if input_id not in d["input_ids"][:i-4]:
            probs_dedup.append(d["prob_dis"][i+1])
            probs_5_dedup.append(d_5["prob_dis"][i+1, -1])
    diff_dedup = torch.tensor(probs_5_dedup) / torch.tensor(probs_dedup)
    pred["pre_len_5_dedup"] = torch.mean(diff_dedup).item()
    d["pred"] = pred
    
evaluate(data)

Attack pre_len_5 AUC 0.5742, Accuracy 0.5630, TPR@5FPR of 0.1030

Attack pre_len_5_dedup AUC 0.5345, Accuracy 0.5295, TPR@5FPR of 0.0650



## prediction conditioned on 6-length prefix

In [14]:
data_path = os.path.join(data_dir, f"{data_file}-6.pkl")
with open(data_path, "rb") as f:
    data_6 = pkl.load(f)
    
for d, d_6 in zip(data, data_6):
    pred = {}
    diff = d_6["prob_dis"][:, -1] / d["prob_dis"][5:]
    pred["pre_len_6"] = torch.mean(diff).item()
    probs_dedup = [d["prob_dis"][5]]
    probs_6_dedup = [d_6["prob_dis"][0, -1]]
    for i, input_id in enumerate(d["input_ids"][6:]):
        if input_id not in d["input_ids"][:i-5]:
            probs_dedup.append(d["prob_dis"][i+1])
            probs_6_dedup.append(d_6["prob_dis"][i+1, -1])
    diff_dedup = torch.tensor(probs_6_dedup) / torch.tensor(probs_dedup)
    pred["pre_len_6_dedup"] = torch.mean(diff_dedup).item()
    d["pred"] = pred
    
evaluate(data)

Attack pre_len_6 AUC 0.5828, Accuracy 0.5645, TPR@5FPR of 0.0780

Attack pre_len_6_dedup AUC 0.5255, Accuracy 0.5315, TPR@5FPR of 0.0420



## prediction conditioned on 7-length prefix

In [15]:
data_path = os.path.join(data_dir, f"{data_file}-7.pkl")
with open(data_path, "rb") as f:
    data_7 = pkl.load(f)
    
for d, d_7 in zip(data, data_7):
    pred = {}
    diff = d_7["prob_dis"][:, -1] / d["prob_dis"][6:]
    pred["pre_len_7"] = torch.mean(diff).item()
    probs_dedup = [d["prob_dis"][6]]
    probs_7_dedup = [d_7["prob_dis"][0, -1]]
    for i, input_id in enumerate(d["input_ids"][7:]):
        if input_id not in d["input_ids"][:i-6]:
            probs_dedup.append(d["prob_dis"][i+1])
            probs_7_dedup.append(d_7["prob_dis"][i+1, -1])
    diff_dedup = torch.tensor(probs_7_dedup) / torch.tensor(probs_dedup)
    pred["pre_len_7_dedup"] = torch.mean(diff_dedup).item()
    d["pred"] = pred
    
evaluate(data)

Attack pre_len_7 AUC 0.5933, Accuracy 0.5795, TPR@5FPR of 0.0740

Attack pre_len_7_dedup AUC 0.5201, Accuracy 0.5295, TPR@5FPR of 0.0550



## average prediction conditioned on [1-7]-length

In [16]:
for d, d_1, d_2, d_3, d_4, d_5, d_6, d_7 in zip(data, data_1, data_2, data_3, data_4, data_5, data_6, data_7):
    pred = {}
    d_avg = d_1["prob_dis"][6:, -1] + d_2["prob_dis"][5:, -1] + d_3["prob_dis"][4:, -1] + d_4["prob_dis"][3:, -1] + d_5["prob_dis"][2:, -1] + d_6["prob_dis"][1:, -1] + d_7["prob_dis"][:, -1]
    d_avg = d_avg / 7
    diff = d_avg / d["prob_dis"][6:]
    pred["pre_len_[1-7]"] = torch.mean(diff).item()
    probs_dedup = [d["prob_dis"][6]]
    probs_7_dedup = [d_avg[0]]
    for i, input_id in enumerate(d["input_ids"][7:]):
        if input_id not in d["input_ids"][:i-6]:
            probs_dedup.append(d["prob_dis"][i+1])
            probs_7_dedup.append(d_avg[i+1])
    diff_dedup = torch.tensor(probs_7_dedup) / torch.tensor(probs_dedup)
    pred["pre_len_[1-7]_dedup"] = torch.mean(diff_dedup).item()
    d["pred"] = pred
    
evaluate(data)

Attack pre_len_[1-7] AUC 0.5827, Accuracy 0.5685, TPR@5FPR of 0.0950

Attack pre_len_[1-7]_dedup AUC 0.5155, Accuracy 0.5265, TPR@5FPR of 0.0540

