In [None]:
import os

import numpy as np
from tqdm import tqdm

from transformers import logging

logging.set_verbosity_error()

import benchmarks

from datasets import Dataset, load_dataset, concatenate_datasets

# auto-reload
%load_ext autoreload
%autoreload 2

# Concatenate the different datasets and run duplicate detection

In [None]:
# Read the duplicate-annotated parquet file of every benchmark and, in the same
# expression, set a constant "benchmark" field on each row naming its benchmark.
hellaswag = Dataset.from_parquet("de-duplication/hellaswag_duplicates.parquet").map(
    lambda row: {"benchmark": "hellaswag"}
)
arc_easy = Dataset.from_parquet("de-duplication/arc_easy_duplicates.parquet").map(
    lambda row: {"benchmark": "arc-easy"}
)
boolq = Dataset.from_parquet("de-duplication/boolq_duplicates.parquet").map(
    lambda row: {"benchmark": "boolq"}
)
social_i_qa = Dataset.from_parquet("de-duplication/social_i_qa_duplicates.parquet").map(
    lambda row: {"benchmark": "social_i_qa"}
)
piqa = Dataset.from_parquet("de-duplication/piqa_duplicates.parquet").map(
    lambda row: {"benchmark": "piqa"}
)
mmlu = Dataset.from_parquet("de-duplication/mmlu_duplicates.parquet").map(
    lambda row: {"benchmark": "mmlu"}
)
winogrande = Dataset.from_parquet("de-duplication/winogrande_duplicates.parquet").map(
    lambda row: {"benchmark": "winogrande"}
)

In [None]:
len(arc_easy), len(boolq), len(hellaswag), len(social_i_qa), len(piqa), len(mmlu), len(winogrande)

In [None]:
# For each benchmark, print its name, its row count and the sum of its
# "is_duplicate" column (the number of rows flagged as duplicates).
for _name, _subset in (
    ("hellaswag", hellaswag),
    ("arc_easy", arc_easy),
    ("boolq", boolq),
    ("social_i_qa", social_i_qa),
    ("piqa", piqa),
    ("mmlu", mmlu),
    ("winogrande", winogrande),
):
    print(_name, len(_subset), np.sum(_subset["is_duplicate"]))

In [None]:
# Per benchmark: keep only the rows that are not flagged as duplicates, then
# remove the two duplicate-bookkeeping columns.
arc_easy = arc_easy.filter(lambda row: not row["is_duplicate"]).remove_columns(["is_duplicate", "has_duplicate"])
boolq = boolq.filter(lambda row: not row["is_duplicate"]).remove_columns(["is_duplicate", "has_duplicate"])
hellaswag = hellaswag.filter(lambda row: not row["is_duplicate"]).remove_columns(["is_duplicate", "has_duplicate"])
social_i_qa = social_i_qa.filter(lambda row: not row["is_duplicate"]).remove_columns(["is_duplicate", "has_duplicate"])
piqa = piqa.filter(lambda row: not row["is_duplicate"]).remove_columns(["is_duplicate", "has_duplicate"])
mmlu = mmlu.filter(lambda row: not row["is_duplicate"]).remove_columns(["is_duplicate", "has_duplicate"])
winogrande = winogrande.filter(lambda row: not row["is_duplicate"]).remove_columns(["is_duplicate", "has_duplicate"])

# Display the remaining row count of every benchmark.
len(arc_easy), len(boolq), len(hellaswag), len(social_i_qa), len(piqa), len(mmlu), len(winogrande)

In [None]:
# piqa formatting issue: cast piqa's "label" column to the feature type that
# boolq uses for its "label" column.
# NOTE(review): presumably needed so that all benchmarks share one schema when
# they are concatenated later -- confirm.
piqa = piqa.cast_column("label", boolq.features['label'])
# Display the resulting schema to check the cast.
piqa.features

## load result of de-duplication

In [None]:
# Load the combined, duplicate-annotated table written by the de-duplication run.
all_datasets = Dataset.from_parquet("de-duplication/all_datasets.parquet")

# Split the combined table back into one dataset per benchmark.
arc_easy = all_datasets.filter(lambda row: row["benchmark"] == "arc-easy")
boolq = all_datasets.filter(lambda row: row["benchmark"] == "boolq")
hellaswag = all_datasets.filter(lambda row: row["benchmark"] == "hellaswag")
social_i_qa = all_datasets.filter(lambda row: row["benchmark"] == "social_i_qa")
piqa = all_datasets.filter(lambda row: row["benchmark"] == "piqa")
mmlu = all_datasets.filter(lambda row: row["benchmark"] == "mmlu")
winogrande = all_datasets.filter(lambda row: row["benchmark"] == "winogrande")

# Report "<name> <rows> <sum of is_duplicate>" for every benchmark.
for _name, _subset in (
    ("arc_easy", arc_easy),
    ("boolq", boolq),
    ("hellaswag", hellaswag),
    ("social_i_qa", social_i_qa),
    ("piqa", piqa),
    ("mmlu", mmlu),
    ("winogrande", winogrande),
):
    print(_name, len(_subset), np.sum(_subset["is_duplicate"]))

# Keep only the rows not flagged as duplicates, then remove the two
# duplicate-bookkeeping columns.
arc_easy = arc_easy.filter(lambda row: not row["is_duplicate"]).remove_columns(["is_duplicate", "has_duplicate"])
boolq = boolq.filter(lambda row: not row["is_duplicate"]).remove_columns(["is_duplicate", "has_duplicate"])
hellaswag = hellaswag.filter(lambda row: not row["is_duplicate"]).remove_columns(["is_duplicate", "has_duplicate"])
social_i_qa = social_i_qa.filter(lambda row: not row["is_duplicate"]).remove_columns(["is_duplicate", "has_duplicate"])
piqa = piqa.filter(lambda row: not row["is_duplicate"]).remove_columns(["is_duplicate", "has_duplicate"])
mmlu = mmlu.filter(lambda row: not row["is_duplicate"]).remove_columns(["is_duplicate", "has_duplicate"])
winogrande = winogrande.filter(lambda row: not row["is_duplicate"]).remove_columns(["is_duplicate", "has_duplicate"])

In [None]:
len(arc_easy), len(boolq), len(hellaswag), len(social_i_qa), len(piqa), len(mmlu), len(winogrande), len(arc_easy) + len(boolq) + len(hellaswag) + len(social_i_qa) + len(piqa) + len(mmlu) + len(winogrande)

In [61]:
# Drop observations from the largest datasets: cap winogrande at a fixed number
# of randomly chosen rows. The shuffle is seeded, so the same rows are kept on
# every run.
WINOGRANDE_MAX_ROWS = 8000
# min(...) keeps the cell working when fewer rows than the cap are available;
# a hard-coded range(8000) would then ask select() for indices that do not exist.
winogrande = winogrande.shuffle(seed=42).select(range(min(WINOGRANDE_MAX_ROWS, len(winogrande))))

In [None]:
# Combined row count of the seven benchmark datasets.
total_observations = sum(
    len(benchmark_ds)
    for benchmark_ds in (arc_easy, boolq, hellaswag, social_i_qa, piqa, mmlu, winogrande)
)
total_observations

In [None]:
# Concatenate all benchmarks into a single dataset.
all_datasets = concatenate_datasets([arc_easy, boolq, hellaswag, social_i_qa, piqa, mmlu, winogrande])

# Save it so that fuzzy string matching between the different questions can be
# performed on the cluster.
all_datasets.to_parquet('concatenated_datasets.parquet')

# Show the number of concatenated rows. This has to be the last expression of
# the cell: a bare expression in the middle of a cell is never displayed.
len(all_datasets)

### for each benchmark, report the random baseline

In [None]:
# For every benchmark, print its name (read from the first row) followed by the
# value that benchmarks.random_baseline returns for that dataset.
for ben_ds in (arc_easy, boolq, hellaswag, social_i_qa, piqa, mmlu, winogrande):
    first_row = ben_ds[0]
    print(first_row["benchmark"])
    print(benchmarks.random_baseline(ben_ds))

### set up the schedule and split the different datasets into parts of the needed size, stratified by accuracy

In [None]:
# The schedule used for the contamination: one (k, v) pair per split, where v is
# the number of samples the split receives.
# NOTE(review): k presumably is the number of times each sample of that split is
# repeated -- confirm against the code that consumes the schedule.
schedule = [
    (0, 10000),
    (4, 8000),
    (12, 5000),
    (36, 2000),
    (144, 2000),
    (4, 8000),
    (12, 5000),
    (36, 2000),
    (144, 2000),
]

# Total number of samples across all schedule entries.
total_samples = np.sum([size for _, size in schedule])

# Display: total samples, sum of k*v over the schedule, number of entries.
_weighted_total = np.sum([factor * size for factor, size in schedule])
total_samples, _weighted_total, len(schedule)

In [None]:
# Fraction of the total sample count that each schedule entry accounts for.
_schedule_sizes = np.array([size for _, size in schedule])
percentages = _schedule_sizes / total_samples

percentages

In [67]:
# Collect the per-benchmark datasets in a list; the splitting code below iterates over it.
# NOTE(review): this rebinds `all_datasets`, which earlier cells use for a single
# concatenated Dataset -- the name now refers to a list of datasets.
all_datasets = [arc_easy, boolq, hellaswag, social_i_qa, piqa, mmlu, winogrande]

In [43]:
# randomly sample from all datasets to reduce the overall size to total_samples
# all_datasets = [dataset.shuffle(seed=42).select(range(int(len(dataset) * total_samples / total_observations))) for dataset in all_datasets]

In [68]:
# we split each benchmark in different parts according to percentage sizes of the total
from sklearn.model_selection import train_test_split

def split_dataset(dataset, percentages, debug=False, random_state=42):
    """Cut ``dataset`` into consecutive random parts sized by ``percentages``.

    Every entry of ``percentages`` is a fraction of the *original* dataset. For
    each entry, ``train_test_split`` carves that fraction (rescaled relative to
    the fraction still left) out of the remaining rows. As soon as an entry
    equals the remaining fraction (within 1e-6), the whole remainder becomes
    that split and iteration stops. No ``stratify`` argument is passed, so the
    cuts are plain random splits.

    Args:
        dataset: Object supporting ``len()`` and ``.select(indices)``.
        percentages: Iterable of fractions, one per desired split.
        debug: When True, print the number of rows of every split.
        random_state: Seed forwarded to ``train_test_split`` for every cut.

    Returns:
        List of splits in the order of ``percentages``.
    """
    parts = []
    remainder = dataset
    fraction_left = 1.0
    for share in percentages:
        # This entry consumes everything that is left: hand over the remainder as is.
        if abs(fraction_left - share) < 1e-6:
            parts.append(remainder)
            break
        taken_idx, kept_idx = train_test_split(
            range(len(remainder)),
            train_size=share / fraction_left,
            random_state=random_state,
        )
        parts.append(remainder.select(taken_idx))
        remainder = remainder.select(kept_idx)
        fraction_left -= share
    # Debug mode only reports the split sizes.
    if debug:
        for part in parts:
            print("Split with ", len(part), " obervations.")
    return parts

# split_dataset(all_datasets[0], percentages, True)

### now we split all the different datasets according to the percentages given by the schedule

In [69]:
# One list of splits per benchmark: dataset_splits[j][i] is the i-th split
# (following the order of `percentages`) of the j-th dataset in all_datasets.
dataset_splits = [split_dataset(dataset, percentages) for dataset in all_datasets]

In [70]:
# Build one dataset per schedule entry by concatenating the split at the same
# position from every benchmark.
schedule_dataset = [
    concatenate_datasets([benchmark_splits[position] for benchmark_splits in dataset_splits])
    for position in range(len(schedule))
]

In [71]:
# Trim every schedule dataset to exactly the sample count of its schedule entry,
# keeping a seeded random selection of rows.
schedule_dataset = [
    schedule_dataset[position].shuffle(seed=42).select(range(schedule[position][1]))
    for position in range(min(len(schedule_dataset), len(schedule)))
]

In [None]:
# For every schedule dataset, print its row count and the value returned by
# benchmarks.random_baseline.
for dataset in schedule_dataset:
    n_rows = len(dataset)
    baseline = benchmarks.random_baseline(dataset)
    print(n_rows, baseline)

In [None]:
# For each schedule dataset, print the percentage of its rows that comes from
# each benchmark (a percentage, not an absolute count).
for dataset in schedule_dataset:
    # Read and convert the column once instead of once per benchmark name.
    benchmark_column = dataset['benchmark']
    benchmark_array = np.array(benchmark_column)
    print({k: 100 * np.sum(benchmark_array == k) / len(dataset) for k in set(benchmark_column)})

In [74]:
# For every schedule dataset, collect the option that each row's label points to
# (the entry of 'options' at index 'label').
contamination_data = [
    [example['options'][example['label']] for example in split]
    for split in schedule_dataset
]

In [None]:
# Print the mean len() of the contamination entries of each schedule dataset.
for ds in contamination_data:
    entry_lengths = list(map(len, ds))
    print(np.mean(entry_lengths))

In [None]:
schedule

In [None]:
# Tag every row with the index of the schedule split it belongs to; the value
# is stored in the 'split-id' column.
for idx, dataset in enumerate(schedule_dataset):
    split_ids = [idx] * len(dataset)
    schedule_dataset[idx] = dataset = dataset.add_column('split-id', split_ids)

In [78]:
# Number all questions consecutively across the schedule datasets so that every
# row gets its own 'question-id'; next_id is the first id not yet handed out.
next_id = 0
for idx, dataset in enumerate(schedule_dataset):
    id_range = range(next_id, next_id + len(dataset))
    schedule_dataset[idx] = dataset = dataset.add_column('question-id', id_range)
    next_id = id_range.stop

In [None]:
# Save every schedule dataset as "<idx>-<schedule entry>.parquet" inside
# contamination_splits/. The schedule entry is a tuple, so a file name looks
# like "0-(0, 10000).parquet".
# Create the output directory first so the cell also works when it is missing.
os.makedirs("contamination_splits", exist_ok=True)
for idx, dataset in enumerate(schedule_dataset):
    dataset.to_parquet(f"contamination_splits/{idx}-{schedule[idx]}.parquet")

### check by loading the contamination splits

In [None]:
ds = benchmarks.load_benchmark('all-contamination-splits')
len(ds)

In [88]:
# assert that we really have unique question ids
assert len(np.unique(ds['question-id'])) == len(ds)

In [None]:
# Deliberately stop "Run All" here when only the contamination splits were needed.
# A bare `break` outside a loop is a SyntaxError; raise an explicit error that
# states the intent instead.
raise RuntimeError("Stopping here: the contamination splits have been created. Run the cells below manually to inspect duplicates.")

### visualize some duplicates

In [90]:
dupl_ds = Dataset.from_parquet("de-duplication/all_datasets.parquet")

In [None]:
# Keep only the rows whose 'has_duplicate' flag is set, then drop both
# duplicate-flag columns.
dupl_ds = dupl_ds.filter(lambda row: row['has_duplicate']).remove_columns(["has_duplicate", "is_duplicate"])

# NOTE(review): __gpt2__benchmark_safety__ lives in the benchmarks module and is
# not visible here -- confirm what it changes before relying on the output.
dupl_ds = benchmarks.__gpt2__benchmark_safety__(dupl_ds)

# Write the result to a JSON file.
dupl_ds.to_json("all_contamination_splits_duplicates.json")

In [None]:
# Print every row: its index and benchmark, then the option selected by its
# label, followed by a separator line of dashes.
for idx, row in enumerate(dupl_ds):
    print(idx, row['benchmark'])
    print(row['options'][row['label']])
    print('-'*80)