In [1]:
import requests

# PyPI project names whose latest released version is looked up below;
# each one becomes a `name==version` line in the printed requirements.txt text.
packages = [
    "torch",
    "tqdm",
    "transformers",
    "ipykernel",
    "SentencePiece",
    "accelerate",
    "matplotlib",
    "GitPython",
    "psutil",
    "datasets",
    "bitsandbytes",
]

def get_latest_version(package_name, timeout=10):
    """Look up the current version of a package through the PyPI JSON API.

    Args:
        package_name: Project name as registered on PyPI.
        timeout: Seconds to wait for PyPI before giving up. Without a timeout
            ``requests.get`` can block indefinitely on a stalled connection.

    Returns:
        The version string found under ``info.version`` in the JSON payload,
        or ``None`` when the request fails, times out, returns an HTTP error
        status, or the payload is not the expected JSON structure.
    """
    url = f"https://pypi.org/pypi/{package_name}/json"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx responses into an HTTPError
        return response.json()['info']['version']
    except (requests.RequestException, KeyError, ValueError):
        # KeyError: payload lacks info/version; ValueError: body is not valid JSON.
        # Callers treat None as "could not fetch", so report every failure that way.
        return None

# Build one requirements.txt line per package: a `name==version` pin when the
# lookup succeeded, otherwise a comment line recording the failure.
requirements_lines = []
for package in packages:
    latest_version = get_latest_version(package)
    requirements_lines.append(
        f"{package}=={latest_version}"
        if latest_version
        else f"# Could not fetch latest version for {package}"
    )

# Emit the lines newline-separated, ready to paste into requirements.txt.
requirements_text = "\n".join(requirements_lines)
print(requirements_text)


torch==2.2.0
tqdm==4.66.2
transformers==4.38.1
ipykernel==6.29.2
SentencePiece==0.2.0
accelerate==0.27.2
matplotlib==3.8.3
GitPython==3.1.42
psutil==5.9.8
datasets==2.17.1
bitsandbytes==0.42.0


### Configs

In [2]:
# Value written to the CUDA_VISIBLE_DEVICES environment variable by the imports cell
# (before torch is imported).
cuda_visible_devices = '1'
# Key into `model_configs` selecting the model to evaluate.
# model_identifier = "t5-11b"
model_identifier = "google-ul2"
# Use custom huggingface cache dirs in case the default one has low capacity, since the models are large.
MY_HUGGINGFACE_CACHE_DIR ='huggingface_cache'
# Dataset to evaluate on; the dataset cell handles "MMLU" and "BigBench".
dataset_name = "MMLU" 
# dataset_name = "BigBench"
# K-offset conditionals: each value is passed as `offset` to eoc.create_offset_sample_from_batch.
ALL_OFFSETS = [1, 2, 3,]
# Multispan (Multimask) conditionals: each tuple is unpacked as
# (span_length, gap_between_spans, num_spans).
ALL_LENGTH_GAP_NUM_TUPLES = [
    (3, 5, 1),
    (3, 5, 2),
    (3, 3, 1),
    (3, 3, 2),
    (3, 4, 1),
    (3, 4, 2),
]
# filtering samples for specific lens for best sensitivity
# NOTE: tensors_filtering_criterion applies both bounds as STRICT comparisons.
INPUT_LEN_MIN = 20 # a sample is kept only if len(input_ids[0]) > INPUT_LEN_MIN (i.e. at least 21)
COMPLETION_LEN_MAX = 5 # every completion (trailing zeros removed) must have length < COMPLETION_LEN_MAX (i.e. at most 4)


### Imports and global utils

In [3]:
'''imports'''
import os
os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
from itertools import combinations
import random
import pickle
from utils import general_utils, eoc
import torch
from transformers import T5ForConditionalGeneration, AutoTokenizer, T5Tokenizer
from tqdm import tqdm
from datasets import load_dataset, concatenate_datasets
from typing import Tuple, List
import torch.nn.functional as F
import eoc_datasets
from model_configs import model_configs

  from .autonotebook import tqdm as notebook_tqdm


### Load model

In [4]:
# Look up the settings of the selected model and unpack the fields used below.
config = model_configs[model_identifier]
model_name, model_dir, mode, no_extra_tokens, model_kwargs = (
    config[key]
    for key in ('model_name', 'model_dir', 'mode', 'no_extra_tokens', 'model_kwargs')
)

# Tokenizer files are cached in a per-model subdirectory of the custom cache dir.
tokenizer = AutoTokenizer.from_pretrained(
    model_name, cache_dir=os.path.join(MY_HUGGINGFACE_CACHE_DIR, model_dir)
)

# Cross-entropy losses that skip positions whose target is the tokenizer's pad id.
# ce_loss uses the default reduction ('mean'); ce_loss_sum adds the per-token losses up.
ce_loss = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
ce_loss_sum = torch.nn.CrossEntropyLoss(reduction='sum', ignore_index=tokenizer.pad_token_id)

In [5]:
RUN_CELL = True  # Load model
if RUN_CELL:
    # Weights are cached in the same per-model directory as the tokenizer;
    # any extra loading options come from the model config's `model_kwargs`.
    model = T5ForConditionalGeneration.from_pretrained(
        model_name,
        **model_kwargs,
        cache_dir=os.path.join(MY_HUGGINGFACE_CACHE_DIR, model_dir),
    )

  return self.fget.__get__(instance, owner)()
Loading checkpoint shards: 100%|██████████| 4/4 [00:39<00:00,  9.79s/it]


### Get dataset


In [6]:
# Build the processor for the configured dataset and load the split used for evaluation.
if dataset_name == "MMLU":
    dataset_processor = eoc_datasets.MMLUProcessor(subjects=config['mmlu_subjects'])
    data = dataset_processor.get_dataset(
        set_partition='test',
    )
elif dataset_name == "BigBench":
    dataset_processor = eoc_datasets.BigBenchProcessor(subjects=config['bigbench_subjects'])
    data = dataset_processor.get_dataset(
        set_partition='train',
    )
else:
    # Fail loudly: otherwise the next line raises a confusing NameError on a fresh
    # kernel, or silently reuses a stale `dataset_processor` from an earlier run.
    raise ValueError(
        f"Unsupported dataset_name: {dataset_name!r} (expected 'MMLU' or 'BigBench')"
    )
# Generator factory used by every later cell to iterate over the tokenized samples.
example_generator = dataset_processor.example_generator

In [7]:
RUN_CELL = True   # set tensors_filtering_criterion by lengths
if RUN_CELL:
    def tensors_filtering_criterion(input_ids, completions_batch):
        """Decide whether a tokenized sample is kept for evaluation.

        A sample is kept when ``len(input_ids[0])`` is strictly greater than
        INPUT_LEN_MIN and every completion, after its trailing zeros are removed,
        is strictly shorter than COMPLETION_LEN_MAX.

        Returns:
            bool: True to keep the sample, False to drop it.
        """
        if len(input_ids[0]) <= INPUT_LEN_MIN:
            return False
        # Generator instead of a list so the check stops at the first too-long completion.
        return all(
            len(general_utils.remove_trailing_zeros_from_1d_tensor(completion)) < COMPLETION_LEN_MAX
            for completion in completions_batch
        )

    # Dry run over the filtered samples to report their length statistics.
    gen = example_generator(data, tokenizer, mode=mode, tensors_filtering_criterion=tensors_filtering_criterion)
    input_lens = []
    completion_lens = []
    for example_id, input_ids, completions_batch, label in tqdm(gen):
        input_lens.append(len(input_ids[0]))
        completion_lens.append(len(completions_batch[0])) # with padding, this is the max len of the completions
    if input_lens:
        print(f"input len max: {max(input_lens)}, min: {min(input_lens)}, avg: {sum(input_lens)/len(input_lens)}")
        print(f"completion len max: {max(completion_lens)}, min: {min(completion_lens)}, avg: {sum(completion_lens)/len(completion_lens)}")
    else:
        # max()/min() and the averages are undefined on empty lists; say why instead of crashing.
        print("No samples passed tensors_filtering_criterion; check INPUT_LEN_MIN / COMPLETION_LEN_MAX.")

0it [00:00, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (553 > 512). Running this sequence through the model will result in indexing errors
225it [00:01, 147.86it/s]

input len max: 453, min: 21, avg: 46.306666666666665
completion len max: 4, min: 2, avg: 3.542222222222222





In [8]:
RUN_CELL = True  # generate baseline info and conditionals
if RUN_CELL:
    # baseline[example_id] -> {'label': ..., 'no_completions': int, 'p_map': [score per completion]}
    baseline = dict()
    gen = example_generator(data, tokenizer, mode, tensors_filtering_criterion=tensors_filtering_criterion)
    # Pure inference: disable autograd so no graph/activations are retained for backward.
    with torch.no_grad():
        for example_id, input_ids, completions_batch, label in tqdm(gen):
            # save the label and the number of completions
            baseline[example_id] = dict()
            baseline[example_id]['label'] = label
            baseline[example_id]['no_completions'] = len(completions_batch)
            baseline[example_id]['p_map'] = []
            outputs = eoc.multi_labels_forward(model, input_ids.cuda(), completions_batch.cuda())

            for completion_index in range(len(completions_batch)):
                # Score = negative mean cross-entropy over the non-pad target tokens,
                # i.e. the average log-probability of the completion.
                p = -ce_loss(
                    # Only care about the tokens corresponding to the last word and omit offset tokens
                    # if the first one is <extra_id_0> and it is omitted
                    outputs.logits[completion_index][no_extra_tokens:].cuda(),
                    completions_batch[completion_index][no_extra_tokens:].cuda()
                )

                baseline[example_id]['p_map'] += [p.detach().cpu().tolist()]

225it [00:40,  5.52it/s]


### K-offset Conditionals

In [9]:
RUN_CELL = True 
if RUN_CELL:
    p_map_offset = dict() # maps (example_id, offset, completion_index) -> avg_log_p
    # Pure inference: disable autograd so no graph/activations are retained for backward.
    with torch.no_grad():
        for offset in ALL_OFFSETS:
            # A fresh generator per offset: each pass re-iterates the whole filtered dataset.
            gen = example_generator(data, tokenizer, mode, tensors_filtering_criterion=tensors_filtering_criterion)
            for example_id, input_ids, completions_batch, label in tqdm(gen):
                input_ids_offset, labels_offset = eoc.create_offset_sample_from_batch(
                    tokenizer,
                    input_ids,
                    completions_batch,
                    offset
                )
                outputs = eoc.multi_labels_forward(model, input_ids_offset.cuda(), labels_offset.cuda())
                for completion_index in range(len(completions_batch)):
                    # Negative mean cross-entropy over the scored positions (pad targets ignored).
                    avg_log_p = -ce_loss(
                        # Only care about the tokens corresponding to the original completion and omit offset tokens
                        # if the first one is <extra_id_0> and it is omitted
                        outputs.logits[completion_index][no_extra_tokens+offset:].cuda(),
                        labels_offset[completion_index][no_extra_tokens+offset:].cuda()
                    )
                    p_map_offset[(example_id, offset, completion_index)] = \
                        avg_log_p.detach().cpu().tolist()

225it [00:25,  8.66it/s]
225it [00:25,  8.73it/s]
225it [00:22,  9.96it/s]


### Multispan Conditionals

In [10]:
RUN_CELL = True  # generate multispan conditionals
if RUN_CELL:
    # maps (example_id, span_length, gap_between_spans, num_spans, completion_index) -> avg_log_p
    p_map_multispan = dict()
    # Pure inference: disable autograd so no graph/activations are retained for backward.
    with torch.no_grad():
        for length_gap_num_tuple in ALL_LENGTH_GAP_NUM_TUPLES:
            span_length, gap_between_spans, num_spans = length_gap_num_tuple
            # A fresh generator per configuration: each pass re-iterates the whole filtered dataset.
            gen = example_generator(data, tokenizer, mode, tensors_filtering_criterion=tensors_filtering_criterion)

            for example_id, input_ids, completions_batch, label in tqdm(gen):
                inputs_ids_multispan, labels_multispan = eoc.create_multiple_span_sample_from_batch(
                    tokenizer,
                    input_ids[0], # squeeze 1st dim
                    completions_batch,
                    span_length,
                    gap_between_spans,
                    num_spans,
                )
                outputs = eoc.multi_labels_forward(model, inputs_ids_multispan.cuda(), labels_multispan.cuda())

                for completion_index in range(len(completions_batch)):
                    # assert multispan samples are correct: the labels after the masked spans must hold
                    # as many non-zero ids as the original completion row.
                    # NOTE(review): nonzero() presumably counts non-pad tokens (pad id 0) - confirm.
                    assert completions_batch[completion_index].nonzero().shape[0] == \
                        labels_multispan[completion_index][num_spans * (span_length + 1) :].nonzero().shape[0]

                    avg_log_p = -ce_loss(
                        # Only care about the tokens corresponding to the completion (see assert above);
                        # so the first <extra_id_0> is omitted, and for each span, the span + <extra_id_k> is omitted;
                        # totally 1 + num_spans * (span_length + 1) tokens are omitted;
                        # labels_multispan contains paddings.
                        outputs.logits[completion_index][1 + num_spans * (span_length + 1) :].cuda(),
                        labels_multispan[completion_index][1 + num_spans * (span_length + 1) :].cuda()
                    )
                    p_map_multispan[(example_id, span_length, gap_between_spans, num_spans, completion_index)] = \
                        avg_log_p.detach().cpu().tolist()

225it [00:25,  8.90it/s]
225it [00:21, 10.33it/s]
225it [00:21, 10.38it/s]
225it [00:21, 10.26it/s]
225it [00:21, 10.39it/s]
225it [00:21, 10.38it/s]


### Disagreement and Ensemble

In [11]:
def calc_disagreement(p_and_completion_individually):
    """Tell whether the individual distributions pick different best completions.

    Args:
        p_and_completion_individually: One list per distribution, each holding
            ``(score, completion_index)`` pairs.

    Returns:
        bool: True when at least two distributions have a different
        highest-scoring completion index, False otherwise (including when the
        outer list is empty or has a single distribution).
    """
    winners = set()
    for scored_completions in p_and_completion_individually:
        # max() keeps the first pair on ties, so the winner is deterministic.
        _score, winner = max(scored_completions, key=lambda pair: pair[0])
        winners.add(winner)
    return len(winners) > 1

In [12]:
# Define the EOC function
def run_eoc(offsets, length_gap_num_tuples):
    """Ensemble the baseline with the selected conditionals by max reduction.

    For every example in the module-level ``baseline`` dict, the
    ``(avg_log_p, completion_index)`` pairs of the baseline, of each requested
    K-offset conditional (``p_map_offset``) and of each requested multispan
    conditional (``p_map_multispan``) are pooled. The completion owning the
    single highest avg_log_p across the pool is the prediction, i.e. only the
    maximum avg_log_p is kept for each completion across the ensembled
    distributions. The baseline is always part of the ensemble.

    Args:
        offsets: K-offset values to include (may be empty).
        length_gap_num_tuples: ``(span_length, gap_between_spans, num_spans)``
            tuples of the multispan conditionals to include (may be empty).

    Returns:
        Tuple ``(accuracy, disagreement_rate)``: the fraction of examples whose
        prediction matches the label, and the fraction of examples for which
        ``calc_disagreement`` reports that the individual distributions pick
        different best completions.

    Raises:
        ZeroDivisionError: if ``baseline`` is empty.
        KeyError: if a requested conditional was not computed for an example.
    """
    count_correct = 0
    count_disagreement = 0
    # Iterate over the stored example ids themselves rather than range(len(baseline)):
    # the ids come from the example generator and need not be exactly 0..N-1.
    for example_id, example_info in baseline.items():
        completion_indices = range(example_info['no_completions'])

        # One list of (avg_log_p, completion_index) pairs per distribution,
        # baseline first (it is the offset = 0 member of the K-offset ensemble).
        p_and_completion_individually = [
            [(example_info['p_map'][completion_index], completion_index)
             for completion_index in completion_indices]
        ]
        for offset in offsets:
            p_and_completion_individually.append([
                (p_map_offset[(example_id, offset, completion_index)], completion_index)
                for completion_index in completion_indices
            ])
        for length_gap_num in length_gap_num_tuples:
            p_and_completion_individually.append([
                (p_map_multispan[(example_id, *length_gap_num, completion_index)], completion_index)
                for completion_index in completion_indices
            ])

        # Max reduction: pool every pair and keep the completion with the highest avg_log_p.
        p_and_completion = [
            pair for individual in p_and_completion_individually for pair in individual
        ]
        _, best_completion_index = max(p_and_completion, key=lambda x: x[0])

        label = example_info['label']
        if (isinstance(label, int) and best_completion_index == label) or \
                (isinstance(label, list) and best_completion_index in label):  # TruthfulQA has multiple correct answers
            count_correct += 1
        count_disagreement += calc_disagreement(p_and_completion_individually)

    no_examples = len(baseline)
    return count_correct / no_examples, count_disagreement / no_examples

In [13]:
RUN_CELL = True  # Run EOC
if RUN_CELL:
    # At most this many subsets of each size are evaluated (all of them when fewer exist).
    MAX_COMBINATIONS_PER_SIZE = 500
    # Fixed seed so the subset sample (when truncation applies) is reproducible across runs.
    SUBSET_SAMPLING_SEED = 0
    subset_rng = random.Random(SUBSET_SAMPLING_SEED)  # private RNG: global `random` state is untouched

    NO_OFFSETS = len(ALL_OFFSETS)
    NO_MULTISPAN = len(ALL_LENGTH_GAP_NUM_TUPLES)
    NO_DISTS_RANGE = list(range(NO_OFFSETS + NO_MULTISPAN + 1))
    # Index i holds the average over ensembles of the baseline plus i extra distributions.
    avg_accs = []
    avg_disagreements = []
    for NO_DISTS in NO_DISTS_RANGE: # no of distributions to ensemble
        # Distribution ids 0..NO_OFFSETS-1 are K-offsets, the remaining ids are multispan tuples.
        all_dist_ids = list(combinations(range(NO_MULTISPAN + NO_OFFSETS), NO_DISTS))
        # shuffle and keep at most MAX_COMBINATIONS_PER_SIZE subsets
        subset_rng.shuffle(all_dist_ids)
        all_dist_ids = all_dist_ids[:MAX_COMBINATIONS_PER_SIZE]
        all_accs = []
        all_disagreements = []
        for dist_ids in all_dist_ids:
            offsets = [ALL_OFFSETS[dist_id] for dist_id in dist_ids if dist_id < NO_OFFSETS]
            length_gap_num_tuples = [
                ALL_LENGTH_GAP_NUM_TUPLES[dist_id - NO_OFFSETS]
                for dist_id in dist_ids
                if dist_id >= NO_OFFSETS
            ]
            acc, disagreement = run_eoc(
                offsets,
                length_gap_num_tuples,
            )
            all_accs.append(acc)
            all_disagreements.append(disagreement)
        avg_acc = sum(all_accs) / len(all_accs)
        avg_disagreement = sum(all_disagreements) / len(all_disagreements)
        avg_accs.append(avg_acc)
        avg_disagreements.append(avg_disagreement)
        # print number of dists and avg_acc
        print(f"NO_DISTS: {NO_DISTS}, avg_acc: {avg_acc}", f"avg_disagreement: {avg_disagreement}")

NO_DISTS: 0, avg_acc: 0.4577777777777778 avg_disagreement: 0.0
NO_DISTS: 1, avg_acc: 0.48246913580246914 avg_disagreement: 0.14962962962962964
NO_DISTS: 2, avg_acc: 0.4885185185185185 avg_disagreement: 0.21432098765432098
NO_DISTS: 3, avg_acc: 0.492116402116402 avg_disagreement: 0.25624338624338616


NO_DISTS: 4, avg_acc: 0.4947442680776012 avg_disagreement: 0.28649029982363294
NO_DISTS: 5, avg_acc: 0.4970723104056434 avg_disagreement: 0.3095238095238095
NO_DISTS: 6, avg_acc: 0.4993650793650795 avg_disagreement: 0.32772486772486786
NO_DISTS: 7, avg_acc: 0.5017283950617283 avg_disagreement: 0.34246913580246907
NO_DISTS: 8, avg_acc: 0.5041975308641975 avg_disagreement: 0.3545679012345679
NO_DISTS: 9, avg_acc: 0.5066666666666667 avg_disagreement: 0.36444444444444446


In [14]:
# Display the averaged accuracies and disagreement rates from the EOC run;
# index i corresponds to ensembling the baseline with i extra distributions.
avg_accs, avg_disagreements

([0.4577777777777778,
  0.48246913580246914,
  0.4885185185185185,
  0.492116402116402,
  0.4947442680776012,
  0.4970723104056434,
  0.4993650793650795,
  0.5017283950617283,
  0.5041975308641975,
  0.5066666666666667],
 [0.0,
  0.14962962962962964,
  0.21432098765432098,
  0.25624338624338616,
  0.28649029982363294,
  0.3095238095238095,
  0.32772486772486786,
  0.34246913580246907,
  0.3545679012345679,
  0.36444444444444446])

In [15]:
# NOTE(review): same expression as the previous cell; it re-displays the same two lists.
avg_accs, avg_disagreements

([0.4577777777777778,
  0.48246913580246914,
  0.4885185185185185,
  0.492116402116402,
  0.4947442680776012,
  0.4970723104056434,
  0.4993650793650795,
  0.5017283950617283,
  0.5041975308641975,
  0.5066666666666667],
 [0.0,
  0.14962962962962964,
  0.21432098765432098,
  0.25624338624338616,
  0.28649029982363294,
  0.3095238095238095,
  0.32772486772486786,
  0.34246913580246907,
  0.3545679012345679,
  0.36444444444444446])

In [16]:
# ([0.4147727272727273,
#   0.42550505050505055,
#   0.43118686868686856,
#   0.4345914502164504,
#   0.4370490620490621,
#   0.43903318903318905,
#   0.4406114718614716,
#   0.4417613636363637,
#   0.4425505050505051,
#   0.4431818181818182],
#  [0.0,
#   0.31313131313131315,
#   0.43229166666666674,
#   0.49952651515151525,
#   0.5447781385281388,
#   0.5767947330447328,
#   0.6001758658008658,
#   0.6177398989898988,
#   0.6313131313131314,
#   0.6420454545454546])