### Imports and global utils

In [2]:
'''imports'''
import os
# os.environ["CUDA_VISIBLE_DEVICES"]="0,1,4,5,6,7"
os.environ["CUDA_VISIBLE_DEVICES"]="2,3"
# clear GPU memory
from utils import general_utils, lambada_utils
import torch
from transformers import T5ForConditionalGeneration, AutoTokenizer, T5Tokenizer
from tqdm import tqdm

from typing import Iterator, List, Tuple

In [3]:
# Flip to True to echo every prompt/completion built by the notebook.
debug_print = False

if debug_print:
    show = print
else:
    def show(*args, **kwargs):
        """Silent stand-in for print(): accepts any arguments and does nothing."""
        return None

### Load tokenizer and model

In [4]:
# The checkpoints can be quite large, so everything is downloaded into a custom
# Hugging Face cache directory in case the default one lacks the capacity.
MY_HUGGINGFACE_CACHE_DIR = 'huggingface_cache'  # relative to this notebook path

tokenizer = AutoTokenizer.from_pretrained(
    "google/ul2",
    cache_dir=f"{MY_HUGGINGFACE_CACHE_DIR}/google-ul2",
)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Guard for the expensive checkpoint load (model 1): set RUN_CELL to True to
# (re)load. The evaluation cell further down calls `model`, so with this left at
# False a fresh "Restart & Run All" will fail there with a NameError.
RUN_CELL = False
if RUN_CELL:
    model = T5ForConditionalGeneration.from_pretrained(
        "google/ul2",
        cache_dir=f"{MY_HUGGINGFACE_CACHE_DIR}/google-ul2",
        low_cpu_mem_usage=True,
        torch_dtype=torch.bfloat16,
        device_map="balanced",
    )

### Import ARC datasets

In [6]:
from datasets import load_dataset

# Subset (config) names of the ARC dataset, and the dataset identifier handed
# to `load_dataset`.
SUBJECTS = ['ARC-Challenge','ARC-Easy']
DATASET_PATH = "ai2_arc"

# One loaded dataset per subset, in the same order as SUBJECTS; the evaluation
# cell indexes each of them by split name.
ARC_DATAS = [load_dataset(DATASET_PATH, sub) for sub in SUBJECTS]
INDEX = list(range(len(SUBJECTS)))

# Materialised as a list on purpose: a bare zip() is a one-shot iterator, so a
# second loop over it (or re-running a cell that consumes it) would silently
# see nothing.
NAMES_WITH_DATAS = list(zip(INDEX, SUBJECTS, ARC_DATAS))

In [10]:
import eoc_datasets

# Preview the ARC-Easy training split through the project's dataset wrapper;
# as the last expression of the cell, its repr is displayed.
eoc_datasets.ARC.get_dataset(subset='ARC-Easy', set_partition='train')

Dataset({
    features: ['id', 'question', 'choices', 'answerKey'],
    num_rows: 2251
})

#### Define Loss Function

In [9]:
# define loss and get extra ids
ce_loss = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id) #reduction='avg'
ce_loss_sum = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id, reduction='sum') #reduction='sum'
extra_id_0 = torch.tensor([tokenizer.convert_tokens_to_ids("<extra_id_0>")])
extra_id_1 = torch.tensor([tokenizer.convert_tokens_to_ids("<extra_id_1>")])

#### Define Question Prompt

In [64]:
# Mode token prepended to every prompt by data_prompting. Only "[NLG]" and
# "[S2S]" are handled there; any other value makes it raise ValueError.
UL2_MODE = "[NLG]"

In [66]:
import torch.nn.functional as F

def data_prompting(docs, tokenizer, device="cuda") -> Iterator[Tuple[torch.Tensor, torch.Tensor, int]]:
    '''
        Lazily turn multiple-choice docs into (prompt, candidate completions, label)
        triples for scoring. This is a generator: nothing is tokenized until it is
        iterated.

        docs: dataset from huggingface datasets, ex:ARC-Challenge['train'].
              Each doc is read through doc['question'], doc['answerKey'],
              doc['choices']['text'] and doc['choices']['label'].
        tokenizer: called as tokenizer(text, return_tensors="pt").input_ids;
                   its pad_token_id is used for padding.
        device: device the token-id tensors are moved to. Defaults to "cuda",
                the value that used to be hard-coded.

        yields: Tuple(input_id, completions_ids_padded, label)
            input_id: token ids of the prompt, as returned by the tokenizer.
            completions_ids_padded: one row per choice, the trailing <eos> token
                removed and right-padded with pad_token_id to the longest choice.
                Assumes the tokenizer returns a (1, seq_len) tensor for a single
                string - TODO confirm.
            label: position of doc['answerKey'] inside doc['choices']['label'].

        input[example]:      <UL2_MODE> Question: <question> <extra_id_0>
        completion[example]: <extra_id_0> Answer: <choice text>    ([NLG])
                             Answer: <choice text>                 ([S2S])

        raises: ValueError if the global UL2_MODE is neither "[NLG]" nor "[S2S]",
                or (from list.index) if the answer key is not among the labels.

        Todo: few-shot data prompting
    '''
    # The mode does not change while iterating, so resolve (and validate) it once
    # instead of once per doc. Ablation variant used in the experiment log:
    # drop "Question:" from the prompt and "Answer: " from the prefixes below.
    if UL2_MODE == "[NLG]":
        completion_prefix = "<extra_id_0> Answer: "
    elif UL2_MODE == "[S2S]":
        completion_prefix = "Answer: "
    else:
        raise ValueError("UL2_MODE not defined")

    pad_id = tokenizer.pad_token_id

    for doc in docs:
        texts = doc['choices']['text']
        # Labels are not always "A".."D", so look the answer key up by position.
        label = doc['choices']['label'].index(doc['answerKey'])

        input_ = f"{UL2_MODE} Question: {doc['question']} <extra_id_0>"
        completions = [f"{completion_prefix}{text}" for text in texts]

        show(input_)
        show(completions)

        input_id = tokenizer(input_, return_tensors="pt").input_ids.to(device)

        # [:, :-1] removes the trailing <eos> token from every completion.
        completions_ids = [
            tokenizer(completion, return_tensors="pt").input_ids.to(device)[:, :-1]
            for completion in completions
        ]

        # Right-pad every completion to the longest one, then stack them into a
        # single batch. The rows already share one length, so a second padding
        # pass through pad_sequence is unnecessary.
        max_length = max(seq.size(1) for seq in completions_ids)
        padded_sequences = [
            F.pad(seq, (0, max_length - seq.size(1)), value=pad_id)
            for seq in completions_ids
        ]
        completions_ids_padded = torch.stack(padded_sequences, dim=0).squeeze(dim=1)

        yield input_id, completions_ids_padded, label


### K-offset Ensemble

In [12]:
# NOTE(review): presumably the largest offset used by the K-offset ensemble.
# It is not referenced by any visible cell (the evaluation loop only stores
# offset 0) - confirm before relying on it.
MAX_OFFSET = 1

In [13]:
# Development runs are scored on the validation split; switch IS_DEVELOPMENT
# off to report on the held-out test split instead.
IS_DEVELOPMENT = True

if IS_DEVELOPMENT:
    set_partition = 'validation'
else:
    set_partition = 'test'

In [67]:
RUN_CELL = True      # Obtain the avg_log_p_map_offset

# Running tallies read by the accuracy cell below.
TOTAL_CASE = 0
ACCURATE_CASE = 0

if RUN_CELL:
    # In [NLG] mode every completion starts with <extra_id_0>, which is not
    # scored; in [S2S] mode the whole completion is scored. Any other mode is
    # rejected up front instead of failing later on an undefined variable.
    if UL2_MODE == "[NLG]":
        target_start = 1
    elif UL2_MODE == "[S2S]":
        target_start = 0
    else:
        raise ValueError("UL2_MODE not defined")

    # (id, offset, completion_index) -> average log-probability of the scored
    # completion tokens (padded positions are ignored by ce_loss).
    # NOTE(review): the id component is the subset index, not a per-example id,
    # so each example overwrites the entries of the previous one and only the
    # last example of a subset survives. Confirm whether a per-example id was
    # intended before changing the key layout.
    avg_log_p_map_offset = dict()

    for example_index in [0]: # tqdm(range(len(INDEX))):
        data = ARC_DATAS[example_index]
        gen = data_prompting(data[set_partition], tokenizer)

        for input_ids, completions_batch, label in tqdm(gen):
            # Pure inference: no autograd graph is needed, so avoid building one.
            # (Whether multi_labels_forward already disables gradients is not
            # visible from this notebook; the extra guard is harmless either way.)
            with torch.no_grad():
                outputs = lambada_utils.multi_labels_forward(model, input_ids, completions_batch)
                avg_log_ps = [
                    -ce_loss(
                        outputs.logits[completion_index][target_start:],
                        completions_batch[completion_index][target_start:],
                    ).item()
                    for completion_index in range(len(completions_batch))
                ]

            for completion_index, avg_log_p in enumerate(avg_log_ps):
                avg_log_p_map_offset[(example_index, 0, completion_index)] = avg_log_p

            # Predict the completion with the highest average log-probability
            # (the first one wins on ties).
            best_completion_index = max(range(len(avg_log_ps)), key=avg_log_ps.__getitem__)

            if best_completion_index == label:
                ACCURATE_CASE += 1
            TOTAL_CASE += 1

299it [01:08,  4.35it/s]


### Experiment Log

**With `Question:` / `Answer:` prefixes** (accuracy)

- ARC-Challenge: 0.3745819397993311
- ARC-Easy: 0.6035087719298246

**Without `Question:` / `Answer:` prefixes** (accuracy)

- ARC-Challenge: 0.3377926421404682
- ARC-Easy: 0.5666666666666667

**S2S mode, without `Question:` / `Answer:` prefixes** (accuracy)

- ARC-Challenge: 0.3411371237458194
- ARC-Easy: (not recorded)

**S2S mode, with `Question:` / `Answer:` prefixes** (accuracy)

- ARC-Challenge: 0.3511705685618729
- ARC-Easy: (not recorded)

In [63]:
# Accuracy: share of evaluated examples whose best-scoring completion matched
# the label. Raises ZeroDivisionError if TOTAL_CASE is still 0, i.e. the
# evaluation loop above did not process any example.
ACCURATE_CASE/TOTAL_CASE

0.3511705685618729