### Imports and global utils

In [1]:
'''imports'''
import os
# os.environ["CUDA_VISIBLE_DEVICES"]="0,1,4,5,6,7"
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import general_utils
# clear GPU memory
if True:   
    general_utils.kill_gpu_process(os.environ["CUDA_VISIBLE_DEVICES"])
import torch
from transformers import T5ForConditionalGeneration, AutoTokenizer, T5Tokenizer
import numpy as np
from tqdm import tqdm
import lambada_utils
from lambada_utils import LambadaProcessor
from typing import Tuple, List

### Load tokenizer and model

In [2]:
# We are using custom huggingface cache dirs in case the default one doesn't have the capacity, since the models can be quite large.
MY_HUGGINGFACE_CACHE_DIR ='huggingface_cache' # relative to this notebook path
tokenizer = AutoTokenizer.from_pretrained("google/ul2",
                                        cache_dir = MY_HUGGINGFACE_CACHE_DIR+'/google-ul2')

RUN_CELL = 1 # Load model 1
# device_map=general_utils.get_ul2_device_map('2,3')
if RUN_CELL:
    model = T5ForConditionalGeneration.from_pretrained("google/ul2",
                                                        cache_dir=MY_HUGGINGFACE_CACHE_DIR + '/google-ul2',
                                                        low_cpu_mem_usage=True,
                                                        torch_dtype=torch.bfloat16,
                                                        device_map='cuda:0')

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

### Import datasets

In [4]:
from datasets import load_dataset

SUBJECTS = ['ARC-Challenge','ARC-Easy']

DATASET_PATH = os.path.join("ai2_arc")
ARC_DATAS = [load_dataset(DATASET_PATH, sub) for sub in SUBJECTS]
INDEX = [i for i in range(len(SUBJECTS))]
NAMES_WITH_DATAS = zip(INDEX, SUBJECTS, ARC_DATAS)

#### Define Loss Function

In [5]:
# define loss
ce_loss = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id) #reduction='avg'
ce_loss_sum = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id, reduction='sum') #reduction='sum'

In [6]:
extra_id_0 = torch.tensor([tokenizer.convert_tokens_to_ids("<extra_id_0>")])
extra_id_1 = torch.tensor([tokenizer.convert_tokens_to_ids("<extra_id_1>")])

### Define question prompt

In [8]:
UL2_MODE = "[NLG]"

In [16]:
import torch.nn.functional as F

def data_prompting(docs, tokenizer, num_of_shot) -> Tuple:
    '''
        docs: DATA_SET[SUBJECTS_NAME], ex:ARC[ARC-Challenge]
        return: Tuple(input_ids, completions_ids_padded, labels)

        input[example]: <prompt> 
        label[example]: A. <choice1> B. <choice2> C. <choice3> D. <choice4>

        Todo: few-shot data prompting
    '''

    # key_to_index = {"A":0, "B":1, "C":2, "D":3}
    doc_count = 0
    for doc in docs:
        input_ = UL2_MODE + " "

        for data in docs:
            text_in_shot = data['choices']['text']
            choice_in_shot = data['choices']['label']
            answer_in_shot = data['answerKey']
            index_in_shot = choice_in_shot.index(answer_in_shot)
            label_in_shot = text_in_shot[index_in_shot]

            if doc_count < num_of_shot and data != doc:
                doc_count += 1
                input_ += "question" + str(doc_count) + ": " + data['question'] + " " + "answer: " + label_in_shot + '\n'

            if doc_count == 5:
                doc_count = 0
                break

        texts = doc['choices']['text']
        choices_list = doc['choices']['label']
        answer = doc['answerKey']
        index = choices_list.index(answer)

        input_ += "question:" + " " + doc['question'] + " " + "<extra_id_0>"
        print(input_)
        completions = [f"<extra_id_0> choice: {text}" for text in texts]
        # label = f"{texts[index]}"
        label = index
        
        input_id = tokenizer(input_, return_tensors="pt").input_ids.to("cuda").clone().detach().requires_grad_(False)
        # label_id = tokenizer(label, return_tensors="pt").input_ids.to("cuda").clone().detach().requires_grad_(False)
        completions_ids = [tokenizer(completion, return_tensors="pt").input_ids.to("cuda").clone().detach()[:,:-1]\
                                                                for completion in completions] # remove <eos> token with [:,:-1]

        # Assuming `max_length` is the maximum length you want to pad sequences to
        max_length = max(seq.size(1) for seq in completions_ids)

        # Pad sequences to the common length
        padded_sequences = [F.pad(seq, (0, max_length - seq.size(1)), value=tokenizer.pad_token_id) for seq in completions_ids]

        # Use pad_sequence
        completions_ids_padded = torch.nn.utils.rnn.pad_sequence(padded_sequences, batch_first=True, padding_value=tokenizer.pad_token_id)

        completions_ids_padded = torch.squeeze(completions_ids_padded, dim = 1)
        yield input_id, completions_ids_padded, label


In [10]:
MAX_OFFSET = 1

In [11]:
IS_DEVELOPMENT = True
set_partition = 'validation' if IS_DEVELOPMENT else 'test' 

In [17]:
RUN_CELL = 1 # Obtain the avg_log_p_map_offset

TOTAL_CASE = 0
ACUURACTE_CASE = 0
NUM_SHOT = 5 #Define number of shot in inference

if RUN_CELL:
# id_and_offset_to_input_and_completions:
# (id, offset) -> input_ids, [completion_ids_0, completion_ids_1, completion_ids_2,...]
    avg_log_p_map_offset = dict() # (id, offset, completion_index) -> avg_log_p of the tokens constituting the last word (might be punctuated)
    
    for example_index in tqdm(range(len(INDEX))): 
        data = ARC_DATAS[example_index]
        # print(SUBJECTS[example_index])
    
        gen = data_prompting(data[set_partition], tokenizer, NUM_SHOT)

        for input_ids, completions_batch, label in gen:
            avg_log_p_and_completion = []
            outputs = lambada_utils.multi_labels_forward(model, input_ids, completions_batch)

            for completion_index in range(len(completions_batch)):
                avg_log_p = -ce_loss(
                    # Only care about the tokens corresponding to the last word and omit offset tokens 
                    # the first one is <extra_id_0> and omitted
                    outputs.logits[completion_index][1:], 
                    completions_batch[completion_index][1:]
                )
                avg_log_p_map_offset[(example_index, 0, completion_index)] = \
                    avg_log_p.detach().cpu().tolist()
                
                avg_log_p_and_completion.append([avg_log_p.detach().cpu().tolist(), completion_index])

            best_avg_log_p, best_completion_index = max(avg_log_p_and_completion, key=lambda x: x[0])

            if best_completion_index == label:
                ACUURACTE_CASE += 1
            TOTAL_CASE += 1

  0%|          | 0/2 [00:00<?, ?it/s]

[NLG] question1: High-pressure systems stop air from rising into the colder regions of the atmosphere where water can condense. What will most likely result if a high-pressure system remains in an area for a long period of time? answer: drought
question2: Students visited the Morris W. Offit telescope located at the Maryland Space Grant Observatory in Baltimore. They learned about the stars, planets, and moon. The students recorded the information below. • Star patterns stay the same, but their locations in the sky seem to change. • The sun, planets, and moon appear to move in the sky. • Proxima Centauri is the nearest star to our solar system. • Polaris is a star that is part of a pattern of stars called the Little Dipper. Which statement best explains why the sun appears to move across the sky each day? answer: Earth rotates on its axis.
question3: Which topic area would be the best to research to find ways of reducing environmental problems caused by humans? answer: converting sunli

 50%|█████     | 1/2 [00:29<00:29, 29.50s/it]

[NLG] question1: A student hypothesizes that algae are producers. Which question will best help the student determine if this is correct? answer: Do algae use sunlight to make food?
question2: Soccer players use their muscle systems to kick a ball into a goal. What organ system coordinates the muscles? answer: The nervous system
question3: Planets in the solar system are in constant motion. What factor has the greatest effect on the orbits of the planets? answer: gravitational pull of the Sun
question4: How is a pond different from a lake? answer: Ponds are smaller and shallower.
question5: A substance with a mass of 10 g is heated to produce two new substances. The mass of the first new substance is 9.3 g and the mass of the second new substance is 0.7 g. Which of the following is best demonstrated by this example? answer: law of conservation of mass
question: Which technology was developed most recently? <extra_id_0>
[NLG] question1: Which technology was developed most recently? answ

100%|██████████| 2/2 [01:08<00:00, 34.22s/it]

[NLG] question1: Which technology was developed most recently? answer: cellular telephone
question2: A student hypothesizes that algae are producers. Which question will best help the student determine if this is correct? answer: Do algae use sunlight to make food?
question3: Soccer players use their muscle systems to kick a ball into a goal. What organ system coordinates the muscles? answer: The nervous system
question4: Planets in the solar system are in constant motion. What factor has the greatest effect on the orbits of the planets? answer: gravitational pull of the Sun
question5: How is a pond different from a lake? answer: Ponds are smaller and shallower.
question: Which device would most likely be used to produce light energy? <extra_id_0>
[NLG] question1: Which technology was developed most recently? answer: cellular telephone
question2: A student hypothesizes that algae are producers. Which question will best help the student determine if this is correct? answer: Do algae use




In [18]:
ACUURACTE_CASE/TOTAL_CASE

0.48216340621403914