In [1]:
#/**
#* @file ul2_mmlu.ipynb
#* @author chenyunan (chen.yunan_01@nus.edu.sg)
#* @brief
#* @version 0.1
#* @date 2023-12-04
#*
#* @copyright Copyright (c) 2023 
#*
#*/

### Imports and global utils

In [2]:
'''imports'''
import os
# os.environ["CUDA_VISIBLE_DEVICES"]="0,1,4,5,6,7"
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import general_utils
# clear GPU memory
if True:   
    general_utils.kill_gpu_process(os.environ["CUDA_VISIBLE_DEVICES"])
import torch
from transformers import T5ForConditionalGeneration, AutoTokenizer, T5Tokenizer
import numpy as np
import pickle
import time
from tqdm import tqdm
import json
import lambada_utils
from lambada_utils import LambadaProcessor
from typing import Iterator, List, Tuple

### Load tokenizer and model

In [3]:
# The models can be quite large, so the Hugging Face cache is redirected to a
# custom directory in case the default one does not have the capacity.
MY_HUGGINGFACE_CACHE_DIR ='huggingface_cache' # relative to this notebook path
# Single cache location shared by the tokenizer and the model weights.
UL2_CACHE_DIR = MY_HUGGINGFACE_CACHE_DIR + '/google-ul2'

tokenizer = AutoTokenizer.from_pretrained("google/ul2", cache_dir=UL2_CACHE_DIR)

RUN_CELL = 1 # Load model 1
# device_map=general_utils.get_ul2_device_map('2,3')
if RUN_CELL:
    # Weights are loaded in bfloat16 and placed entirely on cuda:0.
    model = T5ForConditionalGeneration.from_pretrained(
        "google/ul2",
        cache_dir=UL2_CACHE_DIR,
        low_cpu_mem_usage=True,
        torch_dtype=torch.bfloat16,
        device_map='cuda:0',
    )

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

### Import MMLU datasets

In [4]:
from datasets import load_dataset

# MMLU subject configurations to evaluate. The position of a subject in this
# list is the index used for it everywhere else in the notebook.
SUBJECTS = ['high_school_european_history', 'business_ethics', 'clinical_knowledge', 'medical_genetics',
            'high_school_us_history', 'high_school_physics', 'high_school_world_history', 'virology',
            'high_school_microeconomics', 'econometrics', 'college_computer_science', 'high_school_biology',
            'abstract_algebra', 'professional_accounting', 'philosophy', 'professional_medicine', 'nutrition',
            'global_facts', 'machine_learning', 'security_studies', 'public_relations', 'professional_psychology',
            'prehistory', 'anatomy', 'human_sexuality', 'college_medicine', 'high_school_government_and_politics',
            'college_chemistry', 'logical_fallacies', 'high_school_geography', 'elementary_mathematics', 'human_aging',
            'college_mathematics', 'high_school_psychology', 'formal_logic', 'high_school_statistics', 'international_law',
            'high_school_mathematics', 'high_school_computer_science', 'conceptual_physics', 'miscellaneous', 'high_school_chemistry',
            'marketing', 'professional_law', 'management', 'college_physics', 'jurisprudence', 'world_religions', 'sociology', 'us_foreign_policy',
            'high_school_macroeconomics', 'computer_security', 'moral_scenarios', 'moral_disputes', 'electrical_engineering', 'astronomy', 'college_biology']

# Dataset identifier passed to `load_dataset`; each subject is one configuration of it.
DATASET_PATH = "lukaemon/mmlu"
# One loaded dataset per subject, in the same order as SUBJECTS.
MMLU_DATAS = [load_dataset(DATASET_PATH, sub) for sub in SUBJECTS]
INDEX = list(range(len(SUBJECTS)))
# Materialised as a list on purpose: a bare `zip` object is a one-shot iterator,
# so any cell that loops over it (even partially, with `break`) would leave it
# advanced or exhausted for the next cell or the next re-run.
NAMES_WITH_DATAS = list(zip(INDEX, SUBJECTS, MMLU_DATAS))

#### Test

In [5]:
# Smoke test: run beam-search generation on the first test question of the
# first subject yielded by NAMES_WITH_DATAS and print the top-ranked sequence.
MAX_COMPLETION_LENGTH = 8
NUM_BEAMS = 20
for index, name, data in NAMES_WITH_DATAS:
    test_split = data['test']
    print(name)
    print(test_split)

    input_string = test_split[0]['input']
    print(input_string)

    inputs = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
    generation_kwargs = dict(
        max_length=MAX_COMPLETION_LENGTH,
        num_beams=NUM_BEAMS,
        num_return_sequences=NUM_BEAMS,
        output_scores=True,
        # Generation stops once the <extra_id_1> sentinel is produced.
        eos_token_id=tokenizer.convert_tokens_to_ids('<extra_id_1>'),
        return_dict_in_generate=True,
    )
    outputs = model.generate(inputs, **generation_kwargs)

    # outputs[0] holds the generated sequences; [0] picks the first returned beam.
    print(tokenizer.decode(outputs[0][0], skip_special_tokens=True))
    # Only the first subject is needed for this check.
    break

high_school_european_history
Dataset({
    features: ['input', 'A', 'B', 'C', 'D', 'target'],
    num_rows: 164
})
This question refers to the following information.
Read the the following quotation to answer questions.
The various modes of worship which prevailed in the Roman world were all considered by the people as equally true; by the philosopher as equally false; and by the magistrate as equally useful.
Edward Gibbon, The Decline and Fall of the Roman Empire, 1776–1788
Gibbon's interpretation of the state of religious worship in ancient Rome could be summarized as


Gibbon's interpretation of


#### Define Loss Function

In [6]:
# define loss
# Cross-entropy over label tokens; positions equal to the tokenizer's pad id are ignored.
# No `reduction` is passed here, so PyTorch's default ('mean') applies: the loss is
# averaged over the non-ignored tokens.
ce_loss = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id) # default reduction='mean'
# Same loss, but summed over the non-ignored tokens instead of averaged.
# NOTE(review): not referenced in the visible part of this notebook.
ce_loss_sum = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id, reduction='sum') #reduction='sum'

#### Define Question prompt

In [7]:
import torch.nn.functional as F

def data_prompting(docs, tokenizer, device="cuda") -> Iterator[Tuple[torch.Tensor, torch.Tensor, str]]:
    '''
        Lazily convert multiple-choice records into tensors for scoring.

        docs: DATA_SET[SUBJECTS_NAME], ex:MMLU[high_school_european_history].
              Each record is indexed with the keys 'input', 'A', 'B', 'C', 'D'
              and 'target' (the letter of the correct choice).
        tokenizer: callable as tokenizer(text, return_tensors="pt") returning an
              object with `.input_ids`; must expose `.pad_token_id`.
        device: device the yielded tensors are moved to (default "cuda").

        yields: (input_id, completions_ids_padded, label) per record, where
            input_id               - token ids of "Question: <input>"
            completions_ids_padded - the four tokenized choices stacked row-wise
                                     (rows follow A, B, C, D) and right-padded
                                     with the pad id to the longest choice
            label                  - text of the correct choice (a str, not ids)

        Todo: few-shot data prompting
    '''

    keys = ["A", "B", "C", "D"]
    pad_id = tokenizer.pad_token_id
    for doc in docs:
        input_ = "Question: " + doc['input']
        # doc['target'] is the letter of the right answer; look up its text.
        label = f"{doc[doc['target']]}"

        input_id = tokenizer(input_, return_tensors="pt").input_ids.to(device)
        completions_ids = [
            tokenizer(f"{doc[key]}", return_tensors="pt").input_ids.to(device)
            for key in keys
        ]

        # Right-pad every choice to the longest one, then stack along dim 0.
        # Each tokenized choice already carries a leading batch dim of size 1,
        # so concatenating yields one row per choice.
        max_length = max(seq.size(1) for seq in completions_ids)
        completions_ids_padded = torch.cat(
            [F.pad(seq, (0, max_length - seq.size(1)), value=pad_id) for seq in completions_ids],
            dim=0,
        )
        yield input_id, completions_ids_padded, label

     

#### Define Development Mode

In [8]:
# Development runs are scored on the 'validation' split; otherwise 'test' is used.
IS_DEVELOPMENT = True
if IS_DEVELOPMENT:
    set_partition = 'validation'
else:
    set_partition = 'test'

In [11]:
# Characters that mark the end of the useful part of a decoded completion.
# Everything before the first such character is kept.
ENDING_PUNCTUATIONS = '<'

def get_word_from_completion(completion: str):
    '''Return the text that precedes the first ending character in `completion`.

    Returns None when `completion` contains no character from
    ENDING_PUNCTUATIONS. The result may be an empty string when the very
    first character is an ending character.'''
    cut = next(
        (pos for pos, char in enumerate(completion) if char in ENDING_PUNCTUATIONS),
        None,
    )
    return None if cut is None else completion[:cut]

In [14]:
def is_correct_completion(completion:torch.Tensor, label:str) -> bool:
    '''Check whether a tokenized completion decodes to exactly `label`.

    completion: token ids of the candidate completion; anything that is not a
                torch.Tensor is treated as incorrect.
    label:      expected answer text.

    The ids are decoded with the notebook-level `tokenizer`, the decoded string
    is trimmed by `get_word_from_completion`, and the result is compared to
    `label` with exact string equality.

    Always returns a bool (a mismatch yields False rather than falling off the
    end of the function and returning None).
    '''
    if not isinstance(completion, torch.Tensor):
        return False
    completion_string = tokenizer.decode(completion)

    if not isinstance(completion_string, str):
        return False
    word = get_word_from_completion(completion_string)
    # `word` is None when no ending character was found in the decoded string.
    return isinstance(word, str) and word == label

In [None]:
RUN_CELL = 1 # Obtain the avg_log_p_map_offset
MAX_OFFSET = 1

# Counters are reset here so that re-running this cell starts from zero.
TOTAL_CASE = 0
ACUURACTE_CASE = 0

if RUN_CELL:
    # (subject index, offset, completion index) -> average log-probability of the
    # scored completion tokens.
    # NOTE(review): the key has no per-question component, so every question of a
    # subject writes to the same keys and only the last question's values survive.
    # Confirm whether later cells rely on this map before changing the key.
    avg_log_p_map_offset = dict()

    for example_index in tqdm(range(len(INDEX))):
        data = MMLU_DATAS[example_index]
        print(SUBJECTS[example_index])

        # NOTE(review): TOTAL_CASE is incremented once per (question, offset), so
        # raising MAX_OFFSET above 1 counts every question several times.
        for offset in range(MAX_OFFSET):
            gen = data_prompting(data[set_partition], tokenizer)

            for input_ids, completions_batch, label in gen:
                avg_log_p_and_completion = []

                # Pure inference: only detached scalars are kept, so gradient
                # tracking is switched off for the forward pass and the scoring.
                with torch.no_grad():
                    outputs = lambada_utils.multi_labels_forward(model, input_ids, completions_batch)

                    for completion_index in range(len(completions_batch)):
                        # ce_loss averages over non-pad tokens, so its negation is
                        # the mean log-probability of the scored tokens.
                        # The [1+offset:] slice drops the first `1+offset` positions.
                        # NOTE(review): this slice was written for labels that
                        # begin with <extra_id_0>; data_prompting tokenizes the
                        # choice text directly, so this presumably discards the
                        # first real answer token - confirm.
                        avg_log_p = -ce_loss(
                            outputs.logits[completion_index][1+offset:],
                            completions_batch[completion_index][1+offset:]
                        )
                        avg_log_p_value = avg_log_p.detach().cpu().tolist()
                        avg_log_p_map_offset[(example_index, offset, completion_index)] = avg_log_p_value

                        avg_log_p_and_completion.append([avg_log_p_value, completions_batch[completion_index]])

                # The choice with the highest average log-probability is the prediction.
                best_avg_log_p, best_completion = max(avg_log_p_and_completion, key=lambda x: x[0])

                if is_correct_completion(best_completion, label):
                    ACUURACTE_CASE += 1
                    print(f'count_correct +1 : {ACUURACTE_CASE}')
                TOTAL_CASE +=1
            

In [16]:
# Number of questions whose best-scoring choice matched the label.
ACUURACTE_CASE

389

In [17]:
# Number of (question, offset) cases that were scored.
TOTAL_CASE

1474

In [18]:
# Overall accuracy: correct cases divided by scored cases.
ACUURACTE_CASE/TOTAL_CASE

0.2639077340569878

### Calculate the precision

In [None]:
RUN_CELL = 1 # Max reduction to emsemble conditionals for the same last word
'''Max reduction to emsemble conditionals for the same last word, 
i.e., only the maximum avg_log_p is kept for each last word across different range_middle_span_length's and range_middle_to_end_gap's.
Emsemble the baseline conditionals with the K-offset conditionals and middle-off conditionals.'''


if RUN_CELL:
    # Add the baseline (offset = 0 from K-offset ensemble) to the list
    ADD_BASELINE = True
    
    for example_index in tqdm(range(len(INDEX))): 
        avg_log_p_and_completion = dict()
    
