In [2]:
import json
import re
import os
import json
from datasets import Dataset, DatasetDict, load_dataset
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def split_exemplar_and_test_question(prompt):
    """Split a few-shot prompt right after its final ``Answer: <A-D>`` marker.

    Args:
        prompt: Full prompt text.

    Returns:
        A ``(exemplar, test_question)`` tuple. ``exemplar`` is everything up to
        and including the last ``Answer: A|B|C|D`` occurrence; ``test_question``
        is the remaining text with surrounding whitespace stripped. When the
        prompt contains no such marker, ``("", prompt)`` is returned and the
        prompt is left unstripped.
    """
    # The negative lookahead (with DOTALL so '.' spans newlines) rejects any
    # marker that is still followed by another one further down the prompt.
    last_marker = re.search(
        r'(Answer: [ABCD])(?!.*Answer: [ABCD])',
        prompt,
        flags=re.DOTALL,
    )
    if last_marker is None:
        return "", prompt

    cut = last_marker.end()
    exemplar_part = prompt[:cut]
    question_part = prompt[cut:].strip()
    return exemplar_part, question_part

def extract_subject(test_question):
    """Return the text between the first ``about `` and the next period.

    Args:
        test_question: Text to search (the caller in this notebook passes the
            exemplar portion of a prompt).

    Returns:
        The captured text, or ``""`` when the pattern does not occur. The match
        is non-greedy and does not cross line breaks.
    """
    found = re.search(r'about\s(.*?)\.', test_question)
    return found.group(1) if found else ""

def model_name_from_file(file_name):
    """Derive a model identifier from a response file name.

    Takes the part after the last ``/``, keeps what precedes the first
    ``_mmlu``, and replaces every ``-`` with ``_``.

    Args:
        file_name: File name or ``/``-separated path.

    Returns:
        The derived model name string.
    """
    # Text after the final '/', or the whole string when there is none.
    leaf = file_name.rpartition('/')[2]
    # Text before the first '_mmlu', or the whole leaf when it is absent.
    stem = leaf.partition('_mmlu')[0]
    return stem.replace('-', '_')


In [5]:
# Load the 'all' configuration of the 'hails/mmlu_no_train' dataset.
mmlu_dataset = load_dataset('hails/mmlu_no_train', 'all')
mmlu_dev = mmlu_dataset['dev']
# Shuffle the test split with a fixed seed and keep the first 1000 rows.
random_1k_sample = mmlu_dataset['test'].shuffle(seed=42).select(range(1000))
# random_1k_sample = random_1k_sample.sort('question')
# Notebook-level value: create_hf_dataset reads `reference_answers` as a global
# and stores it as the 'reference_answers' column, so this cell must run first.
reference_answers = random_1k_sample['answer']
# print([(index, item) for index, item in enumerate(random_1k_sample['question']) if "Ad lazarum" in item])
# print(random_1k_sample['question'][0])

In [6]:
# print(sorted(random_1k_sample['question'])[44])

In [7]:
# with open("/data/richard/llm2vec/mmlu_response_generation/outputs/34b_beta+AlphaMonarch_7B_vote_mmlu_vllm.json", 'r') as file:
#     responses = json.load(file)

# for response in responses:
#     exemplar, test_question = split_exemplar_and_test_question(response['prompt'])
#     response['test_question'] = test_question
# responses = sorted(responses, key=lambda x: x['test_question'])
# print(responses[0]['test_question'])
# print(responses[1]['test_question'])

In [11]:
def create_hf_dataset(output_folder, references=None, expected_count=None):
    """Merge per-model response files into a single Hugging Face ``Dataset``.

    Every ``*.json`` file in ``output_folder`` is read in sorted file-name
    order. The first usable file supplies the exemplar text, test question and
    subject for each row; every usable file (including the first) contributes
    one ``{'answer', 'model'}`` entry per row.

    Args:
        output_folder: Directory containing the response files. Each file is
            expected to hold a JSON sequence of records with ``'prompt'`` and
            ``'output'`` keys.
        references: Values for the ``'reference_answers'`` column. Defaults to
            the notebook-level ``reference_answers`` variable, which keeps
            existing ``create_hf_dataset(folder)`` calls working.
        expected_count: Number of records a file must contain to be used.
            Defaults to ``len(references)`` so it follows the sample size
            instead of a hard-coded constant.

    Returns:
        A ``Dataset`` with columns ``exemplar_questions``, ``test_questions``,
        ``subject``, ``answers`` and ``reference_answers``.

    Notes:
        Rows are aligned purely by position: record ``k`` of every file is
        assumed to answer the same question as record ``k`` of the first file
        and as ``references[k]``. Nothing here verifies that ordering.
    """
    if references is None:
        # Fall back to the sample built earlier in the notebook.
        references = reference_answers
    if expected_count is None:
        expected_count = len(references)

    exemplar_questions = []
    test_questions = []
    subjects = []
    answers_list = []  # One list per question; each entry is one model's answer.

    for file_name in sorted(os.listdir(output_folder)):
        if not file_name.endswith('.json'):
            continue

        file_path = os.path.join(output_folder, file_name)
        model_name = model_name_from_file(file_name)
        try:
            with open(file_path, 'r') as file:
                responses = json.load(file)
        except (OSError, ValueError) as err:
            # OSError covers unreadable files; ValueError covers malformed JSON
            # (json.JSONDecodeError) and undecodable bytes (UnicodeDecodeError).
            print(f"Error in Loading Response from {model_name}, Skipping... ({err})")
            continue
        if len(responses) != expected_count:
            print(f"Found Incomplete Response from {model_name}, Skipping...")
            continue

        if not answers_list:
            # First usable file: it defines the prompts and subjects for all rows.
            print(f"Initializing with {model_name}")
            for response in responses:
                exemplar, test_question = split_exemplar_and_test_question(response['prompt'])
                # The subject is parsed from the exemplar text, not the test question.
                subject = extract_subject(exemplar)
                # Only the first element of 'output' is kept; empty output -> None.
                answer = response['output'][0] if response['output'] else None

                exemplar_questions.append(exemplar)
                test_questions.append(test_question)
                subjects.append(subject)
                answers_list.append([{'answer': answer, 'model': model_name}])
        else:
            # Later files only add answers, matched to rows by position.
            for k, response in enumerate(responses):
                answer = response['output'][0] if response['output'] else None
                answers_list[k].append({'answer': answer, 'model': model_name})

    dataset = Dataset.from_dict({
        'exemplar_questions': exemplar_questions,
        'test_questions': test_questions,
        'subject': subjects,
        'answers': answers_list,
        'reference_answers': references,
    })
    return dataset

In [12]:
# Usage
# Folder that create_hf_dataset scans for per-model '*.json' response files.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
output_folder = "/data/richard/llm2vec/mmlu_response_generation/outputs"
dataset = create_hf_dataset(output_folder)
# Bare expression so the notebook displays the dataset summary.
dataset

Initializing with 34b_beta


Dataset({
    features: ['exemplar_questions', 'test_questions', 'subject', 'answers', 'reference_answers'],
    num_rows: 1000
})

In [13]:
# Number of per-model answer entries collected for the first question.
len(dataset['answers'][0])

5673

In [14]:
# Write the assembled dataset to a local directory.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
dataset.save_to_disk('/data/richard/llm2vec/mmlu_response_generation/hf_dataset')

Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 5515.34 examples/s]


In [16]:
# Upload the assembled dataset to the Hub repository named below.
# NOTE(review): presumably requires prior Hub authentication, which this notebook does not show — confirm.
dataset.push_to_hub("RZ412/mmlu_responses_1k_augmented")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  1.26ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.06s/it]


In [17]:
# NOTE(review): load_dataset is already imported in the first cell; this re-import is redundant.
from datasets import load_dataset

# Reload from the Hub. This rebinds `dataset`; the following cell indexes the
# result by split name ('train') rather than by column.
dataset = load_dataset("RZ412/mmlu_responses_1k_augmented")

Downloading readme: 100%|██████████| 522/522 [00:00<00:00, 1.70MB/s]


In [18]:
# Number of answer entries for the first question in the reloaded 'train' split.
# NOTE(review): the recorded output here (5756) differs from the count recorded
# for the locally built dataset (5673) — confirm the Hub copy matches the local one.
len(dataset['train']['answers'][0])

5756