In [1]:
from datasets import load_dataset
import os
import random
import json
from itertools import permutations

In [18]:
# Load the three MMLU mathematics subsets: elementary, high-school and college level.
mmlu_math1, mmlu_math2, mmlu_math3 = (
    load_dataset("cais/mmlu", subset_name)
    for subset_name in (
        "elementary_mathematics",
        "high_school_mathematics",
        "college_mathematics",
    )
)

In [3]:
# Check the dataset format: print one record of the elementary-mathematics test split
# to see which fields an example carries.
print(mmlu_math1['test'][1])


{'question': 'Ms. Perez drove a total of 40 miles in 5 days. She drove the same number of miles each day. How many miles did Ms. Perez drive each day?', 'subject': 'elementary_mathematics', 'choices': ['5', '7', '8', '9'], 'answer': 2}


## Generate ICCL prompts for all topics

In [9]:
def gen_iccl_single_example(example):
    """
    Format one multiple-choice example as prompt text with numeric option labels.

    Args:
        example: Mapping with a 'question' string, a 'choices' sequence and an 'answer' value.

    Returns:
        Tuple (question_text, answer). question_text is the question, an instruction line,
        one "<index>. <choice>" line per choice (indices start at 0) and a trailing
        "Answer: " cue. answer is example['answer'], returned unchanged.
    """
    # Label every choice with its own position instead of looking it up in a fixed
    # four-entry list, so examples with more than four choices no longer raise IndexError.
    option_lines = [f"{idx}. {choice}" for idx, choice in enumerate(example['choices'])]
    question = (
        example['question']
        + "\nChoose the best answer from the following options:"
        + "\n"
        + "\n".join(option_lines)
        + "\nAnswer: "
    )
    return question, example['answer']

In [14]:
def gen_multiple_iccl_prompts(topic_dataset, topic, n_prompts, base_dir="./", exclude_hards=None):
    """
    Generate ICCL prompt files for a single topic.

    For every prompt index, one easy, one medium and one hard demonstration are drawn
    together with a hard test question. The same examples are written twice:
      * "{topic}_iccl_examples.json"   - demonstrations in curriculum order (easy, medium, hard)
      * "{topic}_random_examples.json" - demonstrations in a randomly chosen other order
    A third file, "{topic}_hard_baseline_iccl_examples.json", holds prompts made of a single
    hard demonstration followed by a hard test question.

    Existing iccl/random files are loaded and updated in place (entries are keyed by prompt
    index), so entries from an earlier run with a larger n_prompts are kept. The baseline
    file is always rebuilt from scratch.

    Args:
        topic_dataset: Dictionary of datasets, each with a 'test' split:
            {"easy": easy_dataset, "medium": medium_dataset, "hard": hard_dataset}.
            "easy" is optional; without it both the easy and the medium demonstration
            are drawn (without repetition) from "medium".
        topic: String representing the topic, e.g. "math". Used for the output directory,
            the file names and the lookup into exclude_hards.
        n_prompts: Number of prompts to generate per file.
        base_dir: Directory under which the topic directory is created.
        exclude_hards: Optional dictionary {topic: [question text, ...]} of hard questions
            that must never be used as a test question. Entries may be the full question
            text or only its first line. A topic that is missing from the dictionary is
            treated as having nothing to exclude.

    Raises:
        ValueError: If no hard question is left to serve as test question, or no second,
            different hard question exists to serve as demonstration.
    """
    # Create topic directory if it doesn't exist
    topic_dir = os.path.join(base_dir, topic)
    os.makedirs(topic_dir, exist_ok=True)

    # Convert datasets to lists for easier random selection
    has_easy = "easy" in topic_dataset
    if has_easy:
        easy_dataset = list(topic_dataset['easy']['test'])
    medium_dataset = list(topic_dataset['medium']['test'])
    hard_dataset = list(topic_dataset['hard']['test'])

    # Every ordering of (1=easy, 2=medium, 3=hard) except the curriculum order itself;
    # permutations() yields (1, 2, 3) first, so dropping the first element removes it.
    randoms = list(permutations([1, 2, 3]))[1:]

    # Hard questions that may be used as test questions.
    valid_hards = hard_dataset
    if exclude_hards:
        # .get(): a topic without an exclusion list must not raise KeyError.
        excluded = set(exclude_hards.get(topic, []))
        # Compare the full text AND the first line: an exclusion list that stores only first
        # lines would otherwise never match a question whose text spans several lines.
        valid_hards = [
            elt for elt in hard_dataset
            if elt["question"] not in excluded
            and elt["question"].split("\n")[0] not in excluded
        ]

    # Load each ordering's existing prompt file once instead of once per prompt.
    order_files = {
        order_str: os.path.join(topic_dir, f"{topic}_{order_str}_examples.json")
        for order_str in ("iccl", "random")
    }
    prompts_by_order = {}
    for order_str, filepath in order_files.items():
        try:
            with open(filepath, 'r') as f:
                prompts_by_order[order_str] = json.load(f)
        except FileNotFoundError:
            prompts_by_order[order_str] = {}

    for prompt_idx in range(n_prompts):
        # Select demonstration examples that will be used across both orderings
        if has_easy:
            demo_easy = random.choice(easy_dataset)
            demo_medium = random.choice(medium_dataset)
        else:
            demo_easy, demo_medium = random.sample(medium_dataset, 2)
        # The hard demonstration must differ from the test question, otherwise the prompt
        # would contain the test question together with its answer.
        demo_hard, test_hard = _pick_distinct_hard_pair(hard_dataset, valid_hards)

        # Map order numbers to actual examples
        example_by_level = {1: demo_easy, 2: demo_medium, 3: demo_hard}
        test_question, test_answer = gen_iccl_single_example(test_hard)

        orders = {"iccl": [1, 2, 3], "random": random.choice(randoms)}  # choose random order
        for order_str, order in orders.items():
            # Build prompt: demonstrations with their answers, then the unanswered test question
            prompt = ""
            for difficulty in order:
                demo_q, demo_a = gen_iccl_single_example(example_by_level[difficulty])
                prompt += f"{demo_q}{demo_a}\n\n"
            prompt += f"{test_question}"

            prompts_by_order[order_str][str(prompt_idx)] = {
                "question": prompt,
                "answer": test_answer,
            }

    # Write each ordering's prompts once; skipped when nothing was generated, so that a call
    # with n_prompts <= 0 does not create these files.
    if n_prompts > 0:
        for order_str, filepath in order_files.items():
            with open(filepath, 'w') as f:
                json.dump(prompts_by_order[order_str], f, indent=4)

    ### HARD BASELINE: Generate baseline of hard-only prompts
    baseline_filename = f"{topic}_hard_baseline_iccl_examples.json"
    baseline_filepath = os.path.join(topic_dir, baseline_filename)
    baseline_prompts = {}

    for prompt_idx in range(n_prompts):
        # Two different hard questions; the test question honours the exclusion list
        # exactly like the test questions of the iccl/random prompts above.
        hard1, hard2 = _pick_distinct_hard_pair(hard_dataset, valid_hards)
        q1, a1 = gen_iccl_single_example(hard1)
        q2, a2 = gen_iccl_single_example(hard2)

        baseline_prompts[str(prompt_idx)] = {
            "question": f"{q1}{a1}\n\n{q2}",
            "answer": a2,
        }

    #### Write baseline prompts to JSON file
    with open(baseline_filepath, 'w') as f:
        json.dump(baseline_prompts, f, indent=4)


def _pick_distinct_hard_pair(hard_pool, test_pool):
    """Return (demo_example, test_example): a test question from test_pool and a different demonstration from hard_pool."""
    if not test_pool:
        raise ValueError("No hard examples available to use as test questions")
    test_example = random.choice(test_pool)
    demo_pool = [ex for ex in hard_pool if ex != test_example]
    if not demo_pool:
        raise ValueError("Not enough unique hard examples available")
    return random.choice(demo_pool), test_example

In [19]:
n_prompts = 50

topics = ["biology", "chemistry", "computer_science", "physics"]
nicknames = {"biology": "bio", "chemistry":"chem", "computer_science": "cs", "physics": "phys"}

# NOTE: exclude_questions is defined in a later cell of this notebook; that cell has to be
# executed before this one, otherwise this line raises NameError on a fresh kernel.
exclude_hards = exclude_questions()

# Pass n_prompts instead of repeating the literal 50, so the setting above is the single
# place that controls how many prompts are generated.
gen_multiple_iccl_prompts({"easy": mmlu_math1, "medium": mmlu_math2, "hard": mmlu_math3}, "math", n_prompts, "./prompts/finetuned_prompts/", exclude_hards)

# for topic in topics[1:]:
#     mmlu_1 = load_dataset("cais/mmlu", f"high_school_{topic}")
#     mmlu_2 = load_dataset("cais/mmlu", f"college_{topic}")
    
#     topic_dataset = {"medium": mmlu_1, "hard": mmlu_2}
#     gen_multiple_iccl_prompts(topic_dataset, nicknames[topic], 50, "./prompts/finetuned_prompts/", exclude_hards)



TEST 100 55


In [13]:
def exclude_questions(base_path='./final_train', subjects=None):
    """
    Collect, per subject, the first line of every question stored in that subject's test file.

    For each subject the file "<base_path>/<subject>/<subject>_test.json" is read. It must
    contain a JSON list of objects that each have a "question" string.

    Args:
        base_path: Directory that holds one sub-directory per subject.
        subjects: Optional iterable of subject names. Defaults to
            ['bio', 'chem', 'cs', 'math', 'phys'].

    Returns:
        Dictionary {subject: [first line of each question, ...]}. A subject whose file is
        missing or is not valid JSON is reported with a printed message and left out of
        the dictionary.
    """
    if subjects is None:
        subjects = ['bio', 'chem', 'cs', 'math', 'phys']

    # Dictionary to store the question first-lines for each subject
    exclude_hards = {}

    for subject in subjects:
        # Construct the path of the subject-specific question file
        hard_filename = f'{subject}_test.json'
        hard_filepath = os.path.join(base_path, subject, hard_filename)

        try:
            # Read as UTF-8 explicitly rather than with the platform default encoding: the
            # dataset files written elsewhere in this notebook use UTF-8 with
            # ensure_ascii=False, so question text can contain non-ASCII characters.
            with open(hard_filepath, 'r', encoding='utf-8') as f:
                original_list = json.load(f)
        except FileNotFoundError:
            print(f"Warning: {hard_filepath} not found. Skipping for {subject}.")
            continue
        except json.JSONDecodeError:
            print(f"Error: Unable to parse {hard_filepath}. Ensure it's a valid JSON list.")
            continue

        # Keep only the first line of each question (the text before the first newline)
        exclude_hards[subject] = [
            elt["question"].split("\n")[0] for elt in original_list
        ]

    return exclude_hards

# Smoke test: build the exclusion lists with the default base path and report how many
# entries were collected under the "bio" key.
res = exclude_questions()

print(len(res["bio"]))

50


### Create train/test split for maggie/jennifer

In [12]:
from copy import deepcopy

def split_and_save_datasets(topic_dataset, topic, output_dir: str = "./", n_per_split: int = 50):
    """
    Split datasets into train and test sets and save them as JSON files.

    The 'test' split of each dataset is shuffled and sliced into groups of n_per_split
    examples. Every example is formatted with gen_iccl_single_example and stored as
    {"question": ..., "answer": ...}.

    Files written:
        <output_dir>/train/<topic>/easy.json, medium.json, hard.json - one group each
        <output_dir>/train/<topic>/concat.json   - easy + medium + hard, in that order
        <output_dir>/train/<topic>/shuffled.json - the same examples in random order
        <output_dir>/test/<topic>.json           - held-out hard examples

    Args:
        topic_dataset: Dictionary with 'medium' and 'hard' datasets and optionally an 'easy'
            dataset. Without 'easy', the easy group is the first n_per_split shuffled
            medium examples; the medium group is always the second n_per_split.
        topic: Topic name used for the train sub-directory and the test file name.
        output_dir: Directory to save the JSON files
        n_per_split: Number of examples per group (default 50). A group ends up smaller
            when its dataset does not hold enough examples.
    """
    train_dir = os.path.join(output_dir, "train", topic)
    test_dir = os.path.join(output_dir, "test")
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    # Work on the 'test' split of each dataset, converted to lists of dictionaries
    medium_list = topic_dataset['medium']['test'].to_list()
    hard_list = topic_dataset['hard']['test'].to_list()

    # Randomly shuffle the data
    random.shuffle(medium_list)
    random.shuffle(hard_list)

    # Split the datasets into disjoint slices
    train_easy = medium_list[:n_per_split]                  # first group of medium, stands in for easy
    train_medium = medium_list[n_per_split:2 * n_per_split] # second group of medium
    train_hard = hard_list[:n_per_split]                    # first group of hard, for training
    test_hard = hard_list[n_per_split:2 * n_per_split]      # second group of hard, for testing

    # Use a dedicated easy dataset when the caller provides one (only the math calls in
    # this notebook do). Checking for the key avoids the KeyError that reading
    # topic_dataset['easy'] unconditionally raises for topics without an easy dataset.
    if "easy" in topic_dataset:
        easy_list = topic_dataset['easy']['test'].to_list()
        random.shuffle(easy_list)
        train_easy = easy_list[:n_per_split]

    # Create combined training set; the examples are only read below, so new lists that
    # share the example dictionaries are sufficient.
    combined_train = train_easy + train_medium + train_hard
    shuffled_train = list(combined_train)
    random.shuffle(shuffled_train)

    # Save training files
    train_files = {
        'easy.json': train_easy,
        'medium.json': train_medium,
        'hard.json': train_hard,
        'shuffled.json': shuffled_train,
        'concat.json': combined_train
    }

    test_files = {
        f'{topic}.json': test_hard,
    }

    for files, target_dir in ((train_files, train_dir), (test_files, test_dir)):
        for filename, examples in files.items():
            # Reformat every raw example into the question/answer form stored on disk
            records = []
            for elt in examples:
                question_text, answer = gen_iccl_single_example(elt)
                records.append({"question": question_text, "answer": answer})

            full_path = os.path.join(target_dir, filename)
            with open(full_path, 'w', encoding='utf-8') as f:
                json.dump(records, f, indent=2, ensure_ascii=False)
            print(f"Saved {len(records)} examples to {full_path}")


In [11]:
# n_prompts is not referenced anywhere in this cell.
n_prompts = 50

# MMLU subject names and the short names used for output directories and files.
topics = ["biology", "chemistry", "computer_science", "physics"]
nicknames = {"biology": "bio", "chemistry":"chem", "computer_science": "cs", "physics": "phys"}

# Create directory structure
base_dir = "./prompts/maggie_jennifer"

train_dir = os.path.join(base_dir, "train")
test_dir = os.path.join(base_dir, "test")
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# NOTE(review): topics[1:] skips "biology" - presumably it was processed in an earlier run;
# confirm before relying on a fresh Restart & Run All to regenerate every topic.
for topic in topics[1:]:
    # High-school level serves as "medium", college level as "hard".
    mmlu_1 = load_dataset("cais/mmlu", f"high_school_{topic}")
    mmlu_2 = load_dataset("cais/mmlu", f"college_{topic}")
    
    topic_dataset = {"medium": mmlu_1, "hard": mmlu_2}
    split_and_save_datasets(topic_dataset, nicknames[topic], base_dir)

Saved 50 examples to ./prompts/maggie_jennifer/train/chem/easy.json
Saved 50 examples to ./prompts/maggie_jennifer/train/chem/medium.json
Saved 50 examples to ./prompts/maggie_jennifer/train/chem/hard.json
Saved 150 examples to ./prompts/maggie_jennifer/train/chem/shuffled.json
Saved 150 examples to ./prompts/maggie_jennifer/train/chem/concat.json
Saved 50 examples to ./prompts/maggie_jennifer/test/chem.json
Saved 50 examples to ./prompts/maggie_jennifer/train/cs/easy.json
Saved 50 examples to ./prompts/maggie_jennifer/train/cs/medium.json
Saved 50 examples to ./prompts/maggie_jennifer/train/cs/hard.json
Saved 150 examples to ./prompts/maggie_jennifer/train/cs/shuffled.json
Saved 150 examples to ./prompts/maggie_jennifer/train/cs/concat.json
Saved 50 examples to ./prompts/maggie_jennifer/test/cs.json
Saved 50 examples to ./prompts/maggie_jennifer/train/phys/easy.json
Saved 50 examples to ./prompts/maggie_jennifer/train/phys/medium.json
Saved 50 examples to ./prompts/maggie_jennifer/tra

In [14]:
# Math is split in its own cell because it is the only topic here that also passes an
# "easy" dataset (elementary level) next to high-school ("medium") and college ("hard").
topic = "mathematics"
mmlu_0 = load_dataset("cais/mmlu", f"elementary_{topic}")
mmlu_1 = load_dataset("cais/mmlu", f"high_school_{topic}")
mmlu_2 = load_dataset("cais/mmlu", f"college_{topic}")

split_and_save_datasets({"easy": mmlu_0, "medium": mmlu_1, "hard": mmlu_2}, "math", "./prompts/maggie_jennifer")


Saved 50 examples to ./maggie_jennifer/train/math/easy.json
Saved 50 examples to ./maggie_jennifer/train/math/medium.json
Saved 50 examples to ./maggie_jennifer/train/math/hard.json
Saved 150 examples to ./maggie_jennifer/train/math/shuffled.json
Saved 150 examples to ./maggie_jennifer/train/math/concat.json
Saved 50 examples to ./maggie_jennifer/test/math.json


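## IGNORE: superseded earlier versions (this section redefines `gen_multiple_iccl_prompts` with a different signature)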

In [35]:
def gen_multiple_iccl_prompts(topic_datasets, n_prompts):
    """
    Generate multiple ICCL prompts for each topic with all possible orderings of easy/medium/hard demonstrations.
    Creates 6 JSON files per topic (one for each possible ordering), named
    "<topic>/<topic>_<order>_iccl_examples.json" where <order> is e.g. "123" or "312".

    Within one prompt index all six files share the same demonstrations and the same hard
    test question; only the order of the demonstrations differs. A hard example is used as
    test question at most once per topic, and a prompt's hard demonstration is never an
    example that has been used as a test question so far.

    Existing files are loaded and updated in place (entries are keyed by prompt index).

    Args:
        topic_datasets: Dictionary of dataset triples {"math": [easy1, medium1, hard1], "english": [easy2, medium2, hard2], ...}
                       where each dataset has a 'test' split
        n_prompts: Number of prompts to generate per ordering

    Raises:
        ValueError: If a topic has fewer than n_prompts + 1 hard examples. This is checked
            before any file of that topic is read or written.
    """
    # Get all possible orderings of demonstrations (1=easy, 2=medium, 3=hard)
    orders = list(permutations([1, 2, 3]))

    for topic in topic_datasets:
        easy_dataset, medium_dataset, hard_dataset = topic_datasets[topic]

        # Convert datasets to lists for easier random selection
        easy_examples = list(easy_dataset['test'])
        medium_examples = list(medium_dataset['test'])
        hard_examples = list(hard_dataset['test'])

        # Each prompt uses up one hard example as test question and still needs a different
        # one as demonstration. Fail before touching the disk instead of raising midway
        # through the run after part of the prompts were already written.
        if n_prompts > 0 and len(hard_examples) <= n_prompts:
            raise ValueError(f"Not enough unique hard examples available for topic {topic}")

        # Create topic directory if it doesn't exist
        os.makedirs(topic, exist_ok=True)

        # Load the existing prompts of every ordering once instead of once per prompt.
        filepaths = {}
        prompts_by_order = {}
        for order in orders:
            order_str = ''.join(str(x) for x in order)
            filepath = os.path.join(topic, f"{topic}_{order_str}_iccl_examples.json")
            filepaths[order] = filepath
            try:
                with open(filepath, 'r') as f:
                    prompts_by_order[order] = json.load(f)
            except FileNotFoundError:
                prompts_by_order[order] = {}

        # Track hard examples already used as test questions
        used_hard_indices = set()

        for prompt_idx in range(n_prompts):
            # Select unused hard example for testing
            available_hard_indices = [i for i in range(len(hard_examples)) if i not in used_hard_indices]
            test_hard_index = random.choice(available_hard_indices)
            test_hard = hard_examples[test_hard_index]
            used_hard_indices.add(test_hard_index)

            # Select demonstration examples
            demo_easy = random.choice(easy_examples)
            demo_medium = random.choice(medium_examples)
            demo_hard = random.choice([ex for i, ex in enumerate(hard_examples) if i not in used_hard_indices])

            # Format each example once; the six orderings only rearrange these pieces.
            formatted_demos = {
                1: gen_iccl_single_example(demo_easy),
                2: gen_iccl_single_example(demo_medium),
                3: gen_iccl_single_example(demo_hard),
            }
            test_question, test_answer = gen_iccl_single_example(test_hard)

            for order in orders:
                # Demonstrations with their answers, then the unanswered test question
                prompt = ""
                for difficulty in order:
                    demo_q, demo_a = formatted_demos[difficulty]
                    prompt += f"{demo_q}{demo_a}\n\n"
                prompt += f"{test_question}"

                prompts_by_order[order][str(prompt_idx)] = {
                    "question": prompt,
                    "answer": test_answer,
                }

        # Write every ordering once; skipped when nothing was generated so that a call with
        # n_prompts <= 0 does not create any file.
        if n_prompts > 0:
            for order in orders:
                with open(filepaths[order], 'w') as f:
                    json.dump(prompts_by_order[order], f, indent=4)

In [23]:
def gen_random_iccl_prompt(topic_datasets):
    """
    Build one ICCL prompt for a randomly chosen topic.

    The prompt shows an easy, a medium and a hard demonstration (each followed by its
    answer) and ends with a hard test question whose answer is left open.

    Args:
        topic_datasets: Dictionary of dataset triples {"math": [easy1, medium1, hard1], "english": [easy2, medium2, hard2], ...}
                       where each dataset has a 'test' split

    Returns:
        tuple: (prompt_string, test_answer)

    Raises:
        ValueError: If no hard example differs from the one picked as demonstration.
    """
    # Pick the topic and unpack its three difficulty levels
    chosen_topic = random.choice(list(topic_datasets.keys()))
    easy_dataset, medium_dataset, hard_dataset = topic_datasets[chosen_topic]

    easy_pool = list(easy_dataset['test'])
    medium_pool = list(medium_dataset['test'])
    hard_pool = list(hard_dataset['test'])

    # One demonstration per difficulty, drawn in easy -> medium -> hard order
    demo_examples = [
        random.choice(easy_pool),
        random.choice(medium_pool),
        random.choice(hard_pool),
    ]
    hard_demo = demo_examples[-1]

    # The test question must not repeat the hard demonstration
    test_candidates = [candidate for candidate in hard_pool if candidate != hard_demo]
    if len(test_candidates) == 0:
        raise ValueError("Not enough unique hard examples available")
    test_hard = random.choice(test_candidates)

    # Demonstrations carry their answers; the test question ends at the "Answer: " cue
    shots = "".join(
        f"{demo_q}{demo_a}\n\n"
        for demo_q, demo_a in map(gen_iccl_single_example, demo_examples)
    )
    test_question, test_answer = gen_iccl_single_example(test_hard)

    return shots + f"{test_question}", test_answer

In [25]:
# Generate one random prompt from the three math datasets and print it to inspect the layout.
prompt, answer = gen_random_iccl_prompt({"math": [mmlu_math1, mmlu_math2, mmlu_math3]})
print(prompt)


Gwen wrote the number pattern below on a piece of paper. 1, 5, 9, 13 What are the next two terms in Gwen’s pattern?
Choose the best answer from the following options:
0. 15, 17
1. 15, 19
2. 17, 19
3. 17, 21
Answer: 3

The area bounded by the parabola y = x^2 and the lines y = 1 and y = 9 equals
Choose the best answer from the following options:
0. 8
1. 84/3
2. 64\sqrt{2}/3
3. 104/3
Answer: 3

What is the greatest possible area of a triangular region with one vertex at the center of a circle of radius 1 and the other two vertices on the circle?
Choose the best answer from the following options:
0. 1/2
1. 1
2. sqrt(2)
3. pi
Answer: 0

Sofia and Tess will each randomly choose one of the 10 integers from 1 to 10. What is the probability that neither integer chosen will be the square of the other?
Choose the best answer from the following options:
0. 0.64
1. 0.72
2. 0.81
3. 0.95
Answer: 


In [36]:
# Write 10 math prompts per ordering; this call uses the (topic_datasets, n_prompts)
# signature of the version defined in this section.
gen_multiple_iccl_prompts({"math": [mmlu_math1, mmlu_math2, mmlu_math3]}, 10)

## Test on math subsets

In [8]:
def gen_iccl_single_example(example):
    """
    Format one multiple-choice example as prompt text with letter option labels.

    Args:
        example: Mapping with a 'question' string, a 'choices' sequence and an 'answer' value.

    Returns:
        Tuple (question_text, answer). question_text is the question, an instruction line,
        one "<letter>. <choice>" line per choice (letters start at "A") and a trailing
        "Answer: " cue. answer is example['answer'], returned unchanged.
    """
    # NOTE(review): the answer is returned as stored in the example. The MMLU record printed
    # near the top of this notebook stores an integer index, so a demonstration built from
    # this function would read e.g. "Answer: 2" below options labelled A-D. Confirm whether
    # callers expect the index or the letter before changing the return value.
    #
    # Derive each label from the choice's position instead of a fixed four-entry list, so
    # examples with more than four choices no longer raise IndexError.
    option_lines = [
        f"{chr(ord('A') + idx)}. {choice}"
        for idx, choice in enumerate(example['choices'])
    ]
    question = (
        example['question']
        + "\nChoose the best answer from the following options:"
        + "\n"
        + "\n".join(option_lines)
        + "\nAnswer: "
    )
    return question, example['answer']


In [7]:
# Preview the formatted question text (element 0 of the returned tuple) for the first
# elementary-mathematics test example.
print(gen_iccl_single_example(mmlu_math1['test'][0])[0])

What is the value of p in 24 = 2p?
Choose the best answer from the following options:
A. p = 4
B. p = 8
C. p = 12
D. p = 24
Answer: 


In [10]:
def gen_all_iccl_prompts(easy_dataset, medium_dataset, hard_dataset, n_prompts, topic):
    """
    Generate n_prompts ICCL prompts for one topic and store them in "iccl_prompts.json".

    Each prompt consists of an easy, a medium and a hard demonstration (with answers)
    followed by a hard test question. No hard example is used twice within one call,
    neither as demonstration nor as test question.

    The JSON file maps topic -> prompt index -> {"question", "answer"}. An existing file is
    loaded first and the entries of this topic are updated in place.

    Args:
        easy_dataset: Dataset with a 'test' split supplying the easy demonstrations.
        medium_dataset: Dataset with a 'test' split supplying the medium demonstrations.
        hard_dataset: Dataset with a 'test' split supplying hard demonstrations and test questions.
        n_prompts: Number of prompts to generate.
        topic: Key under which the prompts are stored in the JSON file.

    Raises:
        ValueError: If the hard dataset holds fewer than 2 * n_prompts examples.
    """
    # Convert datasets to lists for easier shuffling & tracking
    easy_examples = list(easy_dataset['test'])
    medium_examples = list(medium_dataset['test'])
    hard_examples = list(hard_dataset['test'])

    # Every prompt consumes two distinct hard examples. Checking up front reports a shortage
    # as ValueError; without it the demonstration draw would hit an empty pool and raise a
    # bare IndexError from random.choice.
    if 2 * n_prompts > len(hard_examples):
        raise ValueError("Not enough unique hard examples available")

    try:
        with open("iccl_prompts.json", 'r') as f:
            iccl_prompts = json.load(f)
    except FileNotFoundError:
        iccl_prompts = {}
    if topic not in iccl_prompts:
        iccl_prompts[topic] = {}

    # Use indices to track used hard examples
    used_hard_indices = set()
    for prompt_idx in range(n_prompts):
        # Curriculum examples for demonstrations
        demo_easy = random.choice(easy_examples)
        demo_medium = random.choice(medium_examples)
        demo_hard_index = random.choice([i for i in range(len(hard_examples)) if i not in used_hard_indices])
        demo_hard = hard_examples[demo_hard_index]

        # Can't use hard example that appeared in demonstration for testing
        used_hard_indices.add(demo_hard_index)
        available_hard_indices = [i for i in range(len(hard_examples)) if i not in used_hard_indices]
        test_hard_index = random.choice(available_hard_indices)
        test_hard = hard_examples[test_hard_index]
        used_hard_indices.add(test_hard_index)

        demonstrations = [
            gen_iccl_single_example(demo_easy),
            gen_iccl_single_example(demo_medium),
            gen_iccl_single_example(demo_hard)
        ]
        test_question, test_answer = gen_iccl_single_example(test_hard)

        # Demonstrations with their answers, then the unanswered test question
        prompt = ""
        for demo_q, demo_a in demonstrations:
            prompt += f"{demo_q}" + str(demo_a) + "\n\n"
        prompt += f"{test_question}"

        iccl_prompts[topic][f"{prompt_idx}"] = {
            "question": prompt,
            "answer": test_answer
        }

    # Write the updated JSON back to the file
    with open("iccl_prompts.json", 'w') as f:
        json.dump(iccl_prompts, f, indent=4)


In [11]:
# Each prompt consumes two distinct hard examples (one demonstration, one test question),
# so half the size of the hard test split is the largest prompt count that can succeed.
gen_all_iccl_prompts(mmlu_math1, mmlu_math2, mmlu_math3, int(len(mmlu_math3['test']) // 2), "math")