In [32]:
from datasets import load_dataset
import os
import random
import json
from itertools import permutations

## Test on math subsets

In [4]:
# Three MMLU math subsets; later cells pass them as the easy / medium / hard tiers, in this order.
mmlu_math1 = load_dataset(path="cais/mmlu", name="elementary_mathematics")
mmlu_math2 = load_dataset(path="cais/mmlu", name="high_school_mathematics")
mmlu_math3 = load_dataset(path="cais/mmlu", name="college_mathematics")

README.md:   0%|          | 0.00/53.2k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/138k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/41.1k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/9.38k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.55k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/378 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/41 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/33.7k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/6.99k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.50k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/270 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/29 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/16.6k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/5.00k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/5.16k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

In [7]:
# Check the dataset format: a record is a dict with 'question', 'subject',
# 'choices' (list of option strings) and 'answer' keys. In the sample printed
# below, 'answer' is the 0-based position of the correct entry in 'choices'.
print(mmlu_math1['test'][1])


{'question': 'Ms. Perez drove a total of 40 miles in 5 days. She drove the same number of miles each day. How many miles did Ms. Perez drive each day?', 'subject': 'elementary_mathematics', 'choices': ['5', '7', '8', '9'], 'answer': 2}


In [8]:
def gen_iccl_single_example(example):
    """Render one multiple-choice record as prompt text.

    Args:
        example: Mapping with a 'question' string, a 'choices' iterable and an
            'answer' value.

    Returns:
        tuple: ``(text, answer)``. ``text`` is the question, an instruction line,
        one ``"<index>. <choice>"`` line per choice (indices start at 0) and a
        trailing ``"Answer: "`` cue with nothing after it; ``answer`` is
        ``example['answer']`` returned unchanged.
    """
    header = example['question'] + "\nChoose the best answer from the following options:\n"

    numbered_options = []
    for position, choice_text in enumerate(example['choices']):
        numbered_options.append(f"{position}. {choice_text}")

    rendered = header + "\n".join(numbered_options) + "\nAnswer: "
    return rendered, example['answer']


In [9]:
# Render the first test record; [0] keeps only the prompt text and drops the answer.
print(gen_iccl_single_example(mmlu_math1['test'][0])[0])

What is the value of p in 24 = 2p?
Choose the best answer from the following options:
0. p = 4
1. p = 8
2. p = 12
3. p = 24
Answer: 


In [10]:
def gen_all_iccl_prompts(easy_dataset, medium_dataset, hard_dataset, n_prompts, topic):
    """Build ``n_prompts`` easy -> medium -> hard prompts and store them in ``iccl_prompts.json``.

    Each prompt consists of three solved demonstrations (one from the 'test'
    split of each dataset, in easy, medium, hard order) followed by one
    unanswered question from the hard dataset. Within one call every hard
    example is used at most once, either as a demonstration or as a test
    question, so a test question never doubles as a demonstration.

    Results are stored in the JSON file under ``[topic][str(prompt_idx)]`` as
    ``{"question": <prompt text>, "answer": <answer of the test question>}``.
    Existing content of the file is kept: other topics are untouched, and
    entries of ``topic`` are only overwritten for indices ``0 .. n_prompts - 1``.

    Args:
        easy_dataset: Dataset with a 'test' split, source of the easy demonstrations.
        medium_dataset: Dataset with a 'test' split, source of the medium demonstrations.
        hard_dataset: Dataset with a 'test' split, source of the hard
            demonstrations and of the test questions.
        n_prompts: Number of prompts to generate.
        topic: Key under which the prompts are stored in the JSON file.

    Raises:
        ValueError: If the hard dataset has fewer than ``2 * n_prompts`` test
            examples. Raised before anything is generated or written.
    """
    output_path = "iccl_prompts.json"
    try:
        with open(output_path, 'r', encoding="utf-8") as f:
            iccl_prompts = json.load(f)
    except FileNotFoundError:
        iccl_prompts = {}
    topic_prompts = iccl_prompts.setdefault(topic, {})

    # Materialise the splits once so they can be sampled from.
    easy_examples = list(easy_dataset['test'])
    medium_examples = list(medium_dataset['test'])
    hard_examples = list(hard_dataset['test'])

    # Every prompt consumes two distinct hard examples (demonstration + test).
    # Checking up-front replaces an IndexError from random.choice on an empty
    # pool with a clear error, and guarantees the file is never half-updated.
    n_hard_needed = max(0, 2 * n_prompts)
    if len(hard_examples) < n_hard_needed:
        raise ValueError("Not enough unique hard examples available")

    # One draw without replacement; consecutive pairs are (demonstration, test).
    hard_picks = random.sample(hard_examples, n_hard_needed)

    for prompt_idx in range(n_prompts):
        demo_hard = hard_picks[2 * prompt_idx]
        test_hard = hard_picks[2 * prompt_idx + 1]

        # Curriculum order of the demonstrations: easy, medium, hard.
        demonstrations = [
            gen_iccl_single_example(random.choice(easy_examples)),
            gen_iccl_single_example(random.choice(medium_examples)),
            gen_iccl_single_example(demo_hard),
        ]
        test_question, test_answer = gen_iccl_single_example(test_hard)

        # Solved demonstrations separated by blank lines, then the open test question.
        prompt = "".join(f"{demo_q}{demo_a}\n\n" for demo_q, demo_a in demonstrations)
        prompt += test_question

        topic_prompts[str(prompt_idx)] = {
            "question": prompt,
            "answer": test_answer
        }

    # Write the updated JSON back to the file
    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(iccl_prompts, f, indent=4)


In [11]:
# Each prompt uses up two hard questions (one demonstration, one test),
# so at most half of the hard test split can be turned into prompts.
gen_all_iccl_prompts(
    easy_dataset=mmlu_math1,
    medium_dataset=mmlu_math2,
    hard_dataset=mmlu_math3,
    n_prompts=len(mmlu_math3['test']) // 2,
    topic="math",
)

## Generate ICCL prompts for all topics

In [23]:
def gen_random_iccl_prompt(topic_datasets):
    """
    Build a single ICCL prompt for a randomly chosen topic.

    The prompt holds three solved demonstrations (easy, medium, hard, in that
    order) followed by an unanswered hard test question that is different from
    the hard demonstration.

    Args:
        topic_datasets: Dictionary of dataset triples {"math": [easy1, medium1, hard1], "english": [easy2, medium2, hard2], ...}
                       where each dataset has a 'test' split

    Returns:
        tuple: (prompt, test_answer) where prompt is the full prompt text and
        test_answer is the answer of the final, unanswered question

    Raises:
        ValueError: If no hard example differs from the hard demonstration.
    """
    # Pick the topic first, then unpack its easy / medium / hard datasets
    chosen_topic = random.choice(list(topic_datasets))
    easy_dataset, medium_dataset, hard_dataset = topic_datasets[chosen_topic]

    # Materialise the three test splits so random.choice can index them
    easy_pool, medium_pool, hard_pool = (
        list(dataset['test']) for dataset in (easy_dataset, medium_dataset, hard_dataset)
    )

    # One demonstration per difficulty level, drawn in easy -> medium -> hard order
    shots = [random.choice(pool) for pool in (easy_pool, medium_pool, hard_pool)]
    hard_shot = shots[-1]

    # The test question must not equal the hard demonstration
    candidates = list(filter(lambda ex: ex != hard_shot, hard_pool))
    if len(candidates) == 0:
        raise ValueError("Not enough unique hard examples available")
    target = random.choice(candidates)

    # Solved demonstrations, each followed by a blank line
    pieces = []
    for shot in shots:
        shot_q, shot_a = gen_iccl_single_example(shot)
        pieces.append(f"{shot_q}{shot_a}\n\n")

    # The test question is left open; its answer is returned separately
    test_question, test_answer = gen_iccl_single_example(target)
    pieces.append(f"{test_question}")

    return "".join(pieces), test_answer

In [25]:
# Smoke test with a single topic: the printed prompt should show three answered
# demonstrations followed by one question whose "Answer: " is left blank.
prompt, answer = gen_random_iccl_prompt({"math": [mmlu_math1, mmlu_math2, mmlu_math3]})
print(prompt)


Gwen wrote the number pattern below on a piece of paper. 1, 5, 9, 13 What are the next two terms in Gwen’s pattern?
Choose the best answer from the following options:
0. 15, 17
1. 15, 19
2. 17, 19
3. 17, 21
Answer: 3

The area bounded by the parabola y = x^2 and the lines y = 1 and y = 9 equals
Choose the best answer from the following options:
0. 8
1. 84/3
2. 64\sqrt{2}/3
3. 104/3
Answer: 3

What is the greatest possible area of a triangular region with one vertex at the center of a circle of radius 1 and the other two vertices on the circle?
Choose the best answer from the following options:
0. 1/2
1. 1
2. sqrt(2)
3. pi
Answer: 0

Sofia and Tess will each randomly choose one of the 10 integers from 1 to 10. What is the probability that neither integer chosen will be the square of the other?
Choose the best answer from the following options:
0. 0.64
1. 0.72
2. 0.81
3. 0.95
Answer: 


In [35]:
def gen_multiple_iccl_prompts(topic_datasets, n_prompts):
    """
    Generate ICCL prompts for each topic in all six orderings of the easy/medium/hard demonstrations.

    For every topic and every prompt index, one easy, one medium and one hard
    demonstration plus one hard test question are drawn once; the same four
    examples are then written in each of the 6 demonstration orderings. The
    files of a topic therefore differ only in the order of the demonstrations.

    Output files are ``<topic>/<topic>_<order>_iccl_examples.json`` with
    ``<order>`` in 123, 132, 213, 231, 312, 321 (1=easy, 2=medium, 3=hard).
    Each file maps ``str(prompt_idx)`` to
    ``{"question": <prompt text>, "answer": <answer of the test question>}``.
    Existing files are updated in place: entries with other indices are kept.

    Within one topic a hard example is used as test question at most once, and a
    hard demonstration is never a question that has already been used as a test
    question (including the current one). A hard demonstration may still be
    drawn again later, as a demonstration or as a test question.

    Args:
        topic_datasets: Dictionary of dataset triples {"math": [easy1, medium1, hard1], "english": [easy2, medium2, hard2], ...}
                       where each dataset has a 'test' split
        n_prompts: Number of prompts to generate per ordering

    Raises:
        ValueError: If a topic's hard dataset has fewer than ``n_prompts + 1``
            test examples. Raised before any file of that topic is touched.
    """
    # Get all possible orderings of demonstrations (1=easy, 2=medium, 3=hard)
    orders = list(permutations([1, 2, 3]))

    for topic in topic_datasets:
        # Create topic directory if it doesn't exist
        os.makedirs(topic, exist_ok=True)

        easy_dataset, medium_dataset, hard_dataset = topic_datasets[topic]

        # Convert datasets to lists for easier random selection
        easy_examples = list(easy_dataset['test'])
        medium_examples = list(medium_dataset['test'])
        hard_examples = list(hard_dataset['test'])

        if n_prompts <= 0:
            # Nothing to generate; leave any existing files untouched.
            continue

        # n_prompts distinct test questions are needed, and after the last one is
        # taken at least one unused hard example must remain for its demonstration.
        # Checking here avoids an IndexError from random.choice on an empty pool
        # and avoids leaving the topic's files partially updated.
        if len(hard_examples) < n_prompts + 1:
            raise ValueError(f"Not enough unique hard examples available for topic {topic}")

        # Load each ordering's file once instead of once per prompt.
        filepaths = {}
        prompts_by_order = {}
        for order in orders:
            order_str = ''.join(str(x) for x in order)
            filepath = os.path.join(topic, f"{topic}_{order_str}_iccl_examples.json")
            filepaths[order] = filepath
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    prompts_by_order[order] = json.load(f)
            except FileNotFoundError:
                prompts_by_order[order] = {}

        # Shuffled copy of the hard pool: pop() yields a uniformly random hard
        # example that has not been used as a test question yet.
        unused_hard = list(hard_examples)
        random.shuffle(unused_hard)

        for prompt_idx in range(n_prompts):
            test_hard = unused_hard.pop()

            # Demonstrations keyed by the difficulty codes used in `orders`.
            demo_examples = {
                1: random.choice(easy_examples),
                2: random.choice(medium_examples),
                3: random.choice(unused_hard),  # never a used test question
            }

            # Render every example once; only the order changes between files.
            demo_text = {}
            for difficulty, example in demo_examples.items():
                demo_q, demo_a = gen_iccl_single_example(example)
                demo_text[difficulty] = f"{demo_q}{demo_a}\n\n"
            test_question, test_answer = gen_iccl_single_example(test_hard)

            for order in orders:
                prompt = "".join(demo_text[difficulty] for difficulty in order) + test_question
                prompts_by_order[order][str(prompt_idx)] = {
                    "question": prompt,
                    "answer": test_answer,
                }

        # Write each ordering's prompts back exactly once.
        for order in orders:
            with open(filepaths[order], 'w', encoding='utf-8') as f:
                json.dump(prompts_by_order[order], f, indent=4)

In [36]:
# Ten prompts for the math topic, each written in all six demonstration orderings.
gen_multiple_iccl_prompts(
    topic_datasets={"math": [mmlu_math1, mmlu_math2, mmlu_math3]},
    n_prompts=10,
)