In [1]:
# Destination of the exported dataset: a JSONL file (one JSON record per line).
# The path is relative, so it resolves against the notebook's working directory.
OUTPUT_FILE = "../data/aime.jsonl"

# AIME Dataset Visualization

This notebook loads the AIME dataset from the DeepSeek 1.5B model evaluations, displays a sample of it, and exports the problem/answer pairs to a JSONL file.

In [2]:
import os
import pandas as pd
import re
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import json

## Load the AIME dataset

In [3]:
# Read the raw AIME records from parquet, then report the frame's dimensions
# and column names as a quick check on what was loaded.
file_path = "../data/aime.parquet"
df_raw = pd.read_parquet(file_path)

print(f"Dataset shape: {df_raw.shape}")
print(f"Columns: {df_raw.columns.tolist()}")

Dataset shape: (30, 5)
Columns: ['data_source', 'prompt', 'ability', 'reward_model', 'extra_info']


## Display sample data

In [4]:
# Preview the top five rows of the raw frame (rich display as the cell's last expression)
df_raw.head(n=5)

Unnamed: 0,data_source,prompt,ability,reward_model,extra_info
0,,[{'content': 'Every morning Aya goes for a $9$...,math,"{'ground_truth': '204', 'style': 'rule'}","{'index': 0, 'split': 'test'}"
1,,[{'content': 'There exist real numbers $x$ and...,math,"{'ground_truth': '025', 'style': 'rule'}","{'index': 1, 'split': 'test'}"
2,,[{'content': 'Alice and Bob play the following...,math,"{'ground_truth': '809', 'style': 'rule'}","{'index': 2, 'split': 'test'}"
3,,[{'content': 'Jen enters a lottery by picking ...,math,"{'ground_truth': '116', 'style': 'rule'}","{'index': 3, 'split': 'test'}"
4,,[{'content': 'Rectangles $ABCD$ and $EFGH$ are...,math,"{'ground_truth': '104', 'style': 'rule'}","{'index': 4, 'split': 'test'}"


In [5]:
# Flatten each raw row into a plain {'problem', 'answer'} record.
#   - 'problem' is the 'content' of the first message in the row's 'prompt'.
#   - 'answer' is the 'ground_truth' stored in 'reward_model', kept exactly as
#     stored (a string such as '025', so leading zeros survive).
# Only these two columns are needed, so iterate them directly: this avoids the
# per-row Series that iterrows() builds and the extra index column that
# reset_index() added, neither of which was used.
aime_dataset = [
    {
        'problem': prompt[0]['content'],
        'answer': reward_model['ground_truth'],
    }
    for prompt, reward_model in zip(df_raw['prompt'], df_raw['reward_model'])
]
len(aime_dataset)

30

In [7]:
# Create the output directory if the path has one; a bare filename has no
# directory component, in which case there is nothing to create.
dirname = os.path.dirname(OUTPUT_FILE)
if dirname.strip():
    os.makedirs(dirname, exist_ok=True)

# Save to JSONL: one JSON object per line.
# Encoding and newline are pinned so the file is byte-identical on every
# platform. Without them, text mode uses the locale's default encoding and,
# on Windows, rewrites '\n' as '\r\n'.
with open(OUTPUT_FILE, 'w', encoding='utf-8', newline='\n') as f:
    for item in aime_dataset:
        f.write(json.dumps(item) + '\n')

print(f"Saved {len(aime_dataset)} records to {OUTPUT_FILE}")

Saved 30 records to ../data/aime.jsonl
