In [None]:
# load the dataset https://huggingface.co/datasets/nanoverl/aime
# repeat each question 8 times
# upload to my account

In [2]:
#!/usr/bin/env python3
# Script to load the nanoverl/aime dataset, repeat each question 8 times, and upload to Hugging Face

import os
from datasets import load_dataset, Dataset, DatasetDict
from huggingface_hub import login, HfApi
import pandas as pd
import numpy as np

def repeat_dataset(ds, repeat_count=8):
    """
    Repeat each example in a dataset repeat_count times.

    Copies of a row are kept adjacent: the result holds row 0 repeat_count
    times, then row 1 repeat_count times, and so on.

    Args:
        ds: Dataset-like object supporting ``len(ds)`` and
            ``ds.select(indices)`` (e.g. a Hugging Face ``Dataset``).
        repeat_count: Number of consecutive copies of each row. Must be >= 0;
            0 yields an empty selection.

    Returns:
        The object returned by ``ds.select`` for the repeated row indices.

    Raises:
        ValueError: If repeat_count is negative.
    """
    if repeat_count < 0:
        raise ValueError(f"repeat_count must be >= 0, got {repeat_count}")

    # Row indices in the form [0, 0, ..., 1, 1, ..., n-1, n-1, ...].
    indices = np.repeat(np.arange(len(ds)), repeat_count)

    # Selecting by index keeps the dataset's own column schema. Going through
    # DataFrame.values instead would upcast mixed-dtype columns to `object`
    # and force the types to be re-inferred when converting back.
    return ds.select(indices)

In [6]:
# Fetch the AMC problem set (nanoverl/amc) from the Hugging Face Hub
print("Loading dataset...")
dataset = load_dataset("nanoverl/amc")
print(f"Dataset loaded: {dataset}")

# Summarise which splits exist and how many rows each one holds
print("Dataset splits:", dataset.keys())
for split, split_rows in dataset.items():
    print(f"{split} size: {len(split_rows)}")


Loading dataset...


Dataset loaded: DatasetDict({
    test: Dataset({
        features: ['problem', 'answer', 'url', 'difficulty'],
        num_rows: 83
    })
})
Dataset splits: dict_keys(['test'])
test size: 83


In [7]:
# Peek at the schema and one record of the test split before transforming it
test_split = dataset["test"]
print("\nDataset columns:", test_split.column_names)
print("\nFirst example:")
print(test_split[0])

# Build a 4x-repeated copy of every split, reporting sizes along the way
print("\nRepeating dataset examples...")
repeated_dataset = {}
for split, original_rows in dataset.items():
    repeated_rows = repeat_dataset(original_rows, repeat_count=4)
    repeated_dataset[split] = repeated_rows
    print(f"Original {split} size: {len(original_rows)}")
    print(f"Repeated {split} size: {len(repeated_rows)}")

# Wrap the per-split results back into a DatasetDict ready for upload
repeated_dataset_dict = DatasetDict(repeated_dataset)


Dataset columns: ['problem', 'answer', 'url', 'difficulty']

First example:
{'problem': '$\\frac{m}{n}$ is the Irreducible fraction value of \\[3+\\frac{1}{3+\\frac{1}{3+\\frac13}}\\], what is the value of $m+n$?', 'answer': 142, 'url': 'https://artofproblemsolving.com/wiki/index.php/2022_AMC_12A_Problems/Problem_1', 'difficulty': 2.0}

Repeating dataset examples...
Original test size: 83
Repeated test size: 332


In [8]:

# Show a few examples to verify repetition
print("\nVerifying repetition with first few examples:")
for i in range(min(10, len(repeated_dataset_dict['test']))):
    print(f"Sample {i}: {repeated_dataset_dict['test'][i]['problem'][:100]}...")

# Login to Hugging Face Hub
print("\nLogging in to Hugging Face Hub...")
# huggingface_hub picks up HF_TOKEN from the environment on its own, so an
# explicit login is only needed when the variable is absent. Calling
# login(token=...) while HF_TOKEN is set is redundant and only produces a
# warning that the environment token stays the active one.
if not os.environ.get("HF_TOKEN"):
    login()  # Will prompt for token

# Define your Hugging Face username and repository name
username = input("Enter your Hugging Face username: ").strip()
if not username:
    # A blank answer would yield the invalid repo id "/amc_repeated_4x";
    # fall back to the account that owns the active token instead.
    username = HfApi().whoami()["name"]
repo_name = "amc_repeated_4x"
repo_id = f"{username}/{repo_name}"

# Push the dataset to Hugging Face Hub
print(f"\nUploading dataset to {repo_id}...")
repeated_dataset_dict.push_to_hub(
    repo_id,
    private=False,  # Set to True if you want a private repository
    commit_message="Upload dataset with each question repeated 4 times"
)

print(f"Dataset successfully uploaded to: https://huggingface.co/datasets/{repo_id}")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.



Verifying repetition with first few examples:
Sample 0: $\frac{m}{n}$ is the Irreducible fraction value of \[3+\frac{1}{3+\frac{1}{3+\frac13}}\], what is th...
Sample 1: $\frac{m}{n}$ is the Irreducible fraction value of \[3+\frac{1}{3+\frac{1}{3+\frac13}}\], what is th...
Sample 2: $\frac{m}{n}$ is the Irreducible fraction value of \[3+\frac{1}{3+\frac{1}{3+\frac13}}\], what is th...
Sample 3: $\frac{m}{n}$ is the Irreducible fraction value of \[3+\frac{1}{3+\frac{1}{3+\frac13}}\], what is th...
Sample 4: How many ways are there to split the integers $1$ through $14$ into $7$ pairs such that in each pair...
Sample 5: How many ways are there to split the integers $1$ through $14$ into $7$ pairs such that in each pair...
Sample 6: How many ways are there to split the integers $1$ through $14$ into $7$ pairs such that in each pair...
Sample 7: How many ways are there to split the integers $1$ through $14$ into $7$ pairs such that in each pair...
Sample 8: What is the product of all real

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1721.09ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.16it/s]


Dataset successfully uploaded to: https://huggingface.co/datasets/SDSB/amc_repeated_4x
