In [1]:
import json
import pandas as pd
import os
from datasets import load_dataset

# Load the "train" split of the DenCT/codeforces-problems-7k dataset via the
# `datasets` library; the result is bound to `ds` and consumed by the
# training-set export cell further down.
ds = load_dataset("DenCT/codeforces-problems-7k", split="train")
# Print the dataset summary (feature names and row count) as a quick sanity check.
print(ds)

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['contestId', 'name', 'tags', 'title', 'time-limit', 'memory-limit', 'problem-description', 'input-specification', 'output-specification', 'demo-input', 'demo-output', 'note', 'test_cases', 'timeConsumedMillis', 'memoryConsumedBytes', 'score', '__index_level_0__'],
    num_rows: 7323
})


In [2]:
# Prompt template that includes the resource limits.
# Placeholders: description, input_format, output_format, tags,
# time_limit (rendered with an "ms" suffix) and memory_limit (rendered with "MB").
# Note that the literal starts and ends with a newline, so every formatted
# prompt carries a leading and a trailing "\n".
CODEFORCES_TEMPLATE = """
{description}

Input Format:
{input_format}

Output Format:
{output_format}

Tags:
{tags}

Time Limit: {time_limit} ms
Memory Limit: {memory_limit} MB
"""

# Prompt template without the time/memory limit lines; instead it appends the
# sample input and output.
# Placeholders: description, input_format, output_format, tags,
# demo_input and demo_output.
CODEFORCES_TEMPLATE_NO_LIMIT = """
{description}

Input Format:
{input_format}

Output Format:
{output_format}

Tags:
{tags}

Demo input: {demo_input}
Demo output: {demo_output}
"""


In [3]:
# Extract the numeric difficulty rating (the tag marked with '*') from a problem's tags.
def extract_difficulty(tags):
    """Return the numeric difficulty rating encoded in a problem's tags.

    The difficulty is the first tag that contains a ``*`` and parses as an
    integer once the ``*`` characters are removed (e.g. ``"*1500"`` -> 1500).
    Starred tags that are not numeric (e.g. ``"*special"``) are skipped.

    Args:
        tags: Either a comma-separated string of tags, a list/tuple of tag
            strings, or a null value (``None`` / ``NaN``).

    Returns:
        int: The parsed difficulty, or 0 when ``tags`` is null or contains no
        numeric starred tag.
    """
    if isinstance(tags, (list, tuple)):
        # pd.isnull() on a sequence returns an element-wise array whose truth
        # value is ambiguous, so sequences are handled before the null check.
        candidates = tags
    elif pd.isnull(tags):  # Handles when tags are null
        return 0
    else:
        candidates = tags.split(",")

    for tag in candidates:
        # Non-string items (e.g. None inside a list) cannot carry a rating.
        if isinstance(tag, str) and "*" in tag:  # Difficulty is marked with a '*' symbol
            try:
                return int(tag.replace("*", ""))
            except ValueError:
                # Starred but non-numeric tag; keep looking.
                continue
    return 0  # Default difficulty

In [5]:
# Build the training records: one {"problem", "tests"} dict per dataset entry.
dataset = []
for entry in ds:
    tests = entry["test_cases"]
    # Skip entries that have at most one test case.
    if len(tests) <= 1:
        continue
    new_entry = {
        # Render the prompt without time/memory limits but with the demo I/O.
        "problem": CODEFORCES_TEMPLATE_NO_LIMIT.format(
            description=entry["problem-description"],
            input_format=entry["input-specification"],
            output_format=entry["output-specification"],
            tags=entry["tags"],
            demo_input=entry["demo-input"],
            demo_output=entry["demo-output"],
        ),
        "tests": tests,
    }
    dataset.append(new_entry)

print(f"Train dataset size: {len(dataset)}")

# The path is resolved relative to the current working directory.
output_dir = os.path.abspath("../../train/code")
# Create the target directory if it is missing; otherwise open() below raises
# FileNotFoundError on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "codeforces.json")

with open(output_file, "w") as f:
    json.dump(dataset, f, indent=4)

Train dataset size: 6128


In [6]:
# Load the "test" split of the Qwen/CodeElo dataset. This rebinds `ds`, so the
# training dataset loaded earlier is no longer reachable under that name.
ds = load_dataset("Qwen/CodeElo", split="test")
# Print the dataset summary (feature names and row count) as a quick sanity check.
print(ds)

Dataset({
    features: ['problem_id', 'url', 'title', 'rating', 'tags', 'div', 'time_limit_ms', 'memory_limit_mb', 'description', 'input', 'output', 'interaction', 'examples', 'note'],
    num_rows: 408
})


In [6]:
def process_test_cases(raw_cases):
    """Turn raw example pairs into a list of input/output dictionaries.

    Each item of ``raw_cases`` is indexed positionally: element 0 is taken as
    the complete input string and element 1 as the complete output string
    (see the ``examples`` field at https://huggingface.co/datasets/Qwen/CodeElo).
    Both values are passed through unchanged.

    Args:
        raw_cases: Iterable of indexable pairs ``(input, output)``.

    Returns:
        list[dict]: One ``{"input": ..., "output": ...}`` dict per pair, in
        the original order.
    """
    return [
        {"input": example[0], "output": example[1]}
        for example in raw_cases
    ]

# Build the test records: one {"problem", "tests"} dict per dataset entry.
dataset = []
for entry in ds:
    tests = process_test_cases(entry["examples"])
    # Skip entries that have no example test cases.
    if len(tests) == 0:
        continue
    new_entry = {
        # Render the prompt including the time and memory limits.
        "problem": CODEFORCES_TEMPLATE.format(
            description=entry["description"],
            input_format=entry["input"],
            output_format=entry["output"],
            tags=entry["tags"],
            time_limit=entry["time_limit_ms"],
            memory_limit=entry["memory_limit_mb"],
        ),
        "tests": tests,
    }
    dataset.append(new_entry)

print(f"Test dataset size: {len(dataset)}")

# The path is resolved relative to the current working directory.
output_dir = os.path.abspath("../../test/code")
# Create the target directory if it is missing; otherwise open() below raises
# FileNotFoundError on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "codeforces.json")

with open(output_file, "w") as f:
    json.dump(dataset, f, indent=4)

Test dataset size: 408
