In [2]:
import json
import os
from datasets import load_dataset

# Load the "train" split of the likaixin/TACO-verified dataset.
# NOTE(review): trust_remote_code=True allows code shipped with the dataset
# repository to execute locally -- confirm the source is trusted.
ds = load_dataset(
    "likaixin/TACO-verified",
    split="train",
    trust_remote_code=True,
)

# Display the split summary (column names and row count).
print(ds)

Dataset({
    features: ['id', 'question', 'solutions', 'starter_code', 'input_output', 'difficulty', 'raw_tags', 'name', 'source', 'tags', 'skill_types', 'url', 'Expected Auxiliary Space', 'time_limit', 'date', 'picture_num', 'memory_limit', 'Expected Time Complexity'],
    num_rows: 12898
})


In [3]:
# TACO stores difficulty as one of five strings. They are placed on a numeric
# scale as follows:
#   EASY -> 1.9, MEDIUM -> 3.7, MEDIUM_HARD -> 5.5, HARD -> 7.3, VERY_HARD -> 9.1
def difficulty_to_int(difficulty):
    """Translate a TACO difficulty label into its numeric score.

    Args:
        difficulty: The difficulty value of a dataset entry; expected to be one
            of "EASY", "MEDIUM", "MEDIUM_HARD", "HARD" or "VERY_HARD".

    Returns:
        The float score for a recognised label (despite the function's name,
        the scores are not integers), or the integer sentinel -1 for any other
        value.
    """
    score_table = (
        ("EASY", 1.9),
        ("MEDIUM", 3.7),
        ("MEDIUM_HARD", 5.5),
        ("HARD", 7.3),
        ("VERY_HARD", 9.1),
    )
    # Compare with == rather than hashing so that any input type (including
    # unhashable values) simply falls through to the sentinel.
    for label, score in score_table:
        if difficulty == label:
            return score
    return -1

In [10]:
import ast
# Build the training set from `ds`: keep every problem whose test cases can be
# parsed into {"inputs": [...], "outputs": [...]} and that has more than two
# test cases, then write the result to ../../train/code/taco.json.
dataset = []
# Number of kept entries whose difficulty label difficulty_to_int() does not
# recognise (it returns -1 for those).
unknown_difficulty = 0
for entry in ds:
    tests = entry["input_output"]
    if not tests:
        continue

    if isinstance(tests, str):
        # The field is a serialized dict. Try a Python literal first and fall
        # back to JSON; entries that parse as neither are reported and skipped.
        try:
            tests = ast.literal_eval(entry["input_output"])
        except (ValueError, SyntaxError):
            try:
                tests = json.loads(entry["input_output"])
            except (json.JSONDecodeError, SyntaxError, ValueError) as e:
                print(repr(entry["input_output"]))
                print(f"Error in json.loads: {e}")
                continue

    # Validate the structure before indexing into it, so a malformed entry
    # fails with a descriptive assertion instead of a bare KeyError.
    assert isinstance(tests, dict), "Tests should be a dictionary"
    assert "inputs" in tests, "Inputs should be a key in the dictionary"
    assert "outputs" in tests, "Outputs should be a key in the dictionary"
    assert isinstance(tests["inputs"], list), "Inputs should be a list"
    assert isinstance(tests["outputs"], list), "Outputs should be a list"
    assert len(tests["inputs"]) == len(tests["outputs"]), "Inputs and outputs should have the same length"

    # Problems with two or fewer test cases are dropped.
    if len(tests["inputs"]) <= 2:
        continue

    if difficulty_to_int(entry["difficulty"]) == -1:
        unknown_difficulty += 1

    dataset.append({
        "problem": entry["question"],
        "tests": tests,
        # NOTE(review): assumes entry["solutions"] is a non-empty list. If the
        # column is a serialized string this takes only its first character,
        # and an empty list raises IndexError -- confirm against the dataset.
        "solutions": entry["solutions"][0],
        "id": entry["id"],
    })

print(len(dataset))
if dataset:
    # Guarded so an empty result does not raise IndexError.
    print(dataset[0])
print(unknown_difficulty)

output_dir = os.path.abspath("../../train/code")
# Create the target directory if needed; open() would otherwise fail.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "taco.json")
with open(output_file, "w") as f:
    json.dump(dataset, f, indent=4)

8544
{'problem': 'There are $n$ candy boxes in front of Tania. The boxes are arranged in a row from left to right, numbered from $1$ to $n$. The $i$-th box contains $r_i$ candies, candies have the color $c_i$ (the color can take one of three values \u200b\u200b— red, green, or blue). All candies inside a single box have the same color (and it is equal to $c_i$).\n\nInitially, Tanya is next to the box number $s$. Tanya can move to the neighbor box (that is, with a number that differs by one) or eat candies in the current box. Tanya eats candies instantly, but the movement takes one second.\n\nIf Tanya eats candies from the box, then the box itself remains in place, but there is no more candies in it. In other words, Tanya always eats all the candies from the box and candies in the boxes are not refilled.\n\nIt is known that Tanya cannot eat candies of the same color one after another (that is, the colors of candies in two consecutive boxes from which she eats candies are always differen

In [4]:
# Load the "test" split of the likaixin/TACO-verified dataset. This rebinds
# `ds`, so any cell that reads `ds` after this point sees the test split.
# NOTE(review): trust_remote_code=True allows code shipped with the dataset
# repository to execute locally -- confirm the source is trusted.
ds = load_dataset(
    "likaixin/TACO-verified",
    split="test",
    trust_remote_code=True,
)
# Display the split summary (column names and row count).
print(ds)

Dataset({
    features: ['question', 'solutions', 'starter_code', 'input_output', 'difficulty', 'raw_tags', 'name', 'source', 'tags', 'skill_types', 'url', 'Expected Auxiliary Space', 'time_limit', 'date', 'picture_num', 'memory_limit', 'Expected Time Complexity'],
    num_rows: 1000
})


In [5]:
# Build the test set from `ds`: keep every problem whose test cases can be
# parsed into a dict, then write the result to ../../test/code/taco.json.
# Relies on `ast`, `json` and `os` having been imported by earlier cells.
dataset = []
for entry in ds:
    tests = entry["input_output"]
    if not tests:
        continue
    if isinstance(tests, str):
        # The field is a serialized dict. Try a Python literal first and fall
        # back to JSON; entries that parse as neither are reported and skipped.
        try:
            tests = ast.literal_eval(entry["input_output"])
        except (ValueError, SyntaxError):
            try:
                tests = json.loads(entry["input_output"])
            except (json.JSONDecodeError, SyntaxError, ValueError) as e:
                print(repr(entry["input_output"]))
                print(f"Error in json.loads: {e}")
                continue
    assert isinstance(tests, dict), "Tests should be a dictionary"

    new_entry = {
        "problem": entry["question"],
        "tests": tests,
    }
    dataset.append(new_entry)

print(len(dataset))
if dataset:
    # Guarded so an empty result does not raise IndexError.
    print(dataset[0])

output_dir = os.path.abspath("../../test/code")
# Create the target directory if needed; open() would otherwise fail.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "taco.json")

with open(output_file, "w") as f:
    json.dump(dataset, f, indent=4)

1000
{'problem': 'The city park of IT City contains n east to west paths and n north to south paths. Each east to west path crosses each north to south path, so there are n^2 intersections.\n\nThe city funded purchase of five benches. To make it seems that there are many benches it was decided to place them on as many paths as possible. Obviously this requirement is satisfied by the following scheme: each bench is placed on a cross of paths and each path contains not more than one bench.\n\nHelp the park administration count the number of ways to place the benches.\n\n\n-----Input-----\n\nThe only line of the input contains one integer n (5 ≤ n ≤ 100) — the number of east to west paths and north to south paths.\n\n\n-----Output-----\n\nOutput one integer — the number of ways to place the benches.\n\n\n-----Examples-----\nInput\n5\n\nOutput\n120', 'tests': {'inputs': ['5\n', '6\n', '7\n', '15\n', '17\n', '72\n', '83\n', '95\n', '99\n', '100\n', '7\n', '100\n', '83\n', '72\n', '6\n', '15