In [None]:
import json
import os
from datasets import load_dataset

# Load the "train" split of the likaixin/TACO-verified dataset.
# NOTE(review): trust_remote_code=True presumably lets loading code shipped with
# the dataset repository run locally — confirm the source is trusted.
ds = load_dataset(
    "likaixin/TACO-verified",
    split="train",
    trust_remote_code=True,
)

# Print the loaded dataset object for a quick visual check.
print(ds)

In [2]:
# TACO labels difficulty with one of the strings EASY, MEDIUM, MEDIUM_HARD,
# HARD or VERY_HARD. They are mapped onto numeric scores as
# EASY->1.9, MEDIUM->3.7, MEDIUM_HARD->5.5, HARD->7.3, VERY_HARD->9.1.
def difficulty_to_int(difficulty):
    """Translate a TACO difficulty label into its numeric score.

    Args:
        difficulty: The difficulty label; expected to be one of "EASY",
            "MEDIUM", "MEDIUM_HARD", "HARD" or "VERY_HARD".

    Returns:
        The float score associated with the label (1.9, 3.7, 5.5, 7.3 or 9.1),
        or the integer -1 when the label is not one of the five known values.
        Despite the function's name, known labels map to floats.
    """
    # Ordered (label, score) pairs, compared with == exactly like an if/elif chain
    # so that any value (even an unhashable one) simply falls through to -1.
    score_by_label = (
        ("EASY", 1.9),
        ("MEDIUM", 3.7),
        ("MEDIUM_HARD", 5.5),
        ("HARD", 7.3),
        ("VERY_HARD", 9.1),
    )
    for label, score in score_by_label:
        if difficulty == label:
            return score
    # Unrecognised label.
    return -1

In [None]:
import ast
# An entry is kept only if it has strictly more than four test cases,
# i.e. at least this many.
MIN_TEST_CASES = 5


def _parse_tests(raw_tests):
    """Return the test specification of one entry as a Python object.

    Non-string values are returned unchanged. Strings are first read as a
    Python literal and, if that fails, as JSON.

    Args:
        raw_tests: The value of an entry's "input_output" field.

    Returns:
        The parsed object (or ``raw_tests`` itself when it is not a string).

    Raises:
        ValueError: If the string is neither a Python literal nor valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    if not isinstance(raw_tests, str):
        return raw_tests
    try:
        return ast.literal_eval(raw_tests)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval may raise any of these on input it cannot handle;
        # in every case fall back to JSON instead of aborting the whole run.
        return json.loads(raw_tests)


def _validate_tests(tests):
    """Assert that ``tests`` is a dict holding equally long "inputs"/"outputs" lists.

    The key and type checks run before the length comparison so that a
    malformed entry fails with its descriptive message rather than a KeyError.

    Raises:
        AssertionError: If any of the structural checks fails.
    """
    assert isinstance(tests, dict), "Tests should be a dictionary"
    assert "inputs" in tests, "Inputs should be a key in the dictionary"
    assert "outputs" in tests, "Outputs should be a key in the dictionary"
    assert isinstance(tests["inputs"], list), "Inputs should be a list"
    assert isinstance(tests["outputs"], list), "Outputs should be a list"
    assert len(tests["inputs"]) == len(tests["outputs"]), "Inputs and outputs should have the same length"


dataset = []
# NOTE(review): this counter is never incremented in this cell, so it is always
# reported as 0 — confirm whether difficulty tracking was meant to be wired in.
unknown_difficulty = 0
for entry in ds:
    raw_tests = entry["input_output"]
    # Entries without any test specification are dropped.
    if not raw_tests:
        continue

    try:
        tests = _parse_tests(raw_tests)
    except (ValueError, RecursionError) as e:
        # Neither parser accepted the string: report it and skip the entry.
        print(repr(raw_tests))
        print(f"Error in json.loads: {e}")
        continue

    _validate_tests(tests)

    # Skip entries with too few test cases.
    if len(tests["inputs"]) < MIN_TEST_CASES:
        continue

    dataset.append(
        {
            "problem": entry["question"],
            "tests": tests,
            "solutions": entry["solutions"],
        }
    )

print(len(dataset))
# Guard the sample print so an empty result does not raise IndexError.
if dataset:
    print(dataset[0])
print(unknown_difficulty)

output_dir = os.path.abspath("../../train/code")
# Create the target directory if needed; open() does not create parent directories.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "taco.json")
with open(output_file, "w") as f:
    json.dump(dataset, f, indent=4)