In [None]:
import ast
import json
import os
from datasets import load_dataset

# Load the "train" split of the codeparrot/apps dataset.
# NOTE(review): trust_remote_code=True permits the dataset repository's own
# loading script to run locally — confirm the source is trusted.
ds = load_dataset(
    "codeparrot/apps",
    split="train",
    trust_remote_code=True,
)

# Print the dataset object, then the difficulty value of the first record.
print(ds)
print(ds[0]["difficulty"])

In [None]:
# APPS labels difficulty with one of three strings:
# introductory, interview, competition.
# They are scored as introductory -> 2.5, interview -> 5.5, competition -> 8.5.
def difficulty_to_int(difficulty):
    """Translate an APPS difficulty label into its numeric score.

    Args:
        difficulty: The label to translate; compared by equality against
            "introductory", "interview" and "competition".

    Returns:
        2.5, 5.5 or 8.5 for the three known labels (floats, despite the
        function's name), or -1 when the value matches none of them.
    """
    score_table = (
        ("introductory", 2.5),
        ("interview", 5.5),
        ("competition", 8.5),
    )
    # Scan with == rather than a dict lookup so unhashable inputs still
    # fall through to the -1 sentinel instead of raising.
    for label, score in score_table:
        if difficulty == label:
            return score
    return -1

def process_dataset(ds):
    """Convert raw dataset entries into a list of problem/test records.

    Each entry is read through its "input_output" and "question" keys. The
    "input_output" value may be an already-parsed object or a string; strings
    are parsed with ``ast.literal_eval`` first and ``json.loads`` as a
    fallback (needed for JSON-only tokens such as true/false/null).

    An entry is skipped when:
      * its "input_output" value is empty/falsy,
      * the string cannot be parsed by either parser (the raw value and the
        JSON error are printed), or
      * the parsed spec has no "inputs" key or at most one input.

    Args:
        ds: An iterable of mappings with "input_output" and "question" keys.

    Returns:
        A list of dicts of the form ``{"problem": <question>, "tests": <spec>}``.

    Raises:
        AssertionError: If the (parsed) test spec is not a dictionary.
    """
    dataset = []
    for entry in ds:
        tests = entry["input_output"]
        if not tests:
            continue
        if isinstance(tests, str):
            raw = tests
            try:
                tests = ast.literal_eval(raw)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # literal_eval is documented to raise any of these on input it
                # cannot handle; in every case retry with the JSON parser.
                try:
                    tests = json.loads(raw)
                except ValueError as err:  # json.JSONDecodeError subclasses ValueError
                    print(repr(raw))
                    print(f"Error in json.loads: {err}")
                    continue
        assert isinstance(tests, dict), "Tests should be a dictionary"
        # Keep only problems with at least two test inputs. A spec without an
        # "inputs" key has no usable tests, so skip it instead of raising KeyError.
        inputs = tests.get("inputs")
        if not inputs or len(inputs) <= 1:
            continue
        dataset.append({
            "problem": entry["question"],
            "tests": tests,
        })
    return dataset

# Convert the loaded split into problem/test records and report how many were kept.
train_dataset = process_dataset(ds)
print(len(train_dataset))

# The destination is resolved relative to the current working directory.
output_dir = os.path.abspath("../../train/code")
output_file = os.path.join(output_dir, "apps.json")

# open(..., "w") does not create missing parent directories, so make sure the
# target directory exists before writing (no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)

with open(output_file, "w") as f:
    json.dump(train_dataset, f, indent=4)

In [4]:
# Read the exported JSON file back into memory, replacing train_dataset
# with the deserialized contents.
with open(output_file) as f:
    train_dataset = json.loads(f.read())
