In [15]:
import json
from datasets import load_dataset

# Load the "train" split of the APPS dataset ("codeparrot/apps").
# trust_remote_code=True permits the dataset repository's own loading script
# to execute locally, so only keep it for sources you trust.
ds = load_dataset(
    "codeparrot/apps",
    split="train",
    trust_remote_code=True,
)

# Show the split summary (feature names and row count).
print(ds)

Dataset({
    features: ['problem_id', 'question', 'solutions', 'input_output', 'difficulty', 'url', 'starter_code'],
    num_rows: 5000
})


In [16]:
# APPS dataset has difficulties as strings: introductory, interview, competition
# Let introductory->2.5, interview->5.5, competition->8.5
def difficulty_to_int(difficulty):
    """Translate an APPS difficulty label into a numeric score.

    Despite the function name, the recognised labels map to floats:
    "introductory" -> 2.5, "interview" -> 5.5, "competition" -> 8.5.

    Args:
        difficulty: The difficulty label to translate.

    Returns:
        The float score for a recognised label, or the integer 0 for
        any other value.
    """
    score_table = (
        ("introductory", 2.5),
        ("interview", 5.5),
        ("competition", 8.5),
    )
    # Compare against each known label in turn; the first match wins.
    for label, score in score_table:
        if difficulty == label:
            return score
    # Unrecognised label: fall back to 0.
    return 0


In [20]:
# Flatten every row of `ds` into the exported record layout, keeping only the
# rows whose "input_output" field is available as a decoded object.
# (output key, source column) pairs copied verbatim from each row, in order.
field_sources = (
    ("problem", "question"),
    ("solutions", "solutions"),
    ("starter_code", "starter_code"),
    ("url", "url"),
)

dataset = []
for entry in ds:
    new_entry = {target: entry[source] for target, source in field_sources}
    new_entry["difficulty"] = difficulty_to_int(entry["difficulty"])

    io_spec = entry["input_output"]
    if not isinstance(io_spec, dict):
        # Anything that is not already a dict has to be a JSON string.
        assert isinstance(io_spec, str), f"Expected entry['input_output'] str, got {type(entry['input_output'])}"
        try:
            io_spec = json.loads(io_spec)
        except json.JSONDecodeError:
            # The string is not valid JSON: leave this row out of the export.
            continue
    # Attached last so "input_output" stays the final key of the record.
    new_entry["input_output"] = io_spec
    dataset.append(new_entry)

# Report how many rows survived and show the first one as a sanity check.
print(len(dataset))
print(dataset[0])

# Persist the converted records as pretty-printed JSON.
with open("train_apps.json", "w") as out_file:
    json.dump(dataset, out_file, indent=4)

4805
{'problem': 'Polycarp has $n$ different binary words. A word called binary if it contains only characters \'0\' and \'1\'. For example, these words are binary: "0001", "11", "0" and "0011100".\n\nPolycarp wants to offer his set of $n$ binary words to play a game "words". In this game, players name words and each next word (starting from the second) must start with the last character of the previous word. The first word can be any. For example, these sequence of words can be named during the game: "0101", "1", "10", "00", "00001".\n\nWord reversal is the operation of reversing the order of the characters. For example, the word "0111" after the reversal becomes "1110", the word "11010" after the reversal becomes "01011".\n\nProbably, Polycarp has such a set of words that there is no way to put them in the order correspondent to the game rules. In this situation, he wants to reverse some words from his set so that:  the final set of $n$ words still contains different words (i.e. all 

In [21]:
# Load the "test" split of the APPS dataset ("codeparrot/apps").
# Note: this rebinds `ds`, so any previously loaded split is no longer
# reachable under that name.
# trust_remote_code=True permits the dataset repository's own loading script
# to execute locally, so only keep it for sources you trust.
ds = load_dataset(
    "codeparrot/apps",
    split="test",
    trust_remote_code=True,
)

# Show the split summary (feature names and row count).
print(ds)

Dataset({
    features: ['problem_id', 'question', 'solutions', 'input_output', 'difficulty', 'url', 'starter_code'],
    num_rows: 5000
})


In [22]:
# Convert every row of `ds` (the test split at this point) into the exported
# record layout and write the result to "test_apps.json".
#
# Each exported record carries the keys: problem, solutions, starter_code,
# url, difficulty, input_output (in that order).
dataset = []
for entry in ds:
    new_entry = {
        "problem": entry["question"],
        "solutions": entry["solutions"],
        "starter_code": entry["starter_code"],
        "url": entry["url"],
        "difficulty": difficulty_to_int(entry["difficulty"]),
    }
    if isinstance(entry["input_output"], dict):
        # Already decoded: use it as-is.
        new_entry["input_output"] = entry["input_output"]
    else:
        # Fail loudly on an unexpected type rather than silently exporting a
        # record that has no "input_output" key (the previous `elif` fell
        # through in that case). This mirrors the check used when converting
        # the training split.
        assert isinstance(entry["input_output"], str), f"Expected entry['input_output'] str, got {type(entry['input_output'])}"
        try:
            new_entry["input_output"] = json.loads(entry["input_output"])
        except json.JSONDecodeError:
            # The string is not valid JSON: leave this row out of the export.
            continue
    dataset.append(new_entry)

# Report how many rows survived and show the first one as a sanity check.
print(len(dataset))
print(dataset[0])

# Persist the converted records as pretty-printed JSON.
with open("test_apps.json", "w") as f:
    json.dump(dataset, f, indent=4)

5000
{'problem': "An accordion is a string (yes, in the real world accordions are musical instruments, but let's forget about it for a while) which can be represented as a concatenation of: an opening bracket (ASCII code $091$), a colon (ASCII code $058$), some (possibly zero) vertical line characters (ASCII code $124$), another colon, and a closing bracket (ASCII code $093$). The length of the accordion is the number of characters in it.\n\nFor example, [::], [:||:] and [:|||:] are accordions having length $4$, $6$ and $7$. (:|:), {:||:}, [:], ]:||:[ are not accordions. \n\nYou are given a string $s$. You want to transform it into an accordion by removing some (possibly zero) characters from it. Note that you may not insert new characters or reorder existing ones. Is it possible to obtain an accordion by removing characters from $s$, and if so, what is the maximum possible length of the result?\n\n\n-----Input-----\n\nThe only line contains one string $s$ ($1 \\le |s| \\le 500000$). I