In [2]:
import gzip
import json
import random
from collections import defaultdict
from typing import Any, Callable, Dict, List, Optional, Tuple
import os

import blobfile as bf
import numpy as np
import orjson

# A single dataset record: a JSON object decoded into a plain dict.
Sample = Dict[str, Any]

## global variables

# JSONL file holding the scored test samples.
scored_test_samples_jsonl_path = "/data/tongyx361/reward-by-prm800k/datasets/scored-test-samples.jsonl"

# Directory containing the PRM800K phase 1 / phase 2 JSONL files.
prm800k_jsonl_dirpath = "/data/tongyx361/reward-by-prm800k/prm800k-main/prm800k/data"

# One {"train": path, "test": path} mapping per phase; index 0 is phase 1,
# index 1 is phase 2.
prm800k_jsonl_path_phase = [
    {
        "train": os.path.join(prm800k_jsonl_dirpath, train_filename),
        "test": os.path.join(prm800k_jsonl_dirpath, test_filename),
    }
    for train_filename, test_filename in (
        ("phase1_train.jsonl", "phase1_test.jsonl"),
        ("phase2_train.jsonl", "phase2_test.jsonl"),
    )
]

## functions


def json_loads(s: str) -> Dict:
    """Decode one JSON document, trying orjson first.

    Args:
        s: The serialized JSON text to decode.

    Returns:
        The decoded value produced by ``orjson.loads``; if that call raises
        any ``Exception``, the value produced by the standard-library
        ``json.loads`` instead.

    Raises:
        Whatever ``json.loads`` raises when the fallback also fails.
    """
    try:
        decoded = orjson.loads(s)
    except Exception:
        # Any orjson failure is retried with the standard-library parser.
        decoded = json.loads(s)  # fallback
    return decoded


def open_jsonl(file: str):
    """Open a JSONL file for reading, decompressing it when it ends in ".gz".

    Args:
        file: Path handed to ``bf.BlobFile``.

    Returns:
        For a ".gz" path, a ``gzip`` reader wrapped around a ``BlobFile``
        opened with mode "rb"; otherwise the ``BlobFile`` opened with
        mode "r".

    NOTE(review): the two branches use different modes ("rb" + gzip's default
    mode vs. "r"), so the gzip branch presumably yields bytes lines and the
    plain branch str lines - confirm callers accept both.
    NOTE(review): per the gzip documentation, closing a GzipFile does not
    close a file object passed to it, so the inner BlobFile likely stays open
    until garbage-collected - confirm whether that matters here.
    """
    is_gzipped = file.endswith(".gz")
    if not is_gzipped:
        return bf.BlobFile(file, "r")
    return gzip.open(bf.BlobFile(file, "rb"))


def read_jsonl(file: str) -> List[Dict]:
    """Read a JSON Lines file into a list of decoded records.

    Args:
        file: Path to the JSONL file; it is checked with ``bf.exists`` and
            opened through ``open_jsonl``.

    Returns:
        One decoded value per non-blank line, in file order.

    Raises:
        AssertionError: If ``file`` does not exist (the path is the message).
        Exception: Whatever ``json_loads`` raises for a non-blank line that
            is not valid JSON.
    """
    assert bf.exists(file), file
    with open_jsonl(file) as f:
        # Lines read from a file keep their trailing newline, so a plain
        # truthiness check never filters anything; strip() is what actually
        # skips blank / whitespace-only lines (e.g. a trailing empty line).
        # Iterating the handle avoids materializing a second full list of
        # raw lines via readlines().
        return [json_loads(line) for line in f if line.strip()]


def key_by_problem(samples: List[Dict]):
    """Group samples by the value of their "problem" field.

    Args:
        samples: Records that each contain a "problem" key.

    Returns:
        A ``defaultdict(list)`` mapping each distinct ``sample["problem"]``
        value to the list of samples that carry it. Groups appear in order of
        first occurrence, and samples keep their input order within a group.

    Raises:
        KeyError: If a sample has no "problem" key.
    """
    grouped_samples = defaultdict(list)
    for sample in samples:
        grouped_samples[sample["problem"]].append(sample)
    # The original ended with a bare `return`, discarding the grouping and
    # handing every caller None.
    return grouped_samples

In [4]:
# Load the whole scored-test-samples JSONL file into a list (one decoded
# record per line, as returned by read_jsonl).
scored_test_samples = read_jsonl(scored_test_samples_jsonl_path)
# Spot-check the data by printing one record picked at random. No seed is set
# in the visible cells, so the record shown will presumably differ between runs.
print(random.choice(scored_test_samples))

KeyboardInterrupt: 

In [None]:
# Loaded PRM800K data, one entry per phase in the same order as
# prm800k_jsonl_path_phase; each entry maps "train" / "test" to the list of
# records read from that split's JSONL file.
prm800k_dataset_phase = []

for phase in prm800k_jsonl_path_phase:
    # `phase` is the {"train": path, "test": path} mapping for one phase.
    train_dataset = read_jsonl(phase["train"])
    test_dataset = read_jsonl(phase["test"])
    # Mirror the path mapping's keys so paths and data can be indexed alike.
    phase_dataset = {"train": train_dataset, "test": test_dataset}
    prm800k_dataset_phase.append(phase_dataset)