In [None]:
import json
import pathlib
import shutil

from datasets import Dataset, DatasetDict

In [None]:
# Root folder of this pipeline step; every stage folder below is one of its children.
step_identifier = pathlib.Path("step_1")

# Stage folders: raw inputs are read from the first, intermediate artefacts are
# written to the second, and the final archive lands in the third.
input_directory = step_identifier / "input_directory"
working_directory = step_identifier / "working_directory"
output_directory = step_identifier / "output_directory"

In [None]:
# JSON file holding the raw documents, read from the input folder.
raw_dataset_path = input_directory / "json_documents.json"

# Folder the Hugging Face dataset is saved into (intermediate artefact).
hugging_face_dataset_path = working_directory / "hugging_face_dataset_directory"
# Zip file the saved dataset folder is packed into (final artefact).
hugging_face_dataset_archive = output_directory / "hugging_face_dataset_archive.zip"

In [None]:
# Read the whole file as UTF-8 text and parse it as JSON in one step.
raw_dataset = json.loads(raw_dataset_path.read_text(encoding="utf-8"))

In [None]:
# One bucket per expected split; the key order here is the order the splits
# are iterated in later.
dataset_splits = {"train": [], "validation": [], "test": []}

# File each tuning document under the split named in its own "split" field.
# A document naming a split without a bucket raises KeyError.
for document in raw_dataset["tuning_documents"]:
    split_documents = dataset_splits[document["split"]]
    split_documents.append(document)

In [None]:
# Turn each split's list of documents into a Dataset, keyed by split name.
datasets_by_split = {
    split_name: Dataset.from_list(split_documents)
    for split_name, split_documents in dataset_splits.items()
}

# Bundle the per-split datasets into a single DatasetDict.
hugging_face_dataset = DatasetDict(datasets_by_split)

In [None]:
# Write the DatasetDict to hugging_face_dataset_path (a folder under the
# working directory); this folder is what gets archived afterwards.
hugging_face_dataset.save_to_disk(hugging_face_dataset_path)

In [None]:
# Pack the saved dataset folder into the output archive.
#
# shutil.make_archive takes the target path WITHOUT its extension plus a format
# name, so both are derived from hugging_face_dataset_archive: the path with the
# suffix removed, and the suffix minus its leading dot ("zip").
#
# root_dir / base_dir: entries are stored relative to root_dir, so every path
# inside the archive starts with the dataset folder's name. The folder's full
# final component (.name) is used rather than .stem, because .stem would drop
# everything after the last dot and point at a non-existent folder if the
# directory name ever contained one.
#
# The returned archive path is assigned to `_` so the cell displays no output.
_ = shutil.make_archive(
    str(hugging_face_dataset_archive.with_suffix("")),
    hugging_face_dataset_archive.suffix[1:],
    root_dir=working_directory,
    base_dir=hugging_face_dataset_path.name,
)