In [10]:
import os
import json


def validate_json_files(directory):
    """Check that every ``*.json`` file directly inside ``directory`` parses.

    Each file is opened as UTF-8 and parsed with ``json.load``. One line per
    file is printed: ``"<name>: Valid JSON"`` or ``"<name>: Invalid JSON - <error>"``.
    Files are visited in sorted name order so the report is reproducible.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        dict[str, bool]: Maps each checked file name to ``True`` if it parsed
        and ``False`` otherwise, so a notebook cell can assert on the outcome.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be opened.
    """
    results = {}
    json_files = sorted(f for f in os.listdir(directory) if f.endswith(".json"))
    for json_file in json_files:
        file_path = os.path.join(directory, json_file)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            # A file that is not valid UTF-8 fails while being read, before the
            # JSON parser sees it; report it instead of aborting the whole scan.
            print(f"{json_file}: Invalid JSON - {e}")
            results[json_file] = False
        else:
            print(f"{json_file}: Valid JSON")
            results[json_file] = True
    return results

In [23]:
import os
import re
from datasets import load_dataset

# Directory whose *.json files are read by preprocess_ichikara_instruction.
original_dir = "../data/Distribution20241221_all"
# Directory that receives the repaired copies (created if it does not exist).
preprocessed_dir = "../data/Distribution20241221_all_preprocessed"


# Matches either a complete, valid JSON escape sequence (which must be left
# untouched) or, failing that, a stray backslash (captured in group 1).
# Consuming valid escapes as whole tokens keeps the second backslash of an
# escaped-backslash pair from being mistaken for the start of a new escape.
_JSON_ESCAPE_TOKEN = re.compile(r'\\(?:["\\/bfnrt]|u[0-9a-fA-F]{4})|(\\)')


def _repair_json_escapes(content: str) -> str:
    """Return ``content`` with invalid backslash escapes repaired."""
    # Double every stray backslash so it becomes a literal backslash in JSON.
    content = _JSON_ESCAPE_TOKEN.sub(
        lambda m: "\\\\" if m.group(1) is not None else m.group(0), content
    )
    # Collapse an escaped backslash directly before a double quote (\\") into
    # an escaped quote (\").
    # NOTE(review): presumably the source data double-escapes quotes inside
    # strings; this also rewrites a string that legitimately ends in a
    # backslash -- confirm against the data.
    return content.replace('\\\\"', '\\"')


def preprocess_ichikara_instruction(original_dir: str, preprocessed_dir: str):
    """Copy every ``*.json`` file to a new directory with its escapes repaired.

    Each file directly inside ``original_dir`` is read as UTF-8, passed through
    the escape repair, and written under the same base name in
    ``preprocessed_dir``. The originals are not modified unless both arguments
    name the same directory. One progress (or error) line is printed per file;
    a file that fails is reported and skipped so the rest are still processed.

    Args:
        original_dir: Directory containing the source ``*.json`` files.
        preprocessed_dir: Output directory; created, with parents, if missing.

    Raises:
        OSError: If ``preprocessed_dir`` cannot be created or ``original_dir``
            cannot be listed.
    """
    os.makedirs(preprocessed_dir, exist_ok=True)

    # Sorted so the processing order (and printed log) is reproducible.
    data_files = sorted(
        os.path.join(original_dir, f)
        for f in os.listdir(original_dir)
        if f.endswith(".json")
    )

    for file_path in data_files:
        preprocessed_file_path = os.path.join(
            preprocessed_dir, os.path.basename(file_path)
        )
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                content = _repair_json_escapes(file.read())

            with open(
                preprocessed_file_path, "w", encoding="utf-8"
            ) as preprocessed_file:
                preprocessed_file.write(content)

            print(f"Processed {file_path} -> {preprocessed_file_path}")
        except (OSError, UnicodeError) as e:
            # Only I/O and decoding failures are expected here; anything else
            # is a programming error and should surface.
            print(f"Error processing file {file_path}: {e}")


# Repair the raw files, check that each repaired file parses, then load the
# repaired files with `datasets` (the result is shown as the cell's output).
preprocess_ichikara_instruction(
    original_dir=original_dir, preprocessed_dir=preprocessed_dir
)
validate_json_files(directory=preprocessed_dir)
load_dataset("json", data_files=f"{preprocessed_dir}/*.json")

Processed ../data/Distribution20241221_all/ichikara-instruction-003-001-1.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-1.json
Processed ../data/Distribution20241221_all/ichikara-instruction-003-001-2.1.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-2.1.json
Processed ../data/Distribution20241221_all/ichikara-instruction-003-001-2.2.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-2.2.json
Processed ../data/Distribution20241221_all/ichikara-instruction-003-001-5.1.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-5.1.json
Processed ../data/Distribution20241221_all/ichikara-instruction-003-001-5.2.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-5.2.json
Processed ../data/Distribution20241221_all/ichikara-instruction-003-002-1.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-002-1.json


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ID', 'text', 'output'],
        num_rows: 6701
    })
})