In [1]:
import os

In [2]:
def _is_japanese(char: str) -> bool:
    """Return True if ``char`` is in U+3040-U+30FF (kana) or U+4E00-U+9FAF (CJK ideographs)."""
    code = ord(char)
    return 0x3040 <= code <= 0x30FF or 0x4E00 <= code <= 0x9FAF


def clear_extra_spaces(input_text: str) -> str:
    """Remove unnecessary ASCII spaces from OCR text.

    Each maximal run of ASCII spaces (U+0020) is handled as a unit:

    * a run with a Japanese character directly on both sides is dropped;
    * any other run (including leading and trailing runs) is collapsed to a
      single space, so words separated by several spaces stay separated.

    Only the ASCII space is considered; other whitespace, such as the
    full-width space U+3000, is left untouched.

    Args:
        input_text: Text to clean.

    Returns:
        The text with the redundant spaces removed.
    """
    pieces = []
    length = len(input_text)
    pos = 0
    while pos < length:
        char = input_text[pos]
        if char != " ":
            pieces.append(char)
            pos += 1
            continue

        # Find the end of the current run of spaces.
        run_end = pos
        while run_end < length and input_text[run_end] == " ":
            run_end += 1

        # A run is removed only when it has a Japanese character on each side.
        between_japanese = (
            pos > 0
            and run_end < length
            and _is_japanese(input_text[pos - 1])
            and _is_japanese(input_text[run_end])
        )
        if not between_japanese:
            pieces.append(" ")
        pos = run_end
    return "".join(pieces)

def correct_ocr_errors(input_text: str) -> str:
    """Replace OCR misreadings that are common for the current case, then tidy spaces.

    The substitutions are applied one after another in the order listed, so
    each rule sees the output of the previous ones. Longer patterns are listed
    before the shorter patterns they contain; otherwise the shorter rule would
    rewrite the text first and the longer rule could never match.

    Args:
        input_text: Raw OCR text.

    Returns:
        The text after all substitutions, passed through ``clear_extra_spaces``.
    """
    replacements = {
        # These two must run before the bare ":：:" rule, which would otherwise
        # turn ":：:〇" into "……〇" and leave the misread full stop in place.
        ":：:〇": "……。",
        ":：:。": "……。",
        ":::": "……",
        ":：:": "……",
        "\t": "……",
        "・て": "で",
        # Must run before the bare "•" rule below for the same reason.
        "•て": "で",
        "•": "・",
        "^": "。",
        "た〇": "た。",
        "た0": "た。",
    }
    for old, new in replacements.items():
        input_text = input_text.replace(old, new)
    return clear_extra_spaces(input_text)


In [3]:
# Build the corpus locations once instead of repeating the absolute path in every call.
base_dir = "C:\\Users\\Artem\\Documents\\my python projects\\Masculine and Feminine in Atomic Bomb Literature"
texts_dir = os.path.join(base_dir, "texts")
preprocessed_dir = os.path.join(base_dir, "preprocessed_texts")

folders = os.listdir(texts_dir)
for subfolder in folders:
    if not os.path.isdir(os.path.join(texts_dir, subfolder)):
        # A stray file directly under texts/ cannot be listed as a folder; skip it.
        continue

    # exist_ok replaces the separate existence check and its check-then-create race.
    os.makedirs(os.path.join(preprocessed_dir, subfolder), exist_ok=True)

    subfolder_files = os.listdir(os.path.join(texts_dir, subfolder))
    for doc in subfolder_files:
        with open(os.path.join(texts_dir, subfolder, doc), encoding="utf-8") as file:
            text = file.read()
        cleaned_text = correct_ocr_errors(text)
        # The cleaned copy keeps the same subfolder and file name under preprocessed_texts.
        with open(os.path.join(preprocessed_dir, subfolder, doc), encoding="utf-8", mode="w") as file:
            file.write(cleaned_text)

In [7]:
# Count the characters treated as Japanese: code points U+3040-U+30FF
# plus U+4E00-U+9FAF (both ranges inclusive).
japanese_set = set(map(chr, range(0x3040, 0x30FF + 1)))
japanese_set.update(map(chr, range(0x4E00, 0x9FAF + 1)))
len(japanese_set)

21104

In [8]:
import os
import regex

# Compiled once because the pattern is applied to every document.
# The neighbouring characters are checked with lookbehind/lookahead rather than
# captured, so they are not consumed by the match: a character that closes one
# match can still open the next one (e.g. "あ\nい\nう" loses both newlines).
_JAPANESE_CHAR_CLASS = r'[\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}]'
newline_between_japanese_regex = regex.compile(
    r'(?<=' + _JAPANESE_CHAR_CLASS + r')\n(?=' + _JAPANESE_CHAR_CLASS + r')'
)

def remove_newline_between_japanese(text):
    """Delete every newline that has a Hiragana, Katakana or Han character on both sides.

    Args:
        text: Text to process.

    Returns:
        The text with those newlines removed; all other newlines are kept.
    """
    return newline_between_japanese_regex.sub('', text)

def correct_ocr_errors(input_text: str) -> str:
    """Replace common OCR misreadings, then join lines broken inside Japanese text.

    The substitutions are applied sequentially in the order listed, so each
    rule sees the output of the previous ones. Longer patterns are listed
    before the shorter patterns they contain; otherwise the shorter rule would
    rewrite the text first and the longer rule could never match.

    Spaces are not modified by this function; only newlines between Japanese
    characters are removed (via ``remove_newline_between_japanese``).

    Args:
        input_text: Raw OCR text.

    Returns:
        The corrected text.
    """
    replacements = [
        # These two must run before the bare ":：:" rule, which would otherwise
        # turn ":：:〇" into "……〇" and leave the misread full stop in place.
        (":：:〇", "……。"),
        (":：:。", "……。"),
        (":::", "……"),
        (":：:", "……"),
        ("\t", "……"),
        ("・て", "で"),
        # Must run before the bare "•" rule below for the same reason.
        ("•て", "で"),
        ("•", "・"),
        ("^", "。"),
        ("た〇", "た。"),
        ("た0", "た。"),
    ]

    # Apply the replacements one after another, in list order.
    for old, new in replacements:
        input_text = input_text.replace(old, new)

    # Remove newlines that fall between two Japanese characters.
    input_text = remove_newline_between_japanese(input_text)

    return input_text

_DEFAULT_BASE_DIR = "C:\\Users\\Artem\\Documents\\my python projects\\Masculine and Feminine in Atomic Bomb Literature"


def process_files(base_dir: str = _DEFAULT_BASE_DIR) -> None:
    """Clean every document under ``<base_dir>/texts`` into ``<base_dir>/preprocessed_texts``.

    Each subfolder of ``texts`` is mirrored under ``preprocessed_texts``; every
    file inside it is read as UTF-8, passed through ``correct_ocr_errors`` and
    written under the same name in the mirrored subfolder. Entries directly
    under ``texts`` that are not directories are skipped.

    Args:
        base_dir: Project root containing the ``texts`` folder. Defaults to the
            location used so far, so calling ``process_files()`` still works.

    Raises:
        OSError: If ``texts`` cannot be listed or a file cannot be read or written.
    """
    texts_dir = os.path.join(base_dir, "texts")
    preprocessed_dir = os.path.join(base_dir, "preprocessed_texts")

    for subfolder in os.listdir(texts_dir):
        source_dir = os.path.join(texts_dir, subfolder)
        if not os.path.isdir(source_dir):
            # A stray file directly under texts/ cannot be listed as a folder; skip it.
            continue

        target_dir = os.path.join(preprocessed_dir, subfolder)
        os.makedirs(target_dir, exist_ok=True)

        for doc in os.listdir(source_dir):
            with open(os.path.join(source_dir, doc), encoding="utf-8") as file:
                text = file.read()

            cleaned_text = correct_ocr_errors(text)

            with open(os.path.join(target_dir, doc), "w", encoding="utf-8") as file:
                file.write(cleaned_text)

# Run the batch job: clean each file under texts/ and write the result to preprocessed_texts/.
process_files()
