# Text Preprocessing

In this Notebook, some common OCR errors are corrected.

In [None]:
import os
import regex

In [25]:
def is_japanese(char):
    """Report whether a single character falls in one of three code-point ranges.

    The ranges tested are U+3040-U+309F, U+30A0-U+30FF and U+4E00-U+9FAF
    (these correspond to the Unicode Hiragana and Katakana blocks and to the
    CJK Unified Ideographs block up to U+9FAF).

    Args:
        char: a one-character string; it is passed straight to ``ord``.

    Returns:
        True when the code point lies inside any of the ranges, else False.
    """
    code_point = ord(char)
    # Inclusive (low, high) bounds of every accepted range.
    accepted_ranges = (
        (0x3040, 0x309F),
        (0x30A0, 0x30FF),
        (0x4E00, 0x9FAF),
    )
    return any(low <= code_point <= high for low, high in accepted_ranges)

def clear_extra_spaces(input_text: str) -> str:
    """Remove superfluous ASCII spaces (" ") from OCR output.

    Only the plain space character is considered; other whitespace (for
    example the ideographic space) is never touched. The first and the last
    character of the text are always kept, whatever they are.

    An interior space is dropped when either
      * the character before it is also a space (runs of spaces collapse), or
      * the character before it is Japanese (per ``is_japanese``) and the
        character after it is a space or Japanese.
    Neighbours are always read from the original text, not from the
    partially cleaned result.

    Args:
        input_text: the text to clean.

    Returns:
        The text with the superfluous spaces removed.
    """
    last_index = len(input_text) - 1
    kept_chars = []

    # Single pass: decide for each character whether it is kept. This avoids
    # collecting indices in a list and then scanning that list once per
    # character, which made the original quadratic on long texts.
    for i, char in enumerate(input_text):
        if char == " " and 0 < i < last_index:
            before = input_text[i - 1]
            after = input_text[i + 1]
            if before == " " or (
                is_japanese(before) and (after == " " or is_japanese(after))
            ):
                continue
        kept_chars.append(char)

    return "".join(kept_chars)


In [26]:
def remove_newline_between_japanese(text):
    """Delete every newline that sits directly between two Japanese characters.

    A character counts as Japanese here when its Unicode script is Hiragana,
    Katakana or Han. Newlines next to anything else are left in place.

    Args:
        text: the text to process.

    Returns:
        The text with the matching newlines removed.
    """
    japanese_char = r'[\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}]'
    # Lookbehind/lookahead only assert the neighbours instead of consuming
    # them. With consuming groups, the character after one newline could not
    # also serve as the character before the next one, so in "あ\nい\nう"
    # (a one-character line) the second newline survived.
    pattern = rf'(?<={japanese_char})\n(?={japanese_char})'
    return regex.sub(pattern, '', text)
    

In [27]:
def correct_ocr_errors(input_text: str) -> str:
    """Replace some common OCR errors for the current corpus.

    Steps, in order:
      1. superfluous spaces are removed with ``clear_extra_spaces``;
      2. a fixed list of literal substitutions is applied one after another;
      3. newlines between Japanese characters are removed with
         ``remove_newline_between_japanese``.

    Args:
        input_text: raw OCR text.

    Returns:
        The corrected text.
    """
    # (wrong, right) pairs, applied top to bottom; the order matters.
    # The two ":：:" rules that also carry a sentence end must run BEFORE the
    # bare ":：:" rule: they used to come after it and could never match, so
    # ":：:〇" ended up as "……〇" instead of "……。".
    replacements = (
        (":::", "……"),
        (":：:〇", "……。"),
        (":：:。", "……。"),
        (":：:", "……"),
        # The literal below is copied verbatim from the original cell
        # (it appears to be a single TAB character).
        ("	", "……"),
        ("・て", "で"),
        ("•て", "で"),
        ("•", "・"),
        ("^", "。"),
        ("た〇", "た。"),
        ("た0", "た。"),
    )

    circle = clear_extra_spaces(input_text)
    for wrong, right in replacements:
        circle = circle.replace(wrong, right)
    return remove_newline_between_japanese(circle)

In [28]:
# Run the OCR correction over every document in texts/<subfolder>/ and write
# the result, under the same file name, to preprocessed_texts/<subfolder>/.
# The project location is spelled out once here instead of in every path.
project_dir = "C:\\Users\\Artem\\Documents\\my python projects\\Masculine and Feminine in Atomic Bomb Literature"
source_root = os.path.join(project_dir, "texts")
target_root = os.path.join(project_dir, "preprocessed_texts")

folders = os.listdir(source_root)
for subfolder in folders:
    source_dir = os.path.join(source_root, subfolder)
    target_dir = os.path.join(target_root, subfolder)
    # exist_ok avoids the separate "does it exist?" check before creating.
    os.makedirs(target_dir, exist_ok=True)

    subfolder_files = os.listdir(source_dir)
    for doc in subfolder_files:
        with open(os.path.join(source_dir, doc), encoding="utf-8") as file:
            text = file.read()
        cleaned_text = correct_ocr_errors(text)
        with open(os.path.join(target_dir, doc), encoding="utf-8", mode="w") as file:
            file.write(cleaned_text)
    


In [4]:
# Concatenate all preprocessed documents of each folder into a single file
# "texts per author/<folder>.txt". Paths are relative to the current working
# directory. Documents are concatenated in the order os.listdir returns them.
output_dir = "texts per author"
# Create the output directory up front so the final write cannot fail just
# because it is missing.
os.makedirs(output_dir, exist_ok=True)

folders = os.listdir("preprocessed_texts")
for folder in folders:
    folder_path = os.path.join("preprocessed_texts", folder)
    files = os.listdir(folder_path)
    parts = []
    for doc in files:
        with open(os.path.join(folder_path, doc), encoding="utf-8") as file:
            text = file.read()
        parts.append(text)
    # Join once instead of growing a string with += inside the loop.
    compound_text = "".join(parts)
    with open(os.path.join(output_dir, f"{folder}.txt"), encoding="utf-8", mode="w") as file:
        file.write(compound_text)