In [None]:
import os
import glob
from docx import Document
import pytesseract
from pdf2image import convert_from_path
import os
import subprocess
import re


def convert_docx_to_txt(docx_path):
    """Return the paragraph text of a .docx file as a single string.

    Args:
        docx_path: Path of the .docx file; passed straight to ``Document``.

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``,
        joined with newline characters.
    """
    document = Document(docx_path)
    # Only ``document.paragraphs`` is read; nothing else from the document
    # object contributes to the result.
    return "\n".join(paragraph.text for paragraph in document.paragraphs)


def convert_doc_to_txt(doc_path):
    """Extract plain text from a legacy .doc file using the ``textutil`` command.

    The function is best-effort: any failure is reported on stdout and an
    empty string is returned instead of raising.

    Args:
        doc_path: Path of the .doc file to convert.

    Returns:
        Whatever ``textutil`` wrote to stdout when it exits with status 0;
        otherwise an empty string (non-zero exit status, or the command
        could not be started at all).
    """
    command = ["textutil", "-convert", "txt", "-stdout", doc_path]
    try:
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except Exception as e:
        # Deliberately broad: callers rely on getting "" back rather than an
        # exception (e.g. when the textutil executable is not installed).
        # The message now names the tool that is actually being launched.
        print("Ошибка при запуске textutil:", e)
        return ""

    if result.returncode != 0:
        # The command ran but reported a failure; surface its stderr output.
        print("Ошибка:", result.stderr)
        return ""
    return result.stdout


def ocr_pdf(
    pdf_path,
    *,
    dpi=300,
    first_page=1,
    last_page=30,
    lang="rus",
    tessdata_prefix="/opt/homebrew/share/tessdata",
):
    """Run OCR over the pages of a PDF and return the recognised text.

    The PDF pages are rendered to images with ``convert_from_path`` and each
    image is passed to ``pytesseract.image_to_string``.

    Args:
        pdf_path: Path of the PDF file.
        dpi: Rendering resolution forwarded to ``convert_from_path``.
        first_page: First page to render (forwarded to ``convert_from_path``).
        last_page: Last page to render (forwarded to ``convert_from_path``).
        lang: Language code forwarded to ``pytesseract.image_to_string``.
        tessdata_prefix: Value written to the ``TESSDATA_PREFIX`` environment
            variable before OCR starts. Pass ``None`` to leave the
            environment untouched. NOTE(review): the default is a
            Homebrew-style path; confirm it matches the deployment machine.

    Returns:
        The text of every rendered page, each followed by ``" \\n"``,
        concatenated in page order. Empty string when no pages are rendered.

    Side effects:
        Unless ``tessdata_prefix`` is ``None``, ``os.environ`` is modified
        for the whole process.
    """
    if tessdata_prefix is not None:
        os.environ["TESSDATA_PREFIX"] = tessdata_prefix

    pages = convert_from_path(
        pdf_path, dpi=dpi, first_page=first_page, last_page=last_page
    )
    # Build the result with a single join instead of repeated ``+=`` so the
    # cost stays linear in the amount of recognised text.
    return "".join(
        pytesseract.image_to_string(page, lang=lang) + " \n" for page in pages
    )


def get_text_from_file(file_path):
    """Return the text content of a file, choosing a reader by extension.

    The extension is compared case-insensitively. ``.docx``, ``.doc`` and
    ``.pdf`` files are delegated to the matching converter function;
    ``.txt`` files are read directly as UTF-8.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text, or an empty string (after printing a message)
        when the extension is not one of the supported ones.
    """
    suffix = os.path.splitext(file_path)[1].lower()

    if suffix == ".docx":
        return convert_docx_to_txt(file_path)
    if suffix == ".doc":
        return convert_doc_to_txt(file_path)
    if suffix == ".pdf":
        return ocr_pdf(file_path)
    if suffix != ".txt":
        # Anything else is reported and treated as "no text".
        print(f"Неподдерживаемый формат файла: {file_path}")
        return ""

    with open(file_path, "r", encoding="utf-8") as handle:
        return handle.read()


# Matches every run of characters that is NOT one of: a Cyrillic letter
# (А-Я, а-я, Ё, ё), a digit 0-9, whitespace, or the punctuation marks
# . , ; : ! ? ( ) - "
pattern = re.compile(r"[^А-Яа-яЁё0-9\s\.,;:!?()\-\"]+")


def filter_russian_text(text):
    """Strip every character outside the allowed Russian-text character set.

    Args:
        text: Input string.

    Returns:
        ``text`` with all runs matched by the module-level ``pattern``
        removed; characters that are kept stay in their original order.
    """
    cleaned = re.sub(pattern, "", text)
    return cleaned


def normalize_whitespace(text):
    """Collapse repeated whitespace and trim the ends of the string.

    Runs of two or more newlines become exactly two newlines, runs of
    spaces become a single space, and runs of tabs become a single tab.
    Leading and trailing whitespace is then stripped.

    Args:
        text: Input string.

    Returns:
        The normalised string.
    """
    collapse_rules = (
        (r"\n\n+", "\n\n"),
        (r"[ ]+", " "),
        (r"\t+", "\t"),
    )
    result = text
    for regex, replacement in collapse_rules:
        result = re.sub(regex, replacement, result)
    return result.strip()


def process_documents(input_dir, output_dir):
    """Convert every supported document in a directory to a cleaned .txt file.

    Files directly inside ``input_dir`` whose extension is ``.docx``,
    ``.doc``, ``.pdf`` or ``.txt`` (compared case-insensitively, matching
    how ``get_text_from_file`` dispatches) are processed in sorted path
    order. Each file's text is extracted, filtered with
    ``filter_russian_text``, normalised with ``normalize_whitespace`` and
    written as UTF-8 to ``<output_dir>/<idx>.txt``.

    ``idx`` is the 1-based position of the file in the sorted input list,
    so files that are skipped leave gaps in the output numbering.

    Args:
        input_dir: Directory containing the source documents.
        output_dir: Directory for the resulting text files; created if it
            does not exist.

    Returns:
        None. Progress and skipped files are reported with ``print``.
    """
    os.makedirs(output_dir, exist_ok=True)

    supported_extensions = {".docx", ".doc", ".pdf", ".txt"}
    # Filter on the lower-cased extension rather than on case-sensitive glob
    # patterns, so that e.g. "REPORT.PDF" is picked up as well.
    files = sorted(
        path
        for path in glob.glob(os.path.join(input_dir, "*"))
        if os.path.splitext(path)[1].lower() in supported_extensions
    )

    for idx, file_path in enumerate(files, start=1):
        print(f"Обработка файла: {file_path}")
        text = get_text_from_file(file_path)
        # Decide emptiness on the cleaned text: a document may contain only
        # characters that the filter removes, and writing an empty output
        # file for it would be misleading.
        cleaned = normalize_whitespace(filter_russian_text(text)) if text else ""
        if not cleaned:
            print(f"Пустой текст или ошибка при обработке файла: {file_path}")
            continue

        output_file = os.path.join(output_dir, f"{idx}.txt")
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(cleaned)
        print(f"Сохранено: {output_file}")


# Source directory with the raw documents and destination directory for the
# cleaned text files. Both paths are relative, so they resolve against the
# current working directory at run time.
# NOTE(review): "rpoc" in the output path may be a typo for "proc" — confirm
# before renaming, since existing data may already live under this name.
input_directory = "../data/data_raw"
output_directory = "../data/rpoc_data_test"

# Runs the whole conversion as soon as this cell/script is executed.
process_documents(input_directory, output_directory)