In [None]:
import os
import subprocess
import fitz
import docx
import json
from tqdm import tqdm
import re
import json

In [None]:
# Source directories, one per document category, all under "trjlseng".
paths = ["trjlseng/" + category for category in ("cyst", "cest", "csst")]

In [None]:
def extract_text_from_docx(file_path):
    """Return the paragraph text of a .docx file as one newline-joined string.

    The file is opened with ``docx.Document`` and the ``text`` of every entry
    in its ``paragraphs`` collection is collected in order.
    """
    document = docx.Document(file_path)
    lines = []
    for paragraph in document.paragraphs:
        lines.append(paragraph.text)
    return "\n".join(lines)

def convert_doc_to_docx(input_path, libreoffice_path=None):
    """Convert a document to .docx by running LibreOffice in headless mode.

    The converted file is written into the same directory as ``input_path``.

    Args:
        input_path: Path of the document to convert.
        libreoffice_path: Optional path of the ``soffice`` executable. When
            omitted, the macOS application-bundle location is used if it
            exists; otherwise ``soffice`` / ``libreoffice`` is looked up on
            ``PATH``.

    Returns:
        The path of the generated .docx file, or ``None`` if the conversion
        command failed or the expected output file was not produced.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist. ``subprocess.run``
            raises the same exception type when the executable is missing.
    """
    import shutil  # local import: only needed to locate the executable

    if not os.path.exists(input_path):
        raise FileNotFoundError(f"File {input_path} not exist")

    output_dir = os.path.dirname(os.path.abspath(input_path))

    if libreoffice_path is None:
        default_path = '/Applications/LibreOffice.app/Contents/MacOS/soffice'
        if os.path.exists(default_path):
            libreoffice_path = default_path
        else:
            # Fall back to whatever is installed on PATH (Linux / Windows).
            libreoffice_path = (
                shutil.which('soffice') or shutil.which('libreoffice') or default_path
            )

    try:
        subprocess.run([
            libreoffice_path,
            '--headless',
            '--convert-to', 'docx',
            input_path,
            '--outdir', str(output_dir)
        ], check=True)
    except subprocess.CalledProcessError as e:
        # Best effort: report the failure and let the caller move on.
        print(f"Failed: {e}")
        return None

    stem = os.path.splitext(os.path.basename(input_path))[0]
    output_path = os.path.join(output_dir, stem + '.docx')
    # A zero exit status alone does not prove the file was written, so verify.
    if not os.path.exists(output_path):
        print(f"Failed: {output_path} was not created")
        return None
    return output_path

def extract_text_from_pdf(file_path):
    """Extract the plain text of every page of a .pdf file.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The text of all pages, in page order, joined with newlines.
    """
    # Open in a context manager so the document is closed even if a page
    # fails to extract; otherwise each processed PDF leaves a handle open.
    with fitz.open(file_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)


In [None]:
# Convert every .doc file found directly under <path>/unzip to .docx.
for path in paths:
    doc_path = os.path.join(path, "unzip")
    for file in (pbar := tqdm(os.listdir(doc_path))):
        pbar.set_description(f"convert doc: {path}:{file}")
        file_path = os.path.join(doc_path, file)
        # os.listdir() yields bare names, so the directory test must use the
        # joined path; testing `file` alone would look in the working directory.
        if os.path.isdir(file_path):
            continue
        if os.path.splitext(file)[1].lower() == ".doc":
            convert_doc_to_docx(file_path)

# Collect (name, text) pairs for every .docx / .pdf file under <path>/unzip,
# grouped by source path. Files with any other extension are ignored.
extractors = {".docx": extract_text_from_docx, ".pdf": extract_text_from_pdf}

converted_txt = {}
for path in paths:
    texts = []
    converted_txt[path] = texts
    doc_path = os.path.join(path, "unzip")
    for file in (pbar := tqdm(os.listdir(doc_path))):
        pbar.set_description(f"extract txt: {path}:{file}")
        full_path = os.path.join(doc_path, file)
        # Skip empty names and sub-directories.
        if not file or os.path.isdir(full_path):
            continue
        name, ext = os.path.splitext(file)
        extract = extractors.get(ext.lower())
        if extract is not None:
            texts.append((name, extract(full_path)))


In [None]:
# Write every extracted document to trjlseng_parsed/<category>/<name>.json.
categories = ["cyst", "cest", "csst"]

for cat in categories:
    # converted_txt is keyed by the "/"-joined strings from `paths`, so the
    # lookup key is built with the same separator.
    original_path = "trjlseng" + "/" + cat
    new_path = "trjlseng_parsed" + "/" + cat
    # Create the output tree on demand so the cell works on a fresh checkout
    # and can be re-run without failing on existing directories.
    os.makedirs(new_path, exist_ok=True)
    for name, content in converted_txt[original_path]:
        # NOTE(review): files that share a stem (e.g. a .docx and a .pdf)
        # map to the same JSON file, and the later one overwrites the earlier.
        # ensure_ascii=False writes raw non-ASCII characters, so the file
        # encoding is pinned to UTF-8 instead of the platform default.
        with open(os.path.join(new_path, name + ".json"), "w", encoding="utf-8") as f:
            data = {"name": name, "content": content}
            json.dump(data, f, indent=4, ensure_ascii=False)
