In [1]:
import nest_asyncio
# Patch asyncio to allow nested event loops, so the asyncio.run(...) calls in
# later cells work even though the notebook already has a loop running.
nest_asyncio.apply()

In [2]:
from pyzerox import zerox
import os
import asyncio
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
# (presumably the API key for the model provider; confirm).
load_dotenv()

# Model identifier handed to zerox() for every document.
model = "gpt-4o-2024-11-20"

async def main():
    """Convert every PDF in ./documents_with_english_titles to markdown with zerox.

    zerox is called once per file with ``output_dir="./output"`` as the place
    to save the consolidated markdown file. Files are processed in sorted
    order so runs are deterministic.

    Returns:
        The zerox result for the last PDF processed, or ``None`` when the
        input directory contains no PDF files.
    """
    input_dir = "./documents_with_english_titles"
    output_dir = "./output"  # directory to save the consolidated markdown file

    # Loop-invariant zerox options, hoisted out of the per-file loop.
    kwargs = {}
    custom_system_prompt = None
    select_pages = None

    # Bound up front so an empty directory returns None instead of raising
    # UnboundLocalError at the final `return`.
    result = None

    for file_name in sorted(os.listdir(input_dir)):
        # Skip stray non-PDF entries (e.g. .DS_Store) rather than sending them to zerox.
        if not file_name.lower().endswith(".pdf"):
            continue

        file_path = f"{input_dir}/{file_name}"
        result = await zerox(
            file_path=file_path,
            model=model,
            output_dir=output_dir,
            custom_system_prompt=custom_system_prompt,
            select_pages=select_pages,
            **kwargs,
        )
        print(f'{file_path} is done')

    return result


# Run the whole batch; `result` holds whatever main() returns.
result = asyncio.run(main())






./documents_with_english_titles/delegation_of_authority.pdf is done
./documents_with_english_titles/employee_benefits_and_welfare_faq.pdf is done
./documents_with_english_titles/employee_benefits_and_welfare_guide.pdf is done
./documents_with_english_titles/employee_handbook_and_hr_policy.pdf is done
./documents_with_english_titles/expense_management_guide.pdf is done
./documents_with_english_titles/it_support_guide.pdf is done
./documents_with_english_titles/legal_and_compliance_policy.pdf is done


In [5]:
from pyzerox import zerox
import os
import asyncio
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
# (presumably the API key for the model provider; confirm).
load_dotenv()
# Model identifier handed to zerox() for every document.
model = "gpt-4o-2024-11-20"

def _result_to_markdown(result):
    """Return the markdown text carried by a zerox result object.

    The exact result type is not pinned down here, so several attribute names
    are probed in order: ``pages`` (joined page contents), ``content``,
    ``markdown``; anything else falls back to ``str(result)``.
    """
    if hasattr(result, 'pages'):
        return '\n\n'.join(page.content for page in result.pages)
    if hasattr(result, 'content'):
        return result.content
    if hasattr(result, 'markdown'):
        return result.markdown
    return str(result)


async def main():
    """Convert each PDF in ./documents_with_english_titles to a UTF-8 markdown file.

    zerox is called with a system prompt that asks for Korean text to be
    preserved verbatim. ``output_dir=None`` is passed to zerox and the
    markdown is written by this function instead, to ``./output/<stem>.md``.

    Returns:
        The status string "완료" once the loop has finished. Per-file write
        errors are printed, not raised, so they do not change the return value.
    """
    custom_system_prompt = """
    You are converting a PDF to markdown. 
    CRITICAL: This document contains Korean (한글) text. 
    You MUST preserve all Korean characters exactly as they appear.
    Do not transliterate, translate, or modify any Korean text.
    Ensure proper UTF-8 encoding for all Korean characters.
    """

    input_dir = "./documents_with_english_titles"
    output_dir = "./output"
    os.makedirs(output_dir, exist_ok=True)

    for file_name in sorted(os.listdir(input_dir)):
        # Case-insensitive so files such as "REPORT.PDF" are not silently skipped.
        if not file_name.lower().endswith('.pdf'):
            continue

        file_path = f"{input_dir}/{file_name}"

        print(f'Processing {file_name}...')

        result = await zerox(
            file_path=file_path,
            model=model,
            output_dir=None,
            custom_system_prompt=custom_system_prompt,
            select_pages=None,
        )

        # splitext swaps only the final extension; str.replace('.pdf', '.md')
        # would also rewrite a ".pdf" that appears inside the file stem.
        stem, _ext = os.path.splitext(file_name)
        output_file = os.path.join(output_dir, stem + '.md')

        try:
            # Pull the markdown out of the result object and save it.
            content = _result_to_markdown(result)

            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(content)

            print(f'{file_name} is done ✓')

        except Exception as e:
            # Deliberate best-effort: report the failure with enough detail to
            # inspect the result object, then carry on with the next file.
            print(f'Error processing {file_name}: {e}')
            print(f'Result type: {type(result)}')
            print(f'Result attributes: {dir(result)}')

    return "완료"

# Run the batch conversion and print the status string returned by main().
result = asyncio.run(main())
print(result)

Processing delegation_of_authority.pdf...
delegation_of_authority.pdf is done ✓
Processing employee_benefits_and_welfare_faq.pdf...
employee_benefits_and_welfare_faq.pdf is done ✓
Processing employee_benefits_and_welfare_guide.pdf...
employee_benefits_and_welfare_guide.pdf is done ✓
Processing employee_handbook_and_hr_policy.pdf...
employee_handbook_and_hr_policy.pdf is done ✓
Processing expense_management_guide.pdf...
expense_management_guide.pdf is done ✓
Processing it_support_guide.pdf...
it_support_guide.pdf is done ✓
Processing legal_and_compliance_policy.pdf...
legal_and_compliance_policy.pdf is done ✓
완료


In [3]:
import fitz  # PyMuPDF version - faster than zerox and recognizes Korean text more accurately
import os

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        One string in which each page's text is preceded by a
        ``--- Page N ---`` header line (N starts at 1). A document with no
        pages yields an empty string.
    """
    parts = []

    # The context manager closes the document even when a page fails to
    # extract; the earlier explicit close() was skipped on any exception.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            parts.append(f"\n--- Page {page_num} ---\n")
            parts.append(page.get_text())

    # Join once instead of growing a string with += on every page.
    return "".join(parts)

# Parse every PDF in the input directory and write one markdown file per PDF.
input_dir = "./documents_with_english_titles"
output_dir = "./output_pymupdf"
os.makedirs(output_dir, exist_ok=True)

for filename in sorted(os.listdir(input_dir)):
    # Case-insensitive so files such as "REPORT.PDF" are not silently skipped.
    if not filename.lower().endswith('.pdf'):
        continue

    pdf_path = os.path.join(input_dir, filename)
    print(f'Processing {filename}...')

    # Extract the text
    text = extract_text_from_pdf(pdf_path)

    # Save as markdown. splitext swaps only the final extension, whereas
    # str.replace('.pdf', '.md') would also rewrite ".pdf" inside the stem.
    output_filename = os.path.splitext(filename)[0] + '.md'
    output_path = os.path.join(output_dir, output_filename)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(f"# {filename}\n\n")
        f.write(text)

    print(f'{filename} ✓ 완료')

print("모든 PDF 파싱 완료!")

Processing delegation_of_authority.pdf...
delegation_of_authority.pdf ✓ 완료
Processing employee_benefits_and_welfare_faq.pdf...
employee_benefits_and_welfare_faq.pdf ✓ 완료
Processing employee_benefits_and_welfare_guide.pdf...
employee_benefits_and_welfare_guide.pdf ✓ 완료
Processing employee_handbook_and_hr_policy.pdf...
employee_handbook_and_hr_policy.pdf ✓ 완료
Processing expense_management_guide.pdf...
expense_management_guide.pdf ✓ 완료
Processing it_support_guide.pdf...
it_support_guide.pdf ✓ 완료
Processing legal_and_compliance_policy.pdf...
legal_and_compliance_policy.pdf ✓ 완료
모든 PDF 파싱 완료!
