In [None]:
import base64
import os

from mistralai import Mistral

def extract_text_from_pdf_with_mistral(input_pdf_path, output_txt_path, max_pages=10):
    """Run Mistral OCR on a PDF and write the recognized text to a UTF-8 file.

    Each processed page is written as a ``### <n>페이지`` header line followed
    by the page's text (or ``<내용 없음>`` when the page has none) and a blank
    line.

    Args:
        input_pdf_path: Path of the PDF file to read.
        output_txt_path: Path of the text file to create or overwrite.
        max_pages: Maximum number of leading pages to write. Defaults to 10,
            the limit this function always used; pass ``None`` to write every
            page returned by the OCR service.

    Returns:
        None. Any error (missing ``MISTRAL_API_KEY``, unreadable input, API
        failure, unwritable output) is reported on stdout instead of raised,
        so this best-effort helper never aborts the calling script.
    """
    try:
        api_key = os.environ.get("MISTRAL_API_KEY")
        if not api_key:
            raise ValueError("MISTRAL_API_KEY가 설정되지 않음")
        client = Mistral(api_key=api_key)

        # The OCR endpoint takes a local PDF as a base64 data URI inside a
        # "document_url" chunk; raw bytes are not JSON-serializable.
        with open(input_pdf_path, "rb") as pdf_file:
            encoded_pdf = base64.b64encode(pdf_file.read()).decode("ascii")

        ocr_response = client.ocr.process(
            model="mistral-ocr-latest",
            document={
                "type": "document_url",
                "document_url": f"data:application/pdf;base64,{encoded_pdf}",
            },
        )

        pages = (ocr_response.pages if ocr_response else None) or []
        chunks = []
        for page_number, page in enumerate(pages[:max_pages], start=1):
            # writelines() adds no separators, so the header carries its own
            # newline to keep it on a line of its own.
            chunks.append(f"### {page_number}페이지\n")
            # OCR pages expose their text as `markdown`; `text` is kept as a
            # fallback for response objects that provide it instead.
            page_text = getattr(page, "markdown", None) or getattr(page, "text", None)
            chunks.append((page_text or "<내용 없음>") + "\n\n")

        with open(output_txt_path, "w", encoding="utf-8") as out_file:
            out_file.writelines(chunks)
        print(f"텍스트 추출 완료 : '{input_pdf_path}' -> '{output_txt_path}'")

    except Exception as e:
        # Deliberate catch-all at the script boundary: report and carry on.
        print(f"오류 발생 : {e}")

# Source PDF and the text file that receives its OCR output.
input_pdf_file, output_txt_file = (
    "./data/KEPIC/KEPIC_FULL.pdf",
    "./data/KEPIC/KEPIC_Mistral_1_10.txt",
)

# Run the extraction for the paths configured above.
extract_text_from_pdf_with_mistral(
    input_pdf_path=input_pdf_file,
    output_txt_path=output_txt_file,
)