In [None]:
import os
import pickle
from pathlib import Path

from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import DocumentContentFormat
from azure.core.credentials import AzureKeyCredential
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# 🔒 Don't hardcode secrets; use env vars instead.
# Fail fast when a setting is missing: the former placeholder defaults
# ("https://<your-resource>...", "<your-key>") are not usable credentials, so
# they only postponed the failure to the service call below.
endpoint = os.getenv("AZURE_DI_ENDPOINT")
key = os.getenv("AZURE_DI_KEY")
missing = [
    name
    for name, value in (("AZURE_DI_ENDPOINT", endpoint), ("AZURE_DI_KEY", key))
    if not value
]
if missing:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing)}. "
        "Set them in the environment or in a .env file."
    )

# The input PDF and every output of this run live under one dataset directory.
dataset_dir = Path("/Users/xingdi/Documents/protagodoc_benchmark/datasets/orbit_v1")
pdf_path = dataset_dir / "pdf" / "f_cEGeU5YAxKXMhtVstsqqja.pdf"

# Requested format of result.content. Switch to None to send the request
# without an explicit format (the variant that used to be kept here as a
# commented-out copy of the call).
output_content_format = DocumentContentFormat.MARKDOWN

client = DocumentIntelligenceClient(endpoint=endpoint, credential=AzureKeyCredential(key))

with pdf_path.open("rb") as pdf_file:
    poller = client.begin_analyze_document(
        model_id="prebuilt-layout",
        body=pdf_file,  # file handle as body parameter
        output_content_format=output_content_format,
    )

result = poller.result()

# Save the complete result as a pickle file. The directory name carries the
# API version reported in the result itself, so outputs produced with
# different service versions land in different directories.
api_version = result["apiVersion"]
output_dir = dataset_dir / f"azure_pkl_{api_version}"
output_dir.mkdir(parents=True, exist_ok=True)

pkl_file = output_dir / f"{pdf_path.stem}.pkl"
# NOTE: pickle files are only safe to load from trusted sources; treat these
# as local artifacts of this script.
with pkl_file.open("wb") as pkl_handle:
    pickle.dump(result, pkl_handle)

print(f"Saved complete result to: {pkl_file}")

# Also save markdown content for reference
out_md = output_dir / f"{pdf_path.stem}.md"
out_md.write_text(result.content or "", encoding="utf-8")
print(f"Saved Markdown content to: {out_md}")

<!-- PageHeader="EDINET提出書類 株式会社Cas a(E33485) 訂正有価証券報告書" -->


# 【表紙】


<table>
<tr>
<td>【提出書類】</td>
<td>有価証券報告書の訂正報告書</td>
</tr>
<tr>
<td>【根拠条文】</td>
<td>金融商品取引法第24条の2第1項</td>
</tr>
<tr>
<td>【提出先】</td>
<td>関東財務局長</td>
</tr>
<tr>
<td>【提出日】</td>
<td>2024年3月6日</td>
</tr>
<tr>
<td>【事業年度】</td>
<td>第10期(自 2022年2月1日 至 2023年1月31日)</td>
</tr>
<tr>
<td>【会社名】</td>
<td>株式会社Casa</td>
</tr>
<tr>
<td>【英訳名】</td>
<td>Casa Inc.</td>
</tr>
<tr>
<td>【代表者の役職氏名】</td>
<td>代表取締役社長 宮地 正剛</td>
</tr>
<tr>
<td>【本店の所在の場所】</td>
<td>東京都新宿区西新宿二丁目6番1号</td>
</tr>
<tr>
<td>【電話番号】</td>
<td>03-5339-1143(代表)</td>
</tr>
<tr>
<td>【事務連絡者氏名】</td>
<td>取締役 鹿島 一郎</td>
</tr>
<tr>
<td>【最寄りの連絡場所】</td>
<td>東京都新宿区西新宿二丁目6番1号</td>
</tr>
<tr>
<td>【電話番号】</td>
<td>03-5339-1143(代表)</td>
</tr>
<tr>
<td>【事務連絡者氏名】</td>
<td>取締役 鹿島 一郎</td>
</tr>
<tr>
<td>【縦覧に供する場所】</td>
<td>株式会社東京証券取引所 (東京都中央区日本橋兜町2番1号)</td>
</tr>
</table>


<!-- PageNumber="1/95" -->
<!-- PageBreak -->

<!-- PageHeader="EDINET提出書類 株式会社Cas a (E33485) 訂正有価証券報告書" -->


## 1 【有価証券

In [10]:
# Show which top-level fields the analysis result exposes through its mapping interface.
available_fields = result.keys()
print(available_fields)

dict_keys(['apiVersion', 'modelId', 'stringIndexType', 'content', 'pages', 'tables', 'paragraphs', 'styles', 'contentFormat', 'sections', 'figures'])


In [11]:
# Report the content format recorded in the analysis result.
reported_format = result["contentFormat"]
print(reported_format)


text
