In [1]:
import requests
import os
from pathlib import Path
from dotenv import load_dotenv

# Load variables from a .env file into the process environment;
# UPSTAGE_API_KEY is read later through os.getenv.
load_dotenv()

True

In [8]:
# htmlの文字化けを解消したい。

from bs4 import BeautifulSoup
import chardet

def normalize_html_content(content):
    """Return ``content`` as an HTML string that declares UTF-8 and draws table borders.

    Args:
        content: HTML as ``str``, or as ``bytes``/``bytearray`` whose encoding
            is guessed with ``chardet`` (UTF-8 is used when detection fails).

    Returns:
        str: The serialized document. It has a ``<head>`` that contains
        ``<meta charset="utf-8">`` and a stylesheet giving tables visible borders.
        Calling the function again on its own output does not add a second
        copy of the stylesheet.
    """
    # Raw bytes: guess the encoding, then decode leniently.
    if isinstance(content, (bytes, bytearray)):
        enc = chardet.detect(content).get("encoding") or "utf-8"
        try:
            text = content.decode(enc, errors="replace")
        except LookupError:
            # The detector may name a codec this Python build does not provide.
            text = content.decode("utf-8", errors="replace")
    else:
        text = content

    soup = BeautifulSoup(text, "html.parser")

    # Guarantee a <head> to hold the charset declaration and the stylesheet.
    if soup.head is None:
        head = soup.new_tag("head")
        if soup.html is not None:
            # The document already has <html>: add the missing <head> to it
            # instead of wrapping everything in a second <html><body>.
            soup.html.insert(0, head)
        else:
            # Bare fragment: move the existing nodes under a new <body>.
            html = soup.new_tag("html")
            body = soup.new_tag("body")
            for node in list(soup.contents):
                body.append(node.extract())
            html.append(head)
            html.append(body)
            soup.append(html)

    # Ensure <meta charset="utf-8">, overwriting any other declared charset.
    meta = soup.head.find("meta", attrs={"charset": True})
    if meta is None:
        soup.head.insert(0, soup.new_tag("meta", charset="utf-8"))
    else:
        meta["charset"] = "utf-8"

    # CSS that makes table borders visible.
    css = """
    table {
        border-collapse: collapse;
        width: 100%;
        margin: 10px 0;
    }
    table, th, td {
        border: 1px solid #000;
    }
    th, td {
        padding: 8px;
        text-align: left;
    }
    th {
        background-color: #f2f2f2;
        font-weight: bold;
    }
    """
    # Skip the injection when an identical stylesheet is already present.
    if not any(tag.string == css for tag in soup.head.find_all("style")):
        style = soup.new_tag("style")
        style.string = css
        soup.head.append(style)

    return str(soup)

In [9]:
def save_html(html, pdf_path, output_dir, normalize: bool = True):
    """Write ``html`` into ``output_dir`` as ``<pdf file stem>.html``, encoded as UTF-8.

    Args:
        html: HTML text to save.
        pdf_path: Path of the source PDF; only its file name is used, with the
            suffix replaced by ``.html``.
        output_dir: Existing directory that receives the file.
        normalize: When true, pass ``html`` through ``normalize_html_content``
            before writing.
    """
    output_path = Path(output_dir) / Path(pdf_path).with_suffix(".html").name
    if normalize:
        html = normalize_html_content(html)
    # Write UTF-8 explicitly: the platform default encoding (e.g. cp932) would
    # garble or reject Japanese text and contradict a utf-8 charset declaration.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html)

def save_markdown(markdown, pdf_path, output_dir):
    """Write ``markdown`` into ``output_dir`` as ``<pdf file stem>.md``, encoded as UTF-8.

    Args:
        markdown: Markdown text to save.
        pdf_path: Path of the source PDF; only its file name is used, with the
            suffix replaced by ``.md``.
        output_dir: Existing directory that receives the file.
    """
    output_path = Path(output_dir) / Path(pdf_path).with_suffix(".md").name
    # Write UTF-8 explicitly so the result does not depend on the platform's
    # default encoding (e.g. cp932), which can garble or reject Japanese text.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(markdown)

In [10]:
def run_upstage(pdf_path, output_dir: Path = Path("../output/upstage"),
                type: str = "html", save: bool = True, normalize: bool = True):
    """Send a PDF to the Upstage document-digitization API and return the parsed content.

    Args:
        pdf_path: Path of the PDF to upload.
        output_dir: Base directory for saved results. The file is written to
            ``output_dir / <name of the PDF's parent directory>``, which is
            created if missing.
        type: Which representation to take from the response: ``"html"`` or
            ``"markdown"``.
        save: When true, also write the content to disk.
        normalize: Forwarded to ``save_html``; it affects only the saved HTML
            file, not the returned value.

    Returns:
        The ``content[type]`` value of the API's JSON response.

    Raises:
        ValueError: ``type`` is neither ``"html"`` nor ``"markdown"``.
        RuntimeError: The ``UPSTAGE_API_KEY`` environment variable is not set.
        Exception: The API answered with a status code other than 200.
    """
    # Reject unsupported formats before spending an API call on them.
    if type not in ("html", "markdown"):
        raise ValueError(f"Unsupported type: {type!r} (expected 'html' or 'markdown')")

    url = "https://api.upstage.ai/v1/document-digitization"
    api_key = os.getenv("UPSTAGE_API_KEY")
    if not api_key:
        # Without this check the request would be sent with "Bearer None".
        raise RuntimeError("UPSTAGE_API_KEY is not set")
    headers = {"Authorization": f"Bearer {api_key}"}
    data = {"ocr": "auto", "model": "document-parse-nightly"}

    # Keep the upload handle open only for the duration of the request.
    with open(pdf_path, "rb") as document:
        response = requests.post(url, headers=headers, files={"document": document}, data=data)
    if response.status_code != 200:
        raise Exception(f"Error: {response.status_code} - {response.text}")

    # Parse the JSON body once; the key under "content" matches the type name.
    content = response.json()["content"][type]

    if save:
        source = Path(pdf_path)
        output_path = Path(output_dir) / source.parent.name
        output_path.mkdir(parents=True, exist_ok=True)
        if type == "html":
            save_html(content, source, output_path, normalize)
        else:
            save_markdown(content, source, output_path)
    return content

### 実行

In [12]:
# Parse one sample PDF with the defaults (type="html", save=True). The HTML is
# also written under ../output/upstage/temp/; normalize=False saves it exactly
# as returned by the API, without passing it through normalize_html_content.
pdf_path = Path("../temp/FSA_保険モニタリングレポート_2025_抜粋_コア_1.pdf")
content = run_upstage(pdf_path, normalize=False)