# PDF Glossary Builder

このノートブックは `data_ref` 配下の PDF を順次読み込み、資料ごとの固有用語と定義を抽出した JSON (`glossary_terms.json`) を生成する PoC です。`.env` に OpenAI API キーを設定し、上から順に実行してください。


In [1]:
# Purpose: Import dependencies and declare global configuration.
from __future__ import annotations

import json
import os
from pathlib import Path
from typing import Dict, Iterable, List, Sequence

from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from pypdf import PdfReader

# Load variables from a local .env file into the process environment so the
# os.getenv() lookups below can see them.
load_dotenv()

# Directory scanned for PDFs and used as the parent of the output JSON.
# NOTE(review): the notebook intro refers to a `data_ref` folder, but the
# active value is the empty path, which pathlib treats as the current working
# directory. Previous setting kept for reference: DATA_REF_DIR = Path("data_ref")
DATA_REF_DIR = Path("")
# File that receives the generated glossary JSON.
OUTPUT_JSON_PATH = DATA_REF_DIR / "glossary_terms.json"
# Chat model name; overridable through the PDF_GLOSSARY_MODEL env variable.
PDF_GLOSSARY_MODEL = os.getenv("PDF_GLOSSARY_MODEL", "gpt-4o-mini")
# Character budget used by chunk_pages() when deciding to start a new chunk.
MAX_CHARS_PER_CHUNK = int(os.getenv("PDF_GLOSSARY_MAX_CHARS", "1200"))
# Number of trailing characters carried over from one chunk into the next.
CHUNK_OVERLAP = int(os.getenv("PDF_GLOSSARY_CHUNK_OVERLAP", "200"))
# Upper bound on terms per chunk that is communicated to the model in the prompt.
MAX_TERMS_PER_CHUNK = int(os.getenv("PDF_GLOSSARY_TERMS_PER_CHUNK", "4"))

# Fail fast: without an API key none of the model calls below can succeed.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise EnvironmentError("OPENAI_API_KEY is not set. Populate it in .env before running this notebook.")


In [2]:
# Purpose: Define structured outputs, prompts, and PDF/chunk helper functions.
# One glossary term extracted from a document excerpt.
# NOTE: documented with `#` comments on purpose -- a class docstring would
# become part of the schema description handed to the model.
class GlossaryEntry(BaseModel):
    # Short label of the technical term.
    term: str = Field(..., description="専門用語 (短いラベル)")
    # Definition written in Japanese, based on the document content.
    definition: str = Field(..., description="資料の内容に基づく日本語定義")
    # 1-based page numbers that back up the definition.
    source_pages: List[int] = Field(..., description="定義を裏付けるページ番号 (1-based)")
    # Model's self-assessed confidence, validated to lie within [0, 1].
    confidence: float = Field(..., ge=0, le=1, description="モデルの自己評価 (0-1)")


# Container schema for a single model response: the list of entries extracted
# from one excerpt. Used as the structured-output target of the chat model.
class GlossaryBatch(BaseModel):
    # Extracted glossary entries for the excerpt.
    entries: List[GlossaryEntry]


# Two-message chat prompt (system + human) used for every excerpt.
# Template variables filled in by invoke_glossary_model():
#   {doc_name}  - file name of the PDF being processed
#   {page_span} - comma-separated page numbers covered by the excerpt
#   {max_terms} - requested upper bound on returned terms
#   {excerpt}   - the page-tagged chunk text
glossary_prompt = ChatPromptTemplate.from_messages([
    (
        # System message: role and general extraction rules.
        "system",
        "You are a meticulous Japanese technical editor. "
        "Identify domain-specific terms that appear unique to the provided document excerpt. "
        "Only output terms that can be clearly defined using the excerpt. "
        "Respond in JSON via the supplied schema.",
    ),
    (
        # Human message: per-excerpt context followed by output constraints
        # (prefer nouns/katakana keywords, Japanese definitions grounded in the
        # document, integer page numbers, confidence between 0.0 and 1.0).
        "human",
        "Document: {doc_name}\n"
        "Pages: {page_span}\n"
        "Max terms allowed: {max_terms}\n"
        "Excerpt:\n{excerpt}\n"
        "\nConstraints:\n"
        "- Prefer名詞やカタカナ語など資料固有のキーワード\n"
        "- definitionは資料内の根拠を説明する日本語\n"
        "- source_pagesは整数のみ\n"
        "- confidenceは0.0~1.0で相対的な確信度",
    ),
])

# Chat model client configured with the model name and API key read above and
# temperature 0, wrapped so that responses are requested in the GlossaryBatch
# schema (invoke_glossary_model() reads `.entries` from the result).
structured_llm = ChatOpenAI(
    model=PDF_GLOSSARY_MODEL,
    temperature=0,
    api_key=OPENAI_API_KEY,
).with_structured_output(GlossaryBatch)


def extract_text_by_page(pdf_path: Path) -> List[tuple[int, str]]:
    """Read a PDF and return the non-empty page texts with their page numbers.

    Args:
        pdf_path: Location of the PDF file to read.

    Returns:
        A list of ``(page_number, text)`` tuples in document order. Page
        numbers are 1-based. Whitespace runs (including full-width spaces) are
        collapsed to single spaces, and pages that end up empty are omitted.
    """
    document = PdfReader(str(pdf_path))
    collected: List[tuple[int, str]] = []
    for page_no, pdf_page in enumerate(document.pages, start=1):
        try:
            raw = pdf_page.extract_text() or ""
        except Exception as exc:  # pragma: no cover - PyPDF edge cases
            # Best effort: a page that cannot be parsed is reported and then
            # treated like an empty page instead of aborting the document.
            print(f"[warn] Failed to extract page {page_no} of {pdf_path.name}: {exc}")
            raw = ""
        # Replace full-width spaces, then squeeze every whitespace run.
        words = raw.replace("\u3000", " ").split()
        cleaned = " ".join(words)
        if not cleaned:
            continue
        collected.append((page_no, cleaned))
    return collected


def chunk_pages(
    pages: Sequence[tuple[int, str]],
    max_chars: int = MAX_CHARS_PER_CHUNK,
    overlap: int = CHUNK_OVERLAP,
) -> List[Dict[str, object]]:
    """Group page texts into page-tagged chunks with a trailing overlap.

    Each page is prefixed with a ``[page=N]`` tag and appended to a running
    buffer. When adding the next page would push the buffer past ``max_chars``
    the buffer is emitted as a chunk and the next buffer starts with the last
    ``overlap`` characters of the emitted one.

    Args:
        pages: ``(page_number, text)`` tuples in document order.
        max_chars: Size at which the current buffer is flushed before a new
            page is added. Pages are never split, so a single page longer than
            this still produces one oversized chunk.
        overlap: Number of trailing characters repeated at the start of the
            following chunk. ``0`` disables the overlap.

    Returns:
        A list of dicts with the keys ``"text"`` (chunk text) and ``"pages"``
        (page numbers whose text contributes to the chunk).

    Raises:
        ValueError: If ``overlap`` is negative or not smaller than ``max_chars``.
    """
    if overlap < 0:
        # A negative value would turn ``buffer[-overlap:]`` into a prefix cut.
        raise ValueError("CHUNK_OVERLAP must not be negative")
    if overlap >= max_chars:
        raise ValueError("CHUNK_OVERLAP must be smaller than MAX_CHARS_PER_CHUNK")
    chunks: List[Dict[str, object]] = []
    buffer = ""
    buffer_pages: List[int] = []
    for page_num, text in pages:
        tagged = f"[page={page_num}] {text.strip()} "
        if buffer and len(buffer) + len(tagged) > max_chars:
            # Flush BEFORE registering the incoming page, so the emitted chunk
            # only lists pages whose text it actually contains.
            chunks.append({"text": buffer.strip(), "pages": buffer_pages.copy()})
            if overlap:
                # The carried-over tail is the end of the flushed buffer, which
                # is attributed to the last page that buffer held.
                buffer = buffer[-overlap:]
                buffer_pages = buffer_pages[-1:]
            else:
                # ``buffer[-0:]`` would keep the whole buffer, so reset explicitly.
                buffer = ""
                buffer_pages = []
        if not buffer_pages or buffer_pages[-1] != page_num:
            buffer_pages.append(page_num)
        buffer += tagged
    if buffer.strip():
        chunks.append({"text": buffer.strip(), "pages": buffer_pages.copy()})
    return chunks


def invoke_glossary_model(doc_name: str, chunk: Dict[str, object]) -> List[GlossaryEntry]:
    """Ask the chat model for glossary entries found in one chunk.

    Args:
        doc_name: Name of the source document, inserted into the prompt.
        chunk: Dict with the keys ``"text"`` (excerpt) and ``"pages"`` (page
            numbers), as produced by ``chunk_pages``.

    Returns:
        The entries of the structured response, or an empty list when the
        model call yields no structured result.
    """
    # Fall back to "unknown" when the chunk carries no page numbers.
    page_span = ", ".join(str(p) for p in chunk["pages"]) or "unknown"
    messages = glossary_prompt.format_prompt(
        doc_name=doc_name,
        page_span=page_span,
        max_terms=MAX_TERMS_PER_CHUNK,
        excerpt=chunk["text"],
    ).to_messages()
    response: GlossaryBatch | None = structured_llm.invoke(messages)
    if response is None:
        # Structured-output parsing can come back empty; skip this chunk
        # rather than failing on ``None.entries`` and losing the whole run.
        print(f"[warn] No structured output for {doc_name} (pages: {page_span}). Skipping chunk.")
        return []
    return response.entries


def merge_entries(entries: Iterable[GlossaryEntry]) -> Dict[str, Dict[str, object]]:
    """Combine entries that share the same term into one record per term.

    Args:
        entries: Extracted entries; each must expose ``term``, ``definition``,
            ``source_pages`` and ``confidence``.

    Returns:
        A dict keyed by the stripped term (insertion order follows first
        appearance). Each value holds:

        * ``"definition"``: the longest stripped definition seen for the term
          (the earliest one wins on equal length),
        * ``"source_pages"``: sorted union of all reported pages,
        * ``"confidence"``: arithmetic mean of all confidences, rounded to
          three decimals.

        Entries whose term is blank after stripping are ignored.
    """
    accumulators: Dict[str, Dict[str, object]] = {}
    for entry in entries:
        term = entry.term.strip()
        if not term:
            continue
        definition = entry.definition.strip()
        acc = accumulators.get(term)
        if acc is None:
            accumulators[term] = {
                "definition": definition,
                "pages": set(entry.source_pages),
                "confidence_sum": float(entry.confidence),
                "hits": 1,
            }
            continue
        if len(definition) > len(acc["definition"]):
            acc["definition"] = definition
        acc["pages"].update(entry.source_pages)
        # Keep an exact sum and count so every occurrence is weighted once and
        # rounding happens a single time at the end.
        acc["confidence_sum"] += entry.confidence
        acc["hits"] += 1
    return {
        term: {
            "definition": acc["definition"],
            "source_pages": sorted(acc["pages"]),
            "confidence": round(acc["confidence_sum"] / acc["hits"], 3),
        }
        for term, acc in accumulators.items()
    }


In [3]:
# Purpose: Orchestrate per-PDF glossary generation and persist JSON output.
def build_glossary_for_pdf(pdf_path: Path) -> Dict[str, Dict[str, object]]:
    """Produce the merged glossary for a single PDF file.

    Args:
        pdf_path: PDF to process.

    Returns:
        The result of ``merge_entries`` over all entries returned for the
        document's chunks, or an empty dict (after printing a warning) when no
        text could be extracted.
    """
    page_texts = extract_text_by_page(pdf_path)
    if not page_texts:
        print(f"[warn] No extractable text in {pdf_path.name}. Skipping.")
        return {}
    all_entries: List[GlossaryEntry] = []
    # One model call per chunk; results are pooled before merging.
    for piece in chunk_pages(page_texts):
        all_entries += invoke_glossary_model(pdf_path.name, piece)
    return merge_entries(all_entries)


def run_directory_pipeline(pdf_dir: Path) -> Dict[str, Dict[str, Dict[str, object]]]:
    """Build glossaries for every ``*.pdf`` directly inside ``pdf_dir``.

    Args:
        pdf_dir: Directory to scan (not recursive).

    Returns:
        A dict mapping PDF file names to their glossary. Files are handled in
        sorted path order, and files that yield an empty glossary are left out.
    """
    results: Dict[str, Dict[str, Dict[str, object]]] = {}
    pdf_files = sorted(pdf_dir.glob("*.pdf"))
    for pdf_file in pdf_files:
        print(f"Processing {pdf_file.name} ...")
        doc_glossary = build_glossary_for_pdf(pdf_file)
        if not doc_glossary:
            continue
        results[pdf_file.name] = doc_glossary
    return results


# Run the pipeline over DATA_REF_DIR and persist the per-document glossary
# (file name -> term -> definition/source_pages/confidence) as UTF-8 JSON.
# ensure_ascii=False keeps Japanese text readable in the output file.
glossary_payload = run_directory_pipeline(DATA_REF_DIR)
OUTPUT_JSON_PATH.write_text(json.dumps(glossary_payload, ensure_ascii=False, indent=2), encoding="utf-8")
print(f"Wrote {len(glossary_payload)} documents into {OUTPUT_JSON_PATH}")


Processing 1-03.pdf ...
[warn] No extractable text in 1-03.pdf. Skipping.
Processing 1-04.pdf ...
[warn] No extractable text in 1-04.pdf. Skipping.
Processing 1-06.pdf ...
[warn] No extractable text in 1-06.pdf. Skipping.
Processing 1-0w6.pdf ...
[warn] No extractable text in 1-0w6.pdf. Skipping.
Processing 2-08.pdf ...
[warn] No extractable text in 2-08.pdf. Skipping.
Processing 2-10.pdf ...
[warn] No extractable text in 2-10.pdf. Skipping.
Processing 2-101.pdf ...
[warn] No extractable text in 2-101.pdf. Skipping.
Wrote 0 documents into glossary_terms.json


In [None]:
# Purpose: Flatten glossary payload into a term -> definition dictionary with median-style merging for duplicates.
def select_balanced_definition(entries: List[Dict[str, float | str]]) -> str:
    """Return the definition sitting in the middle when ranked by length.

    Candidates are ordered by definition length (shortest first); among equal
    lengths the higher confidence comes first, with a missing confidence
    counted as 0.0. The element at index ``len(entries) // 2`` is chosen, i.e.
    the upper of the two middle elements for an even count. The input list is
    not modified.
    """

    def _rank(candidate: Dict[str, float | str]) -> tuple:
        confidence = float(candidate.get("confidence", 0.0))
        return (len(candidate["definition"]), -confidence)

    ranked = list(entries)
    ranked.sort(key=_rank)
    middle = ranked[len(ranked) // 2]
    return middle["definition"]


def flatten_glossary(payload: Dict[str, Dict[str, Dict[str, object]]]) -> Dict[str, str]:
    """Collapse the per-document glossary into a single term -> definition map.

    Args:
        payload: Mapping of document name -> term -> info dict. Only the
            ``"definition"`` and ``"confidence"`` keys of each info dict are
            read; a missing confidence counts as 0.0.

    Returns:
        One definition per term. Definitions that are empty after stripping
        are dropped; when a term occurs in several documents the winner is
        picked by ``select_balanced_definition``.
    """
    grouped: Dict[str, List[Dict[str, object]]] = {}
    for per_document in payload.values():
        for term, details in per_document.items():
            text = str(details.get("definition", "")).strip()
            if text:
                candidate = {
                    "definition": text,
                    "confidence": float(details.get("confidence", 0.0)),
                }
                grouped.setdefault(term, []).append(candidate)
    return {
        term: select_balanced_definition(candidates)
        for term, candidates in grouped.items()
    }


# Flatten the per-document payload and write it as UTF-8 JSON.
# NOTE(review): this targets the same OUTPUT_JSON_PATH as the earlier
# per-document dump, so that file (including its source_pages/confidence
# details) is replaced by the flat term -> definition map. Confirm intended.
simple_glossary = flatten_glossary(glossary_payload)
OUTPUT_JSON_PATH.write_text(json.dumps(simple_glossary, ensure_ascii=False, indent=2), encoding="utf-8")
print(f"Flattened glossary now contains {len(simple_glossary)} unique terms -> {OUTPUT_JSON_PATH}")


In [None]:
# Purpose: Show the first five (term, definition) pairs of the flattened
# glossary as a quick sanity check of its structure.
# NOTE(review): the captured output under this cell appears to show nested
# per-document entries rather than flat pairs, so it may be stale -- re-run to
# confirm.
list(simple_glossary.items())[:5]


[('1-100.pdf',
  {'第1種換気': {'definition': '排気と給気を共に機械を用いて行う方法。',
    'source_pages': [2],
    'confidence': 0.9},
   'アメニティー換気': {'definition': '当社主力システムであり、第1種機械換気を指す。',
    'source_pages': [4],
    'confidence': 0.8},
   '冷暖房設備': {'definition': 'エアコンや床暖房など、空気の温度を調整するための設備。',
    'source_pages': [5],
    'confidence': 0.85},
   '換気扇': {'definition': '空気を入れ替えるための機械設備で、給気口や屋外フードと連携して使用される。',
    'source_pages': [3],
    'confidence': 0.9}}),
 ('1-17.pdf',
  {'玄関': {'definition': '住まいの入口であり、機能だけでなく印象を高める部屋として計画される空間。',
    'source_pages': [1, 2, 3],
    'confidence': 0.9},
   '玄関ポーチ': {'definition': '玄関の外に設けられるスペースで、適切な広さや雨がかりにならないように計画される。',
    'source_pages': [4, 5, 6],
    'confidence': 0.8},
   '土間': {'definition': '玄関に設けられる床の一部で、通行の安全性と視覚的な広がり感を大切に計画される。',
    'source_pages': [6, 8],
    'confidence': 0.75},
   '動線': {'definition': '住まいの中での人の動きの流れを指し、集中させないように計画することが求められる。',
    'source_pages': [2, 4],
    'confidence': 0.8},
   '玄関ドア': {'definition': 'アプローチの方向に開くことを基本とする玄関の出入り口の扉。