In [None]:
#PDFからテキスト抽出、2段組み対応、近い座標のブロックを２００単語以上でまとめる設定
import fitz
import os
import json
from dotenv import load_dotenv

load_dotenv()
PDF_FOLDER = os.getenv("PDF_FOLDER")
OUTPUT_JSONL = "paragraphs.jsonl"

# Fail fast with a readable message; os.walk(None) would otherwise raise an
# opaque TypeError further down.
if PDF_FOLDER is None:
    raise RuntimeError(
        "PDF_FOLDER is not set; define it in the environment or in a .env file"
    )

# Load the IDs of paragraphs that were already written by a previous run.
processed_ids = set()
if os.path.exists(OUTPUT_JSONL):
    with open(OUTPUT_JSONL, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a stray trailing newline) are not JSON
            # records; skip them instead of letting json.loads raise.
            if not line.strip():
                continue
            processed_ids.add(json.loads(line)["id"])

# Recursively collect every file under PDF_FOLDER whose name ends in ".pdf"
# (case-insensitive).
pdf_files = []
for root, _, files in os.walk(PDF_FOLDER):
    for file in files:
        if file.lower().endswith(".pdf"):
            pdf_files.append(os.path.join(root, file))

print(f"PDFファイル数: {len(pdf_files)}")

def combine_close_blocks(blocks, x_margin=50, y_margin=10):
    """Merge spatially adjacent text blocks into paragraph strings.

    Blocks are ordered by ``y0`` and then ``x0``. Each block is compared
    with the block immediately before it in that order: when the left edges
    differ by less than ``x_margin`` and the distance between this block's
    top and the previous block's bottom is less than ``y_margin``, the text
    is appended (space-separated) to the paragraph being built. Otherwise
    the current paragraph is closed and a new one starts.

    Args:
        blocks: Iterable of ``(x0, y0, x1, y1, text)`` tuples.
        x_margin: Exclusive bound on ``abs(x0 - previous x0)``.
        y_margin: Exclusive bound on ``abs(y0 - previous y1)``.

    Returns:
        List of paragraph strings with surrounding whitespace stripped.
    """
    ordered = sorted(blocks, key=lambda item: (item[1], item[0]))

    result = []
    buffer = ""
    previous = None

    for entry in ordered:
        left, top, _right, _bottom, body = entry

        if previous is None:
            # Very first block simply seeds the paragraph buffer.
            buffer = body
        else:
            prev_left, _prev_top, _prev_right, prev_bottom, _prev_body = previous
            aligned = abs(left - prev_left) < x_margin
            gap = abs(top - prev_bottom)

            if not (aligned and gap < y_margin):
                # Too far from the previous block: close the paragraph.
                result.append(buffer.strip())
                buffer = body
            else:
                separator = " " if buffer else ""
                buffer = buffer + separator + body

        previous = entry

    # Flush whatever is still being accumulated.
    if buffer:
        result.append(buffer.strip())

    return result

def merge_short_paragraphs_both_ends(paragraphs, min_words=100):
    """Merge paragraphs shorter than ``min_words`` with their neighbours.

    A forward pass glues each short paragraph to the paragraphs that follow
    it until the combined text reaches ``min_words`` whitespace-separated
    words (or the input runs out). If the final paragraph is still short
    after that pass and there is a paragraph before it, the two are joined.

    Args:
        paragraphs: List of paragraph strings.
        min_words: Paragraphs with fewer words than this count as short.

    Returns:
        New list of paragraph strings; an empty list for empty input.
    """
    if not paragraphs:
        return []

    result = []
    pending = None  # short text still waiting for more words

    # Pass 1: forward scan, short paragraphs absorb what follows them.
    for para in paragraphs:
        if pending is None:
            if len(para.split()) >= min_words:
                # Long enough on its own: keep it exactly as given.
                result.append(para)
            else:
                pending = para
            continue

        pending = pending + " " + para
        if len(pending.split()) >= min_words:
            result.append(pending.strip())
            pending = None

    if pending is not None:
        # Input ended before the accumulated text became long enough.
        result.append(pending.strip())

    # Pass 2: a short trailing paragraph is folded into its predecessor.
    if len(result) >= 2 and len(result[-1].split()) < min_words:
        tail = result.pop()
        result[-1] = result[-1] + " " + tail

    return result

def extract_paragraphs_1column(raw_blocks, x_margin=50, y_margin=10, min_words=100):
    """Build paragraphs from a page treated as a single column.

    The text of each block is cleaned first: a hyphen directly followed by a
    newline is removed, remaining newlines become spaces, and surrounding
    whitespace is stripped. Blocks left without text are discarded.

    Args:
        raw_blocks: Iterable of sequences whose first five items are
            ``(x0, y0, x1, y1, text)``; any further items are ignored.
        x_margin: Forwarded to ``combine_close_blocks``.
        y_margin: Forwarded to ``combine_close_blocks``.
        min_words: Forwarded to ``merge_short_paragraphs_both_ends``.

    Returns:
        List of paragraph strings; empty when no block carries text.
    """
    cleaned = []
    for raw in raw_blocks:
        left, top, right, bottom, content = raw[:5]
        content = content.replace("-\n", "").replace("\n", " ").strip()
        if not content:
            continue
        cleaned.append((left, top, right, bottom, content))

    if not cleaned:
        return []

    # Join neighbouring blocks first, then fold short paragraphs together.
    return merge_short_paragraphs_both_ends(
        combine_close_blocks(cleaned, x_margin, y_margin),
        min_words=min_words,
    )

def extract_paragraphs_2column(raw_blocks, page_width, x_margin=50, y_margin=10, min_words=100):
    """Build paragraphs from a page treated as two columns.

    Blocks are cleaned (hyphen + newline removed, newlines turned into
    spaces, whitespace stripped; empty blocks dropped) and assigned to the
    left column when their ``x0`` is strictly less than half the page width,
    otherwise to the right column. Each column is processed independently
    and the left-column paragraphs are returned before the right-column ones.

    Args:
        raw_blocks: Iterable of sequences whose first five items are
            ``(x0, y0, x1, y1, text)``; any further items are ignored.
        page_width: Width of the page, used to locate the column boundary.
        x_margin: Forwarded to ``combine_close_blocks``.
        y_margin: Forwarded to ``combine_close_blocks``.
        min_words: Forwarded to ``merge_short_paragraphs_both_ends``.

    Returns:
        List of paragraph strings, left column first.
    """
    midline = page_width / 2
    left_column = []
    right_column = []

    for raw in raw_blocks:
        left, top, right, bottom, content = raw[:5]
        content = content.replace("-\n", "").replace("\n", " ").strip()
        if content:
            column = left_column if left < midline else right_column
            column.append((left, top, right, bottom, content))

    # Left column is fully processed before the right one.
    left_result = merge_short_paragraphs_both_ends(
        combine_close_blocks(left_column, x_margin, y_margin),
        min_words=min_words,
    )
    right_result = merge_short_paragraphs_both_ends(
        combine_close_blocks(right_column, x_margin, y_margin),
        min_words=min_words,
    )

    return left_result + right_result

def is_two_column(page, threshold=0.3):
    """Heuristically decide whether a page uses a two-column layout.

    Among the blocks returned by ``page.get_text("blocks")`` that contain
    non-whitespace text, counts those whose ``x0`` lies strictly to the
    right of the page's horizontal midpoint.

    Args:
        page: Object exposing ``get_text("blocks")`` and ``rect.width``.
        threshold: Minimum share of right-side blocks (inclusive) for the
            page to count as two-column.

    Returns:
        ``True`` when the share of right-side blocks is at least
        ``threshold``; ``False`` otherwise, including when the page has no
        blocks with text.
    """
    raw_blocks = page.get_text("blocks")
    if not raw_blocks:
        return False

    midline = page.rect.width / 2

    # Left edges of every block that actually carries text.
    left_edges = []
    for raw in raw_blocks:
        left, _top, _right, _bottom, content = raw[:5]
        if content.strip():
            left_edges.append(left)

    if not left_edges:
        return False

    on_right = sum(1 for left in left_edges if left > midline)
    return on_right / len(left_edges) >= threshold

def extract_paragraphs(page, x_margin=50, y_margin=10, min_words=100):
    """Extract paragraphs from a page, choosing the layout handler.

    Reads the page's blocks via ``page.get_text("blocks")`` and dispatches
    to the two-column or single-column extractor depending on
    ``is_two_column(page, threshold=0.3)``.

    Args:
        page: Object exposing ``get_text("blocks")`` and ``rect.width``.
        x_margin: Horizontal threshold passed on to the extractor.
        y_margin: Vertical threshold passed on to the extractor.
        min_words: Minimum paragraph length passed on to the extractor.

    Returns:
        List of paragraph strings; empty when the page yields no blocks.
    """
    raw_blocks = page.get_text("blocks")
    if not raw_blocks:
        return []

    if not is_two_column(page, threshold=0.3):
        # Single-column layout.
        return extract_paragraphs_1column(raw_blocks, x_margin, y_margin, min_words)

    # Two-column layout: the page width locates the column boundary.
    return extract_paragraphs_2column(
        raw_blocks, page.rect.width, x_margin, y_margin, min_words
    )


# Extract paragraphs from every discovered PDF and append the ones not yet
# recorded in processed_ids to the JSON Lines output.
with open(OUTPUT_JSONL, "a", encoding="utf-8") as out_jsonl:
    for pdf_file in pdf_files:
        # NOTE(review): IDs are derived from the file's base name only, so two
        # PDFs with the same name in different sub-folders would share IDs.
        pdf_id = os.path.splitext(os.path.basename(pdf_file))[0]
        print(f"処理中: {os.path.basename(pdf_file)}")

        paragraphs_all_pages = []

        # The context manager closes the document even when extraction
        # raises part-way through, which the manual close() did not.
        with fitz.open(pdf_file) as doc:
            for page_idx, page in enumerate(doc):
                page_paragraphs = extract_paragraphs(
                    page,
                    x_margin=50,     # horizontal threshold for joining blocks
                    y_margin=10,     # vertical threshold for joining blocks
                    min_words=200    # paragraphs under 200 words get merged with a neighbour
                )

                # Remember the 1-based page number each paragraph came from.
                for p in page_paragraphs:
                    paragraphs_all_pages.append((p, page_idx + 1))

        # Write out in JSON Lines format, skipping already-recorded IDs.
        for i, (para_text, page_num) in enumerate(paragraphs_all_pages):
            para_id = f"{pdf_id}_p{i+1}"
            if para_id in processed_ids:
                continue

            entry = {
                "id": para_id,
                "title": pdf_id,
                "paragraph": para_text,
                "metadata": {
                    "page": page_num,
                    "source": pdf_file
                }
            }
            out_jsonl.write(json.dumps(entry, ensure_ascii=False) + "\n")

print("段落抽出完了。")




In [None]:
#チェック用
import json

INPUT_JSONL = "paragraphs.jsonl"
OUTPUT_TXT = "first_paper_paragraphs_preview.txt"

# Title of the first record in the input; only records carrying this title
# are copied to the preview file.
first_paper_id = None

with open(INPUT_JSONL, "r", encoding="utf-8") as fin, open(OUTPUT_TXT, "w", encoding="utf-8") as fout:
    for raw_line in fin:
        record = json.loads(raw_line)
        record_title = record["title"]

        if first_paper_id is None:
            # The first record decides which paper is previewed.
            first_paper_id = record_title
            print(f"最初の論文: {first_paper_id}")

        if record["title"] == first_paper_id:
            para_id = record["id"]
            body = record["paragraph"]
            page_no = record["metadata"]["page"]

            fout.write(f"--- {para_id} (page {page_no}) ---\n")
            fout.write(body.strip() + "\n\n")

print(f"{first_paper_id} の段落を {OUTPUT_TXT} に出力しました。")
