<a href="https://colab.research.google.com/github/tai316/name-paper/blob/main/name_paper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =========================
# High-accuracy PDF renamer:
# text -> OCR -> DOI/arXiv -> Crossref/arXiv -> Title search -> rename
# =========================

# --- install deps ---
!pip -q install pymupdf requests pandas rapidfuzz pillow pytesseract
!apt-get -qq update
!apt-get -qq install -y tesseract-ocr
# Japanese OCR language packs (install only if you need Japanese OCR; somewhat heavy)
!apt-get -qq install -y tesseract-ocr-jpn tesseract-ocr-jpn-vert

import re
import os
from pathlib import Path
from datetime import datetime
import fitz  # PyMuPDF
import requests
import pandas as pd
from rapidfuzz import fuzz
from PIL import Image
import pytesseract
import io

# ================
# Settings
# ================
TARGET_DIR = "論文が入っているフォルダ名のパスを書く"  # placeholder: replace with the path to your PDF folder
DRY_RUN = False  # True = preview only / False = actually rename
RECURSIVE = True  # also scan subfolders

PAGES_TEXT_SCAN = 2       # number of leading pages read via plain text extraction
PAGES_OCR_SCAN  = 2       # number of leading pages to OCR (slow; about 2 is enough)

# Maximum lengths (in characters) of the title / author parts of the new name
MAX_TITLE_LEN  = 90
MAX_AUTHOR_LEN = 40

# Crossref recommends a User-Agent that contains a mailto address
CROSSREF_MAILTO = "your_email@example.com"
USER_AGENT = f"pdf-renamer/2.0 (mailto:{CROSSREF_MAILTO})"

# Minimum title similarity for accepting a Crossref title-search hit
# (kept high to reduce false matches)
MIN_TITLE_SIMILARITY = 90  # 0-100

# OCR language(s): "eng" for mostly English papers,
# "eng+jpn" when many PDFs are Japanese
OCR_LANG = "eng+jpn"

# ================
# Helpers
# ================
def norm(s: str) -> str:
    """Return *s* trimmed, with every whitespace run collapsed to one space.

    A falsy value (``None`` or ``""``) yields the empty string.
    """
    trimmed = (s or "").strip()
    return re.sub(r"\s+", " ", trimmed)

def safe_component(s: str, max_len: int) -> str:
    """Turn free text into a single file-name component.

    The text is whitespace-normalised, stripped of characters that Windows
    forbids in file names, stripped of leading/trailing spaces and dots,
    joined with underscores and cut to at most *max_len* characters.

    Returns ``"UNKNOWN"`` when nothing usable is left.
    """
    cleaned = norm(s)
    if not cleaned:
        return "UNKNOWN"
    # Remove characters that are illegal in Windows file names.
    cleaned = re.sub(r'[\\/:*?"<>|]', "", cleaned).strip(" .")
    # Spaces become underscores; consecutive underscores collapse to one.
    cleaned = re.sub(r"_+", "_", cleaned.replace(" ", "_"))
    if len(cleaned) > max_len:
        # Avoid ending on a dangling separator after the cut.
        cleaned = cleaned[:max_len].rstrip("_")
    return cleaned if cleaned else "UNKNOWN"

def unique_path(path: Path) -> Path:
    """Return *path* if it is free, otherwise the first free ``_vN`` variant.

    Variants are built as ``<stem>_v2<suffix>``, ``<stem>_v3<suffix>``, ...
    in the same directory; only the file system is consulted.
    """
    candidate = path
    version = 2
    while candidate.exists():
        candidate = path.with_name(f"{path.stem}_v{version}{path.suffix}")
        version += 1
    return candidate

# DOI / arXiv patterns
DOI_RE = re.compile(r"\b(10\.\d{4,9}/[^\s\"<>]+)\b", re.IGNORECASE)
ARXIV_RE = re.compile(r"\b(arXiv:\s*)?(\d{4}\.\d{4,5})(v\d+)?\b", re.IGNORECASE)

def find_doi(text: str) -> str | None:
    t = text.replace("\n", " ")
    m = DOI_RE.search(t)
    if not m:
        return None
    doi = m.group(1).rstrip(".,);]")
    return doi

def find_arxiv_id(text: str) -> str | None:
    t = text.replace("\n", " ")
    m = ARXIV_RE.search(t)
    return m.group(2) if m else None

# ================
# PDF reading (PyMuPDF)
# ================
def extract_text_pages(pdf_path: Path, pages: int) -> str:
    """Return the embedded text of the first *pages* pages of a PDF.

    Page texts are joined with newlines. This is best-effort: any failure
    (unreadable file, damaged page, ...) yields an empty string so that the
    caller can fall back to OCR.
    """
    try:
        # The context manager closes the document even when a page fails
        # to load; previously the handle leaked on that path.
        with fitz.open(pdf_path) as doc:
            count = min(pages, doc.page_count)
            return "\n".join(
                doc.load_page(i).get_text("text") for i in range(count)
            )
    except Exception:
        return ""

def ocr_pages(pdf_path: Path, pages: int, dpi: int = 200) -> str:
    """OCR the first *pages* pages of a PDF and return the recognised text.

    Each page is rendered to an image with PyMuPDF at *dpi* and passed to
    pytesseract using the languages in ``OCR_LANG``. Page texts are joined
    with newlines. This is best-effort: any failure yields an empty string.
    """
    try:
        # PDF user space is 72 units per inch, so dpi/72 is the zoom factor.
        # It does not depend on the page, hence built once.
        zoom = fitz.Matrix(dpi / 72, dpi / 72)
        texts = []
        # The context manager closes the document even when rendering or
        # OCR raises; previously the handle leaked on that path.
        with fitz.open(pdf_path) as doc:
            for i in range(min(pages, doc.page_count)):
                pix = doc.load_page(i).get_pixmap(matrix=zoom, alpha=False)
                img = Image.open(io.BytesIO(pix.tobytes("png")))
                texts.append(pytesseract.image_to_string(img, lang=OCR_LANG))
        return "\n".join(texts)
    except Exception:
        return ""

def extract_title_author_by_layout(pdf_path: Path) -> tuple[str | None, str | None]:
    """
    フォントサイズ情報を使って “タイトルっぽい行” を拾う（テキスト抽出より安定）
    - 1ページ目の上半分で最大フォントの行をタイトル候補
    - 次に大きいフォント帯の行を著者候補
    """
    try:
        doc = fitz.open(pdf_path)
        page = doc.load_page(0)
        d = page.get_text("dict")
        h = page.rect.height

        lines = []
        for block in d.get("blocks", []):
            if block.get("type") != 0:
                continue
            for line in block.get("lines", []):
                spans = line.get("spans", [])
                if not spans:
                    continue
                text = norm("".join(s.get("text","") for s in spans))
                if not text or len(text) < 6:
                    continue
                # 平均フォントサイズ
                sizes = [s.get("size", 0) for s in spans if s.get("size", 0) > 0]
                avg_size = sum(sizes)/len(sizes) if sizes else 0
                # 行のy座標（上ほど小さい）
                y0 = line.get("bbox", [0,0,0,0])[1]
                lines.append((y0, avg_size, text))

        doc.close()
        if not lines:
            return None, None

        # 上半分中心にタイトル探索（abstract等を避ける）
        candidates = [(y,s,t) for (y,s,t) in lines if y < h*0.55]
        def is_noise(t: str) -> bool:
            low = t.lower()
            return ("abstract" in low) or ("introduction" in low) or low.startswith("received") or ("copyright" in low) or ("©" in t)

        candidates = [(y,s,t) for (y,s,t) in candidates if not is_noise(t) and len(t) <= 180]
        if not candidates:
            return None, None

        # タイトル：フォントサイズ最大の行（同率なら上に近い）
        candidates.sort(key=lambda x: (-x[1], x[0]))
        title = candidates[0][2]

        # 著者：タイトル直後〜少し下で、メール/所属っぽくないもの
        title_y = candidates[0][0]
        near = [(y,s,t) for (y,s,t) in lines if title_y < y < title_y + h*0.20]
        author = None
        for (y,s,t) in sorted(near, key=lambda x: (x[0], -x[1])):
            if "@" in t:
                continue
            low = t.lower()
            if re.search(r"\b(university|institute|department|laboratory|school)\b", low):
                continue
            if len(t) > 140:
                continue
            # 人名っぽい区切り
            if ("," in t) or (" and " in low) or ("・" in t) or ("、" in t):
                author = re.split(r",| and |・|、", t)[0].strip()
                break

        return title, author
    except Exception:
        return None, None

# ================
# Crossref / arXiv
# ================
def crossref_lookup_by_doi(doi: str) -> dict | None:
    """Fetch the Crossref work record for *doi*.

    Returns the ``message`` object of the response, or ``None`` when the
    request fails, the status is not 200 or the body cannot be parsed
    (best-effort lookup).
    """
    from urllib.parse import quote

    # A DOI may contain characters such as "#", "?", "%" or spaces that
    # would otherwise truncate or corrupt the URL path. "/" is left as is.
    url = f"https://api.crossref.org/works/{quote(doi, safe='/')}"
    try:
        r = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=20)
        if r.status_code != 200:
            return None
        return r.json().get("message", None)
    except Exception:
        return None

def pick_yyyymm_crossref(msg: dict) -> str | None:
    for k in ["published-print", "published-online", "issued", "created"]:
        v = msg.get(k, {})
        parts = v.get("date-parts", [])
        if parts and parts[0]:
            yyyy = parts[0][0]
            mm = parts[0][1] if len(parts[0]) >= 2 else 1
            return f"{int(yyyy):04d}{int(mm):02d}"
    return None

def pick_title_crossref(msg: dict) -> str | None:
    """Return the first entry of the record's ``title`` list, normalised.

    Returns ``None`` when the record has no title entries.
    """
    titles = msg.get("title") or []
    if not titles:
        return None
    return norm(titles[0])

def pick_first_author_crossref(msg: dict) -> str | None:
    """Return the display name of the first listed author of a record.

    The name is "given family". When neither part is present, the
    contributor's ``name`` field is used instead; previously such entries
    were reported as having no author.

    Returns ``None`` when the record has no authors or no usable name.
    """
    authors = msg.get("author") or []
    if not authors:
        return None
    first = authors[0]
    given = first.get("given") or ""
    family = first.get("family") or ""
    full_name = norm(f"{given} {family}")
    if not full_name:
        full_name = norm(first.get("name") or "")
    return full_name or None

def crossref_search_by_title(title: str, author: str | None = None, rows: int = 5) -> dict | None:
    """Search Crossref by title (and author, if given) and return the best hit.

    At most *rows* results are requested. Each result's title is compared
    with the query using ``fuzz.token_set_ratio`` (case-insensitive); the
    highest-scoring result is returned only if its score reaches
    ``MIN_TITLE_SIMILARITY``. The score is stored on the returned record
    under ``"_title_similarity"``.

    Returns ``None`` for queries shorter than 8 characters, on any request
    or parsing failure, and when no result is similar enough.
    """
    wanted = norm(title)
    if not wanted or len(wanted) < 8:
        return None

    query = {"query.title": wanted, "rows": rows}
    if author and author != "UNKNOWN":
        query["query.author"] = norm(author)

    try:
        resp = requests.get(
            "https://api.crossref.org/works",
            params=query,
            headers={"User-Agent": USER_AGENT},
            timeout=20,
        )
        if resp.status_code != 200:
            return None
        hits = resp.json().get("message", {}).get("items", []) or []
        if not hits:
            return None

        # Score every hit; max() keeps the earliest one on ties.
        wanted_low = wanted.lower()
        scored = [
            (fuzz.token_set_ratio(wanted_low, (pick_title_crossref(hit) or "").lower()), hit)
            for hit in hits
        ]
        top_score, top_hit = max(scored, key=lambda pair: pair[0])

        # Reject weak matches to avoid renaming after the wrong paper.
        if not top_hit or top_score < MIN_TITLE_SIMILARITY:
            return None
        top_hit["_title_similarity"] = top_score
        return top_hit
    except Exception:
        return None

def arxiv_lookup(arxiv_id: str) -> dict | None:
    """Look up *arxiv_id* through the arXiv export API.

    Returns ``{"title": str, "authors": list[str], "yyyymm": str | None}``
    parsed from the Atom response with regular expressions, or ``None``
    when the request fails or the status is not 200 (best-effort lookup).
    ``title`` is an empty string when the response contains no entry.
    """
    import html

    # HTTPS instead of plain HTTP for the metadata request.
    url = f"https://export.arxiv.org/api/query?id_list={arxiv_id}"
    try:
        r = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=20)
        if r.status_code != 200:
            return None
        xml = r.text
        entry_title_m = re.search(r"<entry>.*?<title>(.*?)</title>", xml, re.DOTALL)
        raw_title = entry_title_m.group(1) if entry_title_m else ""
        # The values are read from raw XML, so entities such as "&amp;"
        # must be decoded before they end up in a file name.
        title = norm(html.unescape(raw_title))
        authors = [html.unescape(name) for name in re.findall(r"<name>(.*?)</name>", xml)]
        pub_m = re.search(r"<published>(\d{4})-(\d{2})-(\d{2})", xml)
        yyyymm = f"{pub_m.group(1)}{pub_m.group(2)}" if pub_m else None
        return {"title": title, "authors": authors, "yyyymm": yyyymm}
    except Exception:
        return None

# ================
# Main
# ================
target = Path(TARGET_DIR)
# Explicit check instead of `assert`, which is skipped under `python -O`.
if not target.exists():
    raise FileNotFoundError(f"Folder not found: {target}")

pdfs = sorted(target.rglob("*.pdf")) if RECURSIVE else sorted(target.glob("*.pdf"))
pdfs = [p for p in pdfs if p.is_file()]
print(f"[INFO] Found PDFs: {len(pdfs)}")


def _plan_destination(src: Path, new_name: str, reserved: set[Path]) -> Path:
    """Pick a collision-free destination for *src* and record it in *reserved*.

    A destination is rejected when it exists on disk or was already planned
    for another file of this run; ``_v2``, ``_v3``, ... are then tried.
    When the wanted name is the file's current path, *src* is returned
    unchanged so that re-running the script does not add ``_v2``.
    """
    wanted = src.with_name(new_name)
    dst = wanted
    version = 2
    while dst != src and (dst.exists() or dst in reserved):
        dst = wanted.with_name(f"{wanted.stem}_v{version}{wanted.suffix}")
        version += 1
    reserved.add(dst)
    return dst


rows = []
# Destinations already planned in this run. Renames happen only after the
# planning loop, so the file system alone cannot reveal that two PDFs map to
# the same new name (the later rename would overwrite the earlier file).
reserved_dsts: set[Path] = set()

for pdf_path in pdfs:
    # 1) Plain text extraction
    txt = extract_text_pages(pdf_path, PAGES_TEXT_SCAN)

    # 2) DOI / arXiv id from the extracted text
    doi = find_doi(txt)
    arxiv_id = None if doi else find_arxiv_id(txt)

    # 3) OCR, only when the text gave neither identifier
    ocr_txt = ""
    if not doi and not arxiv_id:
        ocr_txt = ocr_pages(pdf_path, PAGES_OCR_SCAN)
        doi = find_doi(ocr_txt) or doi
        arxiv_id = find_arxiv_id(ocr_txt) or arxiv_id

    yyyymm = None
    title = None
    author = None
    source = ""
    title_sim = None

    # 4) DOI available -> take metadata from Crossref
    if doi:
        msg = crossref_lookup_by_doi(doi)
        if msg:
            yyyymm = pick_yyyymm_crossref(msg)
            title  = pick_title_crossref(msg)
            author = pick_first_author_crossref(msg)
            source = f"crossref:doi:{doi}"

    # 5) arXiv id available -> fill the gaps from arXiv
    if (not title or not author) and arxiv_id:
        a = arxiv_lookup(arxiv_id)
        if a and a.get("title"):
            title = title or a["title"]
            author = author or (a["authors"][0] if a.get("authors") else None)
            yyyymm = yyyymm or a.get("yyyymm")
            source = source or f"arxiv:{arxiv_id}"

    # 6) Guess title / author from the page layout (font sizes)
    if not title or not author:
        t2, a2 = extract_title_author_by_layout(pdf_path)
        title = title or t2
        author = author or a2

    # 7) Last resort: reverse lookup of the DOI by title on Crossref
    if (not doi) and title:
        guess = crossref_search_by_title(title, author=author, rows=7)
        if guess:
            doi = guess.get("DOI") or doi
            yyyymm = yyyymm or pick_yyyymm_crossref(guess)
            title  = pick_title_crossref(guess) or title
            author = pick_first_author_crossref(guess) or author
            title_sim = guess.get("_title_similarity")
            source = source or f"crossref:search(sim={title_sim})"

    # 8) Final fallbacks
    if not yyyymm:
        yyyymm = "UNKNOWN"
    if not title:
        title = pdf_path.stem
    if not author:
        author = "UNKNOWN"

    new_name = f"{yyyymm}_{safe_component(author, MAX_AUTHOR_LEN)}_{safe_component(title, MAX_TITLE_LEN)}.pdf"
    dst = _plan_destination(pdf_path, new_name, reserved_dsts)

    will_rename = (dst.name != pdf_path.name)

    rows.append({
        "src": str(pdf_path),
        "dst": str(dst),
        "will_rename": will_rename,
        "doi": doi or "",
        "arxiv": arxiv_id or "",
        "source": source,
        "title_similarity": title_sim if title_sim is not None else ""
    })

df = pd.DataFrame(rows)

# Always save the old -> new mapping so that the renames can be undone.
map_path = target / f"rename_map_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
df.to_csv(map_path, index=False)
print(f"[INFO] Saved mapping CSV: {map_path}")

print("\n=== Preview (first 40) ===")
display(df.head(40))

if DRY_RUN:
    print("\n[DRY_RUN=True] まだリネームしていません。問題なければ DRY_RUN=False にして再実行してください。")
else:
    changed = 0
    failed = 0
    for _, r in df.iterrows():
        if not r["will_rename"]:
            continue
        src_path = Path(r["src"])
        dst_path = Path(r["dst"])
        try:
            # Path.rename silently replaces an existing file on POSIX, so
            # refuse when something appeared at the destination meanwhile.
            if dst_path.exists():
                raise FileExistsError(f"destination already exists: {dst_path}")
            src_path.rename(dst_path)
        except OSError as e:
            # One bad file (name too long, permissions, ...) must not abort
            # the rest of the batch.
            failed += 1
            print(f"[WARN] Rename failed: {src_path} -> {dst_path} ({e})")
            continue
        changed += 1
    print(f"\n[DONE] Renamed: {changed} files")
    if failed:
        print(f"[WARN] Failed to rename: {failed} files")