In [1]:
# Crawl up to 10,000 PMC full-text papers about proteomics (5 publication years x 2,000 papers per year).

# !pip install biopython

import ssl
import urllib3
import time
import json  # JSON 저장을 위해 추가
from pathlib import Path
from typing import List, Optional, Dict

from Bio import Entrez
import xml.etree.ElementTree as ET

# ===== SSL settings =====
# NOTE(review): this swaps the default HTTPS context factory for the unverified
# one, which turns off certificate verification for HTTPS requests that use the
# default context in this process. Presumably a workaround for a local
# certificate problem -- confirm it is still needed before reusing this script.
ssl._create_default_https_context = ssl._create_unverified_context
# Silence urllib3's InsecureRequestWarning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# ===== NCBI Entrez settings ===== (required for crawling)
Entrez.email = ""  # your own e-mail address; empty here, fill it in before running

# ===== Basic settings =====
# Root output directory; one sub-directory per year is created underneath it.
BASE_DIR = Path("/home/younghee-seo/Desktop/papers_for_LLM")
# Search expression; a per-year "[pdat]" filter is appended at search time.
QUERY = 'proteomics AND "mass spectrometry"'
# Seconds to sleep between consecutive requests.
REQUEST_DELAY = 0.5

def ensure_year_directories(base_dir: Path, years: List[int]) -> None:
    """Create ``base_dir`` plus one sub-directory named after each year.

    Directories that already exist are left untouched; missing parent
    directories are created as needed.

    Args:
        base_dir: Root directory of the corpus.
        years: Years for which a ``base_dir/<year>`` directory must exist.
    """
    base_dir.mkdir(parents=True, exist_ok=True)
    for folder_name in map(str, years):
        (base_dir / folder_name).mkdir(parents=True, exist_ok=True)

def search_pmc_ids_all(
    query: str,
    year: int,
    batch_size: int = 200,
    max_ids: Optional[int] = None,
) -> List[str]:
    """Collect the PMC IDs matching ``query`` for one publication year.

    The search term is ``"<query> AND <year>[pdat]"``. A first ESearch call
    with ``retmax=0`` reads the total hit count; the IDs are then paged in
    with ``retstart``/``retmax``, sleeping ``REQUEST_DELAY`` seconds after
    every page request (successful or not).

    Args:
        query: Entrez search expression.
        year: Publication year used for the ``[pdat]`` filter.
        batch_size: Number of IDs requested per ESearch call; must be >= 1.
        max_ids: Optional upper bound on the number of IDs to collect.
            ``None`` (the default) pages through every hit.

    Returns:
        The collected IDs in the order ESearch returned them. The list is
        empty when the count request fails or there are no hits, and may be
        shorter than the hit count when individual page requests fail
        (failed pages are reported and skipped).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    term = f"{query} AND {year}[pdat]"
    print(f"[SEARCH] Year={year}, Query='{query}'")

    try:
        with Entrez.esearch(db="pmc", term=term, retmax=0) as handle:
            record = Entrez.read(handle)
        total_count = int(record.get("Count", 0))
    except Exception as e:
        print(f"Error in esearch: {e}")
        return []

    print(f"  -> Total count: {total_count}")

    # Only page through as many hits as the caller actually asked for.
    target_count = total_count
    if max_ids is not None:
        target_count = min(total_count, max(max_ids, 0))
    if target_count <= 0:
        return []

    id_list: List[str] = []
    for start in range(0, target_count, batch_size):
        retmax = min(batch_size, target_count - start)
        try:
            with Entrez.esearch(db="pmc", term=term, retstart=start, retmax=retmax) as handle:
                batch_record = Entrez.read(handle)
        except Exception as e:
            # Best effort: report the failed page and move on to the next one.
            print(f"  -> Error fetching batch {start}: {e}")
        else:
            id_list.extend(batch_record.get("IdList", []))
        # Pause after every request, including failed ones, so that a run of
        # errors does not turn into a burst of back-to-back requests.
        time.sleep(REQUEST_DELAY)

    print(f"  -> Collected {len(id_list)} IDs for year {year}")
    return id_list

def fetch_pmc_xml(pmc_id: str) -> Optional[str]:
    """Download the full-text XML record of one PMC article via EFetch.

    Args:
        pmc_id: PMC identifier, with or without a leading ``PMC`` prefix
            (the prefix is recognised in any letter case).

    Returns:
        Whatever ``handle.read()`` yields for the EFetch response, or
        ``None`` when the request fails (the error is printed).
        NOTE(review): depending on the Biopython version the payload may be
        ``bytes`` rather than ``str`` -- confirm before relying on the
        annotation.
    """
    numeric_id = str(pmc_id).strip()
    # Strip only a leading prefix, case-insensitively, in line with the
    # ``upper().startswith("PMC")`` check used when output names are built.
    if numeric_id.upper().startswith("PMC"):
        numeric_id = numeric_id[3:]

    try:
        with Entrez.efetch(db="pmc", id=numeric_id, rettype="full", retmode="xml") as handle:
            return handle.read()
    except Exception as e:
        print(f"  -> Error fetching XML for {pmc_id}: {e}")
        return None

def _local_name(tag: object) -> str:
    """Return an element tag without its ``{namespace}`` part ('' for non-string tags)."""
    if not isinstance(tag, str):
        return ""
    return tag.rsplit("}", 1)[-1]


def _joined_text(node: ET.Element, separator: str) -> str:
    """Join the stripped, non-blank text fragments found anywhere under ``node``."""
    return separator.join(piece.strip() for piece in node.itertext() if piece.strip())


def _first_by_local_name(root: ET.Element, name: str) -> Optional[ET.Element]:
    """Return the first element (document order) whose local tag name equals ``name``."""
    for elem in root.iter():
        if _local_name(elem.tag) == name:
            return elem
    return None


def _is_methods_section(sec: ET.Element, keywords: List[str]) -> bool:
    """Tell whether the first direct ``<title>`` of ``sec`` contains one of ``keywords``."""
    for child in sec:
        if _local_name(child.tag) == "title":
            # itertext() also sees titles wrapped in inline markup such as <italic>.
            title_lower = "".join(child.itertext()).lower()
            return any(keyword in title_lower for keyword in keywords)
    return False


def _collect_method_sections(node: ET.Element, keywords: List[str], collected: List[str]) -> None:
    """Append the text of every outermost methods-like ``<sec>`` under ``node``."""
    for child in node:
        if _local_name(child.tag) == "sec" and _is_methods_section(child, keywords):
            collected.append(_joined_text(child, "\n"))
            # The subtree is already captured in full; descending further would
            # emit matching subsections a second time.
            continue
        _collect_method_sections(child, keywords, collected)


def parse_xml_content(xml_data: str) -> Dict[str, str]:
    """Extract the abstract, the methods sections and the full body text from article XML.

    Tags are compared by local name, so documents with or without an XML
    namespace are handled alike, and only an element named exactly ``body``
    counts as the body (``tbody`` does not).

    A ``<sec>`` is treated as a methods section when its title contains one of
    the keywords "method", "material", "experiment" or "procedure"
    (case-insensitive). Each outermost matching section is returned once,
    including all of its subsections.

    Args:
        xml_data: XML document as accepted by ``ET.fromstring``.

    Returns:
        ``{"abstract": ..., "methods": ..., "body": ...}``. Abstract fragments
        are joined with spaces, body and section fragments with newlines, and
        separate methods sections with a blank line. Parts that are missing
        are empty strings. On a parsing error the error is printed and
        whatever was extracted so far is returned.
    """
    result = {"abstract": "", "methods": "", "body": ""}

    # Keywords (lower case) that mark a section title as methods-like.
    method_keywords = ["method", "material", "experiment", "procedure"]

    try:
        root = ET.fromstring(xml_data)

        # 1. Abstract
        abstract_node = _first_by_local_name(root, "abstract")
        if abstract_node is not None:
            result["abstract"] = _joined_text(abstract_node, " ")

        # 2. Body: full text, then the methods-like sections inside it
        body_node = _first_by_local_name(root, "body")
        if body_node is not None:
            result["body"] = _joined_text(body_node, "\n")

            methods_texts: List[str] = []
            _collect_method_sections(body_node, method_keywords, methods_texts)
            # There may be several methods-like sections (e.g. Experimental,
            # Statistical Analysis); keep them all.
            result["methods"] = "\n\n".join(methods_texts)

    except Exception as e:
        # Deliberately broad: one malformed document must not abort a long crawl.
        print(f"  -> XML Parsing Error: {e}")

    return result

def save_paper_json(pmc_id: str, year: int, content: Dict[str, str], year_dir: Path) -> None:
    """Write one paper to ``year_dir/<PMCID>.json``.

    The JSON object holds the keys ``pmcid``, ``year``, ``abstract``,
    ``methods`` and ``body``. The data is first written to a temporary file
    next to the target and then renamed into place, so an interrupted or
    failed write never leaves a truncated ``.json`` file that a later
    "already exists" check would mistake for a finished download.

    Args:
        pmc_id: PMC identifier; ``"PMC"`` is prepended unless the value
            already starts with it (checked case-insensitively).
        year: Publication year stored in the record.
        content: Mapping with the keys ``"abstract"``, ``"methods"`` and
            ``"body"``.
        year_dir: Existing directory that receives the file.

    Raises:
        KeyError: If ``content`` lacks one of the required keys.
        OSError: If the file cannot be written or renamed.
    """
    pmc_id_str = str(pmc_id)
    if not pmc_id_str.upper().startswith("PMC"):
        pmc_id_str = "PMC" + pmc_id_str

    data = {
        "pmcid": pmc_id_str,
        "year": year,
        "abstract": content["abstract"],
        "methods": content["methods"],
        "body": content["body"],  # full body text (includes the methods text)
    }

    out_path = year_dir / f"{pmc_id_str}.json"
    tmp_path = year_dir / f"{pmc_id_str}.json.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        # Rename only after the dump completed; replaces any existing target.
        tmp_path.replace(out_path)
    except BaseException:
        # Also covers Ctrl-C: drop the partial temporary file, then re-raise.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        raise

def _download_one(pmc_id: str, pmc_id_str: str, year: int, year_dir: Path) -> bool:
    """Fetch, parse and save one paper; return True only when a JSON file was written."""
    # 1. Download the XML
    xml_data = fetch_pmc_xml(pmc_id)
    if not xml_data:
        return False

    # 2. Parse the content (abstract, methods, body)
    content = parse_xml_content(xml_data)

    # Nothing worth saving when both abstract and body are empty
    if not content["abstract"] and not content["body"]:
        print(f"  -> Empty content for {pmc_id}")
        return False

    # 3. Save as JSON
    save_paper_json(pmc_id, year, content, year_dir)
    print(f"[OK] Saved {pmc_id_str}")
    return True


def download_text_corpus_by_years(
    query: str,
    years: List[int],
    base_dir: Path,
    max_papers_per_year: Optional[int] = None,
) -> None:
    """Download the papers matching ``query`` year by year and store them as JSON.

    For every year the matching PMC IDs are searched, optionally truncated to
    ``max_papers_per_year``, and each paper is fetched, parsed and written to
    ``base_dir/<year>/<PMCID>.json``. Papers whose JSON file already exists are
    skipped without a request, which makes an interrupted run resumable.
    After every paper that triggered a request the function sleeps
    ``REQUEST_DELAY`` seconds, whether or not the paper could be saved, and a
    progress line is printed after every 50th ID regardless of its outcome.

    Args:
        query: Entrez search expression.
        years: Publication years to process, in order.
        base_dir: Root directory of the corpus.
        max_papers_per_year: Maximum number of IDs processed per year;
            ``None`` processes all of them.
    """
    ensure_year_directories(base_dir, years)
    total_saved = 0

    for year in years:
        year_dir = base_dir / str(year)
        pmc_ids = search_pmc_ids_all(query=query, year=year)

        if max_papers_per_year is not None:
            pmc_ids = pmc_ids[:max_papers_per_year]

        print(f"[YEAR {year}] Target: {len(pmc_ids)} papers")

        for idx, pmc_id in enumerate(pmc_ids, start=1):
            pmc_id_str = str(pmc_id)
            if not pmc_id_str.upper().startswith("PMC"):
                pmc_id_str = "PMC" + pmc_id_str

            # Skip papers whose JSON file is already on disk
            out_path = year_dir / f"{pmc_id_str}.json"
            if out_path.exists():
                print(f"[SKIP] {out_path} exists")
            else:
                if _download_one(pmc_id, pmc_id_str, year, year_dir):
                    total_saved += 1
                # A request was sent, so honour the delay even when the fetch
                # failed or the paper turned out to be empty.
                time.sleep(REQUEST_DELAY)

            if idx % 50 == 0:
                print(f"[PROGRESS] {idx}/{len(pmc_ids)} done for {year}")

    print(f"\n[SUMMARY] Total saved JSON files: {total_saved}")

# Run the crawl: at most 2,000 papers for each publication year 2021-2025.
download_text_corpus_by_years(
    QUERY,
    years=[2021, 2022, 2023, 2024, 2025],
    base_dir=BASE_DIR,
    max_papers_per_year=2000,
)

[SEARCH] Year=2021, Query='proteomics AND "mass spectrometry"'
  -> Total count: 14117


KeyboardInterrupt: 