In [5]:
# Read the ID list (one ID per line) as UTF-8.
# Blank lines are skipped so that no empty-string ID ends up in `all_ids`.
with open("data/full_text_no_abstract_ids.txt", "r", encoding="utf-8") as f:
    all_ids = [stripped for stripped in (line.strip() for line in f) if stripped]


In [6]:
# Sanity check: number of IDs currently held in `all_ids`.
len(all_ids)

652582

In [7]:
import pandas as pd
# Wrap the ID list in a single-column frame named "ids".
df = pd.DataFrame({"ids": all_ids})

In [8]:
import pandas as pd
import requests
import xml.etree.ElementTree as ET
from time import sleep
from tqdm import tqdm  # ✅ 加入 tqdm 進度條

# Basic API settings.
API_URL = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"  # endpoint that is queried
TOOL = "fulltext_labeling"  # tool name sent with every request
EMAIL = "yuhsinl2@illinois.edu"  # contact email sent with every request

REQUEST_TIMEOUT = 30  # seconds; without a timeout one stalled connection hangs the whole run
MAX_ATTEMPTS = 3  # tries per batch before the batch is recorded as failed
batch_size = 200  # number of IDs sent per request


def _parse_records(xml_text):
    """Map each <record>'s ``requested-id`` attribute to its ``pmcid`` attribute.

    Records that carry no ``pmcid`` attribute map to ``None``.
    Raises ``ET.ParseError`` if ``xml_text`` is not well-formed XML.
    """
    root = ET.fromstring(xml_text)
    return {
        record.get("requested-id"): record.get("pmcid")
        for record in root.findall(".//record")
    }


def _fetch_batch(batch_ids):
    """Query the API for one batch of IDs and return ``(mapping, error)``.

    On success ``mapping`` is the parsed ``{requested-id: pmcid}`` dict and
    ``error`` is ``None``. On failure ``mapping`` is empty and ``error`` holds
    the HTTP status code or the ``repr`` of the exception that was raised.
    Network errors, unparsable replies, HTTP 429 and HTTP 5xx are retried
    (up to ``MAX_ATTEMPTS`` tries in total) with exponential back-off.
    """
    params = {
        "tool": TOOL,
        "email": EMAIL,
        "ids": ",".join(batch_ids),
    }
    error = None
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = requests.get(API_URL, params=params, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            error = repr(exc)
        else:
            if response.status_code == 200:
                try:
                    return _parse_records(response.text), None
                except ET.ParseError as exc:
                    error = repr(exc)
            else:
                error = response.status_code
                # Other client errors will not be cured by resending the same request.
                if response.status_code < 500 and response.status_code != 429:
                    break
        if attempt < MAX_ATTEMPTS:
            sleep(2 ** attempt)  # back off before the next try
    return {}, error


id_to_pmcid = {}
failed_batch_starts = []  # start offsets (into all_ids) of batches that never succeeded

for i in tqdm(range(0, len(all_ids), batch_size), desc="Processing batches"):
    mapping, error = _fetch_batch(all_ids[i:i + batch_size])
    if error is None:
        id_to_pmcid.update(mapping)
    else:
        failed_batch_starts.append(i)
        print(f"Error on batch {i // batch_size + 1}: {error}")

    # Pause between requests to avoid the API rate limit.
    sleep(1)

if failed_batch_starts:
    # Make it explicit that some missing PMCIDs come from failed requests, not from the API's answer.
    print(f"{len(failed_batch_starts)} batch(es) failed; their IDs will have no PMCID.")

# Add the PMCID column; IDs absent from the mapping become NaN.
df['PMCID'] = df['ids'].map(id_to_pmcid)

# Save the result.
df.to_csv('data/full_text_without_abstract_ids.csv', index=False)
df


Processing batches: 100%|██████████| 3263/3263 [1:04:02<00:00,  1.18s/it]


Unnamed: 0,ids,PMCID
0,1279164,PMC5375567
1,1279522,PMC334367
2,1279851,PMC2965439
3,1280811,PMC334429
4,1280812,PMC334434
...,...,...
652577,40123982,PMC11926562
652578,40123983,PMC11926565
652579,40124147,PMC11925622
652580,40124349,PMC11927006


In [9]:
import json

# Load the training file: one JSON object per line.
# The encoding is given explicitly so decoding does not depend on the platform default,
# and blank lines are skipped because json.loads would raise on them.
with open("data/train.txt", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Quick look at the first record: its article ID and the first two entries of its text.
print(data[0]['article_id'])
print(data[0]['article_text'][:2])


PMC3872579
['a recent systematic analysis showed that in 2011 , 314 ( 296 - 331 ) million children younger than 5 years were mildly , moderately or severely stunted and 258 ( 240 - 274 ) million were mildly , moderately or severely underweight in the developing countries .', 'in iran a study among 752 high school girls in sistan and baluchestan showed prevalence of 16.2% , 8.6% and 1.5% , for underweight , overweight and obesity , respectively .']


In [10]:
import pandas as pd
# Build a frame from the parsed records in `data` (one row per record).
df2 = pd.DataFrame(data)
# Display the frame as the cell output.
df2

Unnamed: 0,article_id,article_text,abstract_text,labels,section_names,sections
0,PMC3872579,[a recent systematic analysis showed that in 2...,[<S> background : the present study was carrie...,,"[INTRODUCTION, MATERIALS AND METHODS, Particip...",[[a recent systematic analysis showed that in ...
1,PMC3770628,[it occurs in more than 50% of patients and ma...,[<S> backgroundanemia in patients with cancer ...,,"[Introduction, Patients and methods, Study des...",[[it occurs in more than 50% of patients and m...
2,PMC5330001,"[tardive dystonia ( td ) , a rarer side effect...",[<S> tardive dystonia ( td ) is a serious side...,,"[INTRODUCTION, CASE REPORT, DISCUSSION, Declar...","[[tardive dystonia ( td ) , a rarer side effec..."
3,PMC4386667,"[lepidoptera include agricultural pests that ,...",[<S> many lepidopteran insects are agricultura...,,"[1. Introduction, 2. Insect Immunity, 3. Signa...",[[lepidoptera include agricultural pests that ...
4,PMC4307954,[syncope is caused by transient diffuse cerebr...,[<S> we present an unusual case of recurrent c...,,"[Introduction, Case report, Discussion, Confli...",[[syncope is caused by transient diffuse cereb...
...,...,...,...,...,...,...
119919,PMC3502213,[eukaryotic cells depend on vesicle - mediated...,[<S> long - distance trafficking of membranous...,,"[Introduction, Motor-Dependent Transport of Ra...",[[eukaryotic cells depend on vesicle - mediate...
119920,PMC3198562,[as regards the selection criteria of the post...,[<S> aims and objectives : to study the stress...,,"[INTRODUCTION, MATERIALS AND METHODS, Modeling...",[[fiber post systems are routinely used in res...
119921,PMC4436536,[in most of the peer review publications in th...,[<S> abstractbackgroundthe objective of this s...,,"[Introduction, Methods, Results, Discussion, L...",[[in most of the peer review publications in t...
119922,PMC4251613,[the reveal registry is a longitudinal registr...,[<S> background : patients with pulmonary arte...,,"[TRIAL REGISTRY:, Materials and Methods, REVEA...","[[], [the reveal registry is a longitudinal re..."


In [11]:
# Keep only rows whose PMCID in `df` equals an article_id in `df2` (inner join).
merged_df = df.merge(
    df2,
    how='inner',
    left_on='PMCID',
    right_on='article_id',
)
merged_df

Unnamed: 0,ids,PMCID,article_id,article_text,abstract_text,labels,section_names,sections
0,26289395,PMC4852139,PMC4852139,[e - cadherin immunohistochemistry in foci of ...,"[<S> a 9-year - old , female , spayed , domest...",,[Supplementary Material],[[e - cadherin immunohistochemistry in foci of...
1,26913168,PMC4748986,PMC4748986,"[villa emy b&b , stra , brenta riviera ( venic...",[<S> the decline of the performance of the hum...,,"[Program, INDEX]","[[villa emy b&b , stra , brenta riviera ( veni..."
2,27054019,PMC4821221,PMC4821221,[],[<S> muscle regeneration is a multistep proces...,,[INDEX],[[]]
