In [11]:
import os
import pandas as pd
import re
from bs4 import BeautifulSoup
from IPython.display import FileLink

def parse_authors_from_xml(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), "lxml-xml")

    doi_tag = soup.find("article-id", {"pub-id-type": "doi"})
    doi = doi_tag.text.strip() if doi_tag else "N/A"

    title_tag = soup.find("article-title")
    title = title_tag.get_text(strip=True) if title_tag else "N/A"

    abstract_tag = soup.find("abstract")
    abstract = abstract_tag.get_text(strip=True) if abstract_tag else "N/A"

    keywords = soup.find_all("kwd")
    keywords_text = "; ".join([kwd.text for kwd in keywords]) if keywords else "N/A"

    subjects = soup.find_all("subject")
    subjects_text = "; ".join([s.text for s in subjects]) if subjects else "N/A"

    aff_dict = {}
    for aff in soup.find_all("aff"):
        aff_id = aff.get("id")

        parts = []
        institution = aff.find("institution")
        if institution:
            parts.append(institution.get_text(strip=True))

        addr = aff.find("addr-line")
        if addr:
            parts.append(addr.get_text(strip=True))

        country = aff.find("country")
        if country:
            parts.append(country.get_text(strip=True))

        if not parts:
            label = aff.find("label")
            if label:
                label.extract()
            parts.append(aff.get_text(strip=True))

        aff_text = ", ".join(parts)
        aff_text = re.sub(r"^\d+\s*", "", aff_text)  # remove leading digits
        aff_dict[aff_id] = aff_text

        # 原本這一段要替換
    author_group = soup.find("contrib-group")
    authors = author_group.find_all("contrib", {"contrib-type": "author"}) if author_group else []

    # ✅ 更嚴謹的有效作者篩選（避免 N/A 被當成作者）
    valid_authors = []
    for author in authors:
        if author.find("surname") or author.find("given-names"):
            valid_authors.append(author)

    if not valid_authors:
        return []

    # ⬇️ 接下來請使用 valid_authors 而非原本的 authors
    records = []
    seen = set()
    for idx, author in enumerate(valid_authors, 1):
        surname = author.find("surname")
        given = author.find("given-names")
        name = f"{given.text.strip()} {surname.text.strip()}" if given and surname else surname.text.strip() if surname else "N/A"

        if name in seen:
            continue
        seen.add(name)

        role = []
        if idx == 1:
            role.append("第一作者")
        if author.find("xref", {"ref-type": "corresp"}):
            role.append("通訊作者")

        aff_ref = author.find("xref", {"ref-type": "aff"})
        aff_id = aff_ref.get("rid") if aff_ref else None
        aff_text = aff_dict.get(aff_id, "N/A")

        records.append({
            "DOI": doi,
            "Title": title,
            "Author": name,
            "Affiliation": aff_text,
            "Role": ", ".join(role) if role else "作者",
            "Abstract": abstract,
            "Keywords": keywords_text,
            "Subjects": subjects_text,
            "FullText": f"{title} {abstract} {keywords_text} {subjects_text}"
        })


    return records


In [17]:
# ✅ 設定資料夾路徑
folder_path = "/Users/timothy/Desktop/allofplos"

records = []
xml_files = [f for f in os.listdir(folder_path) if f.endswith(".xml")]

start_idx = 17600
batch_size = 50000
csv_base = "plos_all_cleaned"

for idx, file in enumerate(tqdm(xml_files), 1):
    file_path = os.path.join(folder_path, file)
    records.extend(parse_authors_from_xml(file_path))

    if idx % batch_size == 0:
        df = pd.DataFrame(records)
        df.to_csv(f"{csv_base}_up_to_{idx}.csv", index=False)
        print(f"✔️ 儲存進度：已處理 {idx} 筆")

# 最終儲存一次完整資料
df = pd.DataFrame(records)
df.to_csv(f"{csv_base}_final.csv", index=False)
df.head()

 14%|█▍        | 50002/362279 [6:31:14<404:40:25,  4.67s/it]   

✔️ 儲存進度：已處理 50000 筆


 14%|█▍        | 50100/362279 [6:31:24<9:06:59,  9.51it/s]  Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x10eb19400>>
Traceback (most recent call last):
  File "/Users/timothy/Library/Python/3.13/lib/python/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
 14%|█▍        | 50283/362279 [6:31:43<40:30:35,  2.14it/s]


KeyboardInterrupt: 

In [3]:
# 顯示下載連結
FileLink("plos_research_only.csv")


In [18]:
import pandas as pd

df1 = pd.read_csv('plos_all_cleaned_up_to_17600.csv')
df2 = pd.read_csv('plos_all_cleaned_up_to_50000.csv')

merged_df = pd.concat([df1, df2], ignore_index=True)
merged_df.to_csv('plos_all_cleaned_up_to_67600.csv', index=False)


  df1 = pd.read_csv('plos_all_cleaned_up_to_17600.csv')
  df2 = pd.read_csv('plos_all_cleaned_up_to_50000.csv')
