In [23]:
!pip install lxml



# 以一筆書目資料試做

In [23]:
# 確保 openpyxl 有安裝（只需執行一次）
!pip install openpyxl

import requests
from bs4 import BeautifulSoup

doi = "10.1371/journal.pone.0259453"
xml_url = f"https://journals.plos.org/plosone/article/file?id={doi}&type=manuscript"

response = requests.get(xml_url)
soup = BeautifulSoup(response.content, "lxml-xml")  # 用 XML 解析器

authors = soup.find_all("contrib", {"contrib-type": "author"})

records = []

# 機構 mapping：aff id -> 機構名稱
aff_dict = {}
for aff in soup.find_all("aff"):
    aff_id = aff.get("id")
    aff_text = aff.get_text(strip=True)
    aff_dict[aff_id] = aff_text

# 解析每位作者
for author in authors:
    surname = author.find("surname")
    given_names = author.find("given-names")
    
    full_name = ""
    if given_names and surname:
        full_name = f"{given_names.text.strip()} {surname.text.strip()}"
    elif surname:
        full_name = surname.text.strip()

    aff_ref = author.find("xref", {"ref-type": "aff"})
    aff_id = aff_ref.get("rid") if aff_ref else None
    affiliation = aff_dict.get(aff_id, "N/A")

    records.append({"name": full_name, "affiliation": affiliation})

# 顯示結果
for r in records:
    print(f"Name: {r['name']}, Affiliation: {r['affiliation']}")




Name: Yu-Wei Chang, Affiliation: 1Department of Library and Information Science, National Taiwan University, Taipei, Taiwan
Name: Dar-Zen Chen, Affiliation: 3Department of Mechanical Engineering, National Taiwan University, Taipei, Taiwan
Name: Mu-Hsuan Huang, Affiliation: 1Department of Library and Information Science, National Taiwan University, Taipei, Taiwan
Name: Alberto Baccini, Affiliation: N/A
Name: Alberto Baccini, Affiliation: N/A
Name: Alberto Baccini, Affiliation: N/A
Name: Alberto Baccini, Affiliation: N/A


# 輸出成excel

In [24]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import tempfile
import os

# 設定 DOI
doi = "10.1371/journal.pone.0259453"
xml_url = f"https://journals.plos.org/plosone/article/file?id={doi}&type=manuscript"

# 發送請求並解析 XML
response = requests.get(xml_url)
soup = BeautifulSoup(response.content, "lxml-xml")  # 用 XML 解析

# 準備機構 mapping：aff id -> 機構名稱
aff_dict = {}
for aff in soup.find_all("aff"):
    aff_id = aff.get("id")
    aff_text = aff.get_text(strip=True)
    aff_dict[aff_id] = aff_text

# 擷取作者資訊
records = []
authors = soup.find_all("contrib", {"contrib-type": "author"})
for author in authors:
    surname = author.find("surname")
    given_names = author.find("given-names")
    
    if given_names and surname:
        full_name = f"{given_names.text.strip()} {surname.text.strip()}"
    elif surname:
        full_name = surname.text.strip()
    else:
        full_name = "N/A"

    # 找作者對應的機構 ID 與實際名稱
    aff_ref = author.find("xref", {"ref-type": "aff"})
    aff_id = aff_ref.get("rid") if aff_ref else None
    affiliation = aff_dict.get(aff_id, "N/A")

    records.append({"Name": full_name, "Affiliation": affiliation})

# 存成 DataFrame
df = pd.DataFrame(records)

# 儲存為 Excel
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmpfile:
    output_path = tmpfile.name
    df.to_excel(output_path, index=False, engine="openpyxl")

print(f"✅ 已成功儲存 Excel 檔案於：{output_path}")
# ✅ 自動打開檔案（限 macOS 有安裝 Excel 的狀況）
os.system(f"open {output_path}")


✅ 已成功儲存 Excel 檔案於：/var/folders/8p/k5cclwt549dg7s3z944xq9h80000gn/T/tmpw4r11pvd.xlsx


0

In [26]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import tempfile
import os

# === Step 1: 設定 DOI ===
doi = "10.1371/journal.pone.0259453"
xml_url = f"https://journals.plos.org/plosone/article/file?id={doi}&type=manuscript"

# === Step 2: 發送請求並解析 XML ===
response = requests.get(xml_url)
soup = BeautifulSoup(response.content, "lxml-xml")

# === Step 3: 機構 mapping：aff id -> 機構名稱 ===
aff_dict = {}
for aff in soup.find_all("aff"):
    aff_id = aff.get("id")
    aff_text = aff.get_text(strip=True)
    aff_dict[aff_id] = aff_text

# === Step 4: 擷取作者資訊 ===
records = []
authors = soup.find_all("contrib", {"contrib-type": "author"})
for author in authors:
    surname = author.find("surname")
    given_names = author.find("given-names")
    
    if given_names and surname:
        full_name = f"{given_names.text.strip()} {surname.text.strip()}"
    elif surname:
        full_name = surname.text.strip()
    else:
        full_name = "N/A"

    # 機構
    aff_ref = author.find("xref", {"ref-type": "aff"})
    aff_id = aff_ref.get("rid") if aff_ref else None
    affiliation = aff_dict.get(aff_id, "N/A")

    # 是否來自台灣？
    is_taiwan = "taiwan" in affiliation.lower()

    records.append({
        "Name": full_name,
        "Affiliation": affiliation,
        "Is_Taiwan_Affiliation": is_taiwan
    })

# === Step 5: 存成 Excel ===
df = pd.DataFrame(records)

with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmpfile:
    output_path = tmpfile.name
    df.to_excel(output_path, index=False, engine="openpyxl")

print(f"✅ 已成功儲存 Excel 檔案於：{output_path}")

# 若在 macOS 上可自動打開
os.system(f"open {output_path}")


✅ 已成功儲存 Excel 檔案於：/var/folders/8p/k5cclwt549dg7s3z944xq9h80000gn/T/tmpwfi4n7u9.xlsx


0

In [27]:
import requests
doi = "10.1371/journal.pone.0170929"
url = f"https://journals.plos.org/plosone/article/file?id={doi}&type=manuscript"
response = requests.get(url)
print(response.status_code)


200


# 一次處理三篇

In [32]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# 定義要查詢的 DOI 列表
dois = [
    "10.1371/journal.pone.0259453",
    "10.1371/journal.pone.0260961",
    "10.1371/journal.pone.0274826"
    # 在此處繼續添加更多 DOI
]

# 建立一個空的列表來儲存資料
all_records = []

# 依序處理每個 DOI
for doi in dois:
    xml_url = f"https://journals.plos.org/plosone/article/file?id={doi}&type=manuscript"
    response = requests.get(xml_url)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "lxml-xml")
        
        # 準備機構映射：aff id -> 機構名稱
        aff_dict = {}
        for aff in soup.find_all("aff"):
            aff_id = aff.get("id")
            aff_text = aff.get_text(strip=True)
            aff_dict[aff_id] = aff_text

        # 擷取作者資訊
        authors = soup.find_all("contrib", {"contrib-type": "author"})
        for author in authors:
            surname = author.find("surname")
            given_names = author.find("given-names")
            
            if given_names and surname:
                full_name = f"{given_names.text.strip()} {surname.text.strip()}"
            elif surname:
                full_name = surname.text.strip()
            else:
                full_name = "N/A"
            
            # 找作者對應的機構 ID 與實際名稱
            aff_ref = author.find("xref", {"ref-type": "aff"})
            aff_id = aff_ref.get("rid") if aff_ref else None
            affiliation = aff_dict.get(aff_id, "N/A")

            all_records.append({"DOI": doi, "Name": full_name, "Affiliation": affiliation})

    else:
        print(f"Failed to fetch data for DOI {doi}, status code: {response.status_code}")

# 儲存結果為 DataFrame
df = pd.DataFrame(all_records)

with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmpfile:
    output_path = tmpfile.name
    df.to_excel(output_path, index=False, engine="openpyxl")

print(f"✅ 已成功儲存 Excel 檔案於：{output_path}")

# 若在 macOS 上可自動打開
os.system(f"open {output_path}")



✅ 已成功儲存 Excel 檔案於：/var/folders/8p/k5cclwt549dg7s3z944xq9h80000gn/T/tmpq618i_ry.xlsx


0

In [3]:
!pip install streamlit

import streamlit as st
import requests
from bs4 import BeautifulSoup
import pandas as pd
import tempfile
import base64

# 擷取資料的主函式
def fetch_metadata_from_dois(dois):
    all_records = []
    for doi in dois:
        xml_url = f"https://journals.plos.org/plosone/article/file?id={doi}&type=manuscript"
        response = requests.get(xml_url)

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "lxml-xml")
            aff_dict = {aff.get("id"): aff.get_text(strip=True) for aff in soup.find_all("aff")}
            authors = soup.find_all("contrib", {"contrib-type": "author"})

            for author in authors:
                surname = author.find("surname")
                given_names = author.find("given-names")
                full_name = f"{given_names.text.strip()} {surname.text.strip()}" if given_names and surname else (surname.text.strip() if surname else "N/A")
                aff_ref = author.find("xref", {"ref-type": "aff"})
                aff_id = aff_ref.get("rid") if aff_ref else None
                affiliation = aff_dict.get(aff_id, "N/A")

                all_records.append({"DOI": doi, "Name": full_name, "Affiliation": affiliation})
    return pd.DataFrame(all_records)

# 建立下載連結
def generate_excel_download_link(df):
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmpfile:
        df.to_excel(tmpfile.name, index=False, engine="openpyxl")
        tmpfile_path = tmpfile.name

    with open(tmpfile_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()
    href = f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="doi_affiliations.xlsx">📥 下載 Excel 檔案</a>'
    return href

# Streamlit UI
st.set_page_config(page_title="DOI 作者與機構擷取器", layout="wide")
st.title("🔍 DOI 作者與機構擷取器")
st.markdown("輸入 PLOS DOI 列表（每行一筆），擷取作者姓名與所屬機構資訊：")

input_text = st.text_area("請輸入 DOI（每行一筆）", height=200)

if st.button("🚀 開始擷取資料"):
    doi_list = [line.strip() for line in input_text.splitlines() if line.strip()]
    if doi_list:
        df = fetch_metadata_from_dois(doi_list)
        st.success(f"✅ 共擷取 {len(df)} 筆作者資料")
        st.dataframe(df)
        st.markdown(generate_excel_download_link(df), unsafe_allow_html=True)
    else:
        st.warning("請至少輸入一筆 DOI。")


Collecting streamlit
  Downloading streamlit-1.44.1-py3-none-any.whl.metadata (8.9 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Downloading altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting blinker<2,>=1.0.0 (from streamlit)
  Downloading blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting click<9,>=7.0 (from streamlit)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting protobuf<6,>=3.20 (from streamlit)
  Downloading protobuf-5.29.4-cp38-abi3-macosx_10_9_universal2.whl.metadata (592 bytes)
Collecting pyarrow>=7.0 (from streamlit)
  Downloading pyarrow-19.0.1-cp313-cp313-macosx_12_0_x86_64.whl.metadata (3.3 kB)
Collecting tenacity<10,>=8.1.0 (from streamlit)
  Downloading tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.44-py3-none-any.whl.metadata (13 k

2025-04-24 16:12:53.152 
  command:

    streamlit run /Users/timothy/Library/Python/3.13/lib/python/site-packages/ipykernel_launcher.py [ARGUMENTS]
2025-04-24 16:12:53.159 Session state does not function when running a script without `streamlit run`


In [4]:
# app.py
import streamlit as st
import requests
from bs4 import BeautifulSoup
import pandas as pd
import tempfile
import os

st.title("📚 DOI 作者與機構擷取工具")

doi_input = st.text_area("請輸入 DOI（每行一筆）")
run_button = st.button("🚀 開始擷取")

if run_button and doi_input.strip():
    dois = [d.strip() for d in doi_input.strip().split("\n")]
    all_records = []

    for doi in dois:
        xml_url = f"https://journals.plos.org/plosone/article/file?id={doi}&type=manuscript"
        response = requests.get(xml_url)

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "lxml-xml")
            aff_dict = {aff.get("id"): aff.get_text(strip=True) for aff in soup.find_all("aff")}
            authors = soup.find_all("contrib", {"contrib-type": "author"})

            for author in authors:
                surname = author.find("surname")
                given_names = author.find("given-names")
                name = f"{given_names.text.strip()} {surname.text.strip()}" if given_names and surname else surname.text.strip() if surname else "N/A"
                aff_ref = author.find("xref", {"ref-type": "aff"})
                aff_id = aff_ref.get("rid") if aff_ref else None
                affiliation = aff_dict.get(aff_id, "N/A")
                all_records.append({"DOI": doi, "Name": name, "Affiliation": affiliation})
        else:
            st.warning(f"❌ 無法取得 DOI: {doi}")

    df = pd.DataFrame(all_records)

    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
        df.to_excel(tmp.name, index=False)
        st.success("✅ 擷取完成！")
        st.download_button("⬇️ 下載 Excel", data=open(tmp.name, 'rb'), file_name="authors_affiliations.xlsx")




# 爬蟲（尚未成功，這禮拜考完期中考再做更詳細處理）

In [13]:
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup

# ------------------------------
# Step 1: 呼叫 PLOS 搜尋 API（方式一：利用全文文字搜尋）
# ------------------------------
base_url = "http://api.plos.org/search"
params = {
    "q": "journal:PLoSONE",  # 改成只篩 PLOS ONE
    "fl": "id,title_display",            # 回傳欄位，id 為 DOI
    "wt": "json",
    "rows": 100,                         # 取 100 筆
    "start": 0
}

print("開始呼叫 PLOS API...")
response = requests.get(base_url, params=params)
data = response.json()

# 若 API 回傳結構不同，請檢查 data 結構
docs = data.get("response", {}).get("docs", [])
print(f"取得 {len(docs)} 篇文章。")

# ------------------------------
# Step 2: 逐篇文章抓取 JATS XML，並解析機構資訊
# ------------------------------
records = []

# 這個函數從一篇文章的 XML 中找出所有包含 "Taiwan" 的機構資訊
def extract_taiwan_affiliations(xml_content, doi, title):
    # 使用 lxml-xml 解析 XML
    soup = BeautifulSoup(xml_content, "lxml-xml")
    for aff in soup.find_all("aff"):
        # 取得機構文字，使用空格將內部 tag 合併
        aff_text = aff.get_text(separator=" ", strip=True)
        if "Taiwan" in aff_text:
            records.append({
                "doi": doi,
                "title": title,
                "affiliation": aff_text
            })

# 針對取得的每篇文章進行處理
for doc in docs:
    doi = doc.get("id")
    title = doc.get("title_display", "")
    # 組成 JATS XML 的下載 URL
    xml_url = f"https://journals.plos.org/plosone/article/file?id={doi}&type=manuscript"
    try:
        xml_response = requests.get(xml_url)
        # 簡單延遲，避免過快請求
        time.sleep(0.1)
        if xml_response.status_code == 200:
            extract_taiwan_affiliations(xml_response.content, doi, title)
        else:
            print(f"doi: {doi} 無法取得 XML (狀態碼: {xml_response.status_code})")
    except Exception as e:
        print(f"doi: {doi} 取得或解析失敗：{e}")

# ------------------------------
# Step 3: 將結果存成 DataFrame 並輸出
# ------------------------------
df = pd.DataFrame(records)
print("結果 DataFrame:")
print(df.head())

# 若需要存成 Excel 檔（例如在 Jupyter Notebook 本機可寫入檔案系統）
output_file = "plos_taiwan_affiliations.xlsx"
df.to_excel(output_file, index=False, engine="openpyxl")
print(f"✅ 成功將結果儲存為 {output_file}")


開始呼叫 PLOS API...
取得 0 篇文章。
結果 DataFrame:
Empty DataFrame
Columns: []
Index: []
✅ 成功將結果儲存為 plos_taiwan_affiliations.xlsx


In [23]:
import requests
import time
import re

def get_dois_from_dynamic_search(max_results=10):
    base_url = "https://journals.plos.org/plosone/dynamicSearch"
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
    }
    page = 1
    dois = []

    print("🔍 開始從 dynamicSearch API 抓取 DOI ...")

    while len(dois) < max_results:
        params = {
            "q": "author_affiliate:taiwan",
            "page": page
        }

        try:
            response = requests.get(base_url, params=params, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"❌ 第 {page} 頁請求失敗：{e}")
            break

        data = response.json()
        results = data.get("searchResults", [])
        if not results:
            print(f"⚠️ 第 {page} 頁無結果")
            break

        for html_snippet in results:
            match = re.search(r'https://doi.org/(10\.1371/journal\.pone\.\d+)', html_snippet)
            if match:
                doi = match.group(1)
                if doi not in dois:
                    dois.append(doi)
                    print(f"✅ 抓到 DOI: {doi}")
                    if len(dois) >= max_results:
                        break

        page += 1
        time.sleep(1)  # 每頁等待 1 秒，避免被封鎖

    print(f"🎯 共抓到 {len(dois)} 筆 DOI")
    return dois

# 測試前 10 篇
dois = get_dois_from_dynamic_search(max_results=10)




🔍 開始從 dynamicSearch API 抓取 DOI ...
❌ 第 464 頁請求失敗：('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
🎯 共抓到 0 筆 DOI
