In [56]:
import re
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
from lxml import etree, html

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time

# Page whose article listing is scraped.
url = "https://www.mdpi.com/2079-4991/12/13"

# Upper bound, in characters, applied to every generated file name.
max_length = 150

# ChromeDriver executable, given relative to the current working directory.
chrome_driver_path = "./chromedriver-win64/chromedriver-win64/chromedriver.exe"


# Launch headless Chrome, open `url`, and keep scrolling to the bottom of the
# page until the number of article entries in the DOM reaches the total
# announced in the page heading. On success `dom` holds the fully expanded
# page and `total_num` the announced article count.

# Configure the ChromeDriver service.
service = Service(chrome_driver_path)

options = webdriver.ChromeOptions()
# Run without opening a browser window; remove this line to watch the browser.
options.add_argument('--headless')
options.add_argument('--start-maximized')

# XPath matching one listing entry per article.
article_xpath = '//div[contains(@class, "generic-item") and contains(@class, "article-item")]'
# Give up after this many consecutive scrolls that load no new entries, so a
# page that stops loading cannot keep the loop running forever.
max_stalled_scrolls = 5

# Create the browser instance.
driver = webdriver.Chrome(service=service, options=options)
try:
    # Open the target page and give it time to render (tune as needed).
    driver.get(url)
    time.sleep(3)

    # Parse the rendered HTML.
    html_content = driver.page_source
    soup = BeautifulSoup(html_content, 'lxml')
    dom = html.fromstring(str(soup))

    # Read the total number of articles from the "<N> articles" heading.
    headings = dom.xpath('//h1[contains(string(.), "articles")]')
    if not headings:
        raise RuntimeError(f'No <h1> containing "articles" found on {url}')
    total_ele = headings[0].text_content().strip()
    counts = re.findall(r"[0-9]+(?= articles)", total_ele)
    if not counts:
        raise RuntimeError(f"Cannot parse the article count from heading: {total_ele!r}")
    total_num = int(counts[0])

    article_num = len(dom.xpath(article_xpath))
    print(f"(第一次抓取) Total articles: {total_num}, current: {article_num}")

    stalled_scrolls = 0
    while article_num < total_num:
        # Scroll to the bottom, then wait for the newly requested entries.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)

        # Re-parse the page and recount the entries.
        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'lxml')
        dom = html.fromstring(str(soup))
        loaded_num = len(dom.xpath(article_xpath))

        # Track consecutive scrolls without progress.
        stalled_scrolls = stalled_scrolls + 1 if loaded_num <= article_num else 0
        article_num = loaded_num
        print(f"Total articles: {total_num}, current: {article_num}")

        if article_num < total_num and stalled_scrolls >= max_stalled_scrolls:
            raise RuntimeError(
                f"Page stopped loading new articles at {article_num}/{total_num} "
                f"after {max_stalled_scrolls} scrolls without progress"
            )
finally:
    # Always close the browser, even on failure, so no Chrome or ChromeDriver
    # process is left running.
    driver.quit()

# Collect the PDF links from the parsed page (`dom`), verify that their count
# matches the announced total (`total_num`), and build the download listing
# `df` with one row per article: a file-system-safe name and the PDF URL.

# A single query serves both the count check and the iteration below.
articles = dom.xpath('//a[@class="UD_Listings_ArticlePDF"]')
articles_nums = len(articles)

# Explicit check instead of `assert`, which is skipped when Python runs with -O.
if articles_nums != total_num:
    raise RuntimeError(
        f"Found {articles_nums} PDF links but the page announces {total_num} articles"
    )

# Build the download listing.
data = []
for article in articles:
    article_name = article.get("data-name")
    article_href = article.get("href")
    if article_name is None or article_href is None:
        # Without both attributes the entry can neither be named nor downloaded.
        raise ValueError(
            f"PDF link is missing data-name or href: "
            f"data-name={article_name!r}, href={article_href!r}"
        )

    # Replace every character that is illegal in Windows file names
    # (\ / : * ? " < > |), including the backslash, then cap the length.
    article_name = re.sub(r'[\\/:*?"<>|]', '_', article_name)
    article_name = article_name[:max_length]

    download_path = "https://www.mdpi.com" + article_href
    data.append({"name": article_name, "path": download_path})

# Fixed columns keep the header present even when no articles are listed.
df = pd.DataFrame(data, columns=["name", "path"])

# Write the download listing `df` to ./csv/<a>-<b>.csv, where <a> and <b> are
# the last two path segments of `url`.

# Drop any trailing slash first; otherwise the last segment would be empty and
# the file would be named "<a>-.csv".
_split = url.rstrip('/').split('/')
csv_path = os.path.abspath(f"./csv/{_split[-2]}-{_split[-1]}.csv")

# to_csv does not create missing directories, so make sure ./csv exists.
os.makedirs(os.path.dirname(csv_path), exist_ok=True)

df.to_csv(csv_path, index=False, encoding='utf-8')
print(f"Done, save to {csv_path}")

(第一次抓取) Total articles: 192, current: 90
Total articles: 192, current: 180
Total articles: 192, current: 192
Done, save to C:\Users\User\Desktop\paper_download\csv\12-13.csv
