In [None]:
import os
import time
import re
import random
import zipfile
import shutil
import math
import psutil  # 用來清理殘留的 Chrome
from typing import List, Set, Optional, Tuple
from io import BytesIO

import ddddocr
from PIL import Image
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    NoAlertPresentException,
    UnexpectedAlertPresentException
)

# 獲取腳本所在的目錄
BASE_DIR = os.getcwd()

# ==========================================
# [前置作業] 強制關閉殘留的 Chrome，避免檔案鎖定
# ==========================================
# Environment cleanup: kill every process whose name contains "chrome" so that
# a leftover browser cannot hold file locks (see the section header above).
print("[-] 正在執行環境清場 (關閉背景 Chrome)...")
try:
    for proc in psutil.process_iter(['pid', 'name']):
        # psutil stores None for attributes it was not allowed to read, so
        # normalise to an empty string instead of crashing on `.lower()`
        # (which previously aborted the whole sweep via the outer handler).
        proc_name = proc.info.get('name') or ''
        if 'chrome' not in proc_name.lower():
            continue
        try:
            proc.kill()
        except psutil.Error:
            # Best effort: the process may already be gone or be protected.
            pass
    time.sleep(2)  # give the OS a moment to release file handles
    print("[*] 環境已清理完畢。")
except Exception as e:
    print(f"[!] 清場時發生小錯誤 (可忽略): {e}")

# ==========================================
# [主程式] 論文下載器
# ==========================================


class BaseThesisDownloader:
    """Semi-automated full-text thesis downloader driven through Chrome.

    The workflow implemented by :meth:`run` is: start the browser, wait for
    the user to log in manually, submit a keyword search, then walk the result
    pages and download each record's electronic full text, answering the
    captcha with OCR.

    Two plain-text files under ``BASE_DIR`` make the process resumable:
    ``download_log.txt`` (one normalised record URL per finished download) and
    ``page_progress.txt`` (number of the last result page that was processed).
    """

    # XPath of the "log out" link; its presence is used as the logged-in signal.
    _LOGOUT_LINK_XPATH = "//div[@class='user_area']//a[text()='登出']"
    # Suffix of a file Chrome is still writing.
    _PARTIAL_SUFFIX = '.crdownload'
    # Suffixes accepted as a finished thesis download.
    _FINISHED_SUFFIXES = ('.pdf', '.zip')

    def __init__(self, keyword, max_downloads_per_session=200):
        """Store configuration and load persisted progress.

        Args:
            keyword: Text typed into the search box.
            max_downloads_per_session: Stop after this many successful
                downloads in one run.
        """
        self.base_url = "https://ndltd.ncl.edu.tw/cgi-bin/gs32/gsweb.cgi/login?o=dwebmge"
        self.keyword = keyword
        self.download_dir = os.path.join(BASE_DIR, "downloaded_theses")
        self.log_file = os.path.join(BASE_DIR, "download_log.txt")
        self.page_progress_file = os.path.join(BASE_DIR, "page_progress.txt")
        self.max_downloads_per_session = max_downloads_per_session
        self.items_per_page = 10
        # Random pause ranges in seconds (deliberately slow, human-like pace):
        # between two articles and between two result pages respectively.
        self.inter_article_sleep_range = (10.0, 20.0)
        self.inter_page_sleep_range = (20.0, 45.0)

        self.downloaded_urls, self.last_crawled_page = self._load_log()
        self.session_download_count = 0
        # Filled in by run_search(); 0 means "unknown".
        self.total_pages = 0
        self.ocr = ddddocr.DdddOcr(show_ad=False)
        self.driver = None
        self.wait = None
        self.main_window_handle = None

    def _setup_driver(self):
        """Start Chrome with a dedicated local profile and a 20 s explicit wait.

        Raises:
            Exception: re-raised unchanged when the browser fails to start.
        """
        print("[-] 設定 Selenium WebDriver (輕量記憶模式)...")
        os.makedirs(self.download_dir, exist_ok=True)

        # A small profile directory used only by this program, so the login
        # state is remembered between runs.
        profile_path = os.path.join(BASE_DIR, "chrome_profile_thesis")

        options = uc.ChromeOptions()
        options.add_argument(f'--user-data-dir={profile_path}')
        options.add_argument("--no-first-run")
        options.add_argument("--password-store=basic")

        prefs = {
            "download.default_directory": self.download_dir,
            "download.prompt_for_download": False,
            "profile.default_content_settings.popups": 0,
        }
        options.add_experimental_option("prefs", prefs)

        try:
            self.driver = uc.Chrome(options=options, use_subprocess=True)
            time.sleep(3)  # short settle time after launch
            self.driver.set_window_size(1366, 768)
        except Exception as e:
            print(f"[錯誤] 瀏覽器啟動失敗: {e}")
            raise

        # Also set the download directory through CDP. This is best effort:
        # a failure here must not prevent the session from starting.
        try:
            self.driver.execute_cdp_cmd("Page.setDownloadBehavior", {
                "behavior": "allow", "downloadPath": self.download_dir
            })
        except Exception as e:
            print(f"[!] 無法設定 CDP 下載路徑 (可忽略): {e}")

        self.wait = WebDriverWait(self.driver, 20)

    def _load_log(self) -> Tuple[Set[str], int]:
        """Read persisted progress from disk.

        Returns:
            ``(urls, last_page)`` where ``urls`` is the set of stripped log
            lines containing ``"record?"`` and ``last_page`` is the stored page
            number. ``last_page`` falls back to 1 when the progress file is
            missing or unparsable, and is never smaller than 1.
        """
        urls: Set[str] = set()
        last_page = 1
        if os.path.exists(self.log_file):
            with open(self.log_file, 'r', encoding='utf-8') as f:
                urls = {line.strip() for line in f if "record?" in line}
        if os.path.exists(self.page_progress_file):
            with open(self.page_progress_file, 'r', encoding='utf-8') as f:
                try:
                    # Clamp so a corrupt "0" or negative value cannot be used
                    # as a page number.
                    last_page = max(1, int(f.read().strip()))
                except ValueError:
                    pass
        print(f"[*] 載入紀錄: 已下載 {len(urls)} 篇，上次進度第 {last_page} 頁。")
        return urls, last_page

    def _is_logged_in(self) -> bool:
        """Return True when the log-out link is present on the current page."""
        try:
            return bool(self.driver.find_elements(
                By.XPATH, self._LOGOUT_LINK_XPATH))
        except Exception:
            # e.g. an alert is blocking the page; treat as "not yet".
            return False

    def _dismiss_alert(self) -> bool:
        """Accept a pending JavaScript alert; return True if one was accepted."""
        try:
            self.driver.switch_to.alert.accept()
        except Exception:
            return False
        return True

    def wait_for_manual_login(self, timeout=600):
        """Open the login page and block until the user is logged in.

        Returns immediately when the saved profile is already logged in.
        Otherwise polls about once per second, accepting alerts and reloading
        the login page whenever the browser lands on an ``index.htm`` URL.

        Args:
            timeout: Maximum number of seconds to wait for the login.

        Raises:
            TimeoutError: the log-out link never appeared within ``timeout``.
        """
        print("\n[步驟 1] 檢查登入狀態...")
        try:
            self.driver.get(self.base_url)
        except Exception:
            # Best effort: the polling loop below re-checks the page state.
            pass

        if self._is_logged_in():
            print("[★] 檢測到已登入狀態，自動繼續！")
            return

        print("\n" + "="*50)
        print("★★★ 請手動登入 (已換 IP 模式) ★★★")
        print("請在視窗中輸入帳號密碼。")
        print("程式會自動處理錯誤警告，您只需專注輸入。")
        print("="*50 + "\n")

        deadline = time.time() + timeout
        while time.time() < deadline:
            # 1. Handle alerts first so later driver calls do not fail on them.
            if self._dismiss_alert():
                time.sleep(1)

            # 2. If the browser ended up on index.htm, go back to the login page.
            try:
                if "index.htm" in self.driver.current_url:
                    self.driver.get(self.base_url)
                    time.sleep(2)
            except Exception:
                pass

            # 3. Check for success.
            if self._is_logged_in():
                print("[*] 登入成功！")
                return
            # Always pause between polls, including after a failed check, so
            # a broken driver cannot turn this into a busy loop.
            time.sleep(1)
        raise TimeoutError("登入逾時")

    def run_search(self):
        """Submit the keyword search and jump to the last processed page.

        Sets ``self.total_pages`` (0 when the result count cannot be parsed)
        and resets ``self.last_crawled_page`` to 1 when the page jump fails.

        Raises:
            Exception: re-raised unchanged when the search form cannot be used.
        """
        print("\n[步驟 2] 執行搜尋...")
        # Extract the session id ("ccd") from the current URL.
        ccd = None
        try:
            match = re.search(r'ccd=([^/&]+)', self.driver.current_url)
            if match:
                ccd = match.group(1)
        except Exception:
            pass

        if ccd:
            target = f"https://ndltd.ncl.edu.tw/cgi-bin/gs32/gsweb.cgi/ccd={ccd}/search?mode=basic"
            # NOTE(review): ccd was just taken from current_url, so this
            # condition is almost never true and `target` is effectively never
            # loaded. Confirm the intended condition before changing it.
            if f"ccd={ccd}" not in self.driver.current_url:
                self.driver.get(target)

        try:
            search_box = self.wait.until(
                EC.visibility_of_element_located((By.ID, "ysearchinput0")))

            # Tick every "dcf" checkbox and untick "cacheinternet" if present.
            self.driver.execute_script("""
                var boxes = document.getElementsByName("dcf");
                for (var i = 0; i < boxes.length; i++) {
                    if (!boxes[i].checked) { boxes[i].checked = true; }
                }
                var fulltext = document.getElementById("cacheinternet");
                if (fulltext && fulltext.checked) { fulltext.checked = false; }
            """)
            time.sleep(1)

            search_box.clear()
            search_box.send_keys(self.keyword)
            time.sleep(1)

            self.driver.find_element(By.ID, "gs32search").click()
            print(f"[*] 搜尋提交: {self.keyword}")

            # Parse the total hit count to derive the number of pages.
            try:
                summary = self.wait.until(EC.visibility_of_element_located(
                    (By.XPATH, "//td[contains(., '檢索結果共')]"))).text
                total = int(re.search(r'檢索結果共\s*(\d+)', summary).group(1))
                self.total_pages = math.ceil(total / self.items_per_page)
                print(f"[*] 找到 {total} 筆，共 {self.total_pages} 頁。")
            except Exception:
                self.total_pages = 0
                print("[!] 無法解析結果總數，頁數未知。")

            # Resume from the page stored in the progress file.
            if self.last_crawled_page > 1:
                print(f"[*] 跳轉至第 {self.last_crawled_page} 頁...")
                try:
                    page_input = self.driver.find_element(By.ID, "jmpage")
                    self.driver.execute_script(
                        "arguments[0].value = arguments[1];", page_input, str(self.last_crawled_page))
                    self.driver.find_element(By.NAME, "jumpfmt1page").click()
                    time.sleep(3)
                except Exception:
                    print("[!] 跳頁失敗，改從第 1 頁開始。")
                    self.last_crawled_page = 1

        except Exception as e:
            print(f"[錯誤] 搜尋失敗: {e}")
            raise

    def _solve_captcha(self) -> str:
        """OCR the captcha image on the current page.

        The image is converted to greyscale and thresholded at 128 before
        being passed to the OCR engine.

        Returns:
            The recognised text reduced to lower-case alphanumerics, or ``""``
            when the image cannot be found or recognised.
        """
        try:
            img_el = self.driver.find_element(
                By.XPATH, "//img[contains(@src, 'random_validation')]")
            img_bytes = img_el.screenshot_as_png
            img = Image.open(BytesIO(img_bytes)).convert(
                'L').point(lambda p: 255 if p > 128 else 0)
            buf = BytesIO()
            img.save(buf, format="PNG")
            res = self.ocr.classification(buf.getvalue())
            return ''.join(filter(str.isalnum, res)).lower()
        except Exception:
            return ""

    @staticmethod
    def _normalize_record_url(url: str) -> str:
        """Return the ``/record?...`` tail of *url*, or *url* if it has none."""
        match = re.search(r'/record\?.*$', url)
        return match.group(0) if match else url

    def _collect_result_links(self) -> List[Tuple[str, str]]:
        """Return ``(href, title)`` for every result row on the current page."""
        links: List[Tuple[str, str]] = []
        try:
            cells = self.driver.find_elements(
                By.CSS_SELECTOR, "td.tdfmt1-content")
        except Exception:
            return links
        for cell in cells:
            try:
                anchor = cell.find_element(By.CSS_SELECTOR, "a.slink")
                title = anchor.find_element(
                    By.CSS_SELECTOR, "span.etd_d").text
                href = anchor.get_attribute('href')
            except Exception:
                # Cell without a record link: skip it.
                continue
            if href:
                links.append((href, title))
        return links

    def _wait_for_new_download(self, known_files, timeout=60.0,
                               poll_interval=1.0) -> Optional[str]:
        """Wait for a download that was not in the folder before the click.

        Only files absent from *known_files* are considered, so a thesis saved
        earlier can never be mistaken for the current download.

        Args:
            known_files: Names present in ``download_dir`` before the download
                was triggered.
            timeout: Maximum seconds to wait.
            poll_interval: Seconds between directory scans.

        Returns:
            The name of a new ``.pdf``/``.zip`` file once no new partial
            download remains, or ``None`` on timeout.
        """
        deadline = time.monotonic() + timeout
        while True:
            new_files = set(os.listdir(self.download_dir)) - set(known_files)
            still_writing = any(
                name.endswith(self._PARTIAL_SUFFIX) for name in new_files)
            if not still_writing:
                finished = sorted(
                    name for name in new_files
                    if name.lower().endswith(self._FINISHED_SUFFIXES))
                if finished:
                    return finished[0]
            if time.monotonic() >= deadline:
                return None
            time.sleep(poll_interval)

    def _download_current_thesis(self, captcha_attempts=3) -> bool:
        """Download the full text of the record open in the current tab.

        Clicks through to the captcha page, then makes up to
        *captcha_attempts* tries at solving it and fetching the file.

        Returns:
            True when a new file arrived in ``download_dir``.

        Raises:
            Exception: when the full-text links are not available or the page
                cannot be refreshed; the caller reports this as "no access".
        """
        self.wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//a[em[text()='電子全文']]"))).click()
        self.wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//img[@alt='電子全文']/following-sibling::a"))).click()
        time.sleep(2)
        # Continue in the most recently opened window.
        self.driver.switch_to.window(self.driver.window_handles[-1])

        for _attempt in range(captcha_attempts):
            try:
                code = self._solve_captcha()
                if not code:
                    # Reload the page and try again.
                    self.driver.refresh()
                    time.sleep(2)
                    continue

                answer_box = self.driver.find_element(By.ID, "validinput")
                answer_box.clear()
                answer_box.send_keys(code)
                self.driver.find_element(
                    By.XPATH, "//input[@value='我同意']").click()
                self._dismiss_alert()

                # Snapshot the folder before triggering the download so only
                # files created by this click count as success.
                known_files = set(os.listdir(self.download_dir))
                self.wait.until(EC.presence_of_element_located(
                    (By.LINK_TEXT, "下載"))).click()

                if self._wait_for_new_download(known_files):
                    return True
            except Exception:
                self.driver.refresh()
                time.sleep(2)
        return False

    def _record_download(self, norm_url: str) -> None:
        """Count one successful download and append it to the log file."""
        self.session_download_count += 1
        with open(self.log_file, 'a', encoding='utf-8') as f:
            f.write(norm_url + '\n')
        self.downloaded_urls.add(norm_url)

    def _close_extra_tabs(self) -> None:
        """Close every window except the main one, then focus the main one."""
        for handle in list(self.driver.window_handles):
            if handle != self.main_window_handle:
                self.driver.switch_to.window(handle)
                self.driver.close()
        self.driver.switch_to.window(self.main_window_handle)

    def _save_page_progress(self, page: int) -> None:
        """Persist *page* to the progress file (best effort)."""
        try:
            with open(self.page_progress_file, 'w', encoding='utf-8') as f:
                f.write(str(page))
        except OSError as e:
            print(f"[!] 無法寫入頁面進度: {e}")

    def run_download_process(self):
        """Walk the result pages and download every record not yet logged.

        Stops when the per-session limit is reached or when no enabled
        "next page" button can be clicked.
        """
        print("\n[步驟 3] 開始下載...")
        if not self.main_window_handle:
            self.main_window_handle = self.driver.current_window_handle
        page = self.last_crawled_page

        while True:
            if self.session_download_count >= self.max_downloads_per_session:
                print("達下載上限。")
                break

            print(f"\n--- 第 {page} 頁 ---")
            self.driver.switch_to.window(self.main_window_handle)

            for url, title in self._collect_result_links():
                if self.session_download_count >= self.max_downloads_per_session:
                    break

                norm_url = self._normalize_record_url(url)
                if norm_url in self.downloaded_urls:
                    print(f"  [跳過] {title}")
                    continue

                print(f"  [處理] {title}")
                try:
                    self.driver.switch_to.new_window('tab')
                    self.driver.get(url)
                    if self._download_current_thesis():
                        print("    V 下載成功")
                        self._record_download(norm_url)
                    else:
                        print("    X 下載未完成 (驗證碼失敗或逾時)")
                except Exception:
                    print("    X 無法下載或無權限")
                finally:
                    # Always return to a single main window, even on failure.
                    self._close_extra_tabs()
                time.sleep(random.uniform(*self.inter_article_sleep_range))

            self._save_page_progress(page)
            if self.session_download_count >= self.max_downloads_per_session:
                # Skip the pointless page turn; the loop head reports and exits.
                continue

            # The selector only matches a "next" image button whose src has no
            # underscore.
            try:
                next_btn = self.driver.find_element(
                    By.CSS_SELECTOR, 'input[name="gonext"][type="image"]:not([src*="_"])')
                next_btn.click()
            except Exception:
                print("沒有下一頁了。")
                break
            page += 1
            time.sleep(random.uniform(*self.inter_page_sleep_range))

    def run(self):
        """Run the whole workflow and always quit the browser afterwards."""
        try:
            self._setup_driver()
            self.wait_for_manual_login()
            self.run_search()
            self.run_download_process()
        finally:
            if self.driver:
                self.driver.quit()


if __name__ == "__main__":
    # Before running: make sure the phone hotspot is on and the IP address
    # has been changed.
    search_keyword = "精神疾病 家庭動力"
    downloader = BaseThesisDownloader(keyword=search_keyword)
    downloader.run()



精神疾病 + 家庭動力

Bowen + 自我分化 + 台灣

結構派 + 家庭治療 + 界限


心理衛生社工 + 社會安全網

多重問題家庭 + 處遇 (或 多重需求家庭)

高風險家庭 + 系統觀點

社區精神復健 + 家庭工作


家族治療 + 本土化

華人家庭 + 溝通模式

家庭界限 + 文化

精神障礙者 + 照顧負荷 + 家庭韌性


