# In [None]:

import os
import time
import re
import random
import zipfile
import shutil
import math
import json
import logging
from typing import List, Set, Optional, Tuple
from urllib.parse import quote
from io import BytesIO

import ddddocr
from PIL import Image
import undetected_chromedriver as uc  # 抗封鎖核心驅動
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    NoAlertPresentException,
    WebDriverException,
    UnexpectedAlertPresentException  # 已補上：防止警告視窗導致崩潰
)

# Base directory for every output path used below. This is the current working
# directory at the time this statement runs (os.getcwd()), which is not
# necessarily the directory that contains this file.
BASE_DIR = os.getcwd()

class BaseThesisDownloader:
    """
    [強健版下載器] 
    整合自動清除 Cookie、智慧冷卻與抗封鎖機制。
    """
    def __init__(self,
                 keyword: str,
                 download_dir: str = "downloaded_theses",
                 log_file: str = "download_log.txt",
                 page_progress_file: str = "page_progress.txt",
                 max_downloads_per_session: int = 70,
                 items_per_page: int = 10,
                 inter_article_sleep_range: Tuple[float, float] = (10.0, 20.0),
                 inter_page_sleep_range: Tuple[float, float] = (20.0, 45.0)
                 ):
        """Store the run configuration, restore saved progress and build the OCR engine.

        Args:
            keyword: Search phrase typed into the site's search box.
            download_dir: Folder (joined onto BASE_DIR) that receives downloads.
            log_file: File (joined onto BASE_DIR) listing downloaded record URLs.
            page_progress_file: File (joined onto BASE_DIR) holding the page
                number to resume from.
            max_downloads_per_session: Download cap for a single run.
            items_per_page: Results per page, used to derive the page count.
            inter_article_sleep_range: (min, max) seconds to pause after each
                article.
            inter_page_sleep_range: (min, max) seconds to pause after moving to
                the next result page.
        """
        def under_base(relative_name: str) -> str:
            # All output locations are resolved against BASE_DIR.
            return os.path.join(BASE_DIR, relative_name)

        # Search target.
        self.keyword = keyword
        self.base_url = "https://ndltd.ncl.edu.tw/cgi-bin/gs32/gsweb.cgi/login?o=dwebmge"

        # Output locations.
        self.download_dir = under_base(download_dir)
        self.log_file = under_base(log_file)
        self.page_progress_file = under_base(page_progress_file)

        # Limits and throttling.
        self.max_downloads_per_session = max_downloads_per_session
        self.items_per_page = items_per_page
        self.inter_article_sleep_range = inter_article_sleep_range
        self.inter_page_sleep_range = inter_page_sleep_range

        # Browser state, populated later by _setup_driver / run_download_process.
        self.driver = None
        self.wait = None
        self.main_window_handle = None

        # Per-run counters.
        self.session_download_count = 0
        self.total_pages = 0

        # Resume information from earlier runs (reads the two files set above).
        previously_downloaded, resume_page = self._load_log()
        self.downloaded_urls = previously_downloaded
        self.last_crawled_page = resume_page

        print("[-] 正在初始化 ddddocr 引擎...")
        self.ocr = ddddocr.DdddOcr(show_ad=False)
        print("[*] ddddocr 引擎初始化完成。")
        print(f"[*] 本次執行最大下載量設定為: {self.max_downloads_per_session} 篇")

    def _normalize_url(self, url: str) -> Optional[str]:
        if not isinstance(url, str): return None
        match = re.search(r'/record\?.*$', url)
        return match.group(0) if match else None

    def _setup_driver(self):
        """Create the download folder and start an undetected-chromedriver session.

        Sets ``self.driver`` and ``self.wait`` (a 20-second WebDriverWait).

        Raises:
            Exception: whatever browser start-up raised, re-raised after the
                error message has been printed.
        """
        print("[-] 設定 Selenium WebDriver (Anti-Ban Mode)...")
        os.makedirs(self.download_dir, exist_ok=True)

        # Chrome preferences: save into our folder without prompting, and
        # download PDFs instead of opening them in the built-in viewer.
        chrome_prefs = {
            "download.default_directory": self.download_dir,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "plugins.always_open_pdf_externally": True,
            "profile.default_content_settings.popups": 0,
        }

        chrome_options = uc.ChromeOptions()
        chrome_options.add_experimental_option("prefs", chrome_prefs)
        for switch in ("--no-first-run", "--password-store=basic"):
            chrome_options.add_argument(switch)
        # Headless mode ("--headless=new") is deliberately not enabled: the
        # login step in this class is performed by hand in the browser window.

        try:
            self.driver = uc.Chrome(options=chrome_options, use_subprocess=True)
            self.driver.set_window_size(1366, 768)

            # Also set the download location through the DevTools protocol,
            # in addition to the preferences above.
            download_behavior = {
                "behavior": "allow",
                "downloadPath": self.download_dir
            }
            self.driver.execute_cdp_cmd("Page.setDownloadBehavior", download_behavior)

        except Exception as e:
            print(f"[錯誤] WebDriver 初始化失敗: {e}")
            raise

        self.wait = WebDriverWait(self.driver, 20)

    def _load_log(self) -> Tuple[Set[str], int]:
        urls, last_page = set(), 1
        try:
            with open(self.log_file, 'r', encoding='utf-8') as f:
                urls = {self._normalize_url(line.strip()) for line in f if line.strip() and self._normalize_url(line.strip())}
            print(f"[*] 已從 {self.log_file} 載入 {len(urls)} 筆有效紀錄。")
        except FileNotFoundError:
            print(f"[*] 未找到下載紀錄檔 {self.log_file}，將會從頭開始下載。")
        
        try:
            with open(self.page_progress_file, 'r', encoding='utf-8') as f:
                content = f.read().strip()
                if content.isdigit():
                    last_page = int(content)
                    print(f"[*] 已從 {self.page_progress_file} 載入上次爬取進度：從第 {last_page} 頁開始。")
                else:
                    print(f"[*] {self.page_progress_file} 內容無效，將從第 1 頁開始。")
        except FileNotFoundError:
            print(f"[*] 未找到頁數進度檔，將從第 1 頁開始。")
        return urls, last_page

    def _log_download(self, url: str):
        normalized_url = self._normalize_url(url)
        if not normalized_url: return
        with open(self.log_file, 'a', encoding='utf-8') as f: f.write(normalized_url + '\n')
        self.downloaded_urls.add(normalized_url)
        self.session_download_count += 1
        print(f"      - [計數] 本次執行已下載 {self.session_download_count}/{self.max_downloads_per_session} 篇。")

    def _log_progress(self, page_num: int):
        try:
            with open(self.page_progress_file, 'w', encoding='utf-8') as f: f.write(str(page_num))
        except Exception as e: print(f"[錯誤] 記錄頁數進度時發生錯誤: {e}")

    # =================================================================
    # ★★★ Robust 改良版登入函式 ★★★
    # =================================================================
    def wait_for_manual_login(self):
        """Open the login page and block until the user has logged in by hand.

        The page is polled about once per second, for up to 15 minutes, for
        the logout link that marks a successful login. Site alerts (for
        example a wrong-password warning) are closed automatically, followed
        by a 5-second cool-down before polling resumes.

        Raises:
            RuntimeError: if an alert message contains one of the ban/lockout
                keywords, or if no login is detected within 15 minutes.
        """
        print("\n[步驟 1] 準備登入程序...")

        # Start from a clean session: load the site, drop old cookies, reload.
        try:
            self.driver.get(self.base_url)
            self.driver.delete_all_cookies()
            print("[-] 已清除瀏覽器 Cookie，防止舊 Session 導致的錯誤。")
            self.driver.refresh()
        except Exception as e:
            print(f"[!] 清除 Cookie 時遇到輕微錯誤 (不影響執行): {e}")

        print("\n" + "="*60)
        print("★★★ Robust 安全登入模式 ★★★")
        print("1. 請手動輸入帳號密碼與驗證碼。")
        print("2. 若出現「帳號密碼錯誤」警告，程式會自動關閉它。")
        print("3. 【重要】警告關閉後會強制冷卻 5 秒，請耐心等待倒數結束再輸入。")
        print("="*60 + "\n")

        # Alert texts containing any of these mean the account/IP is blocked.
        critical_keywords = ("封鎖", "停用", "次數過多", "banned", "Access Denied")

        # Allow 15 minutes for the manual login.
        end_time = time.time() + 900
        last_error = None

        while time.time() < end_time:
            try:
                # A visible logout link means the login succeeded.
                logout_btn = self.driver.find_elements(By.XPATH, "//div[@class='user_area']//a[text()='登出']")
                if logout_btn:
                    print("\n[*] 偵測到登出按鈕，登入成功！")
                    return

                time.sleep(1)

            except UnexpectedAlertPresentException as alert_exc:
                # The driver may already have dismissed the prompt when it
                # raised; in that case the text is only on the exception.
                msg = getattr(alert_exc, "alert_text", None) or ""
                try:
                    alert = self.driver.switch_to.alert
                    msg = alert.text or msg
                    alert.accept()
                except Exception:
                    pass  # alert is already gone; keep whatever text we have

                print(f"\n[警告] 網站回傳訊息：{msg}")

                # The ban check is done outside the best-effort block above so
                # that this exception really stops the program.
                if any(k in msg for k in critical_keywords):
                    raise RuntimeError(f"!!! 嚴重警報 !!! 偵測到封鎖關鍵字，程式停止以保護帳號。訊息：{msg}")

                # Forced cool-down before the user retries.
                print("      -> 正在執行強制冷卻 (5秒)... 請暫停操作。")
                for i in range(5, 0, -1):
                    print(f"         倒數 {i} 秒...", end="\r")
                    time.sleep(1)
                print("      -> 冷卻結束，請現在重新輸入帳密。                  ")

            except Exception as e:
                # Unexpected driver error: report each distinct message once
                # and pause, so a broken session does not spin the CPU.
                error_text = str(e)
                if error_text != last_error:
                    print(f"[!] 檢查登入狀態時發生錯誤，將持續重試: {error_text}")
                    last_error = error_text
                time.sleep(1)

        raise RuntimeError("手動登入逾時 (已超過 15 分鐘)。")

    def run_search(self):
        """Submit the keyword search and position the browser on the resume page.

        Steps:
            1. Open the basic-search page and tick every unchecked checkbox.
            2. Type ``self.keyword`` and submit.
            3. Try to read the total hit count and derive ``self.total_pages``
               (left at 0 when it cannot be read).
            4. If ``self.last_crawled_page`` is greater than 1, jump to that
               page; on failure reset ``self.last_crawled_page`` to 1.

        Raises:
            TimeoutException: when the search box, the search button or another
                required element does not appear in time.
        """
        print("\n[步驟 2] 執行關鍵字搜尋...")
        self.driver.get("https://ndltd.ncl.edu.tw/cgi-bin/gs32/gsweb.cgi/ccd=20_UgG/search?mode=basic")
        try:
            # Make sure every checkbox on the search form is ticked.
            self.driver.execute_script("var inputs=document.querySelectorAll('input[type=\"checkbox\"]'); for(var i=0;i<inputs.length;i++){if(!inputs[i].checked){inputs[i].click();}}")

            search_box = self.wait.until(EC.presence_of_element_located((By.ID, "ysearchinput0")))
            search_box.clear()
            search_box.send_keys(self.keyword)
            search_button = self.wait.until(EC.element_to_be_clickable((By.ID, "gs32search")))
            search_button.click()
            print(f"[*] 已成功提交搜尋，關鍵字為: '{self.keyword}'")

            # Best effort: the page count is only used to validate the resume
            # page, so any failure here falls back to 0 ("unknown").
            try:
                print("[-] 正在等待總筆數資訊載入...")
                summary_container = self.wait.until(EC.visibility_of_element_located((By.XPATH, "//td[@headers='start' and contains(., '檢索結果共')]")))
                match = re.search(r'檢索結果共\s*(\d+)\s*筆資料', summary_container.text)
                if not match:
                    raise ValueError("result summary does not contain a total count")
                total_items = int(match.group(1))
                self.total_pages = math.ceil(total_items / self.items_per_page)
                print(f"[*] 成功解析總筆數: {total_items} 筆，總頁數: {self.total_pages} 頁。")
            except Exception:
                # `Exception`, not a bare except, so Ctrl-C is not swallowed
                # during the wait.
                print("[警告] 未能解析總筆數，將依賴「下一頁」按鈕判斷。")
                self.total_pages = 0

            # Resume: jump straight to the page saved by the previous run.
            page_to_start = self.last_crawled_page
            if page_to_start > 1:
                if self.total_pages == 0 or page_to_start <= self.total_pages:
                    print(f"[*] 嘗試跳轉到第 {page_to_start} 頁...")
                    try:
                        jmpage_input = self.wait.until(EC.visibility_of_element_located((By.ID, "jmpage")))
                        self.driver.execute_script("arguments[0].value = arguments[1];", jmpage_input, str(page_to_start))
                        jump_button = self.wait.until(EC.element_to_be_clickable((By.NAME, "jumpfmt1page")))
                        # The old <html> element going stale signals that the
                        # new page has replaced the current one.
                        old_html = self.driver.find_element(By.TAG_NAME, 'html')
                        jump_button.click()
                        self.wait.until(EC.staleness_of(old_html))
                        print(f"[*] 成功跳轉到第 {page_to_start} 頁。")
                        time.sleep(3)
                    except Exception as e:
                        print(f"[錯誤] 跳頁失敗: {e}，從頭開始。")
                        self.last_crawled_page = 1
                else:
                    # Saved page lies beyond the current result set.
                    self.last_crawled_page = 1
        except TimeoutException:
            print("[錯誤] 搜尋頁面逾時。")
            raise

    def _sanitize_filename(self, name: str) -> str:
        sanitized_name = re.sub(r'[\\/*?:"<>|]', "", name)
        return sanitized_name.strip()[:150]

    def _parse_article_links(self) -> List[Tuple[str, str]]:
        """Collect ``(url, title)`` pairs from the result page currently shown.

        Waits for at least one ``td.tdfmt1-content`` cell, then reads the
        ``a.slink`` link and its ``span.etd_d`` title from each cell. Cells
        missing either element are skipped, as are entries with an empty URL
        or title. On a wait timeout a warning is printed and whatever was
        collected so far is returned.
        """
        cell_locator = (By.CSS_SELECTOR, "td.tdfmt1-content")
        collected: List[Tuple[str, str]] = []
        try:
            self.wait.until(EC.presence_of_element_located(cell_locator))
            for cell in self.driver.find_elements(*cell_locator):
                try:
                    anchor = cell.find_element(By.CSS_SELECTOR, "a.slink")
                    title_node = anchor.find_element(By.CSS_SELECTOR, "span.etd_d")
                    href = anchor.get_attribute('href')
                    title_text = title_node.text
                    if href and title_text:
                        collected.append((href, title_text))
                except NoSuchElementException:
                    continue
        except TimeoutException:
            print("[警告] 等待論文連結載入逾時。")
        return collected

    def _wait_for_download_complete(self, timeout: int = 180) -> Optional[str]:
        print("      - 自動監控下載中...", end="")
        seconds, initial_dl_files = 0, set(os.listdir(self.download_dir))
        while seconds < timeout:
            new_files = set(os.listdir(self.download_dir)) - initial_dl_files
            if new_files:
                candidates = [f for f in new_files if not f.endswith('.crdownload')]
                if candidates:
                    new_file_name = candidates[0]
                    full_path = os.path.join(self.download_dir, new_file_name)
                    time.sleep(1) 
                    print(f" 下載完成: {new_file_name}")
                    return full_path
            time.sleep(1)
            seconds += 1
            if seconds % 10 == 0: print(".", end="", flush=True)
        print("\n      - [錯誤] 等待下載逾時。")
        return None

    def _preprocess_captcha_image(self, image_bytes: bytes) -> bytes:
        try:
            img = Image.open(BytesIO(image_bytes)).convert('L').point(lambda p: 255 if p > 128 else 0)
            buffered = BytesIO()
            img.save(buffered, format="PNG")
            return buffered.getvalue()
        except: return image_bytes

    def _solve_captcha_with_ddddocr(self, captcha_element) -> str:
        try:
            res = self.ocr.classification(self._preprocess_captcha_image(captcha_element.screenshot_as_png))
            res_cleaned = ''.join(filter(str.isalnum, res)).lower()
            print(f"      - ddddocr: '{res_cleaned}'")
            if 4 <= len(res_cleaned) <= 6: return res_cleaned
            return ""
        except: return ""

    def _unzip_and_cleanup(self, file_path: str, new_name_base: str):
        if not file_path.lower().endswith('.zip'): return
        new_pdf_name = f"{new_name_base}.pdf"
        dest_pdf_path = os.path.join(self.download_dir, new_pdf_name)
        print(f"      - 解壓縮並改名為: {new_pdf_name}")
        try:
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                pdfs = [n for n in zip_ref.namelist() if n.lower().endswith('.pdf')]
                if not pdfs: return
                with zip_ref.open(pdfs[0]) as source, open(dest_pdf_path, 'wb') as target:
                    shutil.copyfileobj(source, target)
            os.remove(file_path)
        except Exception as e: print(f"      - [錯誤] 解壓失敗: {e}")

    def _handle_alert_if_present(self) -> bool:
        try:
            alert = self.driver.switch_to.alert
            alert.accept()
            print("      - 警告視窗已關閉。")
            return True
        except NoAlertPresentException: return False
        except UnexpectedAlertPresentException: # 額外防護
            try:
                self.driver.switch_to.alert.accept()
                return True
            except: return False

    def _process_article_in_new_tab(self, article_url: str, article_title: str):
        """Open one article in a new tab and try to download its full text.

        Flow: open the record page, follow the two full-text links, then up
        to three times solve the captcha, press the agree button, click the
        download link and wait for the file. A finished download is logged
        and then renamed/unzipped after the article title.

        Whatever happens, every extra tab is closed, focus returns to the main
        window and the method sleeps for a random time taken from
        ``self.inter_article_sleep_range``.

        Args:
            article_url: URL of the article's record page.
            article_title: Title used for log output and the final file name.
        """
        print(f"    - 正在處理: {article_title}")
        self.driver.switch_to.new_window('tab')
        self.driver.get(article_url)
        MAX_RETRIES = 3
        try:
            self.wait.until(EC.element_to_be_clickable((By.XPATH, "//a[em[text()='電子全文']]"))).click()
            self.wait.until(EC.element_to_be_clickable((By.XPATH, "//img[@alt='電子全文']/following-sibling::a[@title='電子全文']"))).click()
            time.sleep(random.uniform(1.5, 3.0))
            # Continue in the most recently opened window.
            self.driver.switch_to.window(self.driver.window_handles[-1])

            for i in range(MAX_RETRIES):
                print(f"      - 嘗試下載 (第 {i + 1}/{MAX_RETRIES} 次)...")
                try:
                    captcha_img = self.wait.until(EC.presence_of_element_located((By.XPATH, "//img[contains(@src, 'random_validation')]")))
                    captcha_text = self._solve_captcha_with_ddddocr(captcha_img)
                    if not captcha_text:
                        # OCR gave nothing usable: reload for a new captcha.
                        self.driver.refresh()
                        time.sleep(2)
                        continue

                    input_box = self.driver.find_element(By.ID, "validinput")
                    input_box.clear()
                    input_box.send_keys(captcha_text)
                    time.sleep(1)

                    self.driver.find_element(By.XPATH, "//input[@value='我同意']").click()
                    time.sleep(1.5)

                    if self._handle_alert_if_present():
                        # An alert after submitting: reload and try again.
                        self.driver.refresh()
                        time.sleep(2)
                        continue

                    self.wait.until(EC.presence_of_element_located((By.LINK_TEXT, "下載"))).click()

                    newly_downloaded_file = self._wait_for_download_complete()
                except Exception as e:
                    print(f"      - [重試 {i+1} 錯誤] {e}")
                    self._handle_alert_if_present()
                    if i < MAX_RETRIES - 1:
                        self.driver.refresh()
                        time.sleep(3)
                    else:
                        print("      - 放棄此論文。")
                    continue

                if not newly_downloaded_file:
                    print("      - [警告] 下載逾時。")
                    break

                # The file is on disk. Post-processing happens outside the
                # retry handler so that a rename/unzip problem cannot trigger
                # a second download of the same thesis.
                self._log_download(article_url)
                self._finalize_download(newly_downloaded_file, article_title)
                return
        except TimeoutException:
            print("      - [提示] 無法找到電子全文按鈕或下載失敗。")
        except Exception as e:
            print(f"      - [錯誤] {e}")
        finally:
            # Close every extra window, last one first, then go back to the
            # main window.
            while len(self.driver.window_handles) > 1:
                self.driver.switch_to.window(self.driver.window_handles[-1])
                self.driver.close()
            self.driver.switch_to.window(self.main_window_handle)
            time.sleep(random.uniform(*self.inter_article_sleep_range))

    def _finalize_download(self, downloaded_path: str, article_title: str):
        """Rename a downloaded PDF, or unpack a downloaded zip, after its title."""
        sanitized_title = self._sanitize_filename(article_title)
        lowered = downloaded_path.lower()
        try:
            if lowered.endswith(".zip"):
                self._unzip_and_cleanup(downloaded_path, sanitized_title)
            elif lowered.endswith(".pdf"):
                new_pdf_path = os.path.join(self.download_dir, f"{sanitized_title}.pdf")
                print(f"      - 改名為: {sanitized_title}.pdf")
                if os.path.exists(new_pdf_path):
                    # Keep the existing file; add a timestamp to the new one.
                    base, ext = os.path.splitext(new_pdf_path)
                    new_pdf_path = f"{base}_{int(time.time())}{ext}"
                os.rename(downloaded_path, new_pdf_path)
        except OSError as e:
            print(f"      - [錯誤] 檔案整理失敗，保留原始檔案: {e}")

    def run_download_process(self):
        print("\n[步驟 3] 執行下載流程...")
        if not self.main_window_handle: self.main_window_handle = self.driver.current_window_handle
        page_num = self.last_crawled_page
        while True:
            if self.session_download_count >= self.max_downloads_per_session:
                print("已達下載上限。")
                self._log_progress(page_num)
                break
            
            print(f"\n--- 第 {page_num} 頁 ---")
            self.driver.switch_to.window(self.main_window_handle)
            try: self.wait.until(EC.presence_of_element_located((By.ID, "tablefmt1")))
            except TimeoutException: break
            
            article_urls = self._parse_article_links()
            print(f"[*] 本頁找到 {len(article_urls)} 篇。")
            
            for url, title in article_urls:
                if self.session_download_count >= self.max_downloads_per_session: break
                normalized = self._normalize_url(url)
                if not normalized: continue
                if normalized in self.downloaded_urls:
                    print(f"    - [跳過] 已下載: {title}")
                    continue
                self._process_article_in_new_tab(url, title)
            
            self._log_progress(page_num)
            try:
                next_btn = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'input[name="gonext"][type="image"]:not([src*="_"])')))
                self.driver.execute_script("arguments[0].click();", next_btn)
                page_num += 1
                time.sleep(random.uniform(*self.inter_page_sleep_range))
            except:
                print("沒有下一頁了。")
                break

    def run(self):
        try:
            self._setup_driver()
            self.wait_for_manual_login()
            self.run_search()
            self.run_download_process()
        finally:
            self.close()

    def close(self):
        if self.driver:
            self.driver.quit()
            self.driver = None

class ThesisDownloaderWithReadme(BaseThesisDownloader):
    """Downloader that additionally keeps a README.md listing of PDFs.

    Before the login step, every ``.pdf`` already present in the download
    folder and not yet mentioned in README.md is appended to it.
    """

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as BaseThesisDownloader."""
        super().__init__(*args, **kwargs)
        self.readme_file = os.path.join(BASE_DIR, "README.md")
        self.readme_handle = None

    def _initialize_readme(self):
        """Open README.md for reading and appending; add a title if it is empty."""
        if self.readme_handle:
            return  # already open; do not leak a second handle
        try:
            # 'a+' creates the file when missing and always writes at the end.
            self.readme_handle = open(self.readme_file, 'a+', encoding='utf-8')
            self.readme_handle.seek(0)
            if not self.readme_handle.read(1):
                self.readme_handle.write(f"# 下載清單 - {self.keyword}\n\n")
        except Exception as e:
            # The README is optional: report the problem and carry on.
            print(f"[警告] 無法初始化 README 檔案: {e}")

    def _consolidate_existing_pdfs(self):
        """Append PDFs found in the download folder that README.md does not mention."""
        print("\n[統整模式] 檢查已存在的 PDF...")
        if not self.readme_handle:
            return
        try:
            # Sorted so the entries are appended in a stable order.
            existing = sorted(f for f in os.listdir(self.download_dir) if f.endswith('.pdf'))
            self.readme_handle.seek(0)
            content = self.readme_handle.read()
            for f in existing:
                if f not in content:
                    self.readme_handle.write(f"* {f} (已存在)\n")
            self.readme_handle.flush()
        except Exception as e:
            print(f"[警告] 統整既有 PDF 時發生錯誤: {e}")

    def _setup_driver(self):
        """Start the browser, then open the README file."""
        super()._setup_driver()
        self._initialize_readme()

    def close(self):
        """Quit the browser and close the README handle, even if quitting fails."""
        try:
            super().close()
        finally:
            if self.readme_handle:
                self.readme_handle.close()
                self.readme_handle = None

    def run(self):
        """Run the pipeline with the README consolidation step before login."""
        try:
            self._setup_driver()
            self._consolidate_existing_pdfs()
            self.wait_for_manual_login()
            self.run_search()
            self.run_download_process()
        finally:
            self.close()

if __name__ == "__main__":
    # Tip: if an earlier run got blocked, restart the router to obtain a new
    # IP address before running this again.
    downloader = ThesisDownloaderWithReadme(
        keyword="思覺失調症 家族治療",
        max_downloads_per_session=200
    )
    downloader.run()


# 思覺失調症 + 家族治療
#
# 精神疾病 + 家庭動力
#
# Bowen + 自我分化 + 台灣
#
# 結構派 + 家庭治療 + 界限
#
#
# 心理衛生社工 + 社會安全網
#
# 多重問題家庭 + 處遇 (或 多重需求家庭)
#
# 高風險家庭 + 系統觀點
#
# 社區精神復健 + 家庭工作
#
#
# 家族治療 + 本土化
#
# 華人家庭 + 溝通模式
#
# 家庭界限 + 文化
#
# 精神障礙者 + 照顧負荷 + 家庭韌性


