# In [None]:
import os
import time
import re
import random
import logging
from typing import Set, Tuple, Optional
from io import BytesIO

# 第三方套件
import ddddocr
from PIL import Image
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    NoAlertPresentException,
    WebDriverException
)

# --- Configuration ---
# All paths below are relative; the code that uses them resolves them against
# the current working directory at run time.
LOG_FILE = "download_log.txt"       # One URL per line for each finished download (used to skip duplicates)
README_FILE = "README.md"           # Human-readable list of downloaded theses
PROGRESS_FILE = "page_progress.txt"  # Result-page number to resume from on the next run
DOWNLOAD_DIR = "downloaded_theses"  # Folder that receives the downloaded files

# Configure the root logger at import time: INFO level, timestamped lines,
# written both to "crawler.log" (UTF-8) and to the console stream.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    datefmt='%H:%M:%S',
    handlers=[
        logging.FileHandler("crawler.log", encoding='utf-8'),
        logging.StreamHandler()
    ]
)


class RobustThesisDownloader:
    """Search NDLTD for a keyword and download the full text of each result.

    The crawler drives a Chrome instance through ``undetected_chromedriver``,
    waits for a manual login, runs a basic search, then walks the result
    pages. Finished downloads are appended to ``LOG_FILE`` and
    ``README_FILE``; the page to resume from is stored in ``PROGRESS_FILE``.

    Args:
        keyword: Search term typed into the basic-search box.
        max_downloads: Maximum number of successful downloads in this run.
    """

    # File-name suffixes that mark a download which is still being written.
    _PARTIAL_SUFFIXES = ('.crdownload', '.tmp')

    def __init__(self, keyword: str, max_downloads: int = 1000):
        self.keyword = keyword
        self.max_downloads = max_downloads
        self.download_dir = os.path.join(os.getcwd(), DOWNLOAD_DIR)

        # Pacing parameters (seconds unless noted otherwise).
        self.short_delay = (5, 8)       # pause after turning a page
        self.article_delay = (15, 25)   # pause after each download
        self.coffee_break_interval = 15  # long pause every N downloads
        self.coffee_break_duration = (120, 200)  # length of the long pause

        self.ocr = ddddocr.DdddOcr(show_ad=False)
        self.driver = None
        self.wait = None

        # Restore state persisted by earlier runs.
        self.downloaded_urls = self._load_log()
        self.start_page = self._load_progress()
        self.session_count = 0

        os.makedirs(self.download_dir, exist_ok=True)
        self._init_readme()

    # ------------------------------------------------------------------
    # Persistent state
    # ------------------------------------------------------------------
    def _load_log(self) -> Set[str]:
        """Return the set of URLs recorded in ``LOG_FILE`` (empty if absent)."""
        if not os.path.exists(LOG_FILE):
            return set()
        with open(LOG_FILE, 'r', encoding='utf-8') as f:
            return {line.strip() for line in f if line.strip()}

    def _load_progress(self) -> int:
        """Return the page number stored in ``PROGRESS_FILE``.

        Falls back to page 1 when the file is missing, does not contain an
        integer, or contains a value below 1.
        """
        if not os.path.exists(PROGRESS_FILE):
            return 1
        try:
            with open(PROGRESS_FILE, 'r', encoding='utf-8') as f:
                page = int(f.read().strip())
        except ValueError:
            logging.warning(f"[警告] 進度檔 {PROGRESS_FILE} 內容無效，將從第 1 頁開始。")
            return 1
        if page < 1:
            logging.warning(f"[警告] 進度檔 {PROGRESS_FILE} 內容無效，將從第 1 頁開始。")
            return 1
        logging.info(f"[*] 發現進度檔，將從第 {page} 頁開始續傳。")
        return page

    def _save_progress(self, page_num: int):
        """Overwrite ``PROGRESS_FILE`` with ``page_num``."""
        with open(PROGRESS_FILE, 'w', encoding='utf-8') as f:
            f.write(str(page_num))

    def _init_readme(self):
        """Create ``README_FILE`` with a title line if it does not exist yet."""
        if not os.path.exists(README_FILE):
            with open(README_FILE, 'w', encoding='utf-8') as f:
                f.write(f"# {self.keyword} - 下載清單\n\n")

    # ------------------------------------------------------------------
    # Browser lifecycle
    # ------------------------------------------------------------------
    def _init_driver(self):
        """Start Chrome and create the shared 20-second ``WebDriverWait``."""
        logging.info("[-] 啟動 undetected_chromedriver (抗偵測模式)...")
        options = uc.ChromeOptions()
        prefs = {
            "download.default_directory": self.download_dir,
            "download.prompt_for_download": False,
            "plugins.always_open_pdf_externally": True,
            "profile.default_content_settings.popups": 0,
        }
        options.add_experimental_option("prefs", prefs)
        options.add_argument("--no-first-run")
        # Headless mode is intentionally left off: the login step is manual.

        self.driver = uc.Chrome(options=options, use_subprocess=True)
        self.driver.set_window_size(1280, 960)
        self.wait = WebDriverWait(self.driver, 20)

    def _shutdown_driver(self):
        """Quit the browser if one was started; safe to call more than once."""
        if self.driver is None:
            return
        try:
            self.driver.quit()
        except (WebDriverException, OSError) as e:
            logging.warning(f"[警告] 關閉瀏覽器時發生錯誤: {e}")
        finally:
            self.driver = None
            self.wait = None

    def _human_sleep(self, range_tuple: Tuple[int, int]):
        """Sleep for a random duration drawn uniformly from ``range_tuple``."""
        time.sleep(random.uniform(*range_tuple))

    # ------------------------------------------------------------------
    # Window helpers
    # ------------------------------------------------------------------
    def _switch_to_new_window(self, known_handles: Set[str]) -> bool:
        """Switch to a window whose handle is not in ``known_handles``.

        Returns True when such a window exists. Handles are compared by
        identity rather than by list position because the order of
        ``window_handles`` is not guaranteed.
        """
        fresh = [h for h in self.driver.window_handles
                 if h not in known_handles]
        if not fresh:
            return False
        self.driver.switch_to.window(fresh[-1])
        return True

    def _close_extra_windows(self, main_handle: str):
        """Close every window except ``main_handle`` and switch back to it."""
        for handle in self.driver.window_handles:
            if handle != main_handle:
                self.driver.switch_to.window(handle)
                self.driver.close()
        self.driver.switch_to.window(main_handle)

    # ------------------------------------------------------------------
    # Download detection
    # ------------------------------------------------------------------
    def _completed_files(self) -> Set[str]:
        """Return names in the download folder that are not partial files."""
        return {
            name for name in os.listdir(self.download_dir)
            if not name.endswith(self._PARTIAL_SUFFIXES)
        }

    def _wait_for_download(self, before: Set[str], timeout: float = 60.0,
                           poll: float = 1.0) -> Optional[str]:
        """Wait until a completed file that is not in ``before`` appears.

        Args:
            before: Completed file names that existed before the download
                was triggered; these are ignored.
            timeout: Maximum number of seconds to wait.
            poll: Seconds to sleep between directory scans.

        Returns:
            The name of a newly completed file, or None on timeout. The
            folder is always scanned at least once, even with ``timeout=0``.
        """
        deadline = time.monotonic() + timeout
        while True:
            fresh = self._completed_files() - before
            if fresh:
                return sorted(fresh)[0]
            if time.monotonic() >= deadline:
                return None
            time.sleep(poll)

    # ------------------------------------------------------------------
    # Single-thesis workflow
    # ------------------------------------------------------------------
    def _solve_captcha(self) -> str:
        """OCR the captcha image and return its lower-cased alphanumerics.

        Returns an empty string when the image cannot be located or read;
        the failure is logged rather than raised so the caller can retry.
        """
        try:
            img_el = self.wait.until(EC.presence_of_element_located(
                (By.XPATH, "//img[contains(@src, 'random_validation')]")
            ))
            img = Image.open(BytesIO(img_el.screenshot_as_png))
            res = self.ocr.classification(img)
            return ''.join(filter(str.isalnum, res)).lower()
        except Exception as e:
            logging.warning(f"    [驗證碼] 辨識失敗: {e}")
            return ""

    def _open_fulltext_page(self) -> bool:
        """Click through to the full-text agreement page.

        Returns False when either link does not become clickable within the
        wait timeout (no permission or no electronic full text).
        """
        try:
            known = set(self.driver.window_handles)
            self.wait.until(EC.element_to_be_clickable(
                (By.XPATH, "//a[em[text()='電子全文']]"))).click()
            # The first click may open a popup; continue there if it did.
            self._switch_to_new_window(known)

            self.wait.until(EC.element_to_be_clickable(
                (By.XPATH,
                 "//img[@alt='電子全文']/following-sibling::a[@title='電子全文']")
            )).click()
        except TimeoutException:
            return False
        return True

    def _download_with_captcha(self, before: Set[str]) -> bool:
        """Solve the captcha (up to 3 tries) and wait for the file to arrive.

        Args:
            before: Completed files present before this thesis was opened.

        Returns:
            True once a new completed file shows up in the download folder.
        """
        for _ in range(3):
            code = self._solve_captcha()
            if not code:
                self.driver.refresh()
                time.sleep(2)
                continue

            try:
                box = self.driver.find_element(By.ID, "validinput")
                box.clear()
                box.send_keys(code)
                self.driver.find_element(
                    By.XPATH, "//input[@value='我同意']").click()

                try:
                    # An alert here means the captcha was rejected.
                    self.driver.switch_to.alert.accept()
                    time.sleep(2)
                    self.driver.refresh()
                    continue
                except NoAlertPresentException:
                    pass

                self.wait.until(EC.element_to_be_clickable(
                    (By.LINK_TEXT, "下載"))).click()

                if self._wait_for_download(before) is not None:
                    return True
            except Exception as e:
                logging.warning(f"    [重試] 驗證碼流程發生錯誤: {e}")
                self.driver.refresh()
                time.sleep(3)
        return False

    def _record_success(self, url: str, title: str):
        """Persist a finished download and update the in-memory counters."""
        logging.info(f"    [成功] {title}")
        with open(LOG_FILE, 'a', encoding='utf-8') as f:
            f.write(f"{url}\n")
        with open(README_FILE, 'a', encoding='utf-8') as f:
            f.write(f"* {title} ([Link]({url}))\n")
        self.downloaded_urls.add(url)
        self.session_count += 1

    def _rest_after_download(self):
        """Pause after a download; take a long break every N downloads."""
        if self.session_count % self.coffee_break_interval == 0:
            break_time = random.uniform(*self.coffee_break_duration)
            logging.info(f"☕ 咖啡時間：休息 {break_time/60:.1f} 分鐘...")
            time.sleep(break_time)
        else:
            self._human_sleep(self.article_delay)

    def process_single_thesis(self, url: str, title: str):
        """Open ``url`` in a new tab and try to download its full text.

        On success the URL is logged and ``session_count`` is incremented.
        Errors inside the workflow are logged, not raised. Every window other
        than the one that was current on entry is closed before returning.

        Args:
            url: Detail-page URL of the thesis.
            title: Title used for log messages and the README entry.
        """
        logging.info(f"[-] 處理: {title}")
        main_handle = self.driver.current_window_handle
        known = set(self.driver.window_handles)
        self.driver.execute_script("window.open('');")
        if not self._switch_to_new_window(known):
            self.driver.switch_to.window(self.driver.window_handles[-1])

        try:
            # Snapshot first so files from earlier theses are not mistaken
            # for this download.
            before = self._completed_files()
            self.driver.get(url)

            if not self._open_fulltext_page():
                logging.warning(f"    [跳過] 無權限或無電子全文: {title}")
                return

            if self._download_with_captcha(before):
                self._record_success(url, title)
                self._rest_after_download()
        except Exception as e:
            logging.error(f"    [失敗] {e}")
        finally:
            self._close_extra_windows(main_handle)

    # ------------------------------------------------------------------
    # Crawl driver
    # ------------------------------------------------------------------
    def _wait_for_login(self) -> bool:
        """Open the login page and wait up to 5 minutes for a manual login."""
        self.driver.get(
            "https://ndltd.ncl.edu.tw/cgi-bin/gs32/gsweb.cgi/login?o=dwebmge")
        logging.info("=== 請在 5 分鐘內手動登入 ===")
        try:
            WebDriverWait(self.driver, 300).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//div[@class='user_area']//a[text()='登出']"))
            )
        except TimeoutException:
            logging.error("登入逾時")
            return False
        return True

    def _submit_search(self) -> bool:
        """Open the basic-search page and submit ``self.keyword``."""
        self.driver.get(
            "https://ndltd.ncl.edu.tw/cgi-bin/gs32/gsweb.cgi/ccd=20_UgG/search?mode=basic")
        try:
            self.wait.until(EC.presence_of_element_located(
                (By.ID, "ysearchinput0"))).send_keys(self.keyword)
            self.driver.find_element(By.ID, "gs32search").click()
            logging.info(f"[*] 搜尋關鍵字: {self.keyword}")
        except Exception:
            logging.error("搜尋失敗")
            return False
        return True

    def _jump_to_start_page(self) -> int:
        """Jump to ``self.start_page`` and return the page now displayed.

        Returns 1 when no jump is needed or when the jump fails.
        """
        if self.start_page <= 1:
            return 1
        logging.info(f"[*] 正在跳轉至第 {self.start_page} 頁...")
        try:
            # Wait until the result list (and its jump box) has loaded.
            self.wait.until(
                EC.presence_of_element_located((By.ID, "jmpage")))

            js = "document.getElementById('jmpage').value = arguments[0];"
            self.driver.execute_script(js, self.start_page)
            self.driver.find_element(By.NAME, "jumpfmt1page").click()

            # Fixed pause to let the new page render.
            time.sleep(5)
            logging.info(f"[*] 跳轉完成，目前在第 {self.start_page} 頁。")
            return self.start_page
        except Exception as e:
            logging.error(f"[警告] 跳頁失敗: {e}，將從第 1 頁開始。")
            return 1

    def _collect_page_links(self) -> list:
        """Return ``(url, title)`` pairs on this page not yet downloaded."""
        self.wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "td.tdfmt1-content")))
        rows = self.driver.find_elements(
            By.CSS_SELECTOR, "td.tdfmt1-content")

        page_links = []
        for row in rows:
            try:
                link = row.find_element(By.CSS_SELECTOR, "a.slink")
                url = link.get_attribute('href')
                title = link.find_element(
                    By.CSS_SELECTOR, "span.etd_d").text
            except NoSuchElementException:
                continue
            if url not in self.downloaded_urls:
                page_links.append((url, title))
        return page_links

    def _crawl_pages(self, current_page: int):
        """Process result pages until the quota is met or pages run out.

        The saved progress only moves to the next page after that page has
        actually been opened, so an interrupted page is revisited on resume
        (already-downloaded URLs are skipped through the download log).
        """
        while self.session_count < self.max_downloads:
            try:
                page_links = self._collect_page_links()
                logging.info(
                    f"--- 第 {current_page} 頁 (本頁 {len(page_links)} 篇未載) ---")

                page_finished = True
                for url, title in page_links:
                    if self.session_count >= self.max_downloads:
                        page_finished = False
                        break
                    self.process_single_thesis(url, title)

                if not page_finished:
                    # Quota reached mid-page: resume from this same page.
                    self._save_progress(current_page)
                    break

                try:
                    next_btn = self.driver.find_element(
                        By.CSS_SELECTOR, 'input[name="gonext"][type="image"]:not([src*="_"])')
                except NoSuchElementException:
                    self._save_progress(current_page)
                    logging.info("[*] 沒有下一頁了，任務結束。")
                    break

                next_btn.click()
                current_page += 1
                self._save_progress(current_page)
                self._human_sleep(self.short_delay)

            except Exception as e:
                logging.error(f"[錯誤] {e}")
                break

    def run(self):
        """Run the whole crawl: login, search, resume, download loop.

        The browser is always shut down on exit, including on login timeout,
        search failure, or an unexpected exception.
        """
        try:
            self._init_driver()

            if not self._wait_for_login():
                return
            if not self._submit_search():
                return

            current_page = self._jump_to_start_page()
            self._crawl_pages(current_page)
        finally:
            self._shutdown_driver()


if __name__ == "__main__":
    # Usage example: search for the keyword and stop after 100 successful
    # downloads in this session.
    downloader = RobustThesisDownloader(keyword="量化交易", max_downloads=100)
    downloader.run()