In [1]:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import NoSuchElementException, TimeoutException

import pandas as pd
from bs4 import BeautifulSoup
import time

import test_data

# Chrome launch options shared by every crawler run.
my_options = Options()
my_options.add_argument("--incognito")  # open the browser in incognito mode
my_options.add_argument("--start-maximized")  # maximize the browser window
# my_options.add_argument("--headless")  # run without a visible browser window

# Work on a copy: DesiredCapabilities.CHROME is a class-level dict, so setting
# a key on it directly would silently change the template for any other code
# in this kernel that reads it.
capabilities = DesiredCapabilities.CHROME.copy()
# "none" makes driver.get() return without waiting for the page to finish
# loading; the crawler relies on explicit WebDriverWait conditions instead.
capabilities["pageLoadStrategy"] = "none"

def _read_holdings_table(driver, wait):
    """Wait for the holdings table on the current page and parse it.

    Blocks until an element matching ``table.order`` is present, then parses
    every table in the page source and returns the second-to-last one.

    Raises:
        TimeoutException: if ``table.order`` does not appear within the
            timeout configured on ``wait``.
        ValueError / IndexError: if the page has fewer tables than expected.
    """
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "table.order"))
    )
    # NOTE(review): recent pandas releases deprecate passing a literal HTML
    # string to read_html; wrap it in io.StringIO when upgrading pandas.
    return pd.read_html(driver.page_source, encoding="utf-8")[-2]


def _crawl_result_list(driver, wait, url):
    """Visit every detail page linked from the result list and merge the tables.

    Used when the search did not land directly on a single record. The result
    list is read from the ``leftFrame`` iframe; each ``a.bookname`` link is
    opened and its holdings table collected.

    Returns:
        One DataFrame with the rows of every detail page, each row tagged
        with the detail page URL in the "連結" column.

    Raises:
        LookupError: if the result list contains no detail links.
        TimeoutException: if the iframe or a holdings table never appears.
    """
    if ".jsp" in url:
        # Detail links are relative, so drop only the trailing path segment
        # (e.g. "webpacIndex.jsp") to obtain the base URL. Slicing at the last
        # "/" avoids str.replace() removing the same text elsewhere in the URL.
        url = url[: url.rfind("/") + 1]

    iframe = wait.until(
        EC.presence_of_element_located((By.ID, "leftFrame"))
    )
    driver.switch_to.frame(iframe)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    # href=True skips anchors without a target instead of raising KeyError.
    detail_urls = [
        url + anchor["href"]
        for anchor in soup.find_all("a", "bookname", href=True)
    ]
    if not detail_urls:
        # Without this guard pd.concat([]) would fail with an opaque message.
        raise LookupError("search result list contains no detail links")

    detail_tables = []
    for detail_url in detail_urls:
        driver.get(detail_url)
        holdings = _read_holdings_table(driver, wait)
        holdings["連結"] = driver.current_url
        detail_tables.append(holdings)
    return pd.concat(detail_tables, axis=0, ignore_index=True)


def webpac_jsp_crawler(url, ISBN):
    """Search a WebPAC (JSP) library catalogue by ISBN and collect its holdings.

    Opens ``url`` in Chrome, selects the ISBN search field, submits ``ISBN``
    and scrapes the holdings table(s). The result is also written to
    ``"<page title>.csv"`` (big5 encoded) in the working directory.

    Args:
        url: Address of the catalogue's search page.
        ISBN: ISBN to search for.

    Returns:
        A DataFrame of holdings with the extra columns "圖書館" (library name,
        taken from the page title) and "連結" (URL of the record), or ``None``
        when crawling failed; in that case a failure report is printed
        instead of raising, so a loop over many sites keeps going.
    """
    test_url = url  # for testing: the page under test, shown in the failure report
    test_start = time.time()  # for testing: measures the run time
    org = None  # library name; stays None if we fail before reading the title

    # Initialise the driver outside the try block: if Chrome cannot start
    # there is nothing to clean up and the error should surface directly.
    driver = webdriver.Chrome(options=my_options, desired_capabilities=capabilities)
    try:
        # Explicit waits: at most 30 s, polling every 0.5 s (the default).
        wait = WebDriverWait(driver, 30)

        # Open the search home page.
        driver.get(url)

        # Locate the drop-down menu and choose ISBN.
        search_field = wait.until(
            EC.presence_of_element_located((By.NAME, "search_field"))
        )
        Select(search_field).select_by_value("ISBN")

        # Record which library this is.
        org = driver.title

        # Locate the search box and submit the ISBN.
        search_input = wait.until(
            EC.presence_of_element_located((By.NAME, "search_input"))
        )
        search_input.send_keys(ISBN)
        search_input.submit()

        # "div.mainCon" tells a single hit apart from zero or several hits.
        try:
            WebDriverWait(driver, 3).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.mainCon"))
            )
            tgt = _read_holdings_table(driver, wait)
            tgt["圖書館"] = org
            tgt["連結"] = driver.current_url
        except (NoSuchElementException, TimeoutException):
            tgt = _crawl_result_list(driver, wait, url)
            tgt["圖書館"] = org

        # Only reached on success, so a failure can no longer be masked by an
        # error raised while writing a half-built result.
        tgt.to_csv(f"{org}.csv", encoding="big5", errors="ignore")
        test_end = time.time()  # for testing: measures the run time
        print(f"程式執行成功，執行時間：{test_end - test_start:7.4f}秒，在「{org}」")
        return tgt
    except Exception as e:
        # Take the end time here: the success path may never have set it.
        test_end = time.time()
        print("="*80)
        print(f"程式執行失敗，執行時間：{test_end - test_start:7.4f}秒，在「{org}」")
        print(f"網址為：「{test_url}」")
        print(f"錯誤訊息為：「{e}」")
        print("="*80)
        return None
    finally:
        # quit() ends the whole WebDriver session (browser and chromedriver),
        # whereas close() only closes the current window. Running it here
        # covers every exit path, including failures before the search.
        driver.quit()

In [2]:
# Crawl every test site for the same ISBN and keep each call's return value.
df_lst = []
for test_url in test_data.test_urls:
    crawled = webpac_jsp_crawler(url=test_url, ISBN="9789573317241")
    df_lst.append(crawled)

AttributeError: module 'test_data' has no attribute 'test_urls'

In [None]:
# Stack the per-site results row-wise (pandas' default axis) and renumber the index.
pd.concat(df_lst, ignore_index=True)

In [None]:
# webpac_jsp_crawler(
#     url="http://192.192.231.232/webpacIndex.jsp",
#     ISBN="9789573317241")