In [None]:
# 環境設置
import time
from io import StringIO
from urllib.error import URLError
from urllib.request import HTTPError

import pandas as pd
import pandas.io.formats.excel
import requests
import requests.packages.urllib3
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys  # keyboard actions
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

# Silence urllib3 warnings (e.g. those emitted for requests made with verify=False).
requests.packages.urllib3.disable_warnings()

In [None]:
# webpac_jsp_crawler
# 國立宜蘭大學、佛光大學
def webpac_jsp_crawler(org, url, ISBN, driver, wait):
    """Search a JSP-based WebPAC catalogue by ISBN and return a table from the result page.

    Args:
        org: Library name. Not used inside this function.
        url: Address of the catalogue search page.
        ISBN: ISBN typed into the search box.
        driver: Selenium WebDriver that drives the browser.
        wait: WebDriverWait used to block until elements are present.

    Returns:
        The second-to-last table pandas parses from the result page, or
        ``None`` when any step fails (a message is printed instead of
        raising, so one unreachable library does not abort a batch run).
    """
    try:
        # Open the catalogue search page.
        driver.get(url)

        # Wait for the search-type drop-down, then switch it to ISBN search.
        search_field = wait.until(
            EC.presence_of_element_located((By.NAME, "search_field"))
        )
        Select(search_field).select_by_visible_text("ISBN")

        # Type the ISBN into the search box and submit the form.
        search_input = driver.find_element(By.NAME, "search_input")
        search_input.send_keys(ISBN)
        search_input.submit()

        # Wait until the result table has been rendered.
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "table.order")))

        # Wrap the HTML in StringIO: passing literal HTML to read_html is deprecated.
        tables = pd.read_html(StringIO(driver.page_source), encoding="utf-8")
        # NOTE(review): index -2 is hard-coded; presumably the holdings table
        # for this page layout - confirm against the live site.
        return tables[-2]
    except Exception:
        # Best-effort: report the failing site and let the caller carry on.
        print(f"「{url}」無法爬取！")
        return None

In [None]:
# webpac_gov_crawler
def click_more_btn(driver):
    """Click the ``a.btnstyle.bluebg3.morewidth`` link until it is gone, then parse the page.

    Args:
        driver: Selenium WebDriver already showing the page to expand.

    Returns:
        The first table pandas parses from the page once the link can no
        longer be found.

    Raises:
        Any Selenium error other than ``NoSuchElementException`` raised while
        clicking, and any error raised by ``pd.read_html``, propagates to the
        caller.
    """
    while True:
        try:
            # Pause before each lookup; presumably gives the page time to
            # load the next batch of rows - TODO confirm.
            time.sleep(1)
            driver.find_element(By.CSS_SELECTOR, "a.btnstyle.bluebg3.morewidth").click()
        except NoSuchElementException:
            # The link is no longer on the page: stop clicking.
            break
    # Wrap the HTML in StringIO: passing literal HTML to read_html is deprecated.
    return pd.read_html(StringIO(driver.page_source), encoding="utf-8")[0]

# 宜蘭縣公共圖書館、桃園市立圖書館
def webpac_gov_crawler(org, url, ISBN, driver, wait):
    """Search a public-library WebPAC catalogue by ISBN via its advanced-search form.

    Args:
        org: Library name. Not used inside this function.
        url: Address of the catalogue; also used as the prefix for the
            relative links to each detailed record.
        ISBN: ISBN typed into the search box.
        driver: Selenium WebDriver that drives the browser.
        wait: WebDriverWait used to block until elements are ready.

    Returns:
        The table returned by ``click_more_btn`` for a single hit, or the
        row-wise concatenation of those tables for several hits. ``None``
        when any step fails (a message is printed instead of raising).
    """
    try:
        # Open the catalogue page.
        driver.get(url)

        # Click the advanced-search link as soon as it becomes clickable.
        wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, '//*[@id="center"]/div/section[1]/div/div/div/form/div[3]/div[1]/a')
            )
        ).click()

        # Choose "ISBN" in the search-field drop-down, type the ISBN and submit.
        select = Select(wait.until(EC.presence_of_element_located((By.NAME, "searchField"))))
        time.sleep(0.2)  # the browser needs a moment before the option can be chosen
        select.select_by_visible_text("ISBN")
        search_input = driver.find_element(By.NAME, "searchInput")
        search_input.send_keys(ISBN)
        search_input.submit()  # some sites cannot submit and need send_keys(Keys.ENTER) instead

        # Count the hits. With a single hit the site (e.g. Yilan County Public
        # Library) quickly redirects to the detailed record on its own.
        results = len(
            wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.booklist")))
        )
        # Snapshot the result list now; it is only needed to find the links
        # to each detailed record when there are several hits.
        result_html = driver.page_source

        if results < 2:
            time.sleep(2)  # forced wait: wait.until() kept failing here (unresolved)
            return click_more_btn(driver)

        soup = BeautifulSoup(result_html, "html.parser")
        frames = []
        for item in soup.find_all("div", "booklist"):
            href = url + item.find("h2").find("a")["href"]
            driver.get(href)

            time.sleep(2)  # forced wait: wait.until() kept failing here (unresolved)
            frames.append(click_more_btn(driver))
        return pd.concat(frames, axis=0, ignore_index=True)
    except Exception:
        # Best-effort: report the failing site and let the caller carry on.
        print(f"「{url}」無法爬取！")
        return None

In [None]:
# 國立臺灣海洋大學、國立臺灣師範大學、中央研究院、國立陽明大學、國立中央大學、中原大學
# 國立臺灣科技大學、中國文化大學、輔仁大學
def easy_crawler(position, org, url, ISBN):
    """Fetch ``url + ISBN`` and return one of the HTML tables found on that page.

    Args:
        position: Index of the wanted table in the list returned by
            ``pd.read_html``.
        org: Library name. Not used inside this function.
        url: URL prefix; the ISBN is appended to it to form the book URL.
        ISBN: ISBN string appended to ``url``.

    Returns:
        The table at ``position``, or ``None`` when the page cannot be
        fetched or parsed (a message is printed instead of raising).
    """
    try:
        # Build the URL of the book page.
        url = url + ISBN
        try:
            dfs = pd.read_html(url, encoding="utf-8")
        except URLError:
            # urlopen reports certificate failures as URLError (HTTPError is a
            # subclass), so catch the base class: the retry below is meant to
            # work around SSL errors by skipping certificate verification.
            resp = requests.get(url, verify=False)
            # Wrap the HTML in StringIO: passing literal HTML to read_html is deprecated.
            dfs = pd.read_html(StringIO(resp.text), encoding="utf-8")
        # Pick the requested table.
        return dfs[position]
    except Exception:
        # Best-effort: report the failing URL and let the caller carry on.
        print(f"「{url}」無法爬取！")
        return None

In [None]:
def klccab_crawler(org, ISBN, driver, wait):
    try:
        url = f"https://webpac.klccab.gov.tw/webpac/search.cfm?m=as&k0={ISBN}&t0=i&c0=and&y10=&y20=&cat0=&dt0=&l0=&lv0=&lc0="
        driver.get(url)

        time.sleep(5)  # 基隆的系統太詭異了，強制等待
        soup = BeautifulSoup(driver.page_source, "html.parser")
        results = len(soup.find_all("div", "list_box"))
        if results < 2:
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "table.list.list_border")))
            tgt = pd.read_html(driver.page_source)[0]
        else:
            tgt = []
            for li in soup.find_all("div", "list_box"):
                url_temp = "https://webpac.klccab.gov.tw/webpac/" + li.find("a", "btn")["href"]
                driver.get(url_temp)
                wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "table.list.list_border")))
                tgt.append(pd.read_html(driver.page_source, encoding="utf-8")[0])
            tgt = pd.concat(tgt, axis=0, ignore_index=True)
        return tgt
    except:
        print(f"「{url}」無法爬取！")

In [None]:
def tpml_crawler(org, url, ISBN, driver, wait):
    """Search a catalogue by ISBN and return a fixed table from the result page.

    Args:
        org: Library name. Not used inside this function.
        url: Address of the catalogue search page.
        ISBN: ISBN typed into the search box.
        driver: Selenium WebDriver that drives the browser.
        wait: WebDriverWait used to block until elements are present.

    Returns:
        The table at index 19 of the tables pandas parses from the result page.

    Raises:
        Nothing is caught here: Selenium errors, pandas parsing errors and an
        ``IndexError`` for pages with fewer than 20 tables all propagate.
    """
    # Open the catalogue search page.
    driver.get(url)

    # Wait for the search-type drop-down, then switch it to ISBN search.
    search_field = wait.until(EC.presence_of_element_located((By.NAME, "search_field")))
    Select(search_field).select_by_visible_text("ISBN")

    # Type the ISBN into the search box and submit the form.
    search_input = driver.find_element(By.NAME, "search_input")
    search_input.send_keys(ISBN)
    search_input.submit()

    # Wait until the result table has been rendered.
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "table.order")))

    # Parse the rendered page. Wrap the HTML in StringIO: passing literal
    # HTML to read_html is deprecated.
    dfs = pd.read_html(StringIO(driver.page_source), encoding="utf-8")
    # NOTE(review): index 19 is hard-coded; presumably the holdings table for
    # this page layout - confirm against the live site.
    return dfs[19]

In [None]:
def ntu_crawler(ISBN, driver):
    """Look up an ISBN in the NTU Primo discovery site and collect its holdings rows.

    Args:
        ISBN: ISBN string inserted into the search URL.
        driver: Selenium WebDriver that drives the browser.

    Returns:
        A DataFrame with one row per matching page row, holding the fixed
        label '台灣大學', the text of the row's first ``<h3>``, the text of the
        fifth and of the first ``<span>`` inside the row's first ``<p>``, and
        the search URL. ``None`` when scraping the record page fails (a
        message is printed instead of raising).

    Raises:
        Errors raised while loading the search page or clicking the first
        result (e.g. no result found) are not caught and propagate.
    """
    url = "https://ntu.primo.exlibrisgroup.com/discovery/search?query=any,contains," + ISBN + "&tab=Everything&search_scope=MyInst_and_CI&vid=886NTU_INST:886NTU_INST&offset=0"
    driver.get(url)
    time.sleep(5)  # fixed pause before looking for the result list
    # Open the first search result.
    driver.find_element(By.CLASS_NAME, "item-title").click()

    time.sleep(3)  # fixed pause before reading the record page
    try:
        # Click the "back" button when the page shows one; its absence is fine.
        try:
            back = driver.find_element(
                By.CSS_SELECTOR,
                ".tab-header .back-button.button-with-icon.zero-margin.md-button.md-primoExplore-theme.md-ink-ripple",
            )
        except NoSuchElementException:
            back = None
        if back is not None:
            back.click()

        # Collect rows locally so results never leak between calls.
        df_lst = []
        rows = driver.find_elements(
            By.CSS_SELECTOR, ".layout-align-space-between-center.layout-row.flex-100"
        )
        for row in rows:
            paragraphs = row.find_elements(By.TAG_NAME, "p")
            locations = row.find_elements(By.TAG_NAME, "h3")
            # Only the first <p> and the first <h3> of a row are used; rows
            # lacking either contribute nothing.
            if not paragraphs or not locations:
                continue
            spans = paragraphs[0].find_elements(By.TAG_NAME, "span")
            new_row = pd.Series(
                ['台灣大學', locations[0].text, spans[4].text, spans[0].text, url]
            ).to_frame().T
            df_lst.append(new_row)
        # pd.concat raises on an empty list, so "no rows" is reported as a failure.
        return pd.concat(df_lst, axis=0, ignore_index=True)
    except Exception:
        # Best-effort: report the failing URL and let the caller carry on.
        print(f"「{url}」無法爬取！")
        return None