# 套件安裝

In [None]:
!pip install -U openpyxl selenium beautifulsoup4 lxml requests

# 設定初始化

In [None]:
'''
注意事項:
下載對應的 ChromeDriver (web driver) 到程式檔案同一個目錄下後解壓縮，下載前記得對應版本編號。
連結: https://chromedriver.chromium.org/downloads

參考網頁:
[1] 中央研究院中國文哲研究所
https://www.litphil.sinica.edu.tw/publications/bulletin
'''


'''
匯入套件
'''
# HTML parser
from bs4 import BeautifulSoup as bs

# 操作 browser 的 API
from selenium import webdriver

# 處理逾時例外的工具
from selenium.common.exceptions import TimeoutException

# 面對動態網頁，等待某個元素出現的工具，通常與 expected_conditions 搭配
from selenium.webdriver.support.ui import WebDriverWait

# 搭配 WebDriverWait 使用，對元素狀態的一種期待條件，若條件發生，則等待結束，往下一行執行
from selenium.webdriver.support import expected_conditions as EC

# 期待元素出現要透過什麼方式指定，通常與 EC、WebDriverWait 一起使用
from selenium.webdriver.common.by import By

# 處理下拉式選單的工具
from selenium.webdriver.support.ui import Select

# 強制等待 (執行期間休息一下)
from time import sleep

# pretty-print
from pprint import pprint

# 隨機
from random import randint

# 計時
import time

# 整理 json 使用的工具
import json

# 執行 shell command 的時候用的
import os

# 子處理程序，用來取代 os.system 的功能
import subprocess

# 正規表達式
import re

# 編碼
from urllib.parse import quote

# 存取 Excel 的工具
from openpyxl import load_workbook
from openpyxl import Workbook

# Browser launch options
my_options = webdriver.ChromeOptions()
# Uncomment the next line to run Chrome in the background without a window:
# my_options.add_argument("--headless")
my_options.add_argument("--start-maximized")        # maximise the window
my_options.add_argument("--incognito")              # use incognito mode
my_options.add_argument("--disable-popup-blocking") # allow pop-ups (new tabs are opened via script)
my_options.add_argument("--disable-notifications")  # suppress notification prompts
my_options.add_argument("--lang=zh-TW")             # use Traditional Chinese as the browser language

# Path to the chromedriver executable
driver_exec_path = './chromedriver.exe'

# Holds the web driver instance once init() has been called
driver = None

# Source site and landing page
prefix_url = 'https://www.litphil.sinica.edu.tw/'
url = prefix_url + 'publications/bulletin'

# Worksheet name
sheetName = 'litphil_sinica'

# Excel file name
excelFileName = 'litphil_sinica.xlsx'

# JSON file name
jsonFileName = f'{sheetName}.json'

# Output folder for the JSON, Excel and downloaded files
folderPath = f'./{sheetName}'
os.makedirs(folderPath, exist_ok=True)

# Open the Excel file if it already exists, otherwise start a new workbook
filePath = folderPath + '/' + excelFileName
if os.path.exists(filePath):
    workbook = load_workbook(filename = filePath)
    if sheetName in workbook.sheetnames:
        worksheet = workbook[sheetName]
    else:
        # An existing file without the expected sheet would otherwise
        # raise KeyError; create the sheet as the first one instead.
        worksheet = workbook.create_sheet(sheetName, 0)
else:
    workbook = Workbook()
    worksheet = workbook.create_sheet(sheetName, 0) # insert the sheet at position 0

# Default download directory for Chrome.
# NOTE: the path is made absolute because Chrome is reported not to honour
# a relative download directory reliably.
my_options.add_experimental_option("prefs", {
    "download.default_directory": os.path.abspath(folderPath)
})

# Excel header row
worksheet['A1'] = "流水號"
worksheet['B1'] = "期刊名稱"
worksheet['C1'] = "期數"
worksheet['D1'] = '網頁連結'
worksheet['E1'] = "出版商"
worksheet['F1'] = "出版日期"
worksheet['G1'] = "論文名稱"
worksheet['H1'] = "作者名稱"
worksheet['I1'] = "論文連結_原始"
worksheet['J1'] = "論文連結_curl可用"

# Accumulates the scraped records
listData = []

# 自訂函式 (網路爬蟲執行流程)

In [None]:
'''
函式
'''
# Initialise the Web Driver
def init():
    """Start a Chrome session and store it in the module-level ``driver``.

    The browser is launched with ``my_options`` and the chromedriver binary
    at ``driver_exec_path``.  The driver path is passed through a ``Service``
    object because the ``executable_path`` keyword of ``webdriver.Chrome``
    was deprecated in Selenium 4 and removed in later releases.
    """
    global driver

    # Imported locally so this function does not rely on a change to the
    # notebook's import cell.
    from selenium.webdriver.chrome.service import Service

    driver = webdriver.Chrome(
        options=my_options,
        service=Service(executable_path=driver_exec_path),
    )

# Open one browser tab per pagination entry
def openTabs():
    """Load the source page and open a new tab for every pagination item.

    Navigates to ``url``, waits up to 10 seconds for a
    ``ul.pagination > li`` element to be present, then opens one new tab
    (each pointing at ``url``) per matching element.  Prints a message
    instead of raising when the wait times out.
    """
    global driver
    pagination_locator = (By.CSS_SELECTOR, 'ul.pagination > li')

    # Enter the source page
    driver.get(url)
    try:
        # Block until the pagination list has been rendered
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(pagination_locator)
        )

        # One new tab per pagination entry; only the count matters here
        page_items = driver.find_elements(*pagination_locator)
        for _ in page_items:
            driver.execute_script(f'window.open("{url}", "_blank");')

    except TimeoutException:
        print('等待逾時: openTabs')
        
# Point every secondary tab at its own pagination page
def setTabs():
    """Click a different pagination item in each tab opened by ``openTabs``.

    The first window handle is skipped.  For the tab at position ``i``
    (``i >= 1``) the ``(i - 1)``-th ``ul.pagination > li`` element is
    clicked, followed by a random 1-2 second pause and a switch back to
    the first tab.
    """
    global driver
    try:
        handles = driver.window_handles
        for position, handle in enumerate(handles):
            # The first handle is the main tab; leave it untouched
            if position == 0:
                continue

            # Bring the target tab into focus
            driver.switch_to.window(handle)

            # Fetch the pagination entries of this tab and click the one
            # that corresponds to the tab's position
            page_items = driver.find_elements(
                By.CSS_SELECTOR, 'ul.pagination > li'
            )
            page_items[position - 1].click()

            # Short random pause
            sleep(randint(1,2))

            # Return to the first tab
            driver.switch_to.window(driver.window_handles[0])

    except TimeoutException:
        print('等待逾時: setTabs')
    
# Parse the listing tabs and every issue page they reference
def _text_without_child(element, child_tag):
    """Return ``element.text`` with the text of its first ``child_tag`` removed.

    When the child tag is absent the full text is returned, so a page with
    a slightly different layout does not raise AttributeError.
    """
    child = element.find(child_tag)
    if child is None:
        return element.text
    return element.text.replace(child.text, '')


def _to_curl_url(pdf_url):
    """Return a variant of ``pdf_url`` whose file name is percent-encoded.

    The URL is split into everything up to a ``<digit>/`` boundary and the
    file name before ``.pdf``.  A file name that already contains ``%`` is
    kept as is, otherwise it is passed through ``quote``.  If the URL does
    not have that shape it is returned unchanged.
    """
    match = re.search(r'(https?:\/\/.+(?<=\d\/))(.+)\.pdf', pdf_url)
    if match is None:
        return pdf_url
    directory, stem = match[1], match[2]
    if '%' not in stem:
        stem = quote(stem)
    return directory + stem + '.pdf'


def parse():
    """Collect one record per PDF article link and append it to ``listData``.

    For every tab except the first, the issue numbers are read from the
    ``style`` attribute of each cover in the listing.  Each issue page is
    then loaded in that tab and parsed with BeautifulSoup for the journal
    title, issue label, publisher, date and article links.  Only anchors
    whose ``href`` contains ``pdf`` are recorded.

    Entries that lack an expected element are skipped, or filled with an
    empty string, rather than aborting the whole crawl.  Prints a message
    when a ``TimeoutException`` is raised.
    """
    global driver
    global listData
    try:
        # Running serial number
        sn = 1

        # Visit every tab except the first (main) one
        for handle in driver.window_handles[1:]:
            driver.switch_to.window(handle)

            # Issue numbers are collected first because the tab navigates
            # away from the listing once the issue pages are loaded
            listJournalNum = []
            for li in driver.find_elements(By.CSS_SELECTOR, 'ul.publications.gridView.ng-scope > li'):
                strStyle = li.find_element(By.CSS_SELECTOR, 'div.cover').get_attribute('style')
                matchNum = re.search(r'\/(\d{2,})\/', strStyle or '')
                if matchNum is None:
                    # No issue number in the cover style; nothing to visit
                    continue
                listJournalNum.append(matchNum[1])

            for num in listJournalNum:
                # Load the issue page and parse its rendered HTML
                strPageUrl = f'https://www.litphil.sinica.edu.tw/publications/bulletin/{num}'
                driver.get(strPageUrl)
                sleep(randint(1,2))
                soup = bs(driver.page_source, 'lxml')

                # Journal title and issue label
                div_name = soup.select_one('div.name.ng-binding')
                if div_name is None:
                    print(f'略過，找不到期刊名稱: {strPageUrl}')
                    continue
                span_subtitle = div_name.find('span')
                strJournalSubtitle = span_subtitle.text if span_subtitle is not None else ''
                strJournalTitle = _text_without_child(div_name, 'span')

                # Publisher and publication date (both optional on the page)
                strPublisher = strDate = ""
                span_publisher = soup.select_one('span.ng-binding.ng-scope[ng-if="doc.publisher"]')
                if span_publisher is not None:
                    strPublisher = _text_without_child(span_publisher, 'b')
                span_date = soup.select_one('span.ng-binding.ng-scope[ng-if="doc.date"]')
                if span_date is not None:
                    strDate = _text_without_child(span_date, 'b')

                # Article links
                a_elms = soup.select('li.ng-scope[ng-repeat="section in doc.sections"] li.ng-scope[ng-repeat="article in section.articles"] a.ng-binding')
                for a in a_elms:
                    # Only anchors that point at a PDF are of interest
                    if not (a.has_attr('href') and 'pdf' in a['href']):
                        continue

                    # The author sits in a nested <span>; the remaining
                    # anchor text is the article title
                    span_author = a.find('span')
                    strAuthor = span_author.text.replace('／', '') if span_author is not None else ''
                    strName = re.sub(r"\s|\n", '', _text_without_child(a, 'span'))

                    # Turn relative links into absolute ones
                    strHref = a['href']
                    strPdfOriginPath = strHref if 'http' in strHref else prefix_url + strHref

                    # Some links use a wrong host; replacing "site" with
                    # "www" was observed to fix them
                    strPdfOriginPath = strPdfOriginPath.replace("http://site.", "http://www.")

                    strPdfCurlPath = _to_curl_url(strPdfOriginPath)

                    # Assemble the record
                    listData.append({
                        "流水號": sn,
                        "期刊名稱": strJournalTitle,
                        "期數": strJournalSubtitle,
                        "網頁連結": strPageUrl,
                        "出版商": strPublisher,
                        "出版日期": strDate,
                        "論文名稱": strName,
                        "作者名稱": strAuthor,
                        "論文連結_原始": strPdfOriginPath,
                        "論文連結_curl可用": strPdfCurlPath
                    })

                    # Advance the serial number
                    sn += 1

    except TimeoutException:
        print('等待逾時: parse')
        
# Close the browser
def close():
    """Quit the browser session held in the module-level ``driver``.

    Does nothing when no session exists (``driver`` is ``None``), so the
    function is safe to call before ``init()`` or more than once.  After a
    successful quit ``driver`` is reset to ``None``.
    """
    global driver
    if driver is None:
        return
    driver.quit()
    driver = None
        
# Save as JSON
def saveJson():
    """Write ``listData`` to ``{folderPath}/{jsonFileName}`` as UTF-8 JSON.

    The output is indented by 4 spaces and non-ASCII characters are
    written as is rather than escaped.
    """
    global listData
    json_path = f"{folderPath}/{jsonFileName}"
    with open(json_path, "w", encoding="utf-8") as file:
        payload = json.dumps(listData, ensure_ascii=False, indent=4)
        file.write(payload)

# Save as Excel
def saveExcel():
    """Copy the records from the JSON file into ``worksheet`` and save it.

    Reads ``{folderPath}/{jsonFileName}``, writes one record per row
    starting at row 2 (row 1 holds the headers), then saves the workbook
    to ``filePath`` and closes it.
    """
    # Column letter -> record key, in the order the cells are written
    column_keys = (
        ('A', "流水號"),
        ('B', "期刊名稱"),
        ('C', "期數"),
        ('D', '網頁連結'),
        ('E', "出版商"),
        ('F', "出版日期"),
        ('G', "論文名稱"),
        ('H', "作者名稱"),
        ('I', "論文連結_原始"),
        ('J', "論文連結_curl可用"),
    )

    with open(f"{folderPath}/{jsonFileName}", "r", encoding="utf-8") as file:
        # Parse the JSON text into a list of dicts
        records = json.loads(file.read())

        # Data rows start at Excel row 2
        for row_num, record in enumerate(records, start=2):
            for column, key in column_keys:
                worksheet[column + str(row_num)] = record[key]

    # Persist and close the workbook
    workbook.save(filePath)
    workbook.close()
    
# Download
def download():
    """Download every PDF listed in the JSON file with ``curl``.

    Reads ``{folderPath}/{jsonFileName}`` and, for each record, waits a
    random 1-3 seconds and runs ``curl -L <url> -o <folderPath>/sn_<n>.pdf``
    where ``<n>`` is the record's serial number.  The output path is
    printed for each download whose curl process exits with status 0; a
    failure message with curl's exit code is printed otherwise and the
    loop carries on with the next record.
    """
    # Load the records up front so the JSON file is not kept open while
    # the downloads run
    with open(f"{folderPath}/{jsonFileName}", "r", encoding="utf-8") as file:
        listJson = json.loads(file.read())

    for myDict in listJson:
        # Pause between requests
        sleep(randint(1,3))

        # Download the PDF
        strOutputPath = f'{folderPath}/sn_{myDict["流水號"]}.pdf'
        cmd = ['curl', '-L', myDict["論文連結_curl可用"], '-o', strOutputPath]
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

        # A non-zero exit status means curl could not complete the transfer
        if result.returncode != 0:
            print(f'下載失敗 (curl exit code {result.returncode}): {myDict["論文連結_curl可用"]}')
            continue

        print(strOutputPath)

# 以下函式，請各別依情況分別、陸續執行

In [None]:
# Initialise the Web Driver
init()

In [None]:
# Open one tab per pagination entry
openTabs()

In [None]:
# Click a different pagination entry in each opened tab
setTabs()

In [None]:
# Parse the data into listData
parse()

In [None]:
# Close the browser
close()

In [None]:
# Save listData as JSON
saveJson()

In [None]:
# Copy the JSON records into the Excel file
saveExcel()

In [None]:
# Download the PDFs and print the elapsed wall-clock time in seconds
time_begin = time.time()
download()
time_end = time.time()
print(f"總共執行了 { time_end - time_begin } 秒")