In [None]:
!pip install -U openpyxl selenium beautifulsoup4 lxml requests

In [None]:
'''
注意事項:
下載對應的 ChromeDriver (web driver) 到程式檔案同一個目錄下後解壓縮，下載前記得對應版本編號。
連結: https://chromedriver.chromium.org/downloads

參考網頁:
[1] 國立中山大學 中國文學系
http://www.chinese.nsysu.edu.tw/zh_tw/Academic_Achievements/Publication/BOOK
'''


'''
匯入套件
'''
# HTML parser
from bs4 import BeautifulSoup as bs

# 網路請求工具
import requests as req

# 強制等待 (執行期間休息一下)
from time import sleep

# pretty-print
from pprint import pprint

# 隨機
from random import randint

# 計時
import time

# 整理 json 使用的工具
import json

# 執行 shell command 的時候用的
import os

# 子處理程序，用來取代 os.system 的功能
import subprocess

# 正規表達式
import re

# 編碼
from urllib.parse import quote

# 存取 Excel 的工具
from openpyxl import load_workbook
from openpyxl import Workbook

# Landing page of the source site
prefix_url = 'http://www.chinese.nsysu.edu.tw'
url = prefix_url + '/zh_tw/Academic_Achievements/Publication/BOOK'

# Sheet name; also used as the base name of the output folder and files
sheetName = 'chinese_nsysu'

# Excel file name
excelFileName = f'{sheetName}.xlsx'

# JSON file name
jsonFileName = f'{sheetName}.json'

# Output folder for the JSON, Excel and downloaded PDF files
folderPath = f'./{sheetName}'
os.makedirs(folderPath, exist_ok = True)

# Reuse the Excel file when it already exists, otherwise start a new workbook
filePath = folderPath + '/' + excelFileName
if not os.path.exists(filePath):
    workbook = Workbook()
    # Insert a sheet named sheetName at position 0 and work on it
    worksheet = workbook.create_sheet(sheetName, 0)
else:
    workbook = load_workbook(filename = filePath)
    # An existing file may not contain the sheet; create it instead of raising KeyError
    if sheetName in workbook.sheetnames:
        worksheet = workbook[sheetName]
    else:
        worksheet = workbook.create_sheet(sheetName, 0)

# NOTE: the former `my_options.add_experimental_option("prefs", ...)` call was removed here.
# No `my_options` object is defined in the visible cells, so the call raised NameError and
# stopped this cell before the headers and containers below were created. PDF files are
# written by curl with an explicit `-o` path, so no browser download directory is needed.

# Excel header row
worksheet['A1'] = "流水號"
worksheet['B1'] = "期刊標題"
worksheet['C1'] = "期號"
worksheet['D1'] = "出版日期"
worksheet['E1'] = "主編"
worksheet['F1'] = "章節列表_網頁連結"
worksheet['G1'] = "篇名"
worksheet['H1'] = "作者"
worksheet['I1'] = "論文連結_原始"
worksheet['J1'] = "論文連結_curl可用"

# Custom request headers
my_headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
}

# Pagination links of the landing page (a set, so duplicates collapse)
setPagination = set()

# Scraped records
listData = []

In [None]:
'''
函式
'''
# Collect the pagination links
def getPaginationLink(timeout = 30):
    '''
    Fetch the landing page and add its pagination links to setPagination.

    Every <a> matched by 'ul.pagination.pagination-sm > li > a' that has an
    href attribute is appended to the landing-page url and stored in the
    module-level set, so repeated links collapse into one entry.

    Args:
        timeout: seconds passed to the HTTP request as its timeout, so a
            stalled connection cannot block the cell forever.

    Raises:
        requests.HTTPError: when the landing page answers with an error status.
    '''
    # GET the landing page with the custom headers
    res = req.get(url, headers = my_headers, timeout = timeout)

    # Fail loudly on an error page instead of silently collecting nothing
    res.raise_for_status()

    # Build the soup object
    soup = bs(res.text, 'lxml')

    # Walk the pagination anchors
    for a in soup.select('ul.pagination.pagination-sm > li > a'):
        # An anchor without href would raise KeyError on a['href']; skip it
        strHref = a.get('href')
        if strHref is None:
            continue

        # NOTE(review): assumes each href is a suffix of the landing url (e.g. a query string) - confirm
        setPagination.add(url + strHref)

# Collect the journal rows of every pagination page
def getMainLinks(timeout = 30):
    '''
    Visit each link in setPagination and append one record per journal row to listData.

    Each record holds the journal title, the first integer found in the title
    (None when the title contains no digits), the publish date, the editor and
    the absolute link of the chapter-list page.

    Pages are visited in sorted order because a set has no stable iteration
    order; this keeps the order of listData reproducible between runs. Rows
    with fewer than five cells or without a chapter-list link are skipped.

    Args:
        timeout: seconds passed to each HTTP request as its timeout.

    Raises:
        requests.HTTPError: when a pagination page answers with an error status.
    '''
    for link in sorted(setPagination):
        # GET the page with the custom headers
        res = req.get(link, headers = my_headers, timeout = timeout)
        res.raise_for_status()

        # Build the soup object
        soup = bs(res.text, 'lxml')

        # Locate the journals table; a page without it has nothing to collect
        table_elm = soup.select_one('table.table.table-hover.table-striped.journals-index')
        if table_elm is None:
            print(f'[getMainLinks] journals table not found, skipped: {link}')
            continue

        for tr in table_elm.select('tbody[data-list="journals"] > tr'):
            # All td cells of this row
            td_elms = tr.select('td')

            # Cells 1-4 are read below, so a shorter row cannot be parsed
            if len(td_elms) < 5:
                continue

            # Chapter-list link; without it the row cannot be followed later
            a_elm = td_elms[4].select_one('a')
            strHref = a_elm.get('href') if a_elm is not None else None
            if not strHref:
                continue

            # Journal title
            strJournalTitle = td_elms[1].get_text().strip()

            # Issue number: first run of digits in the title, None when there is none
            matchNum = re.search(r"\d+", strJournalTitle)
            intJournalNum = int(matchNum[0]) if matchNum else None

            # Collect the record
            listData.append({
                "期刊標題": strJournalTitle,
                "期號": intJournalNum,
                "出版日期": td_elms[2].get_text().strip(),
                "主編": td_elms[3].get_text().strip(),
                "章節列表_網頁連結": prefix_url + strHref
            })

# Collect the chapter data of every journal
def getDetailedData(timeout = 30):
    '''
    Visit the chapter-list page of every record in listData and store its chapters under 'sub'.

    For each chapter row the title, the author and - when the actions cell
    holds an anchor whose text contains '下載' and whose href is not empty -
    the PDF link are recorded, both as found and percent-encoded for curl.

    The chapters of a journal are gathered in a local list and assigned to
    record['sub'] in one step, so running the cell again replaces the previous
    result instead of appending duplicate chapters. A page without the chapters
    table yields an empty list; rows with fewer than four cells are skipped.

    Args:
        timeout: seconds passed to each HTTP request as its timeout.

    Raises:
        requests.HTTPError: when a chapter-list page answers with an error status.
    '''
    for myDict in listData:
        # GET the chapter-list page with the custom headers
        res = req.get(myDict['章節列表_網頁連結'], headers = my_headers, timeout = timeout)
        res.raise_for_status()

        # Pause for a random 1-3 seconds between requests
        sleep(randint(1,3))

        # Build the soup object
        soup = bs(res.text, 'lxml')

        # Chapters of this journal
        listChapters = []

        # Locate the chapters table; treat a missing table as "no chapters"
        table_elm = soup.select_one('table.table.table-hover.table-striped.chapters-index')
        if table_elm is None:
            tr_elms = []
        else:
            tr_elms = table_elm.select('tbody[data-list="chapters"] > tr')

        for tr in tr_elms:
            # All td cells of this row
            td_elms = tr.select('td')

            # Cells 0 (title), 1 (author) and 3 (actions) are read below
            if len(td_elms) < 4:
                continue

            # Chapter title: anchor text when there is an anchor, else the cell text
            a_title = td_elms[0].select_one('a')
            if a_title is None:
                strPdfName = td_elms[0].get_text().strip()
            else:
                strPdfName = a_title.get_text().strip()

            # Record with empty links by default
            dict_tmp = {
                '篇名': strPdfName,
                '作者': td_elms[1].get_text().strip(),
                '論文連結_原始': "",
                '論文連結_curl可用': ""
            }

            # Look for a download anchor in the actions cell; the last match wins
            for a in td_elms[3].select('a'):
                if '下載' in a.get_text() and a.has_attr('href') and a['href'] != '':
                    # Link as found on the page
                    strPdfLink = prefix_url + a['href']

                    # Link for curl: percent-encode unless it already contains '%'
                    # NOTE(review): safe=':/' also escapes '?', '&' and '=' - fine for
                    # path-only links; confirm the download links carry no query string
                    if '%' not in strPdfLink:
                        strPdfLinkForCurl = quote(strPdfLink, safe=':/')
                    else:
                        strPdfLinkForCurl = strPdfLink

                    dict_tmp['論文連結_原始'] = strPdfLink
                    dict_tmp['論文連結_curl可用'] = strPdfLinkForCurl

            listChapters.append(dict_tmp)

        # Replace (not extend) any earlier result so a re-run stays free of duplicates
        myDict['sub'] = listChapters
        
# Save as json
def saveJson():
    '''
    Write listData to "<folderPath>/<jsonFileName>" as UTF-8 JSON.

    Non-ASCII characters are written as-is (ensure_ascii=False) and the
    output is indented by four spaces.
    '''
    strJsonPath = f"{folderPath}/{jsonFileName}"
    with open(strJsonPath, "w", encoding="utf-8") as fp:
        json.dump(listData, fp, ensure_ascii=False, indent=4)

# Save as excel
def saveExcel():
    '''
    Copy the saved JSON records into the worksheet and save the workbook.

    Reads "<folderPath>/<jsonFileName>", then writes one worksheet row per
    chapter (every entry of a journal's 'sub' list) starting at row 2:
    column A is a serial number starting at 1, columns B-F repeat the journal
    fields and columns G-J hold the chapter fields. Finally the workbook is
    saved to filePath and closed.
    '''
    with open(f"{folderPath}/{jsonFileName}", "r", encoding="utf-8") as fp:
        listJson = json.load(fp)

    # Column letter -> key, for the journal-level and the chapter-level fields
    tupleJournalCols = (
        ('B', "期刊標題"),
        ('C', "期號"),
        ('D', "出版日期"),
        ('E', "主編"),
        ('F', "章節列表_網頁連結"),
    )
    tupleChapterCols = (
        ('G', "篇名"),
        ('H', "作者"),
        ('I', "論文連結_原始"),
        ('J', "論文連結_curl可用"),
    )

    # Data starts at row 2; the serial number is always one less than the row number
    row_num = 2
    for journal in listJson:
        for chapter in journal['sub']:
            worksheet[f'A{row_num}'] = row_num - 1
            for col, key in tupleJournalCols:
                worksheet[f'{col}{row_num}'] = journal[key]
            for col, key in tupleChapterCols:
                worksheet[f'{col}{row_num}'] = chapter[key]
            row_num += 1

    # Save the workbook, then close it
    workbook.save(filePath)
    workbook.close()
    
# Download
def download():
    '''
    Download every PDF listed in the saved JSON with curl.

    Reads "<folderPath>/<jsonFileName>" and walks all chapters in order. The
    serial number advances for every chapter, with or without a link, so the
    file name "sn_<n>.pdf" matches the serial number written by saveExcel.
    Before each download the function pauses for a random 1-3 seconds.

    curl runs with -f so an HTTP error response is reported as a failure
    instead of being saved as if it were the PDF, and its exit code is checked:
    the target path is printed on success, a failure line otherwise.
    '''
    with open(f"{folderPath}/{jsonFileName}", "r", encoding="utf-8") as file:
        listJson = json.load(file)

    # Serial number, shared with the Excel sheet
    sn = 1

    for myDict in listJson:
        for d in myDict['sub']:
            strLink = d["論文連結_curl可用"]

            # A non-empty value means the chapter has a PDF link
            if strLink != '':
                # Pause between downloads
                sleep(randint(1,3))

                # Download the pdf; -L follows redirects, -f fails on HTTP error statuses
                strPdfPath = f'{folderPath}/sn_{sn}.pdf'
                cmd = ['curl', '-f', '-L', strLink, '-o', strPdfPath]
                result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

                # Report the real outcome instead of assuming success
                if result.returncode == 0:
                    print(strPdfPath)
                else:
                    print(f'[download] failed (curl exit code {result.returncode}): {strLink}')
            sn += 1

In [None]:
# Step 1: collect the pagination links of the landing page into setPagination
getPaginationLink()

In [None]:
# Step 2: collect the journal rows of every pagination page into listData
getMainLinks()

In [None]:
# Step 3: collect the chapter data of every journal (stored under each record's 'sub' key)
getDetailedData()

In [None]:
# Step 4: save listData as json
saveJson()

In [None]:
# Step 5: save as excel (reads the json file written by saveJson)
saveExcel()

In [None]:
# Step 6: download the PDF files and measure the wall-clock time of the whole step
time_begin = time.time()
download()
time_end = time.time()
# Report the elapsed time in seconds
print(f"總共執行了 { time_end - time_begin } 秒")