In [39]:
# CNS11643中文標準交換碼全字庫(簡稱全字庫) https://data.gov.tw/dataset/5961
import os
import re
import csv
import time
import random
import requests
import pandas as pd
import logging

from lxml import etree
from fake_useragent import UserAgent

# Layout of each log record and of the %(asctime)s timestamp inside it.
LOGGING_FORMAT = '%(asctime)s %(levelname)s: %(message)s'
DATE_FORMAT = '%Y%m%d %H:%M:%S'
# Append ERROR-and-above records to crawler.log. DATE_FORMAT was previously
# defined but never handed to basicConfig, so it had no effect; datefmt wires it in.
logging.basicConfig(level=logging.ERROR, filename='crawler.log', filemode='a', format=LOGGING_FORMAT, datefmt=DATE_FORMAT)

# Shared fake_useragent generator; getPage reads user_agent.random for each request header.
user_agent = UserAgent()

In [2]:
# V 從 筆劃 直接爬

# https://www.cns11643.gov.tw/search.jsp?ID=11&SN=3&PAGE=2
# ID=11 : 筆劃查詢
# SN=3 : 筆劃查詢 < 3 劃 >
# PAGE=2 : 頁數

# func(筆劃, 頁數)

In [None]:
def logging_and_print(string):
    """Record ``string`` through the root logger at CRITICAL level, then print it.

    Parameters
    ----------
    string : str
        Message to log and to echo on standard output.
    """
    logging.log(logging.CRITICAL, string)
    print(string)

In [3]:
# Base URL for the crawl; ID=11 selects the stroke-count query (筆劃查詢).
url = "https://www.cns11643.gov.tw/search.jsp?ID=11"

def getPage(url, max_retries=5):
    """Fetch ``url`` and return the response body as text, retrying on request errors.

    Each attempt uses a fresh session (so no cookies carry over) and a random
    user-agent string taken from ``user_agent``.

    Parameters
    ----------
    url : str
        Address to request.
    max_retries : int, default 5
        Maximum number of attempts before giving up.

    Returns
    -------
    str
        Decoded response body of the first successful attempt.

    Raises
    ------
    ValueError
        If ``max_retries`` is smaller than 1, so no attempt can be made.
    requests.exceptions.RequestException
        The error of the last attempt, once every attempt has failed. The
        previous implementation fell off the end and returned ``None`` here,
        which only surfaced later as an unrelated parsing error.
    """
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries!r}")

    last_error = None

    for _ in range(max_retries):
        try:
            # The context manager closes the session (and its pooled connections)
            # after every attempt instead of leaking one session per retry.
            with requests.Session() as session:
                res = session.get(url, headers={'user-agent': user_agent.random}, timeout=(5.05, 27)) # connect & read timeout
                # Treat HTTP error statuses as failures so they are retried
                # rather than returned as if they were a valid page.
                res.raise_for_status()
                return res.text

        except requests.exceptions.RequestException as exc:
            last_error = exc

    logging.error("[Crawler] Giving up on %s after %s attempts: %r", url, max_retries, last_error)
    raise last_error

# Collect every stroke label together with its character count.
def get_stroke_and_nums_list(htmltext):
    """Extract (stroke label, character count) pairs from the stroke index page.

    Parameters
    ----------
    htmltext : str
        HTML of the stroke index page.

    Returns
    -------
    list of tuple
        One ``(label, count)`` pair per link found, where ``count`` is the
        span text with its parentheses removed (still a string).
    """
    tree = etree.HTML(htmltext)

    labels = list(tree.xpath("//td/div[@class='float2 part8p']/a/text()"))
    raw_counts = list(tree.xpath("//td/div[@class='float2 part8p']/a/span/text()"))

    # The counts are rendered as "(N)"; drop both parentheses.
    counts = [raw.replace("(", "").replace(")", "") for raw in raw_counts]

    return list(zip(labels, counts))

In [4]:
# Smoke check: fetch the stroke index page and display the raw HTML it returns.
getPage(url)

'<!DOCTYPE html>\r\n<html lang="zh-Hant">\r\n<head>\r\n<!-- Global site tag (gtag.js) - Google Analytics -->\r\n<script async src="https://www.googletagmanager.com/gtag/js?id=UA-106164561-2"></script>\r\n<script>\r\n  window.dataLayer = window.dataLayer || [];\r\n  function gtag(){dataLayer.push(arguments);}\r\n  gtag(\'js\', new Date());\r\n\r\n  gtag(\'config\', \'UA-106164561-2\');\r\n</script>\r\n<link rel="canonical" href="https://www.cns11643.gov.tw/search.jsp?ID=11&SN=&lang=tw" />\r\n<link rel="alternate" href="https://www.cns11643.gov.tw/search.jsp?ID=11&SN=&lang=tw" hreflang="zh-Hant" />\r\n<link rel="alternate" href="https://www.cns11643.gov.tw/search.jsp?ID=11&SN=&lang=en" hreflang="en" />\r\n\r\n<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />\r\n<meta name="keywords"\tcontent="中文,全字庫,中文難字,字碼查詢,字典,注音,字形,字典,中文字形,中文字碼">\r\n<meta name="author"\t\tcontent="CMEX中文數位化推廣基金會">\r\n<meta name="viewport"\tcontent="width=device-width, initial-scale=1" >\r\n<meta htt

In [5]:
# Download the stroke index page and tabulate it as (stroke label, character count).
htmltext = getPage(url)

stroke_and_nums_list = get_stroke_and_nums_list(htmltext)
stroke_and_nums_df = (
    pd.DataFrame(stroke_and_nums_list, columns=['筆劃', '字數'])
    .astype({'字數': int})  # counts are scraped as strings
)
# stroke_and_nums_df.to_csv('stroke_and_nums.csv', index=0)

In [6]:
# # 要爬的
# waiting_queue = stroke_and_nums_df.loc[stroke_and_nums_df['字數'].astype(int) != 1]

# # 只有一筆的先跳過，手動加入
# skipped = stroke_and_nums_df.loc[stroke_and_nums_df['字數'].astype(int) == 1]

In [7]:
# Crawl every stroke count; the single-character filter in the commented-out cell is not applied.
waiting_queue = stroke_and_nums_df

In [8]:
# Inspect the crawl queue (stroke label and character count per row).
waiting_queue

Unnamed: 0,筆劃,字數
0,1畫,71
1,2畫,138
2,3畫,246
3,4畫,558
4,5畫,935
5,6畫,1812
6,7畫,3036
7,8畫,4388
8,9畫,5648
9,10畫,6591


In [9]:
# Look up the last page number of the result listing for one stroke count.
def get_last_page_num(base_url, stroke):
    """Return ``{stroke: last_page_num}`` for the given stroke count.

    Parameters
    ----------
    base_url : str
        Search URL that the ``&SN=`` query is appended to.
    stroke : str or int
        Stroke count to query.

    Returns
    -------
    dict
        Single-entry mapping from ``stroke`` to the last page number, kept as
        a string.

    Raises
    ------
    ValueError
        If the page title is neither of the two handled values, or the pager
        text contains no ``/`` separator. Previously the first case surfaced
        as an UnboundLocalError and the second as an IndexError.
    """
    target_url = f'{base_url}&SN={stroke}'

    htmltext = getPage(target_url)

    xdoc = etree.HTML(htmltext)

    # Evaluate the page title once; it decides which layout was returned.
    page_name = "".join(xdoc.xpath("//table[@class='frame']//td[@class='pageName']/text()"))

    if page_name == '筆畫查詢':
        # Result listing: the pager text is split on '/' after removing the
        # brackets, and the second piece is used as the last page number.
        pager_text = "".join(xdoc.xpath("//div[@class='pager']/span/text()"))
        pager_parts = pager_text.replace('[', '').replace(']', '').split('/')
        if len(pager_parts) < 2:
            raise ValueError(f"Unexpected pager text {pager_text!r} at {target_url}")
        last_page_num = pager_parts[1]

    elif page_name == '字形資訊':
        # Glyph-information page, presumably shown when the stroke count has a
        # single character (TODO confirm); treated as one page.
        last_page_num = '1'

    else:
        raise ValueError(f"Unexpected page title {page_name!r} at {target_url}")

    return {stroke : last_page_num}

In [10]:
# Map every stroke count in the queue to the last page number of its listing.
tmp = {}

# The labels carry a unit suffix; keep only the leading run of digits.
queue = [re.search("[0-9]+", label).group() for label in waiting_queue['筆劃']]

for stroke in queue:
    last_page_entry = get_last_page_num(url, stroke)
    tmp.update(last_page_entry)

logging_and_print('[Crawler] Last page num of each character scrapped.')

In [11]:
# Inspect the stroke count -> last page number mapping.
tmp

{'1': '2',
 '2': '3',
 '3': '5',
 '4': '12',
 '5': '19',
 '6': '37',
 '7': '61',
 '8': '88',
 '9': '113',
 '10': '132',
 '11': '153',
 '12': '168',
 '13': '165',
 '14': '158',
 '15': '154',
 '16': '140',
 '17': '111',
 '18': '98',
 '19': '80',
 '20': '62',
 '21': '50',
 '22': '37',
 '23': '29',
 '24': '22',
 '25': '15',
 '26': '10',
 '27': '8',
 '28': '5',
 '29': '3',
 '30': '3',
 '31': '2',
 '32': '2',
 '33': '1',
 '34': '1',
 '35': '1',
 '36': '1',
 '37': '1',
 '38': '1',
 '39': '1',
 '40': '1',
 '41': '1',
 '43': '1',
 '44': '1',
 '46': '1',
 '48': '1',
 '52': '1',
 '60': '1',
 '64': '1'}

In [12]:
# Tabulate the mapping: one row per stroke count with its last page number.
stroke_and_last_page_num_df = pd.DataFrame(
    {
        'stroke': list(tmp.keys()),
        'last_page_num': list(tmp.values()),
    }
)

stroke_and_last_page_num_df

Unnamed: 0,stroke,last_page_num
0,1,2
1,2,3
2,3,5
3,4,12
4,5,19
5,6,37
6,7,61
7,8,88
8,9,113
9,10,132


In [13]:
def get_words_by_stroke_and_page_num(base_url, stroke, page_num):
    """Fetch one result page and pair every character found on it with ``stroke``.

    Parameters
    ----------
    base_url : str
        Search URL that the ``&SN=`` and ``&PAGE=`` queries are appended to.
    stroke : str or int
        Stroke count to query.
    page_num : str or int
        Page of the listing to fetch.

    Returns
    -------
    iterator of tuple
        ``(character, stroke)`` pairs, where the character is read from the
        ``alt`` attribute of the glyph images.

    Raises
    ------
    ValueError
        If the page title is neither of the two handled values. Previously
        this surfaced as an UnboundLocalError on ``words``.
    """
    target_url = f'{base_url}&SN={stroke}&PAGE={page_num}'

    htmltext = getPage(target_url)

    xdoc = etree.HTML(htmltext)

    # Evaluate the page title once; it decides which layout was returned.
    page_name = "".join(xdoc.xpath("//table[@class='frame']//td[@class='pageName']/text()"))

    if page_name == '筆畫查詢':
        # Result listing: one linked glyph image per character.
        words = list(xdoc.xpath("//div[@class='wordList']/span/a/img/@alt"))

    elif page_name == '字形資訊':
        # Glyph-information page: the character is read from a different image node.
        words = list(xdoc.xpath("//div[@class='col2 Lt Ft'][2]//div[2]/img/@alt"))

    else:
        raise ValueError(f"Unexpected page title {page_name!r} at {target_url}")

    dicts = [(word, stroke) for word in words]

    return iter(dicts)
    

In [16]:
# Spot check for stroke count 17, page 1; the function returns an iterator, so only its repr is shown.
get_words_by_stroke_and_page_num(url, 17, 1)

<list_iterator at 0x15b93355688>

In [40]:
# CSV file that receives the (character, stroke count) rows written by the crawl loop.
output_file = '漢字筆畫對照表_全字庫.csv'

def create_file(output_file):
    """Make sure ``output_file`` exists and begins with the CSV header row.

    Three cases are handled:

    * the file does not exist: it is created containing only the header;
    * the first line already mentions both header fields: nothing is written;
    * the file exists without a header: the header is inserted in front of
      the existing content. The previous implementation appended it at the
      end of the file, after any data rows.

    Parameters
    ----------
    output_file : str
        Path of the CSV file to prepare.

    Returns
    -------
    None
    """
    header = ['字', '筆畫']

    if not os.path.exists(output_file):
        # Create the file and write the header in one step. The context manager
        # closes the handle, which the old ``open(...).close`` (no call) never did.
        with open(output_file, 'w', newline='', encoding='UTF-8') as f:
            csv.writer(f).writerow(header)
        logging_and_print(f'[Crawler] {output_file} created.')
        logging_and_print(f'[Crawler] {output_file} header added.')
        return

    # newline='' keeps the original line endings so the content can be copied verbatim.
    with open(output_file, 'r', newline='', encoding='UTF-8') as f:
        first_line = f.readline()

        if "字" in first_line and "筆畫" in first_line:
            logging_and_print(f'[Crawler] {output_file} existed.')
            return

        remainder = f.read()

    # Build the corrected file next to the original and swap it in, so an
    # interruption cannot leave a half-written output file behind.
    staging_path = f'{output_file}.tmp'
    with open(staging_path, 'w', newline='', encoding='UTF-8') as f:
        csv.writer(f).writerow(header)
        f.write(first_line)
        f.write(remainder)
    os.replace(staging_path, output_file)

    logging_and_print(f'[Crawler] {output_file} header added.')


In [41]:
# Prepare the output CSV (create it and/or add the header row) before crawling.
create_file(output_file) 

In [18]:
word_stroke_dict = {}

# Stroke counts and their last page numbers are stored as strings; convert to int.
stroke_lst = [int(value) for value in stroke_and_last_page_num_df['stroke'].tolist()]
stroke_last_page_num = [int(value) for value in stroke_and_last_page_num_df['last_page_num'].tolist()]
# stroke_lst = [4, 5]
# stroke_last_page_num = [12, 19]

# Candidate pauses (seconds) between requests; one is picked at random per page.
delay = [1, 3, 5, 30]

# (stroke, page_num) pairs whose fetch or write failed, kept so they can be retried.
failed_pages = []

# Instantiate
create_file(output_file) 

for stroke, last_page_num in zip(stroke_lst, stroke_last_page_num):
    for page_num in range(1, last_page_num + 1):

        time.sleep(random.choice(delay))

        try:
            # Fetch first, so the file is only opened when there is something to write.
            rows = list(get_words_by_stroke_and_page_num(url, stroke, page_num))

            with open(output_file, 'a', newline='', encoding='UTF-8') as f:
                csv.writer(f).writerows(rows)

            logging_and_print(f"[Crawler] Got stroke:{stroke} & page_num {page_num}")

        # Exception (not a bare except) lets KeyboardInterrupt stop the crawl.
        except Exception:
            failed_pages.append((stroke, page_num))
            logging_and_print(f"[Crawler] Error Occured at stroke:{stroke} & page_num {page_num}")
            # Keep the traceback in crawler.log so the cause is not lost.
            logging.exception("[Crawler] Traceback for stroke:%s & page_num %s", stroke, page_num)


[Crawler] Got stroke:1 & page_num 1
[Crawler] Got stroke:1 & page_num 2
[Crawler] Got stroke:2 & page_num 1
[Crawler] Got stroke:2 & page_num 2
[Crawler] Got stroke:2 & page_num 3
[Crawler] Got stroke:3 & page_num 1
[Crawler] Got stroke:3 & page_num 2
[Crawler] Error Occured at stroke:3 & page_num 3
[Crawler] Got stroke:3 & page_num 4


In [213]:
# NOTE(review): no visible cell populates word_stroke_dict after it is set to {};
# the output shown under this cell comes from execution count 213 — confirm it is still reproducible.
word_stroke_dict

{'Ⅱ': 4,
 '⑽': 4,
 '⼼': 4,
 '⼽': 4,
 '⼾': 4,
 '⼿': 4,
 '⽀': 4,
 '⽁': 4,
 '⽂': 4,
 '⽃': 4,
 '⽄': 4,
 '⽅': 4,
 '⽆': 4,
 '⽇': 4,
 '⽈': 4,
 '⽉': 4,
 '⽊': 4,
 '⽋': 4,
 '⽌': 4,
 '⽍': 4,
 '⽎': 4,
 '⽏': 4,
 '⽐': 4,
 '⽑': 4,
 '⽒': 4,
 '⽓': 4,
 '⽔': 4,
 '⽕': 4,
 '⽖': 4,
 '⽗': 4,
 '⽘': 4,
 '⽙': 4,
 '⽚': 4,
 '⽛': 4,
 '⽜': 4,
 '⽝': 4,
 '丑': 4,
 '丐': 4,
 '不': 4,
 '中': 4,
 '丰': 4,
 '丹': 4,
 '之': 4,
 '尹': 4,
 '予': 4,
 '云': 4,
 '井': 4,
 '互': 4,
 '五': 4,
 '亢': 4,
 '仁': 4,
 '什': 4,
 '仃': 4,
 '仆': 4,
 '仇': 4,
 '仍': 4,
 '今': 4,
 '介': 4,
 '仄': 4,
 '元': 4,
 '允': 4,
 '內': 4,
 '六': 4,
 '兮': 4,
 '公': 4,
 '冗': 4,
 '凶': 4,
 '分': 4,
 '切': 4,
 '刈': 4,
 '勻': 4,
 '勾': 4,
 '勿': 4,
 '化': 4,
 '匹': 4,
 '午': 4,
 '升': 4,
 '卅': 4,
 '卞': 4,
 '厄': 4,
 '友': 4,
 '及': 4,
 '反': 4,
 '壬': 4,
 '天': 4,
 '夫': 4,
 '太': 4,
 '夭': 4,
 '孔': 4,
 '少': 4,
 '尤': 4,
 '尺': 4,
 '屯': 4,
 '巴': 4,
 '幻': 4,
 '廿': 4,
 '弔': 4,
 '引': 4,
 '心': 4,
 '戈': 4,
 '戶': 4,
 '手': 4,
 '扎': 4,
 '支': 4,
 '文': 4,
 '斗': 4,
 '斤': 4,
 '方': 4,
 '日': 4,
 '曰': 4,
 '月': 4,
 