In [27]:
# CNS11643中文標準交換碼全字庫(簡稱全字庫) https://data.gov.tw/dataset/5961
import os
import re
import csv
import time
import json
import random
import requests
import pandas as pd
import logging

from lxml import etree
from fake_useragent import UserAgent

LOGGING_FORMAT = '%(asctime)s %(levelname)s: %(message)s'
DATE_FORMAT = '%Y%m%d %H:%M:%S'
logging.basicConfig(level=logging.ERROR, filename='crawler.log', filemode='a', format=LOGGING_FORMAT)

user_agent = UserAgent()

In [28]:
proxy_pool_url = "http://0.0.0.0:5010/"

def getPage(url, max_retries=5, proxy=None):

    times = 0

    while times < max_retries:

        try:
            session = requests.session()

            if proxy:
                proxies = {'http' : f'http://{proxy}'}
                session.proxies.update(proxies)

            res = session.get(
                url,
                headers={'user-agent': user_agent.random},
                timeout=(5.05, 27)
                ) # connect & read timeout
                
            session.cookies.clear()
            htmltext = res.text
            
            return htmltext

        except requests.exceptions.RequestException:
            times += 1

def get_proxy(proxy_pool_url, option='get'):
    
    proxy_pool = f'{proxy_pool_url}/{option}/'

    r = getPage(proxy_pool)
    response = json.loads(r)
    
    if option == 'get':

        return response['proxy']
        
    elif option == 'all':
        available_proxies = [data['proxy'] for data in response]

        return available_proxies

    elif option == 'count':
        available_proxies_count = response['count']['total']

        return available_proxies_count

In [30]:
available_proxies = get_proxy(proxy_pool_url, option='all')

In [2]:
# # def allocate_proxies(url_count, available_proxies):
# # batch allocate_proxies and url

finished = set()
unfinished = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

# # In allocater

# for url in unfinished:
while unfinished:

    available_proxies = get_proxy(proxy_pool_url, option='all')
    available_proxies_count = len(available_proxies)

    if available_proxies_count > len(unfinished):
        concurrent_task_num = len(unfinished)
    else:
        concurrent_task_num = available_proxies_count
    
    urls_batch = set(random.sample(unfinished, concurrent_task_num))

    print(f'Now handling {urls_batch}')

    finished.update(urls_batch)
    print(f'finished : {finished}')
    unfinished -= urls_batch

    # pass same number of url and proxies
    await asyncio.gather([get_words_by_url_async(url, proxy) for url, proxy in zip(random_url_list, available_proxies)]) # -> await to get text -> parse and writerows
    
    finished.append(random_url_list)
    unfinished.remove(random_url_list)

else:
    print('All done !')
    break


In [13]:
import random

finished = set()
unfinished = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
batch = 3

while unfinished:
    random_list = set(random.sample(a, batch))
    print(f'Now handling {random_list}')

    finished.update(random_list)
    print(f'finished : {finished}')
    unfinished -= random_list
else:
    print(f'[Crawler] All done !')


Now handling {9, 1, 7}
finished : {9, 1, 7}
Now handling {2, 5, 7}
finished : {1, 2, 5, 7, 9}
Now handling {0, 2, 4}
finished : {0, 1, 2, 4, 5, 7, 9}
Now handling {1, 2, 4}
finished : {0, 1, 2, 4, 5, 7, 9}
Now handling {0, 4, 6}
finished : {0, 1, 2, 4, 5, 6, 7, 9}
Now handling {2, 5, 6}
finished : {0, 1, 2, 4, 5, 6, 7, 9}
Now handling {2, 4, 7}
finished : {0, 1, 2, 4, 5, 6, 7, 9}
Now handling {2, 4, 7}
finished : {0, 1, 2, 4, 5, 6, 7, 9}
Now handling {3, 5, 6}
finished : {0, 1, 2, 3, 4, 5, 6, 7, 9}
Now handling {2, 5, 6}
finished : {0, 1, 2, 3, 4, 5, 6, 7, 9}
Now handling {9, 4, 5}
finished : {0, 1, 2, 3, 4, 5, 6, 7, 9}
Now handling {8, 6, 7}
finished : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
[Crawler] All done !


In [32]:
len(available_proxies)

34

In [31]:
available_proxies

['47.92.113.71:80',
 '39.106.228.34:8080',
 '106.15.193.237:8088',
 '166.111.50.197:10080',
 '166.111.74.204:7078',
 '47.251.12.225:3128',
 '150.109.149.72:3128',
 '112.6.117.178:8085',
 '106.15.93.125:8080',
 '58.20.234.243:9091',
 '113.238.142.208:3128',
 '113.195.143.39:8085',
 '131.221.66.65:999',
 '185.110.208.37:8080',
 '8.218.81.68:59394',
 '47.103.30.183:8080',
 '183.213.28.64:3128',
 '183.0.203.167:8118',
 '124.204.33.162:8000',
 '223.96.90.216:8085',
 '47.118.60.3:80',
 '58.20.235.180:9091',
 '120.220.220.95:8085',
 '36.255.211.1:54623',
 '221.238.207.34:8000',
 '106.14.204.226:8080',
 '103.216.103.25:80',
 '47.101.41.163:8080',
 '112.6.117.135:8085',
 '221.4.241.198:9091',
 '183.247.199.126:30001',
 '183.247.199.111:30001',
 '120.26.123.95:8010',
 '120.240.95.40:80']

In [2]:
# V 從 筆劃 直接爬

# https://www.cns11643.gov.tw/search.jsp?ID=11&SN=3&PAGE=2
# ID=11 : 筆劃查詢
# SN=3 : 筆劃查詢 < 3 劃 >
# PAGE=2 : 頁數

# func(筆劃, 頁數)

In [3]:
def logging_and_print(string):

    logging.critical(string)
    print(string)

In [4]:
url = "https://www.cns11643.gov.tw/search.jsp?ID=11"

def getPage(url, max_retries=5, proxy=None):

    times = 0

    while times < max_retries:

        try:
            session = requests.session()

            if proxy:
                proxies = {'http' : f'http://{proxy}'}
                session.proxies.update(proxies)

            res = session.get(
                url,
                headers={'user-agent': user_agent.random},
                timeout=(5.05, 27)
                ) # connect & read timeout
                
            session.cookies.clear()
            htmltext = res.text
            
            return htmltext

        except requests.exceptions.RequestException:
            times += 1

# 獲取全筆劃 & 字數
def get_stroke_and_nums_list(htmltext):

    xdoc = etree.HTML(htmltext)

    stroke_list = list(xdoc.xpath("//td/div[@class='float2 part8p']/a/text()"))
    number_list = list(map(lambda x : x.replace("(", "").replace(")", ""), list(xdoc.xpath("//td/div[@class='float2 part8p']/a/span/text()"))))

    stroke_and_nums_list = list(zip(stroke_list, number_list))
    
    return stroke_and_nums_list

In [5]:
proxy_pool_url = "http://0.0.0.0:5010/get/"

def get_proxy(proxy_pool_url):

    r = getPage(proxy_pool_url)

    response = json.loads(r)
    
    return response['proxy']


In [6]:
get_proxy(proxy_pool_url)

'103.216.103.25:80'

In [7]:
# getPage(url, proxy='58.20.234.243:9091')

In [8]:
# getPage(url)

In [9]:
htmltext = getPage(url)

stroke_and_nums_list = get_stroke_and_nums_list(htmltext)
stroke_and_nums_df = pd.DataFrame(stroke_and_nums_list, columns = ['筆劃', '字數'])
stroke_and_nums_df['字數'] = stroke_and_nums_df['字數'].astype(int)
# stroke_and_nums_df.to_csv('stroke_and_nums.csv', index=0)

In [10]:
# # 要爬的
# waiting_queue = stroke_and_nums_df.loc[stroke_and_nums_df['字數'].astype(int) != 1]

# # 只有一筆的先跳過，手動加入
# skipped = stroke_and_nums_df.loc[stroke_and_nums_df['字數'].astype(int) == 1]

In [11]:
waiting_queue = stroke_and_nums_df

In [12]:
waiting_queue

Unnamed: 0,筆劃,字數
0,1畫,71
1,2畫,138
2,3畫,246
3,4畫,558
4,5畫,935
5,6畫,1812
6,7畫,3036
7,8畫,4388
8,9畫,5648
9,10畫,6591


In [13]:
# 取得各頁面最後一頁的頁數
def get_last_page_num(base_url, stroke):

    stroke_query = f'&SN={stroke}'

    target_url = base_url + stroke_query

    htmltext = getPage(target_url)

    xdoc = etree.HTML(htmltext)

    if "".join(xdoc.xpath("//table[@class='frame']//td[@class='pageName']/text()")) == '筆畫查詢':
        last_page_num = "".join(xdoc.xpath("//div[@class='pager']/span/text()")).replace('[', '').replace(']', '').split('/')[1]

    elif "".join(xdoc.xpath("//table[@class='frame']//td[@class='pageName']/text()")) == '字形資訊':
        last_page_num = '1'
    
    return {stroke : last_page_num}

In [14]:
tmp = {}

queue = list(map(lambda x : re.search("[0-9]+", x).group(), waiting_queue['筆劃']))

for stroke in queue:
    tmp.update(get_last_page_num(url, stroke))
    
logging_and_print(f'[Crawler] Last page num of each character scrapped.')

[Crawler] Last page num of each character scrapped.


In [15]:
tmp

{'1': '2',
 '2': '3',
 '3': '5',
 '4': '12',
 '5': '19',
 '6': '37',
 '7': '61',
 '8': '88',
 '9': '113',
 '10': '132',
 '11': '153',
 '12': '168',
 '13': '165',
 '14': '158',
 '15': '154',
 '16': '140',
 '17': '111',
 '18': '98',
 '19': '80',
 '20': '62',
 '21': '50',
 '22': '37',
 '23': '29',
 '24': '22',
 '25': '15',
 '26': '10',
 '27': '8',
 '28': '5',
 '29': '3',
 '30': '3',
 '31': '2',
 '32': '2',
 '33': '1',
 '34': '1',
 '35': '1',
 '36': '1',
 '37': '1',
 '38': '1',
 '39': '1',
 '40': '1',
 '41': '1',
 '43': '1',
 '44': '1',
 '46': '1',
 '48': '1',
 '52': '1',
 '60': '1',
 '64': '1'}

In [16]:
stroke_and_last_page_num_df = pd.Series(tmp).rename_axis('stroke').to_frame('last_page_num').reset_index()

stroke_and_last_page_num_df

Unnamed: 0,stroke,last_page_num
0,1,2
1,2,3
2,3,5
3,4,12
4,5,19
5,6,37
6,7,61
7,8,88
8,9,113
9,10,132


In [17]:
# needs to perform 1963 times requests
stroke_and_last_page_num_df['last_page_num'].astype(int).sum()

1963

In [18]:
def combine_urls(base_url, stroke, page_num):

    stroke_query = f'&SN={stroke}'
    page_query = f'&PAGE={page_num}'

    target_url = base_url + stroke_query + page_query
    
    return target_url

In [19]:
def get_words_by_stroke_and_page_num(target_url, proxy=None):

    stroke = re.search('(&SN=)([0-9]+)', target_url).group(2)

    htmltext = getPage(target_url, proxy=proxy)

    xdoc = etree.HTML(htmltext)

    if "".join(xdoc.xpath("//table[@class='frame']//td[@class='pageName']/text()")) == '筆畫查詢':
        words = list(xdoc.xpath("//div[@class='wordList']/span/a/img/@alt"))

    elif "".join(xdoc.xpath("//table[@class='frame']//td[@class='pageName']/text()")) == '字形資訊':
        words = list(xdoc.xpath("//div[@class='col2 Lt Ft'][2]//div[2]/img/@alt"))

    # dicts = dict(zip(words, cycle([stroke])))
    dicts = list(zip(words, [stroke] * len(words)))
    
    return iter(dicts)
    

In [20]:
# w = get_words_by_stroke_and_page_num('https://www.cns11643.gov.tw/search.jsp?ID=11&SN=27&PAGE=5', proxy=None)
# with open('w.txt', 'w', encoding='UTF-8') as wd:
#     for item in w:
#         wd.write(f"{item}\n")


In [21]:
# import re

# t = 'https://www.cns11643.gov.tw/search.jsp?ID=11&SN=27&PAGE=5'

# stroke = re.search('(&SN=)([0-9]+)', t).group(2)


In [22]:
output_file = '漢字筆畫對照表_全字庫.csv'

def create_file(output_file):

    if os.path.exists(output_file) == False:
        open(output_file, "w").close
        logging_and_print(f'[Crawler] {output_file} created.')

        return create_file(output_file)

    else:

        with open(output_file, 'r+', encoding='UTF-8') as f:
            line = f.readline()

        if "字" in line and "筆畫" in line:
            logging_and_print(f'[Crawler] {output_file} existed.')

        else:
            with open(output_file, 'a', newline='', encoding='UTF-8') as f:
                writer = csv.writer(f)
                writer.writerow(['字', '筆畫']) # header
                logging_and_print(f'[Crawler] {output_file} header added.')


In [23]:
urls_list = []

stroke_lst = list(map(lambda x : int(x), stroke_and_last_page_num_df['stroke'].tolist()))
stroke_last_page_num = list(map(lambda x : int(x), stroke_and_last_page_num_df['last_page_num'].tolist()))

for stroke, last_page_num in zip(stroke_lst, stroke_last_page_num):
    for page_num in list(map(lambda x : x+1, range(last_page_num))):
        urls_list.append(combine_urls(url, stroke, page_num))

In [24]:
len(urls_list)

1963

In [25]:
# with open('urls_list.txt', 'w') as g:
#     for item in urls_list:
#         g.write("%s\n" % item)

In [26]:
request_times = 0
refresh_freq = 30

word_stroke_dict = {}

# failed = []

stroke_lst = list(map(lambda x : int(x), stroke_and_last_page_num_df['stroke'].tolist()))
stroke_last_page_num = list(map(lambda x : int(x), stroke_and_last_page_num_df['last_page_num'].tolist()))
# stroke_lst = [4, 5]
# stroke_last_page_num = [12, 19]

# Instantiate
create_file(output_file) 

for url in urls_list:

    try:

        delay = [1, 3, 5, 7]
        time.sleep(random.choice(delay))
        
        with open(output_file, 'a', newline='', encoding='UTF-8') as f:

            writer = csv.writer(f)

            if request_times % refresh_freq == 0:

                proxy = get_proxy(proxy_pool_url) 
                logging_and_print(f"[Crawler] Changed proxy as {proxy}") 
            
            writer.writerows(get_words_by_stroke_and_page_num(url, proxy=proxy))
            logging_and_print(f"[Crawler] {url} scrapped.")    
            
            request_times += 1
            
    except:

        logging_and_print(f"[Crawler] Can't get {url}.")

[Crawler] 漢字筆畫對照表_全字庫.csv existed.
[Crawler] Changed proxy as 223.100.215.25:8080
[Crawler] https://www.cns11643.gov.tw/search.jsp?ID=11&SN=1&PAGE=1 scrapped.
[Crawler] https://www.cns11643.gov.tw/search.jsp?ID=11&SN=1&PAGE=2 scrapped.
[Crawler] https://www.cns11643.gov.tw/search.jsp?ID=11&SN=2&PAGE=1 scrapped.
[Crawler] https://www.cns11643.gov.tw/search.jsp?ID=11&SN=2&PAGE=2 scrapped.
[Crawler] https://www.cns11643.gov.tw/search.jsp?ID=11&SN=2&PAGE=3 scrapped.
[Crawler] https://www.cns11643.gov.tw/search.jsp?ID=11&SN=3&PAGE=1 scrapped.
[Crawler] https://www.cns11643.gov.tw/search.jsp?ID=11&SN=3&PAGE=2 scrapped.


In [213]:
word_stroke_dict

{'Ⅱ': 4,
 '⑽': 4,
 '⼼': 4,
 '⼽': 4,
 '⼾': 4,
 '⼿': 4,
 '⽀': 4,
 '⽁': 4,
 '⽂': 4,
 '⽃': 4,
 '⽄': 4,
 '⽅': 4,
 '⽆': 4,
 '⽇': 4,
 '⽈': 4,
 '⽉': 4,
 '⽊': 4,
 '⽋': 4,
 '⽌': 4,
 '⽍': 4,
 '⽎': 4,
 '⽏': 4,
 '⽐': 4,
 '⽑': 4,
 '⽒': 4,
 '⽓': 4,
 '⽔': 4,
 '⽕': 4,
 '⽖': 4,
 '⽗': 4,
 '⽘': 4,
 '⽙': 4,
 '⽚': 4,
 '⽛': 4,
 '⽜': 4,
 '⽝': 4,
 '丑': 4,
 '丐': 4,
 '不': 4,
 '中': 4,
 '丰': 4,
 '丹': 4,
 '之': 4,
 '尹': 4,
 '予': 4,
 '云': 4,
 '井': 4,
 '互': 4,
 '五': 4,
 '亢': 4,
 '仁': 4,
 '什': 4,
 '仃': 4,
 '仆': 4,
 '仇': 4,
 '仍': 4,
 '今': 4,
 '介': 4,
 '仄': 4,
 '元': 4,
 '允': 4,
 '內': 4,
 '六': 4,
 '兮': 4,
 '公': 4,
 '冗': 4,
 '凶': 4,
 '分': 4,
 '切': 4,
 '刈': 4,
 '勻': 4,
 '勾': 4,
 '勿': 4,
 '化': 4,
 '匹': 4,
 '午': 4,
 '升': 4,
 '卅': 4,
 '卞': 4,
 '厄': 4,
 '友': 4,
 '及': 4,
 '反': 4,
 '壬': 4,
 '天': 4,
 '夫': 4,
 '太': 4,
 '夭': 4,
 '孔': 4,
 '少': 4,
 '尤': 4,
 '尺': 4,
 '屯': 4,
 '巴': 4,
 '幻': 4,
 '廿': 4,
 '弔': 4,
 '引': 4,
 '心': 4,
 '戈': 4,
 '戶': 4,
 '手': 4,
 '扎': 4,
 '支': 4,
 '文': 4,
 '斗': 4,
 '斤': 4,
 '方': 4,
 '日': 4,
 '曰': 4,
 '月': 4,
 