# housekeeping

In [12]:
import os
import re
import time
import timeit

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver # webdriver 操作一般用
from selenium.webdriver.chrome import service as fs # Chrome を driver として設定する用
from selenium.webdriver.chrome.options import Options # headless モードで作業する用
from selenium.webdriver.common.by import By # find_element() で参照したい位置を特定する用
from tqdm import tqdm

# 基本設定

In [2]:
# Path to the chromedriver binary.
# Set the CHROMEDRIVER_PATH environment variable to point at your own install;
# the fallback below is the original author's local path and will not exist on
# other machines.
DRIVER_PATH = os.environ.get(
    'CHROMEDRIVER_PATH',
    '/Users/domolm/.pyenv/versions/3.10.8/lib/python3.10/site-packages/selenium/chromedriver',
)
service = fs.Service(executable_path=DRIVER_PATH)

# Browser options: fixed window size so the page layout is predictable.
options = Options()
options.add_argument("--window-size=1920,1200")
# options.add_argument('--headless')  # uncomment to run without a visible browser window

# 起動

In [3]:
# URL of the page to scrape; later cells also use it to build absolute article URLs
BASE_URL = 'https://www3.nhk.or.jp/news/easy'

# Start Chrome with the service/options configured above, then open the page
driver = webdriver.Chrome(service=service, options=options)
driver.get(BASE_URL)

# クリックする

In [4]:
# Click a date link so the article list for that specific date is displayed.
# The window is scrolled to y=1000 first, before the link is looked up and clicked.
first_date_link_xpath = '//*[@id="easy-wrapper"]/div[2]/aside/section[2]/div[1]/a[1]'

driver.execute_script('window.scrollTo(0, 1000);')
element = driver.find_element(By.XPATH, first_date_link_xpath)
element.click()

# driver.quit()


# html データとして取得

In [5]:
# Take the current page's source from the browser and encode it as UTF-8 bytes
html = driver.page_source.encode('utf-8')

In [6]:
# driver.quit()

# Beautiful Soup 用にパースする

In [7]:
# Parse the captured page bytes into a BeautifulSoup tree using the 'html.parser' backend
soup = BeautifulSoup(html, 'html.parser')

# 文章を抽出

In [8]:
# Select every <a> element that is a direct child of the #js-archives-list element
article_raw = soup.select('#js-archives-list > a')


In [9]:
# %%timeit
# href = []
# for ref in article_raw:
#     href.append(ref.get('href'))

# hreff = pd.Series(href).str.replace(r'^./', BASE_URL+'/')



# このコードは 4 件の場合は、315 µs ± 36.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

In [34]:
# Build the list of absolute article URLs from the links selected above.
# Earlier %%timeit measurement of this loop: 7.67 µs ± 28.2 ns per loop
# (mean ± std. dev. of 7 runs, 100,000 loops each).
href = []
for ref in tqdm(article_raw):
    rel_url = ref.get('href')
    if not rel_url:
        # An <a> without an href would make re.sub raise TypeError; skip it.
        continue
    # The dot is escaped so only a literal leading "./" is replaced; an
    # unescaped "." would also rewrite any other character followed by "/".
    href.append(re.sub(r'^\./', BASE_URL + '/', rel_url))

100%|██████████| 4/4 [00:00<00:00, 32577.12it/s]


In [52]:
# Scratch check of time.sleep: waits 0.0, 0.5, ..., 4.5 seconds in turn
# (22.5 seconds in total) and prints the length of each wait after it finishes.
for i in range(10):
    time.sleep(i*0.5)
    print(f'{i*0.5}秒経過')

0.0秒経過
0.5秒経過
1.0秒経過
1.5秒経過
2.0秒経過
2.5秒経過
3.0秒経過
3.5秒経過
4.0秒経過
4.5秒経過


In [21]:
# Display the article URLs collected so far
href

['https://www3.nhk.or.jp/news/easy/k10013903131000/k10013903131000.html',
 'https://www3.nhk.or.jp/news/easy/k10013901891000/k10013901891000.html',
 'https://www3.nhk.or.jp/news/easy/k10013901431000/k10013901431000.html',
 'https://www3.nhk.or.jp/news/easy/k10013901261000/k10013901261000.html']

# 過去分を取得する

In [36]:
# Walk backwards through the archive, one pager click per iteration, and append
# each page's article URLs to `href`. `href` already holds the articles of the
# first date collected in the cells above (Nov 25 at the time of writing), so
# this loop continues from the day before.
MAX_DAYS_BACK = 2500            # upper bound on the number of pager clicks
PAGE_LOAD_WAIT_SEC = 1          # pause after each click before reading the page
MAX_CONSECUTIVE_FAILURES = 3    # give up once this many iterations fail in a row

consecutive_failures = 0
for day in tqdm(range(MAX_DAYS_BACK)):  # 0, 1, 2, ... MAX_DAYS_BACK - 1
    try:
        # Click the pager link to display the article list of the next older date
        driver.execute_script('window.scrollTo(0, 1000);')
        element = driver.find_element(By.XPATH, '//*[@id="easy-wrapper"]/div[2]/aside/section[2]/div[1]/a[2]')
        element.click()

        # Wait once per page, *before* reading the source, so the list has a
        # chance to update after the click. This also spaces out requests.
        time.sleep(PAGE_LOAD_WAIT_SEC)

        # Capture the page as UTF-8 bytes and parse it
        html = driver.page_source.encode('utf-8')
        soup = BeautifulSoup(html, 'html.parser')
        article_raw = soup.select('#js-archives-list > a')

        for ref in article_raw:
            rel_url = ref.get('href')
            if not rel_url:
                # An <a> without an href would make re.sub raise TypeError; skip it.
                continue
            # Escaped dot: only a literal leading "./" is replaced with the base URL.
            href.append(re.sub(r'^\./', BASE_URL + '/', rel_url))

        consecutive_failures = 0

    except Exception as e:
        # Best-effort: report the failure and carry on with the next iteration.
        # `soup` here is the most recently parsed page, so the date printed is
        # that of the last page that was read successfully.
        print(f'{e}: {type(e)}')
        date = soup.select('#js-pager-date')
        print(f'{date}: #{len(href)}th row.')

        # Repeated failures mean the browser session is gone or the pager link
        # no longer exists; stop instead of failing on every remaining iteration.
        consecutive_failures += 1
        if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
            print(f'Stopping after {consecutive_failures} consecutive failures.')
            break


href

 18%|█▊        | 65/365 [18:20<1:24:38, 16.93s/it]


WebDriverException: Message: disconnected: Unable to receive message from renderer
  (failed to check if window was closed: timeout: Timed out receiving message from renderer: 600.000)
  (Session info: chrome=107.0.5304.110)
Stacktrace:
0   chromedriver                        0x000000010a0582c8 chromedriver + 4752072
1   chromedriver                        0x0000000109fd8463 chromedriver + 4228195
2   chromedriver                        0x0000000109c3bb18 chromedriver + 441112
3   chromedriver                        0x0000000109c25f4d chromedriver + 352077
4   chromedriver                        0x0000000109c25ca2 chromedriver + 351394
5   chromedriver                        0x0000000109c24b0d chromedriver + 346893
6   chromedriver                        0x0000000109c24ec2 chromedriver + 347842
7   chromedriver                        0x0000000109c24351 chromedriver + 344913
8   chromedriver                        0x0000000109c32ec0 chromedriver + 405184
9   chromedriver                        0x0000000109c24306 chromedriver + 344838
10  chromedriver                        0x0000000109c258fe chromedriver + 350462
11  chromedriver                        0x0000000109c24b0d chromedriver + 346893
12  chromedriver                        0x0000000109c24ec2 chromedriver + 347842
13  chromedriver                        0x0000000109c24351 chromedriver + 344913
14  chromedriver                        0x0000000109c2ffaa chromedriver + 393130
15  chromedriver                        0x0000000109c24306 chromedriver + 344838
16  chromedriver                        0x0000000109c258fe chromedriver + 350462
17  chromedriver                        0x0000000109c24b0d chromedriver + 346893
18  chromedriver                        0x0000000109c24ec2 chromedriver + 347842
19  chromedriver                        0x0000000109c24351 chromedriver + 344913
20  chromedriver                        0x0000000109c2b52c chromedriver + 374060
21  chromedriver                        0x0000000109c24306 chromedriver + 344838
22  chromedriver                        0x0000000109c258fe chromedriver + 350462
23  chromedriver                        0x0000000109c24b0d chromedriver + 346893
24  chromedriver                        0x0000000109c24ec2 chromedriver + 347842
25  chromedriver                        0x0000000109c24351 chromedriver + 344913
26  chromedriver                        0x0000000109c1e260 chromedriver + 320096
27  chromedriver                        0x0000000109c24306 chromedriver + 344838
28  chromedriver                        0x0000000109c23872 chromedriver + 342130
29  chromedriver                        0x0000000109c23a3d chromedriver + 342589
30  chromedriver                        0x0000000109c23d37 chromedriver + 343351
31  chromedriver                        0x0000000109c23ced chromedriver + 343277
32  chromedriver                        0x0000000109c3d25b chromedriver + 447067
33  chromedriver                        0x0000000109cb1768 chromedriver + 923496
34  chromedriver                        0x0000000109c99b33 chromedriver + 826163
35  chromedriver                        0x0000000109c6a9fd chromedriver + 633341
36  chromedriver                        0x0000000109c6c051 chromedriver + 639057
37  chromedriver                        0x000000010a02530e chromedriver + 4543246
38  chromedriver                        0x000000010a029a88 chromedriver + 4561544
39  chromedriver                        0x000000010a0316df chromedriver + 4593375
40  chromedriver                        0x000000010a02a8fa chromedriver + 4565242
41  chromedriver                        0x000000010a0002cf chromedriver + 4391631
42  chromedriver                        0x000000010a0495b8 chromedriver + 4691384
43  chromedriver                        0x000000010a049739 chromedriver + 4691769
44  chromedriver                        0x000000010a05f81e chromedriver + 4782110
45  libsystem_pthread.dylib             0x00007ff805d13259 _pthread_start + 125
46  libsystem_pthread.dylib             0x00007ff805d0ec7b thread_start + 15


In [13]:
# Scratch check of range(): prints the integers 10 through 19, one per line
for n in range(10, 20):
    print(n)

10
11
12
13
14
15
16
17
18
19


In [38]:
# Number of article URLs collected in `href`
len(href)

264