# WEB幹事スクレイピング（基礎編）

In [1]:
from time import sleep

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

In [2]:
"""
pandas==1.3.3
beautifulsoup4==4.10.0
webdriver-manager==3.5.1
requests
tqdm
"""

'\npandas==1.3.3\nbeautifulsoup4==4.10.0\nwebdriver-manager==3.5.1\nrequests\ntqdm\n#openpyxl==3.0.9\n'

In [3]:
options = Options()
# options.add_argument('--headless')
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

[WDM] - 

[WDM] - Current google-chrome version is 95.0.4638
[WDM] - Get LATEST driver version for 95.0.4638
[WDM] - Driver [/Users/takada/.wdm/drivers/chromedriver/mac64/95.0.4638.54/chromedriver] found in cache


In [4]:
driver.quit()

## 1. 検索結果1ページ目から個別ページのリンクを取得する

In [5]:
url = 'https://web-kanji.com/search/akita'
driver.get(url)

MaxRetryError: HTTPConnectionPool(host='localhost', port=54056): Max retries exceeded with url: /session/e3dbbe0d275843ecc66c56e9032cbda8/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe460885d30>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [None]:
company_items = driver.find_elements(By.CSS_SELECTOR, '.companies-item')

In [None]:
print(len(company_items))
print(company_items[0].get_attribute('href'))

In [None]:
single_company_urls = [item.get_attribute('href') for item in company_items]

### 関数化

In [None]:
def fetch_single_company_urls(driver, url):
    """
    パラメータ:
        driver: WebDriver
        url: 検索結果の一覧ページ
    返却値:
        single_company_urls: 制作会社個別ページのURLリスト

    urlに載っている制作会社個別ページのURLを取得する
    """
    driver.get(url)
    sleep(1)
    company_items = driver.find_elements(By.CSS_SELECTOR, '.companies-item')
    single_company_urls = [item.get_attribute('href') for item in company_items]
    return single_company_urls


## 2. 個別ページの会社概要の情報を抽出する

In [None]:
url = single_company_urls[0]
driver.get(url)

In [None]:
#dl_element = driver.find_elements(By.CSS_SELECTOR, '.company-data.is-narrow')[0]
dl_element = driver.find_element(By.CSS_SELECTOR, '.company-data.is-narrow')
dt_elements = dl_element.find_elements(By.CSS_SELECTOR, 'dt')
dd_elements = dl_element.find_elements(By.CSS_SELECTOR, 'dd')

datum = {}
for dt, dd in zip(dt_elements, dd_elements):
    datum[dt.text] = dd.text.replace('\n', ' ')
datum

### 関数化

In [None]:
def extract_company_data(driver, url):
    """
    パラメータ:
        driver: WebDriver
        url: 個別ページURL
    返却値:
        data: 制作会社の会社情報のdict
              (例) 
              {
                  'WEB幹事_URL': 'http://web-kanji....', 
                  '会社名': 'xxx株式会社', 
                  '代表': '山田 太郎', 
                  'URL': 'http://xxx.co.jp/', 
                  ...
              }
    """
    driver.get(url)
    sleep(1)
    dl_element = driver.find_element(By.CSS_SELECTOR, '.company-data.is-narrow')
    dt_elements = dl_element.find_elements(By.CSS_SELECTOR, 'dt')
    dd_elements = dl_element.find_elements(By.CSS_SELECTOR, 'dd')

    data = {}
    data['WEB幹事_URL'] = url
    for dt, dd in zip(dt_elements, dd_elements):
        data[dt.text] = dd.text.replace('\n', ' ')
    return data

## 3. 20社の会社概要を取得する ※ 今回は3社

ただし、今回は同時に20件アクセスするとサーバに負荷をかけてしまうので3件とします。

In [None]:
search_url = 'https://web-kanji.com/search/akita'
single_company_urls = fetch_single_company_urls(driver, search_url)

In [None]:
# 3社のみにフィルタ
print(len(single_company_urls))
single_company_urls = single_company_urls[:3]
print(len(single_company_urls))

In [None]:
company_data = [extract_company_data(driver, url) for url in single_company_urls]

In [None]:
df = pd.DataFrame(company_data).fillna('')
df

## 4. 検索結果の2ページ目以降に掲載されている制作会社個別ページも取得する

### 方法1 (易しい、手作業多い)

検索結果のページネーションURLを直書きで指定する

In [None]:
search_urls = [
    'https://web-kanji.com/search/akita',
    'https://web-kanji.com/search/akita/page/2',
]

In [None]:
single_company_urls = []
for url in search_urls:
    urls = fetch_single_company_urls(driver, url)
    single_company_urls.extend(urls)

print(len(single_company_urls))
single_company_urls = single_company_urls[:3]
print(len(single_company_urls))

In [None]:
company_data = [extract_company_data(driver, url) for url in single_company_urls]
df = pd.DataFrame(company_data).fillna('')
df

### 方法2 (難しい、手作業少ない)

In [None]:
search_url = 'https://web-kanji.com/search/akita'
driver.get(search_url)

In [None]:
search_urls = []
current_url = search_url
while True:
    driver.get(current_url)
    sleep(1)
    search_urls.append(current_url)
    try:
        next_a = driver.find_element(By.CSS_SELECTOR, '.pagination-item > a[rel="next"]')
    except NoSuchElementException:
        break
    current_url = next_a.get_attribute('href')

### 関数化

In [None]:
def fetch_search_pagination_urls(driver, search_url):
    """
    パラメータ:
        driver: WebDriver
        search_url: 検索結果１ページ目のURL
    返却値:
        search_urls: 検索条件のページネーションURLリスト
    
    検索結果のページネーションを自動で検知してページネーションのURLをすべて取得する
    """
    search_urls = []
    current_url = search_url
    while True:
        driver.get(current_url)
        sleep(1)
        search_urls.append(current_url)
        try:
            next_a = driver.find_element(By.CSS_SELECTOR, '.pagination-item > a[rel="next"]')
        except NoSuchElementException:
            break
        current_url = next_a.get_attribute('href')
    return search_urls

In [None]:
search_urls = fetch_search_pagination_urls(driver, search_url)
search_urls

In [None]:
single_company_urls = []
for url in search_urls:
    urls = fetch_single_company_urls(driver, url)
    single_company_urls.extend(urls)
print(len(single_company_urls))
single_company_urls = single_company_urls[:3]

In [None]:
company_data = [extract_company_data(driver, url) for url in single_company_urls]
df = pd.DataFrame(company_data).fillna('')
df

## 5. 抽出したデータをファイルに書き出す

### tsvフォーマットで書き出す

In [None]:
df.to_csv('company_data.tsv', sep='\t')

In [None]:
!pip install webdriver_manager

In [6]:
!pip freeze | grep webdri

webdriver-manager==3.5.1


In [None]:
try:
    3 / 0
except ZeroDivisionError:
    print('0で割ってるよ')