<a href="https://colab.research.google.com/github/usma11dia0/web_scraping_on_colab/blob/main/web_scraping_on_colab_tribeau.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%shell
# Install Chromium, chromedriver and Selenium on the Colab VM.
#
# Ubuntu no longer distributes chromium-browser outside of snap
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Add debian buster
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# Export each key into the keyring file named by the matching signed-by entry above.
# --batch --yes makes gpg overwrite an existing keyring instead of prompting,
# so this cell can be re-run in the same session.
apt-key export 77E11517 | gpg --batch --yes --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --batch --yes --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --batch --yes --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver
# -y answers the confirmation prompt so the cell does not stall under "Run all".
apt-get update
apt-get install -y chromium chromium-driver

# Install selenium
pip install selenium

Executing: /tmp/apt-key-gpghome.UXzG64NeZZ/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
gpg: key DCC9EFBF77E11517: public key "Debian Stable Release Key (10/buster) <debian-release@lists.debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.4QWzEb5tOe/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
gpg: key DC30D7C23CBBABEE: public key "Debian Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.TZAo3n1OA8/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A
gpg: key 4DFAB270CAA96DFA: public key "Debian Security Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Get:1 http://deb.debian.org/debian buster InRelease [122 kB]
Get:2 http://deb.debian.org/debian bust



In [50]:
# Scraper settings: listing start URL and Selenium implicit wait (passed to driver.implicitly_wait)
TARGET_URL = 'https://tribeau.jp/surgery_sites/1/case_reports'
DRIVER_WAIT_TIME = 5

# Output file location
SUBMIT_FILE_PATH ='/content/drive/MyDrive/Colab Notebooks/dev/web_scraping_crownstrategy/accumulated_list_tribeau_eye.xlsx'

# Container for the scraped data: one independent empty list per output column,
# in the order the columns are listed here.
result_dict = {
    column_name: []
    for column_name in (
        '施術名',
        'クリニック名',
        'クリニック住所',
        'ドクター名',
        'メニュー名',
        '費用',
        '副作用・リスク',
        '症例画像_施術前',
        '症例画像_施術後',
    )
}

In [66]:
# Standard library
import os
import math
import json

# Third-party libraries
import pandas as pd
from google.colab import drive
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import StaleElementReferenceException
# Logging helpers; `config` is used later in the notebook for config.dictConfig(...)
from logging import (
    getLogger,
    StreamHandler,
    DEBUG,
    INFO,
    Formatter,
    config
)
# Mount Google Drive at /content/drive; the logging config and the output file
# paths used by this notebook live under that mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [68]:
# Chrome command-line switches, applied to the options object in this order.
_chrome_flags = [
    '--no-sandbox',
    '--headless',
    '--disable-gpu',
    '--disable-dev-shm-usage',
    '--ignore-certificate-errors',
    "--disable-extensions",
    "--disable-popup-blocking",
    "user-agent=Mozilla/5.0 (X11; Linux x86_64; rv:93.0) Gecko/20100101 Firefox/93.0",
]

# Content settings passed to Chrome through the "prefs" experimental option.
_chrome_prefs = {
    "profile.managed_default_content_settings.images": 2,  # disable images
    "profile.managed_default_content_settings.plugins": 2,  # disable plugins
}

options = Options()
for _chrome_flag in _chrome_flags:
    options.add_argument(_chrome_flag)
options.add_experimental_option("prefs", _chrome_prefs)

# Start the browser session and apply the implicit wait used by element lookups.
driver = webdriver.Chrome(options=options)
driver.implicitly_wait(DRIVER_WAIT_TIME)

In [52]:
# Configure the custom logger from the JSON dictConfig file stored on Drive
with open('/content/drive/MyDrive/Colab Notebooks/dev/web_scraping_crownstrategy/logging_config.json', 'r') as f:
    logger_config = json.load(f)
config.dictConfig(logger_config)
logger = getLogger('main')

# Open the top listing page; on failure log the error and re-raise so the run stops here.
try:
    driver.get(TARGET_URL)
    logger.debug(f'{TARGET_URL}：トップページへ移動しました')
except Exception as e:
    # Interpolate the exception itself ({e}); the previous "[e]" was logged as literal text.
    logger.error(f'{TARGET_URL}：トップページへ移動出来ませんでした: {e}')
    raise

2023-10-19 15:22:54,035 [DEBUG] main: https://tribeau.jp/surgery_sites/1/case_reports：トップページへ移動しました


In [None]:
# For testing
# Derive the total number of pages:
# find the pagination list, collect its page buttons, and parse the text of the
# last button as an integer.
# NOTE(review): these three statements are identical to the page-count cell that
# follows; this scratch cell looks redundant — confirm before removing.
ul_element = driver.find_element(By.CSS_SELECTOR,"[class='Pagination__List-sc-j1ricz-0 hUIXWn']")
li_elements = ul_element.find_elements(By.CSS_SELECTOR,"[class='PageButton__ListItem-sc-5gweb2-1 dbVcqs']")
num_all_pages = int(li_elements[-1].text)
# Bare expression: displays the value as the cell output
num_all_pages


# Commented-out copy of the result_dict initialisation (kept verbatim below).
# result_dict = {
#     '施術名':[],
#     'クリニック名': [],
#     'クリニック住所': [],
#     'ドクター名': [],
#     'メニュー名':[],
#     '費用': [],
#     '副作用・リスク': [],
#     '症例画像_施術前':[],
#     '症例画像_施術後':[],
# }

447

In [53]:
# Work out how many listing pages exist: locate the pagination list, gather its
# page buttons, and read the number shown on the last button.
ul_element = driver.find_element(
    By.CSS_SELECTOR,
    "[class='Pagination__List-sc-j1ricz-0 hUIXWn']",
)
li_elements = ul_element.find_elements(
    By.CSS_SELECTOR,
    "[class='PageButton__ListItem-sc-5gweb2-1 dbVcqs']",
)
last_page_label = li_elements[-1].text
num_all_pages = int(last_page_label)

In [69]:
# Collect the detail-page links from every listing page.

def _collect_detail_urls(driver, max_attempts=2):
    """Return the href of every <a> inside the case list of the current page.

    The lookup is retried when the elements go stale between being found and
    being read, up to ``max_attempts`` tries in total.

    Args:
        driver: Selenium WebDriver already positioned on a listing page.
        max_attempts: Total number of tries before the stale-element error is
            allowed to propagate.

    Returns:
        list: The ``href`` attribute values, in document order.

    Raises:
        StaleElementReferenceException: If the elements are still stale on the
            last attempt.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            list_element = driver.find_element(By.CSS_SELECTOR, "[class='flex flex-col gap-5']")
            anchor_elements = list_element.find_elements(By.TAG_NAME, "a")
            return [anchor.get_attribute('href') for anchor in anchor_elements]
        except StaleElementReferenceException:
            # Elements were replaced while reading them; look them up again.
            if attempt == max_attempts:
                raise


page_num = 1
target_url = TARGET_URL
detail_url_all = []

while page_num <= num_all_pages:
    try:
        driver.get(target_url)
        logger.debug(f'{target_url}：一覧ページ{page_num}へ移動しました')
    except Exception as e:
        # {e} interpolates the exception; "[e]" was previously logged as literal text.
        logger.error(f'{target_url}：一覧ページへ移動出来ませんでした: {e}')
        raise

    try:
        detail_urls = _collect_detail_urls(driver)
    except Exception as e:
        logger.error(f'{target_url}：{target_url}の詳細ページリンク取得に失敗しました: {e}')
        raise

    detail_url_all.extend(detail_urls)
    logger.debug(detail_urls)

    # Advance to the next listing page; the URL is derived from TARGET_URL so the
    # two cannot drift apart.
    page_num += 1
    target_url = f'{TARGET_URL}?page={page_num}'

# Display the collected links once the loop has finished (this expression used
# to sit inside the loop body, where it had no effect).
detail_url_all

2023-10-19 15:41:00,062 [DEBUG] main: https://tribeau.jp/surgery_sites/1/case_reports：一覧ページ1へ移動しました
[<selenium.webdriver.remote.webelement.WebElement (session="0e5e00535f3c2c59f39dcb1fa810d520", element="0f9ac6e3-8a34-4c24-9e3c-e9f4a85d83e3")>, <selenium.webdriver.remote.webelement.WebElement (session="0e5e00535f3c2c59f39dcb1fa810d520", element="c2da3f9e-d665-47fb-b3e5-e298191ff4de")>, <selenium.webdriver.remote.webelement.WebElement (session="0e5e00535f3c2c59f39dcb1fa810d520", element="2a45e73d-88d1-42c3-b424-42c1ee908538")>, <selenium.webdriver.remote.webelement.WebElement (session="0e5e00535f3c2c59f39dcb1fa810d520", element="d0feb8d2-f825-4ab2-b224-453ce3cb70e2")>, <selenium.webdriver.remote.webelement.WebElement (session="0e5e00535f3c2c59f39dcb1fa810d520", element="0bb1dec7-643e-469d-966a-1aca45697145")>, <selenium.webdriver.remote.webelement.WebElement (session="0e5e00535f3c2c59f39dcb1fa810d520", element="6096a0a4-e139-4f1e-9708-16504beaffb5")>, <selenium.webdriver.remote.webeleme

KeyboardInterrupt: ignored

In [None]:
    # Collect each piece of information on the detail page
    # NOTE(review): this cell does not look runnable as written — TODO fix before use:
    #   - every statement is indented although no enclosing block is visible in this cell;
    #   - store_name and store_url are not assigned in any visible cell
    #     (clinic_name is assigned below but never appended);
    #   - the keys '店名', '電話番号', '住所', 'URL' are not among the keys that
    #     result_dict is created with in the configuration cell;
    #   - no driver.get(...) to a detail page is visible before these lookups.
    # Original heading: get clinic name / treatment name / doctor name / case images.
    # The code below only reads three fields: a name, a phone number and an address.
    clinic_name = driver.find_element(By.CSS_SELECTOR,".display-name").text
    phone_number = driver.find_element(By.CSS_SELECTOR,".rstinfo-table__tel-num").text
    address = driver.find_element(By.CSS_SELECTOR,".rstinfo-table__address").text
    logger.debug(f'{target_url}：店舗ページ{page_num}の取得完了')
    # Store the results
    result_dict['店名'].append(store_name)
    result_dict['電話番号'].append(phone_number)
    result_dict['住所'].append(address)
    result_dict['URL'].append(store_url)

MaxRetryError: ignored

In [None]:
# NOTE(review): df_result and ACCU_LIST_PATH are not defined in any visible cell —
# confirm they are created earlier in the notebook before running this cell.

# Write the submission file (Excel) without the index column
df_result.to_excel(SUBMIT_FILE_PATH, header=True, index=False)

# Append to the accumulated list (CSV).
# Only write the header when the file is new or empty; writing it on every
# append would insert a header row into the data on each run.
write_header = not os.path.exists(ACCU_LIST_PATH) or os.path.getsize(ACCU_LIST_PATH) == 0
df_result.to_csv(ACCU_LIST_PATH, mode='a', header=write_header, index=False)

In [None]:
# Check the accumulated list: read the CSV back and display it as the cell output.
# NOTE(review): ACCU_LIST_PATH is not defined in any visible cell — confirm it is set earlier.
df_accu = pd.read_csv(ACCU_LIST_PATH)
df_accu

Unnamed: 0,店名,電話番号,住所,URL
0,うず潮屋 関内店,050-5869-5207,神奈川県横浜市中区真砂町3-33 セルテ 12F,https://tabelog.com/kanagawa/A1401/A140104/140...
1,ザ・肉餃子 四川厨房 横浜本舗,050-5594-1240,神奈川県横浜市西区南幸2-16-11 二幸ビル 6F,https://tabelog.com/kanagawa/A1401/A140102/140...
2,七輪焼肉 安安 横浜北口店,050-5457-1642,神奈川県横浜市神奈川区鶴屋町2-20-1 ＹＴＵビル　Ｂ１Ｆ,https://tabelog.com/kanagawa/A1401/A140101/140...
3,全席個室 楽蔵うたげ 関内駅前店,050-5456-1473,神奈川県横浜市中区港町4-15-2 フィル・パーク 2・3F,https://tabelog.com/kanagawa/A1401/A140104/140...
4,吾照里 ダイナシティ小田原店,050-5872-5981,神奈川県小田原市中里313-12 小田原ダイナシティーウォーク,https://tabelog.com/kanagawa/A1409/A140901/140...
...,...,...,...,...
995,原価ビストロBAN！ 溝の口,044-833-8585,神奈川県川崎市高津区ニ子5-9-6,https://tabelog.com/kanagawa/A1405/A140505/140...
996,まちノ食堂,050-5589-5904,神奈川県横浜市港北区日吉本町1-19-20 厚川ビル 1F,https://tabelog.com/kanagawa/A1401/A140204/140...
997,海鮮カフェ&バー グラベル,050-5597-9614,神奈川県相模原市中央区清新1-6-18 けやきビル 1F,https://tabelog.com/kanagawa/A1407/A140701/140...
998,橙家 横浜みなとみらい東急スクエア店,050-5868-3683,神奈川県横浜市西区みなとみらい2-3-8 みなとみらい東急スクエア③ 4F,https://tabelog.com/kanagawa/A1401/A140103/140...
