<a href="https://colab.research.google.com/github/usma11dia0/web_scraping_on_colab/blob/main/web_scraping_on_colab_tribeau.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%shell
# Install Chromium, chromium-driver and Selenium on the notebook VM.
#
# Ubuntu no longer distributes chromium-browser outside of snap, so the
# packages are pulled from the Debian buster repositories instead.
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Add debian buster
# Each entry is verified with its own keyring file (signed-by=...); those
# keyring files are created by the "apt-key export" commands further down.
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
# Fetch the three Debian signing keys from the Ubuntu keyserver.
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# Export each key into the keyring file named in the sources list above.
apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
# NOTE(review): the first stanza pins release "eoan"; the apt output of this
# cell shows "jammy" repositories, so confirm that stanza still matches anything.
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver
# -y answers the confirmation prompt so the cell can run unattended
# (e.g. under "Run all").
apt-get update
apt-get install -y chromium chromium-driver

# Install selenium
pip install selenium

Executing: /tmp/apt-key-gpghome.WjiupQZLIM/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
gpg: key DCC9EFBF77E11517: public key "Debian Stable Release Key (10/buster) <debian-release@lists.debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.DZeMT8HAtD/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
gpg: key DC30D7C23CBBABEE: public key "Debian Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.6WsrEBzise/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A
gpg: key 4DFAB270CAA96DFA: public key "Debian Security Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 http://de



In [None]:
# Run settings
GET_START_PAGE = 1
GET_NUM = 200
TARGET_URL = f'https://tribeau.jp/surgery_sites/1/case_reports?page={GET_START_PAGE}'

DRIVER_WAIT_TIME = 5

# Output file locations
SUBMIT_FILE_PATH =f'/content/drive/MyDrive/Colab Notebooks/dev/web_scraping_crownstrategy/accumulated_list_tribeau_eye_{GET_START_PAGE}.xlsx'

# Accumulator for the scraped data: one independent, initially empty list per
# output column, in the order the columns are listed here.
result_dict = {
    column_name: []
    for column_name in (
        '施術',
        'クリニック名',
        'クリニック住所',
        'URL',
        'ドクター名',
        'メニュー名',
        '費用',
        '副作用・リスク',
        '患者属性',
        '症例画像',
    )
}

In [None]:
#標準ライブラリ
import os
import math
import json
from copy import deepcopy

#サードパーティライブラリ
import pandas as pd
from google.colab import drive
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException
from logging import (
    getLogger,
    StreamHandler,
    DEBUG,
    INFO,
    Formatter,
    config
)
# Mount Google Drive at /content/drive; the logging config, the saved URL list
# and the Excel outputs used by later cells all live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Build the Chrome launch options, then start the browser session.
options = Options()

# Command-line switches passed to Chrome, applied in this order.
for chrome_flag in (
    '--no-sandbox',
    '--headless',
    '--disable-gpu',
    '--disable-dev-shm-usage',
    '--ignore-certificate-errors',
    "--disable-extensions",
    "--disable-popup-blocking",
    "user-agent=Mozilla/5.0 (X11; Linux x86_64; rv:93.0) Gecko/20100101 Firefox/93.0",
):
    options.add_argument(chrome_flag)

content_prefs = {
    "profile.managed_default_content_settings.images": 2,  # disable images
    "profile.managed_default_content_settings.plugins": 2,  # disable plugins
}
options.add_experimental_option("prefs", content_prefs)

driver = webdriver.Chrome(options=options)
# Element lookups wait up to DRIVER_WAIT_TIME seconds before giving up.
driver.implicitly_wait(DRIVER_WAIT_TIME)

In [None]:
# Configure the custom logger from the JSON dictConfig file stored on Drive.
with open('/content/drive/MyDrive/Colab Notebooks/dev/web_scraping_crownstrategy/logging_config.json', 'r') as f:
    logger_config = json.load(f)
config.dictConfig(logger_config)
logger = getLogger('main')

# Open the first listing page. A failure here is logged and re-raised because
# the following cells need this page to be loaded.
try:
      driver.get(TARGET_URL)
      logger.debug(f'{TARGET_URL}：トップページへ移動しました')
except Exception as e:
      # Interpolate the exception itself; the message previously contained the
      # literal text "[e]".
      logger.error(f'{TARGET_URL}：トップページへ移動出来ませんでした: [{e}]')
      raise

2023-10-21 11:13:11,167 [DEBUG] main: https://tribeau.jp/surgery_sites/1/case_reports?page=1：トップページへ移動しました


In [None]:
#全ページ数を導出
ul_element = driver.find_element(By.CSS_SELECTOR,"[class='Pagination__List-sc-j1ricz-0 hUIXWn']")
li_elements = ul_element.find_elements(By.CSS_SELECTOR,"[class='PageButton__ListItem-sc-5gweb2-1 dbVcqs']")
num_all_pages = int(li_elements[-1].text)

In [None]:
#一覧ページより詳細ページのリンク先取得
page_num = GET_START_PAGE
end_num = num_all_pages
# end_num = GET_START_PAGE + GET_NUM
target_url = TARGET_URL
detail_url_all = []
retry_count = 0
max_retry = 5

def get_detail_page(page_num, max_retry=5):
  """Return the detail-page links found on the currently loaded listing page.

  Uses the module-level ``driver`` and ``logger``.

  Args:
    page_num: Listing page number; used to rebuild the listing URL when the
      page has to be reloaded and in log messages.
    max_retry: Maximum number of retries after the first attempt. Retries are
      bounded so a persistently stale or missing element can no longer recurse
      until ``RecursionError``.

  Returns:
    A list with the ``href`` of every ``<a>`` inside the listing ``<ul>``.
    An empty list when the links could not be read, so callers can always
    ``extend`` with the result.

  Raises:
    Whatever ``driver.get`` raises if reloading the listing page fails.
  """
  # Built up-front so every branch, including the generic error handler, can
  # refer to it.
  page_url = f'https://tribeau.jp/surgery_sites/1/case_reports?page={page_num}'

  for _attempt in range(max(max_retry, 0) + 1):
    try:
      ul_element = driver.find_element(By.CSS_SELECTOR,"ul.flex.flex-col.gap-5")
      a_elements = ul_element.find_elements(By.TAG_NAME, "a")
      # Collect the link target of every anchor in the list.
      return [a_element.get_attribute('href') for a_element in a_elements]
    except StaleElementReferenceException:
      # The DOM changed under us; look the elements up again.
      continue
    except NoSuchElementException:
      # The list is not on the current page; reload the listing page and retry.
      driver.get(page_url)
    except Exception as e:
      logger.error(f'{page_url}の詳細ページリンク取得に失敗しました: [{e}]')
      return []

  logger.error(f'{page_url}の詳細ページリンク取得に失敗しました: [リトライ上限({max_retry}回)に到達]')
  return []

# Walk the listing pages from page_num through end_num and gather every
# detail-page link into detail_url_all.
while page_num <= end_num:
  try:
    driver.get(target_url)
    logger.debug(f'{target_url}：一覧ページ{page_num}へ移動しました')
  except Exception as e:
    # Interpolate the exception; the message previously contained the literal
    # text "[e]".
    logger.error(f'{target_url}：一覧ページへ移動出来ませんでした: [{e}]')

  # Collect the detail-page links of this listing page.
  try:
    # get_detail_page may return None on failure; treat that as "no links" so
    # extend() does not raise TypeError and abort the crawl.
    detail_urls = get_detail_page(page_num) or []
    detail_url_all.extend(detail_urls)
    logger.debug(detail_urls)
  except NoSuchElementException as e:
    logger.error("Element not found: %s", str(e))

  # Advance to the next listing page.
  page_num += 1
  target_url = f'https://tribeau.jp/surgery_sites/1/case_reports?page={page_num}'

print(detail_url_all)

2023-10-21 04:19:56,736 [DEBUG] main: https://tribeau.jp/surgery_sites/1/case_reports?page=1：一覧ページ1へ移動しました
2023-10-21 04:19:58,352 [DEBUG] main: ['https://tribeau.jp/case_reports/31798', 'https://tribeau.jp/case_reports/14731', 'https://tribeau.jp/case_reports/14286', 'https://tribeau.jp/case_reports/31356', 'https://tribeau.jp/case_reports/31627', 'https://tribeau.jp/case_reports/31454', 'https://tribeau.jp/case_reports/31834', 'https://tribeau.jp/case_reports/14878', 'https://tribeau.jp/case_reports/31642', 'https://tribeau.jp/case_reports/14732', 'https://tribeau.jp/case_reports/31291', 'https://tribeau.jp/case_reports/31032', 'https://tribeau.jp/case_reports/14877', 'https://tribeau.jp/case_reports/12899', 'https://tribeau.jp/case_reports/12894', 'https://tribeau.jp/case_reports/12898', 'https://tribeau.jp/case_reports/12942', 'https://tribeau.jp/case_reports/12943', 'https://tribeau.jp/case_reports/27436', 'https://tribeau.jp/case_reports/13991']
2023-10-21 04:20:00,611 [DEBUG] ma

In [None]:
# Load the URL list from JSON (used when resuming a partially finished run).
# Starts a new browser session, opens the first listing page, and replaces
# detail_url_all with the saved URL list from index 5401 onward.
# NOTE(review): the driver created in the earlier cell is not quit before being
# replaced here — confirm whether that session should be closed first.
# NOTE(review): no code visible in this notebook writes detail_url_all.json;
# it is presumably produced separately — TODO confirm.
driver = webdriver.Chrome(options=options)
driver.implicitly_wait(DRIVER_WAIT_TIME)
TARGET_URL = f'https://tribeau.jp/surgery_sites/1/case_reports?page=1'
driver.get(TARGET_URL)
with open('/content/drive/MyDrive/Colab Notebooks/dev/web_scraping_crownstrategy/detail_url_all.json', 'r') as f:
    # 5401 is the resume offset; the scraping cell initialises its counter to
    # the same value.
    detail_url_all = json.load(f)[5401:]

In [None]:
# 詳細ページにて各情報を取得
#クリニック名/施術名/ドクター名/症例画像取得
def get_text_by_selector(selector):
    """Return the text of the first element matching a CSS selector.

    Uses the module-level ``driver``. Returns an empty string when no element
    matches (``NoSuchElementException``).
    """
    try:
        return driver.find_element(By.CSS_SELECTOR, selector).text
    except NoSuchElementException:
        return ""

def get_text_by_selector_index(selector, index):
    """Return the text of the ``index``-th element matching a CSS selector.

    Uses the module-level ``driver``. Returns an empty string when ``index`` is
    out of range for the matches or the lookup raises
    ``NoSuchElementException``.
    """
    try:
        matches = driver.find_elements(By.CSS_SELECTOR, selector)
        return matches[index].text
    except (IndexError, NoSuchElementException):
        return ""

def separate_dict_column(result_dict, column_name):
  """Expand a list-valued column into numbered scalar columns, in place.

  ``result_dict[column_name]`` holds one sequence per row. It is replaced by
  the columns ``f'{column_name}1'`` .. ``f'{column_name}N'``, where N is the
  length of the longest sequence; rows with fewer items are padded with ``''``.

  Args:
    result_dict: Mapping of column name to a list of row values; mutated.
    column_name: Key of the list-valued column to expand.

  Returns:
    None. When the column has no rows, or every row is empty, the column is
    simply removed and no numbered columns are added.

  Raises:
    KeyError: if ``column_name`` is not a key of ``result_dict``.
  """
  rows = result_dict[column_name]

  # Length of the longest row; default=0 keeps max() from raising ValueError
  # when there are no rows at all.
  max_column_length = max((len(items) for items in rows), default=0)

  # One new column per position, padding short rows with ''.
  new_columns = {
      f'{column_name}{i+1}': [items[i] if i < len(items) else '' for items in rows]
      for i in range(max_column_length)
  }

  # Swap the list-valued column for the expanded ones.
  del result_dict[column_name]
  result_dict.update(new_columns)

def save_file_from_dict(result_dict, submit_file_path):
  """Write the collected records to an Excel file.

  Works on a deep copy, so ``result_dict`` itself is left untouched. The
  list-valued columns '施術' and '症例画像' are expanded into numbered columns,
  rows with a duplicate 'URL' are dropped (first occurrence kept), rows whose
  'クリニック名' is an empty string are removed, and the result is written to
  ``submit_file_path`` with a header row and without the index.
  """
  expanded = deepcopy(result_dict)
  for list_column in ('施術', '症例画像'):
    separate_dict_column(expanded, list_column)

  # Build the frame and de-duplicate on URL.
  deduplicated = pd.DataFrame(expanded).drop_duplicates(subset='URL', keep='first')

  # Keep only rows that have a clinic name.
  has_clinic_name = deduplicated['クリニック名'] != ''
  output_frame = deduplicated[has_clinic_name]

  # Write the deliverable.
  output_frame.to_excel(submit_file_path, header=True, index=False)


# Visit every detail page, extract its fields into result_dict, and write an
# intermediate Excel file each time the counter reaches a multiple of 100.

# Progress counter start. It equals the offset used to slice detail_url_all
# when resuming from the saved JSON list.
# NOTE(review): if detail_url_all holds the full, unsliced list this offset
# should presumably be 0 — confirm.
start_cnt = 5401
cnt = start_cnt
# Progress denominator on the same scale as cnt (offset + remaining URLs).
# Without the offset the log showed e.g. "7319/3585".
total_cnt = start_cnt + len(detail_url_all)

for detail_url in detail_url_all:
  try:
    driver.get(detail_url)
    logger.debug(f'詳細ページ{detail_url}へ移動しました')
  except Exception as e:
    # Interpolate the exception; the message previously contained the literal
    # text "[e]".
    logger.error(f'詳細ページ{detail_url}へ移動出来ませんでした: [{e}]')
    continue

  try:
    # Procedures (tag links)
    a_elements = driver.find_elements(By.CSS_SELECTOR, "a.py-1\\.5.px-3.rounded-full.text-primary-high.text-text-tag.font-bold.border.bg-background-main.border-primary-high")
    medical_procedures = [a_element.text for a_element in a_elements]
    # Clinic name / clinic address / doctor name / menu name / cost / side effects and risks
    clinic_name = get_text_by_selector("div.ClinicAndDoctorLink__ClinicName-sc-1s8w7y6-2.ItnDt")
    clinic_address = get_text_by_selector_index("div.ClinicSection__ClinicOneInfo-sc-xpwsv2-5.kQoWQT", 1)
    doctor_name = get_text_by_selector("div.ClinicAndDoctorLink__DoctorName-sc-1s8w7y6-5.knTAKt")
    menu = get_text_by_selector("span.CaseReportPage__CaseReportTitle-sc-1w0k5lc-1.cHMnmm")
    cost = get_text_by_selector_index("div.Treatment__SectionContent-sc-14kahvy-3.dQGJTE", 1)
    side_effect = get_text_by_selector_index("div.Treatment__SectionContent-sc-14kahvy-3.dQGJTE", 2)
    # Patient attributes: the part after the first '/' of the subtitle.
    # A subtitle without '/' yields an empty value instead of an IndexError
    # that would discard the whole record.
    subtitle_parts = get_text_by_selector("span.CaseReportPage__CaseReportSubTitle-sc-1w0k5lc-2.jYMLSR").split('/')
    if len(subtitle_parts) > 1:
      patient_characteristics = subtitle_parts[1].strip()
    else:
      patient_characteristics = ""
    # Case images live on a separate "/images" page.
    detail_img_url = f'{detail_url}/images'
    img_url_all = []
    try:
      driver.get(detail_img_url)
      logger.debug(f'詳細ページ{detail_img_url}へ移動しました')
    except Exception as e:
      logger.error(f'詳細ページ{detail_img_url}へ移動出来ませんでした: [{e}]')
      # The record is skipped when its image page cannot be opened.
      continue
    div_elements = driver.find_elements(By.CSS_SELECTOR, "div.CaseReportImagePage__Flex-sc-3j6mr5-3.bQVarB")
    for div_element in div_elements:
      img_elements = div_element.find_elements(By.CSS_SELECTOR, "img.CaseReportImagePage__Photo-sc-3j6mr5-6.jXGOqB")
      img_urls = [img_element.get_attribute('src') for img_element in img_elements]
      img_url_all.extend(img_urls)

    # Store the record; every column receives exactly one value per page.
    result_dict['施術'].append(medical_procedures)
    result_dict['クリニック名'].append(clinic_name)
    result_dict['クリニック住所'].append(clinic_address)
    result_dict['URL'].append(detail_url)
    result_dict['ドクター名'].append(doctor_name)
    result_dict['メニュー名'].append(menu)
    result_dict['費用'].append(cost)
    result_dict['副作用・リスク'].append(side_effect)
    result_dict['患者属性'].append(patient_characteristics)
    result_dict['症例画像'].append(img_url_all)

    cnt += 1
    logger.debug(f'詳細ページ{detail_url}の取得完了 ■進捗: {cnt}/{total_cnt}' )

    # Intermediate deliverable every 100 successfully scraped pages.
    if cnt % 100 == 0:
      submit_file_path =f'/content/drive/MyDrive/Colab Notebooks/dev/web_scraping_crownstrategy/accumulated_list_tribeau_eye_until_{cnt}.xlsx'
      save_file_from_dict(result_dict, submit_file_path)
  except Exception as e:
    logger.error(f'詳細ページ{detail_url}の取得に失敗しました: [{e}]')
    continue

# Final deliverable.
save_file_from_dict(result_dict, SUBMIT_FILE_PATH)

[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
2023-10-21 13:06:57,499 [DEBUG] main: 詳細ページhttps://tribeau.jp/case_reports/6478/imagesへ移動しました
2023-10-21 13:06:57,567 [DEBUG] main: 詳細ページhttps://tribeau.jp/case_reports/6478の取得完了 ■進捗: 7319/3585
2023-10-21 13:06:59,046 [DEBUG] main: 詳細ページhttps://tribeau.jp/case_reports/6475へ移動しました
2023-10-21 13:06:59,750 [DEBUG] main: 詳細ページhttps://tribeau.jp/case_reports/6475/imagesへ移動しました
2023-10-21 13:07:00,076 [DEBUG] main: 詳細ページhttps://tribeau.jp/case_reports/6475の取得完了 ■進捗: 7320/3585
2023-10-21 13:07:01,193 [DEBUG] main: 詳細ページhttps://tribeau.jp/case_reports/6471へ移動しました
2023-10-21 13:07:02,114 [DEBUG] main: 詳細ページhttps://tribeau.jp/case_reports/6471/imagesへ移動しました
2023-10-21 13:07:02,169 [DEBUG] main: 詳細ページhttps://tribeau.jp/case_reports/6471の取得完了 ■進捗: 7321/3585
2023-10-21 13:07:03,483 [DEBUG] main: 詳細ページhttps://tribeau.jp/case_reports/6470へ移動しました
2023-10-21 13:07:04,858 [DEBUG] main: 詳細ページhttps://tribeau.jp/case_reports/6470/imagesへ移動しました
2023-10-21 13:07:

In [None]:
# エクセルファイル形式から読み込み
# import pandas as pd
# df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/dev/web_scraping_crownstrategy/accumulated_list_tribeau_eye.xlsx')
# # NaNをハイフン「-」で置換
# df_filled = df.fillna('-')

In [None]:
# エクセルファイルとして保存
# df_filled.to_excel('/content/drive/MyDrive/Colab Notebooks/dev/web_scraping_crownstrategy/accumulated_list_tribeau_eye_filled.xlsx', header=True, index=False)