In [4]:
# Use the %pip magic so the package is installed into the environment of the running kernel.
%pip install webdriver_manager



In [5]:
import time
import os
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import pandas as pd
import datetime
import re
from openpyxl.utils import ILLEGAL_CHARACTERS_RE

def main(urls):
    """Scrape kakaku.com reviews for the given model IDs and save them to Excel.

    For every ID the page ``https://review.kakaku.com/review/<id>#tab`` is
    opened, the reviews on it are collected with ``get_review``, and the
    element with class ``arrowNext01`` is clicked repeatedly to walk through
    the following pages until that element can no longer be found.

    All rows collected for ``urls`` are written to a single workbook named
    ``kakakucom_reviews_<YYYYmmdd_HHMMSS>.xlsx`` inside ``../Output``
    (relative to the current working directory; created if missing).

    Args:
        urls: Iterable of model IDs (for example ``"M0000001024"``).

    Returns:
        None. The result is the Excel file written to disk.
    """
    # Imported locally so this function is self-contained; selenium is already
    # a dependency of this file.
    from selenium.common.exceptions import WebDriverException

    # Prepare the driver.
    chrome_options = webdriver.ChromeOptions()
    # An older Chrome under "Program Files (x86)" was being picked up and
    # rejected as an unsupported version, so point at the newer installation.
    # Only do so when that file exists; otherwise let Selenium locate Chrome.
    chrome_binary = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
    if os.path.exists(chrome_binary):
        chrome_options.binary_location = chrome_binary
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    all_reviews = []
    try:
        for url in urls:

            # Open the review page of this model.
            print("スクレイピング開始: ", url)
            # str() guards against IDs that were not read as strings.
            driver.get("https://review.kakaku.com/review/" + str(url) + "#tab")

            # Model name (empty string when the title element is absent).
            try:
                model_name = driver.find_element(By.CLASS_NAME, 'p-main_title').find_element(By.TAG_NAME, 'h2').text
            except NoSuchElementException:
                model_name = ""
            print("モデル: ", model_name)

            # Reviews on the first page.
            all_reviews.extend(get_review(driver, model_name))

            # Paging: keep clicking "next" until it is gone.
            while True:
                try:
                    load_more_button = driver.find_element(By.CLASS_NAME, 'arrowNext01')
                    load_more_button.click()

                    # Give the next page time to load.
                    time.sleep(2)

                    all_reviews.extend(get_review(driver, model_name))

                except NoSuchElementException:
                    # No "next" element: the last page has been processed.
                    print("スクレイピング完了")
                    break
                except TimeoutException:
                    print("タイムアウト発生")
                    break
                except WebDriverException as exc:
                    # Any other Selenium failure while paging (e.g. a stale or
                    # obscured element) ends paging for this model only, so the
                    # reviews collected so far are still saved below.
                    print("ページング中にエラー発生: ", type(exc).__name__)
                    break

    finally:
        driver.quit()

    # Strip characters that openpyxl refuses to write.
    df = pd.DataFrame(all_reviews)
    for column in df.select_dtypes(include=['object']).columns:
        df[column] = df[column].apply(remove_illegal_characters)

    # Output directory: the "Output" folder one level above the working directory.
    output_dir = os.path.join(os.getcwd(), '..', 'Output')
    os.makedirs(output_dir, exist_ok=True)

    # Timestamped file name.
    now = datetime.datetime.now()
    file_name = os.path.join(output_dir, 'kakakucom_reviews_{}.xlsx'.format(now.strftime('%Y%m%d_%H%M%S')))

    # Save.
    df.to_excel(file_name, index=False)
    print("Saved to", file_name)
    

def _find_text(parent, by, value):
    """Return the text of the first matching element under ``parent``, or "" if there is none."""
    try:
        return parent.find_element(by, value).text
    except NoSuchElementException:
        return ""


def get_review(driver, model_name):
    """Extract every review shown on the page currently loaded in ``driver``.

    Each element with class ``reviewBox`` becomes one dict. Fields whose
    element is missing are stored as empty strings.

    Args:
        driver: Selenium WebDriver positioned on a review page.
        model_name: Value stored under ``"model_name"`` in every row.

    Returns:
        A list of dicts with the keys ``model_name``, ``model_detail``,
        ``breadcrumbs``, ``url``, ``user_name``, ``entry_date`` and
        ``review_content``, plus one key per rating row (the row's ``th`` text
        mapped to its ``td`` text).
    """
    reviews = []

    # The page does not change while its reviews are read, so ask the browser
    # for the URL once instead of once per review.
    page_url = driver.current_url

    for elem in driver.find_elements(By.CLASS_NAME, 'reviewBox'):

        # Full breadcrumb text.
        breadcrumbs = _find_text(elem, By.CLASS_NAME, 'breadcrumbs')

        # Last breadcrumb item: prefer the last link; when the breadcrumb has
        # no links, fall back to the last ' > '-separated part of its text.
        breadcrumb_links = elem.find_elements(By.XPATH, './/p[@class="breadcrumbs"]//a')
        if breadcrumb_links:
            breadcrumbs_last = breadcrumb_links[-1].text
        else:
            breadcrumbs_last = _find_text(elem, By.XPATH, './/p[@class="breadcrumbs"]').split(' > ')[-1]

        user_name = _find_text(elem, By.CLASS_NAME, 'userName')
        entry_date = _find_text(elem, By.CLASS_NAME, 'entryDate')
        review_content = _find_text(elem, By.CLASS_NAME, 'revEntryCont')

        # Ratings: one table row per category.
        ratings = {}
        rating_rows = elem.find_elements(By.XPATH, './/div[@class="revRateBox type2"]//table//tr')
        for row in rating_rows:
            try:
                rating_category = row.find_element(By.TAG_NAME, 'th').text
                rating_value = row.find_element(By.TAG_NAME, 'td').text
            except NoSuchElementException:
                # A row without a th/td pair is skipped on its own, so the
                # remaining rating rows of this review are still collected.
                print("error")
                continue
            ratings[rating_category] = rating_value

        reviews.append({
            "model_name": model_name,
            "model_detail": breadcrumbs_last,
            "breadcrumbs": breadcrumbs,
            "url": page_url,
            "user_name": user_name,
            "entry_date": entry_date,
            "review_content": review_content,
            **ratings
        })

    return reviews

def chunk_list(lst, chunk_size):
    """Yield consecutive slices of ``lst`` holding at most ``chunk_size`` items each.

    Used to split the work into batches. The final slice may be shorter than
    ``chunk_size``.
    """
    starts = range(0, len(lst), chunk_size)
    for start in starts:
        stop = start + chunk_size
        yield lst[start:stop]

def remove_illegal_characters(text):
    """Strip characters matched by ``ILLEGAL_CHARACTERS_RE`` from a string.

    Used to avoid writing illegal characters to the output file. Values that
    are not ``str`` are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    cleaned = ILLEGAL_CHARACTERS_RE.sub("", text)
    return cleaned

# Load the device list; the model IDs to scrape are in its 'model_id' column.
url_list = pd.read_excel('../Output/kakakucom_device_lists_20240602_010957.xlsx', index_col=0)
urls = url_list['model_id'].values.tolist()

# Process the IDs in chunks of 10; each chunk is passed to main() on its own.
url_chunks = chunk_list(urls, 10)
for i, url_chunk in enumerate(url_chunks):

    # Temporary measure: chunks with an index below 7 are skipped.
    if i >= 7:
        print(f'{i}回目のファイル出力処理開始')
        main(url_chunk)
    
# urls = [
#     # WRITE ME
#     "M0000001024"
# ]
# main(urls)


7回目のファイル出力処理開始
スクレイピング開始:  M0000000944
モデル:  Redmi Note 10T
スクレイピング完了
スクレイピング開始:  M0000000953
モデル:  Redmi Note 11 Pro 5G


KeyboardInterrupt: 