In [9]:
!pip install webdriver_manager

Collecting webdriver_manager
  Downloading webdriver_manager-4.0.1-py2.py3-none-any.whl (27 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, webdriver_manager
Successfully installed python-dotenv-1.0.0 webdriver_manager-4.0.1


In [3]:
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
import pandas as pd
import datetime
import spacy
from collections import Counter

def main(urls, max_clicks=9):
    """Scrape audience reviews for each movie slug and export them to Excel.

    For every slug in ``urls`` the Rotten Tomatoes user-review page is opened
    in Chrome, the "Load More" button is clicked up to ``max_clicks`` times,
    and the reviews on the page are collected with ``get_inf``. Two
    timestamped workbooks are then written to the current directory:

    * ``rotten_tomatoes_reviews_<timestamp>.xlsx``: one row per review.
    * ``rotten_tomatoes_reviews_grouping_<timestamp>.xlsx``: one row per page
      URL, with that page's reviews joined by commas and summarised by
      ``extract_frequent_nouns_adjs``.

    Args:
        urls: Iterable of movie slugs (the URL part that follows ``/m/``).
        max_clicks: Maximum number of "Load More" clicks per movie page.

    Returns:
        None. When no review was collected at all, no file is written.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    all_reviews = []
    try:
        for url in urls:
            print("Start scraping from", url)
            # f-string instead of "+" so a slug that pandas parsed as a number
            # (e.g. 300) does not raise TypeError.
            driver.get(f"https://www.rottentomatoes.com/m/{url}/reviews?type=user")
            time.sleep(1)

            # Paging: keep clicking "Load More" until the limit is reached or
            # the button stops being clickable.
            for _ in range(max_clicks):
                try:
                    load_more_button = WebDriverWait(driver, 20).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, '[data-qa="load-more-btn"]'))
                    )
                    load_more_button.click()
                    time.sleep(2)  # wait for the additional reviews to load
                except TimeoutException:
                    print("Load More ボタンが見つからないか、クリックできません。")
                    break
                except NoSuchElementException:
                    print("ページの最後に到達しました。")
                    break

            # Collect everything that is now rendered on the page.
            all_reviews.extend(get_inf(driver))

    finally:
        # Always release the browser, even if scraping failed part-way.
        driver.quit()

    # An empty DataFrame has no 'url' column, so the groupby below would raise
    # KeyError and abort the caller's remaining batches.
    if not all_reviews:
        print("No reviews were collected; skipping file output.")
        return

    df = pd.DataFrame(all_reviews)
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    file_name = 'rotten_tomatoes_reviews_{}.xlsx'.format(timestamp)
    df.to_excel(file_name, index=False)
    print("Saved to", file_name)

    # One row per page URL: join that page's reviews into a single text and
    # replace it with the frequent-word summary.
    df_gp = (
        df.groupby('url')['review']
        .agg(','.join)
        .reset_index()
        .rename(columns={"review": "string"})
    )
    df_gp['string'] = df_gp['string'].apply(extract_frequent_nouns_adjs)
    file_name_gp = 'rotten_tomatoes_reviews_grouping_{}.xlsx'.format(timestamp)
    df_gp.to_excel(file_name_gp, index=False)
    print("Saved to", file_name_gp)

def get_inf(driver):
    """Collect the audience reviews rendered on the driver's current page.

    Args:
        driver: Selenium WebDriver already positioned on a review page.

    Returns:
        list[dict]: One dict per review with the keys ``"url"`` (the page's
        current URL), ``"reviewer"``, ``"evaluation"`` (filled stars plus 0.5
        per half star, as a float) and ``"review"``. The list is empty when
        the review table is not found; a row that lacks an expected element
        is skipped after printing a message.
    """
    reviews = []
    try:
        review_table = driver.find_element(By.CSS_SELECTOR, ".review_table")
    except NoSuchElementException as e:
        print("レビューテーブルが見つかりませんでした:", e)
        return reviews

    # Read the URL once: every access is a round-trip to the browser and the
    # value is the same for all rows of this page.
    page_url = driver.current_url

    for row in review_table.find_elements(By.CSS_SELECTOR, '.audience-review-row'):
        try:
            reviewer = row.find_element(By.CSS_SELECTOR, ".audience-reviews__name-wrap").text
            filled_stars = row.find_elements(By.CSS_SELECTOR, ".star-display__filled")
            half_stars = row.find_elements(By.CSS_SELECTOR, ".star-display__half")
            review_text = row.find_element(By.CSS_SELECTOR, "p.audience-reviews__review.js-review-text").text
        except NoSuchElementException as e:
            # Skip only this row; the remaining rows are still collected.
            print("要素が見つかりませんでした:", e)
            continue

        reviews.append({
            "url": page_url,
            "reviewer": reviewer,
            "evaluation": len(filled_stars) + 0.5 * len(half_stars),
            "review": review_text,
        })

    return reviews

# Loaded spaCy pipelines keyed by model name, so each model is loaded only
# once per kernel instead of once per call.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for ``model_name``, loading it on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def extract_frequent_nouns_adjs(text, max_words=20, exclude_words=None):
    """Summarise the repeated nouns and adjectives of ``text``.

    Args:
        text: English text to analyse.
        max_words: Maximum number of words to report.
        exclude_words: Words to leave out (exact, case-sensitive match on the
            token text). Defaults to ``["movie", "film", "films"]``.

    Returns:
        str: ``"word: count"`` pairs joined by ``", "`` for every noun or
        adjective that occurs at least twice, most frequent first (ties keep
        first-seen order), truncated to ``max_words`` entries. Empty string
        when no word repeats.
    """
    if exclude_words is None:
        excluded = {"movie", "film", "films"}  # default exclusion list
    else:
        excluded = set(exclude_words)

    doc = _get_spacy_pipeline()(text)

    # Count nouns and adjectives together.
    word_freq = Counter(
        token.text
        for token in doc
        if token.pos_ in ("NOUN", "ADJ") and token.text not in excluded
    )

    # Keep words seen more than once. Ranking by count before truncating
    # makes the max_words cap keep the most frequent words rather than the
    # ones that merely appeared first in the text.
    repeated = [(word, freq) for word, freq in word_freq.most_common() if freq > 1]

    return ', '.join(f"{word}: {freq}" for word, freq in repeated[:max_words])

def chunk_list(lst, chunk_size):
    """Lazily yield consecutive slices of ``lst``, each at most ``chunk_size`` long.

    The final slice is shorter when ``len(lst)`` is not a multiple of
    ``chunk_size``; an empty ``lst`` yields nothing.
    """
    offsets = range(0, len(lst), chunk_size)
    yield from (lst[start:start + chunk_size] for start in offsets)

if __name__ == "__main__":
    # Movie slugs to scrape, taken from the CSV's 'url' column.
    url_list = pd.read_csv('./metadata_for_import_all.csv')
    urls = url_list['url'].values.tolist()

    # Process 30 slugs per main() call: each call starts its own browser and
    # writes its own pair of output workbooks.
    # Batches are counted from 1 so the progress message reads naturally.
    for batch_no, url_chunk in enumerate(chunk_list(urls, 30), start=1):
        print(f'{batch_no}回目のファイル出力処理開始')
        main(url_chunk)

    # For a quick single-title check, skip the CSV and call: main(["300"])


Start scraping from 300
Saved to rotten_tomatoes_reviews_20240104_134817.xlsx
Saved to rotten_tomatoes_reviews_grouping_20240104_134817.xlsx


In [57]:
# Load the 50-title sample file and show its slugs; the last expression is
# displayed as the cell output.
df_test = pd.read_csv('./metadata_for_import_first50.csv')
df_test['url'].to_numpy().tolist()

['star_wars_episode_iv_a_new_hope',
 'american_beauty',
 'pirates_of_the_caribbean_the_curse_of_the_black_pearl',
 '2001_a_space_odyssey',
 'leon_the_professional',
 '48_hrs',
 'edward_scissorhands',
 '28_days_later',
 'saw',
 'minority_report',
 'clockwork_orange',
 'saw_ii',
 'stand_by_me_1986',
 '1046060-high_noon',
 'terminator_3_rise_of_the_machines',
 'once_upon_a_time_in_the_west',
 'batman_returns',
 'the_good_the_bad_and_the_ugly',
 'day_after_tomorrow',
 'psycho',
 'die_hard',
 'titanic',
 'men_in_black_ii',
 'final_fantasy_vii_advent_children',
 'harry_potter_and_the_goblet_of_fire',
 'pulp_fiction',
 'sixth_sense',
 'finding_neverland',
 'cars',
 '1005339-dawn_of_the_dead',
 'galaxy_quest',
 'madagascar',
 'die_hard_2_1990',
 'school_of_rock',
 'sister_act',
 'while_you_were_sleeping',
 'american_pie',
 'perfect_storm',
 'bourne_identity',
 'bourne_supremacy',
 'woman_in_red',
 'the_orphanage',
 'pirates_of_the_caribbean_dead_mans_chest',
 'war_of_the_worlds',
 'tron',
 'ba

In [2]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting spacy
  Downloading spacy-3.7.2-cp310-cp310-win_amd64.whl (12.1 MB)
     ---------------------------------------- 12.1/12.1 MB 4.1 MB/s eta 0:00:00
Collecting srsly<3.0.0,>=2.4.3
  Downloading srsly-2.4.8-cp310-cp310-win_amd64.whl (481 kB)
     -------------------------------------- 481.9/481.9 kB 3.4 MB/s eta 0:00:00
Collecting thinc<8.3.0,>=8.1.8
  Downloading thinc-8.2.2-cp310-cp310-win_amd64.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 3.8 MB/s eta 0:00:00
Collecting wasabi<1.2.0,>=0.9.1
  Downloading wasabi-1.1.2-py3-none-any.whl (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
     -------------------------------------- 181.6/181.6 kB 1.4 MB/s eta 0:00:00
Collecting pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4
  Downloading pydantic-2.5.3-py3-none-any.whl (381 kB)
     ----------------------------------