In [1]:
import time
import json
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Base address of the site being crawled; relative links are appended to it.
host = 'https://www.imdb.com'

# Search-listing URL template; the two `{}` slots fill the `genres` and `start` query parameters.
root_url = '{}/search/title/?languages=en&title_type=feature&genres={{}}&start={{}}'.format(host)

# Browser-like request headers sent with every plain HTTP request.
headers = {
    'User-Agent': ' '.join((
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3)',
        'AppleWebKit/537.36 (KHTML, like Gecko)',
        'Chrome/72.0.3626.109',
        'Safari/537.36',
    )),
}

In [3]:
# Load the previously crawled tables; the first CSV column is used as the row index.
movies_df = pd.read_csv(
    'data/movies.csv',
    index_col=0,
)
reviews_df = pd.read_csv(
    'data/reviews.csv',
    index_col=0,
)

# Last expression of the cell, so the notebook renders the reviews table.
reviews_df

Unnamed: 0,rating,title,url,review_id,user_id,date,spoiler,other,comment,movie_id
0,4/10,The franchise has had a lobotomy,/review/rw4751363/?ref_=tt_urv,rw4751363,ur101048846,30 March 2019,Warning: Spoilers,,Dinosaurs. Amusement Park. Tourists. Disaster....,tt0369610
1,2/10,There is a plus...,/review/rw3844587/?ref_=tt_urv,rw3844587,ur22419131,29 October 2017,,,... and it's dinosaurs. Absolutely everything ...,tt0369610
2,7/10,"Spielberg Magic, This Is Not. Still, a Visit t...",/review/rw4200129/?ref_=tt_urv,rw4200129,ur35359466,12 June 2018,Warning: Spoilers,,You may have heard some critics champion Juras...,tt0369610
3,3/10,Another piece of modern trash.,/review/rw3846832/?ref_=tt_urv,rw3846832,ur9357474,1 November 2017,Warning: Spoilers,,"There is a young, handsome (I suspect), super ...",tt0369610
4,3/10,Bleah,/review/rw3387151/?ref_=tt_urv,rw3387151,ur33389853,3 January 2016,,,"A dull monster movie without ideas, with all t...",tt0369610
...,...,...,...,...,...,...,...,...,...,...
482077,1/10,Low budget junk,/review/rw5063745/?ref_=tt_urv,rw5063745,ur38716160,16 August 2019,Warning: Spoilers,,That's the best words I can find to describe i...,tt9904820
482078,2/10,its a surviving story if in the right hands an...,/review/rw4823075/?ref_=tt_urv,rw4823075,ur79950921,2 May 2019,,,"This is a c-level horror flick, and like most ...",tt9904820
482079,,It don't add up,/review/rw4842487/?ref_=tt_urv,rw4842487,ur26490810,11 May 2019,,,"Look, I ain't seen this movie, neither I will,...",tt9904820
482080,10/10,Loved this movie!,/review/rw5817238/?ref_=tt_urv,rw5817238,ur119650852,12 June 2020,,,Very intense social drama. Realistic character...,tt9913660


In [4]:
def calculate_review_crawled_rate(review_id, review_count):
    """Return the percentage of a movie's reviews that have been crawled.

    Parameters
    ----------
    review_id : int or str
        Number of reviews already crawled for the movie (the caller passes
        the per-movie count of the ``review_id`` column).
    review_count : int, str, None or NaN
        Total number of reviews reported for the movie.

    Returns
    -------
    int or float
        ``0`` when the total is missing (``None``/NaN), blank or zero;
        otherwise ``crawled / total * 100``.

    Raises
    ------
    ValueError
        If either value is text that cannot be parsed as an integer.
    """
    # Missing totals are treated like "no reviews" instead of crashing in int().
    if review_count is None or pd.isna(review_count):
        return 0

    if isinstance(review_count, str):
        review_count = review_count.strip()
        if review_count == '':
            return 0

    total = int(review_count)
    if total == 0:
        # Avoid dividing by zero for movies that report no reviews.
        return 0

    return int(review_id) / total * 100

# Keep one row per movie and one row per review so the counts below are not inflated.
# Rebinding (instead of inplace=True) keeps the cell idempotent and chain-friendly.
movies_df = movies_df.drop_duplicates(['movie_id'])
reviews_df = reviews_df.drop_duplicates(['review_id'])

# Count the stored rows per movie; after count(), `review_id` holds the number of
# non-null review ids, i.e. the number of crawled reviews for that movie.
reviews_crawled = reviews_df.groupby('movie_id').count()

# NOTE(review): this is an inner merge, so movies without any crawled review do not
# appear in `reviews_count` at all -- confirm that they are handled elsewhere.
reviews_count = pd.merge(movies_df, reviews_crawled, on='movie_id')[[
    'movie_id', 'name', 'average_rating', 'metascore', 'critic_count', 'review_count', 'review_id', 'review_url'
]]

# Reduce the reported total to its digits: drop thousands separators, keep the first
# run of digits, and treat anything without digits as "0".
# - raw string: '\d' in a normal string literal is an invalid escape sequence
# - expand=False: get a Series back rather than a one-column DataFrame
reviews_count['review_count'] = (
    reviews_count['review_count']
    .str.replace(',', '', regex=False)
    .str.extract(r'(\d+)', expand=False)
    .fillna("0")
)

# Percentage of the reported reviews that are already stored.
reviews_count['review_crawled_rate'] = reviews_count.apply(
    lambda row: calculate_review_crawled_rate(row['review_id'], row['review_count']),
    axis=1
)

# Movies below 99% coverage still need crawling; least complete first.
movies_with_incomplete_reviews = (
    reviews_count[reviews_count['review_crawled_rate'] < 99]
    .sort_values('review_crawled_rate')
)
movies_with_incomplete_reviews

Unnamed: 0,movie_id,name,average_rating,metascore,critic_count,review_count,review_id,review_url,review_crawled_rate
1897,tt10887282,Kasanova,,,,0,1,/title/tt10887282/reviews,0.0
70,tt8972556,I Am Vengeance: Retaliation,3.7,,9 critic,7,6,/title/tt8972556/reviews,85.714286
950,tt6853934,Hammer,5.6,73.0,10 critic,7,6,/title/tt6853934/reviews,85.714286
1232,tt8636456,Bannister DollHouse,5.1,,7 critic,12,11,/title/tt8636456/reviews,91.666667
1132,tt7095476,Waiting for Anya,5.5,38.0,11 critic,13,12,/title/tt7095476/reviews,92.307692
695,tt8484012,Sword of Trust,6.3,70.0,44 critic,36,35,/title/tt8484012/reviews,97.222222
535,tt3246874,Wish Man,7.1,,6 critic,92,91,/title/tt3246874/reviews,98.913043


In [5]:
# comments are loaded via AJAX from page 2 onwards, so a web driver is needed to crawl them
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# Review crawling. The page parser is shared by the plain-HTTP crawl (first page)
# and the AJAX crawl (later pages).
def _path_segment(href, position=2):
    """Return the ``position``-th '/'-separated piece of ``href``, or "" if it is absent."""
    if not href:
        return ""
    pieces = href.split('/')
    return pieces[position] if len(pieces) > position else ""


def extract_reviews(movie_id, bs):
    """Extract the not-yet-stored reviews contained in one parsed review page.

    Parameters
    ----------
    movie_id : str
        Movie whose stored reviews (rows of the module-level ``reviews_df``)
        are used to skip reviews that were crawled before.
    bs : BeautifulSoup
        Parsed HTML holding zero or more ``div.review-container`` elements.

    Returns
    -------
    list of dict
        One dict per new review with the keys ``rating``, ``title``, ``url``,
        ``id``, ``user_id``, ``user_name``, ``user_url``, ``date``,
        ``spoiler``, ``other`` and ``comment``. Fields that are not present
        in the HTML stay as empty strings.
    """
    # Ids already stored for this movie. Membership must be tested against the column
    # VALUES: `value in series` looks at the index labels, so it never matched an id.
    known_review_ids = set(
        reviews_df.loc[reviews_df['movie_id'] == movie_id, 'review_id']
    )

    reviews = []
    for review_div in bs.find_all("div", class_="review-container"):
        # empty row; only the fields found in the HTML are filled in
        review = {
            "rating": "", "title": "", "url": "", "id": "", "user_id": "", "user_name": "",
            "user_url": "", "date": "", "spoiler": "", "other": "", "comment": ""
        }

        # raw matches for each piece of information
        review_rating = review_div.find_all("span", class_="rating-other-user-rating")
        review_link = review_div.find_all("a", class_="title")
        review_user = review_div.find_all("span", class_="display-name-link")
        review_date = review_div.find_all("span", class_="review-date")
        review_spoiler = review_div.find_all("span", class_="spoiler-warning")
        review_comment = review_div.select("div.content > div.text")

        if len(review_rating) > 0:
            review['rating'] = review_rating[0].text.strip()

        if len(review_link) > 0:
            review_href = review_link[0].get('href')
            review['title'] = review_link[0].text.strip()
            review['url'] = review_href
            # the id is the third path piece, e.g. /review/rw4751363/?ref_=tt_urv
            review['id'] = _path_segment(review_href)

            # skip reviews that are already stored
            if review['id'] in known_review_ids:
                continue

        if len(review_user) > 0:
            review_user_link = review_user[0].select("a")
            if len(review_user_link) > 0:
                user_href = review_user_link[0].get('href')
                review['user_name'] = review_user_link[0].text.strip()
                review['user_url'] = user_href
                review['user_id'] = _path_segment(user_href)

        if len(review_date) > 0:
            review['date'] = review_date[0].text.strip()

        if len(review_spoiler) > 0:
            review['spoiler'] = review_spoiler[0].text.strip()

        if len(review_comment) > 0:
            review['comment'] = review_comment[0].text.strip()

        reviews.append(review)

    return reviews


def _build_retry_strategy():
    """Create the HTTP retry policy with whichever keyword this urllib3 version accepts."""
    options = dict(total=10, status_forcelist=[429, 500, 502, 503, 504], backoff_factor=1)
    methods = ["HEAD", "GET", "OPTIONS"]
    try:
        return Retry(allowed_methods=methods, **options)
    except TypeError:
        # older urllib3 releases only know the previous keyword name
        return Retry(method_whitelist=methods, **options)


def _append_reviews_page(movie_id, reviews):
    """Append one page of reviews to the movie's JSON file as a single JSON line."""
    with open('data/reviews_' + movie_id + '.json', 'a+') as reviews_json:
        print(json.dumps(reviews), file=reviews_json, flush=True)


def _crawl_ajax_pages(driver, http, movie_id, review_url):
    """Fetch every "Load More" page of the open review page; return how many were fetched."""
    max_fetch_failures = 5    # consecutive failed AJAX requests tolerated before giving up
    max_stalled_rounds = 20   # clicks tolerated without the pagination key changing

    wait = WebDriverWait(driver, 10)
    pages_fetched = 0
    previous_key = ""
    fetch_failures = 0
    stalled_rounds = 0

    while True:
        try:
            # the "Load More" container carries the key of the next page
            element = driver.find_element(By.CLASS_NAME, "load-more-data")
        except NoSuchElementException:
            break

        key = element.get_attribute('data-key')
        if key is None:
            # no pagination key left: nothing more to load
            break

        if key != previous_key:
            load_more_review_ajax_url = review_url + "/_ajax?ref_=undefined&paginationKey=" + key
            try:
                response = http.get(load_more_review_ajax_url, headers=headers)
            except requests.exceptions.RequestException:
                fetch_failures += 1
                if fetch_failures >= max_fetch_failures:
                    # stop instead of retrying forever on a persistent failure
                    print('ConnectionError: giving up', end=" - ")
                    break
                print('ConnectionError: retry')
                time.sleep(0.5)
                continue

            fetch_failures = 0
            stalled_rounds = 0
            reviews = extract_reviews(movie_id, BeautifulSoup(response.text, 'html.parser'))
            _append_reviews_page(movie_id, reviews)
            pages_fetched += 1
            # remember the key so an unchanged button is not fetched a second time
            previous_key = key
        else:
            # same key as before: the previous click has not produced a new page yet
            stalled_rounds += 1
            if stalled_rounds >= max_stalled_rounds:
                break

        try:
            # click "Load More" so the page updates the key, then wait for the spinner to go
            driver.find_element(By.CSS_SELECTOR, "button#load-more-trigger").click()
            wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, ".ipl-load-more__load-indicator")))
        except Exception:
            # best effort: a missing/unclickable button or a timeout ends the pagination
            break

    return pages_fetched


def crawl_reviews(movie_id, review_url,
                  chromedriver_path='/home/kitlim/.wdm/drivers/chromedriver/linux64/80.0.3987.106/chromedriver'):
    """Crawl all review pages of one movie and append them to its JSON file.

    The first page is fetched over plain HTTP. If it shows any review, a Chrome
    web driver is opened to click through "Load More" and each further page is
    fetched through the AJAX endpoint. Every page is appended as one JSON line
    to ``data/reviews_<movie_id>.json``; the URL and page count are printed.

    Parameters
    ----------
    movie_id : str
        Id of the movie; used for the output file name and for skipping
        reviews that are already stored.
    review_url : str
        Review page path relative to the module-level ``host``.
    chromedriver_path : str, optional
        Location of the chromedriver executable. Defaults to the path that
        was previously hard-coded.

    Returns
    -------
    None
    """
    review_url = host + review_url
    print(review_url, end=" - ")

    page_count = 0

    # the session is closed on exit, even when a request fails
    with requests.Session() as http:
        adapter = HTTPAdapter(max_retries=_build_retry_strategy())
        http.mount("https://", adapter)
        http.mount("http://", adapter)

        # first page: plain HTTP request
        reviews_bs4 = BeautifulSoup(http.get(review_url, headers=headers).text, 'html.parser')
        reviews = extract_reviews(movie_id, reviews_bs4)
        _append_reviews_page(movie_id, reviews)

        # Paginate whenever the page shows reviews at all, not only when some of them
        # are new: the missing reviews may sit on a later page.
        if reviews_bs4.find("div", class_="review-container") is not None:
            page_count = 1

            # NOTE(review): passing the driver path positionally is the Selenium 3 calling
            # convention; newer releases expect a Service object -- confirm the installed version.
            driver = webdriver.Chrome(chromedriver_path)
            try:
                driver.get(review_url)
                page_count += _crawl_ajax_pages(driver, http, movie_id, review_url)
            finally:
                # always release the browser, even if the crawl raised
                driver.quit()

    print(page_count, "pages")

In [6]:
# Re-crawl every movie whose stored reviews are incomplete.
# 'records' is the full orient name; the abbreviated 'record' is rejected by newer pandas.
for movie in movies_with_incomplete_reviews.to_dict('records'):

    # crawl the reviews of the current movie
    crawl_reviews(movie['movie_id'], movie['review_url'])

    # Record the movie as done right away (append + flush), so the progress
    # file is up to date even if a later movie fails.
    with open('data/review_done.csv', 'a+') as review_done:
        print(movie['movie_id'], file=review_done, flush=True)
    

https://www.imdb.com/title/tt10887282/reviews - 1 pages
https://www.imdb.com/title/tt8972556/reviews - 1 pages
https://www.imdb.com/title/tt6853934/reviews - 1 pages
https://www.imdb.com/title/tt8636456/reviews - 1 pages
https://www.imdb.com/title/tt7095476/reviews - 1 pages
https://www.imdb.com/title/tt8484012/reviews - 2 pages
https://www.imdb.com/title/tt3246874/reviews - 4 pages
