In [0]:
import requests
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup
import time
import re
import io
from collections import Counter
from requests import Session
import os
from tqdm import tqdm
from datetime import datetime
import pandas as pd
from tqdm import tqdm

# Box Office Mojo Scraping

In [0]:
url_1 = 'https://www.boxofficemojo.com/year/'
year = [2017, 2018, 2019]
url_2 = '/?sort=grossToDate&grossesOption=totalGrosses'

In [0]:
movie_info = pd.DataFrame() # create an empty dataframe to store all info

for i in tqdm(range(0, len(year))):
    full_url = url_1 + str(year[i]) + url_2
    response = requests.get(full_url, headers={"User-Agent":"Mozilla/5.0"})
    soup = BeautifulSoup(response.content, "html.parser")  
    tag_tr = soup.find_all('tr')
    
    movie_link = [] # box office mojo movie link
    movie = [] # movie name
    gross_rev = [] # gross domestic revenue
    open_rev = [] # opening weekend revenue
    release_date = [] # release date
    for tr in tag_tr[1:]:
        # movie name
        movie_bin = tr.find('a', attrs = {'class': 'a-link-normal'}).text
        movie.append(movie_bin)
        
        # box office mojo url of movie
        movie_link_bin = 'https://www.boxofficemojo.com/' + tr.find('a', attrs = {'class': 'a-link-normal'})['href']
        movie_link.append(movie_link_bin)
        
        # gross revenue
        gross_bin = tr.find('td', attrs = \
                    {'class': 'a-text-right mojo-field-type-money mojo-sort-column mojo-estimatable'}).text
        if gross_bin == '-':
            gross_bin = None
        else:
            gross_bin = int(re.sub(r"[$,]", "", gross_bin))
        gross_rev.append(gross_bin)
        
        # opening weekend revenue
        open_bin = tr.find('td', attrs = {'class': 'a-text-right mojo-field-type-money'}).text
        if open_bin == '-':
            open_bin = None
        else:
            open_bin = int(re.sub(r"[$,]", "", open_bin))
        open_rev.append(open_bin)
        
        # release date
        release_bin = tr.find('td', attrs={'class': 'a-text-left mojo-field-type-date a-nowrap'}).text + \
        ' ' + str(year[i])
        release_date.append(release_bin)
        
    movie_info_bin = pd.DataFrame({'movie_title': movie, 'mojo_url': movie_link, 'gross_revenue': gross_rev,
                                       'open_weekend_revenue': open_rev, 'release_date': release_date})
    # format the release_date column
    movie_info_bin['release_date'] = pd.to_datetime(movie_info_bin['release_date']) 
    movie_info = pd.concat([movie_info, movie_info_bin], ignore_index=True)

100%|██████████| 3/3 [00:11<00:00,  3.81s/it]


In [0]:
movie_info.head()

Unnamed: 0,movie_title,mojo_url,gross_revenue,open_weekend_revenue,release_date
0,Star Wars: Episode VIII - The Last Jedi,https://www.boxofficemojo.com//release/rl27087...,620181382,220009584.0,2017-12-15
1,Beauty and the Beast,https://www.boxofficemojo.com//release/rl22259...,504014165,174750616.0,2017-03-17
2,Wonder Woman,https://www.boxofficemojo.com//release/rl57845...,412563408,103251471.0,2017-06-02
3,Jumanji: Welcome to the Jungle,https://www.boxofficemojo.com//release/rl30952...,404515480,36169328.0,2017-12-20
4,Guardians of the Galaxy Vol. 2,https://www.boxofficemojo.com//release/rl29760...,389813101,146510104.0,2017-05-05


In [0]:
movie_info.tail()

Unnamed: 0,movie_title,mojo_url,gross_revenue,open_weekend_revenue,release_date
2403,Game Day,https://www.boxofficemojo.com//release/rl24837...,1624,1624.0,2019-10-04
2404,American Playhouse: The Killing Floor,https://www.boxofficemojo.com//release/rl26615...,1583,1583.0,2019-11-22
2405,The Hours and Times,https://www.boxofficemojo.com//release/rl28359...,1273,1273.0,2019-03-01
2406,The Untold Story,https://www.boxofficemojo.com//release/rl62118...,790,790.0,2019-01-18
2407,Tall Tales from the Magical Garden of Antoon K...,https://www.boxofficemojo.com//release/rl17401...,220,,2019-01-11


In [0]:
movie_info_list=movie_info['mojo_url'].to_list()

In [0]:
movie_info_list[0:2]

['https://www.boxofficemojo.com//release/rl2708702721/?ref_=bo_yld_table_1',
 'https://www.boxofficemojo.com//release/rl222594561/?ref_=bo_yld_table_2']

In [0]:
rows=[]
for url in tqdm(movie_info_list):
    try:
        user_agent = {'User-agent': 'Mozilla/5.0'} 
        response = requests.get(url, headers = user_agent)
        soup = BeautifulSoup(response.content)

        #distributer
        wholename=soup.find("div", attrs= {"class": "a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile"}).div.text
        distributer=wholename[11:-30]

        #budget
        budget=soup.find_all("span", attrs= {"class": "money"})
        if len(budget)<5:
            budget = None
        else:
            budget=budget[4].text
            budget=int(re.sub(r"[$,]", "", budget))
        #genres
        a=soup.find("div",attrs={"class": "a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile"}).text
        if 'Genres' not in a:
            genres = None
        else:
            b=a.split("Genres")
            c=b[1].split("In Release")
            d=c[0]
            genres=''.join(c for c in d if c.isalpha())
            
        row=[url,distributer,budget,genres]
        rows.append(row)

    except:
        print(url)
        continue

100%|██████████| 2408/2408 [14:45<00:00,  2.72it/s]


In [0]:
movie_info2=pd.DataFrame(rows)
movie_info2.columns=['mojo_url','distributer','budget','genres']

In [0]:
movie_info2.head(10)

Unnamed: 0,mojo_url,distributer,budget,genres
0,https://www.boxofficemojo.com//release/rl27087...,Walt Disney Studios Motion Pictures,317000000.0,ActionAdventureFantasySciFi
1,https://www.boxofficemojo.com//release/rl22259...,Walt Disney Studios Motion Pictures,160000000.0,FamilyFantasyMusicalRomance
2,https://www.boxofficemojo.com//release/rl57845...,Warner Bros.,149000000.0,ActionAdventureFantasySciFiWar
3,https://www.boxofficemojo.com//release/rl30952...,Sony Pictures Releasing,90000000.0,ActionAdventureComedyFantasy
4,https://www.boxofficemojo.com//release/rl29760...,Walt Disney Studios Motion Pictures,200000000.0,ActionAdventureComedySciFi
5,https://www.boxofficemojo.com//release/rl86320...,Sony Pictures Releasing,175000000.0,ActionAdventureSciFi
6,https://www.boxofficemojo.com//release/rl34812...,Warner Bros.,35000000.0,Horror
7,https://www.boxofficemojo.com//release/rl29593...,Walt Disney Studios Motion Pictures,180000000.0,ActionAdventureComedyFantasySciFi
8,https://www.boxofficemojo.com//release/rl12265...,Universal Pictures,80000000.0,AdventureAnimationComedyFamilyFantasySciFi
9,https://www.boxofficemojo.com//release/rl11290...,Warner Bros.,,ActionAdventureFantasySciFi


In [0]:
movies_info=movie_info.merge(movie_info2, left_on='mojo_url', right_on='mojo_url', suffixes=(False, False))

In [0]:
movies_info.head()

Unnamed: 0,movie_title,mojo_url,gross_revenue,open_weekend_revenue,release_date,distributer,budget,genres
0,Star Wars: Episode VIII - The Last Jedi,https://www.boxofficemojo.com//release/rl27087...,620181382,220009584.0,2017-12-15,Walt Disney Studios Motion Pictures,317000000.0,ActionAdventureFantasySciFi
1,Beauty and the Beast,https://www.boxofficemojo.com//release/rl22259...,504014165,174750616.0,2017-03-17,Walt Disney Studios Motion Pictures,160000000.0,FamilyFantasyMusicalRomance
2,Wonder Woman,https://www.boxofficemojo.com//release/rl57845...,412563408,103251471.0,2017-06-02,Warner Bros.,149000000.0,ActionAdventureFantasySciFiWar
3,Jumanji: Welcome to the Jungle,https://www.boxofficemojo.com//release/rl30952...,404515480,36169328.0,2017-12-20,Sony Pictures Releasing,90000000.0,ActionAdventureComedyFantasy
4,Guardians of the Galaxy Vol. 2,https://www.boxofficemojo.com//release/rl29760...,389813101,146510104.0,2017-05-05,Walt Disney Studios Motion Pictures,200000000.0,ActionAdventureComedySciFi


In [0]:
movies_info.to_csv('all_scraped_movies.csv')

In [0]:
rotten_tomatoes_movies=movie_info['movie_title'].to_list()
rotten_tomatoes_movies

['Star Wars: Episode VIII - The Last Jedi',
 'Beauty and the Beast',
 'Wonder Woman',
 'Jumanji: Welcome to the Jungle',
 'Guardians of the Galaxy Vol. 2',
 'Spider-Man: Homecoming',
 'It',
 'Thor: Ragnarok',
 'Despicable Me 3',
 'Justice League',
 'Logan',
 'The Fate of the Furious',
 'Coco',
 'Dunkirk',
 'Get Out',
 'The Lego Batman Movie',
 'The Boss Baby',
 'The Greatest Showman',
 'Pirates of the Caribbean: Dead Men Tell No Tales',
 'Kong: Skull Island',
 'Cars 3',
 'War for the Planet of the Apes',
 'Split',
 'Wonder',
 'Transformers: The Last Knight',
 'Girls Trip',
 'Fifty Shades Darker',
 'Baby Driver',
 'Pitch Perfect 3',
 "Daddy's Home 2",
 'Murder on the Orient Express',
 'Annabelle: Creation',
 'Kingsman: The Golden Circle',
 'Blade Runner 2049',
 'John Wick: Chapter 2',
 'The Emoji Movie',
 'Power Rangers',
 'Ferdinand',
 'The Post',
 'The Mummy',
 "The Hitman's Bodyguard",
 'Alien: Covenant',
 'Captain Underpants: The First Epic Movie',
 'A Bad Moms Christmas',
 "A Dog's

# Rotten Tomatoes Scraping 

In [0]:
movie_info['release_date'] = pd.to_datetime(movie_info['release_date'])
# append a release year column for our convenience to match the year in the website later
movie_info['release_year'] = movie_info['release_date'].dt.year

In [0]:
def find_rt_release_year(soup):
    '''soup = review page beautiful soup object
    find the release year of a given webpage on rottem tomatoes.'''
    release_date_text = soup.find_all('span', class_='subtle alignSubTitle')[1].parent.text
    release_date = re.search(r'[a-zA-Z]+ [0-9]+, [0-9]{4}' ,release_date_text).group()
    rt_release_year = re.search(r'[0-9]{4}', release_date).group()
    return rt_release_year

In [0]:
punc = '[,\.!\'\?\:\(\)/;\+!#\*]'
no_match_list = []
rotten_tomatoes_reviews = []
error_list = []
index = 0
for movie in rotten_tomatoes_movies:
    print(movie)
    # remove punctuations above
    movie_search_str = re.sub(punc, '', movie)
    movie_search_str = re.sub(r'\-', ' ', movie_search_str)
    movie_search_str = movie_search_str.replace('&', 'and')
    # add underscore between words
    movie_search_str = re.sub(r'\s+', '_', movie_search_str)
    # remove accents
    movie_search_str = unidecode.unidecode(movie_search_str)
    movie_search_str = movie_search_str.lower()

    # release year in movie_info
    info_release_year = str(movie_info.loc[movie_info['movie_title']==movie, 'release_year'].values[0])
#     print('info release year:', info_release_year)
    
    # check if the movie found by the search_str matches the movie we're matching with the one in movie_info
    # by comparing if the release year is the same
    # then, define the correct url for the review page

    try:
        # raw search_url
        review_page = 'https://www.rottentomatoes.com/m/'+ movie_search_str + '/reviews'
        response = requests.get(review_page, headers={"User-Agent":"Mozilla/5.0"})
        soup = BeautifulSoup(response.content, "html.parser")
        rt_release_year = find_rt_release_year(soup)
        if rt_release_year != info_release_year:
            raise Exception("year doesn't match")
        else:
            review_page_url = review_page

    except:

        try:
            # search_url + release year
            str_with_year = movie_search_str + '_' + info_release_year
            new_review_page = 'https://www.rottentomatoes.com/m/'+ str_with_year + '/reviews'
            response = requests.get(new_review_page, headers={"User-Agent":"Mozilla/5.0"})
            soup = BeautifulSoup(response.content, "html.parser")
            rt_release_year = find_rt_release_year(soup)
        
            if rt_release_year != info_release_year:
                raise Exception("year doesn't match")
            else:
                review_page_url = new_review_page   
    
        except:

            try:
                # use webdriver to search for the movie
                driver = webdriver.Chrome('../../Downloads/chromedriver') 
                driver.get('https://www.rottentomatoes.com')
                time.sleep(1)
                search_form_text = driver.find_element_by_class_name('search-bar__text-input')
                search_form_text.send_keys(movie, Keys.ENTER)
                time.sleep(1)

                # if the following is executable, then the year is matched. Otherwise it goes into the exception clause
                year_text = '(' + info_release_year + ')'
                driver.find_element_by_xpath('//span[text()="' + year_text + '"]/preceding-sibling::a').click()
                review_page_url = driver.current_url + '/reviews'
                driver.close()

            except:
                try:
                    driver.close()
                except:
                    pass
                no_match_list.append(movie)


### the code above finds the url for the review page for each movie.
### below scrapes the information off the website
    try:
        print(review_page_url)
        response = requests.get(review_page_url, headers={"User-Agent":"Mozilla/5.0"})
        soup = BeautifulSoup(response.content, "html.parser")
        
        total_num_pages_text = soup.find('span', class_='pageInfo').text
        total_num_pages = total_num_pages_text[total_num_pages_text.find('of ')+3:]
        
        release_date_text = soup.find_all('span', class_='subtle alignSubTitle')[1].parent.text
        release_date = re.search(r'[a-zA-Z]+ [0-9]+, [0-9]{4}' ,release_date_text).group()
        
        for i in range(1, int(total_num_pages)+1):
            reviews_pages = review_page_url + '?type=&sort=&page=' + str(i)
            page = requests.get(reviews_pages, headers={"User-Agent":"Mozilla/5.0"})
            soup = BeautifulSoup(page.content, "html.parser")
            rows = soup.find_all('div', class_='row review_table_row')
            for row in rows:
                rotten_tomatoes_reviews.append({'movie_title': movie})
                rotten_tomatoes_reviews[index].update({'release_date_rt': release_date})

                #### critic name
                critic = row.find('a', class_='unstyled bold articleLink').text
#                 print(critic)
                rotten_tomatoes_reviews[index].update({'critic': critic})

                #### rotten or fresh
                if row.find('div', class_='review_icon icon small fresh'):
                    fresh_or_rotten = 'fresh'
                if row.find('div', class_='review_icon icon small rotten'):
                    fresh_or_rotten = 'rotten'
#                 print(fresh_or_rotten)
                rotten_tomatoes_reviews[index].update({'fresh_or_rotten': fresh_or_rotten})

                #### review
                review = row.find('div', class_='the_review').text
                # trim all the white spaces
                review = review.replace((' '*36), '').replace('\n', '').replace(' '*32, '')
#                 print(review)
                rotten_tomatoes_reviews[index].update({'review': review})

                #### review date
                review_date = row.find(class_='review-date subtle small').text
                review_date = review_date.replace(' '*32, '').replace(' '*28, '').replace('\n', '')
#                 print(review_date)
                rotten_tomatoes_reviews[index].update({'review_date': review_date})

                #### review score
                original_score = row.find('div', class_='small subtle review-link').text
                if 'Original Score:' in original_score:
                    score = original_score[original_score.find('Original Score:'):]
                    score = score[score.find(':')+2:]
                    score = score.replace(' '*36, '').replace(' '*32, '').replace('\n', '')
                else:
                    score = None
                rotten_tomatoes_reviews[index].update({'original_score': score})
#                 print(score)

                #### top critic?
                if row.find('span', class_='glyphicon glyphicon-star'):
                    top_critic = True
                else:
                    top_critic = False
                rotten_tomatoes_reviews[index].update({'top_critic': top_critic})
#                 print(top_critic)

                index +=1
                
        print('done:', movie)
        
    except:
        print('error at', movie)
        error_list.append(movie)

In [0]:
# fix the scraping for movies in the error list
punc = "[,\.\!'\?\:\(\)/;\+!#\*]"
rotten_tomatoes_reviews_2 = []
error_list_2 = []
index2 = 0
no_match_list = []
for movie in error_list:
    print(movie)
    # remove punctuations above
    movie_search_str = re.sub(punc, '', movie)
    movie_search_str = re.sub(r'\-', ' ', movie_search_str)
    movie_search_str = movie_search_str.replace('&', 'and')
    # add underscore between words
    movie_search_str = re.sub(r'\s+', '_', movie_search_str)
    # remove accents
    movie_search_str = unidecode.unidecode(movie_search_str)
    movie_search_str = movie_search_str.lower()

    # release year in movie_info
    info_release_year = str(movie_info.loc[movie_info['movie_title']==movie, 'release_year'].values[0])
#     print('info release year:', info_release_year)

    # check if the movie found by the search_str matches the movie we're matching with the one in movie_info
    # by comparing if the release year is the same
    # then, define the correct url for the review page

    try:
        review_page = 'https://www.rottentomatoes.com/m/'+ movie_search_str + '/reviews'
        response = requests.get(review_page, headers={"User-Agent":"Mozilla/5.0"})
        soup = BeautifulSoup(response.content, "html.parser")
        rt_release_year = find_rt_release_year(soup)
        if rt_release_year != info_release_year:
            raise Exception("year doesn't match")
        else:
            review_page_url = review_page

    except:

        try:
            str_with_year = movie_search_str + '_' + info_release_year
            new_review_page = 'https://www.rottentomatoes.com/m/'+ str_with_year + '/reviews'
            response = requests.get(new_review_page, headers={"User-Agent":"Mozilla/5.0"})
            soup = BeautifulSoup(response.content, "html.parser")
            rt_release_year = find_rt_release_year(soup)

            if rt_release_year != info_release_year:
                raise Exception("year doesn't match")
            else:
                review_page_url = new_review_page   

        except:

            try:
                driver = webdriver.Chrome('../../Downloads/chromedriver') 
                driver.get('https://www.rottentomatoes.com')
                time.sleep(1)
                search_form_text = driver.find_element_by_class_name('search-bar__text-input')
                search_form_text.send_keys(movie, Keys.ENTER)
                time.sleep(1)

                # if the following is executable, then the year is matched. Otherwise it goes into the exception clause
                year_text = '(' + info_release_year + ')'
                driver.find_element_by_xpath('//span[text()="' + year_text + '"]/preceding-sibling::a').click()
                time.sleep(1)
                review_page_url = driver.current_url + '/reviews'
                driver.close()

            except:
                try:
                    driver.close()
                except:
                    pass
                no_match_list.append(movie)
                review_page_url = None

    try:
        print(review_page_url)
        response = requests.get(review_page_url, headers={"User-Agent":"Mozilla/5.0"})
        soup = BeautifulSoup(response.content, "html.parser")
        try:
            release_date_text = soup.find_all('span', class_='subtle alignSubTitle')[1].parent.text
            release_date = re.search(r'[a-zA-Z]+ [0-9]+, [0-9]{4}' ,release_date_text).group()
        except:
            release_date = None
        ##### some movies don't have total page num since there is only 1 page
        try:
            total_num_pages_text = soup.find('span', class_='pageInfo').text
            total_num_pages = total_num_pages_text[total_num_pages_text.find('of ')+3:]
        except:
            total_num_pages = 1

        for i in range(1, int(total_num_pages)+1):
            reviews_pages = review_page_url + '?type=&sort=&page=' + str(i)
            page = requests.get(reviews_pages, headers={"User-Agent":"Mozilla/5.0"})
            soup = BeautifulSoup(page.content, "html.parser")
            rows = soup.find_all('div', class_='row review_table_row')
            for row in rows:
                rotten_tomatoes_reviews_2.append({'movie_title': movie})
                rotten_tomatoes_reviews_2[index2].update({'release_date_rt': release_date})

                #### critic name
                try:
                    critic = row.find('a', class_='unstyled bold articleLink').text
                except:
                    critic = None
#                 print(critic)
                rotten_tomatoes_reviews_2[index2].update({'critic': critic})

                #### rotten or fresh
                if row.find('div', class_='review_icon icon small fresh'):
                    fresh_or_rotten = 'fresh'
                if row.find('div', class_='review_icon icon small rotten'):
                    fresh_or_rotten = 'rotten'
#                 print(fresh_or_rotten)
                rotten_tomatoes_reviews_2[index2].update({'fresh_or_rotten': fresh_or_rotten})

                #### review
                review = row.find('div', class_='the_review').text
                # trim all the white spaces
                review = review.replace((' '*36), '').replace('\n', '').replace(' '*32, '')
#                 print(review)
                rotten_tomatoes_reviews_2[index2].update({'review': review})

                #### review date
                review_date = row.find(class_='review-date subtle small').text
                review_date = review_date.replace(' '*32, '').replace(' '*28, '').replace('\n', '')
#                 print(review_date)
                rotten_tomatoes_reviews_2[index2].update({'review_date': review_date})

                #### review score
                original_score = row.find('div', class_='small subtle review-link').text
                if 'Original Score:' in original_score:
                    score = original_score[original_score.find('Original Score:'):]
                    score = score[score.find(':')+2:]
                    score = score.replace(' '*36, '').replace(' '*32, '').replace('\n', '')
                else:
                    score = None
                rotten_tomatoes_reviews_2[index2].update({'original_score': score})
#                 print(score)

                #### top critic?
                if row.find('span', class_='glyphicon glyphicon-star'):
                    top_critic = True
                else:
                    top_critic = False
                rotten_tomatoes_reviews_2[index2].update({'top_critic': top_critic})
#                 print(top_critic)

                index2 +=1

        print('done:', movie)

    except:
        print('error at', movie)
        error_list_2.append(movie)

In [0]:
updated_error_list = pd.Series(error_list_2)
updated_error_list.to_csv('updated_error_list.csv', header=False)
rt_reviews_2 = pd.DataFrame(rotten_tomatoes_reviews_2)
rt_reviews_2.to_csv('rt_reviews_addtional.csv')
no_match = pd.Series(no_match_list)
no_match.to_csv('no_match_in_rt.csv', header=False)

In [0]:
rt_reviews_all = pd.concat([rt_reviews_1, rt_reviews_2], axis=0, ignore_index=True)