In [1]:
import requests
import time

# requirements
from bs4 import BeautifulSoup
from requests import TooManyRedirects
import re

def make_soup(url):
    try:
        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
    except TooManyRedirects:
        soup = ''
    return soup

#regex patterns
page_pat = re.compile(r'Page 1 of \d+')
review_pat = re.compile(r'<div class=\"the_review\" data-qa=\"review-text\">[;a-zA-Z\s,-.\'\/\?\[\]\":\']*</div>')
rating_pat = re.compile(r'Original Score:\s([A-Z](\+|-)?|\d(.\d)?(\/\d)?)')
fresh_pat = re.compile(r'small\s(fresh|rotten)\"')
critic_pat = re.compile(r'\/\"\>([A-Z][a-zA-Z]+\s[A-Z][a-zA-Z\-]+)|([A-Z][a-zA-Z.]+\s[A-Z].?\s[A-Z][a-zA-Z]+)|([A-Z][a-zA-Z]+\s[A-Z]+\'[A-Z][a-zA-Z]+)')
publisher_pat = re.compile(r'\"subtle\">[a-zA-Z\s,.\(\)\'\-&;!\/\d+]+</em>')
date_pat = re.compile(r'[a-zA-Z]+\s\d+,\s\d+')

In [2]:
def get_critic_reviews_from_page(soup):
    reviews = list()
    rating = list()
    fresh = list()
    critic = list()
    top_critic = list()
    publisher = list()
    date = list()
    
    soup = str(soup)
    review_soup = soup.split('="review_table')[1].split('row review_table_row')
    review_soup.pop(0)

    for review in review_soup:
        match = re.findall(review_pat, str(review))
        if len(match) > 0:
            m = match[0]            
            for iden in ['<div class="the_review" data-qa="review-text"> ','</div>']:
                m = m.replace(iden,'')
            reviews.append(m.strip('"'))            
            # extract rating
            match = re.findall(rating_pat, str(review))
            if len(match) > 0:
                m = match[0][0]
                if '/1' in m:
                    sp_m = m.split('/')
                    if sp_m[-1] == '1':
                        sp_m[-1] = '10'
                    m = '/'.join(sp_m)
                rating.append(m)
            else:
                rating.append(None)            
            # extract fresh indicator
            match = re.findall(fresh_pat, str(review))
            if len(match) > 0:
                fresh.append(match[0])
            else:
                fresh.append(None)            
            # extract critic
            match = re.findall(critic_pat, str(review))
            if len(match) > 0:
                critic.append(''.join(match[0]))
            else:
                critic.append(None)            
            # check if top critic
            if '> Top Critic<' in str(review):
                top_critic.append(1)
            else:
                top_critic.append(0)            
            # extract publisher
            match = re.findall(publisher_pat, str(review))
            if len(match) > 0:
                m = match[0]
                m = m.replace('"subtle">', '')
                m = m.replace('</em>','')
                publisher.append(m)
            else:
                publisher.append(None)            
            # extract date
            match = re.findall(date_pat, str(review))
            if len(match) > 0:
                date.append(match[0].strip('"'))
            else:
                date.append(None)
            
    return [reviews, rating, fresh, critic, top_critic, publisher, date]

def get_num_pages(soup):
    match = re.findall(page_pat,str(list(soup)))
    if len(match) > 0:
        match = match[0]
        match = match.split(' of ')[-1]
        return match
    else:
        return None

In [3]:
def get_critic_reviews(page):
    info = [[],[],[],[],[],[],[]]
    soup = make_soup(page + "reviews")
#     print(soup)
    pages = get_num_pages(soup)
#     print(pages)
    if pages is not None:
        for page_num in range(1,int(pages)+1):
            soup = make_soup(page + "reviews?page=" + str(page_num) + "&sort=")
            c_info = get_critic_reviews_from_page(soup)
            
            # accumulate review info
            for i in range(len(c_info)):
                info[i] = info[i] + c_info[i]
        
        c_info = dict()
        keys = ['reviews', 'rating', 'fresh', 'critic', 'top_critic', 'publisher', 'date']
        for k in range(len(keys)):
            c_info[keys[k]] = info[k]        
    else:
        c_info = None        
    return c_info

In [4]:
movie_urls = {'Wonder Woman 1984': 'https://www.rottentomatoes.com/m/wonder_woman_1984/', 
              'Soul':'https://www.rottentomatoes.com/m/soul_2020/', 
              'Mulan': 'https://www.rottentomatoes.com/m/mulan_2020/',
              'Birds of Prey': 'https://www.rottentomatoes.com/m/birds_of_prey_2020/',
              'Sonic': 'https://www.rottentomatoes.com/m/sonic_the_hedgehog_2020/'
             }

movie_urls["Captain Marvel"] = "https://www.rottentomatoes.com/m/captain_marvel/"
movie_urls["Lion King"] = "https://www.rottentomatoes.com/m/the_lion_king_2019/"
movie_urls["Aladdin"] = "https://www.rottentomatoes.com/m/aladdin/"
movie_urls["Joker"] = "https://www.rottentomatoes.com/m/joker_2019/"
movie_urls["Shazam!"] = "https://www.rottentomatoes.com/m/shazam/"
movie_urls["Godzilla: King of the Monsters"] = "https://www.rottentomatoes.com/m/godzilla_king_of_the_monsters_2019/"

movie_urls['Tenet'] = 'https://www.rottentomatoes.com/m/tenet/'
movie_urls['Scoob'] = 'https://www.rottentomatoes.com/m/scoob/'
movie_urls['The Marksman'] = 'https://www.rottentomatoes.com/m/the_marksman_2021/'
movie_urls['Artemis Fowl'] = 'https://www.rottentomatoes.com/m/artemis_fowl/'
movie_urls['Lovebirds'] = 'https://www.rottentomatoes.com/m/the_lovebirds_2020/'

movie_urls.keys()

dict_keys(['Wonder Woman 1984', 'Soul', 'Mulan', 'Birds of Prey', 'Sonic', 'Captain Marvel', 'Lion King', 'Aladdin', 'Joker', 'Shazam!', 'Godzilla: King of the Monsters', 'Tenet', 'Scoob', 'The Marksman', 'Artemis Fowl', 'Lovebirds'])

In [5]:
import pandas as pd

dfs = []
for key in movie_urls.keys():
    temp = get_critic_reviews(movie_urls[key])
    df = pd.DataFrame.from_dict(temp)
    df['Film'] = key
    dfs.append(df)

In [6]:
all_films = pd.concat(dfs)
all_films

Unnamed: 0,reviews,rating,fresh,critic,top_critic,publisher,date,Film
0,"<div class=""the_review"" data-qa=""review-text"">...",,fresh,Karen M. Peterson,0,,"January 25, 2021",Wonder Woman 1984
1,"<div class=""the_review"" data-qa=""review-text"">...",C-,rotten,,0,,"January 23, 2021",Wonder Woman 1984
2,"<div class=""the_review"" data-qa=""review-text"">...",2/5,rotten,,0,,"January 22, 2021",Wonder Woman 1984
3,"<div class=""the_review"" data-qa=""review-text"">...",,rotten,,0,,"January 20, 2021",Wonder Woman 1984
4,"<div class=""the_review"" data-qa=""review-text"">...",,rotten,,0,,"January 20, 2021",Wonder Woman 1984
...,...,...,...,...,...,...,...,...
153,"<div class=""the_review"" data-qa=""review-text"">...",,fresh,,0,,"May 20, 2020",Lovebirds
154,"<div class=""the_review"" data-qa=""review-text"">...",3/5,fresh,,0,,"May 20, 2020",Lovebirds
155,"<div class=""the_review"" data-qa=""review-text"">...",3/4,fresh,,0,,"May 20, 2020",Lovebirds
156,"<div class=""the_review"" data-qa=""review-text"">...",3.5/5,fresh,,0,,"May 20, 2020",Lovebirds


In [7]:
all_films['score'] = all_films['fresh'].apply(lambda x: 1 if x == 'fresh' else 0)
all_films

Unnamed: 0,reviews,rating,fresh,critic,top_critic,publisher,date,Film,score
0,"<div class=""the_review"" data-qa=""review-text"">...",,fresh,Karen M. Peterson,0,,"January 25, 2021",Wonder Woman 1984,1
1,"<div class=""the_review"" data-qa=""review-text"">...",C-,rotten,,0,,"January 23, 2021",Wonder Woman 1984,0
2,"<div class=""the_review"" data-qa=""review-text"">...",2/5,rotten,,0,,"January 22, 2021",Wonder Woman 1984,0
3,"<div class=""the_review"" data-qa=""review-text"">...",,rotten,,0,,"January 20, 2021",Wonder Woman 1984,0
4,"<div class=""the_review"" data-qa=""review-text"">...",,rotten,,0,,"January 20, 2021",Wonder Woman 1984,0
...,...,...,...,...,...,...,...,...,...
153,"<div class=""the_review"" data-qa=""review-text"">...",,fresh,,0,,"May 20, 2020",Lovebirds,1
154,"<div class=""the_review"" data-qa=""review-text"">...",3/5,fresh,,0,,"May 20, 2020",Lovebirds,1
155,"<div class=""the_review"" data-qa=""review-text"">...",3/4,fresh,,0,,"May 20, 2020",Lovebirds,1
156,"<div class=""the_review"" data-qa=""review-text"">...",3.5/5,fresh,,0,,"May 20, 2020",Lovebirds,1


In [8]:
all_films['date'] = pd.to_datetime(all_films['date'], format="%B %d, %Y")
all_films = all_films.sort_values('date')

In [9]:
import numpy as np


def calculate_score(film, frame):
    df = frame[frame['Film'] == film]
    grouped_1 = df[['date', 'score']].groupby('date').agg([sum, 'count'])
    grouped_1.columns = grouped_1.columns.droplevel(0)
    return grouped_1.cumsum()['sum']/grouped_1.cumsum()['count']

calculate_score("Wonder Woman 1984", all_films)

date
2020-12-09    1.000000
2020-12-15    0.888889
2020-12-16    0.900000
2020-12-17    0.863636
2020-12-18    0.854167
2020-12-19    0.836735
2020-12-20    0.830189
2020-12-21    0.761905
2020-12-22    0.720588
2020-12-23    0.690476
2020-12-24    0.650943
2020-12-25    0.634146
2020-12-26    0.617647
2020-12-27    0.614286
2020-12-28    0.602484
2020-12-29    0.584795
2020-12-30    0.570621
2020-12-31    0.562842
2021-01-02    0.559783
2021-01-03    0.553763
2021-01-04    0.567708
2021-01-05    0.567010
2021-01-06    0.564103
2021-01-07    0.563452
2021-01-08    0.560606
2021-01-11    0.562814
2021-01-12    0.562189
2021-01-14    0.561576
2021-01-15    0.563725
2021-01-16    0.565217
2021-01-20    0.559809
2021-01-22    0.557143
2021-01-23    0.554502
2021-01-25    0.556604
dtype: float64