# Part 1

In [1]:
links_by_year = ['https://www.metacritic.com/browse/movies/score/metascore/year/filtered?year_selected='+str(i)+'&sort=desc&view=detailed' for i in range(2018, 2023)]
links_by_year

['https://www.metacritic.com/browse/movies/score/metascore/year/filtered?year_selected=2018&sort=desc&view=detailed',
 'https://www.metacritic.com/browse/movies/score/metascore/year/filtered?year_selected=2019&sort=desc&view=detailed',
 'https://www.metacritic.com/browse/movies/score/metascore/year/filtered?year_selected=2020&sort=desc&view=detailed',
 'https://www.metacritic.com/browse/movies/score/metascore/year/filtered?year_selected=2021&sort=desc&view=detailed',
 'https://www.metacritic.com/browse/movies/score/metascore/year/filtered?year_selected=2022&sort=desc&view=detailed']

In [19]:
BASE_URL = 'https://www.metacritic.com'

In [1]:
import requests
from bs4 import BeautifulSoup

import pandas as pd

In [3]:
user_agent = {'User-agent': 'Mozilla/5.0'}
texts_by_year = [requests.get(link, headers=user_agent).text for link in links_by_year]
texts_by_year[0]

'<!DOCTYPE html>\n<html xmlns:og="http://opengraphprotocol.org/schema/" xmlns:fb="http://ogp.me/ns/fb#">\n<head>\n            <title>Best Movies for 2018 - Metacritic</title>\n        \n    <meta http-equiv="content-type" content="text/html; charset=UTF-8">\n\n    \n    <meta name="description" content="See how well critics are rating the Best Movies for 2018">\n\n    \n    \n    \n    <meta name="application-name" content="Metacritic">\n    <meta name="msapplication-TileColor" content="#000000">\n    <meta name="msapplication-TileImage" content="/images/win8tile/76bf1426-2886-4b87-ae1c-06424b6bb8a2.png">\n\n        \n    <meta name="facebook-domain-verification" content="618k3mbeki8tar7u6wvrum5lxs5cka" />\n\n    \n    \n                                        <meta property="og:title" content="Best Movies for 2018">\n                                                <meta property="og:type" content="website">\n                                                <meta property="og:url" conte

In [168]:
import json

with open("texts_by_year.json", "w") as f:
    json.dump(texts_by_year, f)

In [129]:
def one_page_by_year(page_text):
    soup = BeautifulSoup(page_text, 'html.parser')
    movies = soup.find_all("a", {"class": "title"})
    movie_names = [movie.get_text() for movie in movies]
    movie_urls = [movie['href'] for movie in movies]
    metascores = [score.text.split()[1] for score in soup.find_all('div', {'class':'clamp-metascore'})]
    userscores = [score.text.split()[2] for score in soup.find_all('div', {'class':'clamp-userscore'})]
    release_dates = [date.text.split()[0] + ' ' + date.text.split()[1][:-1] for date in soup.find_all('div', {'class':'clamp-details'})]
    return movie_names, movie_urls, metascores, userscores, release_dates

In [130]:
def one_year(year_text):
    # year_text is the first page for each year
    soup = BeautifulSoup(year_text, 'html.parser')
    pages = [a['href']for a in soup.find_all("a", {'class':'page_num'})]
    movie_names, movie_urls, metascores, userscores, release_dates = one_page_by_year(year_text)
    for page in pages:
        page_text = requests.get(BASE_URL+page, headers=user_agent).text
        movie_names1, movie_urls1, metascores1, userscores1, release_dates1 = one_page_by_year(page_text)
        movie_names += movie_names1
        movie_urls += movie_urls1
        metascores += metascores1
        userscores += userscores1
        release_dates += release_dates1
        
    return movie_names, movie_urls, metascores, userscores, release_dates

In [133]:
movie_dict = {'movie':[], 'year':[], 'url':[], 'metascore':[], 'userscore':[], 'release_date':[]}

for i, year_text in enumerate(tqdm(texts_by_year)):
    movie_names, movie_urls, metascores, userscores, release_dates = one_year(year_text)
    movie_year = [i+2018] * len(movie_names)
    movie_dict['movie'] += movie_names
    movie_dict['url'] += movie_urls
    movie_dict['year'] += movie_year
    movie_dict['metascore'] += metascores
    movie_dict['userscore'] += userscores
    movie_dict['release_date'] += release_dates

100%|██████████| 5/5 [00:28<00:00,  5.77s/it]


In [134]:
pd.DataFrame.from_dict(movie_dict).to_csv('movies_metacritic2.csv')

In [135]:
movies_df2 = pd.read_csv('movies_metacritic2.csv')
movies_df2.head()

Unnamed: 0.1,Unnamed: 0,movie,year,url,metascore,userscore,release_date
0,0,Roma,2018,/movie/roma,96,7.7,November 21
1,1,Amazing Grace,2018,/movie/amazing-grace-1972,94,8.2,November 23
2,2,Shoplifters,2018,/movie/shoplifters,93,8.4,November 23
3,3,Shoah: Four Sisters,2018,/movie/shoah-four-sisters,93,tbd,November 14
4,4,Gavagai,2018,/movie/gavagai,91,6.0,August 3


# Part 2

In [37]:
movies_df = pd.read_csv('movies_metacritic.csv')
movies_df.head()

Unnamed: 0.1,Unnamed: 0,movie,year,url
0,0,Roma,2018,/movie/roma
1,1,Amazing Grace,2018,/movie/amazing-grace-1972
2,2,Shoplifters,2018,/movie/shoplifters
3,3,Shoah: Four Sisters,2018,/movie/shoah-four-sisters
4,4,Gavagai,2018,/movie/gavagai


In [84]:
def reviews_by_movie(review_page_text):
    soup = BeautifulSoup(review_page_text, 'html.parser')
    review_urls = [a['href'] for a in soup.find_all('a', {"class": "read_full"})]
    url_bases = [urlparse(url).netloc for url in review_urls]
    return review_urls, url_bases

In [53]:
from tqdm import tqdm

In [61]:
movie_texts = []
for url in tqdm(movies_df.url):
    movie_texts.append(requests.get(BASE_URL+url+'/critic-reviews', headers=user_agent).text)

100%|██████████| 2793/2793 [39:21<00:00,  1.18it/s] 


In [169]:
with open("movie_texts.json", "w") as f:
    json.dump(movie_texts, f)

In [85]:
movie_reviews = {'movie':[], 'review_url':[], 'review_base':[]}
for i in tqdm(range(len(movies_df))):
    review_urls, url_bases = reviews_by_movie(movie_texts[i])
    movie_reviews['review_url'] += review_urls
    movie_reviews['review_base'] += url_bases
    movie_reviews['movie'] += [movies_df['movie'][i]]*len(review_urls)

100%|██████████| 2793/2793 [04:36<00:00, 10.10it/s]


In [86]:
pd.DataFrame.from_dict(movie_reviews).to_csv('reviews_metacritic.csv')

In [93]:
user_review_texts = []
for url in tqdm(movies_df.url):
    user_review_texts.append(requests.get(BASE_URL+url+'/user-reviews', headers=user_agent).text)

100%|██████████| 2793/2793 [40:36<00:00,  1.15it/s]  


In [170]:
with open("user_review_texts.json", "w") as f:
    json.dump(user_review_texts, f)

In [179]:
def user_reviews(user_review_text):
    soup = BeautifulSoup(user_review_text, 'html.parser')
    user_reviews = []
    user_scores = [user_score.text.strip() for user_score in soup.find_all('div', {'class':'left fl'})]
    for review in soup.find_all('div', {"class": "review_body"}):
        if review.find('span', {'class': 'blurb blurb_expanded'}):
            user_reviews.append(review.find('span', {'class': 'blurb blurb_expanded'}).text)
        else:
            user_reviews.append(review.text)
    return user_scores, user_reviews

In [180]:
movie_user_reviews = {'movie':[], 'user_score':[], 'user_review':[]}
for i in tqdm(range(len(movies_df))):
    scores, reviews = user_reviews(user_review_texts[i])
    movie_user_reviews['movie'] += [movies_df['movie'][i]]*len(reviews)
    movie_user_reviews['user_score'] += scores
    movie_user_reviews['user_review'] += reviews

100%|██████████| 2793/2793 [05:25<00:00,  8.57it/s]


In [181]:
pd.DataFrame.from_dict(movie_user_reviews).to_csv('user_reviews_metacritic.csv')

In [182]:
user_reviews_df = pd.read_csv('user_reviews_metacritic.csv', lineterminator='\n')
user_reviews_df

Unnamed: 0.1,Unnamed: 0,movie,user_score,user_review
0,0,Roma,1,Baffled by the positive reviews. This was one ...
1,1,Roma,10,"The best film I've seen this year, last year, ..."
2,2,Roma,0,I am shocked at the reviews of this thing call...
3,3,Roma,3,I guess it depends why you go to the movies. I...
4,4,Roma,3,"\nBored AF. Boring,boring,boring and boring. T..."
...,...,...,...,...
43831,43831,Me Time,6,"\nTurn your brain off, get the eternal child i..."
43832,43832,Me Time,0,Mark Wahlberg playing a 44 year old at 51. You...
43833,43833,Me Time,7,7.5/10Me Time isn't as bad as everybody's thin...
43834,43834,Me Time,7,"Kevin Hart is one of my favorite comedians, an..."


# Part 3

In [2]:
movie_reviews_df = pd.read_csv('reviews_metacritic.csv')
movie_reviews_df

Unnamed: 0.1,Unnamed: 0,movie,review_url,review_base
0,0,Roma,https://www.nola.com/entertainment/2018/12/rom...,www.nola.com
1,1,Roma,https://www.newyorker.com/magazine/2018/12/17/...,www.newyorker.com
2,2,Roma,https://chicago.suntimes.com/entertainment/rom...,chicago.suntimes.com
3,3,Roma,https://www.chicagotribune.com/entertainment/m...,www.chicagotribune.com
4,4,Roma,https://www.washingtonpost.com/goingoutguide/m...,www.washingtonpost.com
...,...,...,...,...
54864,54864,Asking for It,https://screenrant.com/asking-for-it-2022-movi...,screenrant.com
54865,54865,Asking for It,https://variety.com/2022/film/reviews/asking-f...,variety.com
54866,54866,Asking for It,https://www.rogerebert.com/reviews/asking-for-...,www.rogerebert.com
54867,54867,Asking for It,https://www.nytimes.com/2022/03/03/movies/aski...,www.nytimes.com


In [3]:
critics = movie_reviews_df.groupby('review_base')['review_base'].count().nlargest(6)
critics.index
# New York Times requires subscription

Index(['www.theguardian.com', 'www.nytimes.com', 'www.rogerebert.com',
       'variety.com', 'www.hollywoodreporter.com', 'www.latimes.com'],
      dtype='object', name='review_base')

In [4]:
movie_reviews_cleaned = movie_reviews_df[movie_reviews_df['review_base'].isin(critics.index.drop('www.nytimes.com'))][['movie', 'review_url', 'review_base']].reset_index(drop=True)
movie_reviews_cleaned

Unnamed: 0,movie,review_url,review_base
0,Roma,https://www.latimes.com/entertainment/movies/l...,www.latimes.com
1,Roma,https://www.rogerebert.com/reviews/roma-2018,www.rogerebert.com
2,Roma,https://www.theguardian.com/film/2018/aug/30/r...,www.theguardian.com
3,Roma,https://www.hollywoodreporter.com/review/roma-...,www.hollywoodreporter.com
4,Roma,https://variety.com/2018/film/reviews/roma-rev...,variety.com
...,...,...,...
11577,Me Time,https://variety.com/2022/film/reviews/me-time-...,variety.com
11578,Me Time,https://www.latimes.com/entertainment-arts/mov...,www.latimes.com
11579,Me Time,https://www.hollywoodreporter.com/movies/movie...,www.hollywoodreporter.com
11580,Asking for It,https://variety.com/2022/film/reviews/asking-f...,variety.com


In [12]:
from tqdm import tqdm

review_htmls = []
user_agent = {'User-agent': 'Mozilla/5.0'}
for url in tqdm(movie_reviews_cleaned['review_url']):
    review_htmls.append(requests.get(url, headers=user_agent).text)

100%|██████████| 11582/11582 [1:43:00<00:00,  1.87it/s] 


In [13]:
import json

with open("critic_reviews.json", "w") as f:
    json.dump(review_htmls, f)

In [18]:
movie_reviews_cleaned['review_html'] = review_htmls
movie_reviews_cleaned

Unnamed: 0,movie,review_url,review_base,review_html
0,Roma,https://www.latimes.com/entertainment/movies/l...,www.latimes.com,"<!DOCTYPE html><html class=""page article-page""..."
1,Roma,https://www.rogerebert.com/reviews/roma-2018,www.rogerebert.com,"<!DOCTYPE html>\n<html class=""no-js"" lang=""en""..."
2,Roma,https://www.theguardian.com/film/2018/aug/30/r...,www.theguardian.com,"<!doctype html>\n <html lang=""en"">\n ..."
3,Roma,https://www.hollywoodreporter.com/review/roma-...,www.hollywoodreporter.com,"<!DOCTYPE html>\n<!--[if IE 6]>\n<html id=""ie6..."
4,Roma,https://variety.com/2018/film/reviews/roma-rev...,variety.com,"<!DOCTYPE html>\n<!--[if IE 6]>\n<html id=""ie6..."
...,...,...,...,...
11577,Me Time,https://variety.com/2022/film/reviews/me-time-...,variety.com,"<!DOCTYPE html>\n<!--[if IE 6]>\n<html id=""ie6..."
11578,Me Time,https://www.latimes.com/entertainment-arts/mov...,www.latimes.com,"<!DOCTYPE html><html class=""page article-page""..."
11579,Me Time,https://www.hollywoodreporter.com/movies/movie...,www.hollywoodreporter.com,"<!DOCTYPE html>\n<!--[if IE 6]>\n<html id=""ie6..."
11580,Asking for It,https://variety.com/2022/film/reviews/asking-f...,variety.com,"<!DOCTYPE html>\n<!--[if IE 6]>\n<html id=""ie6..."


In [19]:
movie_reviews_cleaned.to_csv('critic_reviews_htmls.csv')

In [22]:
# LA Times

def latimes(review_text):
    try:
        soup = BeautifulSoup(review_text, 'html.parser')
        title = soup.find('h1').text.strip()
        body = '\n'.join([p.text.strip() for p in soup.find('div', {'class':'story-body'}).find_all('p')]).split('----')[0].strip()
        return title, body
    except:
        print(soup.prettify())

In [23]:
# Rogerebert

def rogerebert(review_text):
    try:
        soup = BeautifulSoup(review_text, 'html.parser')
        body = [p.text.strip() for p in soup.find_all('p')]
        remove_idx = len(body)
        for i in range(len(body)-1, 0, -1):
            if len(body[i].split()) > 20:
                remove_idx = i
                break
        return '', '\n'.join(body[1:remove_idx])
    except:
        print(soup.prettify())

In [24]:
# The Guardian

def guardian(review_text):
    try:
        soup = BeautifulSoup(review_text, 'html.parser')
        title = soup.find('h1').text
        body = '\n'.join([p.text.strip() for p in soup.find_all('p', {'class':'dcr-az7egx'})])
        return title, body
    except:
        print(soup.prettify())

In [25]:
# The Hollywood Reportor

def hollywood(review_text):
    try:
        soup = BeautifulSoup(review_text, 'html.parser')
        body_tmp = [p.text.strip().split('\n') for p in soup.find('main').find_all('p')]
        body = []
        for p in body_tmp:
            if '' not in p and len(p) == 1:
                body += p
        
        remove_idx = len(body)
        for i in range(len(body)-1, 0, -1):
            if len(body[i].split()) > 20:
                remove_idx = i
                break
        return '', '\n'.join(body[:remove_idx+1])
    except:
        print(soup.prettify())

In [26]:
# Variety

def variety(review_text):
    try:
        soup = BeautifulSoup(review_text, 'html.parser')
        return '', '\n'.join([p.text.strip() for p in soup.find('div', {'class':'vy-cx-page-content'}).find_all('p')])
    except:
        print(soup.prettify())

In [20]:
for i, html in enumerate(tqdm(review_htmls)):
    if '<html class="page error-page" lang="en-US">' in html:
        print(i)

  2%|▏         | 194/11582 [00:01<01:35, 119.51it/s]

175


 10%|█         | 1185/11582 [00:06<00:44, 233.60it/s]

1148


100%|██████████| 11582/11582 [00:57<00:00, 200.42it/s]


In [27]:
movie_reviews_cleaned2 = movie_reviews_cleaned.copy()
movie_reviews_cleaned2 = movie_reviews_cleaned2.drop([175, 1148]).reset_index(drop=True)

critic_review_title = []
critic_review_content = []
for i in tqdm(range(len(movie_reviews_cleaned2))):
    if movie_reviews_cleaned2['review_base'][i] == 'www.theguardian.com':
        title, content = guardian(movie_reviews_cleaned2['review_html'][i])
    elif movie_reviews_cleaned2['review_base'][i] == 'www.rogerebert.com':
        title, content = rogerebert(movie_reviews_cleaned2['review_html'][i])
    elif movie_reviews_cleaned2['review_base'][i] == 'variety.com':
        title, content = variety(movie_reviews_cleaned2['review_html'][i])
    elif movie_reviews_cleaned2['review_base'][i] == 'www.hollywoodreporter.com':
        title, content = hollywood(movie_reviews_cleaned2['review_html'][i])
    else:
        title, content = latimes(movie_reviews_cleaned2['review_html'][i])
    critic_review_title.append(title)
    critic_review_content.append(content)
movie_reviews_cleaned2['critic_review_title'] = critic_review_title
movie_reviews_cleaned2['critic_review_content'] = critic_review_content

100%|██████████| 11580/11580 [13:26<00:00, 14.35it/s]


In [28]:
movie_reviews_cleaned2

Unnamed: 0,movie,review_url,review_base,review_html,critic_review_title,critic_review_content
0,Roma,https://www.latimes.com/entertainment/movies/l...,www.latimes.com,"<!DOCTYPE html><html class=""page article-page""...",Review: ‘Roma’ lives up to lofty expectations ...,Alfonso Cuarón’s “Roma” has been the talk of t...
1,Roma,https://www.rogerebert.com/reviews/roma-2018,www.rogerebert.com,"<!DOCTYPE html>\n<html class=""no-js"" lang=""en""...",,Alfonso Cuaron’s “Roma” opens with a close-up ...
2,Roma,https://www.theguardian.com/film/2018/aug/30/r...,www.theguardian.com,"<!doctype html>\n <html lang=""en"">\n ...",Roma review: Alfonso Cuarón returns to Venice ...,"The Mexican director Alfonso Cuarón, whose bre..."
3,Roma,https://www.hollywoodreporter.com/review/roma-...,www.hollywoodreporter.com,"<!DOCTYPE html>\n<!--[if IE 6]>\n<html id=""ie6...",,"Alfonso Cuaron ('Gravity') returns with 'Roma,..."
4,Roma,https://variety.com/2018/film/reviews/roma-rev...,variety.com,"<!DOCTYPE html>\n<!--[if IE 6]>\n<html id=""ie6...",,Alfonso Cuarón is a filmmaker of such up-front...
...,...,...,...,...,...,...
11575,Me Time,https://variety.com/2022/film/reviews/me-time-...,variety.com,"<!DOCTYPE html>\n<!--[if IE 6]>\n<html id=""ie6...",,"“Me Time,” a slapdash comedy by writer-directo..."
11576,Me Time,https://www.latimes.com/entertainment-arts/mov...,www.latimes.com,"<!DOCTYPE html><html class=""page article-page""...",Review: Sylvester Stallone finds nuance in sup...,The pandemic-delayed grim-and-gritty superhero...
11577,Me Time,https://www.hollywoodreporter.com/movies/movie...,www.hollywoodreporter.com,"<!DOCTYPE html>\n<!--[if IE 6]>\n<html id=""ie6...",,Regina Hall also appears in the film written a...
11578,Asking for It,https://variety.com/2022/film/reviews/asking-f...,variety.com,"<!DOCTYPE html>\n<!--[if IE 6]>\n<html id=""ie6...",,"On paper, the premise of writer-director Eamon..."


In [32]:
movie_reviews_cleaned2.to_csv('critic_reviews_metacritic.csv')

## Genre

In [2]:
import pandas as pd

movies = pd.read_csv('data/movies_metacritic2.csv')
movies

Unnamed: 0.1,Unnamed: 0,movie,year,url,metascore,userscore,release_date
0,0,Roma,2018,/movie/roma,96,7.7,November 21
1,1,Amazing Grace,2018,/movie/amazing-grace-1972,94,8.2,November 23
2,2,Shoplifters,2018,/movie/shoplifters,93,8.4,November 23
3,3,Shoah: Four Sisters,2018,/movie/shoah-four-sisters,93,tbd,November 14
4,4,Gavagai,2018,/movie/gavagai,91,6.0,August 3
...,...,...,...,...,...,...,...
2788,2788,Big Gold Brick,2022,/movie/big-gold-brick,30,3.8,February 25
2789,2789,Mother Schmuckers,2022,/movie/mother-schmuckers,28,tbd,March 4
2790,2790,Blacklight,2022,/movie/blacklight,27,3.9,February 11
2791,2791,Me Time,2022,/movie/me-time,25,3.7,August 26


In [69]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

BASE_URL = 'https://www.metacritic.com'

user_agent = {'User-agent': 'Mozilla/5.0'}
movie_pages = [requests.get(BASE_URL+url, headers=user_agent).text for url in tqdm(movies['url'])]

100%|██████████| 2793/2793 [1:04:07<00:00,  1.38s/it]


In [70]:
import json

with open("movie_pages.json", "w") as f:
    json.dump(movie_pages, f)

In [82]:
def movie_info(movie_text):
    soup = BeautifulSoup(movie_text, 'html.parser')
    stars = ','.join([star.text for star in soup.find('div', {'class': 'summary_cast details_section'}).find_all('a')]) if soup.find('div', {'class': 'summary_cast details_section'}) else ''
    director = soup.find('div', {'class': 'director'}).find('a').text if soup.find('div', {'class': 'director'}) else ''
    genres = ','.join([genre.text for genre in soup.find('div', {'class': 'genres'}).find_all('span')][2:]) if soup.find('div', {'class': 'genres'}) else ''
    runtime = soup.find('div', {'class': 'runtime'}).text.strip().split('\n')[1] if soup.find('div', {'class': 'runtime'}) else ''
    return stars, director, genres, runtime

In [83]:
movie_stars = []
movie_dir = []
movie_genres = []
movie_runtime = []
for page in tqdm(movie_pages):
    stars, director, genres, runtime = movie_info(page)
    movie_stars.append(stars)
    movie_dir.append(director)
    movie_genres.append(genres)
    movie_runtime.append(runtime)

100%|██████████| 2793/2793 [06:27<00:00,  7.21it/s]


In [84]:
movies['starring'] = movie_stars
movies['director'] = movie_dir
movies['genre'] = movie_genres
movies['runtime'] = movie_runtime

movies.to_csv('movies_metacritic3.csv')

In [85]:
movies

Unnamed: 0.1,Unnamed: 0,movie,year,url,metascore,userscore,release_date,starring,director,genre,runtime
0,0,Roma,2018,/movie/roma,96,7.7,November 21,"Andy Cortés,Carlos Peralta,Daniela Demesa,Dieg...",Alfonso Cuarón,Drama,135 min
1,1,Amazing Grace,2018,/movie/amazing-grace-1972,94,8.2,November 23,"Alexander Hamilton,Aretha Franklin,Bernard Pre...",Alan Elliott,Documentary,87 min
2,2,Shoplifters,2018,/movie/shoplifters,93,8.4,November 23,"Akira Emoto,Chizuru Ikewaki,Daisuke Kuroda,Jyo...",Hirokazu Koreeda,"Drama,Crime",121 min
3,3,Shoah: Four Sisters,2018,/movie/shoah-four-sisters,93,tbd,November 14,"Ada Lichtman,Claude Lanzmann,Hanna Marton,Paul...",Claude Lanzmann,"History,Documentary",273 min
4,4,Gavagai,2018,/movie/gavagai,91,6.0,August 3,"Amel Ibisevic,Andreas Lust,Anni-Kristiina Juus...",Rob Tregenza,Drama,90 min
...,...,...,...,...,...,...,...,...,...,...,...
2788,2788,Big Gold Brick,2022,/movie/big-gold-brick,30,3.8,February 25,"Alys Crocker,Andy Garcia,Emory Cohen,Frederick...",Brian Petsos,"Fantasy,Comedy",132 min
2789,2789,Mother Schmuckers,2022,/movie/mother-schmuckers,28,tbd,March 4,"Agnès Proust,Claire Bodson,Fresco,Habib Ben Ta...",Harpo Guit,Comedy,70 min
2790,2790,Blacklight,2022,/movie/blacklight,27,3.9,February 11,"Aidan Quinn,Andrew Shaw,Claire van der Boom,Em...",Mark Williams,"Action,Thriller",108 min
2791,2791,Me Time,2022,/movie/me-time,25,3.7,August 26,"Carlo Rota,Deborah S. Craig,Jimmy O. Yang,Kevi...",John Hamburg,Comedy,101 min


critic ratings

In [7]:
import pandas as pd
import json
from bs4 import BeautifulSoup

In [8]:
with open('data_intermediate/movie_texts.json') as json_file:
    movie_texts = json.load(json_file)

In [52]:
from urllib.parse import urlparse

def reviews_by_movie2(review_page_text):
    soup = BeautifulSoup(review_page_text, 'html.parser')
    reviews = soup.find_all('a', {"class": "read_full"})
    review_scores = [int(a.parent.parent.parent.find('div', {'class':'left fl'}).text.strip()) for a in reviews]
    review_urls = [a['href'] for a in reviews]
    url_bases = [urlparse(url).netloc for url in review_urls]
    return review_scores, review_urls, url_bases

In [53]:
from tqdm import tqdm

movies_df = pd.read_csv('data/movies_metacritic3.csv')
movie_reviews = {'movie':[], 'critic_score':[], 'review_url':[], 'review_base':[]}
for i in tqdm(range(len(movies_df))):
    review_scores, review_urls, url_bases = reviews_by_movie2(movie_texts[i])
    movie_reviews['critic_score'] += review_scores
    movie_reviews['review_url'] += review_urls
    movie_reviews['review_base'] += url_bases
    movie_reviews['movie'] += [movies_df['movie'][i]]*len(review_urls)

pd.DataFrame.from_dict(movie_reviews).to_csv('reviews_metacritic_updated.csv')

100%|██████████| 2793/2793 [04:04<00:00, 11.41it/s]
