In [1]:
import json
import os
import re
import time

import certifi
import pandas as pd
import pymongo
import urllib3

In [2]:
# Construct all regular expressions
# Every pattern has exactly one capture group, so findall() returns a flat
# list of strings for each of them. None of them is compiled with re.DOTALL,
# so "." never crosses a newline and the greedy ".+" / ".*" groups stay on
# a single line of the HTML.

# Captures the text inside the <h3> that directly follows class="title">
title_regex = re.compile(r"class=\"title\"><h3>(.+)</h3>")
# Captures the contents of the <span> that follows class="clamp-details"> and whitespace
date_regex = re.compile(r"class=\"clamp-details\">\s+<span>(.+)</span>")
# Lazily captures everything after class="summary"> up to the first </div>,
# with surrounding whitespace excluded from the group
description_regex = re.compile(r"class=\"summary\">\s*([\S\s]+?)\s*<\/div>")
# Matches the "Metascore:" label, the critic-reviews anchor and the
# "metascore_w large movie..." div; captures that div's contents
score_regex = re.compile(r"<span class=\"title\">Metascore:<\/span>\s+<a class=\"metascore_anchor\" href=\"\/movie\/.*?\/critic-reviews\">\s+<div class=\"metascore_w large movie.+\">(.*?)<\/div>")
# Captures the src attribute of the <img> nested in a /movie/... link
# NOTE(review): the pattern requires a literal space in `href =` - confirm
# this matches the page markup, otherwise it will never match.
image_regex = re.compile(r"<a href =\"/movie/.*\"><img src=\"(.*)\" alt=\"")

# Connection pool reused for every request; uses certifi's CA bundle
http = urllib3.PoolManager(ca_certs=certifi.where())

In [3]:
# Retrieve credentials
# The location of the credentials file can be overridden with the
# METACRITIC_CREDENTIALS_PATH environment variable, so the notebook is not
# tied to one machine. It falls back to the original hard-coded path so
# existing setups keep working unchanged.
credentials_path = os.environ.get(
  "METACRITIC_CREDENTIALS_PATH",
  "/Users/tiffanivick/Desktop/credentials.json",
)

with open(credentials_path) as f:
  credentials = json.load(f)

# The file is read as a JSON object whose 'mongodb' key holds the connection string
mongo_connection_string = credentials['mongodb']

# Fetch the database named "DB1"
client = pymongo.MongoClient(mongo_connection_string, tlsCAFile=certifi.where())
db1_database = client['DB1']

# Collection that the scraped movies are inserted into
metacritic_data = db1_database['metacritic-movies']

In [4]:
# Retrieve a list of movies from a particular year and page of Metacritic
def metacritic_scraper(year: int, page: int) -> pd.DataFrame:
  """Scrape one page of Metacritic's by-year movie listing.

  Args:
    year: Value substituted into the ``year_selected`` query parameter.
    page: Value substituted into the ``page`` query parameter.

  Returns:
    A DataFrame with the columns 'title', 'dates', 'descriptions',
    'scores' and 'images', holding the matches of the corresponding
    module-level regular expressions. It has zero rows when none of the
    patterns match.

  Raises:
    ValueError: Raised by pandas when the patterns match different numbers
      of items, since the columns would then have unequal lengths.
  """
  # Fetch the webpage
  # The query string must be an f-string: the previous plain string sent the
  # literal text "(year)" and "(page)", so every call requested the same URL.
  url = (
    "https://www.metacritic.com/browse/movies/score/metascore/year/filtered"
    f"?year_selected={year}&sort=desc&view=detailed&page={page}"
  )
  response = http.request('GET', url, headers={'User-Agent': 'Mozilla/5.0'})
  datastring = str(response.data, 'utf-8')

  # Execute all the regular expressions
  titles = title_regex.findall(datastring)
  dates = date_regex.findall(datastring)
  descriptions = description_regex.findall(datastring)
  scores = score_regex.findall(datastring)
  images = image_regex.findall(datastring)

  # Return a unified collection
  # Rows are formed by position, i.e. the i-th match of each pattern
  dataset = {'title': titles, 'dates': dates, 'descriptions': descriptions, 'scores': scores, 'images': images}

  return pd.DataFrame(dataset)

In [5]:
# Scrape every year page by page and insert each page of movies into MongoDB

# Maps the column names of the DataFrame returned by metacritic_scraper to
# the field names stored in MongoDB. The previous code read row.date,
# row.description, row.score and row.image, which do not exist on the
# DataFrame (its columns are plural) and raised AttributeError.
COLUMN_TO_FIELD = {
  'title': 'title',
  'dates': 'release_date',
  'descriptions': 'description',
  'scores': 'metascore',
  'images': 'image_url',
}

for year in range(2000, 2023):
  page = 0

  # Keep requesting successive pages of this year until one comes back empty
  while True:
    # Printed inside the loop so every requested page is reported, not just page 0
    print(f'Collecting data for {year} page {page}...')
    page_df = metacritic_scraper(year, page)

    # Stop when we reach a page with zero rows
    if len(page_df) == 0:
      break

    # Convert the dataframe into a list of movies to insert into MongoDB
    movies_to_insert = (
      page_df[list(COLUMN_TO_FIELD)]
      .rename(columns=COLUMN_TO_FIELD)
      .to_dict('records')
    )

    # Insert records into MongoDB
    print(f'Inserting {len(movies_to_insert)} movies for the year {year} page {page}')
    metacritic_data.insert_many(movies_to_insert)
    page += 1
      

Collecting data for 2000 page 0...
Collecting data for 2001 page 0...
Collecting data for 2002 page 0...
Collecting data for 2003 page 0...
Collecting data for 2004 page 0...
Collecting data for 2005 page 0...
Collecting data for 2006 page 0...
Collecting data for 2007 page 0...
Collecting data for 2008 page 0...
Collecting data for 2009 page 0...
Collecting data for 2010 page 0...
Collecting data for 2011 page 0...
Collecting data for 2012 page 0...
Collecting data for 2013 page 0...
Collecting data for 2014 page 0...
Collecting data for 2015 page 0...
Collecting data for 2016 page 0...
Collecting data for 2017 page 0...
Collecting data for 2018 page 0...
Collecting data for 2019 page 0...
Collecting data for 2020 page 0...
Collecting data for 2021 page 0...
Collecting data for 2022 page 0...
