# DA320 Midterm Project - MongoDB Notebook

This Jupyter notebook scrapes the Metacritic website for movies released between 2000 and 2022 and stores the results in MongoDB.

Most of the code here is from Prof. Spencer's lecture slides, so all credit goes to him.


In [4]:
import re
import urllib3
import certifi
import pandas as pd
import json
import pymongo

# Machine-specific path to the JSON file that holds the MongoDB connection string
CREDENTIALS_FILE = "D:\\BC fall quarter 2022\\DA320 14730 - F22 - Data Acquisition & Mngmt\\week 6\\credentials.json"
# Key inside the credentials JSON whose value is the connection string
MONGO_DB_CONN_STR_KEY = "mongodb_connection_string"
DATABASE_NAME = "da320ZhanjuLi"
MIDTERM_METACRITIC_COLL_NAME = "midterm_metacritic"

# Construct all regular expressions.
# Every wildcard is lazy (`+?` / `*?`) so that a match stops at the first closing
# delimiter; a greedy wildcard would swallow everything up to the last delimiter
# on the line and merge several entries into one capture.
title_regex = re.compile(r"class=\"title\"><h3>(.+?)</h3>")
date_regex = re.compile(r"class=\"clamp-details\">\s+<span>(.+?)</span>")
# [\S\s] lets the summary text span several lines; surrounding whitespace is trimmed
description_regex = re.compile(r"<div class=\"summary\">\s*([\S\s]+?)\s*</div>")
score_regex = re.compile(r"<span class=\"title\">Metascore:</span>\s+<a class=\"metascore_anchor\" href=\"/movie/.*?/critic-reviews\">\s+<div class=\"metascore_w large movie .+?\">(.*?)</div>")
image_regx = re.compile(r"<a href=\"/movie/.*?\"><img src=\"(.*?)\" alt=\"")

# Construct an HTTP pool for connections. Certificate verification is requested
# explicitly and checked against certifi's CA bundle.
http = urllib3.PoolManager(cert_reqs="CERT_REQUIRED", ca_certs=certifi.where())

# Read the MongoDB connection string from the local credentials file.
# The encoding is pinned so the JSON is decoded the same way on every platform
# instead of depending on the operating system's default code page.
with open(CREDENTIALS_FILE, encoding="utf-8") as f:
    data = json.load(f)
    mongodb_connection_string = data[MONGO_DB_CONN_STR_KEY]


# Connect to MongoDB, validating the server's TLS certificate against certifi's CA bundle
client = pymongo.MongoClient(mongodb_connection_string, tlsCAFile=certifi.where())
da320_database = client[DATABASE_NAME]

# Collection handle for the midterm Metacritic data
midterm_metacritic = da320_database[MIDTERM_METACRITIC_COLL_NAME]

# Retrieve a list of movies from a particular year and page of Metacritic
def metacritic_scraper(year: int, page: int) -> pd.DataFrame:
    """Scrape one page of Metacritic's movie listing for a given year.

    Args:
        year: Value substituted into the ``year_selected`` query parameter.
        page: Value substituted into the ``page`` query parameter.

    Returns:
        A DataFrame with the columns ``title``, ``date``, ``description``,
        ``score`` and ``image``. Every value is the raw string captured by the
        corresponding regular expression. The frame has zero rows when none
        of the expressions match.

    Raises:
        ValueError: If the regular expressions find different numbers of
            matches, since the columns could not then be lined up row by row.
    """
    # Fetch the webpage
    url = f"https://www.metacritic.com/browse/movies/score/metascore/year/filtered?year_selected={year}&sort=desc&view=detailed&page={page}"
    # NOTE(review): a browser-like User-Agent is sent, presumably because the
    # site rejects the library default - confirm.
    response = http.request('GET', url, headers={'User-Agent': 'Mozilla/5.0'})
    # NOTE(review): response.status is not inspected, so an error page that
    # matches nothing yields an empty frame - confirm this is acceptable.
    datastring = str(response.data, "utf-8")

    # Execute all the regular expressions and gather the results by column
    dataset = {
        "title": title_regex.findall(datastring),
        "date": date_regex.findall(datastring),
        "description": description_regex.findall(datastring),
        "score": score_regex.findall(datastring),
        "image": image_regx.findall(datastring),
    }

    # Each expression runs independently over the page, so a single entry with
    # a missing field would shift every later row. Fail with the per-field
    # counts rather than with pandas' generic length error.
    counts = {field: len(values) for field, values in dataset.items()}
    if len(set(counts.values())) > 1:
        raise ValueError(
            f"Mismatched field counts for year {year} page {page}: {counts}"
        )

    return pd.DataFrame(dataset)


In [5]:
import re
import time


# Scrape every year from 2000 through 2022 and store each page of results in MongoDB
for year in range(2000, 2023):
    page = 0
    print(f"Collecting data for {year} page {page}...")

    # Walk the result pages in order; the first page with zero rows ends the year
    data = metacritic_scraper(year, page)
    while len(data) != 0:
        # Map each dataframe row onto the document layout used in the collection
        movies_to_insert = [
            {
                "title": row.title,
                "release_date": row.date,
                "description": row.description,
                "metascore": row.score,
                "image_url": row.image,
            }
            for row in data.itertuples()
        ]

        # Insert records into MongoDB
        print(f"Inserting {len(movies_to_insert)} for the year {year} page {page}")
        midterm_metacritic.insert_many(movies_to_insert)

        # Advance to the next page and fetch it before re-testing the loop condition
        page += 1
        data = metacritic_scraper(year, page)
        

Collecting data for 2000 page 0...
Inserting 100 for the year 2000 page 0
Inserting 100 for the year 2000 page 1
Inserting 100 for the year 2000 page 2
Inserting 65 for the year 2000 page 3
Collecting data for 2001 page 0...
Inserting 100 for the year 2001 page 0
Inserting 100 for the year 2001 page 1
Inserting 100 for the year 2001 page 2
Inserting 85 for the year 2001 page 3
Collecting data for 2002 page 0...
Inserting 100 for the year 2002 page 0
Inserting 100 for the year 2002 page 1
Inserting 100 for the year 2002 page 2
Inserting 100 for the year 2002 page 3
Inserting 30 for the year 2002 page 4
Collecting data for 2003 page 0...
Inserting 100 for the year 2003 page 0
Inserting 100 for the year 2003 page 1
Inserting 100 for the year 2003 page 2
Inserting 100 for the year 2003 page 3
Inserting 9 for the year 2003 page 4
Collecting data for 2004 page 0...
Inserting 100 for the year 2004 page 0
Inserting 100 for the year 2004 page 1
Inserting 100 for the year 2004 page 2
Inserting 1