In [1]:
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup

import tmdbsimple

In [2]:
# Ratings table that the rest of the notebook works on.
data_pathway = './data/ratings.csv'

data = pd.read_csv(data_pathway)

# The TMDb API key is kept in a local text file rather than in the notebook.
api_pathway = './data/api_key.txt'
with open(api_pathway, "r") as api_key_file:
    # Strip surrounding whitespace: a trailing newline in the file would
    # otherwise be embedded in every request URL built from the key.
    api_key = api_key_file.read().strip()

In [11]:
def preprocess_data(data):
    """Return a cleaned copy of the ratings table with a leading 'Primary Key' column.

    Rows containing any NaN entry are removed and 'Year' is cast to int.
    A 0-based running counter is then inserted as the first column.
    The input frame is not modified, and the surviving rows keep their
    original index labels.
    """
    cleaned = data.dropna().astype({'Year': 'int'})

    # One key per surviving row, counted from zero.
    row_count = len(cleaned['Letterboxd URI'])
    cleaned.insert(0, column='Primary Key', value=range(0, row_count))

    return cleaned



# Resolves each rated film to its TMDb record: the Letterboxd page behind every
# URI is scraped for its "TMDb" button, and the linked movie/TV entry is then
# requested from the TMDb API. A film that cannot be resolved contributes one
# all-NA row, so the output always lines up with the input.

# Seconds to wait on a single HTTP request before giving up on that film.
_TMDB_REQUEST_TIMEOUT = 30

# TMDb response keys copied verbatim into the output; their order fixes the
# order of the returned lists that follow the genres list.
_TMDB_SCALAR_FIELDS = ('runtime', 'revenue', 'budget', 'original_language',
                       'vote_average', 'vote_count', 'popularity', 'type')


def _parse_TMDb_link(href):
    """Return ``(kind, tmdb_id)`` for a TMDb movie/tv URL, or None for anything else."""
    import re

    # Matching the digits directly works with or without a trailing slash.
    match = re.search(r'/(movie|tv)/(\d+)', href or '')
    if match is None:
        return None
    return match.group(1), match.group(2)


def _find_TMDb_link(soup):
    """Return the href of the first "TMDb" button on a parsed Letterboxd page, or None."""
    container = soup.find(class_="col-17")
    if container is None:
        return None
    for section in container.find_all("section", class_="section col-10 col-main"):
        for paragraph in section.find_all("p", class_="text-link text-footer"):
            for link in paragraph.find_all("a", class_="micro-button track-event"):
                if link.text == "TMDb":
                    return link.get("href")
    return None


def _fetch_TMDb_response(URI):
    """Return ``(tmdb_id, response_dict)`` for one Letterboxd URI, or ``(None, {})`` if unresolved."""
    page = requests.get(URI, timeout=_TMDB_REQUEST_TIMEOUT)
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "html.parser")
    parsed = _parse_TMDb_link(_find_TMDb_link(soup))
    if parsed is None:
        return None, {}

    kind, tmdb_id = parsed
    url = "https://api.themoviedb.org/3/" + kind + "/" + tmdb_id
    # Passing the key through `params` lets requests URL-encode it; the strip
    # guards against a trailing newline left over from reading the key file.
    reply = requests.get(url, params={"api_key": api_key.strip()},
                         timeout=_TMDB_REQUEST_TIMEOUT)
    reply.raise_for_status()

    response = reply.json()
    if not isinstance(response, dict):
        return tmdb_id, {}
    return tmdb_id, response


def extract_TMDb_data(data, verbose=True):
    """Collect TMDb metadata for every film listed in ``data``.

    Parameters
    ----------
    data : DataFrame
        Must contain a 'Letterboxd URI' column with one film page URL per row.
    verbose : bool, default True
        When True, print the TMDb ID, title and genres of each resolved film
        as it is processed.

    Returns
    -------
    list
        Ten equal-length sequences, in this order: secondary key (a 0-based
        ``range``), genres (list of genre names per film), runtime, revenue,
        budget, original language, vote average, vote count, popularity, type.
        A value is ``pd.NA`` wherever TMDb did not supply it.

    Notes
    -----
    Exactly one entry is appended to every list per input row. Blank or
    missing URIs, pages without a movie/tv TMDb link, and failed requests all
    produce an all-NA row; failed requests additionally emit a warning.
    Relies on ``requests``, ``BeautifulSoup`` and the ``api_key`` variable
    defined in earlier cells.
    """
    import warnings

    # We'll use the URI column to find the TMDb entry for each movie
    URI_list = data["Letterboxd URI"]

    # Propagate primary key of data as secondary key for generated dataset
    secondary_key = range(0, len(URI_list))

    genres_list = []
    scalar_lists = {field: [] for field in _TMDB_SCALAR_FIELDS}

    for URI in URI_list:
        tmdb_id, response = None, {}

        # A missing or blank URI cannot be looked up; it becomes an all-NA row.
        if isinstance(URI, str) and URI.strip():
            try:
                tmdb_id, response = _fetch_TMDb_response(URI)
            except (requests.RequestException, ValueError) as error:
                # One unreachable page or malformed reply should not discard
                # the results already gathered for the other films.
                warnings.warn("TMDb lookup failed for " + URI + ": " + str(error))
                tmdb_id, response = None, {}

        if verbose and tmdb_id is not None:
            print(tmdb_id)
            for title_key in ('title', 'original_title', 'name'):
                if title_key in response:
                    print(response[title_key])
                    break

        if 'genres' in response:
            movie_genres = [genre['name'] for genre in response['genres']]
            if verbose:
                print(movie_genres)
            genres_list.append(movie_genres)
        else:
            # Should probably implement a scrape on the Letterboxd website to get genres as alternative
            # since the website has genre information from URI
            genres_list.append(pd.NA)

        for field in _TMDB_SCALAR_FIELDS:
            scalar_lists[field].append(response.get(field, pd.NA))

    TMDb_data = [secondary_key, genres_list]
    TMDb_data.extend(scalar_lists[field] for field in _TMDB_SCALAR_FIELDS)

    return TMDb_data


In [12]:
# Clean the ratings loaded from the CSV (drops rows with NaN, casts 'Year' to
# int, adds 'Primary Key') and display the resulting frame.
processed_data = preprocess_data(data)
processed_data

Unnamed: 0,Primary Key,Date,Name,Year,Letterboxd URI,Rating
0,0,2020-09-11,Knives Out,2019,https://boxd.it/jWEA,4.0
1,1,2020-09-11,Inception,2010,https://boxd.it/1skk,4.0
2,2,2020-09-11,Spider-Man: Into the Spider-Verse,2018,https://boxd.it/azpY,3.5
3,3,2020-09-11,Avengers: Infinity War,2018,https://boxd.it/9vEe,3.5
4,4,2020-09-11,Guardians of the Galaxy,2014,https://boxd.it/3VH2,3.5
...,...,...,...,...,...,...
386,385,2023-07-18,Joy Ride,2023,https://boxd.it/wEeK,3.0
387,386,2023-07-20,The Story of Stuff,2007,https://boxd.it/4JBI,4.5
388,387,2023-07-22,Barbie,2023,https://boxd.it/bCLK,3.5
389,388,2023-07-23,My Neighbor Totoro,1988,https://boxd.it/20eA,3.0


In [13]:
# Scrape/fetch once; the result is a list of columns in the order named below.
TMDb_columns = extract_TMDb_data(processed_data)
column_names = ['Secondary Key', 'Genres', 'Runtime', 'Revenue', 'Budget', 'Original Language', 'Vote Average', 'Vote Count', 'Popularity', 'Type']

# Building the frame from a {name: column} mapping lets each column keep its
# own dtype (row-wise construction followed by a transpose makes every column
# `object`), and raises if the columns are not all the same length instead of
# silently padding the short ones with NaN.
TMDb_data = pd.DataFrame(dict(zip(column_names, TMDb_columns)))

546554
Knives Out
['Comedy', 'Crime', 'Mystery']
27205
Inception
['Action', 'Science Fiction', 'Adventure']
324857
Spider-Man: Into the Spider-Verse
['Action', 'Adventure', 'Animation', 'Science Fiction']
299536
Avengers: Infinity War
['Adventure', 'Action', 'Science Fiction']
118340
Guardians of the Galaxy
['Action', 'Science Fiction', 'Adventure']
38
Eternal Sunshine of the Spotless Mind
['Science Fiction', 'Drama', 'Romance']
140607
Star Wars: The Force Awakens
['Adventure', 'Action', 'Science Fiction']
37799
The Social Network
['Drama']
181808
Star Wars: The Last Jedi
['Adventure', 'Action', 'Science Fiction']
315635
Spider-Man: Homecoming
['Action', 'Adventure', 'Science Fiction', 'Drama']
11
Star Wars
['Adventure', 'Action', 'Science Fiction']
150540
Inside Out
['Animation', 'Family', 'Adventure', 'Drama', 'Comedy']
13
Forrest Gump
['Comedy', 'Drama', 'Romance']
603
The Matrix
['Action', 'Science Fiction']
24428
The Avengers
['Science Fiction', 'Action', 'Adventure']
283995
Guard

In [14]:
# Display the TMDb DataFrame assembled in the previous cell.
TMDb_data

Unnamed: 0,Secondary Key,Genres,Runtime,Revenue,Budget,Original Language,Vote Average,Vote Count,Popularity,Type
0,0,"[Comedy, Crime, Mystery]",131,312897920,40000000,en,7.851,10992,48.269,
1,1,"[Action, Science Fiction, Adventure]",148,825532764,160000000,en,8.365,34208,87.383,
2,2,"[Action, Adventure, Animation, Science Fiction]",117,375464627,90000000,en,8.404,13729,252.448,
3,3,"[Adventure, Action, Science Fiction]",149,2052415039,300000000,en,8.259,27420,179.154,
4,4,"[Action, Science Fiction, Adventure]",121,772776600,170000000,en,7.905,26448,45.941,
...,...,...,...,...,...,...,...,...,...,...
385,385,[Comedy],95,11000000,0,en,6.535,100,63.904,
386,386,"[Animation, Documentary]",22,0,0,en,7.7,18,1.226,
387,387,"[Comedy, Adventure, Fantasy]",114,1052300070,145000000,en,7.49,2680,2441.778,
388,388,"[Fantasy, Animation, Family]",86,41000000,3700000,ja,8.1,7013,55.156,


In [None]:
# TODO: Figure out the exploratory data analysis (EDA).

In [None]:
# TODO (w.r.t. scripts): recent movies will probably not have scripts out yet.
# What about non-English scripts? Subtitles may be a usable alternative.