In [1]:
# Import packages
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Request the user's films listing; the response is parsed in the next cells
# to find out how many pages the listing has.
username = 'USERNAME'
url = f'https://letterboxd.com/{username}/films/page/0/'
data = requests.get(url)
# Stop here on an HTTP error (4xx/5xx). Without this check an error page is
# parsed like a normal listing, no pagination is found, and the notebook
# carries on as if the user had a single page of films.
data.raise_for_status()

In [3]:
# Create a BeautifulSoup object to parse the HTML content of the retrieved data.
# The parser is named explicitly (the same one get_movie_features uses) so the
# parse does not depend on which optional parser libraries are installed.
soup = BeautifulSoup(data.text, 'html.parser')

In [4]:
# Work out how many pages the films listing has from its pagination links
try:
    # All 'a' elements inside the first 'div.pagination' on the page
    pages = soup.select('div.pagination')[0].find_all('a')
    # The text of the last link is taken as the total number of pages.
    # Converted right away so n_pages is an int on both code paths.
    n_pages = int(pages[-1].text)

# No pagination div, or no links inside it: treat the listing as a single page
except IndexError:
    n_pages = 1

In [5]:
def rating_to_numeric(film_rating_raw):
    """Convert a star-glyph rating string into a number.

    Every '★' in the string counts as 1 and every '½' counts as 0.5; all
    other characters are ignored. The result is always a float.
    """
    whole_stars, half_stars = (film_rating_raw.count(glyph) for glyph in ('★', '½'))
    return whole_stars + half_stars / 2

In [6]:
# Initialize empty lists to store film information
film_url_list = []
film_name_list = []
film_rating_list = []

# Loop through each page of the listing to collect film data
for page_idx in range(1, int(n_pages) + 1):
    # Construct the URL for the current page and fetch it
    url = f'https://letterboxd.com/{username}/films/page/{page_idx}/'
    data = requests.get(url)
    # An HTTP error page contains no posters; raise instead of silently
    # losing a whole page of films.
    data.raise_for_status()
    # Parser named explicitly (same one get_movie_features uses) so the parse
    # does not depend on which optional parser libraries are installed.
    soup = BeautifulSoup(data.text, 'html.parser')
    # Select all poster containers on the page
    posters = soup.select('li.poster-container')

    # Iterate through each poster to extract film information
    for poster in posters:
        try:
            # Film slug from the data-film-slug attribute of the poster div
            film_slug = poster.find('div', class_='film-poster')['data-film-slug']
            # Film name from the alt attribute of the poster image
            film_name = poster.find('img', class_='image')['alt']
            # Raw rating text (star glyphs) from the rating span
            film_rating_raw = poster.find('span', class_='rating').text
        except (AttributeError, KeyError, TypeError):
            # A poster is skipped when one of its pieces is missing:
            # find() returned None (TypeError on [...], AttributeError on
            # .text) or the tag lacks the attribute (KeyError). Posters with
            # no rating span are dropped this way. Anything else, including
            # KeyboardInterrupt, now propagates instead of being swallowed.
            continue

        film_url = f'https://letterboxd.com/film/{film_slug}'
        film_rating_numeric = rating_to_numeric(film_rating_raw)

        # Append film information to respective lists
        film_url_list.append(film_url)
        film_name_list.append(film_name)
        film_rating_list.append(film_rating_numeric)

# Create a dictionary to store the film data
df_data = {
    'Title': film_name_list,
    'Rating': film_rating_list,
    'URL': film_url_list
}

# Create a DataFrame using the collected data
rating_df = pd.DataFrame(data=df_data)


In [7]:
# Display the table of collected titles, ratings and film URLs
rating_df

Unnamed: 0,Title,Rating,URL
0,The Holdovers,5.0,https://letterboxd.com/film/the-holdovers
1,Oppenheimer,5.0,https://letterboxd.com/film/oppenheimer-2023
2,Barbie,5.0,https://letterboxd.com/film/barbie
3,Asteroid City,5.0,https://letterboxd.com/film/asteroid-city
4,May December,5.0,https://letterboxd.com/film/may-december
...,...,...,...
157,The King of Comedy,5.0,https://letterboxd.com/film/the-king-of-comedy
158,Grease,0.5,https://letterboxd.com/film/grease
159,Grey Gardens,5.0,https://letterboxd.com/film/grey-gardens
160,Dog Day Afternoon,5.0,https://letterboxd.com/film/dog-day-afternoon


In [8]:
def _parse_duration_minutes(footer_text):
    """Return the integer written just before 'mins' in footer_text, or 0 if there is none."""
    mins_at = footer_text.find('mins')
    if mins_at == -1:
        return 0
    # Take the last whitespace-separated token before 'mins'. Unlike a
    # fixed-width slice this works for any number of digits and for a runtime
    # at the very start of the text. str.split() also splits on
    # non-breaking spaces.
    tokens = footer_text[:mins_at].split()
    try:
        return int(tokens[-1])
    except (IndexError, ValueError):
        return 0


def get_movie_features(urls):
    """Scrape title, duration, cast, crew, studios and genres for each film page.

    Args:
        urls: Iterable of film page URLs. Each URL is fetched with one GET
            request, in order.

    Returns:
        pandas.DataFrame with one row per URL, in input order, and the columns
        'Title', 'Duration', 'Cast', 'Crew', 'Studios', 'Genres', 'URL'.
        'Duration' is an int (0 when no runtime could be read). The four
        list columns hold the link texts found in the corresponding tab and
        are empty, or partial, when the expected elements are missing.
        An empty `urls` gives an empty DataFrame.

    Raises:
        requests.HTTPError: if a page responds with an HTTP error status.
        IndexError: if a page has no 'h1.headline-1' element to read the
            title from.
    """
    # (position of the 'div.tabbed-content-block' on the page,
    #  output column name,
    #  positions of the 'div.text-sluglist' groups to read inside that block)
    tab_layout = (
        (0, 'Cast', (0,)),      # only cast box
        (1, 'Crew', (0,)),      # only director crew
        (2, 'Studios', (0,)),   # only studios
        (3, 'Genres', (0, 1)),  # first two groups of the genres tab
    )

    all_movie_data = []

    for url in urls:
        data = requests.get(url)
        # Raise on an HTTP error rather than scraping an error page.
        data.raise_for_status()
        soup = BeautifulSoup(data.text, 'html.parser')

        movie = {'Title': soup.select('h1.headline-1')[0].text}

        # Runtime in minutes, read from the first 'p.text-link' element;
        # 0 when that element or the number is missing.
        footer = soup.select('p.text-link')
        movie['Duration'] = _parse_duration_minutes(footer[0].text) if footer else 0

        # Select the tab blocks once per page instead of once per link group.
        tab_blocks = soup.select('div.tabbed-content-block')
        for tab, column, indices in tab_layout:
            names = movie[column] = []
            try:
                sluglists = tab_blocks[tab].select('div.text-sluglist')
                for index in indices:
                    for anchor in sluglists[index].find_all('a'):
                        label = anchor.text
                        # Skip the link that only expands the list.
                        if label != 'Show All…':
                            names.append(label)
            except IndexError:
                # Tab or link group not present on this page: keep whatever
                # was collected so far and move on to the next tab.
                continue

        movie['URL'] = url
        all_movie_data.append(movie)

    return pd.DataFrame(all_movie_data)


In [9]:
# Scrape the per-film features for every URL collected in rating_df, then display them
movie_features = get_movie_features(rating_df['URL'].values)
movie_features

Unnamed: 0,Title,Duration,Cast,Crew,Studios,Genres,URL
0,The Holdovers,133,"[Paul Giamatti, Dominic Sessa, Da'Vine Joy Ran...",[Alexander Payne],"[Miramax, Gran Via Productions]","[Drama, Comedy, Underdogs and coming of age, R...",https://letterboxd.com/film/the-holdovers
1,Oppenheimer,181,"[Cillian Murphy, Emily Blunt, Matt Damon, Robe...",[Christopher Nolan],"[Syncopy, Universal Pictures, Atlas Entertainm...","[History, Drama, Politics and human rights, Wa...",https://letterboxd.com/film/oppenheimer-2023
2,Barbie,114,"[Margot Robbie, Ryan Gosling, America Ferrera,...",[Greta Gerwig],"[LuckyChap Entertainment, Heyday Films, NB/GG ...","[Comedy, Fantasy, Adventure, Relationship come...",https://letterboxd.com/film/barbie
3,Asteroid City,105,"[Jason Schwartzman, Scarlett Johansson, Tom Ha...",[Wes Anderson],"[American Empirical Pictures, Indian Paintbrus...","[Drama, Comedy, Relationship comedy, Imaginati...",https://letterboxd.com/film/asteroid-city
4,May December,117,"[Natalie Portman, Julianne Moore, Chris Tenzis...",[Todd Haynes],"[Gloria Sanchez Productions, Killer Films, Mou...","[Drama, Comedy]",https://letterboxd.com/film/may-december
...,...,...,...,...,...,...,...
157,The King of Comedy,109,"[Robert De Niro, Jerry Lewis, Diahnne Abbott, ...",[Martin Scorsese],"[Embassy International Pictures, 20th Century ...","[Comedy, Drama, Crude humor and satire, Relati...",https://letterboxd.com/film/the-king-of-comedy
158,Grease,110,"[John Travolta, Olivia Newton-John, Stockard C...",[Randal Kleiser],"[Paramount, Robert Stigwood Organization, Fine...","[Comedy, Romance, Song and dance, Underdogs an...",https://letterboxd.com/film/grease
159,Grey Gardens,95,"[Edith Bouvier Beale, Edith Ewing Bouvier Beal...","[Albert Maysles, David Maysles, Muffie Meyer, ...","[Portrait Films, Maysles Films]","[Documentary, Humanity and the world around us...",https://letterboxd.com/film/grey-gardens
160,Dog Day Afternoon,125,"[Al Pacino, John Cazale, Charles Durning, Chri...",[Sidney Lumet],"[Artists Entertainment Complex, Warner Bros. P...","[Crime, Drama, Thriller, Crime, drugs and gang...",https://letterboxd.com/film/dog-day-afternoon


In [10]:
# Merge the rating_df and movie_features DataFrames based on the 'Title' and 'URL' columns.
# A (Title, URL) pair that appears more than once on both sides would be paired
# many-to-many and multiply rows, so each side is first reduced to one row per
# key pair (the first occurrence is kept).
merge_keys = ['Title', 'URL']
df = rating_df.drop_duplicates(subset=merge_keys).merge(
    movie_features.drop_duplicates(subset=merge_keys),
    on=merge_keys,
)
df.head()

Unnamed: 0,Title,Rating,URL,Duration,Cast,Crew,Studios,Genres
0,The Holdovers,5.0,https://letterboxd.com/film/the-holdovers,133,"[Paul Giamatti, Dominic Sessa, Da'Vine Joy Ran...",[Alexander Payne],"[Miramax, Gran Via Productions]","[Drama, Comedy, Underdogs and coming of age, R..."
1,Oppenheimer,5.0,https://letterboxd.com/film/oppenheimer-2023,181,"[Cillian Murphy, Emily Blunt, Matt Damon, Robe...",[Christopher Nolan],"[Syncopy, Universal Pictures, Atlas Entertainm...","[History, Drama, Politics and human rights, Wa..."
2,Barbie,5.0,https://letterboxd.com/film/barbie,114,"[Margot Robbie, Ryan Gosling, America Ferrera,...",[Greta Gerwig],"[LuckyChap Entertainment, Heyday Films, NB/GG ...","[Comedy, Fantasy, Adventure, Relationship come..."
3,Asteroid City,5.0,https://letterboxd.com/film/asteroid-city,105,"[Jason Schwartzman, Scarlett Johansson, Tom Ha...",[Wes Anderson],"[American Empirical Pictures, Indian Paintbrus...","[Drama, Comedy, Relationship comedy, Imaginati..."
4,May December,5.0,https://letterboxd.com/film/may-december,117,"[Natalie Portman, Julianne Moore, Chris Tenzis...",[Todd Haynes],"[Gloria Sanchez Productions, Killer Films, Mou...","[Drama, Comedy]"


In [11]:
# Show the (rows, columns) shape of the merged DataFrame
df.shape

(162, 8)

In [12]:
import json


def _dumps_once(value):
    """Return value as JSON text, leaving it alone if it is already a string.

    The columns below are overwritten in place, so on a second run of this
    cell they already hold JSON strings; encoding those again would wrap
    them in another layer of quoting.
    """
    return value if isinstance(value, str) else json.dumps(value)


# Serialize 'Cast', 'Crew', 'Studios', and 'Genres' columns to JSON format
for list_column in ('Cast', 'Crew', 'Studios', 'Genres'):
    df[list_column] = df[list_column].apply(_dumps_once)

# Save the DataFrame to a CSV file named '{username}_ratings.csv'
df.to_csv(f'{username}_ratings.csv')