In [1]:
import json
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Letterboxd account whose rated films are scraped.
username = 'deathproof'
# First page of the user's film grid; the pagination links on this page are
# read afterwards to learn how many pages have to be fetched.
url = f'https://letterboxd.com/{username}/films/page/1/'
# A timeout keeps a stalled connection from hanging the kernel forever, and
# raise_for_status() surfaces HTTP errors instead of silently parsing an
# error page as if it were the film grid.
data = requests.get(url, timeout=30)
data.raise_for_status()

In [3]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(data.text, 'html.parser')

In [4]:
# Read the total page count from the pagination links of the first page.
# Assumes the last <a> inside div.pagination carries the highest page number
# (the recorded run printed 36) -- TODO confirm if the site markup changes.
pagination = soup.select_one('div.pagination')
pages = pagination.find_all('a') if pagination is not None else []
# With no pagination block or no links in it (presumably a profile whose films
# fit on a single page) fall back to one page instead of raising IndexError.
# n_pages stays a string, exactly as before; it is converted with int() later.
n_pages = pages[-1].text if pages else '1'
print(n_pages)

36


In [5]:
def rating_to_numeric(film_rating_raw):
    """Convert a star-glyph rating string into a number.

    Every '★' in ``film_rating_raw`` counts as 1 and every '½' counts as 0.5;
    any other characters are ignored.

    Returns:
        float: the summed rating, e.g. '★★★½' -> 3.5 and '' -> 0.0.
    """
    return film_rating_raw.count('★') + 0.5 * film_rating_raw.count('½')

In [6]:
# Walk every page of the user's film grid and collect, for each poster that
# carries a rating, the film page URL, the film title and the numeric rating.
film_url_list = []
film_name_list = []
film_rating_list = []
for page_idx in range(1, int(n_pages) + 1):
    url = f'https://letterboxd.com/{username}/films/page/{page_idx}/'
    # Timeout + status check: a stalled or failed page request should stop the
    # run rather than hang or silently contribute zero films.
    data = requests.get(url, timeout=30)
    data.raise_for_status()
    # Explicit parser so the result does not depend on which parsers are installed.
    soup = BeautifulSoup(data.text, 'html.parser')
    posters = soup.select('li.poster-container')

    for poster in posters:
        # Posters without a rating span are skipped, as before; previously this
        # happened implicitly through a bare `except`.
        rating_tag = poster.find('span', class_='rating')
        if rating_tag is None:
            continue
        try:
            film_slug = poster.find('div', class_='film-poster')['data-film-slug']
            film_name = poster.find('img', class_='image')['alt']
        except (TypeError, KeyError):
            # TypeError: the expected tag is missing (find() returned None).
            # KeyError: the tag exists but lacks the attribute.
            # Only these "markup not as expected" cases are skipped, so real
            # errors and Ctrl-C are no longer swallowed.
            continue

        # Append only once every field was extracted, so the three lists
        # always stay the same length.
        film_url_list.append(f'https://letterboxd.com/film/{film_slug}')
        film_name_list.append(film_name)
        film_rating_list.append(rating_to_numeric(rating_tag.text))

df_data = {
    'Title': film_name_list,
    'Rating': film_rating_list,
    'URL': film_url_list
}
rating_df = pd.DataFrame(data=df_data)

In [7]:
# Display the scraped ratings table (columns: Title, Rating, URL).
rating_df

Unnamed: 0,Title,Rating,URL
0,Barbie,3.0,https://letterboxd.com/film/barbie
1,Insidious: The Red Door,2.0,https://letterboxd.com/film/insidious-the-red-...
2,No Hard Feelings,3.0,https://letterboxd.com/film/no-hard-feelings-2023
3,Spider-Man: Across the Spider-Verse,4.5,https://letterboxd.com/film/spider-man-across-...
4,Asteroid City,4.0,https://letterboxd.com/film/asteroid-city
...,...,...,...
2396,Dracula,2.5,https://letterboxd.com/film/dracula
2397,Metropolis,3.5,https://letterboxd.com/film/metropolis
2398,Nosferatu,2.5,https://letterboxd.com/film/nosferatu
2399,The Kid,3.0,https://letterboxd.com/film/the-kid


In [8]:
# Peek at the first film page URL to check its format.
rating_df['URL'].iloc[0]

'https://letterboxd.com/film/barbie'

In [9]:
def _parse_duration_minutes(footer_text):
    """Return the runtime in minutes mentioned in ``footer_text``, or 0 if none is found."""
    # \s also matches the non-breaking space that may sit between the number
    # and the unit; "min" and "mins" are both accepted, and the digit run is
    # not limited to a fixed width.
    match = re.search(r'(\d[\d,]*)\s*mins?\b', footer_text)
    if match is None:
        return 0
    return int(match.group(1).replace(',', ''))


def get_movie_features(urls, timeout=30):
    """Scrape title, runtime, cast, crew, studios and genres for each film page.

    Args:
        urls: iterable of film page URLs.
        timeout: seconds to wait for each HTTP request before giving up.

    Returns:
        pandas.DataFrame with one row per URL, in input order, and the columns
        Title, Duration, Cast, Crew, Studios, Genres, URL. Duration is an int
        (0 when no runtime could be read); Cast/Crew/Studios/Genres are lists
        of link texts (empty when the corresponding section is missing). The
        columns are present even when ``urls`` is empty.

    Raises:
        IndexError: if a page has no ``h1.headline-1`` title element.
        requests.exceptions.RequestException: on network failure or timeout.
    """
    # (output column, position of the tab among div.tabbed-content-block,
    #  positions of the div.text-sluglist groups to read inside that tab).
    # NOTE(review): the tab order is assumed from the page markup at the time
    # of writing -- confirm if the site layout changes.
    tab_layout = [
        ('Cast', 0, [0]),        # first slug list of the cast tab
        ('Crew', 1, [0]),        # first slug list of the crew tab
        ('Studios', 2, [0]),     # first slug list of the details tab
        ('Genres', 3, [0, 1]),   # first two slug lists of the genres tab
    ]
    columns = ['Title', 'Duration', 'Cast', 'Crew', 'Studios', 'Genres', 'URL']

    all_movie_data = []

    for url in urls:
        data = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(data.text, 'html.parser')

        movie = {'Title': soup.select('h1.headline-1')[0].text}

        # A missing footer paragraph used to raise an uncaught IndexError;
        # it now yields the same 0 fallback as an unparsable runtime.
        footer = soup.select_one('p.text-link')
        movie['Duration'] = _parse_duration_minutes(footer.text) if footer is not None else 0

        tab_blocks = soup.select('div.tabbed-content-block')
        for column, tab, indices in tab_layout:
            names = []
            movie[column] = names
            if tab >= len(tab_blocks):
                continue  # page has no such tab: keep the empty list
            sluglists = tab_blocks[tab].select('div.text-sluglist')
            for index in indices:
                if index >= len(sluglists):
                    break  # fewer groups than expected: keep what was collected
                for link in sluglists[index].find_all('a'):
                    # Use the whole link text so a name is never split into
                    # several entries when the anchor contains nested markup.
                    label = link.get_text()
                    if label != 'Show All…':
                        names.append(label)

        movie['URL'] = url
        all_movie_data.append(movie)

    # Explicit columns keep the schema stable even for an empty result.
    return pd.DataFrame(all_movie_data, columns=columns)


In [10]:
# Scrape the detail page of every rated film (one HTTP request per film),
# then show the resulting feature table.
movie_features = get_movie_features(urls=rating_df['URL'].values)
movie_features

Unnamed: 0,Title,Duration,Cast,Crew,Studios,Genres,URL
0,Barbie,114,"[Margot Robbie, Ryan Gosling, America Ferrera,...",[Greta Gerwig],"[LuckyChap Entertainment, Heyday Films, NB/GG ...","[fantasy, comedy, adventure, Relationship come...",https://letterboxd.com/film/barbie
1,Insidious: The Red Door,107,"[Ty Simpkins, Patrick Wilson, Sinclair Daniel,...",[Patrick Wilson],"[Blumhouse Productions, Stage 6 Films, Screen ...","[thriller, horror, mystery, Horror, the undead...",https://letterboxd.com/film/insidious-the-red-...
2,No Hard Feelings,103,"[Jennifer Lawrence, Andrew Barth Feldman, Laur...",[Gene Stupnitsky],"[Excellent Cadaver, Odenkirk Provissiero Enter...","[comedy, romance, Relationship comedy, Crude h...",https://letterboxd.com/film/no-hard-feelings-2023
3,Spider-Man: Across the Spider-Verse,140,"[Shameik Moore, Hailee Steinfeld, Jason Schwar...","[Joaquim Dos Santos, Justin K. Thompson, Kemp ...","[Columbia Pictures, Sony Pictures Animation, L...","[animation, adventure, action, Epic heroes, ac...",https://letterboxd.com/film/spider-man-across-...
4,Asteroid City,105,"[Jason Schwartzman, Scarlett Johansson, Tom Ha...",[Wes Anderson],"[American Empirical Pictures, Indian Paintbrush]","[comedy, drama, Relationship comedy, earth, sc...",https://letterboxd.com/film/asteroid-city
...,...,...,...,...,...,...,...
2396,Dracula,74,"[Bela Lugosi, Helen Chandler, David Manners, D...",[Tod Browning],[Universal Pictures],"[fantasy, drama, horror, Horror, the undead an...",https://letterboxd.com/film/dracula
2397,Metropolis,149,"[Gustav Fröhlich, Brigitte Helm, Alfred Abel, ...",[Fritz Lang],[UFA],"[drama, science-fiction, Faith and religion, E...",https://letterboxd.com/film/metropolis
2398,Nosferatu,94,"[Max Schreck, Gustav von Wangenheim, Greta Sch...",[F. W. Murnau],"[Prana-Film GmbH, Jofa-Atelier Berlin-Johannis...","[fantasy, horror, Horror, the undead and monst...",https://letterboxd.com/film/nosferatu
2399,The Kid,68,"[Charlie Chaplin, Jackie Coogan, Edna Purvianc...",[Charlie Chaplin],[Charles Chaplin Productions],"[comedy, drama, Relationship comedy, Crude hum...",https://letterboxd.com/film/the-kid


In [11]:
# Join ratings and features on URL only. The URL identifies the film page,
# whereas the two Title columns are scraped from different elements (poster
# alt text vs. page headline), so also keying on Title can drop rows whenever
# the two strings differ. Duplicate URLs in movie_features are collapsed first
# so a film that was scraped twice cannot multiply rows; the Title from
# rating_df is kept, which leaves the resulting column set unchanged.
df = rating_df.merge(
    movie_features.drop(columns='Title').drop_duplicates(subset='URL'),
    on='URL',
    validate='many_to_one',
)
df.head()

Unnamed: 0,Title,Rating,URL,Duration,Cast,Crew,Studios,Genres
0,Barbie,3.0,https://letterboxd.com/film/barbie,114,"[Margot Robbie, Ryan Gosling, America Ferrera,...",[Greta Gerwig],"[LuckyChap Entertainment, Heyday Films, NB/GG ...","[fantasy, comedy, adventure, Relationship come..."
1,Insidious: The Red Door,2.0,https://letterboxd.com/film/insidious-the-red-...,107,"[Ty Simpkins, Patrick Wilson, Sinclair Daniel,...",[Patrick Wilson],"[Blumhouse Productions, Stage 6 Films, Screen ...","[thriller, horror, mystery, Horror, the undead..."
2,No Hard Feelings,3.0,https://letterboxd.com/film/no-hard-feelings-2023,103,"[Jennifer Lawrence, Andrew Barth Feldman, Laur...",[Gene Stupnitsky],"[Excellent Cadaver, Odenkirk Provissiero Enter...","[comedy, romance, Relationship comedy, Crude h..."
3,Spider-Man: Across the Spider-Verse,4.5,https://letterboxd.com/film/spider-man-across-...,140,"[Shameik Moore, Hailee Steinfeld, Jason Schwar...","[Joaquim Dos Santos, Justin K. Thompson, Kemp ...","[Columbia Pictures, Sony Pictures Animation, L...","[animation, adventure, action, Epic heroes, ac..."
4,Asteroid City,4.0,https://letterboxd.com/film/asteroid-city,105,"[Jason Schwartzman, Scarlett Johansson, Tom Ha...",[Wes Anderson],"[American Empirical Pictures, Indian Paintbrush]","[comedy, drama, Relationship comedy, earth, sc..."


In [12]:
# (rows, columns) of the merged ratings + features frame.
df.shape

(2401, 8)

In [13]:
# Serialize the list-valued columns to JSON strings before writing the CSV.
# Values that are already strings are left untouched, so re-running this cell
# does not JSON-encode the same column a second time.
for list_column in ('Cast', 'Crew', 'Studios', 'Genres'):
    df[list_column] = df[list_column].apply(
        lambda value: value if isinstance(value, str) else json.dumps(value)
    )

# Save DataFrame to CSV
df.to_csv(f'{username}_ratings.csv')