In [19]:
import pandas as pd
import re
import unicodedata
from pathlib import Path

In [10]:
# df = pd.read_excel('letterboxd.xlsx')
# df['URL'] = df['URL'].str.replace('https://boxd.it/', '')
# df['Year'] = df['Year'].astype(int)
# print(df.head())

In [20]:
def remove_accents(text):
    normalised = unicodedata.normalize('NFKD', text)
    return ''.join(c for c in normalised if not unicodedata.combining(c))

def clean_title(title):
    title = remove_accents(title)
    title = title.lower()
    title = re.sub(r'[^A-Za-z0-9 ]', '', title)

    if title[:4] == "the ":
        title = title[4:]
    elif title[:2] == "a ":
        title = title[2:]
    elif title[:3] == "an ":
        title = title[3:]
    
    return title
    


In [21]:
lists_folder = Path("lists")
csv_files = lists_folder.glob("*.csv")

dfs = []

for file in csv_files:
    df = pd.read_csv(file, skiprows=4)
    dfs.append(df)

combined_df = pd.concat(dfs, ignore_index=True)
combined_df['URL'] = combined_df['URL'].str.replace('https://boxd.it/', '')
combined_df['Year'] = combined_df['Year'].astype(int)
combined_df['Sort Name'] = combined_df['Name'].apply(clean_title)

  combined_df['URL'] = combined_df['URL'].str.replace('https://boxd.it/', '')


In [8]:
combined_df.head()

Unnamed: 0,Position,Name,Year,URL,Description,Sort Name
0,1,All of Us Strangers,2023,Bz3C,,all of us strangers
1,2,The Zone of Interest,2023,gJsA,,zone of interest
2,3,The Boy and the Heron,2023,ipeM,,boy and the heron
3,4,Poor Things,2023,tNWU,,poor things
4,5,Oppenheimer,2023,wUow,,oppenheimer


In [22]:
film_counts = (
    combined_df.groupby("URL")
      .agg(
          appearances=("URL", "count"),
          name=("Name", "first"),
          year=("Year", "first"),
          sort=("Sort Name", "first")
      )
      .reset_index()
)

In [13]:
film_counts.head()

Unnamed: 0,URL,appearances,name,year,sort
0,10hC,2,Make Way for Tomorrow,1937,make way for tomorrow
1,10iU,3,La Notte,1961,la notte
2,10iq,1,Pyaasa,1957,pyaasa
3,10uQ,1,Terror Train,1980,terror train
4,11Hc,1,Baby Doll,1956,baby doll


In [23]:
film_count_counts = (
    film_counts["appearances"]
    .value_counts()
    .sort_index(ascending=False)
    .rename("number of films")
    .to_frame()
)

film_count_counts["cumulative total"] = film_count_counts["number of films"].cumsum()

print(film_count_counts)

    number of films  cumulative total
10                6                 6
9                19                25
8                33                58
7                40                98
6                45               143
5                71               214
4               116               330
3               204               534
2               462               996
1              1394              2390


In [24]:
filtered_films = film_counts[film_counts["appearances"] > 2].sort_values(by=["sort"], ascending=True).reset_index()

In [25]:
filtered_films[['name', 'year', 'appearances']].head()

Unnamed: 0,name,year,appearances
0,12 Angry Men,1957,8
1,12 Years a Slave,2013,3
2,1917,2019,3
3,2001: A Space Odyssey,1968,9
4,The 39 Steps,1935,3


In [26]:
for i, row in filtered_films.iterrows():
    print(f"{i+1}. {row['name']} ({row['year']}) - {row['appearances']}")

1. 12 Angry Men (1957) - 8
2. 12 Years a Slave (2013) - 3
3. 1917 (2019) - 3
4. 2001: A Space Odyssey (1968) - 9
5. The 39 Steps (1935) - 3
6. 4 Months, 3 Weeks and 2 Days (2007) - 3
7. The 400 Blows (1959) - 6
8. 8½ (1963) - 7
9. Ace in the Hole (1951) - 5
10. Adaptation. (2002) - 3
11. The Adventures of Robin Hood (1938) - 4
12. Aguirre, the Wrath of God (1972) - 3
13. Airplane! (1980) - 6
14. Akira (1988) - 5
15. Ali: Fear Eats the Soul (1974) - 4
16. Alien (1979) - 9
17. Aliens (1986) - 6
18. All About Eve (1950) - 8
19. All About My Mother (1999) - 5
20. All That Heaven Allows (1955) - 4
21. All That Jazz (1979) - 6
22. All the President's Men (1976) - 4
23. Amadeus (1984) - 8
24. Amarcord (1973) - 3
25. Amélie (2001) - 4
26. American Graffiti (1973) - 3
27. An American Werewolf in London (1981) - 4
28. Anatomy of a Murder (1959) - 3
29. Andrei Rublev (1966) - 5
30. Angels with Dirty Faces (1938) - 3
31. Annie Hall (1977) - 5
32. The Apartment (1960) - 8
33. Apocalypse Now (1979) 

In [9]:
selection = film_counts[film_counts["appearances"] == 6].sort_values(by="name", ascending=True)
for _, row in selection.iterrows():
    print(f"{row['name']} ({row['year']})")

A Clockwork Orange (1971)
Aliens (1986)
All That Jazz (1979)
Brazil (1985)
Children of Men (2006)
Children of Paradise (1945)
Chungking Express (1994)
Everything Everywhere All at Once (2022)
Fanny and Alexander (1982)
Get Out (2017)
High and Low (1963)
Inception (2010)
Interstellar (2014)
Last Year at Marienbad (1961)
Metropolis (1927)
Moonlight (2016)
No Country for Old Men (2007)
Oldboy (2003)
Oppenheimer (2023)
Saving Private Ryan (1998)
Spider-Man: Into the Spider-Verse (2018)
Suspiria (1977)
Taste of Cherry (1997)
The 400 Blows (1959)
The Elephant Man (1980)
The General (1926)
The Handmaiden (2016)
The Life and Death of Colonel Blimp (1943)
The Lives of Others (2006)
The Piano (1993)
The Red Shoes (1948)
The Seventh Seal (1957)
The Social Network (2010)
The Texas Chain Saw Massacre (1974)
The Treasure of the Sierra Madre (1948)
The Umbrellas of Cherbourg (1964)
The Young Girls of Rochefort (1967)
To Be or Not to Be (1942)
When Harry Met Sally... (1989)
Whiplash (2014)
