Import Necessary Libraries

In [20]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

Get the HTML Content

In [21]:
def get_html_content(url):
    response = requests.get(url)
    return response.text

Get the Languages

In [22]:
def extract_language(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all <a> tags with href attributes
    #extracted = soup.find_all('a', href=True)
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        if href.startswith("/films/language"):
            return a_tag.get_text(strip=True)

    return None

Get the Genres

In [23]:
def extract_genres(html_content):
    links = []
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all <a> tags with href attributes
    #extracted = soup.find_all('a', href=True)
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        if href.startswith("/films/genre"):
            links.append(a_tag.get_text(strip=True))

    return links

Get the US Earliest release Date

In [24]:
def extract_releaseold(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    release_table_div = soup.find('div', class_="release-table -bydate")
    if release_table_div:
        list_items = release_table_div.find_all('div', class_='listitem')

    # Iterate through each list item
    for list_item in list_items:
        # Find the country name within the list item
        country_name = list_item.find('span', class_='name')

        # If the country name is "USA," extract the release date
        if country_name and country_name.text.strip() == 'USA':
            date_element = list_item.find('h5', class_='date')
            usa_release_date = date_element.text.strip() if date_element else None
            break  # Stop iterating after finding the USA release date
    return usa_release_date

Get the Release Dates

In [25]:
def extract_release(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the div with class "release-table"
    release_table_div = soup.find('div', class_='release-table')

    # Initialize variables to None
    first_matching_href = None
    first_matching_content = None

    # Check if the div with class "release-table" is found
    if release_table_div:
        # Find all list items within the "release-table" div
        list_items = release_table_div.find_all('div', class_='listitem')

        # Iterate through each list item
        for list_item in list_items:
            # Find the country name within the list item
            country_name = list_item.find('span', class_='name')

            # If the country name is "USA," extract the release date and content
            if country_name and country_name.text.strip() == 'USA':
                date_element = list_item.find('h5', class_='date')
                first_matching_href = date_element.text.strip() if date_element else None

                # Extract content from other elements as needed
                # Example: content_element = list_item.find('span', class_='some-class')
                # first_matching_content = content_element.text.strip() if content_element else None

                break  # Stop iterating after finding the USA release date

    return first_matching_href

Read the Ratings.csv

In [26]:
df_ratings = pd.read_csv("ratings.csv")
df_ratings

Unnamed: 0,Date,Name,Year,Letterboxd URI,Rating
0,6/30/2022,Knives Out,2019,https://boxd.it/jWEA,3.0
1,6/30/2022,The Batman,2022,https://boxd.it/eDGs,2.0
2,6/30/2022,Inception,2010,https://boxd.it/1skk,4.0
3,6/30/2022,Spider-Man: No Way Home,2021,https://boxd.it/nwRw,5.0
4,6/30/2022,Dune,2021,https://boxd.it/fA7G,1.0
...,...,...,...,...,...
440,12/20/2023,Rabhasa,2014,https://boxd.it/9QGK,2.0
441,12/20/2023,Power,2014,https://boxd.it/9aJ8,2.5
442,12/20/2023,Cameraman Ganga Tho Rambabu,2012,https://boxd.it/4sf6,2.5
443,12/20/2023,Doosukeltha,2013,https://boxd.it/75H8,3.0


Add the Languages for each Movie to the Ratings.csv

In [27]:
for index, row in df_ratings.iterrows():
    url = row['Letterboxd URI']  # Assuming 'URL' is the column containing URLs

    # Fetch HTML content from the URL
    html_content = get_html_content(url)

    # Extract link content based on the starting pattern
    extracted_content = extract_language(html_content)
    df_ratings.at[index, 'Language'] = extracted_content

df_ratings

Unnamed: 0,Date,Name,Year,Letterboxd URI,Rating,Language
0,6/30/2022,Knives Out,2019,https://boxd.it/jWEA,3.0,English
1,6/30/2022,The Batman,2022,https://boxd.it/eDGs,2.0,English
2,6/30/2022,Inception,2010,https://boxd.it/1skk,4.0,English
3,6/30/2022,Spider-Man: No Way Home,2021,https://boxd.it/nwRw,5.0,English
4,6/30/2022,Dune,2021,https://boxd.it/fA7G,1.0,English
...,...,...,...,...,...,...
440,12/20/2023,Rabhasa,2014,https://boxd.it/9QGK,2.0,Telugu
441,12/20/2023,Power,2014,https://boxd.it/9aJ8,2.5,Telugu
442,12/20/2023,Cameraman Ganga Tho Rambabu,2012,https://boxd.it/4sf6,2.5,Telugu
443,12/20/2023,Doosukeltha,2013,https://boxd.it/75H8,3.0,Telugu


Add the Genres to the DF

In [28]:
for index, row in df_ratings.iterrows():
    url = row['Letterboxd URI']  # Assuming 'URL' is the column containing URLs

    # Fetch HTML content from the URL
    html_content = get_html_content(url)

    # Extract link content based on the starting pattern
    extracted_content = extract_genres(html_content)
    df_ratings.at[index, 'Genres'] = ', '.join(extracted_content)

df_ratings

Unnamed: 0,Date,Name,Year,Letterboxd URI,Rating,Language,Genres
0,6/30/2022,Knives Out,2019,https://boxd.it/jWEA,3.0,English,"Mystery, Comedy, Crime"
1,6/30/2022,The Batman,2022,https://boxd.it/eDGs,2.0,English,"Crime, Mystery, Thriller"
2,6/30/2022,Inception,2010,https://boxd.it/1skk,4.0,English,"Action, Adventure, Science Fiction"
3,6/30/2022,Spider-Man: No Way Home,2021,https://boxd.it/nwRw,5.0,English,"Action, Adventure, Science Fiction"
4,6/30/2022,Dune,2021,https://boxd.it/fA7G,1.0,English,"Science Fiction, Adventure"
...,...,...,...,...,...,...,...
440,12/20/2023,Rabhasa,2014,https://boxd.it/9QGK,2.0,Telugu,"Comedy, Action"
441,12/20/2023,Power,2014,https://boxd.it/9aJ8,2.5,Telugu,"Comedy, Action"
442,12/20/2023,Cameraman Ganga Tho Rambabu,2012,https://boxd.it/4sf6,2.5,Telugu,"Action, Drama"
443,12/20/2023,Doosukeltha,2013,https://boxd.it/75H8,3.0,Telugu,"Drama, Comedy"


Add the Release Date for the DF

In [29]:
for index, row in df_ratings.iterrows():
    url = row['Letterboxd URI']  # Assuming 'URL' is the column containing URLs

    # Fetch HTML content from the URL
    html_content = get_html_content(url)

    # Extract link content based on the starting pattern
    extracted_content = extract_release(html_content)
    df_ratings.at[index, 'ReleaseDate'] = extracted_content

df_ratings

Unnamed: 0,Date,Name,Year,Letterboxd URI,Rating,Language,Genres,ReleaseDate
0,6/30/2022,Knives Out,2019,https://boxd.it/jWEA,3.0,English,"Mystery, Comedy, Crime",26 Sep 2019
1,6/30/2022,The Batman,2022,https://boxd.it/eDGs,2.0,English,"Crime, Mystery, Thriller",02 Mar 2022
2,6/30/2022,Inception,2010,https://boxd.it/1skk,4.0,English,"Action, Adventure, Science Fiction",13 Jul 2010
3,6/30/2022,Spider-Man: No Way Home,2021,https://boxd.it/nwRw,5.0,English,"Action, Adventure, Science Fiction",13 Dec 2021
4,6/30/2022,Dune,2021,https://boxd.it/fA7G,1.0,English,"Science Fiction, Adventure",07 Oct 2021
...,...,...,...,...,...,...,...,...
440,12/20/2023,Rabhasa,2014,https://boxd.it/9QGK,2.0,Telugu,"Comedy, Action",28 Aug 2014
441,12/20/2023,Power,2014,https://boxd.it/9aJ8,2.5,Telugu,"Comedy, Action",
442,12/20/2023,Cameraman Ganga Tho Rambabu,2012,https://boxd.it/4sf6,2.5,Telugu,"Action, Drama",
443,12/20/2023,Doosukeltha,2013,https://boxd.it/75H8,3.0,Telugu,"Drama, Comedy",


In [30]:
df_ratings.to_csv("newRatings.csv")