# In [None]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# ChromeDriverManager().install() returns the path of a chromedriver binary, so
# no path has to be filled in by hand. Selenium 4 expects that path wrapped in a
# Service object: the old `executable_path` keyword was deprecated in 4.0 and
# has since been removed, so passing it raises TypeError on current releases.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Shared explicit wait: each `wait.until(...)` polls for up to 10 seconds.
wait = WebDriverWait(driver, 10)

# Accumulator for the scraped records; one inner list is appended per diary entry.
data_rows = []

# Diary pages to visit, in order (first page plus the paginated pages 2 and 3).
urls = [
    "https://letterboxd.com/phi2/films/diary/",
    "https://letterboxd.com/phi2/films/diary/page/2/",
    "https://letterboxd.com/phi2/films/diary/page/3/",
]

def _parse_own_rating(class_attr):
    """Convert the rating element's ``class`` attribute into a star rating.

    The text after the last ``-`` in the attribute is parsed as a number and
    halved (the class presumably encodes the rating in half-stars).

    Returns ``None`` when that number is zero, and also when the trailing text
    is not numeric at all (for example a class that ends in ``large``), which
    previously raised an uncaught ``ValueError``.
    """
    suffix = (class_attr or "").split("-")[-1]
    try:
        half_stars = float(suffix)
    except ValueError:
        return None
    return half_stars / 2 if half_stars else None


# Scrape inside try/finally so the browser process is always shut down, even
# when a lookup or a page load fails part-way through the run.
try:
    for url in urls:
        driver.get(url)

        # Wait until at least one diary row is present before reading links.
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "diary-entry-row")))

        # Capture every entry URL up front. Plain strings survive navigation,
        # whereas WebElement handles go stale once the page changes; this also
        # removes the click / back / back / re-query cycle per entry.
        entry_urls = [
            link.get_attribute("href")
            for link in driver.find_elements(
                By.CSS_SELECTOR, ".diary-entry-row .td-film-details h3 a"
            )
        ]

        for entry_url in entry_urls:
            if not entry_url:
                continue  # anchor without an href: nothing to open

            # --- Diary-entry page -------------------------------------------
            driver.get(entry_url)
            wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, ".film-viewing-info-wrapper")
                )
            )

            # The watched date is spread over several links; join their texts.
            watched_date_elements = driver.find_elements(
                By.CSS_SELECTOR, ".film-viewing-info-wrapper .view-date a"
            )
            watched_date = " ".join(
                element.text.strip() for element in watched_date_elements
            )

            film_name = driver.find_element(
                By.CSS_SELECTOR,
                ".film-viewing-info-wrapper h2 .film-title-wrapper a",
            ).text.strip()

            film_year = driver.find_element(
                By.CSS_SELECTOR, ".film-viewing-info-wrapper h2 .metadata a"
            ).text.strip()

            # Own rating is optional: the element may be absent entirely.
            try:
                rating_element = driver.find_element(
                    By.CSS_SELECTOR,
                    ".film-viewing-info-wrapper .rating.rating-large",
                )
            except NoSuchElementException:
                your_rating = None
            else:
                your_rating = _parse_own_rating(rating_element.get_attribute("class"))

            # --- Film's general page ----------------------------------------
            # Follow the first link in the heading by URL instead of clicking;
            # driver.get returns once the page has loaded under the default
            # page-load strategy.
            film_url = driver.find_element(
                By.CSS_SELECTOR, ".film-viewing-info-wrapper h2 a"
            ).get_attribute("href")
            driver.get(film_url)

            # NOTE(review): if the average rating is injected by script after
            # the load event, this immediate lookup can miss it -- confirm and
            # switch to an explicit wait if so.
            try:
                general_rating = driver.find_element(
                    By.CSS_SELECTOR, ".average-rating a"
                ).text.strip()
            except NoSuchElementException:
                general_rating = None

            # A page without a "Directed by" paragraph (or without the nested
            # name span) yields None instead of aborting the whole scrape.
            try:
                directed_by_element = driver.find_element(
                    By.XPATH, "//p[contains(., 'Directed by')]"
                )
                director = directed_by_element.find_element(
                    By.XPATH, "./a/span[@class='prettify']"
                ).text.strip()
            except NoSuchElementException:
                director = None

            # Genres come from the first slug list in the genres tab; a page
            # without that list yields an empty list.
            try:
                genres_section = driver.find_element(
                    By.CSS_SELECTOR, "#tab-genres .text-sluglist:first-of-type"
                )
            except NoSuchElementException:
                genres = []
            else:
                genre_elements = genres_section.find_elements(
                    By.CSS_SELECTOR, "#tab-genres .text-slug"
                )
                # textContent is used rather than .text so that elements which
                # are not currently displayed still contribute their label.
                genres = [
                    element.get_attribute("textContent").strip()
                    for element in genre_elements
                ]

            data_rows.append(
                [
                    watched_date,
                    film_name,
                    film_year,
                    your_rating,
                    general_rating,
                    director,
                    genres,
                ]
            )
finally:
    # Close the webdriver whether or not scraping succeeded.
    driver.quit()


# Create a DataFrame from the collected rows (one row per diary entry).
columns = ["Watched Date", "Film Name", "Film Year", "My Rating", "General Rating", "Director", "Genres"]
df = pd.DataFrame(data_rows, columns=columns)

# Print the DataFrame
print(df)
