In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time
import pandas as pd

# Setup browser
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 15)

genres = ['Action', 'Animation', 'Comedy', 'Crime', 'Fantasy',
          'Thriller', 'Mystery', 'Adventure', 'Drama', 'Romance',
          'Sci-Fi', 'Sport']

base_url = "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres={}"

all_movies = []

for genre in genres:
    print(f"\nProcessing genre: {genre}")
    url = base_url.format(genre.lower())
    driver.get(url)
    time.sleep(2)
    
    # Accept cookies if present
    try:
        cookie_btn = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "button#onetrust-accept-btn-handler")
        ))
        cookie_btn.click()
        time.sleep(1)
    except:
        pass
    
    # Click "50 more" until it disappears
    click_count = 0
    while True:
        try:
            more_button = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "button.ipc-see-more__button")
            ))
            # Scroll to button and click via JavaScript
            driver.execute_script("arguments[0].scrollIntoView();", more_button)
            driver.execute_script("arguments[0].click();", more_button)
            click_count += 1
            print(f"Clicked '50 more' ({click_count} times)")
            time.sleep(2)  # Allow time to load
        except (NoSuchElementException, TimeoutException):
            print("No more '50 more' button found")
            break
    
    # Now scrape all loaded movies
    movies = driver.find_elements(By.CSS_SELECTOR, "div.ipc-metadata-list-summary-item__c")
    print(f"Found {len(movies)} movies for {genre}")
    
    for movie in movies:
        try:
            all_movies.append({
                'genre': genre,
                'name': movie.find_element(By.CSS_SELECTOR, "h3.ipc-title__text").text,
                'description': (movie.find_element(By.CSS_SELECTOR, "div.ipc-html-content-inner-div").text 
                              if movie.find_elements(By.CSS_SELECTOR, "div.ipc-html-content-inner-div") 
                              else None),
                'duration': (movie.find_element(By.XPATH, ".//div[contains(@class, 'dli-title-metadata')]/span[2]").text 
                          if movie.find_elements(By.XPATH, ".//div[contains(@class, 'dli-title-metadata')]/span[2]") 
                          else None),
                'rating': (movie.find_element(By.CSS_SELECTOR, "span.ipc-rating-star--rating").text 
                         if movie.find_elements(By.CSS_SELECTOR, "span.ipc-rating-star--rating") 
                         else None),
                'rating_count': (movie.find_element(By.CSS_SELECTOR, "span.ipc-rating-star--voteCount").text.strip() 
                              if movie.find_elements(By.CSS_SELECTOR, "span.ipc-rating-star--voteCount") 
                              else None)
            })
        except Exception as e:
            print(f"Error processing movie: {e}")
            continue

# Convert to DataFrame
df = pd.DataFrame(all_movies)

# Remove exact duplicates (same movie in same genre)

print(f"\nTotal movies collected: {len(df)}")
print("Sample data:")
print(df.head())

driver.quit()


Processing genre: Action
Clicked '50 more' (1 times)
Clicked '50 more' (2 times)
Clicked '50 more' (3 times)
Clicked '50 more' (4 times)
Clicked '50 more' (5 times)
Clicked '50 more' (6 times)
Clicked '50 more' (7 times)
Clicked '50 more' (8 times)
Clicked '50 more' (9 times)
Clicked '50 more' (10 times)
Clicked '50 more' (11 times)
Clicked '50 more' (12 times)
Clicked '50 more' (13 times)
Clicked '50 more' (14 times)
Clicked '50 more' (15 times)
Clicked '50 more' (16 times)
Clicked '50 more' (17 times)
Clicked '50 more' (18 times)
Clicked '50 more' (19 times)
Clicked '50 more' (20 times)
Clicked '50 more' (21 times)
Clicked '50 more' (22 times)
Clicked '50 more' (23 times)
Clicked '50 more' (24 times)
Clicked '50 more' (25 times)
Clicked '50 more' (26 times)
Clicked '50 more' (27 times)
No more '50 more' button found
Found 1398 movies for Action

Processing genre: Animation
Clicked '50 more' (1 times)
Clicked '50 more' (2 times)
Clicked '50 more' (3 times)
Clicked '50 more' (4 times)

In [3]:
# Convert to DataFrame and save
df

Unnamed: 0,genre,name,description,duration,rating,rating_count
0,Action,1. Mahavatar Narsimha,The demon Hiranyakashyap seeks revenge on Vish...,2h 10m,9.4,(25K)
1,Action,2. Gladiator II,After his home is conquered by the tyrannical ...,2h 28m,6.5,(266K)
2,Action,3. The Ministry of Ungentlemanly Warfare,The British military recruits a small group of...,2h 2m,6.8,(142K)
3,Action,4. River of Blood,Four kayakers take the wrong river into a jung...,1h 26m,4.9,(2.2K)
4,Action,5. Kraven the Hunter,Kraven's complex relationship with his ruthles...,2h 7m,5.5,(69K)
...,...,...,...,...,...,...
18879,Sport,271. The Road to OVC Champions,The Road to OVC Champions follows the Morehead...,1h 2m,,
18880,Sport,272. Not That NB,,,,
18881,Sport,273. El Chilote: Gloria del Béisbol Dominicano,The life and career of Dominican baseball play...,1h 15m,,
18882,Sport,274. Rising Tennis Star Meloni Jones,Rising tennis star player Meloni Jones 8 years...,,,


In [4]:
# Check for duplicate rows and print the total number of duplicates
duplicate_count = df.duplicated().sum()
print(f"Total duplicate rows in df: {duplicate_count}")

Total duplicate rows in df: 0


In [5]:
df.isna().sum()

genre              0
name               0
description     3184
duration        4033
rating          6470
rating_count    6470
dtype: int64

In [6]:
null_description_counts = df[df['description'].isna()].groupby('genre').size()
print("Null description counts by genre:")
print(null_description_counts)

Null description counts by genre:
genre
Action        231
Adventure      64
Animation      86
Comedy        550
Crime         119
Drama        1413
Fantasy        89
Mystery        66
Romance       231
Sci-Fi         68
Sport          39
Thriller      228
dtype: int64


In [7]:
genre_counts = df['genre'].value_counts()
null_description_percentage = (null_description_counts / genre_counts * 100).round(2)
print("Percentage of null descriptions by genre:")
print(null_description_percentage)

Percentage of null descriptions by genre:
genre
Action       16.52
Adventure    10.34
Animation    19.95
Comedy       17.33
Crime        11.52
Drama        21.45
Fantasy      15.72
Mystery       9.58
Romance      18.80
Sci-Fi       10.74
Sport        14.18
Thriller     10.13
dtype: float64


In [8]:
df = df.dropna(subset=['description'])
print(f"Rows after dropping null descriptions: {len(df)}")

Rows after dropping null descriptions: 15700


In [17]:
# Remove leading/trailing whitespace from 'description'
df['description'] = df['description'].str.lower().str.strip()
# Remove the number and dot before each name, then strip whitespace
df['name'] = df['name'].str.replace(r'^\d+\.\s*', '', regex=True).str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['description'] = df['description'].str.lower().str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['name'] = df['name'].str.replace(r'^\d+\.\s*', '', regex=True).str.strip()


In [18]:
df.head(5)

Unnamed: 0,genre,name,description,duration,rating,rating_count
0,Action,Mahavatar Narsimha,the demon hiranyakashyap seeks revenge on vish...,2h 10m,9.4,(25K)
1,Action,Gladiator II,after his home is conquered by the tyrannical ...,2h 28m,6.5,(266K)
2,Action,The Ministry of Ungentlemanly Warfare,the british military recruits a small group of...,2h 2m,6.8,(142K)
3,Action,River of Blood,four kayakers take the wrong river into a jung...,1h 26m,4.9,(2.2K)
4,Action,Kraven the Hunter,kraven's complex relationship with his ruthles...,2h 7m,5.5,(69K)


In [12]:
df['duration'] = df['duration'].fillna('2h 0m')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['duration'] = df['duration'].fillna('2h 0m')


In [None]:
import pandas as pd


# Normalize keys to avoid grouping mismatches
for col in ['genre', 'duration']:
    df[col] = df[col].astype(str).str.strip().str.lower()

# Merge genres
df['genre'] = df.groupby(['name', 'description', 'duration'])['genre'] \
                .transform(lambda x: ', '.join(sorted(set(x))))

# Merge ratings
df['rating'] = df.groupby(['name', 'description', 'duration'])['rating'] \
                 .transform(lambda x: x.dropna().iloc[0] if not x.dropna().empty else None)

# Merge rating_count
df['rating_count'] = df.groupby(['name', 'description', 'duration'])['rating_count'] \
                       .transform(lambda x: x.dropna().iloc[0] if not x.dropna().empty else None)

# Drop duplicates
df = df.drop_duplicates(subset=['name', 'description', 'duration'])

df



Unnamed: 0,genre,name,description,duration,rating,rating_count
0,"action, animation, drama, fantasy",Mahavatar Narsimha,the demon hiranyakashyap seeks revenge on vish...,2h 10m,9.4,(25K)
1,"action, adventure, drama",Gladiator II,after his home is conquered by the tyrannical ...,2h 28m,6.5,(266K)
2,"action, comedy",The Ministry of Ungentlemanly Warfare,the british military recruits a small group of...,2h 2m,6.8,(142K)
3,"action, adventure, thriller",River of Blood,four kayakers take the wrong river into a jung...,1h 26m,4.9,(2.2K)
4,"action, adventure, thriller",Kraven the Hunter,kraven's complex relationship with his ruthles...,2h 7m,5.5,(69K)
...,...,...,...,...,...,...
15694,sport,UFC 308: Topuria vs. Holloway in COSM Shared R...,be a part of the action as ilia topuria looks ...,2h 0m,,
15695,sport,The Path Full of Stars,this film is a look at the life of blind runne...,1h,,
15696,sport,Nine: The Big Break,the big break delves into the journey of austr...,2h 0m,,
15698,sport,El Chilote: Gloria del Béisbol Dominicano,the life and career of dominican baseball play...,1h 15m,,


In [None]:
df.to_csv('imdb_movies_2024.csv', index=False)

In [41]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# Load your preprocessed data
df = pd.read_csv('imdb_movies_2024.csv')  # Columns: genre, name, description, duration, rating, rating_count

# Initialize spaCy (optimized for speed)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner', 'tagger'])
nlp.enable_pipe('lemmatizer')

def process_text(text):
    """Tokenize, lemmatize, and clean text"""
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

# Apply NLP processing
df['nlp_processed'] = df['description'].apply(process_text)

# Vectorization
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['nlp_processed'])

# Precompute similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix)

# Save model + data
with open('movie_recommender.pkl', 'wb') as f:
    pickle.dump({
        'tfidf': tfidf,
        'tfidf_matrix': tfidf_matrix,
        'cosine_sim': cosine_sim,
        'movies': df[['name', 'description', 'duration', 'rating', 'rating_count', 'genre']]  # All display columns
    }, f)

