# Section 1: Imports & Setup

In [1]:
import pandas as pd
import numpy as np
import os
import torch
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from transformers import CLIPProcessor, CLIPModel
from torchvision import transforms
from PIL import Image
import matplotlib.pyplot as plt

## Set device

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


## Import and clean data

In [3]:
# read movielens
ratings = pd.read_csv('raw_data/ratings.csv')
links = pd.read_csv('raw_data/links.csv')

# get list of movies that have ratings
movie_ids = ratings['movieId'].unique()

# get list of imdb ids from movie ids
valid_movie_ids = links['movieId'].isin(movie_ids)
imdb_ids = links[valid_movie_ids]['imdbId'].tolist()

# add leading 0s to 6 digit ids
imdb_ids = [str(num).zfill(7) for num in imdb_ids]

In [4]:
# read imdb5000 merged
metadata_raw = pd.read_csv('raw_data/movie_metadata.csv')

# get merged of movies that have ratings
metadata = metadata_raw[metadata_raw['movie_imdb_link'].str[28:35].isin(imdb_ids)].copy()

In [5]:
# read posters
posters = pd.read_csv('raw_data/movie_posters.csv', encoding='latin1')

# get posters that have movies with user ratings
posters['imdbId'] = posters['imdbId'].astype(str).str.zfill(7)
posters = posters[posters['imdbId'].isin(imdb_ids)]

In [6]:
# read plots
plots = pd.read_csv('raw_data/movie_plots.csv')

# get movie titles
# titles = [title[:-1] for title in merged['movie_title'].values]

# get plots only from valid movie titles
# plots = plots[plots['Title'].isin(titles)]

In [18]:
# Create cleaned title columns first if not done already
metadata['title_clean'] = metadata['movie_title'].str.lower().str.strip().str.replace(r'\xa0', '', regex=True)
plots['title_clean'] = plots['Title'].str.lower().str.strip()

# Only keep the columns you care about from plots
plots_subset = plots[['title_clean', 'Plot']]

# Merge just Plot into merged
merged = pd.merge(metadata, plots_subset, on='title_clean', how='inner')

print(f"🧩 Merged rows: {merged.shape[0]}, columns: {merged.shape[1]}")
print(merged.head())

🧩 Merged rows: 3749, columns: 30
   color      director_name  num_critic_for_reviews  duration  \
0  Color      James Cameron                   723.0     178.0   
1  Color     Gore Verbinski                   302.0     169.0   
2  Color         Sam Mendes                   602.0     148.0   
3  Color         Sam Mendes                   602.0     148.0   
4  Color  Christopher Nolan                   813.0     164.0   

   director_facebook_likes  actor_3_facebook_likes      actor_2_name  \
0                      0.0                   855.0  Joel David Moore   
1                    563.0                  1000.0     Orlando Bloom   
2                      0.0                   161.0      Rory Kinnear   
3                      0.0                   161.0      Rory Kinnear   
4                  22000.0                 23000.0    Christian Bale   

   actor_1_facebook_likes        gross                           genres  ...  \
0                  1000.0  760505847.0  Action|Adventure|Fantas

# Section 2: Preprocessing

## Metadata

In [10]:
# Step 1: Split genres by '|' and explode them into sets
genres_split = merged['genres'].fillna('').apply(lambda x: [genre for genre in x.split('|') if genre]).copy()

# Step 2: Get list of all unique genres
from itertools import chain

all_genres = set(chain.from_iterable(genres_split))
all_genres.discard('')
print("All genres found:", all_genres)

All genres found: {'Music', 'Sport', 'Adventure', 'Thriller', 'Documentary', 'Crime', 'Biography', 'Musical', 'Comedy', 'Mystery', 'Sci-Fi', 'History', 'Horror', 'Romance', 'Animation', 'Family', 'Fantasy', 'Western', 'Action', 'Film-Noir', 'Drama', 'War'}


In [11]:
# Create one hots for each genre
for genre in all_genres:
    if not genre.strip():  # Skip empty strings
        continue
    merged.loc[:,f'genre_{genre.lower()}'] = genres_split.apply(lambda genres: int(genre in genres))

genre_cols = [col for col in merged.columns if col.startswith('genre_')]
print(merged[genre_cols].head())

   genre_music  genre_sport  genre_adventure  genre_thriller  \
0            0            0                1               0   
1            0            0                1               0   
2            0            0                1               1   
3            0            0                1               1   
4            0            0                0               1   

   genre_documentary  genre_crime  genre_biography  genre_musical  \
0                  0            0                0              0   
1                  0            0                0              0   
2                  0            0                0              0   
3                  0            0                0              0   
4                  0            0                0              0   

   genre_comedy  genre_mystery  ...  genre_horror  genre_romance  \
0             0              0  ...             0              0   
1             0              0  ...             0              0

In [22]:
if 'plot_keywords' in merged:
    plot_keywords = merged['plot_keywords'].copy()

if 'movie_title' in merged:
    titles = merged['movie_title'].values

DROPPED_COLS = ['genres', 'color', 'movie_imdb_link', 'plot_keywords', 'movie_title']
merged.drop(columns=DROPPED_COLS, inplace=True, errors='ignore')

In [23]:
fill_unknown = ['director_name', 'actor_2_name', 'actor_3_name', 'language', 'content_rating']
merged[fill_unknown] = merged[fill_unknown].fillna('Unknown')

plot_keywords = plot_keywords.fillna('')
merged['duration'] = merged['duration'].fillna(merged['duration'].median())
merged['director_facebook_likes'] = merged['director_facebook_likes'].fillna(0)
merged['actor_3_facebook_likes'] = merged['actor_3_facebook_likes'].fillna(0)
merged['gross'] = merged['gross'].fillna(merged['gross'].median())  # or 0
merged['facenumber_in_poster'] = merged['facenumber_in_poster'].fillna(0)
merged['budget'] = merged['budget'].fillna(merged['budget'].median())  # or 0
merged['title_year'] = merged['title_year'].fillna(merged['title_year'].median())
merged['actor_2_facebook_likes'] = merged['actor_2_facebook_likes'].fillna(0)
merged['aspect_ratio'] = merged['aspect_ratio'].fillna(merged['aspect_ratio'].median())

print("No empty cols:", merged.isnull().sum().sum() == 0)
# null_counts = merged.isnull().sum()
# print(null_counts[null_counts > 0])

No empty cols: True


In [24]:
# Create TFIDF vectors for the plot keywords, then use Truncated SVD to create dense embeddings

tfidf = TfidfVectorizer(
    tokenizer=lambda x: x.split('|'),
    max_features=1000  # Limit to top 1000 most frequent
)

plot_keywords_tfidf = tfidf.fit_transform(plot_keywords.fillna(''))

svd = TruncatedSVD(n_components=100, random_state=42)  # You can change dimensions
plot_keywords_dense = svd.fit_transform(plot_keywords_tfidf)



In [25]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

embedding_cols = [
    'director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name',
    'language', 'country', 'content_rating'
]

for col in embedding_cols:
    merged[col] = LabelEncoder().fit_transform(merged[col])

numerical_cols = [
    'num_critic_for_reviews', 'duration', 'director_facebook_likes',
    'actor_3_facebook_likes', 'actor_1_facebook_likes', 'gross',
    'num_voted_users', 'cast_total_facebook_likes', 'facenumber_in_poster',
    'num_user_for_reviews', 'budget', 'title_year', 'actor_2_facebook_likes',
    'imdb_score', 'aspect_ratio', 'movie_facebook_likes'
]

scaler = StandardScaler()
merged[numerical_cols] = scaler.fit_transform(merged[numerical_cols])

## Plots