In [None]:
# Initialization: Importing core libraries for data manipulation (Pandas, NumPy), abstract syntax tree parsing (ast), and progress monitoring (tqdm).
import pandas as pd
import numpy as np
import json
import os
from tqdm.notebook import tqdm
import ast

In [None]:
# Filesystem Mount: Mounting the Google Drive filesystem to enable access to datasets stored in the cloud environment.
from google.colab import drive
drive.mount('/content/drive')

print("Libraries installed and Google Drive mounted successfully!")

Mounted at /content/drive
Libraries installed and Google Drive mounted successfully!


In [None]:
drive_path = '/content/drive/MyDrive/Embedding_Based_Recommendations_Project/Datasets/The_Movies_Dataset/'

# Datasets Used:

- **The Movies Dataset: https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset**
- **IMDB Multimodal Vision & NLP Genre Classification: https://www.kaggle.com/datasets/zulkarnainsaurav/imdb-multimodal-vision-and-nlp-genre-classification**

In [None]:
# Data Ingestion: load the primary source datasets (movies_metadata.csv and
# credits.csv) from `drive_path` into Pandas DataFrames.
try:
    movies_metadata = pd.read_csv(drive_path + 'movies_metadata.csv', low_memory=False)
    credits = pd.read_csv(drive_path + 'credits.csv')
except FileNotFoundError:
    print(f"Error: Make sure your files are in the specified path: {drive_path}")
    print("Please upload 'movies_metadata.csv' and 'credits.csv' to your Google Drive.")
    # Re-raise after the guidance: every later cell needs both DataFrames, so
    # carrying on would only resurface as a confusing NameError further down.
    raise
else:
    print("movies_metadata.csv and credits.csv loaded successfully.")
    print(f"Movies metadata shape: {movies_metadata.shape}")
    print(f"Credits shape: {credits.shape}")


movies_metadata.csv and credits.csv loaded successfully.
Movies metadata shape: (45466, 24)
Credits shape: (45476, 3)


In [None]:
print("\nMovies Metadata Head:")
movies_metadata.head(2)


Movies Metadata Head:


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [None]:
movies_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [None]:
print("\nCredits Head:")
credits.head(2)


Credits Head:


Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844


In [None]:
credits.columns

Index(['cast', 'crew', 'id'], dtype='object')

**Data Unification: The id column is sanitized to ensure a consistent integer type, followed by an inner merge to unify the metadata and credits DataFrames into a single entity.**

In [None]:
# Convert 'id' in movies_metadata to numeric, coercing errors to NaN
movies_metadata['id'] = pd.to_numeric(movies_metadata['id'], errors='coerce')

In [None]:
# Drop rows where 'id' became NaN (these were non-numeric IDs)
movies_metadata.dropna(subset=['id'], inplace=True)

In [None]:
# Convert 'id' to integer type for merging
movies_metadata['id'] = movies_metadata['id'].astype(int)

In [None]:
credits['id'] = credits['id'].astype(int)

In [None]:
print(f"Cleaned movies_metadata shape: {movies_metadata.shape}")
print(f"Cleaned credits shape: {credits.shape}")

Cleaned movies_metadata shape: (45463, 24)
Cleaned credits shape: (45476, 3)


***Merging the two datasets***

In [None]:
# Perform an inner merge to keep only movies present in both datasets.
# Both inputs contain repeated 'id' values: merging them as-is yields more rows
# than either input has, because every duplicate on one side is paired with
# every duplicate on the other. De-duplicate each side on 'id' first (keeping
# the first occurrence) and let pandas verify the key is unique on both sides.
movies_df = pd.merge(
    movies_metadata.drop_duplicates(subset='id'),
    credits.drop_duplicates(subset='id'),
    on='id',
    how='inner',
    validate='one_to_one',
)

In [None]:
print(f"Merged DataFrame shape: {movies_df.shape}")

Merged DataFrame shape: (45538, 26)


In [None]:
print("\nMerged DataFrame Head:")
movies_df.head(2)


Merged DataFrame Head:


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."


In [None]:
movies_df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew'],
      dtype='object')

In [None]:
movies_df['cast'][0]

"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}, {'cast_id': 15, 'character': 'Buzz Lightyear (voice)', 'credit_id': '52fe4284c3a36847f8024f99', 'gender': 2, 'id': 12898, 'name': 'Tim Allen', 'order': 1, 'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'}, {'cast_id': 16, 'character': 'Mr. Potato Head (voice)', 'credit_id': '52fe4284c3a36847f8024f9d', 'gender': 2, 'id': 7167, 'name': 'Don Rickles', 'order': 2, 'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'}, {'cast_id': 17, 'character': 'Slinky Dog (voice)', 'credit_id': '52fe4284c3a36847f8024fa1', 'gender': 2, 'id': 12899, 'name': 'Jim Varney', 'order': 3, 'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'}, {'cast_id': 18, 'character': 'Rex (voice)', 'credit_id': '52fe4284c3a36847f8024fa5', 'gender': 2, 'id': 12900, 'name': 'Wallace Shawn', 'order': 4, 'profile_path': '/oGE6JqPP2xH4t

In [None]:
movies_df['crew'][0]

'[{\'credit_id\': \'52fe4284c3a36847f8024f49\', \'department\': \'Directing\', \'gender\': 2, \'id\': 7879, \'job\': \'Director\', \'name\': \'John Lasseter\', \'profile_path\': \'/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f4f\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12891, \'job\': \'Screenplay\', \'name\': \'Joss Whedon\', \'profile_path\': \'/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f55\', \'department\': \'Writing\', \'gender\': 2, \'id\': 7, \'job\': \'Screenplay\', \'name\': \'Andrew Stanton\', \'profile_path\': \'/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f5b\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12892, \'job\': \'Screenplay\', \'name\': \'Joel Cohen\', \'profile_path\': \'/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f61\', \'department\': \'Writing\', \'gender\': 0, \'id\': 12893, \'job\': \'Screenplay\', \'name\': \'A

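Feature Engineering (Cast & Crew): This block defines functions to parse the string-encoded lists of dictionaries stored in the cast and crew columns (they use Python-literal syntax, so they are parsed with `ast.literal_eval` rather than a JSON parser). It then extracts key roles (Director, Writer, Composer, Cinematographer, Editor, Production Designer, Art Director, and the top 5 billed actors) into new, structured columns and generates concatenated text features.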

In [None]:
# Extract Comprehensive Cast and Crew Information

# --- Define the fixed parsing function ---
def parse_json_column_fixed(json_str):
    """Turn a stringified Python literal (e.g. a list of dicts) into an object.

    Returns an empty list when the input is not a string, is blank, or cannot
    be evaluated by ``ast.literal_eval``; in the last case a warning showing
    the first 100 characters of the input is printed.
    """
    # Guard clause: anything that is not a non-blank string has nothing to parse.
    if not isinstance(json_str, str) or not json_str.strip():
        return []

    try:
        parsed = ast.literal_eval(json_str)
    except (ValueError, SyntaxError) as e:
        print(f"Warning: Could not parse string (first 100 chars): {json_str[:100]}... Error: {e}")
        return []
    return parsed

# Parse the raw 'cast' and 'crew' strings into 'parsed_cast' / 'parsed_crew',
# registering a separately labelled progress bar for each column.
for raw_col in ('cast', 'crew'):
    tqdm.pandas(desc=f"Re-parsing '{raw_col}' column")
    movies_df[f'parsed_{raw_col}'] = movies_df[raw_col].progress_apply(parse_json_column_fixed)


# --- Functions to extract specific roles---

def extract_roles(person_list, job_titles, top_n=None):
    """
    Extract person names from a parsed crew or cast list.

    Args:
        person_list: Iterable of dicts describing people. Each entry is read
            through ``.get('name')`` plus ``.get('job')`` (crew mode) or
            ``.get('order')`` (cast mode). A falsy value (None, empty list)
            yields an empty result.
        job_titles: Collection of job titles to keep. When falsy (None or
            empty), the list is treated as cast: everyone with a name is kept,
            sorted by their 'order' value.
        top_n: Maximum number of names to return. ``None`` returns all names;
            ``0`` returns an empty list.

    Returns:
        List of names, in input order for crew and in 'order' order for cast.
    """
    if not person_list:
        return []

    if job_titles:  # Crew roles (director, composer, writer, ...)
        names = [
            person['name']
            for person in person_list
            if person.get('job') in job_titles and person.get('name')
        ]
    else:  # Cast (actors), ranked by 'order'
        def billing_order(person):
            # A missing or explicit-None 'order' sorts last; comparing None
            # with an int would otherwise raise TypeError inside sorted().
            order = person.get('order')
            return float('inf') if order is None else order

        # sorted() is stable, so entries with equal/missing order keep input order.
        names = [
            person['name']
            for person in sorted(person_list, key=billing_order)
            if person.get('name')
        ]

    # Compare against None (not truthiness) so that top_n=0 means "no names"
    # rather than "no limit".
    return names if top_n is None else names[:top_n]

# Crew job title (as it appears in each crew entry's 'job' field) mapped to the
# suffix from which the output column name is derived (lower-cased).
CREW_ROLES_TO_EXTRACT = {
    'Director': 'Director',
    'Screenplay': 'Writer',
    'Original Music Composer': 'Composer',
    'Director of Photography': 'Cinematographer',
    'Editor': 'Editor',
    'Production Design': 'ProductionDesigner',
    'Art Direction': 'ArtDirector'
}

# Build one list-valued column per crew role.
for job_title, col_name_suffix in tqdm(CREW_ROLES_TO_EXTRACT.items(), desc="Extracting Crew Roles"):
    role_column = col_name_suffix.lower()
    movies_df[role_column] = movies_df['parsed_crew'].apply(
        # Bind the current job title as a default so the lambda is self-contained.
        lambda crew, wanted=[job_title]: extract_roles(crew, job_titles=wanted)
    )

# Leading actors: the first TOP_N_ACTORS names of the cast, ranked by 'order'.
TOP_N_ACTORS = 5 # You can adjust this number
movies_df['main_actors'] = movies_df['parsed_cast'].apply(
    lambda cast: extract_roles(cast, None, TOP_N_ACTORS)
)

# --- Create a combined 'crew_text_features' and 'cast_text_features' string ---

def combine_crew_features(row):
    """Build a one-sentence-per-role text summary of the crew columns in ``row``.

    ``row`` must support ``row[column]`` for every crew column listed below,
    each holding a list of names. Roles with an empty list are skipped; the
    remaining sentences are joined with single spaces.
    """
    # (column holding the names, phrase introducing them), in output order.
    role_phrases = (
        ('director', 'Directed by'),
        ('writer', 'Written by'),
        ('composer', 'Music by'),
        ('cinematographer', 'Cinematography by'),
        ('editor', 'Edited by'),
        ('productiondesigner', 'Production design by'),
        ('artdirector', 'Art direction by'),
    )
    sentences = [
        f"{phrase} {', '.join(row[column])}."
        for column, phrase in role_phrases
        if row[column]
    ]
    return " ".join(sentences).strip()

def combine_cast_features(row):
    """Return "Starring <names>." for the row's 'main_actors', or "" when it is empty."""
    actors = row['main_actors']
    return f"Starring {', '.join(actors)}." if actors else ""


# Build both row-wise text columns, each with its own labelled progress bar.
for target_col, progress_label, row_fn in (
    ('crew_text_features', "Combining Crew Text Features", combine_crew_features),
    ('cast_text_features', "Combining Cast Text Features", combine_cast_features),
):
    tqdm.pandas(desc=progress_label)
    movies_df[target_col] = movies_df.progress_apply(row_fn, axis=1)

Re-parsing 'cast' column:   0%|          | 0/45538 [00:00<?, ?it/s]

Re-parsing 'crew' column:   0%|          | 0/45538 [00:00<?, ?it/s]

Extracting Crew Roles:   0%|          | 0/7 [00:00<?, ?it/s]

Combining Crew Text Features:   0%|          | 0/45538 [00:00<?, ?it/s]

Combining Cast Text Features:   0%|          | 0/45538 [00:00<?, ?it/s]

In [None]:
print("\nSample of extracted roles and combined text features (after comprehensive extraction):")
movies_df[['title', 'director', 'writer', 'composer', 'main_actors', 'crew_text_features', 'cast_text_features']].head()


Sample of extracted roles and combined text features (after comprehensive extraction):


Unnamed: 0,title,director,writer,composer,main_actors,crew_text_features,cast_text_features
0,Toy Story,[John Lasseter],"[Joss Whedon, Andrew Stanton, Joel Cohen, Alec...",[],"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",Directed by John Lasseter. Written by Joss Whe...,"Starring Tom Hanks, Tim Allen, Don Rickles, Ji..."
1,Jumanji,[Joe Johnston],"[Jonathan Hensleigh, Greg Taylor, Jim Strain]",[James Horner],"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...",Directed by Joe Johnston. Written by Jonathan ...,"Starring Robin Williams, Jonathan Hyde, Kirste..."
2,Grumpier Old Men,[Howard Deutch],[],[],"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...",Directed by Howard Deutch.,"Starring Walter Matthau, Jack Lemmon, Ann-Marg..."
3,Waiting to Exhale,[Forest Whitaker],"[Ronald Bass, Terry McMillan]",[Kenneth Edmonds],"[Whitney Houston, Angela Bassett, Loretta Devi...",Directed by Forest Whitaker. Written by Ronald...,"Starring Whitney Houston, Angela Bassett, Lore..."
4,Father of the Bride Part II,[Charles Shyer],"[Nancy Meyers, Albert Hackett]",[Alan Silvestri],"[Steve Martin, Diane Keaton, Martin Short, Kim...",Directed by Charles Shyer. Written by Nancy Me...,"Starring Steve Martin, Diane Keaton, Martin Sh..."


In [None]:
movies_df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'parsed_cast',
       'parsed_crew', 'director', 'writer', 'composer', 'cinematographer',
       'editor', 'productiondesigner', 'artdirector', 'main_actors',
       'crew_text_features', 'cast_text_features'],
      dtype='object')

In [None]:
# Drop the raw 'cast'/'crew' strings and their parsed counterparts to save memory;
# the role columns and text features derived from them stay in the frame.
# errors='ignore' lets this cell be re-run after the columns are already gone.
movies_df.drop(columns=['cast', 'crew', 'parsed_cast', 'parsed_crew'], inplace=True, errors='ignore')

print("\nFinished extracting and combining comprehensive cast and crew information.")
print(f"DataFrame shape after comprehensive role extraction: {movies_df.shape}")


Finished extracting and combining comprehensive cast and crew information.
DataFrame shape after comprehensive role extraction: (45538, 34)


In [None]:
# Load IMDB Multimodal dataset
# Data Enrichment (Plot): ingest the supplementary dataset containing enhanced
# plot descriptions used to enrich the primary movie data.
imdb_multimodal_path = drive_path
imdb_plot_filename = 'IMDB_four_genre_larger_plot_description.csv'
try:
    imdb_df = pd.read_csv(imdb_multimodal_path + imdb_plot_filename)
except FileNotFoundError:
    # Name the file that is actually read, so the hint matches what is missing.
    print(f"Error: Make sure {imdb_plot_filename} is in {imdb_multimodal_path}")
    print("Please download it and adjust the path accordingly.")
    # Re-raise: the following cells use imdb_df and would otherwise fail with
    # a NameError that hides the real cause.
    raise
else:
    print(f"{imdb_plot_filename} loaded successfully.")
    print(f"IMDB Multimodal shape: {imdb_df.shape}")

IMDB Multimodal descriptionsort.csv loaded successfully.
IMDB Multimodal shape: (1000, 3)


In [None]:
imdb_df.head(2)

Unnamed: 0,movie_id,description,genre
0,tt12783454,Elle Evans (Joey King) has finally completed h...,romance
1,tt1798632,A young girl tries to understand how she myste...,horror


In [None]:
imdb_df.columns

Index(['movie_id', 'description', 'genre'], dtype='object')

In [None]:
imdb_df['movie_id'] = imdb_df['movie_id'].astype(str).str.strip()

In [None]:
# --- Data Integration ---
# Left-join the supplementary IMDB plot descriptions onto movies_df, matching
# movies_df['imdb_id'] against imdb_df['movie_id']. Every movies_df row is kept;
# rows without a match get NaN in the columns coming from imdb_df.
# NOTE(review): `suffixes` has no effect here. The two frames share no column
# names ('genres'/'overview' vs 'genre'/'description'), so the merged columns
# keep their plain names: 'description', 'genre', 'movie_id'.
final_movies_df = pd.merge(
    movies_df,
    imdb_df[['movie_id', 'description', 'genre']], # relevant columns from imdb_df
    left_on='imdb_id',   # Column in movies_df
    right_on='movie_id', # Column in imdb_df
    how='left',          # Keep all from movies_df
    suffixes=('_meta', '_imdb') # Only applied to overlapping column names (none here)
)

In [None]:
# Prioritize 'description' from IMDB Multimodal if it exists, otherwise use 'overview' from movies_metadata
final_movies_df['plot_description'] = final_movies_df['description'].fillna(final_movies_df['overview'])

In [None]:
# Drop columns made redundant by the consolidation: 'overview' (now folded into
# 'plot_description') and the join key 'movie_id' (a copy of 'imdb_id' for
# matched rows).
# Suffixed names such as 'description_imdb', 'genre_imdb' or 'genres_meta' are
# not listed: the merge shares no column names between its inputs, so pandas
# never creates them, and listing them only hid that fact behind
# errors='ignore'. 'description', 'genre' and 'genres' therefore stay in the
# frame, which is what the final column listing shows.
final_movies_df.drop(columns=['overview', 'movie_id'], inplace=True, errors='ignore')

In [None]:
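# Optional filter (currently disabled, so no rows are removed at this step):
# uncommenting the line below keeps only movies that have a plot description.
# It does not check for a director/actor/composer.
# final_movies_df.dropna(subset=['plot_description'], inplace=True)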

In [None]:
print("\nFinal Merged Content DataFrame Head (with IMDB descriptions):")
final_movies_df.head(1)


Final Merged Content DataFrame Head (with IMDB descriptions):


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,popularity,...,cinematographer,editor,productiondesigner,artdirector,main_actors,crew_text_features,cast_text_features,description,genre,plot_description
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,21.946943,...,[],"[Lee Unkrich, Robert Gordon]",[],[Ralph Eggleston],"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",Directed by John Lasseter. Written by Joss Whe...,"Starring Tom Hanks, Tim Allen, Don Rickles, Ji...",,,"Led by Woody, Andy's toys live happily in his ..."


In [None]:
print(f"\nFinal Merged Content DataFrame shape after all merges and filtering: {final_movies_df.shape}")


Final Merged Content DataFrame shape after all merges and filtering: (45538, 36)


In [None]:
# Rename 'id' to 'tmdb_id' so the origin of the identifier is explicit;
# 'imdb_id' is also retained for potential external links.
final_movies_df = final_movies_df.rename(columns={'id': 'tmdb_id'})

In [None]:
final_movies_df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'tmdb_id', 'imdb_id', 'original_language', 'original_title',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'director', 'writer', 'composer',
       'cinematographer', 'editor', 'productiondesigner', 'artdirector',
       'main_actors', 'crew_text_features', 'cast_text_features',
       'description', 'genre', 'plot_description'],
      dtype='object')

**Note:** This part was run on my local PC, as there were many posters to download and API calls to monitor.

In [None]:
from google.colab import userdata # For Colab secrets

# Directory (under drive_path) that receives the downloaded posters; created on demand.
poster_download_dir = os.path.join(drive_path, 'tmdb_posters')
os.makedirs(poster_download_dir, exist_ok=True)

print(f"Posters will be saved to: {poster_download_dir}")

# Read the TMDB API key from Colab Secrets. Any failure is reported and leaves
# TMDB_API_KEY set to None rather than stopping the notebook.
try:
    TMDB_API_KEY = userdata.get('TMDB_API_KEY')
except Exception as e:
    print(f"Could not load TMDB_API_KEY from Colab Secrets: {e}")
    print("Please ensure you've set it up correctly.")
    TMDB_API_KEY = None
else:
    if not TMDB_API_KEY:
        print("Error: TMDB_API_KEY not found in Colab Secrets or is empty.")
        print("Please ensure you followed the instructions to add it via the left sidebar 'key' icon.")
    else:
        print("TMDB_API_KEY loaded successfully from Colab Secrets.")

Data Ingestion (Multimodal Features): Loading pre-processed data from local execution, which includes the full enriched metadata and a Parquet file containing poster image features (poster_byte, avg_rgb_color).

In [None]:
local_project_dir = '/content/drive/MyDrive/Embedding_Based_Recommendations_Project/Datasets/'
local_data_dir = os.path.join(local_project_dir, 'final_datasets')

local_full_enriched_metadata_path = os.path.join(local_data_dir, 'multimodal_movies_content_for_local_processing.xlsx')
local_poster_features_parquet_path = os.path.join(local_data_dir, 'multimodal_movies_content_for_publication.parquet')


# --- Load the two source DataFrames ---

# Load the full, enriched metadata DataFrame (the left side of our merge)
if not os.path.exists(local_full_enriched_metadata_path):
    raise FileNotFoundError(f"Error: Full enriched metadata Excel file not found at {local_full_enriched_metadata_path}")
try:
    # Assuming this has all ~44k movies with rich metadata (textual, crew/cast)
    full_enriched_metadata_df = pd.read_excel(local_full_enriched_metadata_path)
    print(f"Loaded full_enriched_metadata_df. Shape: {full_enriched_metadata_df.shape}")
    # Ensure tmdb_id and imdb_id are correctly typed for merging.
    # NOTE(review): unparseable tmdb_id values all become 0 and missing imdb_id
    # values become the string 'nan' — confirm neither occurs in this file,
    # since rows sharing tmdb_id 0 would match each other in a merge.
    full_enriched_metadata_df['tmdb_id'] = pd.to_numeric(full_enriched_metadata_df['tmdb_id'], errors='coerce').fillna(0).astype(int)
    full_enriched_metadata_df['imdb_id'] = full_enriched_metadata_df['imdb_id'].astype(str)
except Exception as e:
    # Chain explicitly so the original error is reported as the direct cause.
    raise Exception(f"Error loading full enriched metadata Excel: {e}") from e

Loaded full_enriched_metadata_df. Shape: (44584, 37)


In [None]:
# Load the poster features DataFrame (the right side of our merge)
if not os.path.exists(local_poster_features_parquet_path):
    raise FileNotFoundError(f"Error: Poster features Parquet file not found at {local_poster_features_parquet_path}")
try:
    # This DataFrame has ~5.6k movies with 'poster_byte' and 'avg_rgb_color'
    poster_features_df = pd.read_parquet(local_poster_features_parquet_path)
    print(f"Loaded poster_features_df. Shape: {poster_features_df.shape}")
    # Ensure tmdb_id and imdb_id are correctly typed for merging.
    # NOTE(review): unparseable tmdb_id values all become 0 and missing imdb_id
    # values become the string 'nan' — confirm neither occurs in this file.
    poster_features_df['tmdb_id'] = pd.to_numeric(poster_features_df['tmdb_id'], errors='coerce').fillna(0).astype(int)
    poster_features_df['imdb_id'] = poster_features_df['imdb_id'].astype(str)
except Exception as e:
    # Chain explicitly so the original error is reported as the direct cause.
    raise Exception(f"Error loading poster features Parquet: {e}") from e


Loaded poster_features_df. Shape: (5620, 38)


In [None]:
full_enriched_metadata_df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'tmdb_id', 'imdb_id', 'original_language', 'original_title',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'director', 'writer', 'composer',
       'cinematographer', 'editor', 'productiondesigner', 'artdirector',
       'main_actors', 'crew_text_features', 'cast_text_features',
       'description', 'genre', 'plot_description', 'poster_local_path'],
      dtype='object')

In [None]:
# --- Merge new Parquet file with full_enriched_metadata_df ---
# Final Assembly: attach the poster image features to the full metadata.
# Only the join key and the two poster feature columns are taken from the right
# frame. Also pulling 'imdb_id' and 'poster_path' from it makes those names
# overlap with the left frame, so pandas renames both copies with suffixes and
# the plain 'imdb_id' / 'poster_path' columns vanish from the merged frame
# (and, with them, from the final column selection).
print("\nMerging full metadata with poster features...")
poster_feature_cols = ['tmdb_id', 'poster_byte', 'avg_rgb_color']
full_dataset_df = pd.merge(
    full_enriched_metadata_df,
    # Keep one row per tmdb_id on the right: repeated keys there would make the
    # left join emit extra copies of the matching metadata rows.
    poster_features_df[poster_feature_cols].drop_duplicates(subset='tmdb_id'),
    on='tmdb_id',
    how='left',
    validate='many_to_one',  # fail loudly if the right-hand key is not unique
)
print(f"Merged DataFrame shape: {full_dataset_df.shape}")


Merging full metadata with poster features...
Merged DataFrame shape: (44600, 41)


In [None]:
# ---  Add 'Poster_available' flag ---
print("Creating 'Poster_available' flag...")
# True where the merge attached poster bytes. notna() already yields booleans;
# the explicit cast only pins the column's dtype.
full_dataset_df['Poster_available'] = full_dataset_df['poster_byte'].notna().astype(bool)


Creating 'Poster_available' flag...


In [None]:
# --- Ensure NULL for missing poster data ---
# For rows flagged as having no poster, blank out both image-derived columns so
# 'avg_rgb_color' cannot carry a value when 'poster_byte' is missing.
full_dataset_df.loc[~full_dataset_df['Poster_available'], ['poster_byte', 'avg_rgb_color']] = None # Or np.nan

In [None]:
# --- Remove Unnecessary Columns ---
print("Removing unnecessary columns...")
columns_to_keep = [
    'tmdb_id', 'imdb_id', 'title', 'plot_description', 'genres', 'adult', 'tagline',# Basic movie info
    'director', 'writer', 'composer', 'cinematographer', 'editor', # Crew
    'productiondesigner', 'artdirector', 'main_actors', # More crew/cast
    'crew_text_features', 'cast_text_features', # Combined text features
    'original_language', 'runtime', 'vote_average', 'vote_count', 'release_date','popularity', 'budget', 'revenue',  # Numerical/categorical
    'poster_path', # TMDB relative path, needed for users to download if poster_byte is excluded
    'Poster_available', # New flag
    'poster_byte', # Raw image bytes (will make file huge, especially CSV/Excel)
    'avg_rgb_color' # Derived image feature
]

# Keep the requested columns that exist, in the order listed above, and report
# any that are absent instead of omitting them silently (a merge that suffixes
# overlapping names, for example, makes the plain names disappear).
current_cols = full_dataset_df.columns.tolist()
final_columns_to_select = [col for col in columns_to_keep if col in current_cols]
missing_columns = [col for col in columns_to_keep if col not in current_cols]
if missing_columns:
    print(f"WARNING: requested columns not found in the merged DataFrame and omitted: {missing_columns}")

# Select only the desired columns (copy so later edits do not touch full_dataset_df)
full_dataset_df_new = full_dataset_df[final_columns_to_select].copy()

print(f"Final DataFrame columns after selection: {full_dataset_df_new.columns.tolist()}")
print(f"Final DataFrame shape before saving: {full_dataset_df_new.shape}")

Removing unnecessary columns...
Final DataFrame columns after selection: ['tmdb_id', 'title', 'plot_description', 'genres', 'adult', 'tagline', 'director', 'writer', 'composer', 'cinematographer', 'editor', 'productiondesigner', 'artdirector', 'main_actors', 'crew_text_features', 'cast_text_features', 'original_language', 'runtime', 'vote_average', 'vote_count', 'release_date', 'popularity', 'budget', 'revenue', 'Poster_available', 'poster_byte', 'avg_rgb_color']
Final DataFrame shape before saving: (44600, 27)


Data Serialization: The final, fully-processed DataFrame is serialized to disk. Parquet is the recommended output format due to its efficiency with complex data types and superior compression.

In [None]:
# --- Final Save of the Full Dataset for Publication ---
print("\nSaving the full dataset for publication (Parquet recommended, CSV/Excel might be problematic)...")

# Output locations: one shared base name, one file per format, all under local_data_dir.
final_output_filename_base = 'multimodal_movies_full_dataset_for_publication'
final_output_parquet_path = os.path.join(local_data_dir, final_output_filename_base + ".parquet")
final_output_csv_path = os.path.join(local_data_dir, final_output_filename_base + ".csv")
final_output_excel_path = os.path.join(local_data_dir, final_output_filename_base + ".xlsx")

# Each export is best-effort: a failure is reported as a warning and the
# notebook moves on to the next format.
print(f"Attempting to save to Parquet: {final_output_parquet_path}")
try:
    full_dataset_df_new.to_parquet(final_output_parquet_path, index=False)
except Exception as e:
    print(f"  - WARNING: Could not save final Parquet: {e}. Ensure pyarrow is installed.")
else:
    print(f"  - Saved to Parquet: {final_output_parquet_path}")

print(f"Attempting to save to CSV: {final_output_csv_path}")
try:
    full_dataset_df_new.to_csv(final_output_csv_path, index=False)
except Exception as e:
    print(f"  - WARNING: Could not save final CSV: {e}. File will be very large due to 'poster_byte' and might be slow/corrupt.")
else:
    print(f"  - Saved to CSV: {final_output_csv_path}")


Saving the full dataset for publication (Parquet recommended, CSV/Excel might be problematic)...
Attempting to save to Parquet: /content/drive/MyDrive/Embedding_Based_Recommendations_Project/Datasets/final_datasets/multimodal_movies_full_dataset_for_publication.parquet
  - Saved to Parquet: /content/drive/MyDrive/Embedding_Based_Recommendations_Project/Datasets/final_datasets/multimodal_movies_full_dataset_for_publication.parquet
Attempting to save to CSV: /content/drive/MyDrive/Embedding_Based_Recommendations_Project/Datasets/final_datasets/multimodal_movies_full_dataset_for_publication.csv
  - Saved to CSV: /content/drive/MyDrive/Embedding_Based_Recommendations_Project/Datasets/final_datasets/multimodal_movies_full_dataset_for_publication.csv
Attempting to save to Excel: /content/drive/MyDrive/Embedding_Based_Recommendations_Project/Datasets/final_datasets/multimodal_movies_full_dataset_for_publication.xlsx

Phase 1: Full dataset assembly complete locally.
You can now upload '/conte

------------