In [None]:
import pandas as pd
import numpy as np
import os
import ast
from sklearn.model_selection import train_test_split
from collections import Counter

In [None]:
# --- Mount Google Drive and Define Paths ---
# Colab-only step: makes the user's Google Drive visible under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Drive folder holding the final datasets (the trailing slash is part of the value).
drive_path = '/content/drive/MyDrive/Embedding_Based_Recommendations_Project/Datasets/final_datasets/'

# First path: the full dataset read by this notebook.
# Second path: the stratified prototype sample this notebook writes.
full_dataset_parquet_path, output_stratified_parquet_path = (
    os.path.join(drive_path, parquet_name)
    for parquet_name in (
        'multimodal_movies_full_dataset_for_publication.parquet',
        'multimodal_movies_stratified_prototype_dataset.parquet',
    )
)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# --- Load the Full Dataset ---
# Reads the full movie dataset into `full_movies_df`. The existence check comes
# first so a missing upload produces an explicit FileNotFoundError.
print(f"Loading full dataset from: {full_dataset_parquet_path}")
dataset_file_found = os.path.exists(full_dataset_parquet_path)
if not dataset_file_found:
    print(
        f"Error: Full dataset Parquet file not found at {full_dataset_parquet_path}.\n"
        "Please ensure you uploaded it from your local PC to this Drive path."
    )
    raise FileNotFoundError("Full dataset not found. Cannot perform stratification.")

try:
    full_movies_df = pd.read_parquet(full_dataset_parquet_path)
    print(f"Dataset loaded successfully. Original shape: {full_movies_df.shape}")
    print("Initial Columns (from loaded Parquet):", full_movies_df.columns.tolist())
except Exception as load_error:
    # Report the failure, then re-raise so the notebook stops here.
    print(f"Error loading Parquet file: {load_error}")
    raise

Loading full dataset from: /content/drive/MyDrive/Embedding_Based_Recommendations_Project/Datasets/final_datasets/multimodal_movies_full_dataset_for_publication.parquet
Dataset loaded successfully. Original shape: (44600, 27)
Initial Columns (from loaded Parquet): ['tmdb_id', 'title', 'plot_description', 'genres', 'adult', 'tagline', 'director', 'writer', 'composer', 'cinematographer', 'editor', 'productiondesigner', 'artdirector', 'main_actors', 'crew_text_features', 'cast_text_features', 'original_language', 'runtime', 'vote_average', 'vote_count', 'release_date', 'popularity', 'budget', 'revenue', 'Poster_available', 'poster_byte', 'avg_rgb_color']


In [None]:
# --- Define Helper Parsing Function ---
# This function is for consistently parsing list-like string columns
def parse_list_column_corrected(list_str):
    """Turn one list-like cell into a clean list of name strings.

    Supported inputs:
      * a string containing a Python literal list whose items are dicts with a
        'name' key and/or plain strings;
      * a string containing a single dict literal with a 'name' key;
      * any other string that is not a valid Python literal, which is split
        on '|' (e.g. "Action|Comedy");
      * an already materialized list, tuple or NumPy array of such items
        (previously these silently produced an empty list).

    Args:
        list_str: Raw cell value (string, sequence, NaN/None, ...).

    Returns:
        list[str]: Stripped, non-empty names in input order; [] when the value
        is missing or holds nothing usable.
    """
    if isinstance(list_str, np.ndarray):
        # ravel() also copes with 0-d arrays; tolist() yields plain Python objects.
        candidates = list_str.ravel().tolist()
    elif isinstance(list_str, (list, tuple)):
        candidates = list(list_str)
    elif isinstance(list_str, str):
        try:
            parsed_literal = ast.literal_eval(list_str)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal: fall back to a pipe-separated string.
            return [g.strip() for g in list_str.split('|') if g.strip()]
        if isinstance(parsed_literal, list):
            candidates = parsed_literal
        elif isinstance(parsed_literal, dict):
            candidates = [parsed_literal]
        else:
            # Numbers, None, tuples, ... carry no names.
            return []
    else:
        # NaN, None and any other non-string scalar.
        return []

    extracted_items = []
    for item in candidates:
        if isinstance(item, dict):
            item = item.get('name')
        # Skip non-string names (e.g. None) instead of crashing on .strip().
        if isinstance(item, str):
            cleaned = item.strip()
            if cleaned:
                extracted_items.append(cleaned)
    return extracted_items


In [None]:
# ---  Prepare All Derived Columns for Filtering and Stratification ---
print("\nPreparing all derived columns for filtering and stratification...")

# --- Column Renaming for Consistency ---
# Only rename when the target name is free, so re-running the cell is harmless.
if 'director' in full_movies_df.columns and 'directors' not in full_movies_df.columns:
    full_movies_df.rename(columns={'director': 'directors'}, inplace=True)
    print("Renamed 'director' column to 'directors' for consistency.")


# --- Parse list-like columns ---
list_columns_to_parse = [
    'genres', 'directors', 'writer', 'composer', 'cinematographer', 'editor',
    'productiondesigner', 'artdirector', 'main_actors'
]
# Every source column gets a '<col>_parsed' companion holding a list of names.
for col in list_columns_to_parse:
    if col in full_movies_df.columns:
        full_movies_df[f'{col}_parsed'] = full_movies_df[col].apply(parse_list_column_corrected)
    else:
        # If original column somehow missing, create empty parsed column to avoid errors later
        print(f"Warning: Original column '{col}' not found. '{col}_parsed' will be empty.")
        # One fresh list per row: `[[]] * n` would put the SAME list object in
        # every row, so mutating one cell would change all of them.
        full_movies_df[f'{col}_parsed'] = [[] for _ in range(len(full_movies_df))]


Preparing all derived columns for filtering and stratification...
Renamed 'director' column to 'directors' for consistency.


In [None]:
# --- Derive primary_genre ---
# The first parsed genre is the primary one; rows with no genres get 'Unknown'.
full_movies_df['primary_genre'] = full_movies_df['genres_parsed'].apply(lambda x: x[0] if x else 'Unknown')


# --- Prepare release_year for binning ---
if 'release_date' in full_movies_df.columns:
    # Unparseable dates become NaT, which yields a missing (<NA>) year.
    full_movies_df['release_date_dt'] = pd.to_datetime(full_movies_df['release_date'], errors='coerce')
    full_movies_df['release_year'] = full_movies_df['release_date_dt'].dt.year.astype('Int64')
else:
    print("Warning: 'release_date' column not found. Cannot derive 'release_year'.")
    # Use the same nullable-integer dtype as the main path. Assigning the bare
    # scalar pd.NA would create an object column, which the later pd.cut
    # binning and `>= year` filter are not written for.
    full_movies_df['release_year'] = pd.Series(pd.NA, index=full_movies_df.index, dtype='Int64')


In [None]:
# Half-open year intervals [left, right) used as one stratification dimension.
bins = [1900, 1990, 1995, 2000, 2005, 2010, 2015, 2020, 2030] # Extended lower end for robustness
labels = ['Pre-1990', 'Early-90s', 'Late-90s', 'Early-00s', 'Late-00s', 'Early-10s', 'Late-10s', 'Early-20s']

# Missing years, and years outside [1900, 2030), fall into no interval and
# receive the 'Unknown_Year_Bin' label below.
full_movies_df['release_year_binned'] = pd.cut(
    full_movies_df['release_year'],
    bins=bins,
    labels=labels[:len(bins)-1], # Slicing labels to match number of bins-1
    right=False, # Interval is [left, right)
    include_lowest=True
).astype(object).fillna('Unknown_Year_Bin') # Fill NaN years with a distinct string


if 'Poster_available' not in full_movies_df.columns:
    # If poster_byte is present, derive from it. Otherwise, assume False.
    if 'poster_byte' in full_movies_df.columns:
        full_movies_df['Poster_available'] = full_movies_df['poster_byte'].notna()
    else:
        print("Warning: 'Poster_available' and 'poster_byte' columns not found. Assuming all posters unavailable.")
        full_movies_df['Poster_available'] = False

# Missing flags count as "no poster". A bare astype(bool) would turn NaN/None
# into True, and raises on a nullable boolean column that contains <NA>.
poster_flag = full_movies_df['Poster_available']
full_movies_df['Poster_available'] = poster_flag.where(poster_flag.notna(), False).astype(bool)

print("\nDerived columns created. Sample of these columns:")
print(full_movies_df[['primary_genre', 'release_year_binned', 'Poster_available']].head())



Derived columns created. Sample of these columns:
  primary_genre release_year_binned  Poster_available
0     Animation            Late-90s             False
1     Adventure            Late-90s              True
2       Romance            Late-90s             False
3        Comedy            Late-90s             False
4        Comedy            Late-90s             False


In [None]:
print("\nApplying initial filters (English movies, release year >= 1990) for stratification source...")
# Work on a copy so the filters below never touch the full dataset.
filtered_df_for_stratification = full_movies_df.copy()

# Filter by original_language == 'en'
initial_filter_shape = len(filtered_df_for_stratification)
if 'original_language' not in filtered_df_for_stratification.columns:
    print("Warning: 'original_language' column not found. Cannot filter by language.")
else:
    english_mask = filtered_df_for_stratification['original_language'] == 'en'
    filtered_df_for_stratification = filtered_df_for_stratification[english_mask].copy()
    removed_non_english = initial_filter_shape - len(filtered_df_for_stratification)
    print(f"Filtered to English movies only. Shape: {filtered_df_for_stratification.shape} ({removed_non_english} removed).")

# Filter by release_year >= 1990
year_filter_threshold = 1990
if 'release_year' not in filtered_df_for_stratification.columns:
    print("Warning: 'release_year' column not found. Cannot filter by release year.")
else:
    # Re-baseline the row count so the "removed" figure covers this filter only.
    initial_filter_shape = len(filtered_df_for_stratification)
    recent_enough_mask = filtered_df_for_stratification['release_year'] >= year_filter_threshold
    filtered_df_for_stratification = filtered_df_for_stratification[recent_enough_mask].copy()
    removed_too_old = initial_filter_shape - len(filtered_df_for_stratification)
    print(f"Filtered to movies released in {year_filter_threshold} or later. Shape: {filtered_df_for_stratification.shape} ({removed_too_old} removed).")

print(f"DataFrame shape after initial filtering for stratification source: {filtered_df_for_stratification.shape}")




Applying initial filters (English movies, release year >= 1990) for stratification source...
Filtered to English movies only. Shape: (32263, 40) (12337 removed).
Filtered to movies released in 1990 or later. Shape: (20964, 40) (11299 removed).
DataFrame shape after initial filtering for stratification source: (20964, 40)


In [None]:
# --- Create the Composite Stratification Key ---
# Key layout: "<primary genre>_<release year bin>_<poster available>".
genre_part = filtered_df_for_stratification['primary_genre']
year_bin_part = filtered_df_for_stratification['release_year_binned'].astype(str)
poster_part = filtered_df_for_stratification['Poster_available'].astype(str)
filtered_df_for_stratification['stratify_key'] = genre_part + '_' + year_bin_part + '_' + poster_part


# --- Handle Sparse Strata for Robust Stratification ---
print("\nHandling sparse strata for robust stratification...")
stratify_key_counts = filtered_df_for_stratification['stratify_key'].value_counts()
min_samples_per_stratum = 5
# Keys with fewer members than the threshold, in value_counts order.
rare_strata = [
    key for key, member_count in stratify_key_counts.items()
    if member_count < min_samples_per_stratum
]

final_keys = filtered_df_for_stratification['stratify_key']
if rare_strata:
    print(f"Grouping {len(rare_strata)} rare composite strata (less than {min_samples_per_stratum} movies) into 'Other_Stratum'.")
    # Collapse every rare key into one shared bucket; all other keys stay as they are.
    final_keys = final_keys.mask(final_keys.isin(rare_strata), 'Other_Stratum')
filtered_df_for_stratification['stratify_key_final'] = final_keys

print("Distribution of final stratification keys (Top 20, after grouping rare ones):")
print(filtered_df_for_stratification['stratify_key_final'].value_counts(normalize=True).head(20))



Handling sparse strata for robust stratification...
Grouping 42 rare composite strata (less than 5 movies) into 'Other_Stratum'.
Distribution of final stratification keys (Top 20, after grouping rare ones):
stratify_key_final
Drama_Early-10s_False          0.057909
Drama_Late-00s_False           0.047796
Documentary_Early-10s_False    0.046079
Comedy_Late-00s_False          0.045077
Comedy_Early-10s_False         0.044648
Drama_Early-00s_False          0.032675
Documentary_Late-00s_False     0.028811
Comedy_Early-00s_False         0.027905
Drama_Late-90s_False           0.027380
Comedy_Late-90s_False          0.023087
Horror_Early-10s_False         0.021370
Action_Early-10s_False         0.020750
Action_Late-00s_False          0.020368
Drama_Late-10s_False           0.020177
Comedy_Late-10s_False          0.018699
Drama_Early-90s_False          0.018174
Action_Early-00s_False         0.016027
Thriller_Early-10s_False       0.015741
Comedy_Early-90s_False         0.015598
Horror_Late-0

In [None]:
desired_prototype_size = 2000 # Sample size for the prototype
num_posters_false_in_sample = 30 # Number of movies WITHOUT posters to include (between 20-30)

print(f"\nPerforming stratified sampling with custom handling for Poster_available=False ({num_posters_false_in_sample} movies)...")

# Split the filtered pool into poster-less and poster-bearing movies.
poster_flags = filtered_df_for_stratification['Poster_available']
df_poster_false = filtered_df_for_stratification[poster_flags.eq(False)].copy()
df_poster_true = filtered_df_for_stratification[poster_flags.eq(True)].copy()

# Poster-less movies are drawn by plain random sampling with a fixed seed;
# when the pool is smaller than the quota, the whole pool is taken.
available_without_poster = df_poster_false.shape[0]
if available_without_poster >= num_posters_false_in_sample:
    sampled_false_posters_df = df_poster_false.sample(n=num_posters_false_in_sample, random_state=42).copy()
else:
    print(f"Warning: Not enough movies with Poster_available=False ({available_without_poster}) to meet desired {num_posters_false_in_sample}. Taking all available.")
    sampled_false_posters_df = df_poster_false.copy()



Performing stratified sampling with custom handling for Poster_available=False (30 movies)...


In [None]:
# Sample Poster_available = True movies (stratified by composite key)
# The poster-bearing movies fill whatever part of the prototype the poster-less
# sample did not use.
remaining_sample_size_for_true = desired_prototype_size - sampled_false_posters_df.shape[0]

# Zero-row placeholder that keeps the source columns AND their dtypes, so a
# later concat is not pushed to object dtype by an all-object empty frame.
empty_true_sample_df = filtered_df_for_stratification.iloc[0:0].copy()

if remaining_sample_size_for_true <= 0:
    print("Desired sample size is met or exceeded by False posters. Not sampling True posters.")
    sampled_true_posters_df = empty_true_sample_df
elif df_poster_true.empty:
    print("Warning: No movies with Poster_available=True available to sample.")
    sampled_true_posters_df = empty_true_sample_df
else:
    # A stratified split needs at least two members per stratum; drop the rest.
    valid_strata_true = df_poster_true['stratify_key_final'].value_counts()
    problematic_strata_true = valid_strata_true[valid_strata_true < 2].index.tolist()

    df_for_stratified_split_true = df_poster_true.copy()
    if problematic_strata_true:
        print(f"  Filtering {len(problematic_strata_true)} problematic strata from Poster_available=True sampling.")
        df_for_stratified_split_true = df_poster_true[~df_poster_true['stratify_key_final'].isin(problematic_strata_true)].copy()

    if df_for_stratified_split_true.empty:
        print("Warning: No valid data left for stratified sampling of Poster_available=True movies.")
        sampled_true_posters_df = empty_true_sample_df
    else:
        available_for_true = df_for_stratified_split_true.shape[0]
        actual_sample_size_for_true = min(remaining_sample_size_for_true, available_for_true)

        # Kept for reference/reporting only; the split below uses the exact count.
        sample_fraction_to_keep_true = actual_sample_size_for_true / available_for_true

        if actual_sample_size_for_true >= available_for_true:
            # The whole pool is needed, so there is nothing to split.
            sampled_true_posters_df = df_for_stratified_split_true.copy()
        else:
            # Pass the absolute row count rather than a float fraction: with a
            # fraction scikit-learn computes ceil(fraction * n), and rounding
            # error can make the sample one row larger than requested.
            # NOTE(review): train_test_split still raises ValueError when the
            # requested sample is smaller than the number of strata.
            _, sampled_true_posters_df = train_test_split(
                df_for_stratified_split_true,
                test_size=int(actual_sample_size_for_true), # Absolute number of rows in the 'test' split (our sample)
                stratify=df_for_stratified_split_true['stratify_key_final'],
                random_state=42 # For reproducibility
            )


In [None]:
# Stack the poster-less and poster-bearing samples into one frame with a fresh 0..n-1 index.
sample_parts = [sampled_false_posters_df, sampled_true_posters_df]
stratified_prototype_df = pd.concat(sample_parts, ignore_index=True)

# Summing the boolean flag counts the movies that have a poster.
movies_with_posters = stratified_prototype_df['Poster_available'].sum()
movies_without_posters = len(stratified_prototype_df) - movies_with_posters

print(f"\nFinal stratified prototype DataFrame shape: {stratified_prototype_df.shape}")
print(f"  ({movies_with_posters} movies with posters, "
      f"{movies_without_posters} movies without posters)")




Final stratified prototype DataFrame shape: (2000, 42)
  (1970 movies with posters, 30 movies without posters)


In [None]:
# --- Finalize and Save Sampled Dataset ---
# Helper columns that existed only to drive filtering and stratification.
# NOTE(review): only 'genres_parsed' is listed here; the other '*_parsed'
# columns created earlier stay in the saved file. Confirm that is intended.
columns_to_drop_from_final = [
    'genres_parsed', 'primary_genre', 'release_date_dt', 'release_year',
    'release_year_binned', 'stratify_key', 'stratify_key_final'
]

# Drop only the helper columns that are actually present.
helper_columns_present = stratified_prototype_df.columns.intersection(columns_to_drop_from_final)
stratified_prototype_df.drop(columns=helper_columns_present, inplace=True, errors='ignore')

# Write the prototype sample to Drive without the DataFrame index.
stratified_prototype_df.to_parquet(output_stratified_parquet_path, index=False)

--------------