In [5]:
import pandas as pd

In [7]:
df = pd.read_csv('data/credits.csv')

In [8]:
import ast

# Convert the 'cast' and 'crew' columns from strings to objects
df['cast'] = df['cast'].apply(ast.literal_eval)
df['crew'] = df['crew'].apply(ast.literal_eval)

In [9]:
df.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': 'Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [10]:
from collections import defaultdict

cast_list = []
cast_id_list = []
crew_list = []
crew_id_list = []

for index, row in df.iterrows():
    cast = []
    cast_ids = []
    crew = defaultdict(list)
    crew_ids = defaultdict(list)
    for cast_member in row['cast']:
        cast.append(cast_member.get('name'))
        cast_ids.append(cast_member.get('id'))
    for crew_member in row['crew']:
        department = crew_member.get('department')
        if department:
            crew[department].append(crew_member.get('name'))
            crew_ids[department].append(crew_member.get('id'))
    cast_list.append(cast)
    cast_id_list.append(cast_ids)
    crew_list.append(crew)
    crew_id_list.append(crew_ids)

print(cast_list[:10])  # Display the first 10 entries
print(crew_id_list[:10])  # Display the first 10 IDs

[['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim Varney', 'Wallace Shawn', 'John Ratzenberger', 'Annie Potts', 'John Morris', 'Erik von Detten', 'Laurie Metcalf', 'R. Lee Ermey', 'Sarah Freeman', 'Penn Jillette'], ['Robin Williams', 'Jonathan Hyde', 'Kirsten Dunst', 'Bradley Pierce', 'Bonnie Hunt', 'Bebe Neuwirth', 'David Alan Grier', 'Patricia Clarkson', 'Adam Hann-Byrd', 'Laura Bell Bundy', 'James Handy', 'Gillian Barber', 'Brandon Obray', 'Cyrus Thiedeke', 'Gary Joseph Thorup', 'Leonard Zola', 'Lloyd Berry', 'Malcolm Stewart', 'Annabel Kershaw', 'Darryl Henriques', 'Robyn Driscoll', 'Peter Bryant', 'Sarah Gilson', 'Florica Vlad', 'June Lion', 'Brenda Lockmuller'], ['Walter Matthau', 'Jack Lemmon', 'Ann-Margret', 'Sophia Loren', 'Daryl Hannah', 'Burgess Meredith', 'Kevin Pollak'], ['Whitney Houston', 'Angela Bassett', 'Loretta Devine', 'Lela Rochon', 'Gregory Hines', 'Dennis Haysbert', 'Michael Beach', 'Mykelti Williamson', 'Lamont Johnson', 'Wesley Snipes'], ['Steve Martin', 'Diane K

In [16]:
from collections import defaultdict

# Step 1: Get all departments and their max crew count
department_max_counts = defaultdict(int)

for crew_ids in crew_id_list:
    for department, ids in crew_ids.items():
        department_max_counts[department] = max(department_max_counts[department], len(ids))

# Step 2: Sort departments for consistent ordering
sorted_departments = sorted(department_max_counts.keys())

# Step 3: Pad each movie's crew list
padded_crew_ids = []
for crew_ids in crew_id_list:
    padded_ids = []
    for department in sorted_departments:
        ids = crew_ids.get(department, [])
        pad_size = department_max_counts[department] - len(ids)
        padded_ids.extend(ids + [0] * pad_size)  # Pad with zeros
    padded_crew_ids.append(padded_ids)

print(sorted_departments)
print(padded_crew_ids[:2])

print("Max length of padded crew IDs:", max(len(ids) for ids in padded_crew_ids))
print("Min length of padded crew IDs:", min(len(ids) for ids in padded_crew_ids))


['Actors', 'Art', 'Camera', 'Costume & Make-Up', 'Crew', 'Directing', 'Editing', 'Lighting', 'Production', 'Sound', 'Visual Effects', 'Writing']
[[0, 0, 0, 0, 0, 0, 7883, 7961, 1458006, 1748705, 1748710, 1443471, 1748711, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1629419, 1748557, 12890, 953331, 1468014, 15894, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [17]:
# Step 1: Determine the maximum length of cast_ids
max_cast_length = max(len(cast) for cast in cast_id_list)

# Step 2: Pad each cast_ids list with zeros to match the maximum length
padded_cast_ids = [cast + [0] * (max_cast_length - len(cast)) for cast in cast_id_list]

print(padded_cast_ids[:2])  # Display the first 2 padded cast_ids

print("Max length of padded_cast_ids:", max(len(cast) for cast in padded_cast_ids))
print("Min length of padded_cast_ids:", min(len(cast) for cast in padded_cast_ids))

[[31, 12898, 7167, 12899, 12900, 7907, 8873, 1116442, 12901, 12133, 8655, 12903, 37221, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2157, 8537

In [None]:
# Create a new DataFrame with the id column and cast_ids list
id_df = pd.DataFrame({'movie_id': df['id'], 'cast_ids': padded_cast_ids[:len(df)], 'crew_ids': padded_crew_ids[:len(df)]})
id_df.head()
# Save the DataFrame to a CSV file
id_df.to_csv('clean_data/credits_padded.csv', index=False)

In [85]:
movies = pd.read_csv('data/movies_metadata.csv')
display(movies.head())

  movies = pd.read_csv('data/movies_metadata.csv')


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [86]:
movies['genres'] = movies['genres'].apply(ast.literal_eval)
movies['production_companies'] = movies['production_companies'].dropna().apply(ast.literal_eval)

In [87]:
movies = movies[["id", "genres", "production_companies", "vote_average", "vote_count", "popularity", "revenue"]]

In [88]:
# filter out movies with less than 100 votes
# movies = movies[movies['revenue'] > 0]
display(movies.head())
# movies = movies[movies['production_companies'].notna()]
movies = movies[movies['vote_average'].notna()]
movies = movies[movies['popularity'].notna()]
movies = movies[movies['revenue'].notna()]

Unnamed: 0,id,genres,production_companies,vote_average,vote_count,popularity,revenue
0,862,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'name': 'Pixar Animation Studios', 'id': 3}]",7.7,5415.0,21.946943,373554033.0
1,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'name': 'TriStar Pictures', 'id': 559}, {'na...",6.9,2413.0,17.015539,262797249.0
2,15602,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'name': 'Warner Bros.', 'id': 6194}, {'name'...",6.5,92.0,11.7129,0.0
3,31357,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",[{'name': 'Twentieth Century Fox Film Corporat...,6.1,34.0,3.859495,81452156.0
4,11862,"[{'id': 35, 'name': 'Comedy'}]","[{'name': 'Sandollar Productions', 'id': 5842}...",5.7,173.0,8.387519,76578911.0


In [89]:
# Get max length for genres and production_companies
max_genres = movies['genres'].apply(lambda x: len(x)).max()
max_prods = movies['production_companies'].apply(lambda x: len(x)).max()

def extract_ids(entry):
    """Extract list of IDs from genre or production_company field"""
    if isinstance(entry, list):
        return [d['id'] for d in entry if isinstance(d, dict)]
    return []

def pad_list(lst, max_len):
    return lst + [0] * (max_len - len(lst))

# Apply extraction and padding
movies['genre_ids'] = movies['genres'].apply(extract_ids).apply(lambda x: pad_list(x, max_genres))
movies['prod_ids'] = movies['production_companies'].apply(extract_ids).apply(lambda x: pad_list(x, max_prods))

# Combine them
movies['combined_ids'] = movies.apply(lambda row: row['genre_ids'] + row['prod_ids'], axis=1)

# print max and min lengths of combined_ids
print("Max length of combined_ids:", movies['combined_ids'].apply(len).max())
print("Min length of combined_ids:", movies['combined_ids'].apply(len).min())


Max length of combined_ids: 34
Min length of combined_ids: 34


In [90]:
display(movies.head())

Unnamed: 0,id,genres,production_companies,vote_average,vote_count,popularity,revenue,genre_ids,prod_ids,combined_ids
0,862,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'name': 'Pixar Animation Studios', 'id': 3}]",7.7,5415.0,21.946943,373554033.0,"[16, 35, 10751, 0, 0, 0, 0, 0]","[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[16, 35, 10751, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, ..."
1,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'name': 'TriStar Pictures', 'id': 559}, {'na...",6.9,2413.0,17.015539,262797249.0,"[12, 14, 10751, 0, 0, 0, 0, 0]","[559, 2550, 10201, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[12, 14, 10751, 0, 0, 0, 0, 0, 559, 2550, 1020..."
2,15602,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'name': 'Warner Bros.', 'id': 6194}, {'name'...",6.5,92.0,11.7129,0.0,"[10749, 35, 0, 0, 0, 0, 0, 0]","[6194, 19464, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[10749, 35, 0, 0, 0, 0, 0, 0, 6194, 19464, 0, ..."
3,31357,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",[{'name': 'Twentieth Century Fox Film Corporat...,6.1,34.0,3.859495,81452156.0,"[35, 18, 10749, 0, 0, 0, 0, 0]","[306, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[35, 18, 10749, 0, 0, 0, 0, 0, 306, 0, 0, 0, 0..."
4,11862,"[{'id': 35, 'name': 'Comedy'}]","[{'name': 'Sandollar Productions', 'id': 5842}...",5.7,173.0,8.387519,76578911.0,"[35, 0, 0, 0, 0, 0, 0, 0]","[5842, 9195, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[35, 0, 0, 0, 0, 0, 0, 0, 5842, 9195, 0, 0, 0,..."


In [91]:
# Ensure both columns have the same data type
id_df['movie_id'] = id_df['movie_id'].astype(str)
movies['id'] = movies['id'].astype(str)

# Perform the join operation
merged_df = id_df.merge(movies[['id', 'vote_average', 'vote_count', 'combined_ids', 'popularity', 'revenue']], left_on='movie_id', right_on='id', how='inner')
merged_df = merged_df.drop(columns=['id'])

# Display the first few rows of the merged DataFrame
display(merged_df.head())

Unnamed: 0,movie_id,cast_ids,crew_ids,vote_average,vote_count,combined_ids,popularity,revenue
0,862,"[31, 12898, 7167, 12899, 12900, 7907, 8873, 11...","[0, 0, 0, 0, 0, 0, 7883, 7961, 1458006, 174870...",7.7,5415.0,"[16, 35, 10751, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, ...",21.946943,373554033.0
1,8844,"[2157, 8537, 205, 145151, 5149, 10739, 58563, ...","[0, 0, 0, 0, 0, 0, 9967, 0, 0, 0, 0, 0, 0, 0, ...",6.9,2413.0,"[12, 14, 10751, 0, 0, 0, 0, 0, 559, 2550, 1020...",17.015539,262797249.0
2,15602,"[6837, 3151, 13567, 16757, 589, 16523, 7166, 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6.5,92.0,"[10749, 35, 0, 0, 0, 0, 0, 0, 6194, 19464, 0, ...",11.7129,0.0
3,31357,"[8851, 9780, 18284, 51359, 66804, 352, 87118, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6.1,34.0,"[35, 18, 10749, 0, 0, 0, 0, 0, 306, 0, 0, 0, 0...",3.859495,81452156.0
4,11862,"[67773, 3092, 519, 70696, 59222, 18793, 14592,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5.7,173.0,"[35, 0, 0, 0, 0, 0, 0, 0, 5842, 9195, 0, 0, 0,...",8.387519,76578911.0


In [92]:
merged_df['final_ids'] = merged_df.apply(lambda row: row['cast_ids'] + row['crew_ids'] + row['combined_ids'], axis=1)
# Drop the original columns used for merging
# merged_df = merged_df.drop(columns=['cast_ids', 'crew_ids', 'combined_ids'])
display(merged_df.head())
# Save the final DataFrame to a CSV file
merged_df.to_csv('clean_data/embeddings.csv', index=False)

# print max and min lengths of final_ids
print("Max length of final_ids:", merged_df['final_ids'].apply(len).max())
print("Min length of final_ids:", merged_df['final_ids'].apply(len).min())

Unnamed: 0,movie_id,cast_ids,crew_ids,vote_average,vote_count,combined_ids,popularity,revenue,final_ids
0,862,"[31, 12898, 7167, 12899, 12900, 7907, 8873, 11...","[0, 0, 0, 0, 0, 0, 7883, 7961, 1458006, 174870...",7.7,5415.0,"[16, 35, 10751, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, ...",21.946943,373554033.0,"[31, 12898, 7167, 12899, 12900, 7907, 8873, 11..."
1,8844,"[2157, 8537, 205, 145151, 5149, 10739, 58563, ...","[0, 0, 0, 0, 0, 0, 9967, 0, 0, 0, 0, 0, 0, 0, ...",6.9,2413.0,"[12, 14, 10751, 0, 0, 0, 0, 0, 559, 2550, 1020...",17.015539,262797249.0,"[2157, 8537, 205, 145151, 5149, 10739, 58563, ..."
2,15602,"[6837, 3151, 13567, 16757, 589, 16523, 7166, 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6.5,92.0,"[10749, 35, 0, 0, 0, 0, 0, 0, 6194, 19464, 0, ...",11.7129,0.0,"[6837, 3151, 13567, 16757, 589, 16523, 7166, 0..."
3,31357,"[8851, 9780, 18284, 51359, 66804, 352, 87118, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6.1,34.0,"[35, 18, 10749, 0, 0, 0, 0, 0, 306, 0, 0, 0, 0...",3.859495,81452156.0,"[8851, 9780, 18284, 51359, 66804, 352, 87118, ..."
4,11862,"[67773, 3092, 519, 70696, 59222, 18793, 14592,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5.7,173.0,"[35, 0, 0, 0, 0, 0, 0, 0, 5842, 9195, 0, 0, 0,...",8.387519,76578911.0,"[67773, 3092, 519, 70696, 59222, 18793, 14592,..."


Max length of final_ids: 1018
Min length of final_ids: 1018


In [None]:
merged_df = pd.read_csv('clean_data/embeddings.csv')

max_length = merged_df['final_ids'].apply(len).max()
min_length = merged_df['final_ids'].apply(len).min()

print(f"Maximum length of final_ids: {max_length}")
print(f"Minimum length of final_ids: {min_length}")

Maximum length of final_ids: 5691
Minimum length of final_ids: 3054
