In [1]:
# Which actors appear most frequently in high-rated movies? (This notebook uses vote_average > 7 as the cut-off.)

# Import Libraries
import requests
import json
import matplotlib as mp
import pandas as pd
import ast
from pprint import pprint
from config import tmdb_api_key
from config import omdb_api_key

# TMDB Setup 

# t_api_key = tmdb_api_key
# t_base_url = "https://api.themoviedb.org/3/discover/movie"

# Import The Movie Dataset CSVs into Dataframes
# All paths are relative to the notebook's working directory.
credits_df = pd.read_csv("Resources/credits.csv")
keywords_df = pd.read_csv("Resources/keywords.csv")
links_small_df = pd.read_csv("Resources/links_small.csv")
links_df = pd.read_csv("Resources/links.csv")
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning this
# read otherwise emits (the echoed line in the cell output looks like that warning).
movies_df = pd.read_csv("Resources/movies_metadata.csv", low_memory=False)
ratings_small_df = pd.read_csv("Resources/ratings_small.csv")
ratings_df = pd.read_csv("Resources/ratings.csv")



  movies_df = pd.read_csv("Resources/movies_metadata.csv")


In [2]:
# Columns removed from the movie metadata before the analysis
unneeded_movie_columns = [
    "adult",
    "belongs_to_collection",
    "budget",
    "genres",
    "homepage",
    "original_language",
    "overview",
    "popularity",
    "poster_path",
    "production_companies",
    "production_countries",
    "revenue",
    "runtime",
    "spoken_languages",
    "status",
    "tagline",
    "video",
]
clean_movies_df = movies_df.drop(columns=unneeded_movie_columns)

# Dropping the crew column from the credits (drop returns a new frame,
# so credits_df itself is left untouched)
clean_credits_df = credits_df.drop(columns=["crew"])

In [3]:
# Converting 'cast' column from strings to Python objects
def _parse_cast(value):
    """Return one 'cast' entry as a Python object.

    Values that are already lists (the cell has been run before) are
    returned unchanged so that re-running the cell does not raise; anything
    else is handed to ast.literal_eval exactly as before.
    """
    if isinstance(value, list):
        return value
    return ast.literal_eval(value)


clean_credits_df['cast'] = clean_credits_df['cast'].apply(_parse_cast)

In [4]:
# Setting the max number of actors per movie to 5
max_actors = 5

# Pull every cast member's name out of each movie's cast list
actor_names_df = clean_credits_df['cast'].apply(
    lambda cast_list: [member['name'] for member in cast_list]
)

# Truncate to max_actors and pad short lists with None so every row has
# exactly max_actors entries (a negative pad count yields an empty list)
actor_names_df = actor_names_df.apply(
    lambda names: names[:max_actors] + [None] * (max_actors - len(names))
)

# One column per actor slot, built directly on the credits index so the
# rows line up with clean_credits_df
actor_column_names = [f'actor_{i+1}' for i in range(max_actors)]
actor_columns = pd.DataFrame(
    actor_names_df.tolist(),
    index=clean_credits_df.index,
    columns=actor_column_names,
)

# Replace the raw cast column with the new actor columns
actor_results_df = pd.concat(
    [clean_credits_df.drop(columns=['cast']), actor_columns],
    axis=1,
)

# Show the first rows of the result
actor_results_df.head()


Unnamed: 0,id,actor_1,actor_2,actor_3,actor_4,actor_5
0,862,Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wallace Shawn
1,8844,Robin Williams,Jonathan Hyde,Kirsten Dunst,Bradley Pierce,Bonnie Hunt
2,15602,Walter Matthau,Jack Lemmon,Ann-Margret,Sophia Loren,Daryl Hannah
3,31357,Whitney Houston,Angela Bassett,Loretta Devine,Lela Rochon,Gregory Hines
4,11862,Steve Martin,Diane Keaton,Martin Short,Kimberly Williams-Paisley,George Newbern


In [20]:
# Cleaning up invalid entries
# Non-numeric ids are coerced to NaN and those rows are dropped; the id is
# then cast to int. Each step returns a new frame (assign/dropna/astype), so
# no column is assigned on a filtered frame (avoids SettingWithCopyWarning).
# Repeated ids are removed (first occurrence kept): any id that appears more
# than once in either frame would multiply rows in the merge below and
# inflate the per-actor appearance counts computed later.
clean_movies_df = (
    clean_movies_df
    .assign(id=pd.to_numeric(clean_movies_df['id'], errors='coerce'))
    .dropna(subset=['id'])
    .astype({'id': int})
    .drop_duplicates(subset='id')
)
actor_results_df = (
    actor_results_df
    .assign(id=pd.to_numeric(actor_results_df['id'], errors='coerce'))
    .dropna(subset=['id'])
    .astype({'id': int})
    .drop_duplicates(subset='id')
)

# Merge
# how='left' keeps every movie, including those with no matching credits row.
# validate raises MergeError if the id key is not unique on both sides.
merged_movie_df = pd.merge(
    clean_movies_df,
    actor_results_df,
    how='left',
    on='id',
    validate='one_to_one',
)
merged_movie_df.head()

Unnamed: 0,id,imdb_id,original_title,release_date,title,vote_average,vote_count,actor_1,actor_2,actor_3,actor_4,actor_5
0,862,tt0114709,Toy Story,1995-10-30,Toy Story,7.7,5415.0,Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wallace Shawn
1,8844,tt0113497,Jumanji,1995-12-15,Jumanji,6.9,2413.0,Robin Williams,Jonathan Hyde,Kirsten Dunst,Bradley Pierce,Bonnie Hunt
2,15602,tt0113228,Grumpier Old Men,1995-12-22,Grumpier Old Men,6.5,92.0,Walter Matthau,Jack Lemmon,Ann-Margret,Sophia Loren,Daryl Hannah
3,31357,tt0114885,Waiting to Exhale,1995-12-22,Waiting to Exhale,6.1,34.0,Whitney Houston,Angela Bassett,Loretta Devine,Lela Rochon,Gregory Hines
4,11862,tt0113041,Father of the Bride Part II,1995-02-10,Father of the Bride Part II,5.7,173.0,Steve Martin,Diane Keaton,Martin Short,Kimberly Williams-Paisley,George Newbern


In [21]:
# Putting the 5 actors into their own column with a repeating ID
# merged_movie_df is rebound to its long (melted) form, which later cells
# read through the 'actor' column. The guard makes the cell re-runnable:
# once 'actor' exists the actor_N columns are gone and a second melt would
# raise KeyError.
if 'actor' not in merged_movie_df.columns:
    merged_movie_df = merged_movie_df.melt(
        id_vars=['id', 'imdb_id', 'original_title', 'release_date', 'title', 'vote_average', 'vote_count'],
        value_vars=['actor_1', 'actor_2', 'actor_3', 'actor_4', 'actor_5'],
        value_name='actor',
    )

# Applying Vote Average Filter
filtered_merged_movie_df = merged_movie_df[merged_movie_df['vote_average'] > 7]
# value_counts skips missing actors (the None padding) by default
actor_appearance_counts = filtered_merged_movie_df['actor'].value_counts()

# Display the top 20 actors
# Columns are set positionally (actor, count) in a single step, without inplace
actor_appearance_counts_df = actor_appearance_counts.reset_index()
actor_appearance_counts_df.columns = ['Actor', '# of Appearances']
actor_appearance_counts_df.head(20)

Unnamed: 0,Actor,# of Appearances
0,Robert De Niro,26
1,Bette Davis,26
2,Toshirō Mifune,23
3,Tom Hanks,21
4,Christopher Plummer,20
5,James Stewart,20
6,Werner Herzog,20
7,Humphrey Bogart,19
8,Clint Eastwood,19
9,Catherine Deneuve,19


In [9]:
# Double checking our results: list the rows behind one actor's count
# (expects merged_movie_df in its melted form, i.e. with an 'actor' column)
is_bette_davis = merged_movie_df['actor'] == 'Bette Davis'
is_high_rated = merged_movie_df['vote_average'] > 7
filtered_actor = merged_movie_df.loc[is_bette_davis & is_high_rated]
filtered_actor

Unnamed: 0,id,imdb_id,original_title,release_date,title,vote_average,vote_count,variable,actor
901,705,tt0042192,All About Eve,1950-11-09,All About Eve,8.0,367.0,actor_1,Bette Davis
3429,10242,tt0056687,What Ever Happened to Baby Jane?,1962-10-31,What Ever Happened to Baby Jane?,7.9,210.0,actor_1,Bette Davis
4678,43802,tt0033836,The Little Foxes,1941-08-20,The Little Foxes,7.7,31.0,actor_1,Bette Davis
4797,32847,tt0035140,"Now, Voyager",1942-10-31,"Now, Voyager",7.1,30.0,actor_1,Bette Davis
6937,1976,tt0030287,Jezebel,1938-03-19,Jezebel,7.2,39.0,actor_1,Bette Davis
8406,17801,tt0032701,The Letter,1940-11-22,The Letter,7.4,42.0,actor_1,Bette Davis
8920,49815,tt0094315,The Whales of August,1987-10-14,The Whales of August,7.1,17.0,actor_1,Bette Davis
9758,43492,tt0037614,The Corn Is Green,1945-07-14,The Corn Is Green,7.3,9.0,actor_1,Bette Davis
10266,85783,tt0045186,The Star,1952-12-11,The Star,7.2,13.0,actor_1,Bette Davis
11544,43470,tt0038984,A Stolen Life,1946-07-06,A Stolen Life,7.6,8.0,actor_1,Bette Davis
