# Load Dependencies

In [1]:
import pandas as pd
import time

## Create People Ratings

In [2]:
# Define the file path
file_path = "../Resources/IMDB_OMDB_processed.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Display the DataFrame
df.head(10)

Unnamed: 0.1,Unnamed: 0,tconst,averageRating,numVotes,isAdult,startYear,runtimeMinutes,genres,Movie,OMDB_Title,...,OMDB_Language,OMDB_Country,OMDB_Type,OMDB_Metascore,OMDB_imdbRating,OMDB_imdbVotes,OMDB_BoxOffice,OMDB_Production,ReleaseYear,ReleaseMonth
0,0,tt0013274,6.8,59,0,2021,94,Documentary,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,...,,Soviet Union,movie,,,58.0,,,2022.0,5.0
1,1,tt0015414,5.2,16,0,2000,60,0,La tierra de los toros,La tierra de los toros,...,,"Spain, France",movie,,6.6,16.0,,,,
2,2,tt0015724,6.1,27,0,1993,102,"Drama,Mystery,Romance",Dama de noche,Dama de noche,...,Spanish,Mexico,movie,,5.8,27.0,,,1993.0,3.0
3,3,tt0035423,6.4,87965,0,2001,118,"Comedy,Fantasy,Romance",Kate & Leopold,Kate & Leopold,...,"English, French",United States,movie,44.0,6.4,87977.0,47121859.0,,2001.0,12.0
4,4,tt0036606,6.5,345,0,1983,118,"Drama,War","Another Time, Another Place","Another Time, Another Place",...,"English, Italian",United Kingdom,movie,,6.5,338.0,,,1984.0,5.0
5,5,tt0038086,7.0,27,0,1993,0,Thriller,Shiva und die Galgenblume,Shiva und die Galgenblume,...,German,Germany,movie,,7.2,27.0,,,1993.0,11.0
6,6,tt0057461,4.5,23,0,1983,84,"Drama,Fantasy",La rosa de los vientos,La rosa de los vientos,...,Spanish,"Spain, Venezuela, Cuba",movie,,,22.0,,,1986.0,8.0
7,7,tt0059325,6.5,257,0,1990,100,"Drama,Romance",Born in '45,Born in '45,...,German,East Germany,movie,,6.5,256.0,,,1990.0,10.0
8,8,tt0059900,6.4,38,0,1990,78,"Drama,Fantasy","Wenn du groß bist, lieber Adam","Wenn du groß bist, lieber Adam",...,German,East Germany,movie,,6.8,38.0,,,1990.0,10.0
9,9,tt0062181,6.5,401,0,1981,80,Drama,Hands Up!,Rece do góry,...,Polish,Poland,movie,,6.5,404.0,,,1985.0,1.0


In [3]:
# Initialize an empty dictionary to store director instances
director_dict = {}

In [4]:
# Initialize a variable to keep track of the running sum of various columns for each director
running_sum = {'box_office': {}, 'production_budget': {}, 'metascore': {}, 'imdb_rating': {}}

In [5]:
# Grab start time
start_time = time.time()

# Iterate through the DataFrame rows
for index, row in df.iterrows(): 
    # Get director names from the 'OMDB_Director' column
    directors = row['OMDB_Director']
    
    # Check if the value is not NaN and is a string
    if pd.notna(directors) and isinstance(directors, str):
        # Split director names separated by commas
        director_list = [director.strip() for director in directors.split(',')]
        
        # Get the corresponding values for various columns (using direct IMDB pull bc more are available)
        box_office = row['OMDB_BoxOffice']
        metascore = row['OMDB_Metascore']
        imdb_rating = row['averageRating']
        
        # Iterate through director names and update the director_dict
        for director in director_list:
            if director in director_dict:
                # Increment the instance count if the director is already in the dictionary
                director_dict[director]['instances'] += 1
                
                # Add the values to the running sum for the director
                running_sum['box_office'][director] = running_sum['box_office'].get(director, 0) + box_office
                running_sum['metascore'][director] = running_sum['metascore'].get(director, 0) + metascore
                running_sum['imdb_rating'][director] = running_sum['imdb_rating'].get(director, 0) + imdb_rating
                
                # Update the total values in the director_dict
                director_dict[director]['total_box_office'] = running_sum['box_office'][director]
                director_dict[director]['total_metascore'] = running_sum['metascore'][director]
                director_dict[director]['total_imdb_rating'] = running_sum['imdb_rating'][director]
            else:
                # Add the director to the dictionary with an initial instance count of 1
                director_dict[director] = {
                    'instances': 1,
                    'total_box_office': box_office,
                    'total_metascore': metascore,
                    'total_imdb_rating': imdb_rating
                }
                
                # Initialize the running sum for the director
                running_sum['box_office'][director] = box_office
                running_sum['metascore'][director] = metascore
                running_sum['imdb_rating'][director] = imdb_rating

# Print time taken
print("--- %s seconds ---" % round((time.time() - start_time),2))

--- 8.43 seconds ---


In [6]:
# Display the populated director dictionary
#print(director_dict)

In [7]:
# Initialize an empty dictionary to store writer instances and running totals
writer_dict = {}
# Initialize a variable to keep track of the running sum of various columns for each writer
running_sum_writers = {'box_office': {}, 'production_budget': {}, 'metascore': {}, 'imdb_rating': {}}

In [8]:
# Grab start time
start_time = time.time()

# Iterate through the DataFrame rows
for index, row in df.iterrows():
    # Get writer names from the 'OMDB_Writer' column
    writers = row['OMDB_Writer']
    
    # Check if the value is not NaN and is a string
    if pd.notna(writers) and isinstance(writers, str):
        # Split writer names separated by commas
        writer_list = [writer.strip() for writer in writers.split(',')]
        
        # Get the corresponding values for various columns
        box_office = row['OMDB_BoxOffice']
        
        # Check if 'OMDB_Metascore' column is present in the DataFrame
        if 'OMDB_Metascore' in df.columns:
            metascore = row['OMDB_Metascore']
        else:
            metascore = 0  # Set a default value if the column is not present
        
        # Check if 'averageRating' column is present in the DataFrame
        if 'averageRating' in df.columns:
            imdb_rating = row['averageRating']
        else:
            imdb_rating = 0  # Set a default value if the column is not present
        
        # Iterate through writer names and update the writer_dict
        for writer in writer_list:
            if writer in writer_dict:
                # Increment the instance count if the writer is already in the dictionary
                writer_dict[writer]['instances'] += 1
                
                # Add the values to the running sum for the writer
                running_sum_writers['box_office'][writer] = running_sum_writers['box_office'].get(writer, 0) + box_office
                running_sum_writers['metascore'][writer] = running_sum_writers['metascore'].get(writer, 0) + metascore
                running_sum_writers['imdb_rating'][writer] = running_sum_writers['imdb_rating'].get(writer, 0) + imdb_rating
                
                # Update the total values in the writer_dict
                writer_dict[writer]['total_box_office'] = running_sum_writers['box_office'][writer]
                writer_dict[writer]['total_metascore'] = running_sum_writers['metascore'][writer]
                writer_dict[writer]['total_imdb_rating'] = running_sum_writers['imdb_rating'][writer]
            else:
                # Add the writer to the dictionary with an initial instance count of 1
                writer_dict[writer] = {
                    'instances': 1,
                    'total_box_office': box_office,
                    'total_metascore': metascore,
                    'total_imdb_rating': imdb_rating
                }
                
                # Initialize the running sum for the writer
                running_sum_writers['box_office'][writer] = box_office
                running_sum_writers['metascore'][writer] = metascore
                running_sum_writers['imdb_rating'][writer] = imdb_rating

# Print time taken
print("--- %s seconds ---" % round((time.time() - start_time),2))

--- 8.73 seconds ---


In [9]:
# Display the populated writer dictionary
#print(writer_dict)

In [10]:
# Initialize an empty dictionary to store actor instances and running totals
actor_dict = {}

# Initialize a variable to keep track of the running sum of various columns for each actor
running_sum_actors = {'box_office': {}, 'production_budget': {}, 'metascore': {}, 'imdb_rating': {}}


In [11]:
# Grab start time
start_time = time.time()

# Iterate through the DataFrame rows
for index, row in df.iterrows():
    # Get actor names from the 'OMDB_Actors' column
    actors = row['OMDB_Actors']
    
    # Check if the value is not NaN and is a string
    if pd.notna(actors) and isinstance(actors, str):
        # Split actor names separated by commas
        actor_list = [actor.strip() for actor in actors.split(',')]
        
        # Get the corresponding values for various columns
        box_office = row['OMDB_BoxOffice']
        
        # Check if 'OMDB_Metascore' column is present in the DataFrame
        if 'OMDB_Metascore' in df.columns:
            metascore = row['OMDB_Metascore']
        else:
            metascore = 0  # Set a default value if the column is not present
        
        # Check if 'averageRating' column is present in the DataFrame
        if 'averageRating' in df.columns:
            imdb_rating = row['averageRating']
        else:
            imdb_rating = 0  # Set a default value if the column is not present
        
        # Iterate through actor names and update the actor_dict
        for actor in actor_list:
            if actor in actor_dict:
                # Increment the instance count if the actor is already in the dictionary
                actor_dict[actor]['instances'] += 1
                
                # Add the values to the running sum for the actor
                running_sum_actors['box_office'][actor] = running_sum_actors['box_office'].get(actor, 0) + box_office
                running_sum_actors['metascore'][actor] = running_sum_actors['metascore'].get(actor, 0) + metascore
                running_sum_actors['imdb_rating'][actor] = running_sum_actors['imdb_rating'].get(actor, 0) + imdb_rating
                
                # Update the total values in the actor_dict
                actor_dict[actor]['total_box_office'] = running_sum_actors['box_office'][actor]
                actor_dict[actor]['total_metascore'] = running_sum_actors['metascore'][actor]
                actor_dict[actor]['total_imdb_rating'] = running_sum_actors['imdb_rating'][actor]
            else:
                # Add the actor to the dictionary with an initial instance count of 1
                actor_dict[actor] = {
                    'instances': 1,
                    'total_box_office': box_office,
                    'total_metascore': metascore,
                    'total_imdb_rating': imdb_rating
                }
                
                # Initialize the running sum for the actor
                running_sum_actors['box_office'][actor] = box_office
                running_sum_actors['metascore'][actor] = metascore
                running_sum_actors['imdb_rating'][actor] = imdb_rating


In [12]:
# Display the populated actor dictionary
#print(actor_dict)

In [13]:
import csv
import math  # Import math to handle nan
# Specify the file path for the CSV file
output_csv_path_director = '../Resources/IMDB_OMDB_director_data.csv'
output_csv_path_writer = '../Resources/IMDB_OMDB_writer_data.csv'
output_csv_path_actor = '../Resources/IMDB_OMDB_actor_data.csv'

In [14]:
# Define the column headers based on the structure of the dictionary
column_headers_director = ['Director', 'Instances', 'Total_Box_Office', 'Total_Production_Budget', 'Total_Metascore', 'Total_IMDB_Rating', 'Average_Box_Office', 'Average_IMDB_Rating', 'Success_Metric']
# Define the column headers based on the structure of the dictionary
column_headers_writer = ['Writer', 'Instances', 'Total_Box_Office', 'Total_Production_Budget', 'Total_Metascore', 'Total_IMDB_Rating', 'Average_Box_Office', 'Average_IMDB_Rating', 'Success_Metric']
# Define the column headers based on the structure of the dictionary
column_headers_actor = ['Actor', 'Instances', 'Total_Box_Office', 'Total_Production_Budget', 'Total_Metascore', 'Total_IMDB_Rating', 'Average_Box_Office', 'Average_IMDB_Rating', 'Success_Metric']


In [15]:
# Write director data to CSV
with open(output_csv_path_director, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=column_headers_director)
    
    # Write the header
    writer.writeheader()
    
    # Write the data
    for director, data in director_dict.items():
        writer.writerow({'Director': director, 'Instances': data['instances'], 
                         'Total_Box_Office': data['total_box_office'], 
                         'Total_Metascore': data['total_metascore'], 
                         'Total_IMDB_Rating': data['total_imdb_rating'],
                         'Average_Box_Office': data['total_box_office']/data['instances'],
                         'Average_IMDB_Rating': data['total_imdb_rating']/data['instances'],
                         'Success_Metric': (data['total_imdb_rating']/data['instances'] + data['total_box_office']/data['instances']/50000000)
                        })

# Print time taken
print("--- %s seconds ---" % round((time.time() - start_time),2))

--- 10.31 seconds ---


In [16]:
# Write actor data to CSV
with open(output_csv_path_actor, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=column_headers_actor)
    
    # Write the header
    writer.writeheader()
    
    # Write the data
    for actor, data in actor_dict.items():
        writer.writerow({'Actor': actor, 'Instances': data['instances'], 
                         'Total_Box_Office': data['total_box_office'], 
                         'Total_Metascore': data['total_metascore'], 
                         'Total_IMDB_Rating': data['total_imdb_rating'],
                         'Average_Box_Office': data['total_box_office']/data['instances'],
                         'Average_IMDB_Rating': data['total_imdb_rating']/data['instances'],
                         'Success_Metric': (data['total_imdb_rating']/data['instances'] + data['total_box_office']/data['instances']/50000000)
                        })


In [17]:
# Write writer data to CSV
with open(output_csv_path_writer, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=column_headers_writer)
    
    # Write the header
    writer.writeheader()
    
    # Write the data
    for writer_value, data in writer_dict.items():
        writer.writerow({'Writer': writer_value, 'Instances': data['instances'], 
                         'Total_Box_Office': data['total_box_office'], 
                         'Total_Metascore': data['total_metascore'], 
                         'Total_IMDB_Rating': data['total_imdb_rating'],
                         'Average_Box_Office': data['total_box_office']/data['instances'],
                         'Average_IMDB_Rating': data['total_imdb_rating']/data['instances'],
                         'Success_Metric': (data['total_imdb_rating']/data['instances'] + data['total_box_office']/data['instances']/50000000)
                        })
