In [11]:
import pandas as pd

# Define the file path
file_path = "Resources/larger_movies_formatted.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Display the DataFrame
df['OMDB_Director'].head(30)

0                      Nikolai Izvolov, Dziga Vertov
1                                           Musidora
2                                  Eva López Sánchez
3                                      James Mangold
4                                    Michael Radford
5                                     Hans Steinhoff
6                                    Patricio Guzmán
7                                    Jürgen Böttcher
8                                       Egon Günther
9                                  Jerzy Skolimowski
10                      Raúl Ruiz, Valeria Sarmiento
11                                    Michael Roemer
12                                 Gavriil Egiazarov
13                                        Zdenek Tyc
14                                     Claude d'Anna
15                                     Chen-Kuo Chao
16                                        Godfrey Ho
17                  Cirio H. Santiago, Allan Holzman
18                                   Budd Boet

In [12]:
# Initialize an empty dictionary to store director instances
director_dict = {}

In [13]:
# Initialize a variable to keep track of the running sum of various columns for each director
running_sum = {'box_office': {}, 'production_budget': {}, 'metascore': {}, 'imdb_rating': {}}

In [14]:
# Iterate through the DataFrame rows
for index, row in df.iterrows():
    # Get director names from the 'OMDB_Director' column
    directors = row['OMDB_Director']
    
    # Check if the value is not NaN and is a string
    if pd.notna(directors) and isinstance(directors, str):
        # Split director names separated by commas
        director_list = [director.strip() for director in directors.split(',')]
        
        # Get the corresponding values for various columns
        box_office = row['OMDB_BoxOffice']
        metascore = row['OMDB_Metascore']
        imdb_rating = row['OMDB_imdbRating']
        
        # Iterate through director names and update the director_dict
        for director in director_list:
            if director in director_dict:
                # Increment the instance count if the director is already in the dictionary
                director_dict[director]['instances'] += 1
                
                # Add the values to the running sum for the director
                running_sum['box_office'][director] = running_sum['box_office'].get(director, 0) + box_office
                running_sum['metascore'][director] = running_sum['metascore'].get(director, 0) + metascore
                running_sum['imdb_rating'][director] = running_sum['imdb_rating'].get(director, 0) + imdb_rating
                
                # Update the total values in the director_dict
                director_dict[director]['total_box_office'] = running_sum['box_office'][director]
                director_dict[director]['total_metascore'] = running_sum['metascore'][director]
                director_dict[director]['total_imdb_rating'] = running_sum['imdb_rating'][director]
            else:
                # Add the director to the dictionary with an initial instance count of 1
                director_dict[director] = {
                    'instances': 1,
                    'total_box_office': box_office,
                    'total_metascore': metascore,
                    'total_imdb_rating': imdb_rating
                }
                
                # Initialize the running sum for the director
                running_sum['box_office'][director] = box_office
                running_sum['metascore'][director] = metascore
                running_sum['imdb_rating'][director] = imdb_rating


TypeError: can only concatenate str (not "float") to str

In [5]:
# Display the populated director dictionary
#print(director_dict)

In [6]:
# Initialize an empty dictionary to store writer instances and running totals
writer_dict = {}
# Initialize a variable to keep track of the running sum of various columns for each writer
running_sum_writers = {'box_office': {}, 'production_budget': {}, 'metascore': {}, 'imdb_rating': {}}

In [7]:
# Iterate through the DataFrame rows
for index, row in df.iterrows():
    # Get writer names from the 'OMDB_Writer' column
    writers = row['OMDB_Writer']
    
    # Check if the value is not NaN and is a string
    if pd.notna(writers) and isinstance(writers, str):
        # Split writer names separated by commas
        writer_list = [writer.strip() for writer in writers.split(',')]
        
        # Get the corresponding values for various columns
        box_office = row['OMDB_BoxOffice']
        
        # Check if 'OMDB_Metascore' column is present in the DataFrame
        if 'OMDB_Metascore' in df.columns:
            metascore = row['OMDB_Metascore']
        else:
            metascore = 0  # Set a default value if the column is not present
        
        # Check if 'OMDB_imdbRating' column is present in the DataFrame
        if 'OMDB_imdbRating' in df.columns:
            imdb_rating = row['OMDB_imdbRating']
        else:
            imdb_rating = 0  # Set a default value if the column is not present
        
        # Iterate through writer names and update the writer_dict
        for writer in writer_list:
            if writer in writer_dict:
                # Increment the instance count if the writer is already in the dictionary
                writer_dict[writer]['instances'] += 1
                
                # Add the values to the running sum for the writer
                running_sum_writers['box_office'][writer] = running_sum_writers['box_office'].get(writer, 0) + box_office
                running_sum_writers['production_budget'][writer] = running_sum_writers['production_budget'].get(writer, 0) + production_budget
                running_sum_writers['metascore'][writer] = running_sum_writers['metascore'].get(writer, 0) + metascore
                running_sum_writers['imdb_rating'][writer] = running_sum_writers['imdb_rating'].get(writer, 0) + imdb_rating
                
                # Update the total values in the writer_dict
                writer_dict[writer]['total_box_office'] = running_sum_writers['box_office'][writer]
                writer_dict[writer]['total_production_budget'] = running_sum_writers['production_budget'][writer]
                writer_dict[writer]['total_metascore'] = running_sum_writers['metascore'][writer]
                writer_dict[writer]['total_imdb_rating'] = running_sum_writers['imdb_rating'][writer]
            else:
                # Add the writer to the dictionary with an initial instance count of 1
                writer_dict[writer] = {
                    'instances': 1,
                    'total_box_office': box_office,
                    'total_production_budget': production_budget,
                    'total_metascore': metascore,
                    'total_imdb_rating': imdb_rating
                }
                
                # Initialize the running sum for the writer
                running_sum_writers['box_office'][writer] = box_office
                running_sum_writers['production_budget'][writer] = production_budget
                running_sum_writers['metascore'][writer] = metascore
                running_sum_writers['imdb_rating'][writer] = imdb_rating

In [8]:
# Display the populated writer dictionary
#print(writer_dict)

In [9]:
# Initialize an empty dictionary to store actor instances and running totals
actor_dict = {}

# Initialize a variable to keep track of the running sum of various columns for each actor
running_sum_actors = {'box_office': {}, 'production_budget': {}, 'metascore': {}, 'imdb_rating': {}}


In [10]:
# Iterate through the DataFrame rows
for index, row in df.iterrows():
    # Get actor names from the 'OMDB_Actors' column
    actors = row['OMDB_Actors']
    
    # Check if the value is not NaN and is a string
    if pd.notna(actors) and isinstance(actors, str):
        # Split actor names separated by commas
        actor_list = [actor.strip() for actor in actors.split(',')]
        
        # Get the corresponding values for various columns
        box_office = row['OMDB_BoxOffice']
        
        # Check if 'OMDB_Metascore' column is present in the DataFrame
        if 'OMDB_Metascore' in df.columns:
            metascore = row['OMDB_Metascore']
        else:
            metascore = 0  # Set a default value if the column is not present
        
        # Check if 'OMDB_imdbRating' column is present in the DataFrame
        if 'OMDB_imdbRating' in df.columns:
            imdb_rating = row['OMDB_imdbRating']
        else:
            imdb_rating = 0  # Set a default value if the column is not present
        
        # Iterate through actor names and update the actor_dict
        for actor in actor_list:
            if actor in actor_dict:
                # Increment the instance count if the actor is already in the dictionary
                actor_dict[actor]['instances'] += 1
                
                # Add the values to the running sum for the actor
                running_sum_actors['box_office'][actor] = running_sum_actors['box_office'].get(actor, 0) + box_office
                running_sum_actors['production_budget'][actor] = running_sum_actors['production_budget'].get(actor, 0) + production_budget
                running_sum_actors['metascore'][actor] = running_sum_actors['metascore'].get(actor, 0) + metascore
                running_sum_actors['imdb_rating'][actor] = running_sum_actors['imdb_rating'].get(actor, 0) + imdb_rating
                
                # Update the total values in the actor_dict
                actor_dict[actor]['total_box_office'] = running_sum_actors['box_office'][actor]
                actor_dict[actor]['total_production_budget'] = running_sum_actors['production_budget'][actor]
                actor_dict[actor]['total_metascore'] = running_sum_actors['metascore'][actor]
                actor_dict[actor]['total_imdb_rating'] = running_sum_actors['imdb_rating'][actor]
            else:
                # Add the actor to the dictionary with an initial instance count of 1
                actor_dict[actor] = {
                    'instances': 1,
                    'total_box_office': box_office,
                    'total_production_budget': production_budget,
                    'total_metascore': metascore,
                    'total_imdb_rating': imdb_rating
                }
                
                # Initialize the running sum for the actor
                running_sum_actors['box_office'][actor] = box_office
                running_sum_actors['production_budget'][actor] = production_budget
                running_sum_actors['metascore'][actor] = metascore
                running_sum_actors['imdb_rating'][actor] = imdb_rating


In [None]:
# Display the populated actor dictionary
#print(actor_dict)

In [12]:
import csv
import math  # Import math to handle nan
# Specify the file path for the CSV file
output_csv_path_director = 'Resources/larger_director_data_avg.csv'
output_csv_path_writer = 'Resources/larger_data_avg.csv'
output_csv_path_actor = 'Resources/larger_data_avg.csv'

In [13]:
# Define the column headers based on the structure of the dictionary
column_headers_director = ['Director', 'Instances', 'Total_Box_Office', 'Total_Production_Budget', 'Total_Metascore', 'Total_IMDB_Rating', 'Average_Box_Office', 'Average_IMDB_Rating', 'Success_Metric']
# Define the column headers based on the structure of the dictionary
column_headers_writer = ['Writer', 'Instances', 'Total_Box_Office', 'Total_Production_Budget', 'Total_Metascore', 'Total_IMDB_Rating', 'Average_Box_Office', 'Average_IMDB_Rating', 'Success_Metric']
# Define the column headers based on the structure of the dictionary
column_headers_actor = ['Actor', 'Instances', 'Total_Box_Office', 'Total_Production_Budget', 'Total_Metascore', 'Total_IMDB_Rating', 'Average_Box_Office', 'Average_IMDB_Rating', 'Success_Metric']


In [17]:
# Write director data to CSV
with open(output_csv_path_director, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=column_headers_director)
    
    # Write the header
    writer.writeheader()
    
    # Write the data
    for director, data in director_dict.items():
        writer.writerow({'Director': director, 'Instances': data['instances'], 
                         'Total_Box_Office': data['total_box_office'], 
                         'Total_Production_Budget': data['total_production_budget'], 
                         'Total_Metascore': data['total_metascore'], 
                         'Total_IMDB_Rating': data['total_imdb_rating'],
                         'Average_Box_Office': data['total_box_office']/data['instances'],
                         'Average_IMDB_Rating': data['total_imdb_rating']/data['instances'],
                         'Success_Metric': (data['total_imdb_rating']/data['instances'] + data['total_box_office']/data['instances']/50000000)
                        })


In [18]:
# Write actor data to CSV
with open(output_csv_path_actor, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=column_headers_actor)
    
    # Write the header
    writer.writeheader()
    
    # Write the data
    for actor, data in actor_dict.items():
        writer.writerow({'Actor': actor, 'Instances': data['instances'], 
                         'Total_Box_Office': data['total_box_office'], 
                         'Total_Production_Budget': data['total_production_budget'], 
                         'Total_Metascore': data['total_metascore'], 
                         'Total_IMDB_Rating': data['total_imdb_rating'],
                         'Average_Box_Office': data['total_box_office']/data['instances'],
                         'Average_IMDB_Rating': data['total_imdb_rating']/data['instances'],
                         'Success_Metric': (data['total_imdb_rating']/data['instances'] + data['total_box_office']/data['instances']/50000000)
                        })


In [19]:
# Write writer data to CSV
with open(output_csv_path_writer, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=column_headers_writer)
    
    # Write the header
    writer.writeheader()
    
    # Write the data
    for writer_value, data in writer_dict.items():
        writer.writerow({'Writer': writer_value, 'Instances': data['instances'], 
                         'Total_Box_Office': data['total_box_office'], 
                         'Total_Production_Budget': data['total_production_budget'], 
                         'Total_Metascore': data['total_metascore'], 
                         'Total_IMDB_Rating': data['total_imdb_rating'],
                         'Average_Box_Office': data['total_box_office']/data['instances'],
                         'Average_IMDB_Rating': data['total_imdb_rating']/data['instances'],
                         'Success_Metric': (data['total_imdb_rating']/data['instances'] + data['total_box_office']/data['instances']/50000000)
                        })
