In [19]:
import pandas as pd

# Define the file path
file_path = "Resources/movies_formatted.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Display the DataFrame
df['OMDB_Director'].head(30)

0                  James Cameron
1       Anthony Russo, Joe Russo
2                   Rob Marshall
3                    Joss Whedon
4                Louis Leterrier
5       Anthony Russo, Joe Russo
6                 Gore Verbinski
7                    Zack Snyder
8                  James Mangold
9          Christopher McQuarrie
10              Nikolai Kurbatov
11                    Ron Howard
12                   Nia DaCosta
13                Andrew Stanton
14                   Zack Snyder
15     Roger Allers, Rob Minkoff
16    Nathan Greno, Byron Howard
17                     Sam Raimi
18                  Ryan Coogler
19      Anthony Russo, Joe Russo
20                    James Gunn
21                 Taika Waititi
22                   David Yates
23     Ron Clements, John Musker
24                 Peter Jackson
25                 Peter Jackson
26                  F. Gary Gray
27            Cary Joji Fukunaga
28                    Sam Mendes
29                 James Cameron
Name: OMDB

In [20]:
# Initialize an empty dictionary to store director instances
director_dict = {}

In [21]:
# Initialize a variable to keep track of the running sum of box office and production budget for each director
running_sum = {'box_office': {}, 'production_budget': {}}

In [22]:
# Iterate through the DataFrame rows
for index, row in df.iterrows():
    # Get director names from the 'OMDB_Director' column
    directors = row['OMDB_Director']
    
    # Check if the value is not NaN and is a string
    if pd.notna(directors) and isinstance(directors, str):
        # Split director names separated by commas
        director_list = [director.strip() for director in directors.split(',')]
        
        # Get the corresponding box office and production budget values
        box_office = row['OMDB_BoxOffice']
        production_budget = row['ProductionBudget']
        
        # Iterate through director names and update the director_dict
        for director in director_list:
            if director in director_dict:
                # Increment the instance count if the director is already in the dictionary
                director_dict[director]['instances'] += 1
                
                # Add the box office and production budget values to the running sum for the director
                running_sum['box_office'][director] = running_sum['box_office'].get(director, 0) + box_office
                running_sum['production_budget'][director] = running_sum['production_budget'].get(director, 0) + production_budget
                
                # Update the total box office and production budget values in the director_dict
                director_dict[director]['total_box_office'] = running_sum['box_office'][director]
                director_dict[director]['total_production_budget'] = running_sum['production_budget'][director]
            else:
                # Add the director to the dictionary with an initial instance count of 1
                director_dict[director] = {
                    'instances': 1,
                    'total_box_office': box_office,
                    'total_production_budget': production_budget
                }
                
                # Initialize the running sum for the director
                running_sum['box_office'][director] = box_office
                running_sum['production_budget'][director] = production_budget

In [18]:
# Display the populated director dictionary
print(director_dict)

{'James Cameron': {'instances': 8, 'total_box_office': 2673746084.0}, 'Anthony Russo': {'instances': 8, 'total_box_office': nan}, 'Joe Russo': {'instances': 8, 'total_box_office': nan}, 'Rob Marshall': {'instances': 6, 'total_box_office': 788887603.0}, 'Joss Whedon': {'instances': 5, 'total_box_office': 1756750722.0}, 'Louis Leterrier': {'instances': 7, 'total_box_office': 757092622.0}, 'Gore Verbinski': {'instances': 7, 'total_box_office': 1153971900.0}, 'Zack Snyder': {'instances': 10, 'total_box_office': nan}, 'James Mangold': {'instances': 9, 'total_box_office': 908798110.0}, 'Christopher McQuarrie': {'instances': 5, 'total_box_office': 673463261.0}, 'Nikolai Kurbatov': {'instances': 1, 'total_box_office': nan}, 'Ron Howard': {'instances': 18, 'total_box_office': nan}, 'Nia DaCosta': {'instances': 1, 'total_box_office': nan}, 'Andrew Stanton': {'instances': 5, 'total_box_office': 1326823651.0}, 'Roger Allers': {'instances': 4, 'total_box_office': 931398302.0}, 'Rob Minkoff': {'inst