# Top Directors Data Processing

This notebook contains the steps used to aggregate [top_directors_data.csv](https://github.com/the-pudding/data/tree/master/film-or-digital/top_directors_data.csv) into the datasets visualized in The Pudding essay [Film or Digital: Breaking Down Hollywood's Choice of Shooting Medium](https://pudding.cool/2018/08/film-or-digital/).

In [None]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from IPython.core.display import display

In [None]:
#load the top-directors dataset; the path is relative to the current working directory
dataset = pd.read_csv("top_directors_data.csv")

In [None]:
#drop movies with unknown genre, then drop anything tagged Short, Animation, or Documentary
dataset = (
    dataset
    .dropna(subset=["genres"])
    # the pattern is a regex alternation: a row is excluded if any of the three words appears in its genres
    .loc[lambda frame: ~frame["genres"].str.contains("Short|Animation|Documentary")]
)

In [None]:
#rename film type "D|F" to "B"
#"B" maps to itself so that re-running this cell keeps already-renamed values:
#Series.map turns every value that is missing from the dict into NaN.
film_type = {'D': 'D', 'D|F': 'B', 'B': 'B', 'F': 'F', 'U':'U'}
dataset["film_type"]=dataset["film_type"].map(film_type)

In [None]:
#expand film type to new 0/1 indicator columns
film_type_columns = ["film_type_D","film_type_B","film_type_F","film_type_U"]
dummies = (
    pd.get_dummies(dataset["film_type"], prefix="film_type")
    # guarantee all four columns exist even when a film type never occurs in the data
    .reindex(columns=film_type_columns, fill_value=0)
    # get_dummies returns uint8 or bool depending on the pandas version; normalise to plain integers
    .astype(int)
)
# assign() overwrites columns of the same name, so re-running this cell does not duplicate them
dataset = dataset.assign(**{column: dummies[column] for column in film_type_columns})

In [None]:
#check samples: preview the first rows of the filtered dataset with the film_type_* columns added above
dataset.head()

------
## Prepare helper functions to map and format values later

In [None]:
#Function to map production year to a three-year period.
def calculate_period(series):
    """Label each production year with the three-year period it falls in.

    Parameters
    ----------
    series : pandas.Series
        Production years (any object exposing ``.between`` works).

    Returns
    -------
    numpy.ndarray
        One string label per input value: "2006 - 2008", "2009 - 2011",
        "2012 - 2014" or "2015 - 2017". Values that fall in none of the
        periods are labelled "0".
    """
    #period 1: 2006-2008, period 2: 2009-2011, etc. Both bounds are inclusive.
    bounds = [(2006, 2008), (2009, 2011), (2012, 2014), (2015, 2017)]

    condition = [series.between(first_year, last_year) for first_year, last_year in bounds]
    category = ["%d - %d" % (first_year, last_year) for first_year, last_year in bounds]

    # np.select defaults to the integer 0 for unmatched rows; newer NumPy releases refuse
    # to mix that with string labels and raise TypeError. Passing the string "0" keeps
    # the label older NumPy produced and works on every version.
    return np.select(condition, category, default="0")

In [None]:
# function to format director's name. Last name, First name -> First name Last name
def format_name(name):
    """Reorder a "Last, First" name into "First Last".

    The name is split on commas. Without a comma the name is returned with
    leading whitespace removed. Otherwise the second piece (leading
    whitespace removed) is placed before the first piece; any pieces after
    the second comma are ignored.
    """
    pieces = name.split(",")
    if len(pieces) <= 1:
        # no comma: nothing to reorder
        return pieces[0].lstrip()
    last_name, first_name = pieces[0], pieces[1]
    return first_name.lstrip() + " " + last_name

In [None]:
# function to categorize directors' choice of medium based on all movies they produced within a period
def define_verdict(series):
    """Reduce per-medium movie counts to a single verdict letter.

    ``series`` must provide the counts "film_type_D", "film_type_B" and
    "film_type_F". The score is (D + 0.5 * B) divided by the number of
    movies with a known medium (D + B + F).

    Returns "U" when no movie has a known medium, "F" when the score is 0,
    "D" when the score is 1, and "B" for anything in between.
    """
    known_total = series[["film_type_D","film_type_B","film_type_F"]].sum()
    if known_total == 0:
        return "U"

    digital_score = (series["film_type_D"]+(0.5*series["film_type_B"]))/known_total
    if digital_score == 0:
        return "F"
    if digital_score == 1:
        return "D"
    return "B"

------
## Check Directors' Choice of Medium per Period

In [None]:
#select relevant columns for this analysis
#.copy() gives movie_list its own data: columns are added to it in later cells, and writing
#to a selection of dataset would otherwise trigger pandas' SettingWithCopyWarning.
movie_list=dataset[["production_year","director","director_id","title","film_type","film_type_D","film_type_B","film_type_F","film_type_U"]].copy()

In [None]:
#categorize production year to period.
#calculate_period works on the whole column at once and returns one label per row
movie_list["period"] = calculate_period(movie_list["production_year"])

In [None]:
#concatenate production year, title, and film type into one JSON-object string.
#This is used in the essay for the tooltip.
def concate_list(movie_data):
    """Serialize one movie as a JSON object string.

    ``movie_data`` is any mapping-like row providing "production_year",
    "title" and "film_type". The result has the form
    ``{"year": <year>, "title": "<title>", "film_type": "<type>"}``.

    Title and film type are converted with ``str`` and encoded by
    ``json.dumps`` so that double quotes, backslashes and control characters
    are escaped instead of producing broken JSON. ``ensure_ascii=False``
    keeps non-ASCII characters as they are.
    """
    return '{"year": %s, "title": %s, "film_type": %s}' % (
        movie_data["production_year"],
        json.dumps(str(movie_data["title"]), ensure_ascii=False),
        json.dumps(str(movie_data["film_type"]), ensure_ascii=False),
    )

#one JSON snippet per movie, built row by row from the three tooltip fields
movie_list["movie_list"] = movie_list[["production_year","title","film_type"]].apply(concate_list, axis=1)

#Last name, First name -> First name Last name
movie_list["director"] = movie_list["director"].map(format_name)

In [None]:
#group directors by period, count total movies made each period and the medium.
directors_medium = (
    movie_list
    .groupby(["period", "director", "director_id"])
    .agg({
        "title": "count",            # number of movies in the period
        "movie_list": ", ".join,     # tooltip snippets joined into one string
        # per-medium movie counts
        **dict.fromkeys(["film_type_D", "film_type_B", "film_type_F", "film_type_U"], "sum"),
    })
    .reset_index()
)

In [None]:
#summarize directors' choice of medium within a time period.
#the "title" column holds the movie count after the aggregation, so give it a clearer name
directors_medium = directors_medium.rename(columns={'title': 'count_title'})

#one verdict per (period, director) row, derived from the per-medium counts
directors_medium["verdict"] = (
    directors_medium[["film_type_D","film_type_B","film_type_F","film_type_U"]]
    .apply(define_verdict, axis=1)
)

In [None]:
#sort by period, verdict and director, use those three as the index, and keep the remaining display columns
directors_nodes = (
    directors_medium
    .sort_values(by=["period", "verdict", "director"])
    .set_index(["period", "verdict", "director"])
    .loc[:, ["director_id", "count_title", "movie_list"]]
)

In [None]:
#display result
#max_colwidth=None shows cell contents untruncated (pandas >= 1.0); the former -1 sentinel
#was deprecated in pandas 1.0 and is rejected by newer releases.
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):
    display(directors_nodes)

------
## Check How Directors' Choice of Medium Changes Over Time

In [None]:
#Pivot the data to see the 'path' followed by a director over time:
#one row per director, one column per period, each cell holding that period's verdict
director_path = (
    directors_medium
    .pivot_table(
        index=["director", "director_id"],
        columns="period",
        values="verdict",
        aggfunc=lambda verdicts: " ".join(verdicts),
    )
    #if a director didn't make any movie in a period, set the value of the period as 'U'
    .fillna("U")
    #move director_id out of the index into a regular column
    .reset_index(level="director_id")
)

#check some samples
director_path.head(5)

In [None]:
#Function to fill the gaps between periods, by replacing a period with unknown medium ("U")
#with the medium used in the previous period.
#For example if the path is "F","U","D","D", then the medium in the second period is assumed to be the same as in the first period.
#The final path will be "F","F","D","D".
#In this case, the director is assumed to switch to digital in the third period (2012-2014)
def fill_path(series):
    """Return a copy of a director's row with gaps in the period path filled.

    ``series`` must contain the four period columns "2006 - 2008",
    "2009 - 2011", "2012 - 2014" and "2015 - 2017". Other entries (such as
    "director_id") are carried over untouched and do not take part in the
    unknown count.

    Nothing is filled unless one or two of the four periods are "U". Then:
    - "2009 - 2011" takes the value of "2006 - 2008" when it is "U";
    - "2012 - 2014" takes the (possibly just filled) value of "2009 - 2011"
      when it is "U" and "2015 - 2017" is not "U".

    The input row is not modified.
    """
    first, second, third, fourth = periods = [
        "2006 - 2008", "2009 - 2011", "2012 - 2014", "2015 - 2017",
    ]

    # Work on a copy: pandas does not support functions that mutate the object handed to apply().
    filled = series.copy()

    # Count unknowns among the period columns only, with an exact match. A substring search
    # over the whole row would also count non-period entries containing the letter "U".
    unknown_periods = int((filled[periods] == "U").sum())

    if 0 < unknown_periods < 3:
        if filled[second] == "U":
            filled[second] = filled[first]
        if filled[third] == "U" and filled[fourth] != "U":
            filled[third] = filled[second]
    return filled

In [None]:
#fill the gaps in every director's path, one row at a time
director_path = director_path.apply(fill_path, axis=1)

In [None]:
# concat the path into one column
def concat_path(series):
    """Join the four period values into one string of double-quoted, comma-separated items.

    ``series`` must provide "2006 - 2008", "2009 - 2011", "2012 - 2014" and
    "2015 - 2017"; the values appear in that order, e.g. ``"F","F","D","D"``.
    """
    period_order = ("2006 - 2008", "2009 - 2011", "2012 - 2014", "2015 - 2017")
    return ",".join('"%s"' % series[period] for period in period_order)

#store each director's full path as a single string column
director_path["path"] = director_path.apply(concat_path, axis=1)

In [None]:
#display result
#max_colwidth=None shows cell contents untruncated (pandas >= 1.0); the former -1 sentinel
#was deprecated in pandas 1.0 and is rejected by newer releases.
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):
    display(director_path)