In [None]:
from imdb import Cinemagoer, IMDbDataAccessError
from tqdm import TqdmExperimentalWarning as TEW
import pandas as pd
import os
import warnings

# Register the filter before importing tqdm.autonotebook so that any
# TqdmExperimentalWarning raised by that import is suppressed.
warnings.filterwarnings(action='ignore', category=TEW)
from tqdm.autonotebook import tqdm

In [None]:
# Relative path of the JSON file that is loaded into a DataFrame with pd.read_json.
FILE_PATH = '../bechdel_data.json'

In [None]:
# Load the film table and renumber its rows 0..n-1. drop=True discards the
# previous index directly instead of materialising it as an 'index' column
# that then has to be removed again in a second, in-place step.
data = pd.read_json(FILE_PATH).reset_index(drop=True)
print(f'There are {len(data)} films')
data.head()

In [None]:
def _credit_names(movie, role):
    """Return the names listed under ``movie['data'][role]``.

    A missing ``'data'`` or ``role`` key yields an empty list. Entries that
    have no ``'name'`` (or are not subscriptable) are skipped, so a single
    empty entry does not discard the rest of the role.
    """
    try:
        people = movie['data'][role]
    except KeyError:
        return []
    names = []
    for person in people or ():
        try:
            names.append(person['name'])
        except (KeyError, TypeError):
            continue
    return names


def add_infos(df, start_index=0, end_index=2000):
    """Add director, writer and cast names to a positional slice of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``'imdbid'`` column; it is not modified.
    start_index, end_index : int
        Positional bounds (``iloc`` semantics, end exclusive) of the rows
        to process.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows with three extra columns,
        ``'directors'``, ``'writers'`` and ``'cast'``. Each cell holds the
        names for that film joined with newlines, or an empty string when
        the fetch raised ``IMDbDataAccessError`` or the role is absent.
    """
    # Slice first, then copy: only the needed rows are duplicated and the
    # result is an independent frame that is safe to add columns to.
    data = df.iloc[start_index:end_index, :].copy()

    movie_fetcher = Cinemagoer()
    # Output column -> key under movie['data'] holding that role's people.
    roles = {'directors': 'director', 'writers': 'writer', 'cast': 'cast'}
    collected = {column: [] for column in roles}

    for imdb_id in tqdm(data['imdbid'].to_list()):
        try:
            movie = movie_fetcher.get_movie_full_credits(imdb_id)
        except IMDbDataAccessError:
            # Best effort: a film that cannot be fetched gets empty credits.
            movie = {'data': {}}
        for column, role in roles.items():
            collected[column].append('\n'.join(_credit_names(movie, role)))

    # Values are attached by position (one entry per row, in row order), so
    # the result is correct whatever index labels ``df`` carries.
    for column, values in collected.items():
        data[column] = pd.Series(values, index=data.index, dtype=object)

    return data

In [None]:
# Fetch credits for the rows at positions 0..8474 and save them as CSV.
# NOTE(review): 8475 is presumably the number of films in the dataset -- confirm.
sub_df = add_infos(data, start_index=0, end_index=8475)
# exist_ok=True creates the directory in one call, with no separate
# existence check that could go stale before the directory is made.
os.makedirs('../data/', exist_ok=True)
sub_df.to_csv('../data/imdb-directors-writers-cast.csv')