# Looking at gender in German movies

Here we can use file globbing to restrict the data size. This demo works well with a single CPU (each cell executes in a few seconds).

We will look at the gender of the directors of the movies shown in Germany.

In [None]:
%%time

import os
import sys

sys.path.append('..')

from movies_dask_bag.movie_reader import TheatersReader, MoviesReader, ShowingsReader

# Use the directory named by SLURM_TMPDIR when that variable is set,
# otherwise fall back to the current directory.
work_dir = os.getenv('SLURM_TMPDIR', '.')
data_dir = '{}/json'.format(work_dir)

# The trailing "deu" component of the glob restricts the data to Germany.
file_pattern = '{}/*/deu'.format(data_dir)

# Both readers are built from the same glob pattern.
showings_reader = ShowingsReader(file_pattern)
movies_reader = MoviesReader(file_pattern)

In [None]:
# Peek at one item from the showings reader to see what a record looks like.
showings_reader.take(1)

In [None]:
# Peek at one item from the movies reader to see what a record looks like.
movies_reader.take(1)

It's important to notice that **not all films have a director**, so we'll need to take care here. An example of such a film:

In [None]:
# A movie counts as director-less when its 'director' entry is missing or
# falsy; show one such record.
(
    movies_reader.bag
    .filter(lambda movie: not movie.get('director'))
    .take(1)
)

We can create a list of all of the first names of the directors. Also important to note: some films have multiple directors.

In [None]:
def movie_director_first_names(movie):
    """Return the first name of every director credited on ``movie``.

    Args:
        movie: Mapping describing a movie. Its optional ``'director'`` entry
            may be a single name string or a list of name strings.

    Returns:
        A list holding, for each usable director name, its first
        whitespace-separated token (a single-word name is returned as that
        one word). The list is empty when the movie has no director.
    """
    directors = movie.get('director')
    if not directors:
        # Key missing, or present but None / '' / [].
        return []
    if not isinstance(directors, (list, tuple)):
        # A single director is stored as a bare value; treat it as a list.
        directors = [directors]

    first_names = []
    for director in directors:
        # Skip non-string entries (e.g. None) instead of failing on .split().
        if not isinstance(director, str):
            continue
        names = director.split()
        # Blank names split to an empty list and contribute nothing.
        if names:
            first_names.append(names[0])
    return first_names

# Map every movie to its directors' first names and drop the movies for
# which that list is empty.
director_first_names = (
    movies_reader.bag
    .map(movie_director_first_names)
    .filter(lambda names: len(names) > 0)
    .compute()
)
director_first_names

# A naive approach to identifying the gender of the director

We'll try to identify the gender of the director by keeping a list of common female first names (this list is intentionally kept small due to time constraints)

In [None]:
# Deliberately small sample of female first names used for the naive match.
FEMALE_FIRST_NAMES = [
    'Jill',
    'Caroline',
    'Annika',
    'Marie',
    'Sabine',
]

def has_female_director(movie):
    """Tell whether any director of ``movie`` has a listed female first name.

    Args:
        movie: Movie record passed on to ``movie_director_first_names``.

    Returns:
        True if at least one director's first name appears in
        ``FEMALE_FIRST_NAMES``, otherwise False.
    """
    return any(
        name in FEMALE_FIRST_NAMES
        for name in movie_director_first_names(movie)
    )

# Keep one record per movie_id among the movies matched by
# has_female_director, then count them.
count = (
    movies_reader.bag
    .filter(has_female_director)
    .distinct('movie_id')
    .count()
    .compute()
)
print("{} Movies with a female director".format(count))

def movie_summary(movie):
    """Return ``[movie_id, title, director]`` taken from ``movie``.

    Args:
        movie: Mapping that contains the three keys listed above.

    Returns:
        A three-element list with the values in that order.

    Raises:
        KeyError: If one of the keys is missing from ``movie``.
    """
    return [movie[field] for field in ('movie_id', 'title', 'director')]

# List the ID, title and director of each distinct matching movie.
(
    movies_reader.bag
    .filter(has_female_director)
    .distinct('movie_id')
    .map(movie_summary)
    .compute()
)

Pluck out those movie IDs to look at the screening data ...

In [None]:
# Collect the movie_id of each distinct movie matched by has_female_director.
movie_ids = (
    movies_reader.bag
    .filter(has_female_director)
    .distinct('movie_id')
    .map(lambda movie: movie['movie_id'])
    .compute()
)
movie_ids

Create bins for movie screenings based on the `movie_id` and `date_stamp`.

In [None]:
%%time

# Hashing scheme for the bins: "movie_id||date_stamp"

def movie_hash(movie):
    """Build the bin key ``"<movie_id>||<date_stamp>"`` for one record.

    Args:
        movie: Mapping with ``'movie_id'`` and ``'date_stamp'`` entries.

    Returns:
        The two values formatted into one string, separated by ``||``.
    """
    movie_id = movie['movie_id']
    date_stamp = movie['date_stamp']
    return "{}||{}".format(movie_id, date_stamp)
                                     
# Membership is tested once per showing, so build a set for constant-time
# lookups instead of scanning the movie_ids list every time.
movie_id_set = set(movie_ids)

# Count how many showings fall into each "movie_id||date_stamp" bin,
# considering only the selected movies.
frequencies = (
    showings_reader.bag
    .filter(lambda showing: showing['movie_id'] in movie_id_set)
    .map(movie_hash)
    .frequencies()
    .compute()
)
frequencies

Break apart those hashes to get the date stamps for our data frame columns ...

In [None]:
# The date stamp is the second part of each "movie_id||date_stamp" key;
# the set comprehension removes duplicates.
date_stamps = list({entry[0].split('||')[1] for entry in frequencies})
columns = sorted(date_stamps)
columns

Now loop through the data and populate the data frame...

In [None]:
import pandas as pd
import numpy as np

# Start with one column per date stamp and no rows; a row is added the first
# time a movie is assigned to.
df = pd.DataFrame(columns=columns, dtype=np.int64)

for key, showing_count in frequencies:
    # Each key has the form "movie_id||date_stamp", so the first part is the
    # movie ID (not a country).
    # NOTE(review): the row label is therefore the string form of the ID;
    # confirm that matches how rows are looked up later.
    movie_id, date_stamp = key.split('||')
    df.loc[movie_id, date_stamp] = showing_count
df

Fill the missing data with zeros ...

In [None]:
# Cells never assigned a count are missing; replace them with zero, in place.
df.fillna(value=0, inplace=True)
df

Create a lookup table to get some information about a movie (in this case, movie ID and title) using a movie ID. (This will help identify the movie when plotting.)

In [None]:
def label_summary(movie):
    """Return ``[movie_id, "<movie_id>--<title>"]`` for ``movie``.

    Args:
        movie: Mapping with ``'movie_id'`` and ``'title'`` entries.

    Returns:
        A two-element list: the movie ID and a label combining ID and title.
    """
    movie_id = movie['movie_id']
    label = "{}--{}".format(movie_id, movie['title'])
    return [movie_id, label]

# label_summary yields [movie_id, label] pairs, which dict() accepts directly;
# there is no need to flatten the pairs and slice them apart again.
label_pairs = (
    movies_reader.bag
    .filter(has_female_director)
    .distinct('movie_id')
    .map(label_summary)
    .compute()
)
labels = dict(label_pairs)
labels

Plot a graph ...

In [None]:
import datetime
import plotly.graph_objects as go

# Convert the x data in the graph to proper date times
plot_columns = [datetime.datetime.strptime(c, '%Y%m%d') for c in columns]

config = {'doubleClickDelay': 1000}

fig = go.Figure()
for movie_id in movie_ids:
    fig.add_scatter(x=plot_columns,
                    y=df.loc[movie_id],
                    mode = 'lines',
                    name=labels[movie_id])

fig.show(config=config)

In [None]:
# Shut both readers down, showings first.
for reader in (showings_reader, movies_reader):
    reader.shutdown()