## Some experiments with how well theaters are doing over time

In [None]:
%%time

import os
import sys

sys.path.append('..')

from movies_dask_bag.movie_reader import TheatersReader, MoviesReader, ShowingsReader

# Root of the working data: $SLURM_TMPDIR when that variable is set,
# otherwise the current directory.
work_dir = os.getenv('SLURM_TMPDIR', '.')
data_dir = '/'.join((work_dir, 'json'))
# Glob pattern matching every entry two levels below data_dir.
file_pattern = '/'.join((data_dir, '*', '*'))

# Both readers are given the same file pattern.
showings_reader = ShowingsReader(file_pattern)
theaters_reader = TheatersReader(file_pattern)

In [None]:
%%time

# Display the reader's `count` attribute as the cell output
# (presumably the number of showings read -- confirm against ShowingsReader).
showings_reader.count

In [None]:
%%time

# Hashing scheme for the bins: "theater_id||country||date_stamp"

def theater_hash(movie):
    """Return the bin key "theater_id||country||date_stamp" for one record.

    The three fields are looked up in ``movie`` by those key names and joined
    with "||" in that order; a missing key raises ``KeyError``.
    """
    theater_id = movie['theater_id']
    country = movie['country']
    date_stamp = movie['date_stamp']
    return f"{theater_id}||{country}||{date_stamp}"

# Map every record in the showings bag to its bin key, then count how often
# each distinct key occurs and materialize the result.
frequencies = (
    showings_reader.bag
    .map(theater_hash)
    .frequencies()
    .compute()
)
# Peek at the first ten entries.
frequencies[:10]

In [None]:
# Number of distinct "theater_id||country||date_stamp" bins.
len(frequencies)

In [None]:
# Tally, per country and date stamp, how many entries of `frequencies` fall
# into that (country, date) pair. Every entry adds one regardless of its own
# count, so the result counts distinct bin keys (i.e. theaters), not showings.
country_date_stamps_counts = {}

for key, _showings in frequencies:
    # Keys look like "theater_id||country||date_stamp". Splitting from the
    # right keeps a theater id that itself contains "||" from breaking the
    # unpacking (assumes country and date stamp never contain "||").
    _theater_id, country, date_stamp = key.rsplit("||", 2)

    country_counts = country_date_stamps_counts.setdefault(country, {})
    country_counts[date_stamp] = country_counts.get(date_stamp, 0) + 1

# Sorted axis labels: every country seen, and every date stamp seen for any
# country.
countries = sorted(country_date_stamps_counts)
date_stamps = sorted(
    {stamp for counts in country_date_stamps_counts.values() for stamp in counts}
)

In [None]:
# Display the nested {country: {date_stamp: count}} mapping as the cell output.
country_date_stamps_counts

In [None]:
import pandas as pd
import numpy as np

# Country x date-stamp table of counts: one row per country (in the order the
# countries appear in country_date_stamps_counts), one column per date stamp.
# The frame is built in a single constructor call with 0 for every missing
# (country, date) pair, so no NaN is ever introduced and the values stay
# int64.
columns = date_stamps
df = pd.DataFrame(
    [
        [country_counts.get(date_stamp, 0) for date_stamp in columns]
        for country_counts in country_date_stamps_counts.values()
    ],
    index=list(country_date_stamps_counts),
    columns=columns,
    dtype=np.int64,
)
df

In [None]:
import datetime
import plotly.graph_objects as go

# The column labels are 'YYYYMMDD' strings; parse them into datetimes so they
# can be used as x values.
plot_columns = [datetime.datetime.strptime(stamp, '%Y%m%d') for stamp in columns]

# The default double-click speed is a bit fast, so relax the delay.
config = dict(doubleClickDelay=1000)

# One line trace per country, using that country's row of the table as y.
fig = go.Figure()
for country in countries:
    trace = go.Scatter(
        x=plot_columns,
        y=df.loc[country],
        mode='lines',
        name=country,
    )
    fig.add_trace(trace)

fig.show(config=config)