## Poking around at the movies

Looking at just the movie data does not require many CPUs:
most operations take about 6 seconds on 6 CPUs.

In [None]:
# Load up the movies and look at one of them ...

import os
import sys

sys.path.append('..')

from movies_dask_bag.movie_reader import TheatersReader, MoviesReader, ShowingsReader

# Root of the data: $SLURM_TMPDIR when that variable is set, otherwise the
# current directory.
work_dir = os.environ.get('SLURM_TMPDIR', '.')
data_dir = f'{work_dir}/json'

# Default selection: everything two directory levels below data_dir.
file_pattern = f'{data_dir}/*/*'

# Alternatives -- uncomment one to narrow the selection:
#   one sample period ...
#       file_pattern = f'{data_dir}/20191206/*'
#   one country ...
#       file_pattern = f'{data_dir}/*/can'

movies_reader = MoviesReader(file_pattern)

# Final expression of the cell, so the notebook displays what take(1) returns.
movies_reader.take(1)

### The "client"

The Dask client has some information about the computational network and a nice dashboard.

In [None]:
# Evaluate the reader's `client` attribute as the cell's final expression so
# the notebook renders it.
movies_reader.client

### Count the number of movies vs. the number of unique movies

In [None]:
%%time

## Counting all movies (including repeats) in the bag
## NOTE(review): `count` is read as an attribute, not called -- this presumably
## relies on it being a property of the reader; confirm against the class.

movies_reader.count

In [None]:
%%time

## Counting unique movies on the notebook side: compute every 'movie_id'
## from the bag, then de-duplicate the result with a plain Python set

len(set(
    movies_reader.bag
    .map(lambda movie: movie['movie_id'])
    .compute()
))

In [None]:
%%time

## Counting unique movies the Dask way: de-duplicate on 'movie_id' with the
## bag's own distinct(), then count what is left

uniq_movies_bag = movies_reader.bag.distinct(key='movie_id')
uniq_movies_bag.count().compute()

## Shut down the Dask network and delete chunked files

In [None]:
# Per the section heading, this shuts down the Dask network and deletes the
# chunked files; the method's implementation is not visible in this notebook.
movies_reader.shutdown()