## Theaters (sic) and location

We'll work with a single timestamp of data. Most operations perform well on just a few CPUs.

Let's identify some Swedish movie theatres that may not be within the boundary of Sweden.

In [None]:
import os
import sys

sys.path.append('..')

from movies_dask_bag.movie_reader import TheatersReader, MoviesReader, ShowingsReader

# Root directory: the value of $SLURM_TMPDIR when that variable is set,
# otherwise the current directory.
work_dir = os.getenv('SLURM_TMPDIR', '.')
data_dir = '{}/json'.format(work_dir)

# Pattern handed to the reader: everything under the 20191206 directory
# (the trailing '*' is presumably expanded as a glob by the reader -- TODO confirm).
file_pattern = '{}/20191206/*'.format(data_dir)
theaters_reader = TheatersReader(file_pattern)

# Display the result of take(1) as a quick check that data was found.
theaters_reader.take(1)

In [None]:
# Display the reader's `client` attribute
# (presumably the Dask client backing the bag -- TODO confirm).
theaters_reader.client

## Count the number of theatres

In [None]:
%%time

# Time the evaluation of `count` on the reader and display its value.
# NOTE(review): `count` is read as an attribute here; if it is a plain method
# rather than a property, it would need to be called -- confirm.
theaters_reader.count

# Mapping the data in the bag into a usable form.

In this case, we'll grab the latitude, longitude, country, and some identifying information (theatre name, id, and what file the original data came from).

In [None]:
def flatten_theaters_for_location(x):
    """Reduce one theatre record to the fields needed for plotting.

    Parameters
    ----------
    x : mapping
        Record providing the keys 'theater_lat', 'theater_lon', 'country',
        'theater_name', 'theater_id' and 'source_xml'.

    Returns
    -------
    dict
        'lat' and 'lon' converted to float, 'country' copied unchanged, and
        'hover' set to a "<name> (id: <id>): <source file>" label.

    Raises
    ------
    KeyError
        If a required key is missing from ``x``.
    ValueError, TypeError
        If a coordinate cannot be converted with ``float``.
    """
    latitude = float(x['theater_lat'])
    longitude = float(x['theater_lon'])
    country = x['country']

    # Label shown when hovering over the theatre's marker on the map.
    hover_text = "{} (id: {}): {}".format(
        x['theater_name'], x['theater_id'], x['source_xml']
    )

    return {
        'lat': latitude,
        'lon': longitude,
        'country': country,
        'hover': hover_text,
    }

# Apply the flattening function to every record in the reader's bag.
# (Assumes `bag` is a Dask bag, so nothing runs until compute() -- TODO confirm.)
location = theaters_reader.bag.map(flatten_theaters_for_location)
# Materialise the mapped records; the result is indexed below and later
# passed to pd.DataFrame.
out = location.compute()

# Show the first flattened record as a sanity check.
out[0]

## Load into a Pandas dataframe, and plot the data on a map ...

The 'hover' attributes above will be used for the hover text on the plot.

In [None]:
import plotly.express as px
import pandas as pd

# Tabulate the flattened records.
df = pd.DataFrame(out)

# One marker per row, positioned by the 'lat'/'lon' columns, coloured by the
# 'country' column, with the 'hover' string as the hover title.
geo_plot_options = dict(lat='lat', lon='lon', color='country', hover_name='hover')
fig = px.scatter_geo(df, **geo_plot_options)
fig.show()

# Use the bag to filter only the Swedish theatres

And load into a dataframe

In [None]:
import json
from shapely.geometry import shape, Point
import os, urllib.request
import pandas as pd

# Country code as it appears in the records' 'country' field. The border
# download cell upper-cases this same value to build the GeoJSON file name.
country_code = 'swe'

# Compare against country_code rather than repeating the literal, so that
# changing the code above switches the filtered data and the downloaded
# border together.
country = location.filter(lambda x: x['country'] == country_code)
country_theaters = country.compute()

# One row per theatre of the selected country.
country_df = pd.DataFrame(country_theaters)

# Download the shape of the Swedish border

Downloaded from a public GitHub repository (AshKyd/geojson-regions).

In [None]:
import json
from shapely.geometry import shape, Point

# Border file name derived from the country code, e.g. 'SWE.geojson' for 'swe'.
border = '{}.geojson'.format(country_code.upper())
url = 'https://github.com/AshKyd/geojson-regions/raw/master/countries/10m/{}'.format(border)
# The file is saved in the current working directory under the same name.
output = border

# Download only when there is no local copy yet: re-running the notebook then
# skips the network round-trip and still works offline once the file exists.
# Delete the file to force a fresh download (e.g. after an interrupted one).
if not os.path.exists(output):
    urllib.request.urlretrieve(url, output)

with open(output) as f:
    js = json.load(f)

# Build a shapely geometry from the 'geometry' member of the GeoJSON object.
polygon = shape(js['geometry'])

# In or Out of Sweden?

Create a column in the dataframe that records, for each theatre in the Swedish data, whether the theatre is within the boundary or not.

Some 'tolerance' may be needed to reduce the false negatives (theatres that are close to the border).

In [None]:
# Cherry-picked!
TOLERANCE = 0.06


def _within_tolerance(row):
    """Return True when the row's (lon, lat) point is closer than TOLERANCE to the border polygon.

    The distance is whatever ``polygon.distance`` reports, i.e. it is measured
    in the polygon's own coordinate units (presumably degrees -- TODO confirm).
    """
    theatre_point = Point(row['lon'], row['lat'])
    return polygon.distance(theatre_point) < TOLERANCE


# Row-wise flag: one boolean per theatre.
country_df['in_boundary'] = country_df.apply(_within_tolerance, axis=1)

# Same map as before, now coloured by the in/out flag.
fig = px.scatter_geo(
    country_df,
    lat='lat',
    lon='lon',
    color='in_boundary',
    hover_name='hover',
)
fig.show()

## A report of how many theatres are out of the boundary ...

In [None]:
# Tally the True (within tolerance of the border polygon) and False rows.
country_df['in_boundary'].value_counts()

## A list of the bad theatres ...

In [None]:
# Rows flagged as outside the boundary. `~` negates the boolean mask, which is
# the idiomatic pandas spelling of comparing the column with False.
country_df[~country_df['in_boundary']]

# Shutdown ...

In [None]:
# Shut the reader down now that the analysis is finished
# (presumably this releases its Dask client/cluster -- TODO confirm).
theaters_reader.shutdown()