In [2]:
import altair as alt
import pandas as pd
import geopandas as gpd
import numpy as np
import timeit
from IPython.display import Image, display

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas/geopandas deprecation and
# chained-assignment warnings; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Use the "vegafusion" data transformer for all Altair charts below.
# NOTE(review): presumably enabled so larger frames can be charted without
# hitting Altair's default row limit — confirm.
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

# 1. Dataset

| Data Type    | Shorthand Code | Description                       |
|--------------|----------------|-----------------------------------|
| quantitative | Q              | a continuous real-valued quantity |
| ordinal      | O              | a discrete ordered quantity       |
| nominal      | N              | a discrete unordered category     |
| temporal     | T              | a time or date value              |
| geojson      | G              | a geographic shape                |

## 1.1. Load Data

In [3]:
## Load the movie dataset; `release_date` is parsed into datetimes on read
movies_df = pd.read_csv(
    "data/movie_dataset.csv",
    parse_dates=["release_date"],
)

# Number of rows in the full dataset
movies_df.shape[0]

8747

In [4]:
# Downsample to a reproducible random subset (fixed seed).
# Comment out the assignment below to work with the full dataset instead.
SAMPLE_SIZE = 1000
SAMPLE_SEED = 40

# min() keeps the cell from raising when the frame has fewer than SAMPLE_SIZE rows.
movies_df = movies_df.sample(n=min(SAMPLE_SIZE, len(movies_df)), random_state=SAMPLE_SEED)

In [5]:
## DataFrame metadata: dimensions followed by the column names
n_rows, n_cols = movies_df.shape
print("Rows: ", n_rows)
print("Columns: ", n_cols)
print(movies_df.columns.tolist())
print("-----------")

Rows:  1000
Columns:  17
['id', 'title', 'vote_average', 'vote_count', 'release_date', 'revenue', 'runtime', 'adult', 'budget', 'original_language', 'popularity', 'genres', 'production_companies', 'production_countries', 'spoken_languages', 'iso_countries', 'oscar']
-----------


## 1.2. Load World Map Data

In [6]:
## Load the country shapes used as choropleth / continent metadata
local_map = "data/ne_50m_admin_0_countries.zip"
gdf_ne = gpd.read_file(local_map)

# Lower-case all column names
gdf_ne.columns = gdf_ne.columns.str.lower()

# Keep only name, iso_a2, continent, pop_est and geometry.
# .copy() makes this an independent frame, so the .loc assignments below write
# to it directly instead of to a possible view of the full table.
gdf_ne = gdf_ne[["name", "iso_a2", "continent", "pop_est", "geometry"]].copy()

# Set the ISO alpha-2 code explicitly for these countries
# (presumably missing or a placeholder in the source file — verify against the data).
for country_name, iso_code in {"Norway": "NO", "France": "FR", "Taiwan": "TW"}.items():
    gdf_ne.loc[gdf_ne["name"] == country_name, "iso_a2"] = iso_code

# Keep only rows on the six continents listed below (this drops Antarctica
# and any other continent label present in the file).
gdf_ne = gdf_ne.query("continent in ['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']")

## Build the iso_a2 -> continent lookup.
# After set_index + transpose, to_dict('index') yields
# {'continent': {iso_a2: continent}, 'name': {iso_a2: name}}.
iso_continent_df = pd.DataFrame(gdf_ne[["iso_a2", "continent", "name"]])
country_metadata_map = iso_continent_df.set_index('iso_a2').T.to_dict('index')
country_continent_map = country_metadata_map['continent']

In [7]:
## Add continents column to movies dataset
# Start from a copy of the ISO country codes column ...
# (assumes iso_countries holds ", "-separated ISO alpha-2 codes — TODO confirm)
movies_df['continents'] = movies_df['iso_countries']
# ... then substitute codes with continent names. With regex=True every key of
# country_continent_map is treated as a regular-expression pattern and each match
# inside the string is replaced by the mapped continent.
movies_df['continents'] = movies_df['continents'].replace(pd.Series(country_continent_map).astype(str), regex=True)
def remove_dupes(i1):
    """Collapse repeated entries in a ", "-separated string.

    Parameters
    ----------
    i1 : str
        Values separated by ", " (for example "Europe, Asia, Europe").

    Returns
    -------
    str
        The distinct values joined with "," in first-seen order.
        Non-string input (e.g. NaN for a missing value) is returned unchanged.
    """
    # Missing values reach this function as float NaN, which has no .split().
    if not isinstance(i1, str):
        return i1
    # dict.fromkeys de-duplicates while keeping insertion order; a set would make
    # the order (and therefore the "first" entry) vary between interpreter runs.
    return ','.join(dict.fromkeys(i1.split(", ")))

# Collapse repeated continent names within each row of the continents column
movies_df['continents'] = movies_df['continents'].apply(remove_dupes)

## 1.3. Common Data Manipulations

### 1.3.1. Select first value from list

In [8]:
## Select the first value from a comma-separated list column (e.g. genres, production_companies).
## The pattern captures the leading run of non-comma characters; str.extract returns the
## capture as a one-column DataFrame (column 0), which is displayed as the cell output.
movies_df["production_companies"].str.extract(r'([^,]+)')
# Example of storing the result in a new column
# (NOTE: company_movies_df is not defined anywhere in the visible notebook):
# company_movies_df['production_company'] = movies_df["production_companies"].str.extract(r'([^,]+)')

Unnamed: 0,0
4485,Universal Pictures
2497,Fantefilm
7265,Venus Records & Tapes
4645,Stuber Productions
1841,Riva Film
...,...
2537,Perdido Productions
5488,Principato-Young Entertainment
2850,Sacromonte Films
1824,Duplass Brothers Productions


### 1.3.2. Split list values into multiple rows

In [9]:
# Both examples below are disabled, so this cell currently does nothing.
# Pattern: split the ", "-separated `genres` string into a list, then explode it
# so that each genre gets its own row.
# Display only:
# movies_df.assign(genre=movies_df['genres'].str.split(', ')).explode('genre')
# Keep the result:
# genre_movies_df = movies_df.assign(genre=movies_df['genres'].str.split(', ')).explode('genre')

# 2. Visualisation

## 2.1. Example

In [10]:
## Bar chart: number of movies per release year
movies_per_year = alt.Chart(movies_df).mark_bar().encode(
    x=alt.X('year(release_date):O'),
    y=alt.Y('count()'),
    tooltip=alt.Tooltip('count()'),
)
movies_per_year.interactive()


In [11]:
## Derive a single `genre` column holding the first entry of the `genres` list
movies_genre_df = movies_df.copy(deep=True)
first_genre = movies_genre_df["genres"].str.extract(r'([^,]+)')
movies_genre_df['genre'] = first_genre

## Scatter plot of runtime against average vote, coloured by genre
runtime_rating_scatter = alt.Chart(movies_genre_df).mark_circle(size=60).encode(
    x=alt.X('runtime:Q'),
    y=alt.Y('vote_average:Q'),
    color=alt.Color('genre:N'),
    tooltip=['title', 'runtime', 'vote_average', 'release_date:T'],
)
runtime_rating_scatter.properties(
    title="Relationship between runtime and rating",
    width=500,
    height=300,
)

## 2.2. Selection

### 2.2.1. Legend Selection

In [12]:
# Point selection on `genre`, driven by clicks on the colour legend
genre_selection = alt.selection_point(fields=['genre'], bind='legend')

# Selected genres stay fully opaque; everything else is faded
genre_opacity = alt.condition(genre_selection, alt.value(1), alt.value(0.15))

alt.Chart(movies_genre_df).mark_circle(size=60).encode(
    x=alt.X('runtime:Q'),
    y=alt.Y('vote_average:Q'),
    color=alt.Color('genre:N'),
    tooltip=['title', 'runtime', 'vote_average', 'release_date:T'],
    opacity=genre_opacity,
).properties(
    title="Relationship between runtime and rating",
    width=500,
    height=300,
).add_params(
    genre_selection,
)

### 2.2.2. Input date

In [13]:
# Earliest and latest release dates in the data; passed as the initial values below
release_date_start = movies_genre_df["release_date"].min()
release_date_end = movies_genre_df["release_date"].max()

# Two date-type input widgets, labelled start_date and end_date
start_date_input = alt.binding(input='date', name='start_date')
end_date_input = alt.binding(input='date', name='end_date')

# One parameter per widget; each is read further down through its `.date` attribute.
# NOTE(review): these are point selections over a field named `date`, which is not a
# column of the data — a plain `alt.param(bind=..., value=...)` may express the intent
# more directly. Confirm against the Altair parameter documentation.
start_date_selection = alt.selection_point(fields=['date'], bind=start_date_input, value=release_date_start)
end_date_selection = alt.selection_point(fields=['date'], bind=end_date_input, value=release_date_end)

# now create the chart as normal:
# (genre_selection is not defined in this cell; it is whatever an earlier cell left
# in the kernel — in notebook order, the legend-bound selection)
alt.Chart(movies_genre_df).mark_circle(size=60).encode(
    x='runtime:Q',
    y='vote_average:Q',
    color='genre:N',
    tooltip=['title', 'runtime', 'vote_average', 'release_date:T'],
    opacity=alt.condition(genre_selection, alt.value(0.8), alt.value(0.15))
).add_params(
    genre_selection,
    start_date_selection,
    end_date_selection
).transform_filter(
    # Keep rows with start_date <= release_date <= end_date
    {'and': [
        alt.FieldGTEPredicate(field='release_date', gte=start_date_selection.date),
        alt.FieldLTEPredicate(field='release_date', lte=end_date_selection.date),
    ]}
    
).properties(
    title="Relationship between runtime and rating",
    width=500,
    height=300
)

### 2.2.3. Chart as Legend

In [17]:
## Use a second chart as a clickable genre legend

# Clicking a scatter point selects that movie (matched on title + release_date)
movie_selection = alt.selection_point(fields=['title', 'release_date'])

# Clicking a legend point selects that genre
genre_selection = alt.selection_point(fields=['genre'])
color = alt.condition(
    genre_selection,
    alt.Color('genre:N', legend=None),
    alt.value('lightgray'),
)

# Legend chart: one filled point per genre on a right-hand axis;
# genres outside the selection are drawn in light gray
legend = (
    alt.Chart(movies_genre_df)
    .mark_point(size=200, filled=True)
    .encode(
        y=alt.Y('genre:N', axis=alt.Axis(orient='right')),
        color=color,
    )
    .add_params(genre_selection)
)

# Main scatter: restricted to the selected genre(s); the clicked movie is highlighted
movie_opacity = alt.condition(movie_selection, alt.value(0.8), alt.value(0.15))
chart = (
    alt.Chart(movies_genre_df)
    .mark_circle(size=60)
    .encode(
        x=alt.X('runtime:Q'),
        y=alt.Y('vote_average:Q'),
        color=alt.Color('genre:N'),
        tooltip=['title', 'runtime', 'vote_average', 'release_date:T'],
        opacity=movie_opacity,
    )
    .add_params(movie_selection)
    .transform_filter(genre_selection)
    .properties(
        title="Relationship between runtime and rating",
        width=500,
        height=300,
    )
)

chart | legend

# 100. Generalized Selection - NOT!


## Prototype 1

In [15]:
# Working copy with a single `continent` column: the first entry of `continents`
gen_sel_df = movies_genre_df.copy(deep=True)
gen_sel_df['continent'] = gen_sel_df["continents"].str.extract(r'([^,]+)')

CHART_WIDTH = 800
## ------------ START CONTINENT
## Continent names, shared by the legend data and the shape scale domain
continent_names = ['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']
continent_df = pd.DataFrame({'continent': continent_names})

## Continent selection (click on a legend point)
continent_selection = alt.selection_point(fields=['continent'])

## One mark shape per continent; the last entry is a custom SVG path
continent_shapes = [
    "triangle",
    "circle",
    "square",
    "cross",
    "diamond",
    "M0,.5L.6,.8L.5,.1L1,-.3L.3,-.4L0,-1L-.3,-.4L-1,-.3L-.5,.1L-.6,.8L0,.5Z",
]
cont_shape_scale = alt.Scale(domain=list(continent_names), range=continent_shapes)

## Selected continents are blue, the rest light gray
cont_color = alt.condition(continent_selection, alt.value('#1f77b4'), alt.value('lightgray'))

## Continent legend: one filled shape per continent on a right-hand axis
continent_legend = (
    alt.Chart(continent_df)
    .mark_point(size=200, filled=True)
    .encode(
        y=alt.Y('continent:N', axis=alt.Axis(orient='right')),
        shape=alt.Shape('continent:N').scale(cont_shape_scale).legend(None),
        color=cont_color,
    )
    .add_params(continent_selection)
)
## ------------ END CONTINENT



## ------------ START RELEASE YEAR
## init date selection
# Horizontal brush (interval) over release_date
date_interval = alt.selection_interval(encodings=['x'], fields=['release_date'])

## release_date legend
# A strip of points along a time axis that acts as the brushable date control.
# transform_timeunit overwrites release_date with its year time unit, so the
# strip is drawn at year granularity.
rel_date_legend = alt.Chart(gen_sel_df).mark_point(
    size=70
).encode(
    x=alt.X('release_date:T', axis=alt.Axis(orient='bottom'))
).transform_timeunit(
    release_date='year(release_date)'
).add_params(
    date_interval
).properties(
    width=CHART_WIDTH
)
## ------------ END RELEASE YEAR


## ------------ START Genre
## init genre selection
# Click-to-select on genre; selected genres keep their colour, others turn gray
genre_selection = alt.selection_point(fields=['genre'])
genre_color = alt.condition(genre_selection,
                      alt.Color('genre:N', legend=None),
                      alt.value('lightgray'))

# Genre legend: one filled point per genre on a right-hand axis
genre_legend = alt.Chart(gen_sel_df).mark_point(
    size=200,
    filled=True
).encode(
    y=alt.Y('genre:N', axis=alt.Axis(orient='right')),
    color=genre_color
).add_params(
    genre_selection
)
## ------------ END Genre



## START ---------------- MAIN
## scatter plot runtime + rating
# Colour encodes genre, shape encodes continent. Only rows that satisfy all three
# selections (date brush, genre, continent) are kept by the filter.
# cont_shape_scale, continent_selection and continent_legend are defined
# earlier in this cell.
chart = alt.Chart(gen_sel_df).mark_point(size=100).encode(
    x=alt.X('runtime:Q', scale=alt.Scale(domain=[0, 400])),
    y=alt.Y('vote_average:Q', scale=alt.Scale(domain=[0, 10])),
    color=genre_color,
    shape=alt.Shape('continent:N').scale(cont_shape_scale),
    tooltip=['title', 'runtime', 'vote_average', 'release_date:T', 'continent:N'],
    # opacity=alt.condition((genre_selection & continent_selection), alt.value(1), alt.value(0))
).transform_filter(
    date_interval & genre_selection & continent_selection
).properties(
    title="Relationship between runtime and rating",
    width=CHART_WIDTH,
    height=400
)



# Layout: scatter with both legends to its right, date strip underneath
(chart | continent_legend | genre_legend)  & rel_date_legend

## Prototype 2

In [16]:
# Working copy of the genre-augmented movies frame for this prototype
oscar_gen_sel_df = movies_genre_df.copy(True)

CHART_WIDTH=800
## ------------ START RELEASE YEAR
## init date selection
# Horizontal brush (interval) over release_date
date_interval = alt.selection_interval(encodings=['x'], fields=['release_date'])

## release_date legend
# Brushable strip of points along a time axis; transform_timeunit overwrites
# release_date with its year time unit.
rel_date_legend = alt.Chart(oscar_gen_sel_df).mark_point(
    size=70
).encode(
    x=alt.X('release_date:T', axis=alt.Axis(orient='bottom'))
).transform_timeunit(
    release_date='year(release_date)'
).add_params(
    date_interval
).properties(
    width=CHART_WIDTH
)
## ------------ END RELEASE YEAR



## ------------ START OSCAR
## Init oscar dataframe (one row per Oscar status)
oscar_list = ['Winner', 'Nominated', 'Non-Awardee']
oscar_df = pd.DataFrame({'oscar': oscar_list})

## Oscar selection
# Click-to-select on the oscar status; each status gets its own mark shape
# (the first shape is a custom SVG path). Selected statuses are blue, others gray.
oscar_selection = alt.selection_point(fields=['oscar'])
oscar_shape_scale = alt.Scale(domain=oscar_list,
                        range=["M0,.5L.6,.8L.5,.1L1,-.3L.3,-.4L0,-1L-.3,-.4L-1,-.3L-.5,.1L-.6,.8L0,.5Z", "diamond","circle"])
oscar_color = alt.condition(oscar_selection, alt.value('#1f77b4'), alt.value('lightgray'))

## oscar legend
oscar_legend = alt.Chart(oscar_df).mark_point(
    size=250,
    filled=True
).encode(
    y=alt.Y('oscar:N', axis=alt.Axis(orient='right')),
    shape=alt.Shape('oscar:N').scale(oscar_shape_scale).legend(None),
    color=oscar_color
).add_params(
    oscar_selection
)
## ------------ END OSCAR



## ------------ START Genre
## init genre selection
# Click-to-select on genre; selected genres keep their colour, others turn gray
genre_selection = alt.selection_point(fields=['genre'])
genre_color = alt.condition(genre_selection,
                      alt.Color('genre:N', legend=None),
                      alt.value('lightgray'))

# Genre legend: one filled point per genre on a right-hand axis
genre_legend = alt.Chart(oscar_gen_sel_df).mark_point(
    size=200,
    filled=True
).encode(
    y=alt.Y('genre:N', axis=alt.Axis(orient='right')),
    color=genre_color
).add_params(
    genre_selection
)
## ------------ END Genre



## START ---------------- MAIN
## Scatter plot of runtime vs. rating.
# Colour encodes genre and shape encodes Oscar status. Only rows that satisfy
# all three selections (date brush, genre, Oscar status) are kept by the filter.
# genre_color, oscar_shape_scale, the selections and the legend charts are
# defined earlier in this cell.
chart = alt.Chart(oscar_gen_sel_df).mark_point(size=100).encode(
    x=alt.X('runtime:Q', scale=alt.Scale(domain=[0, 400])),
    y=alt.Y('vote_average:Q', scale=alt.Scale(domain=[0, 10])),
    color=genre_color,
    shape=alt.Shape('oscar:N').scale(oscar_shape_scale),
    detail='release_date:T',
    tooltip=['title', 'runtime', 'vote_average', 'release_date:T', 'oscar:O'],
    # opacity=alt.condition((genre_selection & oscar_selection), alt.value(1), alt.value(0))
).transform_filter(
    date_interval & genre_selection & oscar_selection
).properties(
    title="Relationship between runtime and rating",
    width=CHART_WIDTH,
    height=400
)



# Layout: scatter with both legends to its right, date strip underneath
(chart | oscar_legend | genre_legend)  & rel_date_legend

## Prototype 3

In [17]:
CHART_WIDTH = 800

## Year legend
# One row per calendar year between the earliest and latest release date (inclusive)
release_date_start = movies_genre_df["release_date"].min()
release_date_end = movies_genre_df["release_date"].max()
years_list = np.arange(release_date_start.year, release_date_end.year+1)
years_df = pd.DataFrame({'release_year': years_list})

# Horizontal brush over release_year
years_interval = alt.selection_interval(encodings=['x'], fields=['release_year'])

# Brushable strip of year points; format='d' prints the years as plain integers
rel_year_legend = alt.Chart(years_df).mark_point(
    size=70
).encode(
    x=alt.X('release_year:Q', axis=alt.Axis(orient='bottom',format='d',labelAngle=90), scale=alt.Scale(domain=[release_date_start.year, release_date_end.year])),
    tooltip='release_year:Q'
).add_params(
    years_interval
).properties(
    width=CHART_WIDTH
)
### -------------------------------------

## Month legend
# One row per month number 1..12
months_list = np.arange(1,13)
months_df = pd.DataFrame({'release_month': months_list})
# Horizontal brush over release_month
months_interval = alt.selection_interval(encodings=['x'], fields=['release_month'])

# Brushable strip of month points with a tick at every month number
rel_month_legend = alt.Chart(months_df).mark_point(
    size=70
).encode(
    x=alt.X('release_month:Q', axis=alt.Axis(values=months_list, orient='bottom'), scale=alt.Scale(domain=[1, 12]))
).add_params(
    months_interval
).properties(
    width=300
)
### -------------------------------------


## ------------ START Genre
## init genre selection
# Click-to-select on genre; selected genres keep their colour, others turn gray
genre_selection = alt.selection_point(fields=['genre'])
genre_color = alt.condition(genre_selection,
                      alt.Color('genre:N', legend=None),
                      alt.value('lightgray'))

# The legend is built from movies_genre_df, the same frame this cell's main
# scatter plots, so the cell does not depend on variables created by other
# prototype cells.
genre_legend = alt.Chart(movies_genre_df).mark_point(
    size=200,
    filled=True
).encode(
    y=alt.Y('genre:N', axis=alt.Axis(orient='right')),
    color=genre_color
).add_params(
    genre_selection
)
## ------------ END Genre


## START ---------------- MAIN
## scatter plot runtime + rating

# release_year and release_month are derived from release_date inside the chart so
# they can be matched against the year and month brushes. The +1 shifts the month
# onto the 1-12 range used by the month legend (Vega's month() is zero-based).
# Unselected genres are faded rather than removed; the year/month brushes filter rows.
chart = alt.Chart(movies_genre_df).mark_point(size=100).encode(
    x=alt.X('runtime:Q', scale=alt.Scale(domain=[0, 400])),
    y=alt.Y('vote_average:Q', scale=alt.Scale(domain=[0, 10])),
    color=genre_color,
    detail='release_date:T',
    tooltip=['title', 'runtime', 'vote_average', 'release_date:T'],
    opacity=alt.condition((genre_selection), alt.value(1), alt.value(0.1))
).transform_calculate(
    release_year=alt.expr.year(alt.datum.release_date),
    release_month=alt.expr.month(alt.datum.release_date)+1
).properties(
    title="Relationship between runtime and rating",
    width=CHART_WIDTH,
    height=400
).transform_filter(
    years_interval & months_interval
)

# Layout: scatter + genre legend on top, year strip and month strip underneath
(chart | genre_legend)  & rel_year_legend & rel_month_legend

## Prototype 4

In [127]:
## List genre in chart as legend
# Dropdown whose option values are time-format strings (labels: None/Week/Month/Quarter/Year)
period_dropdown = alt.binding_select(options=[None,'%U, %Y','%m %Y', '%q %Y', '%Y'], labels=["None", "Week", "Month", "Quarter", "Year"], name='period')

# Clicking a point selects that movie, keyed on id + release_date
movie_selection = alt.selection_point(fields=['id', 'release_date'])
# Point selections on the derived year / month fields; in this cell they are only
# attached to the text columns further down and are not used in any filter
year_selection = alt.selection_point(fields=['release_year'])
month_selection = alt.selection_point(fields=['release_month'])


# Holds the format string currently chosen in the dropdown
period_selection= alt.selection_point(fields=['period'], bind=period_dropdown)


# start_date_input = alt.binding(input='text', name='title')
# start_date_selection = alt.selection_point(fields=['title'], bind=start_date_input, value=movie_selection.title)
## link to mark text
## clickable mark text

## -------- START --------------- LEGEND
# Clickable genre legend. Note: the main chart below colours by 'genre:N' directly
# and its genre filter is commented out, so this selection only recolours the legend.
genre_selection = alt.selection_point(fields=['genre'])
color = alt.condition(genre_selection,
                      alt.Color('genre:N', legend=None),
                      alt.value('lightgray'))

legend = alt.Chart(movies_genre_df).mark_point().encode(
    y=alt.Y('genre:N', axis=alt.Axis(orient='right')),
    color=color
).add_params(
    genre_selection
)
## -------- END --------------- LEGEND


## -------- START --------------- CHART
# Scatter of runtime vs. rating. release_week feeds the tooltip; release_year and
# release_month are calculated but not referenced by any encoding or filter here.
chart = alt.Chart(movies_genre_df).mark_circle(size=100).encode(
    x=alt.X('runtime:Q', scale=alt.Scale(domain=[0, 400])),
    y=alt.Y('vote_average:Q', scale=alt.Scale(domain=[0, 10])),
    color='genre:N',
    # detail='release_date:T',
    tooltip=['title', 'runtime', 'vote_average', 'release_date:T', 'id:N', 'release_week:N'],
    opacity=alt.condition(movie_selection, alt.value(0.9), alt.value(0.3))
).transform_calculate(
    release_year=alt.expr.year(alt.datum.release_date),
    release_month=alt.expr.month(alt.datum.release_date)+1,
    release_week=alt.expr.timeFormat(alt.datum.release_date, '%U, %Y'),
    # release_quarter=alt.expr.timeFormat(alt.datum.release_date, '%q %Y')
).transform_filter(
    # genre_selection & year_selection & month_selection  
    # A row is kept when either
    #   (a) no movie has been clicked or no period value is defined, or
    #   (b) its release_date, formatted with the chosen period format string,
    #       equals the clicked movie's release_date formatted the same way.
    {
         'or': [
             {'or': [
                {'not': alt.expr.isDefined(movie_selection.release_date)},
                {'not': alt.expr.isDefined(period_selection.period)}
             ]},
                 
             ( alt.expr.timeFormat(alt.datum.release_date, alt.expr.toString(period_selection.period)) == alt.expr.timeFormat(alt.expr.toNumber(movie_selection.release_date), alt.expr.toString(period_selection.period)) ), 
             
         ]  #
     }
   
).properties(
    title="Relationship between runtime and rating",
    width=800,
    height=400
).add_params(
    movie_selection,
    period_selection
)
## -------- END --------------- CHART


## -------- START --------------- TEXT SELECTION
# Base text chart: keeps only the row whose id matches the clicked movie, numbers
# the surviving rows, and caps the list at row numbers below 15.
ranked_text = alt.Chart(movies_genre_df).mark_text(align='right').encode(
    y=alt.Y('row_number:O',axis=None),
    detail=['id:N','release_date:T']
).transform_filter(
    alt.datum.id == alt.expr.toNumber(movie_selection.id)
    # movie_selection
    # alt.datum.id == movie_selection.id
).transform_window(
    row_number='row_number()'
).transform_filter(
    'datum.row_number < 15'
)

# Data table: one text column per field.
# The column charts carry a `_col` suffix so they neither shadow the builtin `id`
# nor reuse the names of the data fields they display.
id_col = ranked_text.encode(text='id:N').properties(title=alt.TitleParams(text='ID', align='right'))
title_col = ranked_text.encode(text='title:N').properties(title=alt.TitleParams(text='Title', align='right'))
release_date_col = ranked_text.encode(text='release_date:T').properties(title=alt.TitleParams(text='Release Date', align='right'))
runtime_col = ranked_text.encode(text='runtime:Q').properties(title=alt.TitleParams(text='Runtime', align='right'))
# Year and month are derived from release_date; each column also carries its own
# point selection (year_selection / month_selection).
release_year_col = ranked_text.transform_calculate(
    release_year=alt.expr.year(alt.datum.release_date)
).encode(
    text='release_year:Q'
).add_params(
    year_selection
).properties(title=alt.TitleParams(text='Release Year', align='right'))
release_month_col = ranked_text.transform_calculate(
    release_month=alt.expr.month(alt.datum.release_date)+1
).encode(
    text='release_month:Q'
).add_params(
    month_selection
).properties(title=alt.TitleParams(text='Release Month', align='right'))
text = alt.hconcat(id_col, title_col, release_date_col, release_year_col, release_month_col, runtime_col)



# Layout: detail table on top, scatter + genre legend underneath
text & (chart | legend)

## Prototype 5

In [140]:
# Dropdown whose option values are time-format strings (labels: None/Week/Month/Quarter/Year)
period_dropdown = alt.binding_select(options=[None,'%U, %Y','%m %Y', '%q %Y', '%Y'], labels=["None", "Week", "Month", "Quarter", "Year"], name='period')
# Clicking a point selects that movie, keyed on id + release_date
movie_selection = alt.selection_point(fields=['id', 'release_date'])
# Holds the format string currently chosen in the dropdown
period_selection= alt.selection_point(fields=['period'], bind=period_dropdown)


## -------- START --------------- CHART
# Scatter of release date vs. rating. release_week feeds the tooltip; release_year
# and release_month are calculated but not referenced by any encoding or filter here.
chart = alt.Chart(movies_genre_df).mark_circle(size=100).encode(
    x=alt.X('release_date:T'),
    y=alt.Y('vote_average:Q', scale=alt.Scale(domain=[0, 10])),
    color='genre:N',
    # detail='release_date:T',
    tooltip=['title', 'runtime', 'vote_average', 'release_date:T', 'id:N', 'release_week:N'],
    opacity=alt.condition(movie_selection, alt.value(1), alt.value(0.3))
).transform_calculate(
    release_year=alt.expr.year(alt.datum.release_date),
    release_month=alt.expr.month(alt.datum.release_date)+1,
    release_week=alt.expr.timeFormat(alt.datum.release_date, '%U, %Y'),
    # release_quarter=alt.expr.timeFormat(alt.datum.release_date, '%q %Y')
).transform_filter(
    # genre_selection & year_selection & month_selection  
    # A row is kept when either
    #   (a) no movie has been clicked or no period value is defined, or
    #   (b) its release_date, formatted with the chosen period format string,
    #       equals the clicked movie's release_date formatted the same way.
    {
         'or': [
             {'or': [
                {'not': alt.expr.isDefined(movie_selection.release_date)},
                {'not': alt.expr.isDefined(period_selection.period)}
             ]},
                 
             ( alt.expr.timeFormat(alt.datum.release_date, alt.expr.toString(period_selection.period)) == alt.expr.timeFormat(alt.expr.toNumber(movie_selection.release_date), alt.expr.toString(period_selection.period)) ), 
             
         ]  #
     }
   
).properties(
    title="Relationship between release date and rating",
    width=800,
    height=400
).add_params(
    movie_selection,
    period_selection
).interactive(bind_y=False)  # pan/zoom along x only

chart

In [None]:
# https://uwdata.github.io/visualization-curriculum/altair_data_transformation.html