In [1]:
import altair as alt
import pandas as pd
import numpy as np
import timeit
from IPython.display import Image, display, HTML
import geopandas as gpd

alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [2]:
movies_df = pd.read_csv("data/movie_dataset.csv")

In [3]:
## Check data exist
movies_df.head(3)

Unnamed: 0.1,Unnamed: 0,id,title,vote_average,vote_count,release_date,revenue,runtime,adult,budget,original_language,popularity,genres,production_companies,production_countries,spoken_languages,iso_countries,oscar
0,0,1138352,Rupert’s 6,0.0,0,2024-11-02,20,27,False,300,en,0.891,Thriller,"Kleinhenz Jewelers, Bennett Family Dental",United States of America,English,US,
1,1,1241095,In The Morning,0.0,0,2024-02-08,1,24,False,1,en,0.0,"Drama, Romance",BlueBrick Productions,United States of America,English,US,
2,2,1236224,Nathalie: The first and last prom,1.0,1,2024-01-26,1,8,False,2,es,0.0,"Horror, Drama, Comedy, Documentary",Virgin Productions,Chile,Spanish,CL,


In [4]:
print("Rows: ", len(movies_df))
print("Columns: ", len(movies_df.columns))
print(list(movies_df.columns))
print("-----------")

Rows:  12469
Columns:  18
['Unnamed: 0', 'id', 'title', 'vote_average', 'vote_count', 'release_date', 'revenue', 'runtime', 'adult', 'budget', 'original_language', 'popularity', 'genres', 'production_companies', 'production_countries', 'spoken_languages', 'iso_countries', 'oscar']
-----------


In [5]:
alt.Chart(movies_df).mark_bar().encode(
    x='year(release_date):O',
    y='count()',
    tooltip='count()',
).interactive()


In [6]:
# Uncomment this if you want random sample from original dataset
# movies_df = movies_df.sample(n=1000,random_state=40)

# World Map

In [7]:
## Init world map data

local_map = "data/ne_50m_admin_0_countries.zip"
# local_map = "data/ne_110m_admin_0_countries.zip"
gdf_ne = gpd.read_file(local_map) 
gdf_ne.columns = map(str.lower, gdf_ne.columns)
# print(list(gdf_ne.columns))
gdf_ne = gdf_ne[["name", "iso_a2", "continent", "pop_est", 'geometry']]
gdf_ne.loc[gdf_ne["name"] == "Norway", 'iso_a2'] = "NO"
gdf_ne.loc[gdf_ne["name"] == "France", 'iso_a2'] = "FR"
gdf_ne.loc[gdf_ne["name"] == "Taiwan", 'iso_a2'] = "TW"

gdf_ne = gdf_ne.query("continent in ['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']")

In [8]:
production_country_df = movies_df.assign(iso_country=movies_df['iso_countries'].str.split(', ')).explode('iso_country')
movie_country_counts = production_country_df.value_counts(['iso_country']).reset_index()
movie_country_counts.head(3)

Unnamed: 0,iso_country,count
0,US,7006
1,GB,1510
2,FR,1211


In [9]:
# production_country_df
gdf_ne.columns

Index(['name', 'iso_a2', 'continent', 'pop_est', 'geometry'], dtype='object')

In [10]:
## DF to get continent and country name from iso A2
iso_continent_df = pd.DataFrame(gdf_ne[["iso_a2", "continent", "name"]])

In [11]:
country_continent_map = iso_continent_df.set_index('iso_a2').T.to_dict('index')
country_continent_map = country_continent_map['continent']
# origin_country_tvseries_df["country_id"] = origin_country_tvseries_df["origin_country"].map(countryid_map)
# country_continent_map

  country_continent_map = iso_continent_df.set_index('iso_a2').T.to_dict('index')


{'ZW': 'Africa',
 'ZM': 'Africa',
 'YE': 'Asia',
 'VN': 'Asia',
 'VE': 'South America',
 'VA': 'Europe',
 'VU': 'Oceania',
 'UZ': 'Asia',
 'UY': 'South America',
 'FM': 'Oceania',
 'MH': 'Oceania',
 'MP': 'Oceania',
 'VI': 'North America',
 'GU': 'Oceania',
 'AS': 'Oceania',
 'PR': 'North America',
 'US': 'North America',
 'PN': 'Oceania',
 'AI': 'North America',
 'FK': 'South America',
 'KY': 'North America',
 'BM': 'North America',
 'VG': 'North America',
 'TC': 'North America',
 'MS': 'North America',
 'JE': 'Europe',
 'GG': 'Europe',
 'IM': 'Europe',
 'GB': 'Europe',
 'AE': 'Asia',
 'UA': 'Europe',
 'UG': 'Africa',
 'TM': 'Asia',
 'TR': 'Asia',
 'TN': 'Africa',
 'TT': 'North America',
 'TO': 'Oceania',
 'TG': 'Africa',
 'TL': 'Asia',
 'TH': 'Asia',
 'TZ': 'Africa',
 'TJ': 'Asia',
 'TW': 'Asia',
 'SY': 'Asia',
 'CH': 'Europe',
 'SE': 'Europe',
 'SZ': 'Africa',
 'SR': 'South America',
 'SS': 'Africa',
 'SD': 'Africa',
 'LK': 'Asia',
 'ES': 'Europe',
 'KR': 'Asia',
 'ZA': 'Africa',
 '

In [12]:
## Maps continent information to dataframe
movies_df['continents'] = movies_df['iso_countries']
movies_df['continents'] = movies_df['continents'].replace(pd.Series(country_continent_map).astype(str), regex=True)
def remove_dupes(i1):  
    return ','.join(list(set(i1.split(", "))))

movies_df['continents'] = movies_df['continents'].apply(remove_dupes)

# movies_df[movies_df['continents'].str.contains(",")]

In [13]:
# movies_df['continents'] = movies_df['continents'].to_dict()
# list(dict.fromkeys(mylist))



In [14]:
## Hack: Move dropdown selection to top right as they are no working solution 
display(HTML("""
<style>
form.vega-bindings {
  position: absolute;
  right: 0px;
  top: 0px;
}
</style>
"""))

In [39]:
## brush
country_brush = alt.selection_point(fields=['iso_country'])
continent_dropdown = alt.binding_select(options=[None, 'Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America'],
                                       labels=['All', 'Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America'], name='Continent')
continent_selection = alt.selection_point(fields=['continent'], bind=continent_dropdown)

## background world map
background_world = alt.Chart(gdf_ne).mark_geoshape(fill="white", stroke='lightgray')

## content world map
world_movies = alt.Chart(gdf_ne).mark_geoshape(
    stroke='lightgray'
).encode(
    tooltip=['name:N', 'count:Q'],
    color='count:Q',
    opacity=alt.condition(country_brush, alt.OpacityValue(1), alt.OpacityValue(0.2))
).transform_lookup(
    lookup='iso_a2',
    from_=alt.LookupData(movie_country_counts, 'iso_country', list(movie_country_counts.columns))
)

map = (background_world+world_movies).project(
    ##naturalEarth1
    type= 'mercator',
    rotate=[0, 0 ,0],
    # scale=100
).properties(
    title="Global Cinematic Landscape: Total Movie Productions by Country",
    width=900,
    height=500
).add_params(
    country_brush,
    continent_selection
).transform_filter(
    # alt.datum.CONTINENT == 'North America'
    continent_selection
)

# Bar chart - Movies count by country 
movies_count_bar = alt.Chart(movie_country_counts).mark_bar(    
).encode(
    x='count:Q',
    y=alt.Y('name:O').sort('-x'),
    tooltip=['name:O', 'count:Q']
).transform_lookup(
    lookup='iso_country',
    from_=alt.LookupData(iso_continent_df, 'iso_a2', ['continent', 'name'])
).transform_filter(
    continent_selection
).transform_window(
    rank='dense_rank(count)',
    sort=[alt.SortField('rank', order='descending')]
).transform_filter(
    (alt.datum.rank <= 10)
).properties(
    title="Top 10 Production Countries"
)

## Bar chart - Top 10 movies by revenue
movie_rev_bar = alt.Chart(movies_df).mark_bar(    
).encode(
    x='revenue:Q',
    y=alt.Y('title:O').sort('-x'),
    tooltip=['title:O', 'revenue:Q', 'production_countries:O']
).transform_filter(
     {
         'or': [
             {'not': alt.expr.isDefined(continent_selection.continent)}, 
             (alt.expr.indexof(alt.datum.continents, continent_selection.continent) != -1)
         ]  #
     }
).transform_filter(
     {
         'or': [
             {'not': alt.expr.isDefined(country_brush.iso_country)}, 
             (alt.expr.indexof(alt.datum.iso_countries, country_brush.iso_country) != -1)
         ]  #
     }   
).transform_window(
    rank='dense_rank(revenue)',
    sort=[alt.SortField('revenue', order='descending')]
).transform_filter(
    (alt.datum.rank <= 10)
).properties(
    title="Top 10 Movies by Revenue *in USD"
)

# scatter, budget and revenue 
budget_rev_brush = alt.selection_interval()
budget_rev_df = alt.Chart(movies_df).mark_circle(size=60).encode(
    x='budget:Q',
    y='revenue:Q',
    # color='Origin',
    tooltip=['title', 'revenue', 'budget']
).transform_filter(
     {
         'or': [
             {'not': alt.expr.isDefined(continent_selection.continent)}, 
             (alt.expr.indexof(alt.datum.continents, continent_selection.continent) != -1)
         ]  #
     }
).transform_filter(
     {
         'or': [
             {'not': alt.expr.isDefined(country_brush.iso_country)}, 
             (alt.expr.indexof(alt.datum.iso_countries, country_brush.iso_country) != -1)
         ]  #
     }   
).properties(
    title="Relationship between budget and revenue",
    width=500,
    height=300
).add_params(budget_rev_brush)##.interactive()


ranked_text = alt.Chart(movies_df).mark_text(align='right').encode(
    y=alt.Y('row_number:O').axis(None)
).transform_filter(
     {
         'or': [
             {'not': alt.expr.isDefined(continent_selection.continent)}, 
             (alt.expr.indexof(alt.datum.continents, continent_selection.continent) != -1)
         ]  #
     }
).transform_filter(
     {
         'or': [
             {'not': alt.expr.isDefined(country_brush.iso_country)}, 
             (alt.expr.indexof(alt.datum.iso_countries, country_brush.iso_country) != -1)
         ]  #
     }   
).transform_filter(
    budget_rev_brush
).transform_window(
    row_number='row_number()'
).transform_filter(
    alt.datum.row_number < 15
)

# Data Tables
movie_title = ranked_text.encode(text='title:O').properties(
    title=alt.Title(text='Title', align='right')
)
movie_revenue = ranked_text.encode(text=alt.expr.format('revenue',',~s')).properties(
    title=alt.Title(text='Revenue', align='right')
)
movie_budget = ranked_text.encode(text='budget:N').properties(
    title=alt.Title(text='Budget', align='right')
)
movie_scatter_rank = alt.hconcat(movie_title, movie_revenue, movie_budget)

## Custom title *use with addition e.g. chart + movie_rev_title
# movie_rev_title = alt.Chart().mark_text(
#     align="center",
#     baseline="top",
#     fontSize=16
# ).encode(
#     # x=alt.value(200),  # pixels from left
#     y=alt.value(-20),  # pixels from top
#     text=alt.value(country_brush.iso_country)
# ).add_params(
#     country_brush, continent_selection
# )

## Custom table list
# ranked_text = alt.Chart(revenune_df).mark_text(align='left').encode(
#     y=alt.Y('rank:O', sort=["revenue"]).axis(None)
# ).transform_filter(
#      {
#          'or': [
#              {'not': alt.expr.isDefined(country_brush.iso_country)}, 
#              (alt.expr.indexof(alt.datum.iso_countries, country_brush.iso_country) != -1)
#          ]  #
#      }   
#     # (alt.expr.indexof(alt.datum.iso_countries, country_brush.iso_country) != -1)
# ).add_params(
#     country_brush
# ).transform_window(
#     rank='rank(revenue)',
#     sort=[alt.SortField('rank', order='descending')]
# ).transform_filter(
#     (alt.datum.rank < 10)
# )

# movie_title = ranked_text.encode(text='title:O').properties(
#     title=alt.Title(text='Title', align='left')
# )
# movie_rev = ranked_text.encode(text='revenue:N').properties(
#     title=alt.Title(text='Revenue', align='left')
# )
# movie_country = ranked_text.encode(text='production_countries:O').properties(
#     title=alt.Title(text='Production Countries', align='left')
# )
# text = alt.hconcat(movie_title, movie_rev, movie_country)


map & (movies_count_bar | movie_rev_bar) & (budget_rev_df | movie_scatter_rank)

SchemaValidationError: 'format('revenue',',~s')' is an invalid value for `text`. Valid values are of type 'object'.

alt.VConcatChart(...)

In [16]:
# ranked_text = alt.Chart(revenune_df).mark_text(align='left').encode(
#     y=alt.Y('rank:O', sort=["revenue"]).axis(None)
# ).transform_filter(
#      {
#          'or': [
#              {'not': alt.expr.isDefined(country_brush.iso_country)}, 
#              (alt.expr.indexof(alt.datum.iso_countries, country_brush.iso_country) != -1)
#          ]  #
#      }   
#     # (alt.expr.indexof(alt.datum.iso_countries, country_brush.iso_country) != -1)
# ).add_params(
#     country_brush
# ).transform_window(
#     rank='rank(revenue)',
#     sort=[alt.SortField('rank', order='descending')]
# ).transform_filter(
#     (alt.datum.rank < 10)
# )

# movie_title = ranked_text.encode(text='title:O').properties(
#     title=alt.Title(text='Title', align='left')
# )
# movie_rev = ranked_text.encode(text='revenue:N').properties(
#     title=alt.Title(text='Revenue', align='left')
# )
# movie_country = ranked_text.encode(text='production_countries:O').properties(
#     title=alt.Title(text='Production Countries', align='left')
# )
# text = alt.hconcat(movie_title, movie_rev, movie_country)

In [19]:
# movie_country_counts

In [18]:
# movies_df['release_date'] = pd.to_datetime(movies_df['release_date'])