In [26]:
import altair as alt
import pandas as pd
import numpy as np
# import timeit
# from IPython.display import Image, display

# Switch Altair's active data transformer to "vegafusion" for every chart
# built in this notebook.
# NOTE(review): presumably enabled so the charts are not subject to Altair's
# default row limit and so transforms are evaluated outside the browser —
# confirm, and confirm the vegafusion package is installed in the environment.
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

# 1. Dataset

## 1.1. Load CSV 

In [27]:
## Load the movie dataset; release_date is parsed into datetimes while reading
movies_df = pd.read_csv(
    "data/movie_dataset.csv",
    parse_dates=["release_date"],
)

# Report how many rows were loaded
print(movies_df.shape[0])

8747


In [28]:
# Work on a reproducible random subset of the data (fixed random_state),
# presumably to keep the interactive dashboard light.
# The sample size is capped at the number of available rows: DataFrame.sample
# raises ValueError when n exceeds the population and replace is False, which
# would break this cell for any dataset with fewer than 1000 rows.
movies_df = movies_df.sample(n=min(1000, len(movies_df)), random_state=40)

## 1.2. Create Genre DataFrame

In [29]:
## One row per (movie, genre): split the ', '-separated genres string into a
## list and explode that list into separate rows
df_genre_split = (
    movies_df
    .assign(genre=lambda frame: frame['genres'].str.split(', '))
    .explode('genre')
)

In [30]:
## Main genre = everything before the first comma in the genres column.
## expand=False makes str.extract return a Series for the single capture group.
movies_df["genre"] = movies_df["genres"].str.extract(r'([^,]+)', expand=False)

## 1.3. Create Production Country DataFrame

In [31]:
## One row per (movie, production country): split the ', '-separated
## production_countries string into a list and explode it into separate rows
df_production_countries_split = (
    movies_df
    .assign(
        production_country=lambda frame: frame['production_countries'].str.split(', ')
    )
    .explode('production_country')
)

In [32]:
## Main production country = everything before the first comma in the
## production_countries column.
## expand=False makes str.extract return a Series for the single capture group.
movies_df["production_country"] = movies_df["production_countries"].str.extract(
    r'([^,]+)', expand=False
)

# 2. Visualisation System

In [33]:
# Pixel dimensions the dashboard views are sized from: every chart below sets
# its height/width to one of these values or to half of it.
SYSTEM_WIDTH = 800
SYSTEM_HEIGHT = 750

In [54]:
## START - BRUSH #######################################################################################################################
# Brushing & Linking

## View 1 (revenue / budget bars): interval selection restricted to the x
## encoding, i.e. a range of years. Attached to both bar charts via add_params.
year_selection = alt.selection_interval(encodings=['x'])

## View 2 (top production countries): point selection keyed on the
## production_country field. Attached to the countries bar chart.
country_selection = alt.selection_point(fields=['production_country'])

## View 3 (runtime vs rating scatter): unconstrained interval selection.
## Attached to the scatter plot.
runtime_rating_brush = alt.selection_interval()

# Genre legend: point selection keyed on the genre field (a point selection,
# not an interval brush). Attached to the genre legend chart.
genre_selection = alt.selection_point(fields=['genre'])

## END - BRUSH #########################################################################################################################

## START - VIEW 1 - POSITIVE/NEGATIVE BAR CHART ############################################################
#BAR CHART: POSITIVE/NEGATIVE BAR CHART OF REVENUE AND BUDGET PER YEAR 

# Release year, derived for both frames because the year selection filters on
# the 'year' field in the movie-level charts and in the per-country chart.
movies_df['year'] = movies_df['release_date'].pipe(pd.to_datetime).dt.year
df_production_countries_split['year'] = (
    df_production_countries_split['release_date'].pipe(pd.to_datetime).dt.year
)

# Budget with flipped sign, so the budget bars can be drawn downwards.
movies_df['negative_budget'] = -movies_df['budget']

# Red bars: negated budget summed per release year. Cross-filtered by the
# genre selection, the scatter-plot brush and the selected production country;
# hosts the year selection, which dims the bars outside the selected range.
budget_chart = (
    alt.Chart(movies_df)
    .mark_bar(color='red')
    .encode(
        x=alt.X('year:O')
            .scale(domain=list(range(1990, 2024)))
            .axis(labels=False)
            .title('Year'),
        y=alt.Y('total_budget:Q').axis(title='Total Budget'),
        tooltip=[alt.Tooltip('year:O'), alt.Tooltip('total_budget:Q')],
        opacity=alt.condition(year_selection, alt.value(1), alt.value(0.2)),
    )
    .transform_filter(genre_selection & runtime_rating_brush)
    # Country filter: a row passes when the selection carries no defined
    # production_country value, or when that value is found inside the row's
    # production_countries string.
    .transform_filter(
        {
            'or': [
                {'not': alt.expr.isDefined(country_selection.production_country)},
                (alt.expr.indexof(alt.datum.production_countries, country_selection.production_country) != -1),
            ]
        }
    )
    .transform_aggregate(
        total_budget='sum(negative_budget)',
        groupby=['year'],
    )
    .add_params(year_selection)
    .properties(
        height=SYSTEM_HEIGHT / 2,
        width=SYSTEM_WIDTH,
    )
)

# Green bars: revenue summed per release year. Cross-filtered by the genre
# selection, the scatter-plot brush and the selected production country;
# hosts the year selection, which dims the bars outside the selected range.
revenue_chart = (
    alt.Chart(movies_df)
    .mark_bar(color='green')
    .encode(
        x=alt.X('year:O').scale(domain=list(range(1990, 2024))),
        y=alt.Y('total_revenue:Q').axis(title='Total Revenue'),
        tooltip=[alt.Tooltip('year:O'), alt.Tooltip('total_revenue:Q')],
        opacity=alt.condition(year_selection, alt.value(1), alt.value(0.2)),
    )
    .transform_filter(genre_selection & runtime_rating_brush)
    # Country filter: a row passes when the selection carries no defined
    # production_country value, or when that value is found inside the row's
    # production_countries string.
    .transform_filter(
        {
            'or': [
                {'not': alt.expr.isDefined(country_selection.production_country)},
                (alt.expr.indexof(alt.datum.production_countries, country_selection.production_country) != -1),
            ]
        }
    )
    .transform_aggregate(
        total_revenue='sum(revenue)',
        groupby=['year'],
    )
    .add_params(year_selection)
    .properties(
        title="Total Movies Revenue vs Budget per Year",
        height=SYSTEM_HEIGHT / 2,
        width=SYSTEM_WIDTH,
    )
)
## END - VIEW 1 - POSITIVE/NEGATIVE BAR CHART ###########################################################


## START - VIEW 2 - BAR CHART ############################################################
#BAR CHART: TOP 10 PRODUCTION COUNTRIES BY NUMBER OF MOVIES
# Horizontal bars: number of movies per production country, built from the
# exploded per-country frame. Pipeline: filter by the other views' selections,
# count rows per country, rank countries by that count (descending), relabel
# everything ranked below 10 as '_Others', then re-aggregate on the new label.
top_countries_chart = (
    alt.Chart(df_production_countries_split)
    .mark_bar()
    .encode(
        x=alt.X('count:Q').title('Number of Movies'),
        y=alt.Y('production_country:N').sort('-x').title('Production Country'),
        color=alt.Color('production_country:N').legend(None),
        tooltip=[alt.Tooltip('production_country:N'), alt.Tooltip('count:Q')],
        opacity=alt.condition(country_selection, alt.value(1), alt.value(0.2)),
    )
    .transform_filter(genre_selection & year_selection & runtime_rating_brush)
    .transform_aggregate(
        count='count()',
        groupby=['production_country'],
    )
    .transform_window(
        rank='row_number(count)',
        sort=[alt.SortField('count', order='descending')],
    )
    .transform_calculate(
        top_country="datum.rank <= 10 ? datum.production_country : '_Others'"
    )
    .transform_aggregate(
        count='sum(count)',
        groupby=["top_country"],
    )
    # Re-expose the label under the field name that country_selection is keyed on.
    .transform_calculate(production_country=alt.datum.top_country)
    .add_params(country_selection)
    .properties(
        title='Top 10 Countries by Number of Movies',
        height=SYSTEM_HEIGHT / 2,
        width=SYSTEM_WIDTH / 2,
    )
)

## END - VIEW 2 - BAR CHART ###########################################################



## START - VIEW 3 - SCATTER PLOT ############################################################

# Clickable genre legend.
# Colour rule: genres that satisfy genre_selection keep their genre colour,
# all others are drawn light gray.
genre_color = alt.condition(
    genre_selection,
    alt.Color('genre:N').legend(None),
    alt.value('lightgray'),
)

# Distinct main genres.
# NOTE(review): genre_list is not referenced anywhere else in this cell — the
# legend below is built from movies_df. Confirm whether a later cell needs it.
genre_list = pd.DataFrame({'genre': movies_df['genre'].unique()})

# The legend is a point chart over movies_df with genre on the y axis, filtered
# by the year selection, the scatter-plot brush and the selected country; it
# hosts genre_selection.
genre_legend = (
    alt.Chart(movies_df)
    .mark_point(size=200, filled=True)
    .encode(
        y=alt.Y('genre:N').axis(orient='right'),
        color=genre_color,
        opacity=alt.condition(genre_selection, alt.value(1), alt.value(0.2)),
    )
    .transform_filter(year_selection & runtime_rating_brush)
    # Country filter: a row passes when the selection carries no defined
    # production_country value, or when that value is found inside the row's
    # production_countries string.
    .transform_filter(
        {
            'or': [
                {'not': alt.expr.isDefined(country_selection.production_country)},
                (alt.expr.indexof(alt.datum.production_countries, country_selection.production_country) != -1),
            ]
        }
    )
    .add_params(genre_selection)
)


#SCATTER PLOT: RELATIONSHIP BETWEEN RUNTIME AND RATING/VOTE AVERAGE 

# Scatter plot: one circle per movie, runtime on x and average vote on y,
# coloured by main genre. Filtered by the genre selection, the year selection
# and the selected country; hosts the runtime/rating brush, which dims the
# points outside the brushed rectangle.
relationship_chart = (
    alt.Chart(movies_df)
    .mark_circle(size=50)
    .encode(
        x=alt.X('runtime:Q').title('Runtime (minutes)'),
        y=alt.Y('vote_average:Q').title('Rating'),
        color=alt.Color('genre:N'),
        opacity=alt.condition(runtime_rating_brush, alt.value(1), alt.value(0.2)),
        tooltip=[
            alt.Tooltip("title:N"),
            alt.Tooltip("budget:Q", title="Budget ($)"),
            alt.Tooltip("revenue:Q", title="Revenue ($)"),
            alt.Tooltip("vote_average:Q", title="Rating"),
            alt.Tooltip("genre:N", title="Genre"),
            alt.Tooltip("production_countries", title="Production Countries"),
            alt.Tooltip("release_date:T", title="Release Date"),
            alt.Tooltip("runtime:Q", title="Runtime"),
        ],
    )
    .transform_filter(genre_selection & year_selection)
    # Country filter: a row passes when the selection carries no defined
    # production_country value, or when that value is found inside the row's
    # production_countries string.
    .transform_filter(
        {
            'or': [
                {'not': alt.expr.isDefined(country_selection.production_country)},
                (alt.expr.indexof(alt.datum.production_countries, country_selection.production_country) != -1),
            ]
        }
    )
    .add_params(runtime_rating_brush)
    .properties(
        title="Relationship between runtime and rating",
        height=SYSTEM_HEIGHT / 2,
        width=SYSTEM_WIDTH / 2,
    )
)

## END - VIEW 3 - SCATTER PLOT ###########################################################



## START - CONSTRUCT SYSTEM ############################################################

# Top: revenue bars stacked above the budget bars.
chart1 = alt.vconcat(revenue_chart, budget_chart)
# Bottom left: top production countries.
chart2 = top_countries_chart
# Bottom right: scatter plot with the clickable genre legend beside it.
chart3 = alt.hconcat(relationship_chart, genre_legend)

# Final dashboard; as the last expression of the cell it is what gets rendered.
# The `&` operator form is kept here on purpose.
chart1 & alt.hconcat(chart2, chart3)
## END - CONSTRUCT SYSTEM ############################################################