In [14]:
import altair as alt
import pandas as pd
import numpy as np
# import timeit
# from IPython.display import Image, display

# Switch Altair's active data transformer to "vegafusion" for every chart built below.
# NOTE(review): presumably needed because the movie table is too large for Altair's
# default transformer; requires the optional vegafusion package - confirm it is installed.
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

# 1. Dataset

## 1.1. Load CSV 

In [15]:
# Load the movie table from disk; `release_date` is parsed as a date column at read time.
movies_df = pd.read_csv(
    "data/movie_dataset.csv",
    parse_dates=["release_date"],
)

# Preview the first five rows (head() defaults to n=5).
movies_df.head()

Unnamed: 0,id,title,vote_average,vote_count,release_date,revenue,runtime,adult,budget,original_language,popularity,genres,production_companies,production_countries,spoken_languages,iso_countries,oscar
0,951491,Saw X,7.321,56,2023-09-26,29300000,118,False,13000000,en,949.967,"Crime, Horror, Thriller","Twisted Pictures, Lionsgate",United States of America,"English, Spanish",US,Non-Awardee
1,299054,Expend4bles,6.685,81,2023-09-15,15000000,103,False,100000000,en,484.627,"Action, Adventure, Thriller","Millennium Media, Campbell Grobman Films, Lion...",United States of America,English,US,Non-Awardee
2,945729,A Haunting in Venice,6.786,519,2023-09-13,89800000,104,False,60000000,en,147.584,"Mystery, Thriller, Crime","20th Century Studios, Scott Free Productions, ...","United Kingdom, United States of America","English, Italian","GB, US",Non-Awardee
3,820525,After Everything,6.768,69,2023-09-13,3280152,93,False,14000000,en,474.472,"Romance, Drama","Voltage Pictures, Wattpad",United States of America,"English, Portuguese",US,Non-Awardee
4,872906,Jawan,7.4,67,2023-09-07,128690264,169,False,36150000,hi,52.487,"Action, Adventure, Thriller",Red Chillies Entertainment,India,Hindi,IN,Non-Awardee


## 1.2 Create Genre DF

In [16]:
# One row per (movie, genre): split the ", "-separated `genres` string into a list,
# then explode that list so every listed genre gets its own row in `genre`.
df_genre_split = movies_df.assign(
    genre=lambda frame: frame["genres"].str.split(", ")
).explode("genre")

In [17]:
# Main genre = the first run of non-comma characters in `genres`.
# expand=False makes str.extract return a Series for the single capture group,
# which is assigned as the new `genre` column.
movies_df["genre"] = movies_df["genres"].str.extract(r'([^,]+)', expand=False)

# 2. Visualisation System

In [18]:
# Overall dashboard size; each view below passes a fraction of these values
# to Altair's `height` / `width` chart properties.
SYSTEM_HEIGHT = 750
SYSTEM_WIDTH = 800

In [19]:
## START - VIEW 1 - BAR CHART ############################################################
#BAR CHART: TOP 10 PRODUCTION COUNTRIES BY NUMBER OF MOVIES

# One row per (movie, country): `production_countries` holds a ", "-separated list,
# so split it and explode. assign() works on a copy, leaving movies_df untouched.
# This frame is reused by the per-genre country view further down.
df_country_exploded = movies_df.assign(
    production_countries=movies_df['production_countries'].str.split(', ')
).explode('production_countries')

# value_counts() sorts by frequency (descending), so head(10) keeps the ten most
# frequent countries. rename_axis() pins the label column to 'production_countries':
# without it the column is called 'index' on pandas < 2.0 and the chart encoding
# below would reference a field that does not exist.
country_counts = (
    df_country_exploded['production_countries']
    .value_counts()
    .rename_axis('production_countries')
    .reset_index(name='count')
    .head(10)
)

# Horizontal bars, longest first (sort='-x' orders the y axis by descending x value).
top_countries_chart = alt.Chart(country_counts).mark_bar().encode(
    x='count:Q',
    y=alt.Y('production_countries:N', sort='-x'),
    tooltip=['production_countries', 'count']
).properties(
    title='Top 10 Countries by Number of Movies',
    height=SYSTEM_HEIGHT/2,
    width=SYSTEM_WIDTH/2
)

## START - VIEW 1.1(UNUSED) - BAR CHART ############################################################
#BAR CHART: TOP 10 PRODUCTION COUNTRIES BY NUMBER OF MOVIES, DIVIDED BY GENRE
# This chart is built but not placed into final_chart.

# Number of movies for every (country, main genre) pair.
genre_country_counts = (
    df_country_exploded
    .groupby(['production_countries', 'genre'])
    .size()
    .reset_index(name='count')
)

# Keep only the ten countries with the largest totals across all genres.
top_countries = genre_country_counts.groupby('production_countries')['count'].sum().nlargest(10).index
genre_country_counts_top = genre_country_counts[genre_country_counts['production_countries'].isin(top_countries)]

# Bars are coloured by genre; sum(count) is the per-country total used for bar length and sorting.
top_countries_genre_chart = alt.Chart(genre_country_counts_top).mark_bar().encode(
    x='sum(count):Q',
    y=alt.Y('production_countries:N', sort='-x'),
    color='genre:N',
    tooltip=['production_countries', 'genre', 'sum(count)']
).properties(
    title='Top 10 Countries by Number of Movies, Split by Genre',
    height=SYSTEM_HEIGHT/2,
    width=SYSTEM_WIDTH/2
)

## END - VIEW 1 - BAR CHART ############################################################


## START - VIEW 2 - BAR CHART (UNUSED) ############################################################
#BAR CHART: TOP 10 PRODUCTION COMPANY BY NUMBER OF MOVIES
# This chart is built but not placed into final_chart.

# One row per (movie, company): `production_companies` holds a ", "-separated list,
# so split it and explode. assign() works on a copy, leaving movies_df untouched.
# This frame is reused by the per-genre company view further down.
df_company_exploded = movies_df.assign(
    production_companies=movies_df['production_companies'].str.split(', ')
).explode('production_companies')

# value_counts() sorts by frequency (descending), so head(10) keeps the ten most
# frequent companies. rename_axis() pins the label column to 'production_companies':
# without it the column is called 'index' on pandas < 2.0 and the chart encoding
# below would reference a field that does not exist.
company_counts = (
    df_company_exploded['production_companies']
    .value_counts()
    .rename_axis('production_companies')
    .reset_index(name='count')
    .head(10)
)

# Horizontal bars, longest first (sort='-x' orders the y axis by descending x value).
top_company_chart = alt.Chart(company_counts).mark_bar().encode(
    x='count:Q',
    y=alt.Y('production_companies:N', sort='-x'),
    tooltip=['production_companies', 'count']
).properties(
    title='Top 10 Companies by Number of Movies',
    height=SYSTEM_HEIGHT/2,
    width=SYSTEM_WIDTH/2
)

## START - VIEW 2.1 - BAR CHART (UNUSED) ############################################################
#BAR CHART: TOP 10 PRODUCTION COMPANY BY NUMBER OF MOVIES, DIVIDED BY GENRE

# Number of movies for every (company, main genre) pair.
genre_company_counts = (
    df_company_exploded
    .groupby(['production_companies', 'genre'])
    .size()
    .reset_index(name='count')
)

# Index of the ten companies with the largest totals across all genres ...
top_companies = (
    genre_company_counts
    .groupby('production_companies')['count']
    .sum()
    .nlargest(10)
    .index
)
# ... and the count rows that belong to those companies.
genre_company_counts_top = genre_company_counts.loc[
    lambda counts: counts['production_companies'].isin(top_companies)
]

# Bars are coloured by genre; sum(count) is the per-company total used for bar length and sorting.
top_companies_genre_chart = (
    alt.Chart(genre_company_counts_top)
    .properties(
        title='Top 10 Companies by Number of Movies, Split by Genre',
        height=SYSTEM_HEIGHT/2,
        width=SYSTEM_WIDTH/2,
    )
    .mark_bar()
    .encode(
        x=alt.X('sum(count):Q'),
        y=alt.Y('production_companies:N', sort='-x'),
        color=alt.Color('genre:N'),
        tooltip=['production_companies', 'genre', 'sum(count)'],
    )
)

## END - VIEW 2 - BAR CHART ############################################################


## START - VIEW 3 - SCATTER PLOT ############################################################
#SCATTER PLOT: RELATIONSHIP BETWEEN RUNTIME AND RATING/VOTE AVERAGE

# One point per movie: runtime on x, average vote on y, coloured by main genre.
relationship_chart = (
    alt.Chart(movies_df)
    .properties(
        title="Relationship between runtime and rating",
        height=SYSTEM_HEIGHT/2,
        width=SYSTEM_WIDTH/2,
    )
    .mark_point(size=100)
    .encode(
        x=alt.X('runtime:Q'),
        y=alt.Y('vote_average:Q'),
        color=alt.Color('genre:N'),
        tooltip=['title', 'runtime', 'vote_average', 'release_date:T'],
    )
)

## END - VIEW 3 - SCATTER PLOT ###########################################################


## START - VIEW 4 - POSITIVE/NEGATIVE BAR CHART ############################################################
#BAR CHART: POSITIVE/NEGATIVE BAR CHART OF REVENUE AND BUDGET PER YEAR

# Derived columns for this view: the release year, and revenue mirrored below zero
# so it can be drawn downwards underneath the budget bars.
movies_df['year'] = pd.to_datetime(movies_df['release_date']).dt.year
movies_df['negative_revenue'] = -movies_df['revenue']

# Both charts aggregate explicitly with sum(), giving one bar per year whose height
# and tooltip are the yearly total. Encoding the raw column instead emits one bar
# mark per movie, leaves the visible height to Vega-Lite's implicit stacking rules,
# and makes each tooltip report a single movie rather than the year.

# Budget per year, drawn upwards. Year labels are hidden here because the revenue
# chart, which is stacked directly beneath this one, carries them.
budget_chart = alt.Chart(movies_df).mark_bar(color='red').encode(
    x=alt.X('year:O', axis=alt.Axis(labels=False)),
    y=alt.Y('sum(budget):Q', axis=alt.Axis(title='Budget')),
    tooltip=['year:O', alt.Tooltip('sum(budget):Q', title='Budget')]
).properties(
    height=SYSTEM_HEIGHT/2,
    width=SYSTEM_WIDTH
)

# Revenue per year, drawn downwards via the negated column; the tooltip reports
# the (positive) revenue total instead of the negated plotting value.
revenue_chart = alt.Chart(movies_df).mark_bar(color='green').encode(
    x='year:O',
    y=alt.Y('sum(negative_revenue):Q', axis=alt.Axis(title='Revenue')),
    tooltip=['year:O', alt.Tooltip('sum(revenue):Q', title='Revenue')]
).properties(
    height=SYSTEM_HEIGHT/2,
    width=SYSTEM_WIDTH
)
## END - VIEW 4 - POSITIVE/NEGATIVE BAR CHART ###########################################################


## START - CONSTRUCT SYSTEM ############################################################

# Top row: country bar chart next to the runtime/rating scatter plot, each with its own axes.
first_half_chart = (top_countries_chart | relationship_chart).resolve_scale(
    x='independent', y='independent'
)

# Bottom block: budget above revenue with no gap between them and a common year axis.
second_half_chart = alt.vconcat(
    budget_chart, revenue_chart, spacing=0
).resolve_scale(x='shared')

# Full dashboard: top row over bottom block, with scales kept independent between the two.
final_chart = (first_half_chart & second_half_chart).resolve_scale(
    x='independent', y='independent'
)

# Bare expression so the notebook renders the dashboard as the cell output.
final_chart
## END - CONSTRUCT SYSTEM ############################################################