In [1]:
import altair as alt
import pandas as pd
import numpy as np
# import timeit
# from IPython.display import Image, display

# Route chart data through Altair's "vegafusion" data transformer.
# NOTE(review): presumably enabled so charts can be built from the full movie
# table rather than a sample — confirm.
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [73]:
# Load the movie table; the path is relative to the notebook's working directory.
df = pd.read_csv("data/movie_dataset.csv")

In [74]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,id,title,vote_average,vote_count,release_date,revenue,runtime,adult,budget,original_language,popularity,genres,production_companies,production_countries,spoken_languages,iso_countries,oscar
0,951491,Saw X,7.321,56,9/26/2023,29300000,118,False,13000000,en,949.967,"Crime, Horror, Thriller","Twisted Pictures, Lionsgate",United States of America,"English, Spanish",US,Non-Awardee
1,299054,Expend4bles,6.685,81,9/15/2023,15000000,103,False,100000000,en,484.627,"Action, Adventure, Thriller","Millennium Media, Campbell Grobman Films, Lion...",United States of America,English,US,Non-Awardee
2,945729,A Haunting in Venice,6.786,519,9/13/2023,89800000,104,False,60000000,en,147.584,"Mystery, Thriller, Crime","20th Century Studios, Scott Free Productions, ...","United Kingdom, United States of America","English, Italian","GB, US",Non-Awardee
3,820525,After Everything,6.768,69,9/13/2023,3280152,93,False,14000000,en,474.472,"Romance, Drama","Voltage Pictures, Wattpad",United States of America,"English, Portuguese",US,Non-Awardee
4,872906,Jawan,7.4,67,9/7/2023,128690264,169,False,36150000,hi,52.487,"Action, Adventure, Thriller",Red Chillies Entertainment,India,Hindi,IN,Non-Awardee


In [75]:
# Split the comma-separated country string into a list and emit one row per
# (movie, country) pair.
# `assign` builds a new frame instead of overwriting the column on `df`, so
# this cell is safe to re-run: splitting an already split column again would
# silently turn every value into NaN.
df_country_exploded = df.assign(
    production_countries=df['production_countries'].str.split(', ')
).explode('production_countries')

In [76]:
# Count rows per producing country and keep the ten most frequent.
# `rename_axis` pins the name of the label column: after `reset_index` it is
# called "index" on pandas < 2.0 but "production_countries" on pandas >= 2.0,
# and the chart below must reference a column that actually exists.
country_counts = (
    df_country_exploded['production_countries']
    .value_counts()
    .rename_axis('production_countries')
    .reset_index(name='count')
    .head(10)
)

# Horizontal bar chart, longest bar first (sorted by the x value, descending).
top_countries_chart = alt.Chart(country_counts).mark_bar().encode(
    x='count:Q',
    y=alt.Y('production_countries:N', sort='-x'),
    tooltip=['production_countries', 'count']
).properties(
    title='Top 10 Countries by Number of Movies',
    height=400,
    width=400
)
top_countries_chart

In [77]:
# Split the comma-separated genre string and emit one row per
# (movie, country, genre) combination.
# `assign` returns a new frame, so `df_country_exploded` keeps its original
# string column and re-running this cell cannot re-split an already split
# column (which would produce NaN).
df_genres_exploded = df_country_exploded.assign(
    genres=df_country_exploded['genres'].str.split(', ')
).explode('genres')

# Number of rows for every (country, genre) pair.
genre_country_counts = (
    df_genres_exploded
    .groupby(['production_countries', 'genres'])
    .size()
    .reset_index(name='count')
)

# Keep the ten countries with the largest total across all genres.
# NOTE(review): a movie tagged with several genres contributes once per genre,
# so these totals (and the bar lengths below) count movie-genre rows rather
# than distinct movies — confirm this matches the chart title.
top_countries = genre_country_counts.groupby('production_countries')['count'].sum().nlargest(10).index
genre_country_counts_top = genre_country_counts[genre_country_counts['production_countries'].isin(top_countries)]

# Bars coloured by genre, countries ordered by total bar length.
top_countries_genre_chart = alt.Chart(genre_country_counts_top).mark_bar().encode(
    x='sum(count):Q',
    y=alt.Y('production_countries:N', sort='-x'),
    color='genres:N',
    tooltip=['production_countries', 'genres', 'sum(count)']
).properties(
    title='Top 10 Countries by Number of Movies, Split by Genre',
    height=400,
    width=400
)

top_countries_genre_chart

In [78]:
# Split the comma-separated company string into a list and emit one row per
# (movie, company) pair.
# `assign` builds a new frame instead of overwriting the column on `df`, so
# this cell is safe to re-run: splitting an already split column again would
# silently turn every value into NaN.
df_company_exploded = df.assign(
    production_companies=df['production_companies'].str.split(', ')
).explode('production_companies')

In [79]:
# Count rows per production company and keep the ten most frequent.
# `rename_axis` pins the name of the label column: after `reset_index` it is
# called "index" on pandas < 2.0 but "production_companies" on pandas >= 2.0,
# and the chart below must reference a column that actually exists.
company_counts = (
    df_company_exploded['production_companies']
    .value_counts()
    .rename_axis('production_companies')
    .reset_index(name='count')
    .head(10)
)

# Horizontal bar chart, longest bar first (sorted by the x value, descending).
top_company_chart = alt.Chart(company_counts).mark_bar().encode(
    x='count:Q',
    y=alt.Y('production_companies:N', sort='-x'),
    tooltip=['production_companies', 'count']
).properties(
    title='Top 10 Companies by Number of Movies',
    height=400,
    width=400
)

top_company_chart

In [80]:
# Split the comma-separated genre string and emit one row per
# (movie, company, genre) combination.
# `assign` returns a new frame, so `df_company_exploded` keeps its original
# string column and re-running this cell cannot re-split an already split
# column (which would produce NaN).
df_genres_exploded = df_company_exploded.assign(
    genres=df_company_exploded['genres'].str.split(', ')
).explode('genres')

# Number of rows for every (company, genre) pair.
genre_company_counts = (
    df_genres_exploded
    .groupby(['production_companies', 'genres'])
    .size()
    .reset_index(name='count')
)

# Keep the ten companies with the largest total across all genres.
# NOTE(review): a movie tagged with several genres contributes once per genre,
# so these totals (and the bar lengths below) count movie-genre rows rather
# than distinct movies — confirm this matches the chart title.
top_companies = genre_company_counts.groupby('production_companies')['count'].sum().nlargest(10).index
genre_company_counts_top = genre_company_counts[genre_company_counts['production_companies'].isin(top_companies)]

# Bars coloured by genre, companies ordered by total bar length.
top_companies_genre_chart = alt.Chart(genre_company_counts_top).mark_bar().encode(
    x='sum(count):Q',
    y=alt.Y('production_companies:N', sort='-x'),
    color='genres:N',
    tooltip=['production_companies', 'genres', 'sum(count)']
).properties(
    title='Top 10 Companies by Number of Movies, Split by Genre',
    height=400,
    width=400
)

top_companies_genre_chart

In [81]:
# Read a separate copy of the CSV with `release_date` parsed as datetimes, and
# add a `genre` column holding the text before the first comma of `genres`
# (i.e. the first genre listed for each movie).
movies_genre_df = (
    pd.read_csv("data/movie_dataset.csv", parse_dates=['release_date'])
    .assign(
        genre=lambda movies: movies["genres"].str.extract(r'([^,]+)', expand=False)
    )
)

# Scatter plot of rating against runtime, one point per movie, coloured by
# that first genre.
relationship_chart = (
    alt.Chart(movies_genre_df)
    .mark_point(size=100)
    .encode(
        alt.X('runtime:Q'),
        alt.Y('vote_average:Q'),
        alt.Color('genre:N'),
        tooltip=['title', 'runtime', 'vote_average', 'release_date:T'],
    )
    .properties(
        width=800,
        height=400,
        title="Relationship between runtime and rating",
    )
)

relationship_chart

In [82]:
# Put the scatter plot and the country ranking side by side (`|` is Altair's
# horizontal-concatenation operator) and give each panel its own x and y scale.
side_by_side = relationship_chart | top_countries_chart
first_half_chart = side_by_side.resolve_scale(y='independent', x='independent')
first_half_chart

In [92]:
# Release year of each movie; both bar charts below use it as their x axis.
df['year'] = pd.to_datetime(df['release_date']).dt.year

# NOTE(review): neither chart declares an aggregate, so one bar mark is emitted
# per movie row and each tooltip reports a single movie's value. If yearly
# totals are what should be shown, encode `sum(budget)` / `sum(revenue)`.

# Upper panel: budget per year, bars growing upwards. The x labels are hidden
# here because the lower panel shows the same year axis.
budget_chart = alt.Chart(df).mark_bar(color='red').encode(
    x=alt.X('year:O', axis=alt.Axis(labels=False)),
    y=alt.Y('budget:Q', axis=alt.Axis(title='Budget')),
    tooltip=['year:O', 'budget:Q']
).properties(
    width=1400,
)

# Lower panel: revenue per year, bars hanging downwards to mirror the budget
# panel. The mirroring is done with a reversed y scale rather than by plotting
# a negated copy of the column, so the axis ticks show real (positive) revenue
# values that agree with the axis title and the tooltip.
revenue_chart = alt.Chart(df).mark_bar(color='green').encode(
    x='year:O',
    y=alt.Y(
        'revenue:Q',
        axis=alt.Axis(title='Revenue'),
        scale=alt.Scale(reverse=True),
    ),
    tooltip=['year:O', 'revenue:Q']
).properties(
    width=1400,
)

# Stack the two panels with no gap and a shared year axis.
second_half_chart = alt.vconcat(
    budget_chart,
    revenue_chart,
    spacing=0
).resolve_scale(
    x='shared'
)

second_half_chart

In [93]:
# Assemble the dashboard: overview row on top, budget/revenue panels below
# (`&` is Altair's vertical-concatenation operator). The two halves keep
# independent x and y scales.
stacked_halves = first_half_chart & second_half_chart
final_chart = stacked_halves.resolve_scale(y='independent', x='independent')

final_chart