In [1]:
import pandas as pd
import altair as alt
from altair import Tooltip
import numpy as np
from IPython.display import display

# Relative path to the raw movie CSV.
file_path = r'movie_dataset.csv'

# Read the CSV once, parsing `release_date` into datetimes at load time.
# `cvd_data` and `movies_data` are two names for the SAME DataFrame object
# (no copy is made), so columns added through one name appear under the other.
movies_data = cvd_data = pd.read_csv(file_path, parse_dates=['release_date'])

# Select Altair's 'json' data transformer for every chart built afterwards.
alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

In [2]:
# A budget that is missing (NaN) or recorded as 0 is replaced by the movie's
# revenue in a single pass.
budget_unknown = movies_data['budget'].isna() | movies_data['budget'].eq(0)
movies_data['budget'] = movies_data['budget'].mask(budget_unknown, movies_data['revenue'])

def calculate_boi(row):
    """Classify one movie's box-office outcome from its revenue and budget.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'revenue'`` and ``'budget'`` (e.g. a DataFrame
        row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        ``'Flop'``     when revenue <  budget,
        ``'Average'``  when budget <= revenue < 1.5 * budget,
        ``'Hit'``      when 1.5 * budget <= revenue < 2 * budget,
        ``'Superhit'`` when revenue >= 2 * budget.
        If every comparison is false (e.g. a NaN operand) the result is
        ``'Average'``.
    """
    revenue = row['revenue']
    budget = row['budget']

    if revenue < budget:
        return 'Flop'
    if revenue >= 2 * budget:
        return 'Superhit'
    if revenue >= 1.5 * budget:
        # Reaching here means revenue is below twice the budget.
        return 'Hit'
    return 'Average'

# Label every movie with its box-office outcome (row-wise classification);
# any missing label defaults to 'Average'.
boi_labels = movies_data.apply(calculate_boi, axis=1)
movies_data['boi'] = boi_labels
movies_data['boi'] = movies_data['boi'].fillna('Average')

# Profit is revenue minus the (already imputed) budget.
movies_data['profit'] = movies_data['revenue'].sub(movies_data['budget'])

# Long-format copy: the comma-separated `genres` string is split on ', ' and
# exploded so each movie contributes one row per genre.
genre_lists = movies_data['genres'].str.split(', ')
genres_df = movies_data.assign(genres=genre_lists).explode('genres')

In [3]:
# Preview the exploded frame: a movie with several genres appears once per genre.
genres_df.head(5)

Unnamed: 0,id,title,vote_average,vote_count,release_date,revenue,runtime,adult,budget,original_language,popularity,genres,production_companies,production_countries,spoken_languages,iso_countries,oscar,boi,profit
0,951491,Saw X,7.321,56,2023-09-26,29300000,118,False,13000000,en,949.967,Crime,"Twisted Pictures, Lionsgate",United States of America,"English, Spanish",US,,Superhit,16300000
0,951491,Saw X,7.321,56,2023-09-26,29300000,118,False,13000000,en,949.967,Horror,"Twisted Pictures, Lionsgate",United States of America,"English, Spanish",US,,Superhit,16300000
0,951491,Saw X,7.321,56,2023-09-26,29300000,118,False,13000000,en,949.967,Thriller,"Twisted Pictures, Lionsgate",United States of America,"English, Spanish",US,,Superhit,16300000
1,299054,Expend4bles,6.685,81,2023-09-15,15000000,103,False,100000000,en,484.627,Action,"Millennium Media, Campbell Grobman Films, Lion...",United States of America,English,US,,Flop,-85000000
1,299054,Expend4bles,6.685,81,2023-09-15,15000000,103,False,100000000,en,484.627,Adventure,"Millennium Media, Campbell Grobman Films, Lion...",United States of America,English,US,,Flop,-85000000


In [13]:
# Interval brush restricted to the x axis; charts that filter on it follow the
# year range brushed on the chart it is attached to.
year_select = alt.selection_interval(encodings=['x'])

# Point selection projected on the exploded genre column, which is named
# 'genres' in genres_df (there is no 'genre' column).
# It is not attached to any chart in this cell.
genre_select = alt.selection_point(fields=['genres'])


# 1: Line chart of the number of movies released per year.
# This chart owns the `year_select` brush: dragging along its x axis defines
# the year range that charts filtering on `year_select` will show.
base_timeline = (
    alt.Chart(movies_data)
    .mark_line(point=True)
    .encode(
        x=alt.X('year(release_date):O', title='Released Year', axis=alt.Axis(labelAngle=-45)),
        y=alt.Y('count()', title='Count of movies released'),
        tooltip=[
            alt.Tooltip('year(release_date):O', title='Released Year'),
            alt.Tooltip('count()', title='Count of movies released'),
        ],
    )
    # `add_params` is the Altair 5 way to attach a selection; `add_selection`
    # is deprecated, and this cell already depends on Altair 5
    # (`alt.selection_point`).
    .add_params(year_select)
    .properties(
        title="Number of Movies Released per Year",
        width=800,
        height=400,
    )
)


# 2: Pie chart of the share of movies in each BOI class, by movie count.
# Pipeline: keep the brushed years -> count movies per BOI class -> attach the
# grand total to every row -> derive each class's share of that total.
pie_chart = (
    alt.Chart(movies_data)
    .transform_filter(year_select)
    .transform_aggregate(
        count='count()',
        groupby=['boi'],
    )
    .transform_joinaggregate(  # total number of movies across all BOI classes
        total_count='sum(count)',
    )
    .transform_calculate(  # fraction in [0, 1]; formatted as a percentage in the tooltip
        percentage="datum.count/datum.total_count"
    )
    .mark_arc()
    .encode(
        theta='sum(count):Q',
        color=alt.Color('boi:N', scale=alt.Scale(scheme='category20')),
        tooltip=[
            alt.Tooltip('boi:N', title='BOI'),
            alt.Tooltip('sum(count):Q', title='Count of Movies Released'),
            alt.Tooltip('percentage:Q', title='Percentage of Total Movies', format='.2%'),
        ],
    )
    .properties(
        width=400,
        height=400,
        title="Proportion of Movies by BOI (Based on Count)",
    )
    # No `.interactive()`: it binds pan/zoom to x/y scales, and an arc chart
    # encodes only theta/color, so there is nothing for it to bind to.
)


# 3: Pie chart of the share of total production budget in each BOI class.
# Pipeline: keep the brushed years -> sum budget per BOI class -> attach the
# grand total to every row -> derive each class's share of that total.
pie_chart2 = (
    alt.Chart(movies_data)
    .transform_filter(year_select)
    .transform_aggregate(
        sum_budget='sum(budget)',  # budget summed within each BOI class
        groupby=['boi'],
    )
    .transform_joinaggregate(
        total_budget='sum(sum_budget)'  # budget summed over all BOI classes
    )
    .transform_calculate(
        percentage="datum.sum_budget / datum.total_budget"  # fraction in [0, 1]
    )
    .mark_arc()
    .encode(
        theta=alt.Theta('sum(sum_budget):Q', title='Budget Proportion'),
        color=alt.Color('boi:N', scale=alt.Scale(scheme='category20')),
        tooltip=[
            alt.Tooltip('boi:N', title='BOI'),
            alt.Tooltip('sum(sum_budget):Q', title='Total Budget'),
            # The value is a share of budget, not of movie count, so label it as such.
            alt.Tooltip('percentage:Q', title='Percentage of Total Budget', format='.2%'),
        ],
    )
    .properties(
        width=400,
        height=400,
        title="Proportion of Movies by BOI (Based on Budget)",
    )
    # No `.interactive()`: it binds pan/zoom to x/y scales, and an arc chart
    # encodes only theta/color, so there is nothing for it to bind to.
)


# 4: Budget vs. revenue scatter, one point per movie in the brushed years.
# BOI class drives both colour and point shape; only the colour legend is
# shown because the shape legend is switched off.
scatter_boi = (
    alt.Chart(movies_data)
    .transform_filter(year_select)
    .mark_point()
    .encode(
        x=alt.X('budget:Q'),
        y=alt.Y('revenue:Q'),
        shape=alt.Shape('boi:N', legend=None),
        color=alt.Color('boi:N'),
        tooltip=['title:N', 'budget:Q', 'revenue:Q', 'boi:N', 'profit:Q'],
    )
    .properties(
        title="Budget and Revenue distribution by BOI",
        width=800,
        height=400,
    )
)


# 5: Scatter-plot matrix of profit, rating and budget, coloured by genre.
# Built from genres_df (one row per movie/genre pair), so a movie with several
# genres is drawn once per genre.
matrix_vars = ["profit", "vote_average", "budget"]

charts = []
for x_var in matrix_vars:
    for y_var in matrix_vars:
        chart = (
            alt.Chart(genres_df)
            .mark_circle()
            .encode(
                x=x_var,
                y=y_var,
                color=alt.Color('genres:N', scale=alt.Scale(scheme='category20')),
                tooltip=['genres:N', x_var, y_var, 'title:N'],
            )
            # Filter each cell on the year brush here, like the other
            # dependent charts do; this frame differs from movies_data, so the
            # filter has to live on the charts that use it.
            .transform_filter(year_select)
        )
        charts.append(chart)

# Lay the cells out row by row: each row holds one x variable paired with
# every y variable, in the order the charts were generated.
row_len = len(matrix_vars)
scatter_matrix = alt.vconcat(
    *[alt.hconcat(*charts[start:start + row_len]) for start in range(0, len(charts), row_len)]
).resolve_scale(
    x='independent', y='independent'
).properties(
    title="Covariance relation between Budget, Rating and Profit"
)



# 6: Budget vs. Revenue: Impact of Movie Ratings
# NOTE: despite their names, `revenue_min` / `revenue_max` hold bounds of the
# RATING (`vote_average`) and `revenue_color_scale` is the rating colour scale.
# The names are kept so any other cell that refers to them keeps working.
# The lower bound is the mean rating minus 2.5, so ratings below it all take
# the colour at the low end of the scale.
revenue_min = movies_data["vote_average"].mean() - 2.5
revenue_max = movies_data["vote_average"].max()

# Linear viridis colour scale over the rating range (only the x/y axes are logarithmic).
revenue_color_scale = alt.Scale(
    scheme="viridis",
    domain=[revenue_min, revenue_max]
)

# One square per movie in the brushed years, positioned on log-log axes and
# coloured by rating.
rating_chart = (
    alt.Chart(movies_data)
    .mark_square()
    .encode(
        # Budget and revenue are stored in raw dollars, so the labels say "$".
        x=alt.X("budget:Q", title="Budget ($) - Log scale", scale=alt.Scale(type="log"), axis=alt.Axis(grid=False)),
        y=alt.Y("revenue:Q", title="Revenue ($) - Log scale", scale=alt.Scale(type="log"), axis=alt.Axis(grid=False)),
        color=alt.Color(
            "vote_average:Q",
            scale=revenue_color_scale,
            title="Rating"
        ),
        tooltip=[
            alt.Tooltip("title:N"),
            alt.Tooltip("budget:Q", title="Budget ($)"),
            alt.Tooltip("revenue:Q", title="Revenue ($)"),
            alt.Tooltip("vote_average:Q", title="Rating"),
        ],
    )
    .properties(
        width=800, height=400, title="Budget vs. Revenue: Impact of Movie Ratings"
    )
    .transform_filter(year_select)
    # A log scale has no position for zero or negative values, so drop those
    # rows from this chart only.
    .transform_filter("datum.budget > 0 && datum.revenue > 0")
)


# Combine all charts into one vertically stacked dashboard.
# No filter is applied at the concat level: the sub-charts are built from
# different frames (movies_data and genres_df), so there is no shared
# dashboard-level dataset for it to act on, and a filter here would also
# target the timeline that owns the brush. Charts that should follow the
# brush filter on `year_select` themselves.
# Colour legends are resolved independently because the charts colour by
# different fields (BOI class, genre, rating).
dashboard = alt.vconcat(
    base_timeline,
    pie_chart,
    pie_chart2,
    scatter_boi,
    scatter_matrix,
    rating_chart
).resolve_legend(color="independent")


# Display the dashboard
dashboard

