# Movie Analysis Dashboard

## Tasks:

## Data Preparation:
1. Reading Data: The code reads a movie dataset from a CSV file using Pandas.
2. Data Cleaning: Various data cleaning steps are performed, such as handling missing values and creating new columns like "boi" (Box Office Impact), "profit", etc.
3. Geospatial Data: Geospatial data for world countries is loaded using GeoPandas to map movies to their respective countries and continents.

In [2]:
#!pip install --upgrade altair vegafusion

import pandas as pd
import altair as alt
from altair import Tooltip, vconcat, hconcat
import numpy as np
import geopandas as gpd

# Load the movie dataset from CSV; 'release_date' is parsed into datetimes on read.
# `cvd_data` and `movies_data` are two names bound to the same DataFrame object.
file_path = r'data/movie_dataset.csv'
movies_data = cvd_data = pd.read_csv(file_path, parse_dates=['release_date'])

# Make "vegafusion" the active Altair data transformer
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [4]:
# Backfill unusable budgets: a missing (NaN) budget, and then a budget of exactly 0,
# is replaced by that movie's revenue.
movies_data['budget'] = (
    movies_data['budget']
    .fillna(movies_data['revenue'])
    .mask(lambda budget: budget == 0, movies_data['revenue'])
)

# Classify a movie's Box Office Impact (BOI) from its revenue and budget
def calculate_boi(row):
    """Return the BOI verdict for a single movie record.

    ``row`` is anything indexable by ``'revenue'`` and ``'budget'``
    (for example a DataFrame row passed by ``DataFrame.apply(axis=1)``).

    Verdicts:
        'Flop'      -- revenue is below budget
        'Superhit'  -- revenue is at least 2x budget
        'Hit'       -- revenue is at least 1.5x but under 2x budget
        'Average'   -- everything else, including rows for which every
                       comparison is false (e.g. NaN values)
    """
    revenue = row['revenue']
    budget = row['budget']

    if revenue < budget:
        return 'Flop'
    if revenue >= 2 * budget:
        return 'Superhit'
    if revenue >= 1.5 * budget:
        return 'Hit'
    return 'Average'

# Label every movie with its BOI verdict (computed row by row); any gap defaults to 'Average'
movies_data['boi'] = movies_data.apply(calculate_boi, axis=1).fillna('Average')

# Profit is revenue minus budget
movies_data['profit'] = movies_data['revenue'].sub(movies_data['budget'])

# Release year, taken from the parsed release date
movies_data['release_year'] = movies_data['release_date'].dt.year

# Movies without an 'oscar' value are recorded as 'No'
movies_data['oscar'] = movies_data['oscar'].fillna('No')

# Load geospatial data for world countries
# (the file name suggests the Natural Earth 1:50m admin-0 countries archive -- confirm)
local_map = "data/ne_50m_admin_0_countries.zip"
gdf_ne = gpd.read_file(local_map)
gdf_ne.columns = [column.lower() for column in gdf_ne.columns]

# Keep only the columns used below. The explicit copy makes the .loc assignments
# that follow write to an independent frame instead of a slice of the loaded one,
# which avoids pandas' chained-assignment (SettingWithCopy) hazard.
gdf_ne = gdf_ne[["name", "iso_a2", "continent", "pop_est", 'geometry']].copy()

# Force the two-letter ISO code for these countries
# (presumably the source file carries no usable code for them -- verify against the data)
iso_a2_overrides = {"Norway": "NO", "France": "FR", "Taiwan": "TW"}
for country_name, iso_code in iso_a2_overrides.items():
    gdf_ne.loc[gdf_ne["name"] == country_name, 'iso_a2'] = iso_code

# Restrict to the six listed continents; rows with any other continent value are dropped
gdf_ne = gdf_ne.query("continent in ['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']")

# One row per (movie, ISO country code): split the comma-separated 'iso_countries'
# string into a list and explode it, then count how many rows each code has.
iso_country_lists = movies_data['iso_countries'].str.split(', ')
production_country_df = movies_data.assign(iso_country=iso_country_lists).explode('iso_country')
movie_country_counts = (
    production_country_df
    .value_counts(['iso_country'])
    .reset_index()
)

# Build lookups keyed by ISO code: country_metadata_map['continent'][code] and
# country_metadata_map['name'][code].
iso_continent_df = pd.DataFrame(gdf_ne[["iso_a2", "continent", "name"]])

# Several rows can share one iso_a2 value; after the transpose those become duplicate
# column labels, and to_dict() then warns that columns are not unique and silently
# keeps only one of them. De-duplicate explicitly first. keep='last' retains the same
# row the implicit behaviour kept (later duplicates overwrite earlier ones), so the
# resulting mapping is unchanged -- only the warning disappears.
country_metadata_map = (
    iso_continent_df
    .drop_duplicates(subset='iso_a2', keep='last')
    .set_index('iso_a2')
    .T
    .to_dict('index')
)
country_continent_map = country_metadata_map['continent']

# Translate the ISO codes inside each 'iso_countries' string into continent names.
# Every ISO code is used as a regex pattern and replaced by its continent, so a
# comma-separated list of codes becomes a comma-separated list of continents.
iso_to_continent = pd.Series(country_continent_map).astype(str)
movies_data['continents'] = movies_data['iso_countries'].replace(iso_to_continent, regex=True)

# Extract the first entry of a comma-separated string (applied below to both the
# 'continents' and the 'production_countries' columns)
def extract_first_country_continent(country_continent_string):
    """Return the text before the first comma of ``country_continent_string``.

    A string without a comma is returned unchanged. Non-string values (such as
    the ``None``/NaN of a missing cell) are returned as-is instead of raising
    ``AttributeError``, so missing data stays missing.
    """
    if not isinstance(country_continent_string, str):
        return country_continent_string
    return country_continent_string.partition(',')[0]

# Reduce both comma-separated columns to their first entry
for column_name in ('continents', 'production_countries'):
    movies_data[column_name] = movies_data[column_name].apply(extract_first_country_continent)

# One row per (movie, genre): split the comma-separated 'genres' string and explode it
genre_lists = movies_data['genres'].str.split(', ')
genres_df = movies_data.assign(genres=genre_lists).explode('genres')

# Preview the first five prepared rows
movies_data.head(5)

  country_metadata_map = iso_continent_df.set_index('iso_a2').T.to_dict('index')


Unnamed: 0,id,title,vote_average,vote_count,release_date,revenue,runtime,adult,budget,original_language,...,genres,production_companies,production_countries,spoken_languages,iso_countries,oscar,boi,profit,release_year,continents
0,951491,Saw X,7.321,56,2023-09-26,29300000,118,False,13000000,en,...,"Crime, Horror, Thriller","Twisted Pictures, Lionsgate",United States of America,"English, Spanish",US,Non-Awardee,Superhit,16300000,2023,North America
1,299054,Expend4bles,6.685,81,2023-09-15,15000000,103,False,100000000,en,...,"Action, Adventure, Thriller","Millennium Media, Campbell Grobman Films, Lion...",United States of America,English,US,Non-Awardee,Flop,-85000000,2023,North America
2,945729,A Haunting in Venice,6.786,519,2023-09-13,89800000,104,False,60000000,en,...,"Mystery, Thriller, Crime","20th Century Studios, Scott Free Productions, ...",United Kingdom,"English, Italian","GB, US",Non-Awardee,Average,29800000,2023,Europe
3,820525,After Everything,6.768,69,2023-09-13,3280152,93,False,14000000,en,...,"Romance, Drama","Voltage Pictures, Wattpad",United States of America,"English, Portuguese",US,Non-Awardee,Flop,-10719848,2023,North America
4,872906,Jawan,7.4,67,2023-09-07,128690264,169,False,36150000,hi,...,"Action, Adventure, Thriller",Red Chillies Entertainment,India,Hindi,IN,Non-Awardee,Superhit,92540264,2023,Asia


## Data Visualization:
1. Budget vs. Revenue Scatter Plot: Visualizes the relationship between movie budget and revenue, with each point colored by its average rating. Brushing and linking are enabled to interactively explore the data.
2. Number of Movies Released per Year by Production Country (Stacked Bar Chart): Shows the count of movies released per year, stacked by production country (the top seven countries for each year). Allows selection of years using a point selection tool.
3. Pie Chart: Proportion of Movies by BOI (Based on Count): Represents the proportion of movies based on their Box Office Impact (BOI) using count aggregation.
4. Pie Chart: Proportion of Movies by BOI (Based on Budget): Illustrates the proportion of movies based on their BOI using budget aggregation.
5. Movie Lifespan by Genre (Box Plot): Displays the distribution of movie runtimes across different genres.

In [5]:
# Brushing & Linking: selection parameters shared by the charts below
# Interval (drag) selection; drives the opacity of base_chart and filters selected_points
brush = alt.selection_interval()
# Point (click) selection defined on the x and y encodings; also filters selected_points
click = alt.selection_point(encodings=['x', 'y'])
# Point selection on the 'release_year' field; attached to the stacked bar chart
year_selection = alt.selection_point(fields=['release_year'])

# 1. Visual: Rating Chart
# Color scale for the rating encoding. Despite the "revenue_" prefix, both bounds
# come from 'vote_average': the domain starts 2.5 below the mean rating and ends
# at the maximum rating.
vote_average = movies_data["vote_average"]
revenue_min = vote_average.mean() - 2.5
revenue_max = vote_average.max()
revenue_color_scale = alt.Scale(scheme="yelloworangered", domain=[revenue_min, revenue_max])

# Overview scatter plot: budget (x) against revenue (y), one square per movie,
# colored by rating. Points inside the `brush` selection are fully opaque, the
# rest are dimmed to 0.3.
base_x = alt.X("budget:Q", title="Budget ($)")
base_y = alt.Y("revenue:Q", title="Revenue ($)")
base_color = alt.Color("vote_average:Q", scale=revenue_color_scale, title="Rating")
base_tooltip = [
    alt.Tooltip("title:N"),
    alt.Tooltip("budget:Q", title="Budget ($)"),
    alt.Tooltip("revenue:Q", title="Revenue ($)"),
    alt.Tooltip("vote_average:Q", title="Rating"),
]
base_opacity = alt.condition(brush, alt.OpacityValue(1), alt.OpacityValue(0.3))

base_chart = alt.Chart(movies_data).mark_square().encode(
    x=base_x,
    y=base_y,
    color=base_color,
    tooltip=base_tooltip,
    opacity=base_opacity,
).properties(
    width=400,
    height=400,
    title="Budget vs. Revenue: Impact of Movie Ratings",
)

# Interval selections restricted to one axis each; they are bound to the scale
# domains of the detail chart below and attached to that same chart.
x_select = alt.selection_interval(encodings=["x"], name="x_axis")
y_select = alt.selection_interval(encodings=["y"], name="y_axis")

# Detail scatter plot: same encodings as the overview, but restricted to the
# movies that pass both the `brush` and the `click` selection.
detail_x = alt.X("budget:Q", title="Budget ($)", scale=alt.Scale(domain=x_select))
detail_y = alt.Y("revenue:Q", title="Revenue ($)", scale=alt.Scale(domain=y_select))
detail_color = alt.Color("vote_average:Q", scale=revenue_color_scale, title="Rating")
detail_tooltip = [
    alt.Tooltip("title:N"),
    alt.Tooltip("budget:Q", title="Budget ($)"),
    alt.Tooltip("revenue:Q", title="Revenue ($)"),
    alt.Tooltip("vote_average:Q", title="Rating"),
    alt.Tooltip("boi:N", title="Verdict"),
]

selected_points = (
    alt.Chart(movies_data)
    .mark_square(size=50)
    .encode(x=detail_x, y=detail_y, color=detail_color, tooltip=detail_tooltip)
    .transform_filter(brush)   # both filters apply, so a movie must satisfy each selection
    .transform_filter(click)
    .add_params(x_select, y_select)
    .interactive()
)

# 2. Stacked Bar Chart: Number of Movies Released per Year by Production Country
# Count movies per (release year, production country).
movies_per_country_continent = (
    movies_data
    .groupby(['release_year', 'production_countries'])
    .size()
    .reset_index(name='counts')
)

# The tooltip below shows a continent, but the aggregation above only keeps year,
# country and count. Attach one continent per production country (the first
# non-missing value seen for that country) so the tooltip field actually exists.
country_to_continent = movies_data.groupby('production_countries')['continents'].first()
movies_per_country_continent['continents'] = (
    movies_per_country_continent['production_countries'].map(country_to_continent)
)


def get_top_countries(group, n=7):
    """Return the ``n`` rows of ``group`` with the largest 'counts' values."""
    return group.sort_values(by='counts', ascending=False).head(n)


# Keep only the top countries within each release year
top_countries_per_continent = (
    movies_per_country_continent
    .groupby('release_year', group_keys=False)
    .apply(get_top_countries)
)

stack_bar_chart = (
    alt.Chart(top_countries_per_continent)
    .mark_bar()
    .encode(
        x=alt.X('release_year:O', title="Year Released", axis=alt.Axis(labelAngle=-45)),
        y=alt.Y('counts:Q', stack='zero', title="Count of movies released"),
        color=alt.Color('production_countries:N', scale=alt.Scale(scheme='set3'), title="Production Countries"),
        # Bars of the selected year stay opaque; all others are dimmed
        opacity=alt.condition(year_selection, alt.OpacityValue(1), alt.OpacityValue(0.3)),
        tooltip=[
            alt.Tooltip('release_year:O', title='Year Released'),
            alt.Tooltip('production_countries:N', title='Production Country'),
            alt.Tooltip('continents:N', title='Continent'),
            alt.Tooltip('counts:Q', title='Count of Movies'),
        ],
        order=alt.Order('release_year:O', sort='ascending'),
    )
    .properties(
        width=650,
        height=450,
        # The bars are split by production country, not by genre
        title='Number of Movies Released per Year by Production Country',
    )
)

# 3. Box Plot: Movie Lifespan by Genre
# Distribution of 'runtime' for each genre, using the exploded one-row-per-genre frame.
lifespan_chart = (
    alt.Chart(genres_df)
    .mark_boxplot()
    .encode(
        x=alt.X('genres:N', title="Genres", axis=alt.Axis(labelAngle=-45)),
        # The y channel is built with alt.Y (it was previously built with alt.X)
        y=alt.Y('runtime:Q', title="Runtime"),
        color=alt.Color('genres:N', scale=alt.Scale(scheme='category20'), title="Genres"),
        tooltip=[
            alt.Tooltip('title', title='Title'),
            alt.Tooltip('runtime', title='Runtime'),
        ],
    )
    .properties(
        width=700,
        height=500,
        title='Movie Lifespan by Genre',
    )
)

# 4. Pie Chart: Proportion of Movies by BOI (Based on Count)
# Transform pipeline: count movies per verdict, add the overall total to every
# row, then derive each verdict's share of that total.
pie_chart = (
    alt.Chart(movies_data)
    .transform_aggregate(count='count():Q', groupby=['boi'])
    .transform_joinaggregate(total_count='sum(count)')
    .transform_calculate(percentage="datum.count/datum.total_count")
    .mark_arc()
    .encode(
        theta=alt.Theta('sum(count):Q'),
        color=alt.Color('boi:N', scale=alt.Scale(scheme='category20'), title="Verdict"),
        tooltip=[
            alt.Tooltip('boi:N', title='Verdict'),
            alt.Tooltip('sum(count):Q', title='Count of Movies Released'),
            alt.Tooltip('percentage:Q', title='Percentage of Total Movies', format='.2%'),
        ],
    )
    .properties(
        width=270,
        height=270,
        title="Proportion of Movies by BOI (Based on Count)",
    )
)

# 5. Pie Chart: Proportion of Movies by BOI (Based on Budget)
# Transform pipeline: keep the movies of the selected year(s), sum the budget per
# verdict, add the overall total to every row, then derive each verdict's share.
#
# The year filter must run BEFORE the aggregation: the aggregated rows are grouped
# by 'boi' only and no longer carry 'release_year', so a filter placed after the
# aggregation has no year to compare against.
pie_chart2 = (
    alt.Chart(movies_data)
    .transform_filter(year_selection)
    .transform_aggregate(sum_budget='sum(budget):Q', groupby=['boi'])
    .transform_joinaggregate(total_budget='sum(sum_budget)')
    .transform_calculate(percentage="datum.sum_budget / datum.total_budget")
    .mark_arc()
    .encode(
        theta=alt.Theta('sum(sum_budget):Q', title='Budget Proportion'),
        color=alt.Color('boi:N', scale=alt.Scale(scheme='category20'), title="Verdict"),
        tooltip=[
            alt.Tooltip('boi:N', title='Verdict'),
            alt.Tooltip('sum(sum_budget):Q', title='Total Budget'),
            # The value is this verdict's share of the total budget, not of the movie count
            alt.Tooltip('percentage:Q', title='Percentage of Total Budget', format='.2%'),
        ],
    )
    .properties(
        width=270,
        height=270,
        title="Proportion of Movies by BOI (Based on Budget)",
    )
)

# Place the two pie charts side by side (the budget pie already follows the year selection)
combined_charts = pie_chart | pie_chart2

# Concatenate charts vertically; each selection is attached to the chart it is made on
rating_chart2 = alt.vconcat(
    (base_chart.add_params(brush, click) | selected_points),
    stack_bar_chart.add_params(year_selection),
    combined_charts,
    lifespan_chart,
)

# Display the final chart
rating_chart2