In [68]:
import pandas as pd
import altair as alt
from altair import Tooltip
from IPython.display import display

# Path of the movie export, resolved relative to the notebook's working directory.
file_path = r'movie_dataset.csv'
# Parse release_date while reading so later cells can group and filter on real dates.
cvd_data = pd.read_csv(file_path, parse_dates=['release_date'])

# Work on an independent copy. A plain assignment would only alias cvd_data, so the
# in-place cleaning done on movies_data in later cells would also rewrite the raw frame.
movies_data = cvd_data.copy()

# Use Altair's 'json' data transformer for chart data (it stores the data in a JSON
# file that the chart spec references instead of inlining every row).
alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

In [69]:
# A budget that is missing or recorded as 0 is treated as unknown and replaced by the
# movie's revenue; every other budget is kept as is.
budget_unknown = movies_data['budget'].isna() | movies_data['budget'].eq(0)
movies_data['budget'] = movies_data['budget'].where(~budget_unknown, movies_data['revenue'])

def calculate_boi(row):
    """Classify one movie by how its revenue compares with its budget.

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys 'revenue' and 'budget' (for example a
        DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        'Flop'     when revenue <  budget
        'Average'  when budget <= revenue < 1.5 * budget
        'Hit'      when 1.5 * budget <= revenue < 2 * budget
        'Superhit' when revenue >= 2 * budget
        'Average' is also the fallback when none of the comparisons holds
        (for example when a value is NaN, because NaN comparisons are False).
    """
    revenue = row['revenue']
    budget = row['budget']

    if revenue < budget:
        return 'Flop'
    if revenue >= 2 * budget:
        return 'Superhit'
    if revenue >= 1.5 * budget:
        return 'Hit'
    return 'Average'

# Label every movie with its box office indicator (see calculate_boi).
movies_data['boi'] = movies_data.apply(calculate_boi, axis=1)
# Defensive default for any row that still has no label.
movies_data['boi'] = movies_data['boi'].fillna('Average')

# Profit uses the imputed budget, so rows whose budget was replaced by revenue get 0.
movies_data['profit'] = movies_data['revenue'] - movies_data['budget']

# Turn the comma-separated genre string into a list of genre names.
# Only string values are split: missing values pass through untouched, and values that
# are already lists (the cell was re-run) are left alone. `.str.split` would instead
# turn already-split lists into NaN on a second run.
movies_data['genres'] = movies_data['genres'].apply(
    lambda value: value.split(', ') if isinstance(value, str) else value
)
# Array of the distinct genre names found across all movies.
genres_data = movies_data['genres'].explode().dropna().unique()

In [77]:
# Preview the first rows (head() defaults to 5) to check the derived columns.
movies_data.head()

Unnamed: 0,id,title,vote_average,vote_count,release_date,revenue,runtime,adult,budget,original_language,popularity,genres,production_companies,production_countries,spoken_languages,iso_countries,boi,profit
0,1138352,Rupert’s 6,0.0,0,2024-11-02,20,27,False,300,en,0.891,[Thriller],"Kleinhenz Jewelers, Bennett Family Dental",United States of America,English,US,Flop,-280
1,1241095,In The Morning,0.0,0,2024-02-08,1,24,False,1,en,0.0,"[Drama, Romance]",BlueBrick Productions,United States of America,English,US,Average,0
2,1236224,Nathalie: The first and last prom,1.0,1,2024-01-26,1,8,False,2,es,0.0,"[Horror, Drama, Comedy, Documentary]",Virgin Productions,Chile,Spanish,CL,Flop,-1
3,1237502,SILENTLY UNITED,0.0,0,2024-01-25,5000,13,False,300,en,0.0,"[Drama, Adventure]",Smooth Feather Productions,United States of America,English,US,Superhit,4700
4,1236207,Tim Kim's Great Escape,0.0,0,2024-01-25,25,2,False,1316,en,0.0,[Action],Tickle Gang,United States of America,English,US,Flop,-1291


In [89]:
# Brush along the timeline's x axis; the charts below filter their rows with it.
year_select = alt.selection_interval(encodings=['x'])

# Genre Selection
genres = ['Thriller', 'Drama', 'Romance', 'Horror', 'Comedy', 'Documentary', 'Adventure',
          'Action', 'Music', 'Crime', 'History', 'Fantasy', 'Animation', 'TV Movie',
          'Family', 'Science Fiction', 'Mystery', 'Western', 'War']
# The None option is shown as 'All' and clears the genre choice.
genres_dropdown = alt.binding_select(options=[None] + genres,
                                     labels=['All'] + genres, name='Genres')
# The rest of the notebook refers to this selection as `genre_select`; it used to be
# bound only to `genres_selection`, which raised NameError on a fresh kernel.
# A dropdown holds one value at a time, so a single-value selection is bound to it.
genre_select = alt.selection_single(fields=['genres'], bind=genres_dropdown, name='Genres')
# Keep the previous name available for any code that still uses it.
genres_selection = genre_select


# Step 1: count the movies released in each calendar year.
yearly_buckets = pd.Grouper(key='release_date', freq='Y')
movie_counts_per_year = (
    movies_data
    .groupby(yearly_buckets)
    .size()
    .reset_index(name='movie_count')
)

# Step 2: Visualizing with 'mark_line'
timeline_tooltips = [
    alt.Tooltip('year(release_date):T', title='Released Year'),
    alt.Tooltip('movie_count:Q', title='Count of movies released'),
]
base_timeline = (
    alt.Chart(movie_counts_per_year)
    .mark_line(point=True)
    .encode(
        x=alt.X('year(release_date):T', title='Released Year'),
        y=alt.Y('movie_count:Q', title='Count of movies released'),
        tooltip=timeline_tooltips,
    )
    .properties(
        title="Number of Movies Released per Year",
        width=800,
        height=400,
    )
    # The year brush lives on this chart; the other charts only read it.
    .add_selection(year_select)
)

# Bar Chart for BOI Distribution
# The colour condition reads the `genres` field, so that field has to survive the
# aggregation and hold a single genre name per row:
#   * transform_flatten turns each movie's list of genres into one row per genre, so a
#     row's genre can be compared with the value picked in the dropdown;
#   * grouping by both `boi` and `genres` keeps the genre on the aggregated rows
#     (grouping by `boi` alone dropped it, leaving the condition nothing to test).
# Consequence: a movie with several genres contributes one unit to each of its genres,
# so a bar's total height counts movie-genre pairs rather than distinct movies.
bar_chart = alt.Chart(movies_data).mark_bar().encode(
    x=alt.X('boi:N', title='Box Office Indicator'),
    y='sum(count):Q',
    color=alt.condition(genre_select, alt.Color('genres:N', legend=None), alt.value('lightgray')),
    tooltip=['boi:N', 'genres:N', 'sum(count):Q']
).transform_filter(
    # Keep only the movies inside the year range brushed on the timeline.
    year_select
).transform_flatten(
    ['genres']
).transform_aggregate(
    count='count():Q',
    groupby=['boi', 'genres']
).properties(
    width=800,
    height=400
)

# Pie Chart for BOI Proportion
pie_tooltips = [
    alt.Tooltip('boi:N', title='BOI'),
    alt.Tooltip('sum(count):Q', title='Count of Movies Released'),
    # Share of all movies in the brushed range, shown as a percentage.
    alt.Tooltip('percentage:Q', title='Percentage of Total Movies', format='.2%'),
]
pie_chart = (
    alt.Chart(movies_data)
    .mark_arc()
    .encode(
        theta='sum(count):Q',
        color=alt.Color('boi:N', scale=alt.Scale(scheme='category20')),
        tooltip=pie_tooltips,
    )
    .properties(
        width=400,
        height=400,
        title="Proportion of Movies by Box Office Indicator",
    )
    # Transforms run in the order they are added:
    # 1. keep the movies inside the brushed year range,
    .transform_filter(year_select)
    # 2. count the movies per box office indicator,
    .transform_aggregate(count='count():Q', groupby=['boi'])
    # 3. attach the overall number of movies to every indicator row,
    .transform_joinaggregate(total_count='sum(count)')
    # 4. derive each indicator's share of that total.
    .transform_calculate(percentage="datum.count / datum.total_count")
    .interactive()
)


# Scatter Plot for BOI Distribution
# One point per movie: budget against revenue, with the box office indicator encoded
# twice (shape and colour).
scatter_tooltips = ['title:N', 'budget:Q', 'revenue:Q', 'boi:N', 'profit:Q']
scatter_boi = (
    alt.Chart(movies_data)
    .mark_point()
    .encode(
        x='budget:Q',
        y='revenue:Q',
        shape=alt.Shape('boi:N'),
        color=alt.Color('boi:N'),
        tooltip=scatter_tooltips,
    )
    .properties(width=800, height=400)
    # Restrict the points to the year range brushed on the timeline.
    .transform_filter(year_select)
)

# Combine all charts: stack the four views vertically, top to bottom.
stacked_views = alt.vconcat(base_timeline, bar_chart, pie_chart, scatter_boi)
dashboard = (
    stacked_views
    # Give each view its own colour legend instead of one shared legend.
    .resolve_legend(color="independent")
    # Register the genre dropdown selection on the combined chart.
    .add_selection(genre_select)
)

# Display the dashboard
dashboard

