In [1]:
import altair as alt
import pandas as pd
import numpy as np
import timeit
from IPython.display import Image, display

# Switch Altair to the "vegafusion" data transformer for every chart below.
# NOTE(review): presumably enabled so charts built straight from the full
# DataFrame are not blocked by Altair's default row limit -- confirm.
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [2]:
# Load the cleaned TMDB TV-show table that every later cell works from.
tmdb_csv_path = "data/cleaned_TMDB_tv_dataset.csv"
df = pd.read_csv(tmdb_csv_path)

In [3]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,name,number_of_seasons,number_of_episodes,original_language,vote_count,vote_average,adult,first_air_date,...,status,genres,created_by,languages,networks,origin_country,spoken_languages,production_companies,production_countries,episode_run_time
0,0,1399,Game of Thrones,8,73,en,21857,8.442,False,4/17/2011,...,Ended,"Sci-Fi & Fantasy, Drama, Action & Adventure","David Benioff, D.B. Weiss",en,HBO,US,English,"Revolution Sun Studios, Television 360, Genera...","United Kingdom, United States of America",0
1,1,71446,Money Heist,3,41,es,17836,8.257,False,5/2/2017,...,Ended,"Crime, Drama",Álex Pina,es,"Netflix, Antena 3",ES,Español,Vancouver Media,Spain,70
2,2,66732,Stranger Things,4,34,en,16161,8.624,False,7/15/2016,...,Returning Series,"Drama, Sci-Fi & Fantasy, Mystery","Matt Duffer, Ross Duffer",en,Netflix,US,English,"21 Laps Entertainment, Monkey Massacre Product...",United States of America,0
3,3,1402,The Walking Dead,11,177,en,15432,8.121,False,10/31/2010,...,Ended,"Action & Adventure, Drama, Sci-Fi & Fantasy",Frank Darabont,en,AMC,US,English,"AMC Studios, Circle of Confusion, Valhalla Mot...",United States of America,42
4,4,63174,Lucifer,6,93,en,13870,8.486,False,1/25/2016,...,Ended,"Crime, Sci-Fi & Fantasy",Tom Kapinos,en,"FOX, Netflix",US,English,"Warner Bros. Television, DC Entertainment, Jer...",United States of America,45


In [4]:
# List every column name on its own line (iterating a DataFrame yields its
# column labels).
for column_name in list(df):
    print(column_name)

Unnamed: 0
id
name
number_of_seasons
number_of_episodes
original_language
vote_count
vote_average
adult
first_air_date
last_air_date
in_production
original_name
popularity
type
status
genres
created_by
languages
networks
origin_country
spoken_languages
production_companies
production_countries
episode_run_time


In [5]:
# Turn the comma-separated genre string into one row per (show, genre) pair.
df = df.assign(genres=df['genres'].str.split(', ')).explode('genres')

# Year in which each show first aired ('first_air_date' is assumed to be in
# 'mm/dd/yyyy' format).
df['year'] = pd.to_datetime(df['first_air_date']).dt.year

# Mean vote count for every genre/year combination.
genre_yearly = df.groupby(['genres', 'year'], as_index=False)['vote_count'].mean()

# One line per genre, years along the x axis.
line_chart = alt.Chart(
    genre_yearly,
    title='Average Vote Count by Genre Over Time',
).mark_line().encode(
    x=alt.X('year:O'),
    y=alt.Y('vote_count:Q'),
    color=alt.Color('genres:N'),
    tooltip=['genres', 'year', 'vote_count'],
)

line_chart

In [6]:
# Mean rating per genre, one coloured bar for each genre.
bar_chart = alt.Chart(df, title='Average Vote by Genre').mark_bar().encode(
    x=alt.X('genres:N'),
    y=alt.Y('average(vote_average):Q'),
    color=alt.Color('genres:N'),
    tooltip=['genres', 'average(vote_average)'],
)

bar_chart

In [7]:
# Votes received against mean rating, one point per (show, genre) row,
# coloured by genre.
scatter_plot = alt.Chart(
    df,
    title='Vote Count vs. Average Vote by Genre',
).mark_circle(size=60).encode(
    x=alt.X('vote_count:Q'),
    y=alt.Y('vote_average:Q'),
    color=alt.Color('genres:N'),
    tooltip=['name', 'vote_count', 'vote_average', 'genres'],
)

scatter_plot

In [8]:
# Area chart of mean popularity per first-air year, one coloured band per genre.
area_chart = alt.Chart(df, title='Popularity Over Time by Genre').mark_area().encode(
    x=alt.X('year:O'),
    y=alt.Y('average(popularity):Q'),
    color=alt.Color('genres:N'),
    tooltip=['year', 'average(popularity)', 'genres'],
)

area_chart

In [9]:
# The ten genres with the most rows in the exploded frame.
top_genres = list(df['genres'].value_counts().index[:10])

# Keep only the rows that belong to one of those genres.
top_genre_filter = df['genres'].isin(top_genres)
df_top_genres = df.loc[top_genre_filter]

# Mean popularity per first-air year, one line per top genre.
multiline_chart = alt.Chart(
    df_top_genres,
    title='Popularity Trends for Top Genres Over Time',
).mark_line().encode(
    x=alt.X('year:O'),
    y=alt.Y('average(popularity):Q'),
    color=alt.Color('genres:N'),
    tooltip=['year', 'average(popularity)', 'genres'],
)

multiline_chart

# First half

In [10]:
# Pick the 30 shows with the highest vote_count.
# `df` was exploded to one row per (show, genre) pair in an earlier cell, so
# collapse it back to one row per show id first; otherwise a multi-genre show
# takes up several of the 30 slots and its duplicate rows pile onto one bar.
unique_shows = df.drop_duplicates(subset='id')
top_rated_shows = unique_shows.nlargest(30, 'vote_count')

# Horizontal bar chart of those shows, longest bar at the top.
ratings_chart = alt.Chart(top_rated_shows).mark_bar().encode(
    x='vote_count:Q',
    y=alt.Y('name:N', sort='-x'),  # order the shows by descending vote_count
    color='vote_count:Q',
    tooltip=['name', 'vote_count']
).properties(
    title='Ratings Overview of Top Awarded Shows'
)
ratings_chart

In [11]:
# Scatter of popularity (x) against mean rating (y); the colour scale repeats
# the rating so high- and low-rated shows stand out.
relationship_chart = alt.Chart(
    df,
    title='Relationship Between Vote Average and Popularity',
).mark_circle().encode(
    x=alt.X('popularity:Q'),
    y=alt.Y('vote_average:Q'),
    color=alt.Color('vote_average:Q'),
    tooltip=['name', 'popularity', 'vote_average'],
)
relationship_chart

In [12]:
#First aired - popularity

In [13]:
# Place the two charts side by side (`|` is Altair's horizontal-concat
# operator) and give each panel its own x and y scales.
first_half_chart = (ratings_chart | relationship_chart).resolve_scale(
    x='independent', y='independent'
)
first_half_chart

# Second half

In [14]:
# Make sure there is one row per (show, genre) pair.
# `.str.split` is used instead of `apply(lambda x: x.split(', '))` because it
# leaves a missing genre as NaN rather than raising AttributeError on the
# float NaN, which matches how the earlier genre-splitting cell behaves.
df['genres'] = df['genres'].str.split(', ')
df = df.explode('genres')

# Year in which each show first aired.
df['year'] = pd.to_datetime(df['first_air_date']).dt.year

# Number of rows (shows) per genre and first-air year.
genre_year_count = df.groupby(['genres', 'year']).size().reset_index(name='aired')

# Restrict the chart to the ten genres with the most shows overall.
top_genres = genre_year_count.groupby('genres')['aired'].sum().nlargest(10).index
genre_year_count = genre_year_count[genre_year_count['genres'].isin(top_genres)]

# One line per genre showing how many of its shows premiered each year.
line_chart = alt.Chart(genre_year_count).mark_line().encode(
    x=alt.X('year:O', axis=alt.Axis(format='d', title='Year')),
    y=alt.Y('aired:Q', axis=alt.Axis(title='Number of Airings')),
    color='genres:N',
    tooltip=['genres', 'year', 'aired']
).properties(
    title='Top 10 Aired Genres Over Time'
)

line_chart

In [15]:
# Histogram-style bar chart: how many shows have each episode run time.
# `df` holds one row per (show, genre) pair after the earlier explode, so it
# is reduced to one row per show id here; counting the exploded rows would
# count every multi-genre show once per genre.
runtime_shows = df.drop_duplicates(subset='id')

runtime_chart = alt.Chart(runtime_shows).mark_bar().encode(
    x='episode_run_time:N',
    y='count():Q',
    tooltip=['episode_run_time', 'count()']
).properties(
    title='Distribution of Show Runtimes'
)
runtime_chart

In [16]:
# Genre trend lines next to the run-time distribution, each panel keeping its
# own x and y scales (`|` is Altair's horizontal-concat operator).
first_half = (line_chart | runtime_chart).resolve_scale(
    x='independent', y='independent'
)

first_half

In [17]:
# Count shows per origin country and keep the six most common.
# * `df` holds one row per (show, genre) pair, so de-duplicate on the show id
#   first; otherwise each show is counted once per genre.
# * `rename_axis` pins the label column to 'origin_country'.  Relying on the
#   default name is fragile: pandas < 2.0 calls it 'index', pandas >= 2.0
#   calls it after the source column, so a hard-coded 'index' field breaks.
country_counts = (
    df.drop_duplicates(subset='id')['origin_country']
    .value_counts()
    .rename_axis('origin_country')
    .reset_index(name='count')
    .head(6)
)

# Horizontal bar chart for the top 6 countries, longest bar first.
top_countries_chart = alt.Chart(country_counts).mark_bar().encode(
    x='count:Q',
    y=alt.Y('origin_country:N', sort='-x'),  # sort bars by descending count
    tooltip=['origin_country', 'count']
).properties(
    title='Top 6 Countries by Number of Shows'
)
top_countries_chart

In [18]:
# Count shows per value of the 'languages' column and keep the six most common.
# * `df` holds one row per (show, genre) pair, so de-duplicate on the show id
#   first; otherwise each show is counted once per genre.
# * `rename_axis` pins the label column to 'languages'.  Relying on the
#   default name is fragile: pandas < 2.0 calls it 'index', pandas >= 2.0
#   calls it after the source column, so a hard-coded 'index' field breaks.
language_counts = (
    df.drop_duplicates(subset='id')['languages']
    .value_counts()
    .rename_axis('languages')
    .reset_index(name='count')
    .head(6)
)

# Horizontal bar chart for the top 6 languages, longest bar first.
top_languages_chart = alt.Chart(language_counts).mark_bar().encode(
    x='count:Q',
    y=alt.Y('languages:N', sort='-x'),  # sort bars by descending count
    tooltip=['languages', 'count']
).properties(
    title='Top 6 Languages by Number of Shows'
)
top_languages_chart

In [19]:
# Count shows per 'production_companies' value and keep the six most common.
# * `df` holds one row per (show, genre) pair, so de-duplicate on the show id
#   first; otherwise each show is counted once per genre.
# * `rename_axis` pins the label column to 'production_companies'.  Relying
#   on the default name is fragile: pandas < 2.0 calls it 'index', pandas
#   >= 2.0 calls it after the source column, so a hard-coded 'index' breaks.
producer_counts = (
    df.drop_duplicates(subset='id')['production_companies']
    .value_counts()
    .rename_axis('production_companies')
    .reset_index(name='count')
    .head(6)
)

# Horizontal bar chart for the top 6 production companies, longest bar first.
top_producers_chart = alt.Chart(producer_counts).mark_bar().encode(
    x='count:Q',
    y=alt.Y('production_companies:N', sort='-x'),  # sort by descending count
    tooltip=['production_companies', 'count']
).properties(
    title='Top 6 Production Companies by Number of Shows'
)
top_producers_chart

In [20]:
# Count shows per 'created_by' value and keep the six most common.
# * `df` holds one row per (show, genre) pair, so de-duplicate on the show id
#   first; otherwise each show is counted once per genre.
# * `rename_axis` pins the label column to 'created_by'.  Relying on the
#   default name is fragile: pandas < 2.0 calls it 'index', pandas >= 2.0
#   calls it after the source column, so a hard-coded 'index' field breaks.
director_counts = (
    df.drop_duplicates(subset='id')['created_by']
    .value_counts()
    .rename_axis('created_by')
    .reset_index(name='count')
    .head(6)
)

# Bubble plot for the top 6 creators: height and bubble size both encode the
# number of shows.
top_directors_chart = alt.Chart(director_counts).mark_circle().encode(
    x='created_by:N',
    y='count:Q',
    size='count:Q',
    tooltip=['created_by', 'count']
).properties(
    title='Top 6 Directors by Number of Shows'
)
top_directors_chart

In [21]:
# 2x2 dashboard: countries and languages on top, producers and creators below.
# `|` concatenates horizontally and `&` vertically.
top_row = top_countries_chart | top_languages_chart
bottom_row = top_producers_chart | top_directors_chart

# Each panel keeps its own x and y scales.
final_chart = (top_row & bottom_row).resolve_scale(x='independent', y='independent')
final_chart

In [22]:
# Three-row dashboard, two charts per row (`|` is horizontal concatenation).
first_row = line_chart | runtime_chart
second_row = top_countries_chart | top_languages_chart
third_row = top_producers_chart | top_directors_chart

# Stack the rows vertically; every panel keeps independent x and y scales.
dashboard_rows = [first_row, second_row, third_row]
final_chart = alt.vconcat(*dashboard_rows).resolve_scale(
    x='independent', y='independent'
)
final_chart

# Other options 

Genre Popularity Over Time

In [23]:
# Parse the premiere date and refresh the derived year column.
df['first_air_date'] = pd.to_datetime(df['first_air_date'])
df['year'] = df['first_air_date'].dt.year

# Mean popularity for every genre/year combination.
average_popularity_over_time = df.groupby(
    ['genres', 'year'], as_index=False
)['popularity'].mean()

# One line per genre, years along the x axis.
genre_popularity_chart = alt.Chart(
    average_popularity_over_time,
    title='Genre Popularity Over Time',
).mark_line().encode(
    x=alt.X('year:O'),
    y=alt.Y('popularity:Q'),
    color=alt.Color('genres:N'),
    tooltip=['genres', 'year', 'popularity'],
)
genre_popularity_chart

Network Influence

In [24]:
# Total popularity of the shows carried by each network.
# * `df` holds one row per (show, genre) pair, so reduce it to one row per
#   show id first; otherwise a show's popularity is added once per genre.
# * 'networks' is a comma-separated string (e.g. "Netflix, Antena 3") that is
#   never exploded elsewhere in this notebook, so split and explode it here to
#   credit each network separately instead of treating the combined string as
#   a network of its own.  `.str.split` leaves missing values as NaN, and
#   groupby then drops them.
network_shows = df.drop_duplicates(subset='id')
network_rows = network_shows.assign(
    networks=network_shows['networks'].str.split(', ')
).explode('networks')
network_influence = network_rows.groupby('networks')['popularity'].sum().reset_index()

# One coloured bar per network.
network_influence_chart = alt.Chart(network_influence).mark_bar().encode(
    x='networks:N',
    y='popularity:Q',
    color='networks:N',
    tooltip=['networks', 'popularity']
).properties(
    title='Network Influence by Show Popularity'
)
network_influence_chart

Show Lifespan

In [25]:
# Parse both air-date columns; values that cannot be parsed become NaT.
for date_col in ('first_air_date', 'last_air_date'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# Lifespan = whole days between the first and the last air date.
df['lifespan'] = df['last_air_date'].sub(df['first_air_date']).dt.days

# Box plot of the lifespan distribution, one box per genre.
lifespan_chart = alt.Chart(df, title='Show Lifespan by Genre').mark_boxplot().encode(
    x=alt.X('genres:N'),
    y=alt.Y('lifespan:Q'),
    color=alt.Color('genres:N'),
    tooltip=['name', 'lifespan'],
)
lifespan_chart

 Awarded Shows by Country and Language

In [26]:
# Mean rating for every (origin country, original language) combination.
# `df` holds one row per (show, genre) pair, so reduce it to one row per show
# id first; averaging the exploded rows would weight each show by the number
# of genres it has.
awarded_shows = (
    df.drop_duplicates(subset='id')
    .groupby(['origin_country', 'original_language'])['vote_average']
    .mean()
    .reset_index()
)

# Heat map: one cell per country/language pair, coloured by the mean rating.
awarded_shows_chart = alt.Chart(awarded_shows).mark_rect().encode(
    x='origin_country:N',
    y='original_language:N',
    color='vote_average:Q',
    tooltip=['origin_country', 'original_language', 'vote_average']
).properties(
    title='Awarded Shows by Country and Language'
)
awarded_shows_chart