In [1]:

import pandas as pd
import altair as alt
from IPython.display import display

# --- Load and prepare the TV show dataset -----------------------------------
file_path = 'data/cleaned_TMDB_tv_dataset.csv'
SAMPLE_SIZE = 100   # number of rows kept for plotting
RANDOM_SEED = 40    # fixed seed so the sample is reproducible

# Drop incomplete rows as part of the load instead of mutating an aliased
# frame in place; re-running this cell always starts from the file contents.
cvd_data = pd.read_csv(file_path).dropna()

# Cap the sample at the number of available rows: DataFrame.sample raises a
# ValueError when n exceeds the frame length (sampling without replacement).
tv_shows_data = cvd_data.sample(
    n=min(SAMPLE_SIZE, len(cvd_data)),
    random_state=RANDOM_SEED,
)

# alt.data_transformers.enable('json')

# Parse the air date once, then derive the calendar fields used by the charts.
last_air_date = pd.to_datetime(tv_shows_data['last_air_date'], format='%Y-%m-%d')
tv_shows_data = tv_shows_data.assign(
    last_air_date=last_air_date,
    # Year-month label as a plain string (e.g. '2021-03').
    month_year=last_air_date.dt.to_period('M').astype(str),
    # Full month name (e.g. 'March').
    month=last_air_date.dt.month_name(),
)


# Task 1: Explore trends in TV show popularity based on vote count and average.
# One point per show: x = vote_count, y = vote_average, color = popularity.
popularity_chart = alt.Chart(tv_shows_data).mark_point().encode(
    x='vote_count',
    y='vote_average',
    # popularity is a continuous measure, so it gets a sequential color scheme;
    # 'category20' is a categorical palette intended for discrete classes.
    color=alt.Color('popularity:Q', scale=alt.Scale(scheme='viridis')),
    tooltip=['name', 'vote_count', 'vote_average', 'popularity']
).interactive().properties(
    title='Interactive Exploration: Vote Count vs Vote Average'
)


In [9]:

# Task 2: Analyze TV show genres to identify the most popular genres or combinations of genres.
# Split the ', '-separated 'genres' string into a list and explode it so that
# every (show, genre) pair occupies its own row.
genre_lists = tv_shows_data['genres'].str.split(', ')
tv_shows_genres = tv_shows_data.assign(genres=genre_lists).explode('genres')
print(tv_shows_genres)

# Tally the rows per genre into a two-column frame ('Genre', 'Count').
genre_counts = (
    tv_shows_genres['genres']
    .value_counts()
    .rename_axis('Genre')
    .reset_index(name='Count')
)

# Bar chart: one bar per genre, height = number of rows, colored by genre.
genre_axis = alt.X('Genre:N', title='Genre')
show_count_axis = alt.Y('Count:Q', title='Number of TV Shows')
genre_color = alt.Color('Genre:N', title='Genre')
genre_chart = (
    alt.Chart(genre_counts)
    .mark_bar()
    .encode(x=genre_axis, y=show_count_axis, color=genre_color)
    .properties(title='Distribution of TV Show Genres')
)


       Unnamed: 0      id                                             name  \
14769      113283   94433                               Riising Late Night   
4732         9626   61589                            El Club de la Comedia   
1585         2472   82782                          The Righteous Gemstones   
13560       80014   99572                                  Mon ami Gaylord   
7404        19206    4390  Roswell Conspiracies: Aliens, Myths and Legends   
...           ...     ...                                              ...   
5496        12093   70386                Origins: The Journey of Humankind   
9040        27761  228429              Soul Land 2: The Peerless Tang Clan   
9040        27761  228429              Soul Land 2: The Peerless Tang Clan   
9040        27761  228429              Soul Land 2: The Peerless Tang Clan   
9040        27761  228429              Soul Land 2: The Peerless Tang Clan   

       number_of_seasons  number_of_episodes original_language 

In [3]:

# Task 3: Investigate the relationship between TV show ratings and the number of seasons and episodes.
# Scatter of seasons against episodes; circle color carries vote_average.
ratings_tooltip = ['name', 'number_of_seasons', 'number_of_episodes', 'vote_average']
ratings_chart = (
    alt.Chart(tv_shows_data)
    .mark_circle()
    .encode(
        x='number_of_seasons',
        y='number_of_episodes',
        color='vote_average',
        tooltip=ratings_tooltip,
    )
    .properties(title='Relationship between Ratings and Number of Seasons/Episodes')
)


In [4]:

# Task 4: Explore the distribution of TV show run times and investigate whether episode duration affects overall ratings.
# Histogram of episode run time. The x field is binned so that each bar counts
# the shows falling in a run-time interval; without binning, mark_bar draws a
# separate thin bar for every distinct run-time value.
runtime_bins = alt.Bin(maxbins=30)
runtime_chart = alt.Chart(tv_shows_data).mark_bar().encode(
    x=alt.X('episode_run_time:Q', bin=runtime_bins, title='Episode Run Time (minutes)'),
    y=alt.Y('count():Q', title='Number of TV Shows'),
    tooltip=[
        # Bin the tooltip field the same way so it reports the bar's interval.
        alt.Tooltip('episode_run_time:Q', bin=runtime_bins, title='Episode Run Time (minutes)'),
        alt.Tooltip('count():Q', title='Number of TV Shows'),
    ]
).properties(
    title='Distribution of TV Show Run Times'
)


In [5]:

# Task 5: Investigate TV show production trends across different countries and networks.
# Count rows per (origin_country, networks) pair inside the chart spec, then
# draw one bar segment per pair: x = country, height = count, color = network.
shows_per_country_network = alt.Chart(tv_shows_data).transform_aggregate(
    count='count()',
    groupby=['origin_country', 'networks'],
)
production_trends_chart = (
    shows_per_country_network
    .mark_bar()
    .encode(
        x=alt.X('origin_country:N', title='Country'),
        y=alt.Y('count:Q', title='Number of TV Shows'),
        color=alt.Color('networks:N', title='Network'),
        tooltip=['origin_country:N', 'networks:N', 'count:Q'],
    )
    .properties(title='TV Show Production Trends Across Countries and Networks')
)


In [6]:

# Task 6: Analyze the relationship between TV show language and popularity, and investigate the popularity of non-English shows.
# One point per show: x = original_language, y = popularity; points are also
# colored by language.
language_popularity_chart = (
    alt.Chart(tv_shows_data)
    .mark_point()
    .encode(
        x='original_language',
        y='popularity',
        color='original_language',
        tooltip=['original_language', 'popularity'],
    )
    .properties(title='Relationship between TV Show Language and Popularity')
)


In [7]:

# Task 7: Track the status of TV shows (in production or not) and analyze their popularity over time.
# Line chart over the month_year labels (ordinal axis), one line per
# in_production value, with popularity on the y axis.
status_tooltip = ['name', 'month_year', 'popularity', 'in_production']
status_popularity_chart = (
    alt.Chart(tv_shows_data)
    .mark_line()
    .encode(
        x='month_year:O',
        y='popularity:Q',
        color='in_production:N',
        tooltip=status_tooltip,
    )
    .properties(title='Popularity of TV Shows Over Time by Production Status')
)

#highlight_circle = base_chart.mark_circle(size=100, opacity=0).encode(opacity=alt.condition(alt.selection_point(), alt.value(1), alt.value(0)))

#status_popularity_chart = alt.layer(base_chart, highlight_circle)

In [8]:
# Combine all charts for visualization
# Layout: three rows of two charts each, followed by the status chart on its
# own row.
chart_rows = [
    popularity_chart | genre_chart,
    runtime_chart | ratings_chart,
    language_popularity_chart | production_trends_chart,
]

# Concatenated views share non-positional scales (including color) by default,
# so charts colored by different fields can end up with one merged legend.
# Resolve color independently within each row and across rows so every chart
# keeps its own color scale and legend.
chart_rows = [row.resolve_scale(color='independent') for row in chart_rows]
combined_chart = alt.vconcat(*chart_rows, status_popularity_chart).resolve_scale(
    color='independent'
)

# Legend styling applied to the whole dashboard.
combined_chart = combined_chart.configure_legend(
    titleFontSize=14,
    labelFontSize=12,
    symbolSize=100
)
display(combined_chart)