In [1]:
# Importing libraries
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# Use the "plotly_white" template as the default for every figure created below
pio.templates.default = "plotly_white"

In [2]:
# Reading the dataset from the CSV file (path is relative to the working directory)
netflix_data = pd.read_csv(filepath_or_buffer='netflix_content.csv')

In [3]:
# Preview the first five rows of the dataset
netflix_data.head(n=5)

Unnamed: 0,Title,Available Globally?,Release Date,Hours Viewed,Language Indicator,Content Type
0,The Night Agent: Season 1,Yes,2023-03-23,812100000,English,Show
1,Ginny & Georgia: Season 2,Yes,2023-01-05,665100000,English,Show
2,The Glory: Season 1 // 더 글로리: 시즌 1,Yes,2022-12-30,622800000,Korean,Show
3,Wednesday: Season 1,Yes,2022-11-23,507700000,English,Show
4,Queen Charlotte: A Bridgerton Story,Yes,2023-05-04,503000000,English,Movie


In [4]:
# Checking the datatypes of the columns as they were read from the CSV
netflix_data.dtypes

Title                  object
Available Globally?    object
Release Date           object
Hours Viewed           object
Language Indicator     object
Content Type           object
dtype: object

In [5]:
# Cleaning the 'Hours Viewed' column: remove the comma separators, then cast to float
netflix_data['Hours Viewed'] = (
    netflix_data['Hours Viewed']
    .replace(to_replace=',', value='', regex=True)
    .astype(float)
)

In [19]:
# Changing the datatype of the 'Release Date' column to datetime
netflix_data['Release Date'] = netflix_data['Release Date'].pipe(pd.to_datetime)

In [20]:
# Re-checking the datatypes after the 'Hours Viewed' and 'Release Date' conversions
netflix_data.dtypes

Title                          object
Available Globally?            object
Release Date           datetime64[ns]
Hours Viewed                  float64
Language Indicator             object
Content Type                   object
dtype: object

In [6]:
# Aggregating viewership by content type (total hours viewed per type)
viewership_by_content_type = (
    netflix_data
    .groupby('Content Type')['Hours Viewed']
    .sum()
)
viewership_by_content_type

Content Type
Movie    5.063780e+10
Show     1.077641e+11
Name: Hours Viewed, dtype: float64

In [8]:
# Bar chart of total hours viewed for each content type
fig = go.Figure()

fig.add_trace(
    go.Bar(
        x=viewership_by_content_type.index,
        y=viewership_by_content_type.values,
        # One colour per bar, in the order the content types appear in the index
        marker=dict(color=['skyblue', 'salmon']),
    )
)

fig.update_layout(
    title='Total Viewership Hours by Content type',
    xaxis=dict(title='Content type', tickangle=0),
    yaxis=dict(title='Total Hours Viewed (in billions)'),
    height=500,
    width=800,
)

fig.show()

In [None]:
# Aggregating viewership by language, largest total first
viewership_by_language = (
    netflix_data
    .groupby('Language Indicator')['Hours Viewed']
    .sum()
    .sort_values(ascending=False)
)
viewership_by_language

Language Indicator
English        1.244417e+11
Korean         1.537840e+10
Non-English    1.043910e+10
Japanese       7.102000e+09
Hindi          9.261000e+08
Russian        1.146000e+08
Name: Hours Viewed, dtype: float64

In [16]:
# Bar chart of total hours viewed per language (the series is already sorted
# from the largest total to the smallest)
fig = go.Figure(data=[
        go.Bar(
            x=viewership_by_language.index,
            y=viewership_by_language.values,
            marker_color = 'lightcoral'
        )
    ])

fig.update_layout(
    title = 'Total Viewership Hours by language',
    xaxis_title = 'Language',
    yaxis_title = 'Total Hours Viewed (in billions)',
    xaxis_tickangle = 45,
    height = 600,
    width = 1000
)

# Render explicitly, like the other plotting cells, instead of relying on the
# notebook echoing the value returned by update_layout()
fig.show()

In [26]:
# Adding the month column: the calendar month number taken from each 'Release Date'
netflix_data['Release Month'] = netflix_data['Release Date'].dt.month

In [31]:
# Aggregating viewership by release month (total hours viewed per month number).
# The variable name keeps its original spelling because the plotting cell reads it.
monthly_viwership = (
    netflix_data
    .groupby('Release Month')['Hours Viewed']
    .sum()
)

In [35]:
# Line chart of total hours viewed by release month
fig = go.Figure(data=[
    go.Scatter(
        x = monthly_viwership.index,
        y = monthly_viwership.values,
        mode= 'lines+markers',
        marker= dict(color='blue'),
        line = dict(color='blue')
    )
])

fig.update_layout(
    title = 'Total Viewership Hours by Release Month',
    xaxis_title = 'Month',
    yaxis_title = 'Total Hours Viewed (in billions)',
    # Show month abbreviations instead of the numeric month values 1..12
    xaxis = dict(
        tickmode = 'array',
        tickvals = list(range(1,13)),
        ticktext = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    ),
    height = 600,
    width = 1000
)

# Render explicitly, like the other plotting cells, instead of relying on the
# notebook echoing the value returned by update_layout()
fig.show()

In [53]:
# Five titles with the most hours viewed; 'Hours Viewed' is rescaled to millions
top_5_title = (
    netflix_data
    .nlargest(n=5, columns='Hours Viewed')
    .assign(**{'Hours Viewed': lambda frame: frame['Hours Viewed'] / 1000000})
)
top_5_title[['Title', 'Hours Viewed', 'Language Indicator', 'Content Type', 'Release Date']]

Unnamed: 0,Title,Hours Viewed,Language Indicator,Content Type,Release Date
0,The Night Agent: Season 1,812.1,English,Show,2023-03-23
1,Ginny & Georgia: Season 2,665.1,English,Show,2023-01-05
18227,King the Land: Limited Series // 킹더랜드: 리미티드 시리즈,630.2,Korean,Movie,2023-06-17
2,The Glory: Season 1 // 더 글로리: 시즌 1,622.8,Korean,Show,2022-12-30
18214,ONE PIECE: Season 1,541.9,English,Show,2023-08-31


In [58]:
# Aggregating viewership hours by content type and release month:
# one row per release month, one column per content type, cells hold summed hours
monthly_viewership_by_content_type = netflix_data.pivot_table(
    values='Hours Viewed',
    index='Release Month',
    columns='Content Type',
    aggfunc='sum',
)


In [64]:
# One line per content type, showing total hours viewed across release months
fig = go.Figure()

for content_type, hours_by_month in monthly_viewership_by_content_type.items():
    fig.add_trace(
        go.Scatter(
            x=hours_by_month.index,
            y=hours_by_month,
            mode='lines+markers',
            name=content_type,
        )
    )

fig.update_layout(
    title='ViewerShip trends by Content type and Release Month',
    xaxis_title='Month',
    yaxis_title='Total Hours Viewed (in billions)',
    # Show month abbreviations instead of the numeric month values 1..12
    xaxis=dict(
        tickmode='array',
        tickvals=list(range(1, 13)),
        ticktext=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    ),
    height=600,
    width=1000,
    legend_title='Content type',
)
fig.show()

In [93]:
# Defining seasons based on release months

def get_season(month):
    """Map a calendar month number to its (northern-hemisphere) season name.

    Parameters
    ----------
    month : int or float
        Month number, 1 (January) through 12 (December). Float values such as
        ``3.0`` are accepted because a month column containing missing values
        is stored as floats.

    Returns
    -------
    str or None
        'Winter' for 12, 1, 2; 'Spring' for 3-5; 'Summer' for 6-8; 'Fall' for
        9-11. ``None`` for anything else (e.g. NaN from a missing release
        date), so unknown months are not silently counted as 'Fall'.
    """
    if month in (12, 1, 2):
        return 'Winter'
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    if month in (9, 10, 11):
        return 'Fall'
    # NaN or an out-of-range value: no season can be determined
    return None

In [95]:
# Adding a 'Release Season' column (season derived from each release month)
# to analyse trends based on seasons
netflix_data['Release Season'] = netflix_data['Release Month'].map(get_season)

In [100]:
# Aggregating viewership hours by release season
Viewership_by_seasons = netflix_data.groupby('Release Season')['Hours Viewed'].sum()

# Ordering the seasons chronologically instead of in the groupby's key order
seasons_order = ['Winter', 'Spring', 'Summer', 'Fall']
seasonal_viewership = Viewership_by_seasons.reindex(seasons_order)

# Plot the reindexed series so the data itself is in the intended order
# (previously it was computed but never used)
fig = go.Figure(data=[
    go.Bar(
        x=seasonal_viewership.index,
        y=seasonal_viewership.values,
        marker_color='orange'
    )
])

fig.update_layout(
    title='Total Viewership by Release Season (2023)',
    yaxis_title='Total Hours Viewed (in billions)',
    height=500,
    width=800,
    # All x-axis settings live in one dict rather than being split between
    # magic-underscore keywords and a separate xaxis dict
    xaxis=dict(
        title='Season',
        tickangle=0,
        categoryorder='array',
        categoryarray=seasons_order
    )
)

fig.show()

In [104]:
# Number of titles released in each month, ordered by month number
monthly_releases = (
    netflix_data['Release Month']
    .value_counts()
    .sort_index()
)

# Total hours viewed for the titles released in each month
monthly_viewership = (
    netflix_data
    .groupby('Release Month')['Hours Viewed']
    .sum()
)

In [112]:
# Dual-axis chart: release counts as bars on the left axis,
# viewership hours as a line on the right axis
fig = go.Figure(data=[
    go.Bar(
        x=monthly_releases.index,
        y=monthly_releases.values,
        name='Number of Releases',
        marker=dict(color='goldenrod'),
        opacity=0.7,
        yaxis='y',
    ),
    go.Scatter(
        x=monthly_viewership.index,
        y=monthly_viewership.values,
        name='Viewership Hours',
        mode='lines+markers',
        marker_color='red',
        line_color='red',
        yaxis='y2',
    ),
])

fig.update_layout(
    title='Monthly Release Patterns and Viewership Hours (2023)',
    # Show month abbreviations instead of the numeric month values 1..12
    xaxis=dict(
        title='Month',
        tickmode='array',
        tickvals=list(range(1, 13)),
        ticktext=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    ),
    # Left axis: release counts
    yaxis_title='Number of Releases',
    yaxis_showgrid=False,
    yaxis_side='left',
    # Right axis: viewership hours, drawn over the same plotting area
    yaxis2_title='Total Hours Viewed (in billions)',
    yaxis2_overlaying='y',
    yaxis2_side='right',
    yaxis2_showgrid=False,
    height=600,
    width=1000,
)

fig.show()

In [116]:
# Adding a day column: the weekday name of each release date
netflix_data['Release Day'] = netflix_data['Release Date'].dt.day_name()

In [128]:
# Number of releases per weekday, arranged Monday through Sunday
weekday_releases = (
    netflix_data['Release Day']
    .value_counts()
    .reindex(index=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
)

In [129]:
# Total hours viewed per release weekday, arranged Monday through Sunday
viewership_by_weekday = (
    netflix_data
    .groupby('Release Day')['Hours Viewed']
    .sum()
    .reindex(index=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
)

In [135]:
# Dual-axis chart: release counts per weekday as bars on the left axis,
# viewership hours as a line on the right axis
fig = go.Figure()

fig.add_trace(
    go.Bar(
        x=weekday_releases.index,
        y=weekday_releases.values,
        name='Number of Releases',
        marker_color='blue',
        opacity=0.6,
        # 'y' is the documented id of the primary y axis (layout.yaxis);
        # this matches the monthly dual-axis chart
        yaxis='y'
    )
)

fig.add_trace(
    go.Scatter(
        x=viewership_by_weekday.index,
        y=viewership_by_weekday.values,
        name= 'Viewership Hours',
        mode='lines+markers',
        marker=dict(color='red'),
        line=dict(color='red'),
        # Secondary axis, configured as layout.yaxis2 below
        yaxis='y2'
    )
)

fig.update_layout(
    title='Weekly Release Patterns and Viewership Hours (2023)',
    xaxis=dict(
        title = 'Day of the Week',
        # Force Monday-to-Sunday order on the categorical axis
        categoryorder='array',
        categoryarray=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    ),
    yaxis=dict(
        title='Number of Releases',
        showgrid=False,
        side='left'
    ),
    yaxis2=dict(
        title='Total Hours Viewed (in billions)',
        overlaying ='y',
        side='right',
        showgrid=False
    ),
    # Place the legend just outside the right edge of the plotting area
    legend=dict(
        x=1.05,
        y=1,
        orientation='v',
        xanchor='left'
    ),
    height=600,
    width=1000
)

fig.show()

In [137]:
# Significant holidays and events in 2023, converted directly to datetime values
important_dates = pd.to_datetime([
    '2023-01-01',  # New Year's Day
    '2023-02-14',  # Valentine's Day
    '2023-07-04',  # Independence Day (US)
    '2023-10-31',  # Halloween
    '2023-12-25',  # Christmas Day
])

In [139]:
# Keep the titles released within three days (before or after) of any important date.
# Vectorised replacement for the row-wise apply: build one boolean column per
# important date (day offset between -3 and 3 inclusive), then keep a row if any
# column is True. Rows with a missing release date produce a missing offset and
# are therefore excluded.
holiday_releases = netflix_data[
    pd.concat(
        [
            (netflix_data['Release Date'] - holiday).dt.days.between(-3, 3)
            for holiday in important_dates
        ],
        axis=1,
    ).any(axis=1)
]

In [140]:
# Aggregating viewership hours for releases near significant holidays,
# summed per release date
(
    holiday_releases
    .groupby('Release Date')['Hours Viewed']
    .sum()
)

Release Date
2022-12-29    6.070000e+07
2022-12-30    1.352300e+09
2022-12-31    1.062000e+08
2023-01-01    2.816000e+08
2023-01-04    9.000000e+07
2023-02-13    3.870000e+07
2023-02-14    3.912000e+08
2023-02-15    3.494000e+08
2023-02-16    3.660000e+07
2023-02-17    1.458000e+08
2023-07-04    1.770000e+07
2023-07-05    8.590000e+07
2023-07-06    3.234000e+08
2023-07-07    1.757000e+08
2023-10-28    1.079000e+08
2023-10-31    6.300000e+06
2023-11-01    4.830000e+07
2023-11-02    2.390000e+08
2023-11-03    4.049000e+08
2023-12-22    7.260000e+07
2023-12-24    5.200000e+07
2023-12-25    8.400000e+06
2023-12-28    5.800000e+06
Name: Hours Viewed, dtype: float64

In [141]:
# Show the title, release date and hours viewed of the near-holiday releases
holiday_releases.loc[:, ['Title', 'Release Date', 'Hours Viewed']]

Unnamed: 0,Title,Release Date,Hours Viewed
2,The Glory: Season 1 // 더 글로리: 시즌 1,2022-12-30,622800000.0
6,La Reina del Sur: Season 3,2022-12-30,429600000.0
11,Kaleidoscope: Limited Series,2023-01-01,252500000.0
29,Perfect Match: Season 1,2023-02-14,176800000.0
124,Lady Voyeur: Limited Series // Olhar Indiscret...,2022-12-31,86000000.0
...,...,...,...
22324,The Romantics: Limited Series,2023-02-14,1000000.0
22327,Aggretsuko: Season 5 // アグレッシブ烈子: シーズン5,2023-02-16,900000.0
22966,The Lying Life of Adults: Limited Series // La...,2023-01-04,900000.0
22985,Community Squad: Season 1 // División Palermo:...,2023-02-17,800000.0
