Importing packages

In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy

import bokeh.plotting as bpl
import bokeh.models as bmo
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, GroupFilter, CDSView, Legend
from bokeh.transform import linear_cmap, factor_cmap
from bokeh.palettes import Viridis256  
from bokeh.io import output_notebook
from bokeh.layouts import column

from datetime import datetime

# Configure Bokeh so that show() renders figures inline in the notebook.
output_notebook()

In [10]:
!pip install scipy



In [11]:
!pip install matplotlib



In [12]:
!pip install bokeh



In [13]:
!pip install seaborn



In [14]:
!pip install numpy



Importing the data

In [15]:
!pip install pandas



In [16]:
pip install kaggle

Collecting kaggle
  Downloading kaggle-1.6.6.tar.gz (84 kB)
     ---------------------------------------- 0.0/84.6 kB ? eta -:--:--
     -------------- ------------------------- 30.7/84.6 kB ? eta -:--:--
     -------------- ------------------------- 30.7/84.6 kB ? eta -:--:--
     ------------------------------------ - 81.9/84.6 kB 416.7 kB/s eta 0:00:01
     -------------------------------------- 84.6/84.6 kB 397.4 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting certifi (from kaggle)
  Using cached certifi-2024.2.2-py3-none-any.whl.metadata (2.2 kB)
Collecting requests (from kaggle)
  Using cached requests-2.31.0-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm (from kaggle)
  Downloading tqdm-4.66.2-py3-none-any.whl.metadata (57 kB)
     ---------------------------------------- 0.0/57.6 kB ? eta -:--:--
     ---------------------------------------- 57.6/57.6 kB 3.2 MB/s eta 0:00:00
Collecting python-

In [17]:
# Relative path: resolved against the kernel's current working directory.
csv_path = 'netflix_titles.csv'
df = pd.read_csv(csv_path)

Cleaning the data

In [18]:
# Work on a copy so the raw frame `df` stays untouched, then replace every
# missing value with the placeholder string 'unknown'.
df1 = df.copy()
df_clean = df1.fillna('unknown')


def remove_s(x):
    """Return the integer formed by all characters of ``x`` after the first.

    Not called anywhere in the visible part of this notebook.
    """
    return int(x[1:])


# Release year converted to int
df_clean['release_year'] = df_clean['release_year'].astype('int')

# Parse the dates from the raw frame, not from df_clean: the filled copy holds
# the string 'unknown' where a date was missing, which is not a parseable
# date. Missing dates therefore become NaT in df_clean['date_added'].
df_clean['date_added'] = pd.to_datetime(df['date_added'], format='mixed')

# Drop the free-text description column
df_clean = df_clean.drop(columns=['description'])

EDA / Data Visualisations

Creating a scatterplot of release year x count

In [19]:
# Number of titles per (type, release year), latest release years first.
df_new = (
    df_clean
    .groupby(['type'])['release_year']
    .value_counts()
    .reset_index()
    .sort_values(by='release_year', ascending=False)
)

In [20]:
# Scatter plot: one marker per (type, release year) row of df_new, coloured by type.
plot = figure(
    width=900,
    height=550,
    tools=['pan', 'reset', 'box_zoom', 'wheel_zoom', 'click'],
    title='Release years of Movies and TV Shows added to Netflix',
    x_axis_label='Year of release',
    y_axis_label='Count added',
    title_location='above',
)

plot.title.text_color = "black"
plot.title.text_font_size = '13pt'

plot.background_fill_color = "#2d2e2e"
plot.grid.grid_line_alpha = 0.1  # keep grid lines faint against the dark background

# Map each value of the 'type' column to its fill colour.
color_mapper = factor_cmap('type', palette=['#758ea5', '#c35c55'], factors=['TV Show', 'Movie'])

# scatter() draws circle markers by default; circle() with a `size` argument
# is deprecated in recent Bokeh releases.
plot.scatter(
    'release_year',
    'count',
    source=df_new,
    size=12,
    alpha=1,
    fill_color=color_mapper,
    line_color='white',
    legend_field='type',
)

plot.legend.title = 'Type'
plot.legend.label_text_font_size = '12px'
plot.legend.location = 'top_left'

show(plot)

Defining a function to create a lineplot of date added x count added

Aggregating the data:

In [21]:
# Number of titles per (type, date added), in chronological order.
df_new_two = (
    df_clean
    .groupby(['type'])['date_added']
    .value_counts()
    .reset_index()
    .sort_values(by='date_added', ascending=True)
)

In [22]:
# Collapse the per-date counts into one total per (year added, type).
df_new_three = (
    df_new_two
    # Four-digit year of the date, stored as an integer
    .assign(year=lambda frame: pd.to_datetime(frame['date_added']).dt.strftime('%Y').astype(int))
    .drop(columns='date_added')
    .groupby(['year', 'type'])['count']
    .sum()
    .reset_index()
)

Defining a function to create a lineplot:

In [23]:
def bokeh_lineplot(data_to_plot, x_column, y_column):
    """Show a two-line Bokeh chart of yearly counts for TV shows and movies.

    Parameters
    ----------
    data_to_plot : DataFrame-like
        Must provide 'type', 'year' and 'count' columns. Rows whose 'type' is
        'TV Show' or 'Movie' are drawn as one line each.
    x_column, y_column
        Accepted for call compatibility but currently unused: the chart always
        plots 'year' on the x axis and 'count' on the y axis.

    Returns
    -------
    The value returned by ``show`` for the rendered figure.
    """
    vis = figure(
        width=900,
        height=550,
        tools=['pan', 'reset', 'box_zoom', 'wheel_zoom', 'click'],
        title='Dual lineplot showing type of title added by year',
        x_axis_label='Year added',
        y_axis_label='Count added',
        title_location='above',
    )

    vis.title.text_color = "black"
    vis.title.text_font_size = '13pt'

    vis.background_fill_color = "#2d2e2e"
    vis.grid.grid_line_alpha = 0.1  # keep grid lines faint against the dark background

    # One line per title type. `line_width` is the Bokeh property for stroke
    # thickness; the line glyph has no `width` property, so `width=` is rejected.
    series = (('TV Show', '#758ea5'), ('Movie', '#c35c55'))
    for label, colour in series:
        subset = data_to_plot[data_to_plot['type'] == label]
        vis.line(
            x=subset['year'],
            y=subset['count'],
            line_color=colour,
            legend_label=label,
            line_width=5,
        )

    vis.legend.title = 'Type'
    vis.legend.label_text_font_size = '12px'
    vis.legend.location = 'top_left'

    return show(vis)

In [24]:
# Column names given in (x, y) order to match the chart: year on x, count on y.
bokeh_lineplot(df_new_three, 'year', 'count')

# Creating a lineplot of month x content added

Aggregating the data:

In [25]:
# Renamed copy of the cleaned data with a zero-padded month string ('01'..'12').
df3 = (
    df_clean
    .rename(columns={"date_added": "Date Added"})
    .assign(Month=lambda frame: pd.to_datetime(frame['Date Added']).dt.strftime('%m'))
)

# Number of titles per (month, type).
df4 = (
    df3
    .groupby(['Month', 'type'])
    .size()
    .reset_index(name='Total Content')
)

In [26]:
# Monthly totals of added content, one line plus filled area per title type.
p = figure(
    width=900,
    height=600,
    x_axis_label='Month',
    y_axis_label='Total Content Added',
    title='Monthly trend of TV show & Movie content added to Netflix',
    title_location='above',
    background_fill_color="#2d2e2e",
)

p.title.text_color = "black"
p.title.text_font_size = '13pt'

p.grid.grid_line_alpha = 0.7


def _monthly_rows(title_type):
    """Return the df4 rows for one title type, with Month as an integer, in month order."""
    rows = df4[df4['type'] == title_type]
    # The figure uses a numeric x range, so month labels such as '01' must be
    # numbers to be positioned on it. astype(int) also accepts values that are
    # already integers.
    return rows.assign(Month=lambda frame: frame['Month'].astype(int)).sort_values('Month')


movie_source = _monthly_rows('Movie')
tv_show_source = _monthly_rows('TV Show')

# Plot lines
movie_line = p.line(x='Month', y='Total Content', source=movie_source, line_color='#c35c55', line_width=2, legend_label='Movie')
tv_show_line = p.line(x='Month', y='Total Content', source=tv_show_source, line_color='#758ea5', line_width=2, legend_label='TV Show')

# Fill between each line and the x axis
p.varea(x='Month', y1='Total Content', y2=0, source=movie_source, fill_color="#c35c55", alpha=0.9, legend_label='Movie')
p.varea(x='Month', y1='Total Content', y2=0, source=tv_show_source, fill_color="#758ea5", alpha=0.9, legend_label='TV Show')

# One tick per calendar month
p.xaxis.ticker = list(range(1, 13))

# Customize the legend (built automatically from the legend_label values above)
p.legend.title = 'Type'
p.legend.label_text_font_size = '12px'
p.legend.location = 'bottom_left'

# Show the plot
show(p)