In [1]:
import nltk
import re
import pandas as pd
import numpy as np
import seaborn as sns 
import collections as c
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Data preprocessing
# Read the catalogue CSV from the working directory, then preview its first three rows.
df_netflix = pd.read_csv(filepath_or_buffer='netflix_titles.csv')
df_netflix.iloc[:3]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."


In [3]:
# Number of rows left once fully duplicated rows are removed
df_netflix.drop_duplicates().shape[0]

6234

In [4]:
# Number of rows whose type is 'Movie'
int((df_netflix['type'] == 'Movie').sum())

4265

In [5]:
# Titles of the rows that have no date_added value
df_netflix.loc[df_netflix['date_added'].isna(), 'title'].tolist()

['Gunslinger Girl',
 'Anthony Bourdain: Parts Unknown',
 'Frasier',
 'La Familia P. Luche',
 'The Adventures of Figaro Pho',
 'Kikoriki',
 'Red vs. Blue',
 'Maron',
 'Little Baby Bum: Nursery Rhyme Friends',
 "A Young Doctor's Notebook and Other Stories",
 'Friends']

In [6]:
# Keep only titles with a known date_added. Take an explicit copy so that the
# column assignments in later cells modify an independent frame rather than a
# slice of the original (avoids pandas' SettingWithCopyWarning).
df_netflix = df_netflix[df_netflix['date_added'].notna()].copy()

In [7]:
# Handle missing values and standardize datetime
# Parse from text with surrounding whitespace stripped, so a stray leading or
# trailing space cannot break date parsing. Casting to str first keeps the cell
# re-runnable after the column has already been converted to datetime.
parsed_date_added = pd.to_datetime(df_netflix['date_added'].astype(str).str.strip())

# assign() builds a new frame instead of writing columns into a filtered
# slice, which avoids pandas' SettingWithCopyWarning.
df_netflix = df_netflix.assign(
    rating=df_netflix['rating'].fillna(""),
    director=df_netflix['director'].fillna(""),
    date_added=parsed_date_added,
    year_added=parsed_date_added.dt.year,
)

In [52]:
# Objective 1: Understanding what content is available in different countries
# A title may list several comma-separated countries; flatten every mention
# into one list and count them.
country_count = c.Counter(
    ", ".join(df_netflix['country'].dropna()).split(", ")
)
top_ten_countries = country_count.most_common(10)

# most_common() is ordered from largest to smallest; reverse it so the bars
# run from the smallest of the top ten up to the largest.
country = [name for name, _ in reversed(top_ten_countries)]
show_count = [total for _, total in reversed(top_ten_countries)]

trace1 = go.Bar(
    x=country,
    y=show_count,
    orientation="v",
    name="",
    marker=dict(color='Pink'),
)
data = [trace1]
layout = go.Layout(
    title="Top 10 countries with most content on Netflix",
    height=400,
    width=700,
    legend=dict(x=0.1, y=1.1, orientation="h"),
)
fig = go.Figure(data, layout=layout)
fig.show()

In [43]:
# Objective 2: The top genre on Netflix.
# 'listed_in' holds comma-separated genres; flatten every mention into one
# list, count them and keep the ten most frequent.
genre_mentions = ", ".join(df_netflix['listed_in']).split(", ")
top_ten_genre = c.Counter(genre_mentions).most_common(10)

# Reverse the largest-first ordering so the bars grow from left to right.
genre = [label for label, _ in reversed(top_ten_genre)]
count = [total for _, total in reversed(top_ten_genre)]

trace2 = go.Bar(
    x=genre,
    y=count,
    orientation="v",
    name="",
    marker=dict(color="Black"),
)
data = [trace2]
layout = go.Layout(
    title="Top 10 genres on Netflix",
    height=400,
    width=700,
    legend=dict(x=0.1, y=1.1, orientation="h"),
)
fig = go.Figure(data, layout=layout)
fig.show()

In [20]:
# Objective 3: The top 10 genres by Countries
def df_by_country(df, country):
  '''
  Returns a filtered df with shows available to a specific country
  Input: A dataframe with a 'country' column of comma-separated country
         names, and a selected country name
  Output: A dataframe holding the rows of df whose 'country' value lists the
          selected country; rows with a missing country are excluded and the
          original index is preserved
  '''
  def lists_country(value):
    # Missing values are not strings and never match.
    if not isinstance(value, str):
      return False
    # Compare against each comma-separated entry exactly, so that e.g.
    # 'Niger' does not select 'Nigeria' and regex metacharacters in the
    # requested name are taken literally.
    return country in (entry.strip() for entry in value.split(','))

  # astype(bool) keeps the mask boolean even when df has no rows.
  mask = df['country'].apply(lists_country).astype(bool)
  return df[mask]

In [40]:
# Objective 3: The top 10 genres by Countries
def top_genre_by_country(df, country):
  '''
  Returns the ten most frequent genres among the shows listed for a country
  Input: A dataframe with 'country' and 'listed_in' columns, and a selected
         country name
  Output: Two parallel lists (genre names, their counts), ordered from the
          least to the most frequent of the top ten; both lists are empty
          when no show matches
  '''
  # Use the dataframe that was passed in rather than a notebook-level global.
  shows = df_by_country(df, country)
  # Count genres row by row; unlike joining everything into one string first,
  # this yields no spurious '' genre when there are no matching shows.
  genre_counter = c.Counter(
      name
      for listed in shows['listed_in'].dropna()
      for name in listed.split(", ")
  ).most_common(10)
  genre = [val[0] for val in genre_counter][::-1]
  count = [val[1] for val in genre_counter][::-1]
  return genre, count

# 2x2 grid with one horizontal bar chart of top genres per country.
fig = make_subplots(rows=2, cols=2, horizontal_spacing=0.4,
      subplot_titles=("USA",'France', 'Japan', 'India'))
# The three lists below are parallel: entry k of each describes subplot k,
# in the same order as subplot_titles above.
country = ['United States', 'France', 'Japan', 'India']
colors = ['Violet', 'Lime', 'Brown', 'Grey']
position = [(1,1), (1,2), (2,1), (2,2)]

for country_name, bar_color, (row, col) in zip(country, colors, position):
  genre, count = top_genre_by_country(df_netflix, country_name)
  fig.add_trace(go.Bar(y=genre, x=count,
                       orientation="h", name="",
                       marker=dict(color=bar_color)), row=row, col=col)

# The layout applies to the whole figure, so set it once after all traces are added.
fig.update_layout(showlegend=False, height = 650, width = 900, title_text="Top 10 genres by country")

fig.show()

In [46]:
# Objective 4: Percentage of Netflix Content in English,Spanish,Korean
# Language is approximated from the genre text: a title counts as Spanish or
# Korean when 'listed_in' mentions that word, and every remaining title is
# counted as English.
# NOTE(review): this is a rough proxy; titles in other languages also fall
# into the 'English' bucket - confirm this is acceptable for the analysis.
netflix_spanish = int(df_netflix['listed_in'].str.contains('Spanish', na=False).sum())
netflix_korean = int(df_netflix['listed_in'].str.contains('Korean', na=False).sum())
netflix_english = len(df_netflix) - netflix_spanish - netflix_korean

# One value per label, in the same order, using the counts computed above.
labels = ['Spanish','Korean','English']
values = [netflix_spanish, netflix_korean, netflix_english]
# Valid CSS colour names, one per slice.
colors = ['lightcoral', 'lightblue', 'green']

fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
fig.update_traces(marker=dict(colors=colors), hoverinfo = 'skip')
fig.update_layout(title_text ='Percentage of Netflix content in English, Korean and Spanish',)
fig.show()

In [50]:
# Objective 5: Determining whether Netflix has increasingly focused on TV rather than movies in recent years

# Titles added per year, one table per content type. Only the first 9 (TV) and
# first 12 (movie) yearly rows are kept.
# NOTE(review): presumably these positional cut-offs drop the incomplete most
# recent year - confirm against the data.
tv_add_count = (
    df_netflix.loc[df_netflix['type'] == 'TV Show']
    .groupby('year_added')
    .size()
    .reset_index(name='added_count')
    .iloc[:9]
)
movie_add_count = (
    df_netflix.loc[df_netflix['type'] == 'Movie']
    .groupby('year_added')
    .size()
    .reset_index(name='added_count')
    .iloc[:12]
)

trace1 = go.Scatter(
    x=tv_add_count['year_added'],
    y=tv_add_count['added_count'],
    name="TV Shows",
    marker=dict(color="Black"),
)
trace2 = go.Scatter(
    x=movie_add_count['year_added'],
    y=movie_add_count['added_count'],
    name="Movies",
    marker=dict(color="Blue"),
)
data = [trace1, trace2]
layout = go.Layout(
    title="TV shows vs. Movies added over the years",
    width=600,
    legend=dict(x=0.1, y=1.1, orientation="h"),
)
fig = go.Figure(data, layout=layout)
fig.show()