# Netflix Shows and Movies - Exploratory Analysis
#### The dataset used here contains information such as year added, cast, director, TV ratings, etc. for the content on Netflix from 2008 to 2019.

Immediate scope of this work: 
* Brush up on basic plotly visualizations.

Future scope of this work: 
* Find and append audience rating data to the current data set and find correlations between existing data and audience ratings.
* Perform NLP analysis of the 'description' feature and find correlation between NLP features and audience ratings.
* Use text generation algorithms to generate descriptions that would give higher ratings.

In [1]:
import pandas as pd
import numpy as np
from _plotly_future_ import v4_subplots
from plotly import graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import plotly.io as pio
pio.renderers.default='notebook'
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from codes import country_codes
from collections import Counter

In [2]:
# Load the catalogue and derive calendar features from the date each title was added.
nf_df = pd.read_csv('netflix_titles.csv')

# Parse once, then reuse the parsed series for the derived columns.
added_on = pd.to_datetime(nf_df['date_added'])
nf_df['date_added'] = added_on
nf_df['year_added'] = added_on.dt.year
nf_df['month_added'] = added_on.dt.month

In [3]:
# Print each column's dtype and non-null count to spot fields with missing values.
nf_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6234 entries, 0 to 6233
Data columns (total 14 columns):
show_id         6234 non-null int64
type            6234 non-null object
title           6234 non-null object
director        4265 non-null object
cast            5664 non-null object
country         5758 non-null object
date_added      6223 non-null datetime64[ns]
release_year    6234 non-null int64
rating          6224 non-null object
duration        6234 non-null object
listed_in       6234 non-null object
description     6234 non-null object
year_added      6223 non-null float64
month_added     6223 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(2), object(9)
memory usage: 681.9+ KB


In [4]:
# (rows, columns) of the loaded frame, including the derived date columns.
nf_df.shape

(6234, 14)

There are a total of 6234 movies/tv shows.

In [5]:
# Number of missing values in each column.
nf_df.isna().sum()

show_id            0
type               0
title              0
director        1969
cast             570
country          476
date_added        11
release_year       0
rating            10
duration           0
listed_in          0
description        0
year_added        11
month_added       11
dtype: int64

In [6]:
def _leading_number(text, keyword):
    """Return the integer that starts `text` if `keyword` occurs in it, otherwise NaN."""
    if keyword in text:
        return int(text.split()[0])
    return np.nan


# 'duration' holds either "<n> Season(s)" or "<n> min"; split it into two numeric columns.
nf_df['season_count'] = nf_df['duration'].map(lambda text: _leading_number(text, "Season"))
nf_df['duration_min'] = nf_df['duration'].map(lambda text: _leading_number(text, "min"))

In [7]:
# Peek at the first rows to check the derived season_count / duration_min columns.
nf_df.head(3)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added,month_added,season_count,duration_min
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China",2019-09-09,2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...,2019.0,9.0,,90.0
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,2016-09-09,2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...,2016.0,9.0,,94.0
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,2018-09-08,2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob...",2018.0,9.0,1.0,


In [34]:
# Share of movies vs. TV shows in the catalogue.
#
# rename_axis(...).reset_index(name=...) gives the columns ['type', 'count'] on every
# pandas version. Renaming the default 'index' column instead breaks on pandas >= 2.0,
# where value_counts() already labels its output 'count' and keeps the column name on
# the index, so the old rename produced two 'count' columns and no 'type' column.
type_count = nf_df['type'].value_counts().rename_axis('type').reset_index(name='count')

# Slice colours keyed by label so they do not depend on the value_counts ordering.
# The previous colour strings contained non-hex characters ('o', 'z') and were not
# valid CSS colours; these match the Movies / TV Shows colours of the yearly line chart.
type_colors = {'Movie': '#a678de', 'TV Show': '#6ab48b'}
slice_colors = [type_colors.get(label) for label in type_count['type']]

trace = go.Pie(
    labels=type_count['type'],
    values=type_count['count'],
    pull=[0.04, 0],  # nudge the first slice outwards
    marker=dict(colors=slice_colors),
)
layout = go.Layout(title="Netflix Content", height=500, legend=dict(x=0.1, y=1.1))
fig = go.Figure(data=[trace], layout=layout)
fig.show()

In [9]:
type_year = nf_df[['type', 'year_added']]


def _titles_added_per_year(content_type):
    """Count titles of `content_type` per year added, in chronological order.

    Returns a frame with the columns 'year' and 'count'. Rows without a year are
    ignored by value_counts(). The year 2020 is excluded (presumably because it is
    only partially covered by the data - confirm).
    """
    years = type_year.loc[type_year['type'] == content_type, 'year_added']
    per_year = (
        years.value_counts()
        # value_counts() orders by frequency; a line trace connects points in data
        # order, so sort by year to get a left-to-right line instead of a zigzag.
        .sort_index()
        # Version-independent column naming (pandas >= 2.0 changed the names that
        # value_counts().reset_index() produces).
        .rename_axis('year')
        .reset_index(name='count')
    )
    return per_year[per_year['year'] != 2020]


movie_year = _titles_added_per_year('Movie')
tv_year = _titles_added_per_year('TV Show')

trace1 = go.Scatter(x=movie_year['year'], y=movie_year['count'], name="Movies", marker=dict(color="#a678de"))
trace2 = go.Scatter(x=tv_year['year'], y=tv_year['count'], name="TV Shows", marker=dict(color="#6ab48b"))
data = [trace1, trace2]
layout = go.Layout(title="Number Of Content Added Over The Years", legend=dict(x=0.7, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()

In [10]:
# A title may list several countries separated by ", "; flatten them into one list.
nf_countries = ", ".join(nf_df['country'].dropna()).split(', ')

# Keep only countries known to the country_codes lookup, recording the title count
# both by code and by the original country name.
country_with_code, country = {}, {}
for name, n_titles in Counter(nf_countries).items():
    lookup_key = name.lower()
    if lookup_key not in country_codes:
        continue
    country_with_code[country_codes[lookup_key]] = n_titles
    country[name] = n_titles

# Custom blue colour scale (reversed when applied to the trace below).
blue_scale = [
    [0, "rgb(5, 10, 172)"],
    [0.65, "rgb(40, 60, 190)"],
    [0.75, "rgb(70, 100, 245)"],
    [0.80, "rgb(90, 120, 245)"],
    [0.9, "rgb(106, 137, 247)"],
    [1, "rgb(220, 220, 220)"],
]
border = dict(line=dict(color='gray', width=0.4))

choropleth = dict(
    type='choropleth',
    locationmode='country names',
    locations=list(country.keys()),
    z=list(country.values()),
    colorscale=blue_scale,
    autocolorscale=False,
    reversescale=True,
    marker=border,
)
data = [choropleth]

geo_settings = dict(
    showframe=False,
    showcoastlines=True,
    projection=dict(type='mercator'),
)
layout = dict(
    title='Netflix Content By Countries',
    height=700,
    width=1000,
    geo=geo_settings,
)

fig = go.Figure(data=data, layout=layout)
fig.show()

In [11]:
# Rank countries by number of titles, highest first.
ranked_countries = sorted(country.items(), key=lambda pair: pair[1], reverse=True)
country_sorted = dict(ranked_countries)

# Take the five leaders and reverse them so the largest bar ends up at the top of
# the horizontal bar chart.
leaders = ranked_countries[:5][::-1]
top_countries = [name for name, _count in leaders]
top_countries_vals = [count for _name, count in leaders]

trace1 = go.Bar(
    y=top_countries,
    x=top_countries_vals,
    orientation="h",
    name="",
    marker=dict(color="#6ab48b"),
)

data = [trace1]
layout = go.Layout(
    title="Top 5 Countries For Content",
    height=300,
    legend=dict(x=0.1, y=1.1, orientation="h"),
)
fig = go.Figure(data, layout=layout)
fig.show()

In [12]:
# Movie running times only: rows without a minute value (TV shows) are dropped.
mov_dur = nf_df['duration_min'].reset_index(drop=True).to_frame().dropna()
median_minutes = mov_dur['duration_min'].median()

fig = ff.create_distplot(
    [mov_dur['duration_min']],
    ['# of Movies'],
    bin_size=1,
    curve_type='normal',
    colors=["#3a489b"],
)

# Vertical dash-dot marker at the median running time.
median_marker = dict(
    type='line',
    yref='y', y0=0, y1=0.027,
    xref='x', x0=median_minutes, x1=median_minutes,
    line=dict(color='orange', width=3, dash='dashdot'),
)
fig.layout.update(shapes=[median_marker])

# Text-only trace that labels the median line; only the first point carries text.
label_x = median_minutes + 29
fig.add_trace(go.Scatter(
    x=[label_x, label_x, 6],
    y=[0.025, 0.025, 0.025],
    text=["Median Duration"],
    mode="text",
))

fig.layout.update(title_text='Distribution Of Movie Duration (mins)')
fig.show()

In [13]:
# Number of TV shows per season count.
#
# rename_axis(...).reset_index(name=...) gives the columns ['seasons', 'count'] on
# every pandas version; renaming the default 'index' / 'season_count' columns breaks
# on pandas >= 2.0, where value_counts().reset_index() already yields
# ['season_count', 'count'] and the old rename produced two 'count' columns.
tv_seasons = nf_df.loc[nf_df['type'] == 'TV Show', 'season_count']
sns_count = tv_seasons.value_counts().rename_axis('seasons').reset_index(name='count')

trace1 = go.Bar(x=sns_count['seasons'], y=sns_count['count'], name="TV Shows", marker=dict(color="#e377c2"))
data = [trace1]
layout = go.Layout(title="Season Count Of TV Shows", height=400, legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()

In [14]:
def _rating_counts(content_type):
    """Count titles of `content_type` per rating.

    Returns a frame with the columns 'rating' and 'count', most frequent first.
    rename_axis(...).reset_index(name=...) is used because the column names that
    value_counts().reset_index() produces changed in pandas 2.0, which made the
    previous {'index': 'rating', 'rating': 'count'} rename yield duplicate columns.
    """
    ratings = nf_df.loc[nf_df['type'] == content_type, 'rating']
    return ratings.value_counts().rename_axis('rating').reset_index(name='count')


mov_ratings = _rating_counts('Movie')
tv_ratings = _rating_counts('TV Show')

trace1 = go.Bar(x=mov_ratings['rating'], y=mov_ratings['count'], name="Movies", marker=dict(color="#bcbd22"))
trace2 = go.Bar(x=tv_ratings['rating'], y=tv_ratings['count'], name="TV Show", marker=dict(color="#6ab48b"))
data = [trace1, trace2]
layout = go.Layout(title="Ratings", height=400, legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()

In [15]:
# A title can be listed in several genres separated by ", "; flatten them into one
# single-column frame so every genre mention is counted.
genres = pd.DataFrame({'genre': ", ".join(nf_df['listed_in']).split(", ")})

# rename_axis(...).reset_index(name=...) gives ['genre', 'count'] on every pandas
# version; the previous {'index': 'genre', 'genre': 'count'} rename produced two
# 'count' columns on pandas >= 2.0.
genre_count = genres['genre'].value_counts().rename_axis('genre').reset_index(name='count')

# value_counts() sorts by frequency, so the first ten rows are the ten largest genres.
top_genres = genre_count.head(10)

fig = go.Figure(data=[go.Pie(labels=top_genres['genre'], values=top_genres['count'], hole=.3)])
fig.layout.update(title_text='Top 10 Genres On Netflix')
fig.show()

In [32]:
# Build one horizontal bar trace per top-5 country showing its five most credited
# cast members.
#
# The cast/country pairs and the per-title country lists do not depend on the country
# being processed, so they are prepared once outside the loop. Taking a fresh frame
# from dropna() (rather than dropping in place on a slice of nf_df and writing a flag
# column row by row) avoids SettingWithCopy problems and the slow iterrows loop.
actors_countries = nf_df[['cast', 'country']].dropna()
country_lists = actors_countries['country'].str.split(', ')

traces = []
top5_countries = list(country_sorted.keys())[:5][::-1]
for cnt in top5_countries:
    # Exact membership test against the split list, so a country name that is a
    # substring of another one cannot match by accident.
    made_in_cnt = country_lists.apply(lambda names: cnt in names)
    cast_names = ", ".join(actors_countries.loc[made_in_cnt, 'cast']).split(", ")

    # Version-independent column naming: the names produced by
    # value_counts().reset_index() changed in pandas 2.0.
    top_actors = (
        pd.Series(cast_names, name='actor')
        .value_counts()
        .rename_axis('actor')
        .reset_index(name='count')
    )
    top5_actors = top_actors.head(5)

    trace = go.Bar(
        y=top5_actors['actor'],
        x=top5_actors['count'],
        orientation="h",
        name="",
        marker=dict(color="#a678de"),
    )
    traces.append(trace)

In [33]:
# One stacked panel per top-5 country, each showing that country's actor bar trace.
n_panels = 5
fig = make_subplots(rows=n_panels, cols=1, subplot_titles=top5_countries)
for panel_row in range(1, n_panels + 1):
    # traces is 0-based, subplot rows are 1-based.
    fig.add_trace(traces[panel_row - 1], panel_row, 1)

fig.layout.update(
    height=1200,
    showlegend=False,
    title='Most Featured Actors In Countries With The Most Content',
)
fig.show()