# Import Library and Dataset

In [None]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
# Load the Netflix titles CSV into a DataFrame.
netflix = pd.read_csv(filepath_or_buffer='../input/netflix-shows/netflix_titles.csv')

# Data Exploration

In [None]:
# Summarize columns (dtypes, non-null counts), then preview the first 5 rows
netflix.info()

netflix.head()

The "date_added" column needs its date format changed from "Month D, Yr" to YYYY-MM-DD.

In [None]:
# Parse "date_added" from "Month D, Yr" text into datetime64 (shown as YYYY-MM-DD).
# Values are stripped first: stray leading/trailing whitespace can make the strict
# format inference of recent pandas versions raise on otherwise valid dates.
# The dtype check keeps the cell safe to re-run once the column is converted
# (the .str accessor is not available on a datetime column).
if not pd.api.types.is_datetime64_any_dtype(netflix['date_added']):
    netflix['date_added'] = pd.to_datetime(netflix['date_added'].str.strip())

netflix.info()

netflix.head(5)

In [None]:
#count missing values per column
missing_value_count = netflix.isnull().sum()
# Only the last expression of a cell is displayed automatically, so the counts
# are printed explicitly; otherwise they would be computed but never shown.
print(missing_value_count)

#viz missing values in a heatmap (one row per title, one column per field)
plt.figure(figsize=(8,8))
sns.heatmap(netflix.isnull(), cbar=False)
plt.show()

The "director" column has the most missing values, followed by "country", "cast", and "date_added".
I won't fill these gaps by prediction, because the columns are string-typed and independent of the other variables.

## What are the types of content
Analyze the "type" variable, which indicates whether an item is a Movie or a TV Show.

In [None]:
# Split the catalogue by content type
movie = netflix.loc[netflix["type"] == "Movie"]
tvshow = netflix.loc[netflix["type"] == "TV Show"]

# Bar chart: number of titles per type
plt.figure(figsize=(12, 8))
sns.countplot(data=netflix, x='type', palette="rocket")
plt.title("Number of contents", fontsize=13)
plt.xlabel("")
plt.ylabel("")
plt.xticks(fontsize=13)
plt.yticks(fontsize=10)

# Report the two counts as text, then render the figure
print("{} movies and {} TV shows".format(len(movie), len(tvshow)))
plt.show()

## Rating

In [None]:
# Rating codes grouped by intended audience; any code not listed here is
# reported as 'Not Rated' by age().
# 'TV-Y7' replaces the misspelled 'TV-T7', which matched no rating and caused
# TV-Y7 titles to be classified as 'Not Rated'.
kids = ['TV-Y','G','TV-Y7','TV-Y7-FV','TV-G','PG','TV-PG']
teens = ['PG-13','TV-14']
adult = ['R','TV-MA','NC-17']

def age(x):
    """Map a rating code to its audience group.

    Returns 'Kids', 'Teens' or 'Adults' when ``x`` appears in the matching
    module-level list (checked in that order), otherwise 'Not Rated'.
    """
    for label, codes in (('Kids', kids), ('Teens', teens), ('Adults', adult)):
        if x in codes:
            return label
    return 'Not Rated'


# Rating column alongside the audience group derived from it
netflix_rating = netflix['rating'].to_frame().assign(ages=netflix['rating'].apply(age))

netflix_rating

In [None]:
# Number of titles for every (audience group, rating) pair
nr = (
    netflix_rating
    .groupby(['ages', 'rating'])
    .agg(number=('rating', 'count'))
    .sort_values('ages')
    .reset_index()
)

nr

In [None]:
# Sunburst chart: audience groups on the inner ring, individual ratings outside
fig = px.sunburst(data_frame=nr, path=['ages', 'rating'], values='number')

fig.show()

## Genre

In [None]:
# Peek at the raw comma-separated genre strings
netflix.loc[:, 'listed_in'].head(n=3)

In [None]:
df = netflix
# One row per (title, genre): split the comma-separated "listed_in" text,
# explode the resulting lists into rows, and trim the space left after each comma.
# explode() avoids building a wide, padded frame only to stack() it again, and
# dropna() skips any title whose "listed_in" value is missing.
new_df = (
    df['listed_in']
    .str.split(',')
    .explode()
    .dropna()
    .str.strip()
    .to_frame('genre')
    .reset_index(drop=True)
)

new_df[:5]

In [None]:
# Tally titles per genre (most frequent first) and show the top 10
genre = new_df[['genre']].value_counts().reset_index()
genre.columns = ['genre', 'count']
genre.head(10)

In [None]:
# Bar chart of the 10 most frequent genres
fig = px.bar(data_frame=genre.head(10), x='genre', y='count')

fig.show()

## Country (making contents)

In [None]:
# Titles without a country get the placeholder "null" so they can still be split.
netflix['country'] = netflix['country'].fillna("null")
# One row per (country, date_added) pair: a title listed under several countries
# contributes one row per country. Vectorized split + explode replaces building a
# separate Series for every row via iterrows(), which is slow for large frames.
netflix_country = (
    netflix[['country', 'date_added']]
    .assign(country=lambda frame: frame['country'].str.split(','))
    .explode('country')
    .reset_index(drop=True)
)

netflix_country

In [None]:
# Trim the whitespace left over from splitting on ","
netflix_country['country'] = netflix_country['country'].str.strip()

# Titles per country, most frequent first
contents_by_country = netflix_country['country'].value_counts().reset_index()
contents_by_country.columns = ['country', 'number of contents']

contents_by_country

In [None]:
#delete the "null" placeholder row (titles with no country)
# Filter by value instead of a hard-coded row position, and assign the result:
# drop() returns a new frame, so an unassigned call leaves the row in place and
# the placeholder would still be counted among the top countries later on.
contents_by_country = contents_by_country[contents_by_country['country'] != 'null'].reset_index(drop=True)

contents_by_country

In [None]:
# Newest additions first, with a unit counter on every (country, date) row
netflix_country = netflix_country.sort_values('date_added', ascending=False).assign(n=1)

# Titles added per country and date, accumulated into a running total per country
date_country = (
    netflix_country
    .groupby(['country', 'date_added'])
    .sum()
    .groupby(level=0)
    .cumsum()
    .reset_index()
)

date_country.sort_values('n')

In [None]:
# Rows attributed to the United States
netflix_country.loc[netflix_country['country'].eq('United States')]

In [None]:
# Rows whose "date_added" is missing
netflix_country.loc[netflix_country['date_added'].isna()]

In [None]:
#Replace NaT with a placeholder date (2015-01-01)
# NOTE(review): the placeholder is an arbitrary choice; the earlier comment named a
# different date than the code used, so confirm which one is intended.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form warns in pandas 2.x and does not
# modify the frame under copy-on-write.
netflix_country['date_added'] = netflix_country['date_added'].fillna(pd.Timestamp('20150101'))
# Recompute the running totals now that rows without a date are included
date_country = netflix_country.groupby(['country','date_added']).sum().groupby(level = 0).cumsum().reset_index()
# Final cumulative count per country, largest first
pd.DataFrame(date_country.groupby(['country'])['n'].max()).sort_values('n',ascending = False)

In [None]:
# Line chart of cumulative titles over time for the 10 most frequent countries
top10_country = contents_by_country.head(10).set_index('country')
dc = date_country.loc[date_country['country'].isin(top10_country.index.tolist())]
fig = px.line(data_frame=dc, x='date_added', y='n', color='country')

fig.show()