In [1]:
import os 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.offline import iplot, plot
import plotly.graph_objects as go
from plotly.subplots import make_subplots



from collections import Counter


In [2]:
# List the dataset directory to confirm which file(s) are available to load.
os.listdir('/kaggle/input/netflix-dataset') 

['netflix1.csv']

# Read Data 

In [3]:
# Load the CSV into the `netflix` DataFrame that the cells below operate on.
netflix = pd.read_csv('/kaggle/input/netflix-dataset/netflix1.csv') 

# Check Basic Information  

In [4]:
# Dimensions of the loaded frame as (rows, columns).
netflix.shape

(8790, 10)

In [5]:
# Per-column dtypes, to spot columns that still need converting.
netflix.dtypes

show_id         object
type            object
title           object
director        object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
dtype: object

In [6]:
# Summary of columns, non-null counts, dtypes and memory usage.
netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8790 entries, 0 to 8789
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8790 non-null   object
 1   type          8790 non-null   object
 2   title         8790 non-null   object
 3   director      8790 non-null   object
 4   country       8790 non-null   object
 5   date_added    8790 non-null   object
 6   release_year  8790 non-null   int64 
 7   rating        8790 non-null   object
 8   duration      8790 non-null   object
 9   listed_in     8790 non-null   object
dtypes: int64(1), object(9)
memory usage: 686.8+ KB


In [7]:
# Preview the first 10 rows to get a feel for the values in each column.
netflix.head(10)

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,PG-13,90 min,Documentaries
1,s3,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,s6,TV Show,Midnight Mass,Mike Flanagan,United States,9/24/2021,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,s14,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9/22/2021,2021,TV-PG,91 min,"Children & Family Movies, Comedies"
4,s8,Movie,Sankofa,Haile Gerima,United States,9/24/2021,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies"
5,s9,TV Show,The Great British Baking Show,Andy Devonshire,United Kingdom,9/24/2021,2021,TV-14,9 Seasons,"British TV Shows, Reality TV"
6,s10,Movie,The Starling,Theodore Melfi,United States,9/24/2021,2021,PG-13,104 min,"Comedies, Dramas"
7,s939,Movie,Motu Patlu in the Game of Zones,Suhas Kadav,India,5/1/2021,2019,TV-Y7,87 min,"Children & Family Movies, Comedies, Music & Mu..."
8,s13,Movie,Je Suis Karl,Christian Schwochow,Germany,9/23/2021,2021,TV-MA,127 min,"Dramas, International Movies"
9,s940,Movie,Motu Patlu in Wonderland,Suhas Kadav,India,5/1/2021,2013,TV-Y7,76 min,"Children & Family Movies, Music & Musicals"


 # **Data Cleaning & Exploration** 

# 1. show_id column 

In [8]:
# Count rows whose show_id repeats an earlier row's show_id:

netflix.duplicated(subset = 'show_id').sum()

0

In [9]:
# Delete show_id column.
# errors='ignore' keeps this cell idempotent: re-running it once the column is
# already gone is a no-op instead of raising KeyError.

netflix = netflix.drop(columns = ['show_id'], errors = 'ignore')

# 2. type column

In [10]:
# Count how many rows fall under each distinct value of the type column:

netflix['type'].value_counts() 

type
Movie      6126
TV Show    2664
Name: count, dtype: int64

In [11]:
# Share of each show type relative to the total number of rows
# (normalize is a boolean flag, so pass True rather than 1):

netflix['type'].value_counts(normalize = True)

type
Movie      0.696928
TV Show    0.303072
Name: proportion, dtype: float64

In [12]:
# Draw a bar chart of how often each show type occurs:

type_of_shows = netflix['type'].value_counts()

# Build an explicit two-column frame so the axis and legend titles come from
# real column names instead of the names plotly infers from a Series and its
# index (those inferred names depend on the pandas version).
type_counts = type_of_shows.rename_axis('Type').reset_index(name = 'Frequency')

fig = px.bar(data_frame = type_counts,
             x = 'Type',
             y = 'Frequency',
             color = 'Type',
             color_discrete_sequence = ['#222', '#E50914'],
             text_auto = True,
             title = 'Frequency of Show Type')

# Larger monospace font for the counts drawn inside the bars.
fig.update_traces(insidetextfont = {
                                    'family' : 'consolas',
                                    'size': 20
})

fig

# 🌃 3.  country column

In [13]:
# Summary of the country column: count, number of unique values, most common value and its frequency.
netflix['country'].describe()

count              8790
unique               86
top       United States
freq               3240
Name: country, dtype: object

In [14]:
# Number of rows per country, most frequent first:

netflix['country'].value_counts()  

country
United States     3240
India             1057
United Kingdom     638
Pakistan           421
Not Given          287
                  ... 
Iran                 1
West Germany         1
Greece               1
Zimbabwe             1
Soviet Union         1
Name: count, Length: 86, dtype: int64

# 4. date_added column

In [15]:
# Check the current dtype of date_added before converting it.
netflix['date_added'].dtype

dtype('O')

In [16]:
# Convert date_added from string to datetime.
# The format is stated explicitly (month/day/year, e.g. 9/25/2021) so that
# ambiguous values such as 5/1/2021 are never left to format inference.

netflix['date_added'] = pd.to_datetime(netflix['date_added'], format = '%m/%d/%Y')
netflix['date_added']

0      2021-09-25
1      2021-09-24
2      2021-09-24
3      2021-09-22
4      2021-09-24
          ...    
8785   2017-01-17
8786   2018-09-13
8787   2016-12-15
8788   2018-06-23
8789   2018-06-07
Name: date_added, Length: 8790, dtype: datetime64[ns]

In [17]:
# Report the earliest and the latest date_added values:

added_dates = netflix['date_added']
oldest_added, newest_added = added_dates.min(), added_dates.max()

print(f"The oldest added date: {oldest_added}")
print(f"The newest added date: {newest_added}")

The oldest added date: 2008-01-01 00:00:00
The newest added date: 2021-09-25 00:00:00


# 5. release_year column 

In [18]:
# Count the rows per release year (value_counts orders the result by count, not by year):

released_year = netflix['release_year'].value_counts() 
released_year

release_year
2018    1146
2017    1030
2019    1030
2020     953
2016     901
        ... 
1966       1
1959       1
1925       1
1947       1
1961       1
Name: count, Length: 74, dtype: int64

In [19]:
# Draw an area plot to see how the number of titles changes by release year.
# released_year comes from value_counts(), which orders rows by count; sort by
# year first, otherwise the trace is drawn in count order and zig-zags back and
# forth along the x axis.

titles_per_year = (released_year
                   .sort_index()
                   .rename_axis('Release Year')
                   .reset_index(name = 'Titles'))

fig = px.area(titles_per_year,
              x = 'Release Year',
              y = 'Titles',
              title = 'Titles by Release Year')

fig

#  6. rating column

In [20]:
# Number of rows per rating label, most frequent first.
rating = netflix['rating'].value_counts() 
rating

rating
TV-MA       3205
TV-14       2157
TV-PG        861
R            799
PG-13        490
TV-Y7        333
TV-Y         306
PG           287
TV-G         220
NR            79
G             41
TV-Y7-FV       6
NC-17          3
UR             3
Name: count, dtype: int64

In [21]:
# Draw a horizontal bar chart of how often each rating occurs:

# Explicit column names make the axis and legend titles deterministic instead
# of depending on the names plotly infers from the Series and its index.
rating_counts = rating.rename_axis('Type of Rating').reset_index(name = 'Frequency')

fig = px.bar(data_frame = rating_counts,
             x = 'Frequency',
             y = 'Type of Rating',
             color = 'Type of Rating',
             text_auto = True,
             orientation = 'h',
             title = 'Popularity of Rating',
             height = 500)

# The value labels are placed outside the bars, so their font has to be set via
# outsidetextfont; insidetextfont only styles text drawn inside a bar.
fig.update_traces(textposition = 'outside',
                  outsidetextfont = {"family":"consolas",
                                     "size":15})
fig

# 📆 7. duration column

In [22]:
# Frequency of each duration string; the column mixes "N min" and "N Season(s)" values.
netflix['duration'].value_counts() 

duration
1 Season      1791
2 Seasons      421
3 Seasons      198
90 min         152
97 min         146
              ... 
5 min            1
16 min           1
186 min          1
193 min          1
11 Seasons       1
Name: count, Length: 220, dtype: int64

# Q1. The Most Common Rating for Each Type of Show

# *Numerical View*

In [23]:
# Most frequent rating among rows whose type is 'Movie'.
print('Movie')

is_movie = netflix['type'] == 'Movie'
movie_rating_counts = netflix.loc[is_movie, 'rating'].value_counts()
movie_rating_counts.nlargest(1)

Movie


rating
TV-MA    2062
Name: count, dtype: int64

In [24]:
# Most frequent rating among rows whose type is 'TV Show'.
print('TV Show')

is_tv_show = netflix['type'] == 'TV Show'
tv_show_rating_counts = netflix.loc[is_tv_show, 'rating'].value_counts()
tv_show_rating_counts.nlargest(1)

TV Show


rating
TV-MA    1143
Name: count, dtype: int64

# *Graphical View*

In [25]:
# Rating counts computed separately for each show type.
movies = netflix.loc[netflix['type']=='Movie', 'rating'].value_counts()
tv_show = netflix.loc[netflix['type']=='TV Show', 'rating'].value_counts()

# One bar trace per show type.
movie_bar = go.Bar(x = movies.index, y = movies, name = 'movie')
tv_show_bar = go.Bar(x = tv_show.index, y = tv_show, name = 'tv show')

# Side-by-side panels; the y axes are independent because the two types
# are counted separately.
fig = make_subplots(rows=1, cols=2, shared_yaxes = False)

fig.add_trace(movie_bar, row=1, col=1)
fig.add_trace(tv_show_bar, row=1, col=2)

# Title typo fixed ('Ratigin' -> 'Rating').
fig.update_layout(height = 500, width = 700, title_text = 'Rating Per Each Show Type')
fig.update_xaxes(tickangle = 90)

fig

# Who are the top 5 directors with the most Movies & TV Shows?

In [26]:
# Top 5 directors by number of titles.
# The original slice [1:6] skipped whatever value happened to be most frequent,
# presumably the 'Not Given' placeholder also seen in the country column
# (TODO confirm the placeholder spelling for director). Drop it by label so the
# result no longer depends on the placeholder ranking first; errors='ignore'
# makes this a no-op if the label is absent.
directors = (netflix['director']
             .value_counts()
             .drop('Not Given', errors = 'ignore')
             .head(5))
directors

director
Rajiv Chilaka             20
Alastair Fothergill       18
Raúl Campos, Jan Suter    18
Suhas Kadav               16
Marcus Raboy              16
Name: count, dtype: int64

In [27]:
# Horizontal bar chart of the top directors.
# Explicit column names give the axes their intended titles directly, instead
# of relying on a labels mapping keyed on names plotly infers from the Series.
director_counts = directors.rename_axis('Directors').reset_index(name = 'Amounts of movies & TV')

fig = px.bar(director_counts,
             x = 'Amounts of movies & TV',
             y = 'Directors',
             orientation = 'h',
             color = 'Directors',
             text_auto = True,
             title = 'Top 5 Directors')

# Counts are drawn outside the bars, so style them through outsidetextfont.
fig.update_traces(textposition = 'outside',
                  outsidetextfont = { 'family': 'consolas',
                                      'size': 20} )

# The legend would only repeat the y-axis names, so hide it.
fig.update_layout(showlegend = False, height = 600, width = 800)

# What Is the Number of Movies and TV Shows Added to the Netflix Platform Each Year?

In [28]:
# Bucket every row by the calendar year of date_added, then count the
# non-null type values in each bucket.
year_added = netflix['date_added'].dt.year
added_per_year = netflix['type'].groupby(year_added).count()

added_per_year

date_added
2008       2
2009       2
2010       1
2011      13
2012       3
2013      11
2014      24
2015      82
2016     426
2017    1185
2018    1648
2019    2016
2020    1879
2021    1498
Name: type, dtype: int64