

### Dataset: https://www.kaggle.com/shivamb/netflix-shows

### Step by step:
- Import data and libraries
- High level EDA: use describe function to check values
- Data analysis 
- String column analysis

### Import data and libraries

In [1]:
import pandas as pd

In [2]:
import plotly.express as px

In [34]:
data_import = pd.read_csv("netflix_titles.csv")

In [35]:
# Change date type to date time format
data_import["date_added"] = pd.to_datetime(data_import["date_added"].str.strip(), format = "%B %d, %Y")
data_import.head(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


### High level EDA

In [36]:
# check datatype of columns
data_import.dtypes

show_id                 object
type                    object
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
description             object
dtype: object

### Numerical column describe

In [38]:
# do basic statistics for numerical column to check values
data_import.describe()

Unnamed: 0,release_year
count,8807.0
mean,2014.180198
std,8.819312
min,1925.0
25%,2013.0
50%,2017.0
75%,2019.0
max,2021.0


### Date analysis

#### Release year

In [39]:
# When did the time release movies and tv shows the most ?
px.histogram(data_import, x = "release_year") # plotly method

In [10]:
# Outliers? Check the released year <= 1925
data_import[data_import["release_year"] <= 1925]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
4250,s4251,TV Show,Pioneers: First Women Filmmakers*,,,,2018-12-30,1925,TV-14,1 Season,TV Shows,This collection restores films from women who ...


#### Date added

In [11]:
# Which period of time did the movies and shows add to netflix the most?

px.histogram(data_import, x = "date_added") # plotly method

 # >> netflix add a bunch of movies at the middle and end of the year

In [12]:
# Extract month from date_added 
data_import["date_added_month"] = data_import["date_added"].dt.month.fillna(0)

# Extract day from date_added 
data_import["date_added_day"] = data_import["date_added"].dt.day.fillna(0)

data_import.head(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,date_added_month,date_added_day
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",9.0,25.0
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",9.0,24.0
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,9.0,24.0
3,s4,TV Show,Jailbirds New Orleans,,,,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",9.0,24.0
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,9.0,24.0


In [13]:
# Which month did the movies and shows add to netflix the most?

px.histogram(data_import, x = "date_added", color = "date_added_month") 

In [14]:
px.histogram(data_import, x = "date_added_month", color = "type")

# >> Netflix has Movies more than TV Shows.

In [15]:
px.histogram(data_import, x = "date_added_day", color = "type") 
# looks like Netflix adds movies on the first of the month more than other day

### String column analysis

In [16]:
# see all columns
data_import.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'date_added_month', 'date_added_day'],
      dtype='object')

In [43]:
data_import["country"].unique()

# >> One movie is added to multiple countries on netflix.

array(['United States', 'South Africa', nan, 'India',
       'United States, Ghana, Burkina Faso, United Kingdom, Germany, Ethiopia',
       'United Kingdom', 'Germany, Czech Republic', 'Mexico', 'Turkey',
       'Australia', 'United States, India, France', 'Finland',
       'China, Canada, United States',
       'South Africa, United States, Japan', 'Nigeria', 'Japan',
       'Spain, United States', 'France', 'Belgium',
       'United Kingdom, United States', 'United States, United Kingdom',
       'France, United States', 'South Korea', 'Spain',
       'United States, Singapore', 'United Kingdom, Australia, France',
       'United Kingdom, Australia, France, United States',
       'United States, Canada', 'Germany, United States',
       'South Africa, United States', 'United States, Mexico',
       'United States, Italy, France, Japan',
       'United States, Italy, Romania, United Kingdom',
       'Australia, United States', 'Argentina, Venezuela',
       'United States, United Kin

In [21]:
# Split the countries into one country in one row
data_import["country"].str.split(",", expand= True)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,United States,,,,,,,,,,,
1,South Africa,,,,,,,,,,,
2,,,,,,,,,,,,
3,,,,,,,,,,,,
4,India,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,United States,,,,,,,,,,,
8803,,,,,,,,,,,,
8804,United States,,,,,,,,,,,
8805,United States,,,,,,,,,,,


In [22]:
country_count = data_import.copy()

country_count = pd.concat([country_count, data_import["country"].str.split(",", expand = True)], axis = 1)

country_count = country_count.melt(id_vars = ["type","title"], value_vars = range(12), value_name ="Country")

country_count = country_count[country_count["Country"].notna()]
country_count["Country"] = country_count["Country"].str.strip()
country_count

Unnamed: 0,type,title,variable,Country
0,Movie,Dick Johnson Is Dead,0,United States
1,TV Show,Blood & Water,0,South Africa
4,TV Show,Kota Factory,0,India
7,Movie,Sankofa,0,United States
8,TV Show,The Great British Baking Show,0,United Kingdom
...,...,...,...,...
78859,Movie,The Look of Silence,8,Germany
85496,Movie,Barbecue,9,Sweden
87666,Movie,The Look of Silence,9,Netherlands
94303,Movie,Barbecue,10,United States


In [23]:
# Which countries do prefer watching movie to TV shows?

px.histogram(country_count, x = "Country", color = "type").update_xaxes(categoryorder = "total descending")

In [24]:
# Display ratings
px.histogram(data_import, x = "rating").update_xaxes(categoryorder = "total descending")

In [29]:
# split casts
data_import["Cast_count"] = data_import["cast"].str.split(",")
data_import = data_import[data_import["Cast_count"].notna()]

# count number of casts in a movie/ tv show
data_import["Cast_count"] = data_import["Cast_count"].apply(lambda x: len(x))
data_import.head(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,date_added_month,date_added_day,Cast_count
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",9.0,24.0,19
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,9.0,24.0,9
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,9.0,24.0,8
5,s6,TV Show,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, H...",,2021-09-24,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries",The arrival of a charismatic young priest brin...,9.0,24.0,16
6,s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, ...",,2021-09-24,2021,PG,91 min,Children & Family Movies,Equestria's divided. But a bright-eyed hero be...,9.0,24.0,10


In [33]:
# Display cast count in movies
px.histogram(data_import, x = "Cast_count").update_xaxes(categoryorder = "total descending")