In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
data = pd.read_csv("data/netflix_titles.csv") # Import netflix_titles.csv file
data.head(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


### 1. Countries with most content

In [3]:
data_country_content_list = list(data["country"]) # Create a list of country in each movie

In [4]:
country_dict = {}
for countries in data_country_content_list:
    try:
        for country in countries.split(", "):
            if country in country_dict: # Check if this country already in country_dict
                country_dict[country]+=1
            else:
                country_dict[country]=1
    except:
        pass

In [5]:
sorted_country = sorted(country_dict.items(), key=lambda x: x[1], reverse=True) # Sort country_dict descending
sorted_country_df = pd.DataFrame(sorted_country,columns=["Country","Number of content"]) # Create the DataFrame for country and the number of  content
sorted_country_df.head(10)

Unnamed: 0,Country,Number of content
0,United States,3296
1,India,990
2,United Kingdom,722
3,Canada,412
4,France,349
5,Japan,287
6,Spain,215
7,South Korea,212
8,Germany,199
9,Mexico,154


### 2. Content types by Directors

In [6]:
data_director_content_type = data[["director","listed_in"]].dropna() # Drop the movies that have no director
data_director_content_type.head()

Unnamed: 0,director,listed_in
1,Jorge Michel Grau,"Dramas, International Movies"
2,Gilbert Chan,"Horror Movies, International Movies"
3,Shane Acker,"Action & Adventure, Independent Movies, Sci-Fi..."
4,Robert Luketic,Dramas
5,Serdar Akar,"International TV Shows, TV Dramas, TV Mysteries"


In [7]:
director_content_dict = {}
for director_content in data_director_content_type.values:
    directors = director_content[0].split(", ") # Get the name of director
    for director in directors:
        director = director.strip()
        if director not in director_content_dict:
            director_content_dict[director] = {}
    contents = director_content[1].split(", ")
    for director in directors: # Get the content from each director
        for content in contents:
            content = content.strip()
            if content not in ["International TV Shows","International Movies"]: 
                if content in director_content_dict[director]: # Add the content to each director
                    director_content_dict[director][content] += 1
                else:
                    director_content_dict[director][content] = 1

In [8]:
# Sort director_content_dict descending
sorted_director_content = dict(sorted(director_content_dict.items(), key=lambda x: len(x[1]), reverse=True)[:100])

In [9]:
director_most_content = {}
for director in sorted_director_content: # Get the most content for each director
    director_content = director_content_dict[director]
    most_content = sorted(director_content.items(), key=lambda x: x[1], reverse=True)[0]
    director_most_content[director] = most_content[0]

In [10]:
director_most_content_df = pd.DataFrame() # Create the DataFrame for director and most content
director_most_content_df["Director"] = list(director_most_content.keys())
director_most_content_df["Content"] = list(director_most_content.values())
director_most_content_df.head(10)

Unnamed: 0,Director,Content
0,Martin Scorsese,Dramas
1,Vishal Bhardwaj,Dramas
2,Anurag Kashyap,Dramas
3,Jeremy Saulnier,Thrillers
4,Vikramaditya Motwane,Dramas
5,Sean McNamara,Children & Family Movies
6,Ron Howard,Children & Family Movies
7,Priyadarshan,Dramas
8,David Dhawan,Comedies
9,Kevin Smith,Comedies


### 3. Rating type

In [11]:
movie_rating_type_df = data[["title","rating"]]
movie_rating_type_df.head(5)

Unnamed: 0,title,rating
0,3%,TV-MA
1,7:19,TV-MA
2,23:59,R
3,9,PG-13
4,21,PG-13


### 4. Content added over the year

In [12]:
data["date_added"] = pd.to_datetime(data["date_added"]) # Cast date of date_added to datetime
data["year"] = data["date_added"].dt.year # Extract year from datetime column
content_by_year = data.groupby(by=["year"],as_index=False).count() # Group by year and count the number of content
content_by_year[["year","show_id"]]

Unnamed: 0,year,show_id
0,2008.0,2
1,2009.0,2
2,2010.0,1
3,2011.0,13
4,2012.0,3
5,2013.0,11
6,2014.0,25
7,2015.0,88
8,2016.0,443
9,2017.0,1225


# 5. Director with the most content

In [13]:
name_director = list(sorted_director_content)[0] # Get the director with the most content from sorted_director_content
number_content = sum(list(sorted_director_content.values())[0].values()) # Get the most content from that director

In [14]:
print(f"{name_director} is the director that make the most number of content, {number_content} contents")

Martin Scorsese is the director that make the most number of content, 27 contents


### 6. Actor with the most content

In [15]:
cast_list = list(data["cast"].dropna()) # Get list of the cast for each movie

In [16]:
actor_dict = {}
for cast in cast_list:
    for actor in cast.split(", "): 
        if actor in actor_dict: # Check if the actor alread in the actor_dict
            actor_dict[actor]+=1
        else:
            actor_dict[actor]=1

In [17]:
#Sort director_content_dict descending
sorted_cast = sorted(actor_dict.items(), key=lambda x: x[1], reverse=True)

# Create the DataFrame for Actor and the number of  content
sorted_cast_df = pd.DataFrame(sorted_cast,columns=["Actor","Number of content"])
sorted_cast_df.head(10)

Unnamed: 0,Actor,Number of content
0,Anupam Kher,42
1,Shah Rukh Khan,35
2,Naseeruddin Shah,30
3,Om Puri,30
4,Takahiro Sakurai,29
5,Akshay Kumar,29
6,Boman Irani,27
7,Amitabh Bachchan,27
8,Paresh Rawal,27
9,Yuki Kaji,27


### 7. Average movie duration

In [18]:
movie = data[data["type"]=="Movie"] # Filter only content that is Movie as type
duration_list = []
for duration in movie["duration"]:
    duration_list.append(int(duration.split(" ")[0])) # Extract only the number and cast it to integer
duration_avg = np.mean(duration_list)
print(f"Average Movie duration: {duration_avg:.2f} minutes")

Average Movie duration: 99.31 minutes


### 8. Average number of season per TV Show

In [19]:
tv_show = data[data["type"]=="TV Show"] # Filter only content that is Movie as type
ss_list = []
for ss in tv_show["duration"]:
    ss_list.append(int(ss.split(" ")[0])) # Extract only the number and cast it to integer
ss_avg = np.mean(ss_list)
print(f"Average number of season per TV Show: {ss_avg:.2f} seasons or {round(ss_avg)} season")

Average number of season per TV Show: 1.78 seasons or 2 season


### 9. Top 15 genres

In [20]:
genre_list = list(data["listed_in"]) # Get all the genre from each content
genre_dict = {}
for genres in genre_list:
    for genre in genres.split(", "): 
        if genre in genre_dict: # Check if the genre alread in the genre_list
            genre_dict[genre]+=1
        else:
            genre_dict[genre]=1
sorted_genre = sorted(genre_dict.items(), key=lambda x: x[1], reverse=True)
genre_most_df = pd.DataFrame(sorted_genre,columns=["Genre","The number of contents"])
genre_most_df.head(15)

Unnamed: 0,Genre,The number of contents
0,International Movies,2437
1,Dramas,2106
2,Comedies,1471
3,International TV Shows,1199
4,Documentaries,786
5,Action & Adventure,721
6,TV Dramas,704
7,Independent Movies,673
8,Children & Family Movies,532
9,Romantic Movies,531


### 10. Most used words for title

In [21]:
from nltk.corpus import stopwords
import re

all_stopwords = stopwords.words('english')
all_stopwords.remove('not')

In [22]:
titles = list(data["title"]) # Get all the title from each content
word_dict = {}
for title in titles:
    for word in title.split(" "):
        word = re.sub('[^a-zA-Z]','',word) # Remove irrelavant alphabet like number or punctuation
        word = word.lower() # Make the word to lower case
        if word not in all_stopwords and word !="": # If the word is not stop word and None then add in word_dict
            if word in word_dict:
                word_dict[word]+=1
            else:
                word_dict[word]=1
sorted_word = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)
sorted_word_df = pd.DataFrame(sorted_word,columns=["Word","The number of word occur in title"])
sorted_word_df.head(15)

Unnamed: 0,Word,The number of word occur in title
0,love,151
1,christmas,78
2,man,74
3,story,67
4,life,64
5,world,63
6,movie,60
7,little,58
8,one,54
9,time,53
