# Import libraries and data


In [2]:
import numpy as np #linear algebra
import pandas as pd #data preparation
import plotly.express as px #data visaulization
from textblob import TextBlob #used for sentiment analysis

df=pd.read_csv("/content/netflix_titles.csv.zip")


In [4]:
df.shape

(8807, 12)

In [8]:
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [11]:
x = df.groupby("rating").size().reset_index(name="counts")
print(x)

      rating  counts
0     66 min       1
1     74 min       1
2     84 min       1
3          G      41
4      NC-17       3
5         NR      80
6         PG     287
7      PG-13     490
8          R     799
9      TV-14    2160
10      TV-G     220
11     TV-MA    3207
12     TV-PG     863
13      TV-Y     307
14     TV-Y7     334
15  TV-Y7-FV       6
16        UR       3


# Creating piechart based on rating

In [13]:
piechart=px.pie(x,values="counts",names="rating",title="distribution of content rating on netflix") #based on rating
piechart.show()

# Analysising top 5 directors

In [16]:
df["director"]=df["director"].fillna("directors not specified") #filling values where the values is null or empty
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,directors not specified,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,directors not specified,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,directors not specified,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [31]:
directors_list=pd.DataFrame()


In [32]:
directors_list = df["director"].str.split(',', expand=True).stack()


In [36]:
directors_list.columns=["Director"]


In [30]:
directors=directors_list.groupby("Director").size().reset_index(name="total counts")
print(directors)

                       Director  total counts
0                Aaron Moorhead             2
1                   Aaron Woolf             1
2      Abbas Alibhai Burmawalla             1
3              Abdullah Al Noor             1
4           Abhinav Shiv Tiwari             1
...                         ...           ...
5116                Çagan Irmak             1
5117           Ísold Uggadóttir             1
5118        Óskar Thór Axelsson             1
5119           Ömer Faruk Sorak             2
5120               Şenol Sönmez             2

[5121 rows x 2 columns]


In [38]:
directors=directors[directors.Director != "directors not specified"]
print(directors)

                       Director  total counts
0                Aaron Moorhead             2
1                   Aaron Woolf             1
2      Abbas Alibhai Burmawalla             1
3              Abdullah Al Noor             1
4           Abhinav Shiv Tiwari             1
...                         ...           ...
5116                Çagan Irmak             1
5117           Ísold Uggadóttir             1
5118        Óskar Thór Axelsson             1
5119           Ömer Faruk Sorak             2
5120               Şenol Sönmez             2

[5120 rows x 2 columns]


In [40]:
directors=directors.sort_values(by=["total counts"],ascending=False)
print(directors)

             Director  total counts
4020    Rajiv Chilaka            22
4067      Raúl Campos            18
261         Jan Suter            18
4651      Suhas Kadav            16
3235     Marcus Raboy            16
...               ...           ...
1727     Deane Taylor             1
1726      Dean Wright             1
1725     Dean Parisot             1
1724     Dean DeBlois             1
1705  David Sampliner             1

[5120 rows x 2 columns]


In [41]:
top5Directors=directors.head()
print(top5Directors)

           Director  total counts
4020  Rajiv Chilaka            22
4067    Raúl Campos            18
261       Jan Suter            18
4651    Suhas Kadav            16
3235   Marcus Raboy            16


In [45]:
top5Directors=top5Directors.sort_values(by=["total counts"])
barchart=px.bar(top5Directors,x="total counts",y="Director",title="top 5 Directors on netflix")
barchart.show()

# ANALYSING TOP 5 ACTORS ON NETFLIX


In [48]:
df["cast"]=df["cast"].fillna("no cast specified")
cast_df=pd.DataFrame()
cast_df=df["cast"].str.split(',',expand=True).stack()
cast_df=cast_df.to_frame()
cast_df.columns=["actor"]
actors=cast_df.groupby("actor").size().reset_index(name="total counts")
actors=actors[actors.actor !="no cast specified"]
actors=actors.sort_values(by=["total counts"],ascending=False)
top5actors=actors.head()
top5actors=top5actors.sort_values(by=["total counts"])
barchart2=px.bar(top5actors,x="total counts",y="actor",title="top 5 actors on netflix ")
barchart2.show()

# ANALYSING CONTENT PRODUCED N NETFLIX BASED ON YEARS

In [52]:
df1=df[["type","release_year"]]
df1=df1.rename(columns={"release_year":"Release Year","type":"Type"})
df2=df1.groupby(["Release Year","Type"]).size().reset_index(name="total counts")


In [53]:
print(df2)

     Release Year     Type  total counts
0            1925  TV Show             1
1            1942    Movie             2
2            1943    Movie             3
3            1944    Movie             3
4            1945    Movie             3
..            ...      ...           ...
114          2019  TV Show           397
115          2020    Movie           517
116          2020  TV Show           436
117          2021    Movie           277
118          2021  TV Show           315

[119 rows x 3 columns]


In [56]:
df2=df2[df2["Release Year"]>=2000]
graph=px.line(df2,x="Release Year",y="total counts",color="Type",title="Trend of content produced on Netflix every year")
graph.show()

# SENTIMENT ANALYSIS OF ETFLIX CONTENT

In [64]:

df3 = df[["release_year", "description"]]
df3 = df3.rename(columns={"release_year": "Release Year", "description": "Description"})

# Add sentiment column
for index, row in df3.iterrows():
    d = row["Description"]
    testimonial = TextBlob(d)
    p = testimonial.sentiment.polarity
    if p == 0:
        sent = "neutral"
    elif p > 0:
        sent = "positive"
    else:
        sent = "negative"
    df3.loc[index, "Sentiment"] = sent

# Group and filter
df3 = df3.groupby(["Release Year", "Sentiment"]).size().reset_index(name="total count")
df3 = df3[df3["Release Year"] > 2005]

# Plot
bargraph = px.bar(df3, x="Release Year", y="total count", color="Sentiment", title="Sentiment analysis of Netflix content")
bargraph.show()
