In [1]:
# Importing Necessary Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
# Reading CSV File
df = pd.read_csv("netflix_titles.csv")

In [12]:
# Dropping description column
df.drop(["show_id","description"], axis=1, inplace=True)

In [13]:
df.shape

(8807, 10)

In [14]:
df.size

88070

In [15]:
df.index

RangeIndex(start=0, stop=8807, step=1)

In [16]:
# Columns in the DataFrame
df.columns

Index(['type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in'],
      dtype='object')

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          8807 non-null   object
 1   title         8807 non-null   object
 2   director      6173 non-null   object
 3   cast          7982 non-null   object
 4   country       7976 non-null   object
 5   date_added    8797 non-null   object
 6   release_year  8807 non-null   int64 
 7   rating        8803 non-null   object
 8   duration      8804 non-null   object
 9   listed_in     8807 non-null   object
dtypes: int64(1), object(9)
memory usage: 688.2+ KB


In [18]:
df.isna().sum()


type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
dtype: int64

In [19]:
df["country"].value_counts().head(3)

country
United States     2818
India              972
United Kingdom     419
Name: count, dtype: int64

In [20]:
mode_country = df["country"].mode()[0]
mode_country

'United States'

In [21]:
# Filling in the Null values with most frequent value
df["country"] = df["country"].fillna(mode_country)

In [22]:
df["country"].isna().sum()

0

In [23]:
df["date_added"].value_counts().head(3)

date_added
January 1, 2020     109
November 1, 2019     89
March 1, 2018        75
Name: count, dtype: int64

In [24]:
mode_date = df["date_added"].mode()[0]
mode_date

'January 1, 2020'

In [25]:
df["date_added"] = df["date_added"].fillna(mode_date)

In [26]:
df["date_added"].isna().sum()

0

In [27]:
df["rating"].value_counts().head(3)

rating
TV-MA    3207
TV-14    2160
TV-PG     863
Name: count, dtype: int64

In [28]:
mode_rating = df["rating"].mode()[0]
mode_rating

'TV-MA'

In [29]:
df["rating"] = df["rating"].fillna(mode_rating)

In [30]:
df["rating"].isna().sum()

0

In [34]:
# Count of unique values in the duration column
df["duration"].value_counts().head(3)

duration
1 Season     1796
2 Seasons     425
3 Seasons     199
Name: count, dtype: int64

In [31]:
mode_duration = df["duration"].mode()[0]
mode_duration

'1 Season'

In [32]:
# Filling in the Null values with most frequent value
df["duration"] = df["duration"].fillna(mode_duration)

In [33]:
df["duration"].isna().sum()

0

In [35]:
df["director"] = df["director"].fillna("Not Mentioned")

In [36]:
df["director"].isna().sum()

0

In [37]:
df["cast"] = df["cast"].fillna("Not Mentioned")

In [38]:
df["cast"].isna().sum()

0

In [39]:
df["rating"].unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR',
       'TV-Y7-FV', 'UR'], dtype=object)

In [40]:
df = df[~df["rating"].str.contains("min")]

In [41]:
df["rating"].unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', 'NR', 'TV-Y7-FV', 'UR'], dtype=object)

In [42]:
df["date_added"] = df["date_added"].str.replace(",","")

In [43]:
df["date_added"] = pd.to_datetime(df["date_added"], format="mixed")

In [44]:
# Extracting Year from date_added column
df["year"] = df["date_added"].dt.year

In [45]:
# Extracting Month Names from date_added column
df["month"] = df["date_added"].dt.month_name()

In [46]:
# Extracting Date from date_added column
df["date"] = df["date_added"].dt.day

In [47]:
df["genre"] = df["listed_in"].str.split(",").str[0]

In [48]:
df["genre"].value_counts().head(5)

genre
Dramas                    1600
Comedies                  1210
Action & Adventure         859
Documentaries              829
International TV Shows     774
Name: count, dtype: int64

In [49]:
df = df[df["cast"]!="Not Mentioned"]

In [50]:
df["lead_actor"] = df["cast"].str.split(",").str[0]

In [51]:
df["lead_actor"].value_counts().head(5)

lead_actor
Shah Rukh Khan        26
Akshay Kumar          23
David Attenborough    20
Adam Sandler          20
Amitabh Bachchan      20
Name: count, dtype: int64

In [52]:
# Dropping date_added, listed_in and cast column
df.drop(["date_added","listed_in","cast"], axis=1, inplace=True)

In [53]:
df.sample

<bound method NDFrame.sample of          type                             title  \
1     TV Show                     Blood & Water   
2     TV Show                         Ganglands   
4     TV Show                      Kota Factory   
5     TV Show                     Midnight Mass   
6       Movie  My Little Pony: A New Generation   
...       ...                               ...   
8801    Movie                           Zinzana   
8802    Movie                            Zodiac   
8804    Movie                        Zombieland   
8805    Movie                              Zoom   
8806    Movie                            Zubaan   

                           director                       country  \
1                     Not Mentioned                  South Africa   
2                   Julien Leclercq                 United States   
4                     Not Mentioned                         India   
5                     Mike Flanagan                 United States   
6     Robe

In [56]:
df.columns

Index(['type', 'title', 'director', 'country', 'release_year', 'rating',
       'duration', 'year', 'month', 'date', 'genre', 'lead_actor'],
      dtype='object')

In [55]:
df.to_csv("cleaned_data.csv")