## Netflix Movies & TV Shows Dataset

Dataset: "netflix_titles.csv"

#### Dataset Cleaning Process

Importing necessary libraries

In [66]:
import numpy as np
import pandas as pd

Dataset Loading

In [67]:
df = pd.read_csv("netflix_titles.csv")

Dataset Understanding

In [68]:
print("Total Rows", df.shape[0])
print("Total Columns", df.shape[1])

Total Rows 8807
Total Columns 12


In [69]:
df.dtypes

show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object

In [70]:
# Finding duplicate rows
duplicate_rows = df[df.duplicated()]

print("Total Duplicate Rows:", duplicate_rows.shape[0])

duplicate_rows

Total Duplicate Rows: 0


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description


In [71]:
# Checking null values in each column
null_counts = df.isnull().sum()

print(null_counts)

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


Replacing NULL values with valid value

In [72]:
print("director replaced:", df['director'].isnull().sum())
df['director'] = df['director'].fillna("Unknown")

print("cast replaced:", df['cast'].isnull().sum())
df['cast'] = df['cast'].fillna("Unknown")

print("country replaced:", df['country'].isnull().sum())
df['country'] = df['country'].fillna("Unknown")

print("date_added replaced:", df['date_added'].isnull().sum())
df['date_added'] = df['date_added'].fillna(df['date_added'].mode()[0])

print("rating replaced:", df['rating'].isnull().sum())
df['rating'] = df['rating'].fillna(df['rating'].mode()[0])

print("duration replaced:", df['duration'].isnull().sum())
df['duration'] = df['duration'].fillna(df['duration'].mode()[0])


director replaced: 2634
cast replaced: 825
country replaced: 831
date_added replaced: 10
rating replaced: 4
duration replaced: 3


checking if rows & columns number are fine

In [73]:
df.shape

(8807, 12)

Verifying Missing Values

In [74]:
null_counts = df.isnull().sum()

print(null_counts)

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64


Checking Unique values for inconcistency

In [76]:
for col in df.columns:
    print(f"Column: {col}")
    print(df[col].unique())
    print(f"Total unique: {df[col].nunique()}\n")


Column: show_id
['s1' 's2' 's3' ... 's8805' 's8806' 's8807']
Total unique: 8807

Column: type
['Movie' 'TV Show']
Total unique: 2

Column: title
['Dick Johnson Is Dead' 'Blood & Water' 'Ganglands' ... 'Zombieland'
 'Zoom' 'Zubaan']
Total unique: 8807

Column: director
['Kirsten Johnson' 'Unknown' 'Julien Leclercq' ... 'Majid Al Ansari'
 'Peter Hewitt' 'Mozez Singh']
Total unique: 4529

Column: cast
['Unknown'
 'Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng'
 'Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera'
 ...
 'Jesse Eisenberg, Woody Harrelson, Emma Stone, Abigail Breslin, Amber Heard, Bill Murray, Derek Graf'
 'Tim Allen, Courtene

In [77]:
print("Total Rows", df.shape[0])
print("Total Columns", df.shape[1])

Total Rows 8807
Total Columns 12


Ensuring Consistency

In [78]:
wrong_ratings = ['74 min', '84 min', '66 min']
df['rating'] = df['rating'].replace(wrong_ratings, "Unknown")

In [79]:
df['duration_num'] = df['duration'].str.extract(r'(\d+)').astype(int)
df['season_num'] = df['duration'].apply(lambda x: int(x.split()[0]) if 'Season' in x else 0)

In [80]:
df['country'] = df['country'].str.strip()

df['country'] = df['country'].str.split(',').str[0].str.strip()

In [81]:
df['genres'] = df['listed_in'].str.split(',').apply(lambda x: [g.strip() for g in x])

In [82]:
df['main_genre'] = df['genres'].apply(lambda x: x[0])

In [83]:
df['title'] = df['title'].str.strip()
df['director'] = df['director'].str.strip()
df['cast'] = df['cast'].str.strip()

In [84]:
df['date_added'] = df['date_added'].str.strip()

df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce', infer_datetime_format=True)

  df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce', infer_datetime_format=True)


In [85]:
df['added_year'] = df['date_added'].dt.year
df['added_month'] = df['date_added'].dt.month_name()   
df['added_day'] = df['date_added'].dt.day

In [86]:
df['rating_group'] = df['rating'].replace({
    'G': 'Kids', 'TV-Y': 'Kids', 'TV-Y7': 'Kids', 
    'PG': 'Teen', 'PG-13': 'Teen', 
    'R': 'Adult', 'TV-MA': 'Adult', 'NC-17': 'Adult', 'UR': 'Adult'
})


In [87]:
df['cast_count'] = df['cast'].apply(lambda x: len(x.split(',')))

In [88]:
df['age_since_release'] = df['added_year'] - df['release_year']

In [89]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,...,duration_num,season_num,genres,main_genre,added_year,added_month,added_day,rating_group,cast_count,age_since_release
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,2021-09-25,2020,PG-13,90 min,...,90,0,[Documentaries],Documentaries,2021,September,25,Teen,1,1
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,...,2,2,"[International TV Shows, TV Dramas, TV Mysteries]",International TV Shows,2021,September,24,Adult,19,0
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,2021-09-24,2021,TV-MA,1 Season,...,1,1,"[Crime TV Shows, International TV Shows, TV Ac...",Crime TV Shows,2021,September,24,Adult,9,0
3,s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,Unknown,2021-09-24,2021,TV-MA,1 Season,...,1,1,"[Docuseries, Reality TV]",Docuseries,2021,September,24,Adult,1,0
4,s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,...,2,2,"[International TV Shows, Romantic TV Shows, TV...",International TV Shows,2021,September,24,Adult,8,0


In [90]:
df.tail()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,...,duration_num,season_num,genres,main_genre,added_year,added_month,added_day,rating_group,cast_count,age_since_release
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,2019-11-20,2007,R,158 min,...,158,0,"[Cult Movies, Dramas, Thrillers]",Cult Movies,2019,November,20,Adult,10,12
8803,s8804,TV Show,Zombie Dumb,Unknown,Unknown,Unknown,2019-07-01,2018,TV-Y7,2 Seasons,...,2,2,"[Kids' TV, Korean TV Shows, TV Comedies]",Kids' TV,2019,July,1,Kids,1,1
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,2019-11-01,2009,R,88 min,...,88,0,"[Comedies, Horror Movies]",Comedies,2019,November,1,Adult,7,10
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,2020-01-11,2006,PG,88 min,...,88,0,"[Children & Family Movies, Comedies]",Children & Family Movies,2020,January,11,Teen,9,14
8806,s8807,Movie,Zubaan,Mozez Singh,"Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...",India,2019-03-02,2015,TV-14,111 min,...,111,0,"[Dramas, International Movies, Music & Musicals]",Dramas,2019,March,2,TV-14,8,4


Final checking dataset

In [91]:
df.shape

(8807, 22)

In [92]:
df.dtypes

show_id                      object
type                         object
title                        object
director                     object
cast                         object
country                      object
date_added           datetime64[ns]
release_year                  int64
rating                       object
duration                     object
listed_in                    object
description                  object
duration_num                  int64
season_num                    int64
genres                       object
main_genre                   object
added_year                    int32
added_month                  object
added_day                     int32
rating_group                 object
cast_count                    int64
age_since_release             int64
dtype: object

In [93]:
df.isnull().sum()

show_id              0
type                 0
title                0
director             0
cast                 0
country              0
date_added           0
release_year         0
rating               0
duration             0
listed_in            0
description          0
duration_num         0
season_num           0
genres               0
main_genre           0
added_year           0
added_month          0
added_day            0
rating_group         0
cast_count           0
age_since_release    0
dtype: int64

In [None]:
for col in df.columns:
    print(f"Column: {col}")
    print(df[col].unique())
    print(f"Total unique: {df[col].nunique()}\n")

Column: show_id
['s1' 's2' 's3' ... 's8805' 's8806' 's8807']
Total unique: 8807

Column: type
['Movie' 'TV Show']
Total unique: 2

Column: title
['Dick Johnson Is Dead' 'Blood & Water' 'Ganglands' ... 'Zombieland'
 'Zoom' 'Zubaan']
Total unique: 8806

Column: director
['Kirsten Johnson' 'Unknown' 'Julien Leclercq' ... 'Majid Al Ansari'
 'Peter Hewitt' 'Mozez Singh']
Total unique: 4529

Column: cast
['Unknown'
 'Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng'
 'Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera'
 ...
 'Jesse Eisenberg, Woody Harrelson, Emma Stone, Abigail Breslin, Amber Heard, Bill Murray, Derek Graf'
 'Tim Allen, Courtene

TypeError: unhashable type: 'list'

In [95]:
df['genres'].head()


0                                      [Documentaries]
1    [International TV Shows, TV Dramas, TV Mysteries]
2    [Crime TV Shows, International TV Shows, TV Ac...
3                             [Docuseries, Reality TV]
4    [International TV Shows, Romantic TV Shows, TV...
Name: genres, dtype: object

In [96]:
# Flatten all genre lists into a single list
all_genres = [genre for sublist in df['genres'] for genre in sublist]

# Get unique genres
unique_genres = set(all_genres)

print(unique_genres)
print(f"Total unique genres: {len(unique_genres)}")


{'Teen TV Shows', "Kids' TV", 'Classic & Cult TV', 'TV Action & Adventure', 'TV Horror', 'Dramas', 'Stand-Up Comedy', 'Docuseries', 'Anime Series', 'LGBTQ Movies', 'Horror Movies', 'TV Comedies', 'Movies', 'TV Sci-Fi & Fantasy', 'Stand-Up Comedy & Talk Shows', 'Spanish-Language TV Shows', 'Action & Adventure', 'Sci-Fi & Fantasy', 'Children & Family Movies', 'Korean TV Shows', 'TV Thrillers', 'TV Shows', 'Independent Movies', 'British TV Shows', 'Classic Movies', 'Sports Movies', 'Faith & Spirituality', 'Documentaries', 'Cult Movies', 'TV Mysteries', 'Romantic Movies', 'International Movies', 'Anime Features', 'Reality TV', 'Music & Musicals', 'Crime TV Shows', 'Romantic TV Shows', 'Comedies', 'Thrillers', 'International TV Shows', 'TV Dramas', 'Science & Nature TV'}
Total unique genres: 42


In [97]:
from collections import Counter

# Count how many times each genre appears
genre_counts = Counter([genre for sublist in df['genres'] for genre in sublist])

# Convert to a sorted DataFrame for easy viewing
genre_counts_df = pd.DataFrame(genre_counts.items(), columns=['Genre', 'Count']).sort_values(by='Count', ascending=False)

print(genre_counts_df)


                           Genre  Count
14          International Movies   2752
12                        Dramas   2427
16                      Comedies   1674
1         International TV Shows   1351
0                  Documentaries    869
25            Action & Adventure    859
2                      TV Dramas    763
13            Independent Movies    756
11      Children & Family Movies    641
19               Romantic Movies    616
9                    TV Comedies    581
18                     Thrillers    577
4                 Crime TV Shows    470
24                      Kids' TV    451
6                     Docuseries    395
20              Music & Musicals    375
8              Romantic TV Shows    370
21                 Horror Movies    357
38               Stand-Up Comedy    343
7                     Reality TV    255
15              British TV Shows    253
22              Sci-Fi & Fantasy    243
29                 Sports Movies    219
30                  Anime Series    176


### Exporting the cleaned dataset

In [98]:
df.to_csv('Netflix_Dataset.csv', index=False)
print("CSV file 'Netflix_Dataset.csv' has been successfully saved!")

CSV file 'Netflix_Dataset.csv' has been successfully saved!


## Finally Dataset is Ready for Dashboard