# Netflix dataset cleaning and preprocessing

# 1. Data Cleaning

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw Netflix titles CSV into the working DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs as-is where this exact file exists — consider a configurable location.
df = pd.read_csv(
    filepath_or_buffer="C:/Users/vpriy/Downloads/netflix_titles.csv/netflix_titles.csv"
)

In [3]:
# Keep only the first occurrence of every fully identical row
df = df[~df.duplicated()]

In [4]:
# Report how many missing entries each column contains
missing_per_column = df.isna().sum()
print(missing_per_column)

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


In [5]:
# Fill missing values.
# Free-text columns get an explicit placeholder so later string operations
# never see NaN.
df['director'] = df['director'].fillna('Not Specified')
df['cast'] = df['cast'].fillna('Not Specified')
df['country'] = df['country'].fillna('Not Specified')

# Rating is imputed with the most frequent rating overall.
df['rating'] = df['rating'].fillna(df['rating'].mode()[0])

# Duration is imputed with the most frequent value *within the same type*:
# the overall mode could assign a Movie a season count (or a TV Show a value
# in minutes), depending on which type dominates the data.
duration_mode_by_type = df.groupby('type')['duration'].agg(
    lambda s: s.mode().iloc[0] if s.notna().any() else np.nan
)
df['duration'] = (
    df['duration']
    .fillna(df['type'].map(duration_mode_by_type))
    # Fallback for rows whose type has no usable mode (or is itself missing),
    # so nothing that the overall-mode fill would have covered stays empty.
    .fillna(df['duration'].mode()[0])
)

In [6]:
# Impute missing 'date_added' entries with the most frequent date
most_common_date = df['date_added'].mode().iloc[0]
df = df.assign(date_added=df['date_added'].fillna(most_common_date))

In [7]:
# Some rows carry a runtime such as "74 min" in 'rating': copy that text into
# 'duration', then replace the rating with the most frequent rating.
mask = df['rating'].str.contains(r'\d+\s*min', case=False, na=False)
most_common_rating = df['rating'].mode()[0]
# Order matters: 'duration' must read the old 'rating' before it is replaced.
df['duration'] = df['duration'].mask(mask, df['rating'])
df['rating'] = df['rating'].mask(mask, most_common_rating)

In [8]:
# Normalise the free-text columns: cast to str, then trim surrounding whitespace
text_columns = ['title', 'director', 'cast', 'country', 'rating', 'listed_in', 'description']
df[text_columns] = df[text_columns].astype(str).apply(lambda column: column.str.strip())

In [9]:
# Display the DataFrame after the cleaning steps above
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Not Specified,United States,25-Sep-21,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,Not Specified,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,24-Sep-21,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Not Specified,24-Sep-21,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,Not Specified,Not Specified,Not Specified,24-Sep-21,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,Not Specified,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,24-Sep-21,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,20-Nov-19,2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,Not Specified,Not Specified,Not Specified,1-Jul-19,2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,1-Nov-19,2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,11-Jan-20,2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


# 2. Data Transformation

In [10]:
# Parse 'date_added' into datetimes; entries that cannot be parsed become NaT
parsed_dates = pd.to_datetime(df['date_added'], errors='coerce')
df['date_added'] = parsed_dates

In [11]:
# Derive calendar features (year, month number, month name) from 'date_added'
added_on = df['date_added'].dt
df = df.assign(
    year_added=added_on.year,
    month_added=added_on.month,
    month_name=added_on.month_name(),
)

In [12]:
# Render 'date_added' as DD-MM-YYYY text; the column holds strings afterwards,
# so the .dt-based cell above cannot be re-run once this has executed.
formatted_dates = df['date_added'].dt.strftime('%d-%m-%Y')
df['date_added'] = formatted_dates

In [13]:
# Separate 'duration' into its first run of digits and its first alphabetic word
duration_text = df['duration'].str
df['duration_num'] = duration_text.extract(r'(\d+)', expand=False).astype(float)
df['duration_unit'] = duration_text.extract(r'([a-zA-Z]+)', expand=False)

In [14]:
# Display the DataFrame with the derived date and duration columns
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added,month_added,month_name,duration_num,duration_unit
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Not Specified,United States,25-09-2021,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",2021,9,September,90.0,min
1,s2,TV Show,Blood & Water,Not Specified,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,24-09-2021,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2021,9,September,2.0,Seasons
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Not Specified,24-09-2021,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,2021,9,September,1.0,Season
3,s4,TV Show,Jailbirds New Orleans,Not Specified,Not Specified,Not Specified,24-09-2021,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",2021,9,September,1.0,Season
4,s5,TV Show,Kota Factory,Not Specified,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,24-09-2021,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2021,9,September,2.0,Seasons
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,20-11-2019,2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a...",2019,11,November,158.0,min
8803,s8804,TV Show,Zombie Dumb,Not Specified,Not Specified,Not Specified,01-07-2019,2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g...",2019,7,July,2.0,Seasons
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,01-11-2019,2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...,2019,11,November,88.0,min
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,11-01-2020,2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero...",2020,1,January,88.0,min


# 3. Data Filtering (Outlier & Invalid Data Removal)

In [15]:
# Keep only rows with a positive numeric duration; rows whose duration_num is
# NaN fail the comparison and are dropped as well.
# .copy() makes the result an independent frame, so the column assignment in a
# later cell does not raise SettingWithCopyWarning on a filtered slice.
df = df[df['duration_num'] > 0].copy()

In [17]:
# Calculate estimated duration in minutes
def estimate_duration(row, episodes_per_season=10, minutes_per_episode=45):
    """Estimate a title's total running time in minutes.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'duration_unit' (text such as "min", "Season" or
        "Seasons") and 'duration_num' (the numeric amount).
    episodes_per_season : int, optional
        Assumed number of episodes in one season (default 10).
    minutes_per_episode : int, optional
        Assumed length of one episode in minutes (default 45).

    Returns
    -------
    number
        'duration_num' unchanged for minute-based rows, the season count
        scaled by the two assumptions for season-based rows, and NaN when the
        unit is missing or not recognised.
    """
    unit = row['duration_unit']
    # A missing unit arrives as NaN/None rather than a string; calling
    # .lower() on it would raise, so treat it as an unknown unit.
    if not isinstance(unit, str):
        return np.nan

    unit = unit.strip().lower()
    if unit.startswith('season'):
        return row['duration_num'] * episodes_per_season * minutes_per_episode
    if unit == 'min':
        return row['duration_num']
    return np.nan

In [18]:
# Run estimate_duration on every row and store the result as a new column
df = df.assign(estimated_minutes=df.apply(estimate_duration, axis=1))

In [19]:
# Drop rows whose estimated_minutes lies outside the 1.5 * IQR fences.
# NOTE(review): the quartiles are taken over movies and TV shows together,
# although TV values are season counts scaled up by estimate_duration —
# confirm that pooling both types in one distribution is intended.
Q1, Q3 = df['estimated_minutes'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
# between() is inclusive on both ends, i.e. lower <= value <= upper
df = df[df['estimated_minutes'].between(lower, upper)]

In [20]:
# Display the DataFrame after the duration filter and IQR outlier removal
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added,month_added,month_name,duration_num,duration_unit,estimated_minutes
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Not Specified,United States,25-09-2021,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",2021,9,September,90.0,min,90.0
1,s2,TV Show,Blood & Water,Not Specified,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,24-09-2021,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2021,9,September,2.0,Seasons,900.0
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Not Specified,24-09-2021,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,2021,9,September,1.0,Season,450.0
3,s4,TV Show,Jailbirds New Orleans,Not Specified,Not Specified,Not Specified,24-09-2021,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",2021,9,September,1.0,Season,450.0
4,s5,TV Show,Kota Factory,Not Specified,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,24-09-2021,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2021,9,September,2.0,Seasons,900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,20-11-2019,2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a...",2019,11,November,158.0,min,158.0
8803,s8804,TV Show,Zombie Dumb,Not Specified,Not Specified,Not Specified,01-07-2019,2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g...",2019,7,July,2.0,Seasons,900.0
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,01-11-2019,2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...,2019,11,November,88.0,min,88.0
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,11-01-2020,2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero...",2020,1,January,88.0,min,88.0


# 4. Final Check

In [24]:
# List the distinct values present in 'type'
print(pd.unique(df['type']))


['Movie' 'TV Show']


In [25]:


# Frequency of each rating, most common first
rating_counts = df['rating'].value_counts()
print(rating_counts)



TV-MA       3032
TV-14       2034
TV-PG        814
R            799
PG-13        490
TV-Y7        289
PG           287
TV-Y         275
TV-G         198
NR            78
G             41
TV-Y7-FV       6
NC-17          3
UR             3
Name: rating, dtype: int64


In [26]:

# Frequency of each 'country' value, most common first
country_counts = df['country'].value_counts()
print(country_counts)



United States                                                      2588
India                                                               968
Not Specified                                                       814
United Kingdom                                                      375
Japan                                                               228
                                                                   ... 
India, United Kingdom, Canada, United States                          1
United Kingdom, United States, Germany, Denmark, Belgium, Japan       1
United States, Poland                                                 1
Mauritius, South Africa                                               1
United Arab Emirates, Jordan                                          1
Name: country, Length: 732, dtype: int64


In [27]:

# Earliest and latest release year in the cleaned data
release_years = df['release_year']
print(release_years.min(), release_years.max())



1925 2021


In [28]:

# Duration units observed for each content type (Movie first, then TV Show)
for content_type in ('Movie', 'TV Show'):
    print(df.loc[df['type'] == content_type, 'duration_unit'].unique())

['min']
['Seasons' 'Season']


In [29]:
# Inspect the first five values of the reformatted 'date_added' column
print(df['date_added'].head(5))

0    25-09-2021
1    24-09-2021
2    24-09-2021
3    24-09-2021
4    24-09-2021
Name: date_added, dtype: object


In [30]:
# Summary of the final frame: dimensions, remaining nulls, remaining duplicates
print(
    f"Final dataset shape: {df.shape}",
    f"Total null values: {df.isnull().sum().sum()}",
    f"Total duplicates: {df.duplicated().sum()}",
    sep="\n",
)

Final dataset shape: (8349, 22)
Total null values: 0
Total duplicates: 0


# 5. Saving the Cleaned Dataset

In [31]:
# Write the cleaned frame to a CSV file, omitting the index column.
# The relative file name is resolved against the current working directory.
df.to_csv(path_or_buf="cleaned_netflix_data.csv", index=False)

print("Cleaned data saved as 'cleaned_netflix_data.csv'")

Cleaned data saved as 'cleaned_netflix_data.csv'


In [32]:
import os

# Show the current working directory
working_dir = os.getcwd()
print(working_dir)

C:\Users\vpriy
