In [5]:
import pandas as pd

# ---------------------------------------
# 1. Load Dataset
# ---------------------------------------
df = pd.read_csv("netflix_titles.csv")

print("Before cleaning:")
display(df.head())

# ---------------------------------------
# 2. Check Missing Values
# ---------------------------------------
print("\nMissing values before cleaning:")
display(df.isnull().sum())

# ---------------------------------------
# 3. Handle Missing Values
# ---------------------------------------
df = df.fillna({
    'director': "Unknown",
    'cast': "Not available",
    'country': df['country'].mode()[0]
})

# Remove rows where date_added is missing
df = df.dropna(subset=['date_added'])

# ---------------------------------------
# 4. Remove Duplicates
# ---------------------------------------
df = df.drop_duplicates()

# ---------------------------------------
# 5. Clean Text Formatting
# ---------------------------------------
df['type'] = df['type'].str.strip().str.lower()
df['rating'] = df['rating'].str.strip().str.upper()

# ---------------------------------------
# 6. Fix Date Column
# ---------------------------------------
df['date_added'] = pd.to_datetime(df['date_added'], format='mixed')

df['year_added'] = df['date_added'].dt.year
df['month_added'] = df['date_added'].dt.month

# ---------------------------------------
# 7. Rename Columns
# ---------------------------------------
df.columns = df.columns.str.lower().str.replace(' ', '_')

# ---------------------------------------
# 8. Save Cleaned Dataset
# ---------------------------------------
df.to_csv("netflix_titles_cleaned.csv", index=False)

print("\n✅ Data cleaned successfully and saved as netflix_titles_cleaned.csv!")

# ---------------------------------------
# 9. Show Results After Cleaning
# ---------------------------------------
print("\nAfter cleaning:")
print(f"Dataset shape: {df.shape}")

print("\nMissing values after cleaning:")
display(df.isnull().sum())

print("\nFirst few rows of cleaned data:")
display(df.head())

# ---------------------------------------
# 10. Verification Summary
# ---------------------------------------
print("\n" + "="*50)
print("CLEANING VERIFICATION")
print("="*50)

print(f"Total rows: {len(df)}")
print(f"Total columns: {len(df.columns)}")
print(f"Columns: {list(df.columns)}")
print(f"Date range: {df['date_added'].min()} to {df['date_added'].max()}")
print(f"Unique types: {df['type'].unique()}")

print("✅ All missing values handled successfully!")


Before cleaning:


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...



Missing values before cleaning:


Unnamed: 0,0
show_id,0
type,0
title,0
director,2634
cast,825
country,831
date_added,10
release_year,0
rating,4
duration,3



✅ Data cleaned successfully and saved as netflix_titles_cleaned.csv!

After cleaning:
Dataset shape: (8797, 14)

Missing values after cleaning:


Unnamed: 0,0
show_id,0
type,0
title,0
director,0
cast,0
country,0
date_added,0
release_year,0
rating,4
duration,3



First few rows of cleaned data:


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added,month_added
0,s1,movie,Dick Johnson Is Dead,Kirsten Johnson,Not available,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",2021,9
1,s2,tv show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2021,9
2,s3,tv show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",United States,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,2021,9
3,s4,tv show,Jailbirds New Orleans,Unknown,Not available,United States,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",2021,9
4,s5,tv show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2021,9



CLEANING VERIFICATION
Total rows: 8797
Total columns: 14
Columns: ['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description', 'year_added', 'month_added']
Date range: 2008-01-01 00:00:00 to 2021-09-25 00:00:00
Unique types: ['movie' 'tv show']
✅ All missing values handled successfully!
