In [3]:
import pandas as pd
import numpy as np

def _iqr_bounds(series, k=1.5):
    """Return the ``(lower, upper)`` Tukey fences of *series*.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR`` where Q1/Q3 are the
    25th/75th percentiles and ``IQR = Q3 - Q1``.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return q1 - k * iqr, q3 + k * iqr


def clean_netflix_data(df):
    """Return a cleaned copy of the Netflix titles frame *df*.

    Steps, in order:

    1. Fill missing ``release_year`` with the column mean and missing
       ``country`` / ``director`` / ``date_added`` with ``'Unknown'``.
    2. Drop rows whose ``show_id`` repeats, keeping the first occurrence.
    3. Drop rows whose ``release_year`` falls outside the 1.5 * IQR fences.

    The input frame is not modified, and the original row index labels are
    kept (no ``reset_index``).

    Raises:
        KeyError: if any of the columns named above is absent from *df*.
    """
    cleaned = df.copy()

    # Assign the filled column back instead of calling
    # ``df[col].fillna(..., inplace=True)``: that chained in-place form acts
    # on a temporary under pandas Copy-on-Write and leaves the frame
    # unchanged (and emits a FutureWarning on pandas 2.x).
    # NOTE(review): a mean fill can yield a fractional year; confirm whether
    # a median or rounded value would be preferable.
    cleaned['release_year'] = cleaned['release_year'].fillna(
        cleaned['release_year'].mean()
    )
    for column in ('country', 'director', 'date_added'):
        cleaned[column] = cleaned[column].fillna('Unknown')

    # Remove duplicates (if any).
    cleaned = cleaned.drop_duplicates(subset=['show_id'], keep='first')

    # Handle outliers in the numeric 'release_year' column (bounds inclusive).
    lower_bound, upper_bound = _iqr_bounds(cleaned['release_year'])
    in_range = (
        (cleaned['release_year'] >= lower_bound)
        & (cleaned['release_year'] <= upper_bound)
    )
    return cleaned[in_range]


# The guard is true both when run as a script and inside a notebook cell, so
# ``file_name`` and ``df`` remain available to later cells.
if __name__ == "__main__":
    # Load the dataset from a local file.
    file_name = "netflix.csv"
    df = pd.read_csv(file_name)

    df = clean_netflix_data(df)

    # Print the first few rows of the cleaned DataFrame.
    print(df.head())

    # Save cleaned data.
    df.to_csv("cleaned_dataset.csv", index=False)


  show_id     type                             title         director  \
0      s1    Movie              Dick Johnson Is Dead  Kirsten Johnson   
1      s3  TV Show                         Ganglands  Julien Leclercq   
2      s6  TV Show                     Midnight Mass    Mike Flanagan   
3     s14    Movie  Confessions of an Invisible Girl    Bruno Garotti   
5      s9  TV Show     The Great British Baking Show  Andy Devonshire   

          country date_added  release_year rating   duration  \
0   United States  9/25/2021          2020  PG-13     90 min   
1          France  9/24/2021          2021  TV-MA   1 Season   
2   United States  9/24/2021          2021  TV-MA   1 Season   
3          Brazil  9/22/2021          2021  TV-PG     91 min   
5  United Kingdom  9/24/2021          2021  TV-14  9 Seasons   

                                           listed_in  
0                                      Documentaries  
1  Crime TV Shows, International TV Shows, TV Act...  
2          