# Load the dataset

In [70]:
import pandas as pd

df = pd.read_csv("netflix_titles.csv")

# Check for missing values

In [71]:
missing_values = df.isnull().sum()
print("Missing values:\n", missing_values)

Missing values:
 show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


# Remove duplicate rows

In [72]:
df_cleaned = df.drop_duplicates()

# Rename column headers to be clean and uniform

In [73]:
df_cleaned.columns = df_cleaned.columns.str.strip().str.lower().str.replace(" ", "_")

# Convert 'date_added' to datetime format

In [74]:
df_cleaned['date_added'] = pd.to_datetime(df_cleaned['date_added'], errors='coerce')

# Standardize text fields (title case)

In [75]:
text_fields = ['type', 'country', 'rating']
for col in text_fields:
    df_cleaned[col] = df_cleaned[col].astype(str).str.strip().str.title()

# Impute missing values

In [81]:
import numpy as np

# For 'director' and 'cast', filled with 'Not Specified'
df_cleaned['director'].fillna('Unknown', inplace=True)
df_cleaned['cast'].fillna('Unknown', inplace=True)

# For 'country', filled with 'Unknown'
df_cleaned['country'] = df_cleaned['country'].replace(['nan', 'Nan', 'NaN'], np.nan)
df_cleaned['country'].fillna('Not Specified', inplace=True)

# For 'date_added', filled with the median date
median_date = df_cleaned['date_added'].median()
df_cleaned['date_added'].fillna(median_date, inplace=True)

# For 'rating', filled with most common rating
most_common_rating = df_cleaned['rating'].mode()[0]
df_cleaned['rating'].fillna(most_common_rating, inplace=True)

# For 'duration', filled with 'Unknown'
df_cleaned['duration'].fillna('Not Specified', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['country'].fillna('Not Specified', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['date_added'].fillna(median_date, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on wh

# Verify no missing values remain

In [82]:
df_cleaned.isnull().sum()

Unnamed: 0,0
show_id,0
type,0
title,0
director,0
cast,0
country,0
date_added,0
release_year,0
rating,0
duration,0


# Final data type check

In [83]:
print("\nFinal Data Types:\n", df_cleaned.dtypes)


Final Data Types:
 show_id                 object
type                    object
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
description             object
dtype: object


In [84]:
df_cleaned.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,2021-09-25,2020,Pg-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,Tv Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,Tv-Ma,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,Tv Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Not Specified,2021-09-24,2021,Tv-Ma,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,Tv Show,Jailbirds New Orleans,Unknown,Unknown,Not Specified,2021-09-24,2021,Tv-Ma,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,Tv Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,Tv-Ma,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [85]:
# Save the cleaned DataFrame
df_cleaned.to_csv("netflix_titles_cleaned.csv", index=False)

# For Google Colab: Download the file
from google.colab import files
files.download("netflix_titles_cleaned.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>