In [1]:
import pandas as pd


In [2]:
#Load Dataset
df = pd.read_csv("netflix_titles.csv")

In [3]:
# 1. Identify and handle missing values
print("Missing values per column:\n", df.isnull().sum())

Missing values per column:
 show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


In [4]:
df.fillna("Unknown", inplace=True)
print("Missing values per column:\n", df.isnull().sum())

Missing values per column:
 show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64


In [5]:
df = df.drop_duplicates()

In [6]:
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [7]:
# 3. Standardize text values
df['type'] = df['type'].str.strip().str.lower().str.title()
df['country'] = df['country'].str.strip().str.lower().str.title()
df['director'] =df['director'].str.strip().str.lower().str.title()

In [8]:
# 4. Convert 'date_added' to datetime, then format as dd-mm-yyyy
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce', format='%B %d, %Y')
df['date_added'] = df['date_added'].dt.strftime('%d-%m-%Y')

In [9]:
# 5. Rename column headers to be clean and uniform
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

In [11]:
# 6. Check and fix data types
print("Data types before fixing:\n", df.type)

Data types before fixing:
 0         Movie
1       Tv Show
2       Tv Show
3       Tv Show
4       Tv Show
         ...   
8802      Movie
8803    Tv Show
8804      Movie
8805      Movie
8806      Movie
Name: type, Length: 8807, dtype: object


In [14]:
df['release_year'] = df['release_year'].astype('Int64')
df['date_added'] = pd.to_datetime(df['date_added'], errors = 'coerce', format='%d-%m-%Y')
print("Data Types after Fixing:\n" , df.type)

Data Types after Fixing:
 0         Movie
1       Tv Show
2       Tv Show
3       Tv Show
4       Tv Show
         ...   
8802      Movie
8803    Tv Show
8804      Movie
8805      Movie
8806      Movie
Name: type, Length: 8807, dtype: object


In [15]:
df.to_csv("netflix_cleaned.csv", index=False)