In [0]:
import pandas as pd

# Load dataset
df = pd.read_csv("/Volumes/workspace/default/netflix/netflix_titles.csv")

# Peek at data
display(df.head())
df.info()



show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable."
s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth."
s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera",Unknown,2021-09-24,2021,TV-MA,1 Seasons,"Crime TV Shows, International TV Shows, TV Action & Adventure","To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war."
s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,Unknown,2021-09-24,2021,TV-MA,1 Seasons,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Orleans on this gritty reality series."
s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam Khan, Ahsaas Channa, Revathi Pillai, Urvi Singh, Arun Kumar",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV Comedies","In a city of coaching centers known to train India’s finest collegiate minds, an earnest but unexceptional student and his friends navigate campus life."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      8807 non-null   object
 4   cast          8807 non-null   object
 5   country       8807 non-null   object
 6   date_added    8807 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8807 non-null   object
 9   duration      8807 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [0]:
# Replace 'Unknown' with proper NaN
df.replace("Unknown", pd.NA, inplace=True)

# Drop duplicate rows if any
df.drop_duplicates(inplace=True)

# Check missing values
df.isna().sum()



show_id            0
type               0
title              1
director        2634
cast             825
country          831
date_added         0
release_year       0
rating             0
duration           3
listed_in          0
description        0
dtype: int64

In [0]:
# Convert date_added to datetime
df["date_added"] = pd.to_datetime(df["date_added"], errors="coerce")

# Check conversion
df[["date_added"]].head()



Unnamed: 0,date_added
0,2021-09-25
1,2021-09-24
2,2021-09-24
3,2021-09-24
4,2021-09-24


In [0]:
# Strip whitespace from titles
df["title"] = df["title"].str.strip()

# Lowercase 'listed_in' for consistency
df["listed_in"] = df["listed_in"].str.lower().str.strip()

# Normalize rating (remove spaces and uppercase)
df["rating"] = df["rating"].str.strip().str.upper()

df[["title", "listed_in", "rating"]].head()




Unnamed: 0,title,listed_in,rating
0,Dick Johnson Is Dead,documentaries,PG-13
1,Blood & Water,"international tv shows, tv dramas, tv mysteries",TV-MA
2,Ganglands,"crime tv shows, international tv shows, tv act...",TV-MA
3,Jailbirds New Orleans,"docuseries, reality tv",TV-MA
4,Kota Factory,"international tv shows, romantic tv shows, tv ...",TV-MA


In [0]:
# Extract numeric value and unit (e.g., "90 min" → 90 + 'min')
df[["duration_value", "duration_unit"]] = df["duration"].str.extract(r"(\d+)\s*(\w+)")

# Convert duration_value to numeric
df["duration_value"] = pd.to_numeric(df["duration_value"], errors="coerce")

df[["duration", "duration_value", "duration_unit"]].head()





Unnamed: 0,duration,duration_value,duration_unit
0,90 min,90.0,min
1,2 Seasons,2.0,Seasons
2,1 Seasons,1.0,Seasons
3,1 Seasons,1.0,Seasons
4,2 Seasons,2.0,Seasons


In [0]:
# If multiple countries, take only the first
df["main_country"] = df["country"].str.split(",").str[0].str.strip()

df[["country", "main_country"]].head()





Unnamed: 0,country,main_country
0,United States,United States
1,South Africa,South Africa
2,,
3,,
4,India,India


In [0]:
# Summary of cleaned dataset
df.info()

# Quick peek at cleaned data
display(df.head(10))






<class 'pandas.core.frame.DataFrame'>
Int64Index: 8807 entries, 0 to 8806
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   show_id         8807 non-null   object        
 1   type            8807 non-null   object        
 2   title           8806 non-null   object        
 3   director        6173 non-null   object        
 4   cast            7982 non-null   object        
 5   country         7976 non-null   object        
 6   date_added      8807 non-null   datetime64[ns]
 7   release_year    8807 non-null   int64         
 8   rating          8807 non-null   object        
 9   duration        8804 non-null   object        
 10  listed_in       8807 non-null   object        
 11  description     8807 non-null   object        
 12  duration_value  8804 non-null   float64       
 13  duration_unit   8804 non-null   object        
 14  main_country    7976 non-null   object        
dtypes: d

show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,duration_value,duration_unit,main_country
s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,2021-09-25T00:00:00.000Z,2020,PG-13,90 min,documentaries,"As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.",90.0,min,United States
s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng",South Africa,2021-09-24T00:00:00.000Z,2021,TV-MA,2 Seasons,"international tv shows, tv dramas, tv mysteries","After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth.",2.0,Seasons,South Africa
s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera",,2021-09-24T00:00:00.000Z,2021,TV-MA,1 Seasons,"crime tv shows, international tv shows, tv action & adventure","To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war.",1.0,Seasons,
s4,TV Show,Jailbirds New Orleans,,,,2021-09-24T00:00:00.000Z,2021,TV-MA,1 Seasons,"docuseries, reality tv","Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Orleans on this gritty reality series.",1.0,Seasons,
s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam Khan, Ahsaas Channa, Revathi Pillai, Urvi Singh, Arun Kumar",India,2021-09-24T00:00:00.000Z,2021,TV-MA,2 Seasons,"international tv shows, romantic tv shows, tv comedies","In a city of coaching centers known to train India’s finest collegiate minds, an earnest but unexceptional student and his friends navigate campus life.",2.0,Seasons,India
s6,TV Show,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, Henry Thomas, Kristin Lehman, Samantha Sloyan, Igby Rigney, Rahul Kohli, Annarah Cymone, Annabeth Gish, Alex Essoe, Rahul Abburi, Matt Biedel, Michael Trucco, Crystal Balint, Louis Oliver",,2021-09-24T00:00:00.000Z,2021,TV-MA,1 Seasons,"tv dramas, tv horror, tv mysteries","The arrival of a charismatic young priest brings glorious miracles, ominous mysteries and renewed religious fervor to a dying town desperate to believe.",1.0,Seasons,
s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, Sofia Carson, Liza Koshy, Ken Jeong, Elizabeth Perkins, Jane Krakowski, Michael McKean, Phil LaMarr",,2021-09-24T00:00:00.000Z,2021,PG,91 min,children & family movies,"Equestria's divided. But a bright-eyed hero believes Earth Ponies, Pegasi and Unicorns should be pals — and, hoof to heart, she’s determined to prove it.",91.0,min,
s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra Duah, Nick Medley, Mutabaruka, Afemo Omilami, Reggie Carter, Mzuri","United States, Ghana, Burkina Faso, United Kingdom, Germany, Ethiopia",2021-09-24T00:00:00.000Z,1993,TV-MA,125 min,"dramas, independent movies, international movies","On a photo shoot in Ghana, an American model slips back in time, becomes enslaved on a plantation and bears witness to the agony of her ancestral past.",125.0,min,United States
s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Hollywood",United Kingdom,2021-09-24T00:00:00.000Z,2021,TV-14,9 Seasons,"british tv shows, reality tv","A talented batch of amateur bakers face off in a 10-week competition, whipping up their best dishes in the hopes of being named the U.K.'s best.",9.0,Seasons,United Kingdom
s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, Timothy Olyphant, Daveed Diggs, Skyler Gisondo, Laura Harrier, Rosalind Chao, Kimberly Quinn, Loretta Devine, Ravi Kapoor",United States,2021-09-24T00:00:00.000Z,2021,PG-13,104 min,"comedies, dramas",A woman adjusting to life after a loss contends with a feisty bird that's taken over her garden — and a husband who's struggling to find a way forward.,104.0,min,United States


In [0]:
# Count missing values per column
df.isna().sum()


show_id              0
type                 0
title                1
director          2634
cast               825
country            831
date_added           0
release_year         0
rating               0
duration             3
listed_in            0
description          0
duration_value       3
duration_unit        3
main_country       831
dtype: int64

In [0]:
# Drop rows where critical info is missing (like title/type)
df = df.dropna(subset=["title", "type"])

# Drop columns with more than 50% nulls (if any)
threshold = len(df) * 0.5
df = df.dropna(axis=1, thresh=threshold)


In [0]:
# Fill missing director with "Unknown Director"
df["director"] = df["director"].fillna("Unknown Director")

# Fill missing cast with "Not Available"
df["cast"] = df["cast"].fillna("Not Available")

# Fill missing country with "Unknown"
df["country"] = df["country"].fillna("Unknown")

# Fill missing rating with "Unrated"
df["rating"] = df["rating"].fillna("Unrated")

# Fill missing date_added with a placeholder or leave as NaT
df["date_added"] = df["date_added"].fillna(pd.NaT)


In [0]:
df.isna().sum()


show_id             0
type                0
title               0
director            0
cast                0
country             0
date_added          0
release_year        0
rating              0
duration            3
listed_in           0
description         0
duration_value      3
duration_unit       3
main_country      831
dtype: int64

In [0]:
# Fill missing duration with "0 min" for Movies and "0 Seasons" for TV Shows
df.loc[df["type"] == "Movie", "duration"] = df.loc[df["type"] == "Movie", "duration"].fillna("0 min")
df.loc[df["type"] == "TV Show", "duration"] = df.loc[df["type"] == "TV Show", "duration"].fillna("0 Seasons")

# Re-extract after filling
df[["duration_value", "duration_unit"]] = df["duration"].str.extract(r"(\d+)\s*(\w+)")
df["duration_value"] = pd.to_numeric(df["duration_value"], errors="coerce")


In [0]:
df["main_country"] = df["main_country"].fillna("Unknown")


In [0]:
df.isna().sum()



show_id           0
type              0
title             0
director          0
cast              0
country           0
date_added        0
release_year      0
rating            0
duration          0
listed_in         0
description       0
duration_value    0
duration_unit     0
main_country      0
dtype: int64