In [0]:
# Databricks Notebook Code

# Import libraries
import pandas as pd

# Read the raw Netflix titles CSV into a DataFrame.
# (Adjust the path if needed, e.g., dbfs:/FileStore/tables/netflix_titles.csv)
file_path = "/Volumes/workspace/default/netflix/netflix_titles.csv"
df = pd.read_csv(file_path)

# -------------------------------
# 1. Check for duplicates
# -------------------------------
# Flag every row that exactly repeats an earlier row, report the count,
# keep only the first occurrence of each row, then re-count as a sanity check.
is_repeat = df.duplicated()
print("Duplicates before:", is_repeat.sum())
df = df[~is_repeat]
print("Duplicates after:", df.duplicated().sum())

# -------------------------------
# 2. Handle missing values
# -------------------------------
# Fill missing free-text columns with the sentinel 'Unknown'
for col in ["director", "cast", "country"]:
    df[col] = df[col].fillna("Unknown")

# Parse date_added, then replace missing/unparseable dates with a placeholder.
# String values are stripped *before* parsing: the notebook's general
# whitespace trim only runs later, and a padded value such as
# " August 4, 2017" may not match the date format pandas infers from the
# first row (pandas >= 2.0), in which case errors="coerce" would silently
# turn a valid date into NaT and then into the 1900-01-01 placeholder.
df["date_added"] = pd.to_datetime(
    df["date_added"].map(lambda v: v.strip() if isinstance(v, str) else v),
    errors="coerce",
)
df["date_added"] = df["date_added"].fillna(pd.Timestamp("1900-01-01"))

# Fill rating with most common value.
# mode() is empty when the column has no non-null values; indexing it would
# raise, so the fill is skipped in that case and the nulls are left as-is.
rating_modes = df["rating"].mode()
if not rating_modes.empty:
    df["rating"] = df["rating"].fillna(rating_modes.iloc[0])

# Fill duration with 'Unknown'
df["duration"] = df["duration"].fillna("Unknown")

# -------------------------------
# 3. Data Cleaning
# -------------------------------
# Extract duration in minutes or seasons
def clean_duration(x):
    """Normalise a raw duration value to a uniform label.

    Parameters
    ----------
    x : Any
        Raw cell value; it is converted with ``str()`` so NaN/None are accepted.

    Returns
    -------
    str
        ``"<n> Seasons"`` when the text contains "Season" (singular or plural
        input is always written with the plural unit), ``"<n> min"`` when it
        contains "min", and ``"Unknown"`` for anything else -- including
        values that carry a unit but no numeric count in front of it.
    """
    text = str(x).strip()
    # "Season" is tested before "min", and also matches "Seasons".
    for marker, unit in (("Season", "Seasons"), ("min", "min")):
        if marker in text:
            # Everything before the unit marker is the count candidate.
            count = text.split(marker, 1)[0].strip()
            # Require a plain integer count; otherwise inputs such as "Season"
            # or "Limited Season" would yield malformed labels like " Seasons".
            return f"{count} {unit}" if count.isdigit() else "Unknown"
    return "Unknown"

# Normalise every duration value to "<n> min" / "<n> Seasons" / "Unknown"
df["duration"] = df["duration"].apply(clean_duration)

# Trim whitespace in string cells; non-string cells pass through unchanged.
# Done column-by-column with Series.map because DataFrame.applymap is
# deprecated (pandas >= 2.1), while Series.map is available on old and new
# pandas alike.
df = df.apply(
    lambda column: column.map(
        lambda cell: cell.strip() if isinstance(cell, str) else cell
    )
)

# -------------------------------
# 4. Save cleaned dataset
# -------------------------------
# Write the cleaned data as CSV (without the index column).
# NOTE(review): cleaned_path is the same file that was loaded into df at the
# top of this cell, so the raw CSV is overwritten in place and cannot be
# recovered from this location -- confirm this is intended.
cleaned_path = "/Volumes/workspace/default/netflix/netflix_titles.csv"
df.to_csv(cleaned_path, index=False)

print("✅ Cleaning complete! Cleaned file saved at:", cleaned_path)
display(df.head())


Duplicates before: 0
Duplicates after: 0
✅ Cleaning complete! Cleaned file saved at: /Volumes/workspace/default/netflix/netflix_titles.csv


show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,2021-09-25T00:00:00.000Z,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable."
s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng",South Africa,2021-09-24T00:00:00.000Z,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth."
s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera",Unknown,2021-09-24T00:00:00.000Z,2021,TV-MA,1 Seasons,"Crime TV Shows, International TV Shows, TV Action & Adventure","To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war."
s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,Unknown,2021-09-24T00:00:00.000Z,2021,TV-MA,1 Seasons,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Orleans on this gritty reality series."
s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam Khan, Ahsaas Channa, Revathi Pillai, Urvi Singh, Arun Kumar",India,2021-09-24T00:00:00.000Z,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV Comedies","In a city of coaching centers known to train India’s finest collegiate minds, an earnest but unexceptional student and his friends navigate campus life."
