In [0]:
# Databricks Notebook: Netflix Dataset Cleaning & Encoding

# Step 1: Import Libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

# Step 2: Load Dataset
# Read the raw Netflix titles CSV into the working DataFrame `df`.
raw_csv_path = "/Volumes/workspace/default/netflix-1/netflix_titles.csv"
df = pd.read_csv(raw_csv_path)
print("✅ Dataset loaded successfully.")

✅ Dataset loaded successfully.


In [0]:
# Step 3: Remove Duplicate Rows
# Rows are compared across all columns; the first occurrence of each
# identical row is kept and later repeats are discarded.
df = df.drop_duplicates(keep='first')
print("✅ Duplicate rows removed.")

✅ Duplicate rows removed.


In [0]:
# Step 4: Handle Missing Values
# Placeholder text substituted for missing entries, keyed by column name.
missing_value_placeholders = {
    'country': "unknown",
    'director': "Not Available",
    'cast': "Not Available",
    'rating': "Not Rated",
    'duration': "0",
}
# Fill one column at a time so a missing column still raises KeyError.
for column_name, placeholder in missing_value_placeholders.items():
    df[column_name] = df[column_name].fillna(placeholder)
print("✅ Missing values handled.")

✅ Missing values handled.


In [0]:
# Step 5: Strip Extra Spaces from Strings
# Trim leading/trailing whitespace in the selected text columns.
# Missing values are deliberately left as missing: a blanket ``astype(str)``
# would convert NaN into the literal text "nan", making it indistinguishable
# from genuine data in columns that were not filled beforehand.
for col in ['type', 'rating', 'country', 'director', 'listed_in']:
    if col in df.columns:
        is_missing = df[col].isna()
        stripped = df[col].astype(str).str.strip()
        # `where` keeps the original entry where the mask is True (missing)
        # and takes the stripped text everywhere else.
        df[col] = df[col].where(is_missing, stripped)
print("✅ Extra spaces removed from object columns.")

✅ Extra spaces removed from object columns.


In [0]:
# Step 6: Remove Special Characters from 'country'
# Delete every character that is not an ASCII letter, a space, or a comma.
country_disallowed_pattern = r'[^a-zA-Z ,]'
df['country'] = df['country'].str.replace(country_disallowed_pattern, '', regex=True)
print("✅ Special characters removed from 'country'.")

✅ Special characters removed from 'country'.


In [0]:
# Step 7: Rename Columns (if needed)
# Mapping of original column name -> new column name.
column_renames = {"show_id": "id"}
df = df.rename(columns=column_renames)
print("✅ Columns renamed successfully.")


✅ Columns renamed successfully.


In [0]:

# Step 8: Drop Unnecessary Columns (optional)
# Nothing is dropped by default. To remove the 'description' column, enable:
#   if 'description' in df.columns:
#       df = df.drop(columns=['description'])
print("✅ Optional columns dropped if applied.")

# Step 9: Handle Duplicates Again
# Compare the row count before and after de-duplication to report how many
# rows were removed.
before = len(df)
df = df.drop_duplicates()
after = len(df)
print(f"✅ Removed {before - after} duplicate rows.")

✅ Optional columns dropped if applied.
✅ Removed 0 duplicate rows.


In [0]:
# Step 10: Create a Cleaned Copy and Save
# `df.copy()` yields an independent frame, so changes made to `df_cleaned`
# do not alter `df`.
cleaned_csv_path = "/Volumes/workspace/default/netflix-1/cleaned_netflix_titles.csv"
df_cleaned = df.copy()
# index=False keeps the DataFrame index out of the written file.
df_cleaned.to_csv(cleaned_csv_path, index=False)
print("✅ Cleaned dataset saved as 'cleaned_netflix_titles.csv'.")


✅ Cleaned dataset saved as 'cleaned_netflix_titles.csv'.


In [0]:
# Step 11: Encoding 'rating' using LabelEncoder
# Learn the set of distinct rating values, then store each row's integer
# code in a new 'rating_encoded' column.
label_encoder = LabelEncoder()
rating_values = df_cleaned['rating']
label_encoder.fit(rating_values)
df_cleaned['rating_encoded'] = label_encoder.transform(rating_values)
print("✅ 'rating' column label encoded.")

✅ 'rating' column label encoded.


In [0]:
# Step 12: Frequency Encoding for 'country'
# Each row receives the number of rows that share its exact 'country' string.
has_country_column = 'country' in df_cleaned.columns
if has_country_column:
    country_counts = df_cleaned['country'].value_counts()
    country_freq = country_counts.to_dict()
    df_cleaned['country_encoded'] = df_cleaned['country'].map(country_freq)
    print("✅ Frequency encoding applied to 'country'.")

✅ Frequency encoding applied to 'country'.


In [0]:
# Step 13: Multi-label One-Hot Encoding for Genres
def _split_genres(raw_value):
    """Return the comma-separated parts of ``raw_value`` as trimmed strings."""
    return [part.strip() for part in str(raw_value).split(',')]


# One list of genre labels per row.
df_cleaned['genres_list'] = df_cleaned['listed_in'].apply(_split_genres)

# Build one indicator column per distinct genre label, aligned to the
# existing row index so the concat below lines up row-for-row.
mlb = MultiLabelBinarizer()
genre_indicator_matrix = mlb.fit_transform(df_cleaned['genres_list'])
genres_encoded = pd.DataFrame(
    genre_indicator_matrix,
    columns=mlb.classes_,
    index=df_cleaned.index,
)
df_cleaned = pd.concat([df_cleaned, genres_encoded], axis=1)
print("✅ Multi-label one-hot encoding applied to genres.")

✅ Multi-label one-hot encoding applied to genres.


In [0]:

# Step 14: Frequency Encoding for Primary Genre
def _first_genre(raw_value):
    """Return the text before the first comma of ``raw_value``, trimmed."""
    return str(raw_value).partition(',')[0].strip()


# The primary genre is the first entry of the comma-separated 'listed_in' text.
df_cleaned['primary_genre'] = df_cleaned['listed_in'].apply(_first_genre)

# Replace each primary genre with the number of rows that share it.
primary_genre_counts = df_cleaned['primary_genre'].value_counts()
genre_freq = primary_genre_counts.to_dict()
df_cleaned['genre_encoded'] = df_cleaned['primary_genre'].map(genre_freq)
print("✅ Frequency encoding applied to primary genre.")


✅ Frequency encoding applied to primary genre.


In [0]:
# Step 15: Save Frequency Encoded Dataset
# Write the encoded frame to CSV; index=False keeps the DataFrame index out
# of the file.
encoded_csv_path = "/Volumes/workspace/default/netflix-1/freq_encoded_netflix_titles.csv"
df_cleaned.to_csv(encoded_csv_path, index=False)
print("✅ Frequency encoded dataset saved as 'freq_encoded_netflix_titles.csv'.")

✅ Frequency encoded dataset saved as 'freq_encoded_netflix_titles.csv'.
