# 🎯 Feature Engineering – Netflix Dataset
In this notebook, we will create new features from the cleaned Netflix dataset to support data analysis, visualization, and modeling.

In [1]:
# Change to parent directory and import libraries
import os
import pandas as pd

# Change to parent directory to access data folder
current_dir = os.getcwd()
os.chdir(os.path.dirname(current_dir))
print(f"Working directory: {os.getcwd()}")

Working directory: c:\Users\Student\Downloads\DET\Practice PBI\Power BI\Netflix-Movies-and-TV-Shows-


In [2]:
# Load cleaned data
df = pd.read_csv('data/netflix_cleaned.csv')

# Preview
print(f"Dataset shape: {df.shape}")
df.head()

Dataset shape: (8807, 13)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Not Specified,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",2021.0
1,s2,TV Show,Blood & Water,Not Specified,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2021.0
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",United States,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,2021.0
3,s4,TV Show,Jailbirds New Orleans,Not Specified,Not Specified,United States,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",2021.0
4,s5,TV Show,Kota Factory,Not Specified,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2021.0


## Extract Year and Month from 'date_added'

In [3]:
# Make sure date_added is datetime
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

# Create year and month features
df['year_added'] = df['date_added'].dt.year
df['month_added'] = df['date_added'].dt.month

print("Date features created:")
print(df[['date_added', 'year_added', 'month_added']].head())

Date features created:
  date_added  year_added  month_added
0 2021-09-25      2021.0          9.0
1 2021-09-24      2021.0          9.0
2 2021-09-24      2021.0          9.0
3 2021-09-24      2021.0          9.0
4 2021-09-24      2021.0          9.0


##  Title Features

In [4]:
# Title length
df['title_length'] = df['title'].apply(len)

# Word count in title
df['title_word_count'] = df['title'].apply(lambda x: len(x.split()))

print("Title features created:")
print(df[['title', 'title_length', 'title_word_count']].head())

Title features created:
                   title  title_length  title_word_count
0   Dick Johnson Is Dead            20                 4
1          Blood & Water            13                 3
2              Ganglands             9                 1
3  Jailbirds New Orleans            21                 3
4           Kota Factory            12                 2


## Country Feature

In [6]:
# Check if show is available in multiple countries
df['has_multiple_countries'] = df['country'].apply(lambda x: ',' in x)

print("Country features created:")
print(df[['title', 'country', 'has_multiple_countries']].head())

Country features created:
                   title        country  has_multiple_countries
0   Dick Johnson Is Dead  United States                   False
1          Blood & Water   South Africa                   False
2              Ganglands  United States                   False
3  Jailbirds New Orleans  United States                   False
4           Kota Factory          India                   False


## Duration Feature (Movies Only)

In [7]:
# Filter only Movies
df_movies = df[df['type'] == 'Movie'].copy()

# Extract duration in minutes
df_movies['duration_mins'] = df_movies['duration'].str.replace(' min', '').astype(float)

# Categorise duration
df_movies['duration_category'] = pd.cut(
    df_movies['duration_mins'],
    bins=[0, 60, 90, 120, float('inf')],
    labels=['Short', 'Standard', 'Long', 'Very Long']
)

print(f"Movie duration features created for {len(df_movies)} movies:")
print(df_movies[['title', 'duration', 'duration_mins', 'duration_category']].head())

Movie duration features created for 6131 movies:
                               title duration  duration_mins duration_category
0               Dick Johnson Is Dead   90 min           90.0          Standard
6   My Little Pony: A New Generation   91 min           91.0              Long
7                            Sankofa  125 min          125.0         Very Long
9                       The Starling  104 min          104.0              Long
12                      Je Suis Karl  127 min          127.0         Very Long


## 💾 Save the Feature-Engineered Datasets

In [8]:
# Save the full dataset with new features
df.to_csv('data/netflix_with_features.csv', index=False)

# Save movies dataset with duration features
df_movies.to_csv('data/netflix_movies_with_features.csv', index=False)

print("✅ Feature engineered datasets saved:")
print("   • data/netflix_with_features.csv")
print("   • data/netflix_movies_with_features.csv")

print(f"\n📊 Summary:")
print(f"   • Total records: {len(df):,}")
print(f"   • Total features: {len(df.columns)}")
print(f"   • Movies with duration features: {len(df_movies):,}")

print(f"\n🎯 New features created:")
new_features = ['year_added', 'month_added', 'title_length', 'title_word_count', 
               'num_cast_members', 'has_director', 'has_multiple_countries']
for feature in new_features:
    print(f"   • {feature}")

print(f"\n🎬 Movie-specific features:")
movie_features = ['duration_mins', 'duration_category']
for feature in movie_features:
    print(f"   • {feature}")

✅ Feature engineered datasets saved:
   • data/netflix_with_features.csv
   • data/netflix_movies_with_features.csv

📊 Summary:
   • Total records: 8,807
   • Total features: 19
   • Movies with duration features: 6,131

🎯 New features created:
   • year_added
   • month_added
   • title_length
   • title_word_count
   • num_cast_members
   • has_director
   • has_multiple_countries

🎬 Movie-specific features:
   • duration_mins
   • duration_category
