In [3]:
import torch
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import kagglehub

In [4]:
# Fetch the Netflix shows dataset through kagglehub and keep the returned location of the files.
path = kagglehub.dataset_download("infamouscoder/dataset-netflix-shows")

# Print that location so the downloaded files can be found and inspected.
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/infamouscoder/dataset-netflix-shows?dataset_version_number=1...


100%|██████████| 1.34M/1.34M [00:01<00:00, 987kB/s] 

Extracting files...
Path to dataset files: C:\Users\Muhammad Umair\.cache\kagglehub\datasets\infamouscoder\dataset-netflix-shows\versions\1





In [6]:
# Read the titles CSV from the downloaded dataset folder into a DataFrame.
dataset = pd.read_csv(f"{path}/netflix_titles.csv")

In [10]:
# Preview the first rows to check which columns were loaded.
dataset.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [11]:
# Load the "all-MiniLM-L6-v2" sentence-transformers model that is used to embed the title texts.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [13]:
def combine_description_title_and_genre(description, listed_in, title):
    """Build the single text string that gets embedded for one title.

    The result has the form ``"<description> Genre: <listed_in> Title: <title>"``.
    A field that is missing (``None``, ``NaN``, ``pd.NA``) is left out together
    with its label, so the literal words "nan" / "None" never leak into the
    text that is fed to the embedding model. When all three fields are present
    the output is identical to plain string formatting of the three values.

    Args:
        description: Scalar description text of the title (or a missing value).
        listed_in: Scalar genre/category text of the title (or a missing value).
        title: Scalar title text (or a missing value).

    Returns:
        str: The present fields joined by single spaces; an empty string if
        every field is missing.
    """
    # Order matters: it defines the layout of the combined text.
    labelled_fields = (
        ("", description),
        ("Genre: ", listed_in),
        ("Title: ", title),
    )
    return " ".join(
        f"{label}{value}"
        for label, value in labelled_fields
        if not pd.isna(value)  # skip missing cells instead of formatting them as "nan"
    )

In [14]:
# Build the text to embed for every row. Only the three needed columns are
# handed to the row-wise apply; their order matches the helper's parameter
# order (description, listed_in, title), so each row can be unpacked directly.
dataset['combined_text'] = dataset[['description', 'listed_in', 'title']].apply(
    lambda fields: combine_description_title_and_genre(*fields),
    axis=1,
)

In [16]:
# Show the first ten rows to confirm the combined_text column is present.
dataset.head(10)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,combined_text
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...","As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,In a city of coaching centers known to train I...
5,s6,TV Show,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, H...",,"September 24, 2021",2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries",The arrival of a charismatic young priest brin...,The arrival of a charismatic young priest brin...
6,s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, ...",,"September 24, 2021",2021,PG,91 min,Children & Family Movies,Equestria's divided. But a bright-eyed hero be...,Equestria's divided. But a bright-eyed hero be...
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s...","On a photo shoot in Ghana, an American model s..."
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...",United Kingdom,"September 24, 2021",2021,TV-14,9 Seasons,"British TV Shows, Reality TV",A talented batch of amateur bakers face off in...,A talented batch of amateur bakers face off in...
9,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,"September 24, 2021",2021,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...,A woman adjusting to life after a loss contend...


In [17]:
# Number of texts encoded per batch.
batch_size = 32
# Start from an empty container for the embedding vectors.
embeddings = []

In [18]:
# Encode every combined text in a single call. `model.encode` splits the input
# into batches of `batch_size` itself and returns one NumPy array with a row
# per input text. Rebinding `embeddings` (instead of extending a global list)
# keeps this cell safe to re-run: a second run can no longer append duplicate
# rows, nor fail because `embeddings` has meanwhile become an ndarray.
embeddings = model.encode(
    dataset['combined_text'].tolist(),
    batch_size=batch_size,
    convert_to_numpy=True,
    show_progress_bar=True,
)

In [19]:
# Make sure the embeddings are held in a single NumPy array.
embeddings = np.array(embeddings)

In [20]:
# Sanity check: one row per encoded text, one column per embedding dimension.
embeddings.shape

(8807, 384)

In [24]:
# Spot-check the embedding of the final row. A negative index selects the last
# row whatever the dataset size, rather than a row number hard-coded for one
# particular dataset version.
embeddings[-1]

array([-8.13445672e-02,  4.46532257e-02, -3.18311155e-02,  3.84816490e-02,
       -9.98240337e-02,  1.88089631e-04,  8.43751058e-02, -1.07415570e-02,
       -3.00694574e-02, -6.42659739e-02,  2.09726784e-02, -6.54207543e-02,
        4.23450656e-02,  4.59216088e-02, -3.37506905e-02,  4.41308646e-03,
        5.09787276e-02,  6.15022182e-02,  1.24310646e-02,  4.22593858e-03,
       -6.68079332e-02,  7.47554153e-02,  3.78033407e-02, -6.72754571e-02,
       -2.31081210e-02, -5.49594685e-02,  3.26447971e-02,  8.66291746e-02,
       -3.92470770e-02,  4.52225655e-02,  3.71220969e-02,  9.02458578e-02,
        5.36908349e-03, -4.67322096e-02,  1.09626658e-01, -2.92169489e-02,
       -6.38121273e-03, -2.61673727e-03, -2.67291870e-02, -2.53001358e-02,
        7.47718439e-02,  8.91354382e-02, -5.34150638e-02, -5.14100157e-02,
       -1.11470804e-01, -1.48738340e-01,  4.25345488e-02, -1.28498763e-01,
        3.52993943e-02,  5.42039387e-02, -8.71088877e-02, -1.61601566e-02,
        3.47651318e-02, -

In [25]:
# Persist the results: the embedding array as a .npy file, and the selected
# metadata columns as a CSV written in the DataFrame's row order without the
# index column.
np.save(file="netflix_embeddings.npy", arr=embeddings)
dataset.to_csv(
    "netflix_metadata.csv",
    columns=['show_id', 'title', 'description', 'listed_in'],
    index=False,
)