In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from gensim.models import FastText
from nltk.tokenize import word_tokenize
from sklearn.neighbors import NearestNeighbors

# from sentence_transformers import SentenceTransformer, util
# from gensim.utils import simple_preprocess

# from sklearn.model_selection import train_test_split
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import classification_report


In [2]:
# Read the Netflix catalogue from 'netflix.csv' into a DataFrame and preview
# its first five rows.
df = pd.read_csv(filepath_or_buffer='netflix.csv')
df.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,genres,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,14-Aug-20,2020,TV-MA,4,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s10,Movie,1920,Vikram Bhatt,"Rajneesh Duggal, Adah Sharma, Indraneil Sengup...",India,15-Dec-17,2008,TV-MA,143,"Horror Movies, International Movies, Thrillers",An architect and his wife move into a castle t...
2,s100,Movie,3 Heroines,Iman Brotoseno,"Reza Rahadian, Bunga Citra Lestari, Tara Basro...",Indonesia,05-Jan-19,2016,TV-PG,124,"Dramas, International Movies, Sports Movies",Three Indonesian women break records by becomi...
3,s1000,Movie,Blue Mountain State: The Rise of Thadland,Lev L. Spiro,"Alan Ritchson, Darin Brooks, James Cade, Rob R...",United States,01-Mar-16,2016,R,90,Comedies,New NFL star Thad buys his old teammates' belo...
4,s1001,TV Show,Blue Planet II,,David Attenborough,United Kingdom,03-Dec-18,2017,TV-G,1,"British TV Shows, Docuseries, Science & Nature TV",This sequel to the award-winning nature series...


In [3]:
# Remove, in place, every row that has a missing value in any column.
# preprocess_text() calls .lower() on each cell it is given, which a NaN
# (float) value does not support.
# NOTE(review): this also discards rows whose only gap is in a column the
# embeddings never read (e.g. shows listed without a director) -- confirm
# that this is intended.
df.dropna(axis=0, how='any', inplace=True)

In [4]:
# Text normalisation: lower-case the input, then split it into individual
# word tokens.

def preprocess_text(text):
    """Return the list of NLTK word tokens of ``text`` after lower-casing it.

    ``text`` must be a string (it is lower-cased with ``str.lower``).
    """
    lowered = text.lower()
    tokens = word_tokenize(lowered)
    return tokens

In [5]:
# train a FastText model on a collection of raw text values
def train_fasttext(texts, vector_size=200, window=10, min_count=2, workers=4):
    """Tokenize ``texts`` and train a gensim FastText model on the result.

    Parameters
    ----------
    texts : iterable of str
        Raw strings; each one is tokenized with ``preprocess_text`` and
        treated as one training sentence.
    vector_size : int, default 200
        Dimensionality of the learned word vectors.
    window : int, default 10
        Context window size passed to FastText.
    min_count : int, default 2
        Words with a lower total frequency are ignored by FastText.
    workers : int, default 4
        Number of worker threads used for training.

    The defaults reproduce the values that were previously hard-coded, so
    existing ``train_fasttext(texts)`` calls behave exactly as before.

    Returns
    -------
    gensim.models.FastText
        The trained model.
    """
    tokenized_text = [preprocess_text(text) for text in texts]
    return FastText(
        sentences=tokenized_text,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
    )

In [6]:
# Text columns that each get their own FastText model, and the mapping from
# column name to the model trained on that column's values.
columns = [
    'director',
    'genres',
    'country',
    'description',
    'cast',
]
models = {
    name: train_fasttext(df[name])
    for name in columns
}

In [7]:
# save models
# model.save() writes straight to the given path and does not create missing
# parent directories, so make sure 'models/' exists first; exist_ok=True keeps
# the cell safe to re-run.
os.makedirs('models', exist_ok=True)
for column, model in models.items():
    # One file per column, e.g. models/genres_model.bin. These are written by
    # gensim's own save(), so reload them with FastText.load().
    model.save(f'models/{column}_model.bin')

In [8]:
# Average word vector of a token list
def get_avg_embedding(tokens, model):
    """Return the element-wise mean of the word vectors of ``tokens``.

    Only tokens for which ``token in model.wv`` holds contribute. When no
    token contributes (including an empty ``tokens``), a zero vector of
    length ``model.vector_size`` is returned instead.
    """
    vectors = []
    for token in tokens:
        if token in model.wv:
            vectors.append(model.wv[token])
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

# For every text column, add a '<column>_embedding' column holding the
# averaged word vector of each cell, computed with that column's own model.
for column in columns:
    df[f'{column}_embedding'] = df[column].apply(
        lambda cell_text: get_avg_embedding(preprocess_text(cell_text), models[column])
    )

In [9]:
# Collapse a row's per-column embeddings into one vector
def combine_embeddings(row):
    """Return the element-wise mean of the row's '<column>_embedding' values.

    One embedding is read for each name in the module-level ``columns`` list.
    """
    per_column = []
    for column in columns:
        per_column.append(row[f'{column}_embedding'])
    return np.mean(per_column, axis=0)

In [10]:
# Compute one combined embedding per row, then write the enriched frame to
# 'processed_data.csv' without the index.
# NOTE(review): the embedding columns hold numpy arrays, which a CSV can only
# store as text -- confirm that downstream readers parse them as intended.
df['combined_embedding'] = df.apply(combine_embeddings, axis=1)
df.to_csv(path_or_buf='processed_data.csv', index=False)

In [11]:
# Stack the combined embeddings into a single 2-D matrix (one row per
# DataFrame row) and fit a 5-neighbour search that uses cosine distance.
embeddings_matrix = np.vstack(df['combined_embedding'].to_numpy())
knn = NearestNeighbors(metric='cosine', n_neighbors=5)
knn.fit(embeddings_matrix)

In [12]:
# Persist the fitted NearestNeighbors object to 'knn_model.pkl' with joblib
joblib.dump(value=knn, filename='knn_model.pkl')

['knn_model.pkl']