## Netflix Recommender App using K-means and Fasttext embedding
by Tiffany Mangelli MBD Section 1

In [1]:
import pandas as pd
import numpy as np
import random
import re

### 1.Preprocessing
Select rows which is movie type and nessecary columns<br/>
Remove special character and transform to lower case

In [2]:
orig_netflix = pd.read_csv("netflix_titles.csv")
print(orig_netflix.shape)
orig_netflix.head()

(8807, 12)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [3]:
orig_netflix = orig_netflix[orig_netflix["type"] == "Movie"].reset_index()
netflix = orig_netflix[["title", "listed_in", "description"]].copy()
netflix.head()

Unnamed: 0,title,listed_in,description
0,Dick Johnson Is Dead,Documentaries,"As her father nears the end of his life, filmm..."
1,My Little Pony: A New Generation,Children & Family Movies,Equestria's divided. But a bright-eyed hero be...
2,Sankofa,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
3,The Starling,"Comedies, Dramas",A woman adjusting to life after a loss contend...
4,Je Suis Karl,"Dramas, International Movies",After most of her family is murdered in a terr...


In [4]:
netflix.isnull().sum()

title          0
listed_in      0
description    0
dtype: int64

In [5]:
def preprocessing(desc):
    desc = desc.lower()
    desc = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', ' ', desc)
    desc = " ".join(desc.split())
    
    return desc

In [6]:
netflix["new_description"] = netflix["description"].apply(lambda x: preprocessing(x))
print(netflix.shape)
netflix.head()

(6131, 4)


Unnamed: 0,title,listed_in,description,new_description
0,Dick Johnson Is Dead,Documentaries,"As her father nears the end of his life, filmm...",as her father nears the end of his life filmma...
1,My Little Pony: A New Generation,Children & Family Movies,Equestria's divided. But a bright-eyed hero be...,equestria s divided but a bright eyed hero bel...
2,Sankofa,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s...",on a photo shoot in ghana an american model sl...
3,The Starling,"Comedies, Dramas",A woman adjusting to life after a loss contend...,a woman adjusting to life after a loss contend...
4,Je Suis Karl,"Dramas, International Movies",After most of her family is murdered in a terr...,after most of her family is murdered in a terr...


In [7]:
print(netflix["description"].iloc[0])
print(netflix["new_description"].iloc[0])

As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.
as her father nears the end of his life filmmaker kirsten johnson stages his death in inventive and comical ways to help them both face the inevitable


### 2. Embedding
1. Split each sentence to make the corpus<br/>
2. Embedding the corpus with Fasttext method<br/>
3. Transform sentences to feature vector

In [8]:
from gensim.models.fasttext import FastText as FT_gensim

corpus = netflix["new_description"].tolist()
sentences = [re.split(' ', str(sentence)) for sentence in corpus]
print(corpus[0])
print(sentences[0])

as her father nears the end of his life filmmaker kirsten johnson stages his death in inventive and comical ways to help them both face the inevitable
['as', 'her', 'father', 'nears', 'the', 'end', 'of', 'his', 'life', 'filmmaker', 'kirsten', 'johnson', 'stages', 'his', 'death', 'in', 'inventive', 'and', 'comical', 'ways', 'to', 'help', 'them', 'both', 'face', 'the', 'inevitable']


In [9]:
embedding_size = 30

FT_model = FT_gensim(vector_size=embedding_size, min_count=2, min_n=2, max_n=5, sg=1, negative=10,
                         sample=0.001, window=5, alpha=0.025, min_alpha=0.0001, epochs=50)

FT_model.build_vocab(sentences)

print('corpus_count: ', FT_model.corpus_count)
print('corpus_total_words: ', FT_model.corpus_total_words)

FT_model.train(sentences,
    epochs=FT_model.epochs,
    total_examples=FT_model.corpus_count, total_words=FT_model.corpus_total_words)

print(FT_model)

corpus_count:  6131
corpus_total_words:  151843
FastText<vocab=8454, vector_size=30, alpha=0.025>


In [11]:
FT_vector = []

for item in corpus:
    FT_vector.append(FT_model.wv[str(item)])
FT_vector = np.asarray(FT_vector)

### 3. K-means Clustering
1. Train k-means clustering with feature vector<br/>
2. Add cluster_id on dataframe 

In [18]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

kmeanModel = KMeans(n_clusters=26, random_state=42).fit(FT_vector)
cluster_id = kmeanModel.predict(FT_vector)
netflix["cluster_id"] = cluster_id



In [19]:
netflix.head()

Unnamed: 0,title,listed_in,description,new_description,cluster_id
0,Dick Johnson Is Dead,Documentaries,"As her father nears the end of his life, filmm...",as her father nears the end of his life filmma...,11
1,My Little Pony: A New Generation,Children & Family Movies,Equestria's divided. But a bright-eyed hero be...,equestria s divided but a bright eyed hero bel...,7
2,Sankofa,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s...",on a photo shoot in ghana an american model sl...,5
3,The Starling,"Comedies, Dramas",A woman adjusting to life after a loss contend...,a woman adjusting to life after a loss contend...,11
4,Je Suis Karl,"Dramas, International Movies",After most of her family is murdered in a terr...,after most of her family is murdered in a terr...,22


### 4. Recommendation system
Searching similarity of new description between source movie and target movie in same cluster<br/>
Sorting dataframe with similarity and return title of most similar movie with number of top_k

In [20]:
def recommendation_system(title_name):
    top_k = 5
    title_row = netflix[netflix["title"] == title_name].copy()
    search_df = netflix[netflix["cluster_id"].isin(title_row["cluster_id"])].copy()
    search_df = search_df.drop(search_df[search_df["title"] == title_name].index)
    
    search_df["Similarity"] = search_df.apply(lambda x: FT_model.wv.similarity(title_row["new_description"], x["new_description"]), axis=1)
    search_df.sort_values(by=["Similarity"], ascending=False, inplace=True)
    
    return search_df[["title", "Similarity"]].head(top_k)

In [21]:
recommendation_system("Killing Them Softly")

Unnamed: 0,title,Similarity
330,Major Grom: Plague Doctor,[0.97557825]
433,Jagame Thandhiram,[0.97205675]
4800,John Day,[0.9719341]
3393,Aakhri Adaalat,[0.9707152]
90,In Too Deep,[0.9705769]


In [24]:
recommendation_system("The Interview")

Unnamed: 0,title,Similarity
6053,We're No Animals,[0.960557]
3686,Dieter Nuhr: Nuhr in Berlin,[0.9560946]
974,Death to 2020,[0.9553115]
2472,Bollywood Calling,[0.95481765]
3023,I Hate Luv Storys,[0.9543296]


In [26]:
from joblib import dump, load

# Save the model
dump(kmeanModel, 'kmeans_model.joblib')

['kmeans_model.joblib']

In [27]:
netflix.head()

Unnamed: 0,title,listed_in,description,new_description,cluster_id
0,Dick Johnson Is Dead,Documentaries,"As her father nears the end of his life, filmm...",as her father nears the end of his life filmma...,11
1,My Little Pony: A New Generation,Children & Family Movies,Equestria's divided. But a bright-eyed hero be...,equestria s divided but a bright eyed hero bel...,7
2,Sankofa,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s...",on a photo shoot in ghana an american model sl...,5
3,The Starling,"Comedies, Dramas",A woman adjusting to life after a loss contend...,a woman adjusting to life after a loss contend...,11
4,Je Suis Karl,"Dramas, International Movies",After most of her family is murdered in a terr...,after most of her family is murdered in a terr...,22


In [28]:
netflix.to_csv('processed_netflix_titles.csv', index=False)
