In [5]:
import pandas as pd
# Import TfidfVectorizer and linear_kernel from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# Load the LeetCode problems dataset (one row per problem) from a CSV file.
# low_memory=False makes pandas read the file in one pass instead of in chunks.
data = pd.read_csv('../leetcode_data.csv', low_memory=False)

In [9]:
# Replace missing problem statements with empty strings so every row of
# the 'content' column is a string before it is vectorized.
data['content'] = data['content'].fillna('')

# Build the TF-IDF document-term matrix, ignoring English stop words.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['content'])

# Shape is (number of problems, number of distinct terms);
# the recorded run shows (870, 4458).
tfidf_matrix.shape

(870, 4458)

In [10]:
# Preview the first three rows to check the loaded columns.
data.head(3)

Unnamed: 0,id,name,content,difficulty
0,1160,Letter Tile Possibilities,"You have a set of tiles, where each tile has o...",2
1,1159,Smallest Subsequence of Distinct Characters,Return the lexicographically smallest subseque...,2
2,1157,Insufficient Nodes in Root to Leaf Paths,"Given the root of a binary tree, consider all ...",2


In [12]:
# Compute the pairwise similarity matrix between all problem descriptions.
# linear_kernel is a plain dot product; because TfidfVectorizer L2-normalises
# each row by default, that dot product equals the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Construct a reverse map from problem name to its row in `data`.
# Series.drop_duplicates() compares *values* (the row numbers, which are always
# unique), so it never removed repeated names. De-duplicate on the index label
# instead, keeping the first occurrence, so idx[name] is always a single scalar.
idx = pd.Series(data.index, index=data['name'])
idx = idx[~idx.index.duplicated(keep='first')]

# The original cell built the same mapping under two names; keep `indices`
# available as an independent copy for any code that still refers to it.
indices = idx.copy()

In [13]:
# Show the first three entries of the name -> row-number lookup.
idx[:3]

name
Letter Tile Possibilities                      0
Smallest Subsequence of Distinct Characters    1
Insufficient Nodes in Root to Leaf Paths       2
dtype: int64

In [15]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the names of the problems most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact problem name; it must be a label of the module-level ``idx``
        Series (name -> row number in ``data``).
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix indexed by row number. Defaults to the
        module-level ``cosine_sim`` that exists when this function is defined.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Up to ``top_n`` values of ``data['name']``, most similar first,
        never including the queried problem itself.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``idx``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Look up the row number of the problem that matches the title.
    try:
        movie_idx = idx[title]
    except KeyError:
        raise KeyError(f"Unknown problem name: {title!r}") from None

    # A repeated name makes the lookup return a Series; use its first row.
    if isinstance(movie_idx, pd.Series):
        movie_idx = movie_idx.iloc[0]
    movie_idx = int(movie_idx)

    # Pair every other row with its similarity to the query. The query row is
    # excluded explicitly rather than by dropping the top-ranked entry: with
    # tied scores (e.g. identical or empty descriptions) the query is not
    # guaranteed to sort first.
    sim_scores = [
        (row, score)
        for row, score in enumerate(cosine_sim[movie_idx])
        if row != movie_idx
    ]

    # Stable sort, highest similarity first.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    res_idx = [row for row, _ in sim_scores[:top_n]]

    return data['name'].iloc[res_idx]

In [17]:
# Example query: problems whose descriptions are most similar to 'LRU Cache'.
get_recommendations('LRU Cache')

510                        LFU Cache
536         All O`one Data Structure
376                    Map Sum Pairs
270                   Design HashMap
126                    Knight Dialer
472                    Freedom Trail
520             Delete Node in a BST
455      Convert BST to Greater Tree
202    Shortest Path to Get All Keys
79        Time Based Key-Value Store
Name: name, dtype: object

In [18]:
# Example query: problems whose descriptions are most similar to 'LFU Cache'.
get_recommendations('LFU Cache')

724                        LRU Cache
536         All O`one Data Structure
270                   Design HashMap
376                    Map Sum Pairs
126                    Knight Dialer
472                    Freedom Trail
202    Shortest Path to Get All Keys
455      Convert BST to Greater Tree
520             Delete Node in a BST
79        Time Based Key-Value Store
Name: name, dtype: object

In [21]:
# Example query: problems whose descriptions are most similar to 'Word Ladder'.
get_recommendations('Word Ladder')

744                               Word Ladder II
266                      Unique Morse Code Words
95                            Vowel Spellchecker
840    Substring with Concatenation of All Words
791                                  Word Search
812                          Length of Last Word
251                             Most Common Word
618              Maximum Product of Word Lengths
730                                Word Break II
349                   Longest Word in Dictionary
Name: name, dtype: object