In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [None]:
# read the movies CSV straight from the course URL into a dataframe
url = "https://lazyprogrammer.me/course_files/nlp/tmdb_5000_movies.csv"
df = pd.read_csv(url)
# preview the first few rows
df.head()

In [None]:
# take the first row as a sample record to inspect
x = df.iloc[0]
x

In [None]:
# raw value of the 'genres' column for the sample row
x['genres']

In [None]:
# raw value of the 'keywords' column for the sample row
x['keywords']

In [None]:
# the genres value is a JSON string; parse it into Python objects
j = json.loads(x['genres'])
j

In [None]:
# strip the whitespace inside each entry's 'name' so a multi-word name becomes
# a single token, then join all names with single spaces
' '.join(''.join(jj['name'].split())for jj in j)

In [None]:
# Build the text "document" for one movie: every genre name followed by every
# keyword name, in the form that TfidfVectorizer will tokenize.
def genres_and_keywords_to_string(row):
    """Return the row's genre and keyword names as one space-separated string.

    `row['genres']` and `row['keywords']` are each parsed as a JSON list of
    objects carrying a 'name' key. Whitespace inside a name is removed so a
    multi-word name stays a single token. The genre part and the keyword part
    are always separated by exactly one space, even when one of them is empty.
    """
    fields = []
    for column in ('genres', 'keywords'):
        tokens = []
        for entry in json.loads(row[column]):
            tokens.append(''.join(entry['name'].split()))
        fields.append(' '.join(tokens))
    return ' '.join(fields)

In [None]:
# create a new text representation of each movie by applying the helper
# row-wise (axis=1) and storing the result in a new 'string' column
df['string'] = df.apply(genres_and_keywords_to_string, axis=1)

In [None]:
# create a tf-idf vectorizer object; max_features limits the vocabulary
# to at most 2000 terms
tfidf = TfidfVectorizer(max_features = 2000)

In [None]:
# create a data matrix from the combined genre/keyword strings
# (one row per movie, built from the 'string' column)
X = tfidf.fit_transform(df['string'])

In [None]:
# display the matrix summary
X

In [None]:
# generate a mapping from movie title to dataframe index
movie2idx = pd.Series(df.index, index=df['title'])
movie2idx

In [None]:
 idx = movie2idx['Avatar']
idx

In [None]:
# pull out the tf-idf row for the query movie and view it as a dense array
query = X[idx]
query.toarray()

In [None]:
# compute the cosine similarity between the query and every row of X
scores = cosine_similarity(query, X)
scores

In [None]:
# currently the array is 1 x n; flatten it to a 1-D array, then plot the
# score of every movie in dataset order
scores = scores.flatten()
plt.plot(scores)

In [None]:
# indices that order the scores from highest to lowest
# (argsort is ascending, so the scores are negated first)
(-scores).argsort()

In [None]:
# plot the scores sorted from highest to lowest
plt.plot(scores[(-scores).argsort()])

In [None]:
# get top 5 matches; rank 0 is skipped, presumably because the query movie
# itself is expected to rank first -- NOTE(review): this is not verified here
recommended_idx = (-scores).argsort()[1:6]

In [None]:
# convert the positional indices back to titles
df['title'].iloc[recommended_idx]

In [None]:
#create a function that generates recommendations
def recommend(title, n=5):
    """Return the titles of the `n` movies most similar to `title`.

    Similarity is the cosine similarity between the query movie's row of `X`
    and every row of `X`. The query movie itself is never returned.

    Parameters
    ----------
    title : str
        Movie title to look up in `movie2idx`. If several rows share the
        title, the first one is used.
    n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the recommended movies, most similar first.

    Raises
    ------
    KeyError
        If `title` is not present in `movie2idx`.
    """
    #get the row in the dataframe for this movie
    idx = movie2idx[title]
    if isinstance(idx, pd.Series):
        #duplicate titles yield a Series of indices; use the first match
        idx = idx.iloc[0]

    #calculate the pairwise similarities for this movie
    query = X[idx]
    scores = cosine_similarity(query, X).flatten()

    #rank best-first, then drop the query movie by index instead of assuming
    #it sits at rank 0 (tied scores or an all-zero query vector can put
    #another movie there)
    ranked = (-scores).argsort()
    recommended_idx = ranked[ranked != idx][:n]

    #return the titles of recommendations
    return df['title'].iloc[recommended_idx]



In [None]:
# try the recommender on a sample title
print(recommend('Mortal Kombat'))

In [None]:
# same query movie as the step-by-step walkthrough above
print(recommend('Avatar'))