The task is to build a movie recommender system based on TF-IDF.

In [1]:
import pandas as pd
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [2]:
# download dataset: 5000 movies from TMDb
# original dataset on Kaggle: https://www.kaggle.com/tmdb/tmdb-movie-metadata
!wget https://lazyprogrammer.me/course_files/nlp/tmdb_5000_movies.csv

--2022-10-20 21:06:10--  https://lazyprogrammer.me/course_files/nlp/tmdb_5000_movies.csv
Resolving lazyprogrammer.me (lazyprogrammer.me)... 172.67.213.166, 104.21.23.210
Connecting to lazyprogrammer.me (lazyprogrammer.me)|172.67.213.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5698602 (5,4M) [text/csv]
Saving to: ‘tmdb_5000_movies.csv.1’


2022-10-20 21:06:12 (3,08 MB/s) - ‘tmdb_5000_movies.csv.1’ saved [5698602/5698602]



In [3]:
# save the dataset in Pandas dataframe
df = pd.read_csv('tmdb_5000_movies.csv')

In [4]:
# check the dataset
df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [5]:
# create a function to combine genres and keywords: this will be useful when fitting the model
def merge_genres_keywords(row):
    genres = json.loads(row['genres'])
    genres = ' '.join(''.join(item['name'].split()) for item in genres)
    
    keywords = json.loads(row['keywords'])
    keywords = ' '.join(''.join(item['name'].split()) for item in keywords)
    
    return "%s %s" % (genres, keywords)

In [6]:
# create a new column in the dataframe
df['genres_keywords'] = df.apply(merge_genres_keywords, axis=1)

In [7]:
# use TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features = 2000)

In [8]:
X = tfidf.fit_transform(df['genres_keywords'])

In [9]:
# create a movie to index dictionary
movie2idx = pd.Series(df.index, index=df['title'])

In [10]:
# create a recommender function
def recommend(title):
    # get query movie index
    idx = movie2idx[title]
    if type(idx) == pd.Series:
        idx = idx.iloc[0]  
    query = X[idx]
    # calculate cosine similarity
    scores = cosine_similarity(query, X)
    scores = scores.flatten()
    # find top 5 similar movies (excluding 0 since it's the movie from the query)
    recommended_idx = (-scores).argsort()[1:6]
    return df['title'].iloc[recommended_idx]

In [24]:
# get recommendations based on 3 random movies 
for t in df['title'].sample(3).values:
    print(f'Movies likes {t}:\n{recommend(t)}\n')

Movies likes Raise the Titanic:
1269     Raise the Titanic
4111           Dinner Rush
4124    This Thing of Ours
3857          Wicked Blood
3284        Brooklyn Rules
Name: title, dtype: object

Movies likes Kundun:
2276     The Painted Veil
3650               Tycoon
372              Spy Game
1234       The Art of War
684     The Hateful Eight
Name: title, dtype: object

Movies likes Rushmore:
3044            Get Over It
1780    Shakespeare in Love
2604          Here On Earth
1772               Flawless
4607            Tumbleweeds
Name: title, dtype: object

