In [2]:
import numpy as np
import pandas as pd
import os
import json
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Read the TMDB movie metadata CSV into a dataframe.
# Source: https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata
df = pd.read_csv(filepath_or_buffer='/content/tmdb_5000_movies.csv')

In [4]:
# Peek at the first five rows to check that the columns loaded as expected
df.head(n=5)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [5]:
def _names_to_tokens(raw_json):
    """Turn a JSON list of ``{"name": ...}`` objects into one token per name.

    Whitespace inside a name is removed so that a multi-word name stays a
    single token (e.g. "Science Fiction" -> "ScienceFiction").

    Args:
        raw_json: JSON text holding a list of objects that each have a
            ``name`` key. Any non-string value (for example the NaN pandas
            stores for an empty CSV cell) is treated as an empty list.

    Returns:
        A list of non-empty tokens, in the order the names appear.
    """
    if not isinstance(raw_json, str):
        return []
    squashed = (''.join(entry['name'].split()) for entry in json.loads(raw_json))
    # A name that is empty or whitespace-only contributes no token
    return [token for token in squashed if token]


def tfidf_tokens_creation(row):
    """Build the space-separated token string used as TF-IDF input for one movie.

    Tokens are taken from the ``genres``, ``keywords`` and ``spoken_languages``
    columns, in that order. Each column holds a JSON list of objects with a
    ``name`` key; every name becomes exactly one token.

    Example: genres "Action", "Science Fiction", "Thriller" become
    "Action ScienceFiction Thriller".

    Args:
        row: Mapping-like record (e.g. a dataframe row) with the keys
            ``genres``, ``keywords`` and ``spoken_languages``.

    Returns:
        All tokens joined by single spaces, with no leading or trailing
        whitespace; an empty string when the row yields no tokens.
    """
    tokens = []
    for column in ('genres', 'keywords', 'spoken_languages'):
        tokens.extend(_names_to_tokens(row[column]))

    # Joining a list of non-empty tokens never produces stray or repeated
    # spaces, so no regex clean-up pass is needed afterwards.
    return ' '.join(tokens)

In [6]:
# Run the tokenizing function on each row and store the resulting strings in a new column
df['for_tfidf'] = df.apply(tfidf_tokens_creation, axis='columns')

In [7]:
# Show the token strings produced for each movie
df.loc[:, 'for_tfidf']

0       Action Adventure Fantasy ScienceFiction cultur...
1       Adventure Fantasy Action ocean drugabuse exoti...
2       Action Adventure Crime spy basedonnovel secret...
3       Action Crime Drama Thriller dccomics crimefigh...
4       Action Adventure ScienceFiction basedonnovel m...
                              ...                        
4798    Action Crime Thriller unitedstates–mexicobarri...
4799                                       Comedy Romance
4800    Comedy Drama Romance TVMovie date loveatfirsts...
4801                                              English
4802    Documentary obsession camcorder crush dreamgir...
Name: for_tfidf, Length: 4803, dtype: object

In [8]:
# Initiate TF-IDF instance (no arguments, so the library's default settings apply).
# NOTE(review): the default tokenization presumably splits on punctuation, so a
# merged keyword such as "unitedstates–mexicobarrier" (visible in the for_tfidf
# preview) may be broken into several tokens — confirm, and consider a
# whitespace-only token_pattern if every merged name should stay a single token.
vect = TfidfVectorizer()

In [9]:
# Learn the vocabulary from the token strings and vectorize them in one step
res = vect.fit_transform(raw_documents=df['for_tfidf'])

In [10]:
# Pairwise cosine similarity between the TF-IDF vectors of all movies (2D array)
sim_score = cosine_similarity(res)

# Show the array's dimensions first, then the array itself
print(sim_score.shape, sim_score, sep='\n')

(4803, 4803)
[[1.         0.02811759 0.04418895 ... 0.01306468 0.03292404 0.00233724]
 [0.02811759 1.         0.02632518 ... 0.00202723 0.03688681 0.00261855]
 [0.04418895 0.02632518 1.         ... 0.00301984 0.05494804 0.0039007 ]
 ...
 [0.01306468 0.00202723 0.00301984 ... 1.         0.05495814 0.00390142]
 [0.03292404 0.03688681 0.05494804 ... 0.05495814 1.         0.07098886]
 [0.00233724 0.00261855 0.0039007  ... 0.00390142 0.07098886 1.        ]]


In [11]:
# Title lookup: a Series indexed by movie title whose values are the dataframe's row labels
movie_dict = pd.Series(data=df.index, index=df['title'])

In [12]:
# Input the title query.
# Surrounding whitespace is stripped so an accidental leading/trailing space
# does not make the title lookup fail.
query = input("Query a movie name: ").strip()

Query a movie name: Avatar


In [13]:
# Find the row(s) whose title equals the query. A boolean mask is used instead of
# movie_dict[query] because nothing guarantees titles are unique: a duplicated
# title would return several rows and break the one-column dataframe below.
matches = movie_dict[movie_dict.index == query]
if matches.empty:
    raise KeyError(f"Movie title not found in the dataset: {query!r}")

# When several movies share the title, use the first one
query_pos = int(matches.iloc[0])

# Retrieve that movie's row from the similarity scores array and create a new dataframe for it
ranks = pd.DataFrame(sim_score[query_pos], columns=['SimScore'])

# Sort the dataframe by the similarity score, highest first
ranks = ranks.sort_values('SimScore', ascending=False)

# Pin the queried movie to the first row. Sorting alone does not guarantee this
# when another movie ties with it (identical tokens, or an all-zero vector), and
# the recommendation step skips row 0 on the assumption that it is the query itself.
ranks = ranks.loc[[query_pos] + [pos for pos in ranks.index if pos != query_pos]]

# Preview of the sorted dataframe
ranks

Unnamed: 0,SimScore
0,1.000000
47,0.248063
1287,0.197770
61,0.196649
1201,0.185065
...,...
3273,0.000000
1948,0.000000
4544,0.000000
4553,0.000000


In [14]:
# The result of the recommendations, only the top 5.
# Row 0 of ranks is skipped because it is expected to be the queried movie itself.
top_ranks = ranks.iloc[1:6]

# The index of ranks holds row positions, so the titles are fetched positionally
# from the title index. Index.take is explicitly positional; indexing the
# title-indexed Series with integers relied on a deprecated positional fallback.
top_titles = movie_dict.index.take(top_ranks.index.to_numpy())

# Building from a dict keeps the score column numeric instead of object dtype
recommendations = pd.DataFrame({
    'Movie Titles': top_titles.to_numpy(),
    'CS Score': top_ranks['SimScore'].to_numpy(),
})
recommendations

Unnamed: 0,Movie Titles,CS Score
0,Star Trek Into Darkness,0.248063
1,A Monster in Paris,0.19777
2,Jupiter Ascending,0.196649
3,Predators,0.185065
4,Planet 51,0.174946
