# KEYWORD RECOMMENDER SYSTEM

## Importing libraries

In [24]:
import kagglehub
import pandas as pd
import numpy as np
import json

## Downloading and loading the dataset

In [25]:
path = kagglehub.dataset_download(
                            "tmdb/tmdb-movie-metadata",
                            output_dir = '../data')
movies_path =  path + "/tmdb_5000_movies.csv"
credits_path = path + "/tmdb_5000_credits.csv"

In [26]:
movies = pd.read_csv(movies_path)
credits = pd.read_csv(credits_path)

In [27]:
display(
    movies.head(),
    credits.head()
)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


# Create dictionaries of:

1. Movie names and their keywords from the keywords column as well as the overview column

2. Keywords and their counts

### Keywords dictionary

In [28]:
movie_keywords = {}

for row in movies.iterrows():
    row = pd.Series(row)
    name = row[1]['original_title']
    keywords_list = json.loads(row[1]['keywords'])

    keyword_names = []

    for keyword in keywords_list:
        keyword_names.append(keyword['name'])

    movie_keywords[name] = keyword_names

In [29]:
print('Shrek:', movie_keywords['Shrek'])
print('Spider-Man:', movie_keywords['Spider-Man'])

Shrek: ['magic', 'liberation', 'lordship', 'castle', 'robin hood', 'enchantment', 'fairy-tale figure', 'princess', 'parody', 'woman director', 'ogre']
Spider-Man: ['loss of lover', 'spider', 'thanksgiving', 'bad boss', 'hostility', 'marvel comic', 'superhero', 'pokies', 'evil', 'reference to superman', 'goblin']


### Overview keywords dictionary

In [30]:
# Importing NLTK libraries and downloading dependencies
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Downloading and setting stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Setting up the regex tokenizer
tokenizer = RegexpTokenizer(r"\w+'t|\w+'ve|\w+").tokenize # Spare all words ending with 't and 've; haven't -> haven't, would've -> would've, but Macy's -> Macy

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shauryasingru/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
# Creating an initial dictionary of movie name as key and its lowercase overview as value
movie_overview = {title: str(overview).lower() for title, overview in movies[['original_title', 'overview']].to_numpy()}

# Tokenising the overview for each movie
movie_overview = {movie: tokenizer(movie_overview[movie]) for movie in movie_keywords}

# Removing stopwords for the overview tokens for each movie
for movie in movie_overview:
    overview_keywords = [kw for kw in movie_overview[movie] if kw not in stop_words]
    movie_overview[movie] = overview_keywords

In [32]:
print('Shrek:', movie_overview['Shrek'])
print('Spider-Man:', movie_overview['Spider-Man'])

Shrek: ["ain't", 'easy', 'bein', 'green', 'especially', 'likable', 'albeit', 'smelly', 'ogre', 'named', 'shrek', 'mission', 'retrieve', 'gorgeous', 'princess', 'clutches', 'fire', 'breathing', 'dragon', 'shrek', 'teams', 'unlikely', 'compatriot', 'wisecracking', 'donkey']
Spider-Man: ['bitten', 'genetically', 'altered', 'spider', 'nerdy', 'high', 'school', 'student', 'peter', 'parker', 'endowed', 'amazing', 'powers']


In [33]:
# Combining given keywords as well as overview keywords
movie_keywords_all = {}

for movie in movie_keywords:
    movie_keywords_all[movie] = movie_keywords[movie] + movie_overview[movie]

In [34]:
print('Shrek:', movie_keywords_all['Shrek'])
print('Spider-Man:', movie_keywords_all['Spider-Man'])

Shrek: ['magic', 'liberation', 'lordship', 'castle', 'robin hood', 'enchantment', 'fairy-tale figure', 'princess', 'parody', 'woman director', 'ogre', "ain't", 'easy', 'bein', 'green', 'especially', 'likable', 'albeit', 'smelly', 'ogre', 'named', 'shrek', 'mission', 'retrieve', 'gorgeous', 'princess', 'clutches', 'fire', 'breathing', 'dragon', 'shrek', 'teams', 'unlikely', 'compatriot', 'wisecracking', 'donkey']
Spider-Man: ['loss of lover', 'spider', 'thanksgiving', 'bad boss', 'hostility', 'marvel comic', 'superhero', 'pokies', 'evil', 'reference to superman', 'goblin', 'bitten', 'genetically', 'altered', 'spider', 'nerdy', 'high', 'school', 'student', 'peter', 'parker', 'endowed', 'amazing', 'powers']


### Lemmatising the keywords

In [35]:
#Importing sub-modules required for lemmatising
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Downloading required dependencies
# nltk.download("punkt")
nltk.download("averaged_perceptron_tagger_eng")
nltk.download("wordnet")

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/shauryasingru/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shauryasingru/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [36]:
# Setting up lemmatiser
wnl = WordNetLemmatizer()

def get_wordnet_pos(tag):
    """Map NLTK POS tags to WordNet POS tags"""
    if tag.startswith("V"): return wordnet.VERB
    if tag.startswith("J"): return wordnet.ADJ
    if tag.startswith("R"): return wordnet.ADV
    return wordnet.NOUN  # default

In [37]:
# Extracting lemmas for all movies
movie_keywords_lemmas = {}
for movie in movie_keywords_all:
    tags = pos_tag(movie_keywords_all[movie])
    lemmas = [wnl.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tags]
    movie_keywords_lemmas[movie] = lemmas

In [38]:
# Spltting multi-word expressions into separate words and appending them to the existing list of lemmas for each movie
for movie in movie_keywords_lemmas:
    lemmas = movie_keywords_lemmas[movie]
    split_words = []

    for lemma in lemmas:
        if ' ' in lemma:
            split_words += lemma.split(' ')
    lemmas += split_words
    movie_keywords[movie] = lemmas

### Counting the number of words across all movies

In [48]:
keyword_counts = {}
for movie in movie_keywords_lemmas:
    lemmas = movie_keywords[movie]
    
    for lemma in lemmas:
        keyword_counts[lemma] = keyword_counts.get(lemma, 0) + 1

keyword_counts = sorted(keyword_counts.items(), key = lambda kw: kw[0])
keyword_counts = {kw[0]: kw[1] for kw in keyword_counts}

In [49]:
keyword_counts = sorted(keyword_counts.items(), key = lambda kw: kw[1], reverse=True)
keyword_counts = {kw[0]: kw[1] for kw in keyword_counts}
keyword_counts

{'life': 1176,
 'new': 995,
 'find': 900,
 'love': 875,
 'world': 832,
 'woman': 827,
 'film': 792,
 'family': 780,
 'young': 776,
 'one': 760,
 'man': 711,
 'relationship': 662,
 'year': 661,
 'get': 649,
 'friend': 646,
 'war': 621,
 'take': 615,
 'of': 573,
 'story': 568,
 'on': 565,
 'father': 559,
 'two': 554,
 'murder': 522,
 'school': 518,
 'time': 493,
 'based': 486,
 'secret': 474,
 'make': 471,
 'become': 470,
 'go': 452,
 'death': 445,
 'must': 437,
 'old': 428,
 'director': 418,
 'brother': 402,
 'live': 400,
 'high': 389,
 'son': 384,
 'come': 383,
 'drug': 383,
 'town': 383,
 'child': 381,
 'city': 378,
 'help': 371,
 'police': 370,
 'mother': 363,
 'daughter': 361,
 'force': 361,
 'american': 359,
 'girl': 359,
 'york': 359,
 'set': 353,
 'wife': 349,
 'turn': 345,
 'day': 339,
 'way': 337,
 'back': 333,
 'try': 333,
 'independent': 331,
 'home': 330,
 'woman director': 324,
 'work': 324,
 'independent film': 317,
 'duringcreditsstinger': 307,
 'begin': 306,
 'lead': 306

In [55]:
cutoff = 5
keyword_counts_cutoff = ({kw: keyword_counts[kw] for kw in keyword_counts if keyword_counts[kw] >= cutoff})
keyword_counts_cutoff

{'life': 1176,
 'new': 995,
 'find': 900,
 'love': 875,
 'world': 832,
 'woman': 827,
 'film': 792,
 'family': 780,
 'young': 776,
 'one': 760,
 'man': 711,
 'relationship': 662,
 'year': 661,
 'get': 649,
 'friend': 646,
 'war': 621,
 'take': 615,
 'of': 573,
 'story': 568,
 'on': 565,
 'father': 559,
 'two': 554,
 'murder': 522,
 'school': 518,
 'time': 493,
 'based': 486,
 'secret': 474,
 'make': 471,
 'become': 470,
 'go': 452,
 'death': 445,
 'must': 437,
 'old': 428,
 'director': 418,
 'brother': 402,
 'live': 400,
 'high': 389,
 'son': 384,
 'come': 383,
 'drug': 383,
 'town': 383,
 'child': 381,
 'city': 378,
 'help': 371,
 'police': 370,
 'mother': 363,
 'daughter': 361,
 'force': 361,
 'american': 359,
 'girl': 359,
 'york': 359,
 'set': 353,
 'wife': 349,
 'turn': 345,
 'day': 339,
 'way': 337,
 'back': 333,
 'try': 333,
 'independent': 331,
 'home': 330,
 'woman director': 324,
 'work': 324,
 'independent film': 317,
 'duringcreditsstinger': 307,
 'begin': 306,
 'lead': 306

In [56]:
print(f'Total number of unique keywords that have a count of at least {cutoff}: {len(keyword_counts_cutoff)}')

Total number of unique keywords that have a count of at least 5: 6456
