In [1]:
import os
import pandas as pd
import json
import re

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from rake_nltk import Rake 
import yake
import spacy
import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim.summarization import keywords

# Process movie description keywords

In [3]:
# Both input files are expected to sit in the current working directory.
data_dir = os.getcwd()

# JSON text is UTF-8 by specification; pass the encoding explicitly so the load
# does not depend on the platform's default locale encoding.
with open(os.path.join(data_dir, 'movie_id_to_info.json'), encoding='utf-8') as f:
    # Maps movie id -> info dict (its 'description' field is read further down).
    movie_id_to_info = json.load(f)
with open(os.path.join(data_dir, 'movie_id_ranked_by_audience_number.json'), encoding='utf-8') as f:
    # List of movie ids; judging by the file name, ordered by audience count.
    ranked_movies_lst = json.load(f)

In [4]:
# Preview only the first two entries; echoing the whole dict floods the notebook output.
dict(list(movie_id_to_info.items())[:2])

{'0814255': {'name': 'Percy Jackson & the Olympians: The Lightning Thief',
  'simplified_name': 'Percy Jackson & the Olympians',
  'description': "Always trouble-prone, the life of teenager Percy Jackson (Logan Lerman) gets a lot more complicated when he learns he's the son of the Greek god Poseidon. At a training ground for the children of deities, Percy learns to harness his divine powers and prepare for the adventure of a lifetime: he must prevent a feud among the Olympians from erupting into a devastating war on Earth, and rescue his mother from the clutches of Hades, god of the underworld.",
  'genre': ['drama',
   'action & adventure',
   'comedy',
   'science fiction & fantasy'],
  'critics_consensus': 'Though it may seem like just another Harry Potter knockoff, Percy Jackson benefits from a strong supporting cast, a speedy plot, and plenty of fun with Greek mythology.',
  'content_rating': 'PG',
  'audience_count': 254421.0},
 '0878835': {'name': 'Please Give',
  'simplified_na

### Basic cleaning, filtering, lemmatization

In [5]:
# Stop-word list used for filtering: NLTK's English list plus gensim's STOPWORDS.
# Kept as a plain list so existing code that treats it as a list keeps working.
stop_words = stopwords.words('english')
stop_words.extend(STOPWORDS)
# append, not extend: extend('lot') iterates the string and would add the
# single characters 'l', 'o', 't' instead of the word 'lot'.
stop_words.append('lot')

# Single WordNet lemmatizer instance shared by the helper functions.
lemmatizer = WordNetLemmatizer()

"""
Cleans the description string
Returns: string of cleaned description in lower case
"""
def clean_description(description):
    """
    Normalize a description string for tokenization.

    Every run of characters other than ASCII letters, digits, hyphens and
    apostrophes is replaced by a single space, each '--' is collapsed to '-'
    (one pass, left to right), and the result is lower-cased.

    Args:
        description: the text to clean (str).

    Returns:
        The cleaned, lower-cased string. Leading/trailing spaces produced by
        the substitution are not stripped.
    """
    # Raw string: the previous non-raw literal relied on '\-', which is an
    # invalid string escape and triggers a warning on modern Python.
    description = re.sub(r"[^A-Za-z0-9\-']+", ' ', description)
    description = description.replace('--', '-')
    return description.lower()

"""
Lemmatizes and filters given description string
Returns: tokenized list of filtered words
"""
def lemmatize_and_filter(description):
    """
    Split the description on whitespace, lemmatize every token with the
    shared lemmatizer, then drop each lemma that appears in stop_words.

    Returns: list of the remaining lemmas, in their original order
    """
    lemmas = (lemmatizer.lemmatize(token) for token in description.split())
    return [lemma for lemma in lemmas if lemma not in stop_words]

"""
Extracts all 2-word keyphrases (an adjective followed by a noun) from a list of tokenized words
Returns: the set of adjective-noun keyphrases
"""
def create_noun_adj_list(word_lst):
    """
    Collect two-word keyphrases from a list of tokenized words.

    The words are lemmatized and POS-tagged; every pair of consecutive lemmas
    tagged exactly 'JJ' then 'NN' (adjective followed by noun) becomes one
    phrase, joined by a single space.

    Returns: the set of such phrases
    """
    tagged = nltk.pos_tag([lemmatizer.lemmatize(token) for token in word_lst])
    return {
        first_word + ' ' + second_word
        for (first_word, first_tag), (second_word, second_tag) in zip(tagged, tagged[1:])
        if first_tag == 'JJ' and second_tag == 'NN'
    }

### Extraction functions for other libraries
- Yake
- Rake
- Gensim

In [6]:
"""
Extracts keywords and phrases using YAKE library
"""
def yake_extraction(description):
    """
    Extract keywords and keyphrases from a description with YAKE.

    Three extractors are run over the same text, configured with n=3 (top 5),
    n=2 (top 5) and n=1 (top 10), and their results are merged.

    Args:
        description: the text to extract from (str).

    Returns:
        Set of the extracted keyword/keyphrase strings. Scores are dropped,
        so no ordering is implied.
    """
    language = "en"
    deduplication_threshold = 0.9
    # (n-gram setting, number of results to keep) for each extractor.
    extractor_settings = ((3, 5), (2, 5), (1, 10))

    keywords_only = set()
    for ngram_size, top_count in extractor_settings:
        extractor = yake.KeywordExtractor(lan=language, n=ngram_size,
                                          dedupLim=deduplication_threshold,
                                          top=top_count, features=None)
        # The first element of each result is taken as the keyword text, as
        # before. The earlier score sort was dropped: its order was discarded
        # by the conversion to a set anyway.
        keywords_only.update(result[0] for result in extractor.extract_keywords(description))

    return keywords_only

"""
Extracts keywords and phrases using rake library
"""
def rake_extraction(description):
    """
    Run a default-configured Rake instance over the description.

    Returns: set of the ranked phrases it reports (ranking order is not kept)
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(description)
    return set(extractor.get_ranked_phrases())

"""
Extracts keywords and phrases using gensim summarize library
"""
def gensim_keyword_extraction(description):
    """
    Extract keywords from the description with gensim's summarization module.

    Returns: list holding the first element (the keyword text) of every
    scored result; unlike the YAKE/RAKE helpers this is a list, not a set
    """
    # NOTE(review): gensim.summarization is only available in gensim < 4.0 -
    # confirm the pinned gensim version before re-running.
    scored_keywords = gensim.summarization.keywords(
        description,
        ratio=1,
        words=None,
        split=False,
        scores=True,
        pos_filter=None,
        lemmatize=True,
        deacc=True,
    )
    return [entry[0] for entry in scored_keywords]

This example shows that the library extractors work better when the description is not cleaned or filtered.

In [7]:
# Sample description used to compare the extractors on raw vs. cleaned text.
ex = (
    "Always trouble-prone, the life of teenager Percy Jackson (Logan Lerman) gets "
    "a lot more complicated when he learns he's the son of the Greek god Poseidon. "
    "At a training ground for the children of deities, Percy learns to harness his "
    "divine powers and prepare for the adventure of a lifetime: he must prevent a "
    "feud among the Olympians from erupting into a devastating war on Earth, and "
    "rescue his mother from the clutches of Hades, god of the underworld."
)
ex_cleaned = clean_description(ex)

In [10]:
# Raw description through each extractor, in the order gensim, rake, yake.
for extractor in (gensim_keyword_extraction, rake_extraction, yake_extraction):
    print(extractor(ex))

['percy', 'greek god', 'logan', 'jackson', 'training ground', 'trouble', 'devastating war', 'divine powers', 'lerman gets', 'hades', 'poseidon', 'teenager', 'deities', 'learns']
{'prepare', 'son', 'lot', 'harness', 'erupting', 'life', 'devastating war', 'adventure', 'olympians', 'logan lerman', 'must prevent', 'teenager percy jackson', 'clutches', 'feud among', 'prone', 'lifetime', 'god', 'earth', 'underworld', 'hades', 'percy learns', 'always trouble', 'deities', 'learns', 'children', 'gets', 'greek god poseidon', 'mother', 'divine powers', 'rescue', 'training ground', 'complicated'}
{'god', 'god poseidon', 'lerman', 'greek god', 'learns', 'teenager percy', 'logan lerman', 'greek god poseidon', 'greek', 'percy jackson', 'trouble-prone', 'teenager percy jackson', 'jackson', 'poseidon', 'life', 'logan', 'percy'}


In [11]:
# Cleaned description through each extractor, in the order gensim, rake, yake.
for extractor in (gensim_keyword_extraction, rake_extraction, yake_extraction):
    print(extractor(ex_cleaned))

['teenager percy jackson logan lerman gets', 'greek god poseidon', 'trouble', 'devastating war', 'divine powers', 'training ground', 'hades', 'learns', 'deities']
{'prepare', 'son', 'lot', 'harness', 'erupting', 'teenager percy jackson logan lerman gets', 'life', 'devastating war', 'adventure', 'olympians', 'must prevent', 'clutches', 'feud among', 'prone', 'lifetime', 'earth', 'deities percy learns', 'hades god', 'underworld', 'always trouble', 'learns', 'children', 'greek god poseidon', 'mother', 'divine powers', 'rescue', 'training ground', 'complicated'}
{'lerman', 'jackson', 'life', 'teenager', 'devastating war', 'logan', 'logan lerman', 'teenager percy jackson', 'god', 'deities percy learns', 'trouble-prone', 'underworld', 'percy', 'learns', 'greek god poseidon', 'jackson logan lerman', 'divine powers', 'training ground', 'jackson logan', 'percy jackson logan'}


### Extract keywords
- Uses the combined results of the Yake, Rake and Gensim extractions

In [39]:
def create_movie_tf_idf(movie_id_list):
    """
    Build forward and inverted keyphrase indexes from movie descriptions.

    For every movie id whose description is a string, the raw description is
    run through the YAKE, RAKE and gensim extractors and the results are
    merged. A keyphrase is kept when it has at most 4 words, none of its words
    is in stop_words, and, after dropping words of 1-2 characters, it still
    contains more than 2 ASCII letters.

    Args:
        movie_id_list: iterable of movie ids; each must be a key of
            movie_id_to_info whose value has a 'description' entry.

    Returns:
        (tf, idf) where tf maps movie id -> list of its keyphrases and idf
        maps keyphrase -> list of movie ids it was extracted from. Despite
        the names, these are plain indexes, not numeric TF/IDF weights.
        Movies with no surviving keyphrase do not appear in tf.
    """
    tf = dict()
    idf = dict()
    # Loop-invariant: build the stop-word set once instead of re-scanning the
    # stop-word list for every candidate phrase.
    stop_word_set = set(stop_words)

    for movie in movie_id_list:
        raw_desc = movie_id_to_info[movie]['description']
        if not isinstance(raw_desc, str):
            # Non-text descriptions cannot be processed; skip the movie.
            continue

        # The extractors are fed the raw text; the cleaned/lemmatized variants
        # that used to be computed here were never used and have been removed.
        all_keywords = set(yake_extraction(raw_desc))
        all_keywords.update(rake_extraction(raw_desc))
        all_keywords.update(gensim_keyword_extraction(raw_desc))

        # Stripping short tokens can collapse two different phrases into the
        # same string; track what was already recorded for this movie so the
        # indexes do not receive duplicate entries.
        seen = set()
        # Sorted so the output does not depend on set iteration order.
        for phrase in sorted(all_keywords):
            tokens = phrase.split()
            if len(tokens) > 4 or not stop_word_set.isdisjoint(tokens):
                continue
            cleaned = ' '.join(token for token in tokens if len(token) > 2)
            if len(re.sub('[^A-Za-z]+', '', cleaned)) <= 2 or cleaned in seen:
                continue
            seen.add(cleaned)
            tf.setdefault(movie, []).append(cleaned)
            idf.setdefault(cleaned, []).append(movie)

    return tf, idf

In [40]:
# Candidate id lists: the full ranking plus its first 500 and first 5000 entries.
all_movies = ranked_movies_lst
top_500_movies = all_movies[:500]
top_5000_movies = all_movies[:5000]

# Build the keyphrase indexes over every ranked movie.
movie_tf, movie_idf = create_movie_tf_idf(all_movies)

In [41]:
# Show only the 50 longest keyphrases (by word count) instead of echoing every key.
sorted(movie_idf, key=lambda phrase: len(phrase.split()), reverse=True)[:50]

['reigning world heavyweight champion',
 'nfl coach nate scarborough',
 'island theme park populated',
 'ferocious predators break free',
 'angers junior crime lord',
 'supernatural playboy exuding charm',
 'day strange phenomena surface',
 'means braving giant bugs',
 'magical board game unleashes',
 'themed game called jumanji',
 'town called radiator springs',
 'fbi agent carl hanratty',
 'eating tiger shere khan',
 'dade detectives mike lowrey',
 'game writer roger meet',
 'skunk kit named flower',
 'young rabbit named thumper',
 'young doe named faline',
 'fawn named bambi joins',
 'superhero adventure finds batman',
 'rescues explorer ursula stanhope',
 'gentle farmer arthur hoggett',
 'motherly border collie fly',
 'seven friendly little miners',
 'grimm fairy tale gets',
 'deep space salvage team',
 'year old girl named',
 'eyed tribeswoman named evolet',
 'roald dahl work tells',
 'image adorns magazine covers',
 'grace strips randy daytona',
 'fbi chemical warfare expert',
 '

In [43]:
# Output to json: write the forward index and the inverted index next to the inputs.
out_dir = data_dir
for file_name, payload in (
    ('movie_desc_keywords.json', movie_tf),
    ('inv_movie_desc_keywords.json', movie_idf),
):
    with open(os.path.join(out_dir, file_name), 'w') as json_file:
        json.dump(payload, json_file)