In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import json
import math
import pickle
import re
from matplotlib import pyplot as plt
from collections import defaultdict
%matplotlib inline

In [2]:
def cosine_similarity(e,v):
    """
    #Input:
    #e = nxd input matrix with n row-vectors of dimensionality d (n is number of dictionary_keys)
    #v = mxd input matrix with m row-vectors of dimensionality d (m is number of test samples)
    # Output:
    # Matrix D of size nxm
    # s(i,j) is the cosinesimiarlity of embed(i,:) and test(j,:)
    """
    g=e.dot(v.T)
    b=np.expand_dims(np.linalg.norm(e,axis=1),1)+1e-16  # plus this small value to avoid division zero.
    a=np.expand_dims(np.linalg.norm(v,axis=1),1)+1e-16  # plus this small value to avoid division zero.
    s=np.divide(g,np.multiply(b,a.T))
    # ... until here
    return s.T
def findknn(D,k):
    """
   # D=cos_distance matrix
   # k = number of nearest neighbors to be found
   # flag =0 , recommend book
   # flag =1 , recommend movie
    
   # Output:
   # indices = kxm matrix, where indices(i,j) is the i^th nearest neighbor of xTe(j,:)
   # dists = Euclidean distances to the respective nearest neighbors
    """
    
    m = D.shape[0]
    ind = np.argsort(D, axis=1)
    
    indices = ind[:,::-1][:,:k]
   # print(indices)
    r = np.array([_ for _ in range(m)], dtype=np.int)
    r = np.array([r] * k).T   
    dists = D[r,indices] 
    return indices,dists

def popularity_multiplier(z, strength=1): 
    """A multiplier between 1 to ~1.6 based on a z-score."""
    z += 4.5
    z = min(z, 7)
    z = max(z, 2)
    return strength*math.log(z/2.0)+1

def load_from_json(file_name):
    with open(file_name, "r") as fp:
            json_file=json.load(fp)
    return json_file


def flattened_list(list_of_lists):
    if list_of_lists is None:
        return None
    flattened = []
    for sublist in list_of_lists:
        for val in sublist:
            flattened.append(val)
    return flattened
def top_tropes_from_vector(v, n_tropes,col_to_trope_list):
    top_dot = np.argsort(-v)[0]

    top_tropes = []
    for i in top_dot[:n_tropes]:
        if v[0][i] != 0:
            top_tropes.append(col_to_trope_list[i])
    return top_tropes
def get_boosted_index_from_summary(query,direction,threshold=0.15):
    """
    # Input:
    # query : name of book or movie
    # k : number of recomendation 
    # threshold: boosting if summary tf-idf theshold exceeds the threshold default:0.2
    # direction: 
    # direction = 'mb' : movie - >  books
    # direction = 'bm' : book  - >  movies
    
    # Output:
    # index of documents to be boosted
    """
        
    if direction == "mb":
        input_data = movie_summary
        input_id2name = movie_id_to_name
        input_name2id = movie_name_to_id
        output_data =book_summary
        output_id2name = book_id_to_name
        ouput_name2id = book_name_to_id
    elif direction == "bm":
        input_data = book_summary
        input_id2name = book_id_to_name
        input_name2id = book_name_to_id
        output_data = movie_summary
        output_id2name = movie_id_to_name
        ouput_name2id = movie_name_to_id
    else:
        raise Exception("Input direction not defined !")
        
    query_vec = input_data[[input_name2id[query]]]
    

    sim = cosine_similarity(output_data,query_vec)
    
    
    boosted_indices= np.where(sim>=threshold)[1]

    return boosted_indices

def recommendation(title, k=5,n_tropes=5,direction='mb', popularity_weight=0,boosting=True,relevance_feedback=False):
#     mod_mbt = np.where(movie_by_trope==0, -x, movie_by_trope*y)
#     mod_bbt = np.where(book_by_trope==0, -x*c, book_by_trope*y*c)
    """
    # Input:
    # query : name of book or movie
    # k : number of recomendation 
    # direction: 
    # direction = 'mb' : movie - >  books
    # direction = 'bm' : book  - >  movies
    # n_tropes: number of top tropes to be returned and displayed
    # popularity_weight: popularity weight
    # boosting : apply boosting to tf-idf tropes using tf-idf summary
    # relevance_feedback 
    
    # Output:
    # recomendations: name of top k of recommended results
    # recomendations_scores : scores of top k of recommended results
     # recomendations_scores : a nested list of top tropes returned of size : (k * n_tropes)
    """
    if popularity_weight is None: popularity_weight = 0
    popularity_weight = float(popularity_weight)
    
    if direction=='mb':
        input_data = movie_by_trope
        input_id2name = movie_id_to_name
        input_name2id = movie_name_to_id
        output_data = book_by_trope
        output_id2name = book_id_to_name
        ouput_name2id = book_name_to_id
        popularity=books_popularity
    elif direction == "bm":
        input_data = book_by_trope
        input_id2name = book_id_to_name
        input_name2id = book_name_to_id
        output_data = movie_by_trope
        output_id2name = movie_id_to_name
        ouput_name2id = movie_name_to_id
        popularity=movies_popularity
    else:
        raise Exception("Input direction not defined !")
        
        
    query_vec = input_data[[input_name2id[query]]]
  
    sim = cosine_similarity(output_data,query_vec)
    

    if popularity_weight > 0:
        sim = np.multiply(sim, popularity_weight * popularity)

    if relevance_feedback:
        
        indices,scores = findknn(sim,k)
        
        alpha = 1
        beta = 0.75
        gamma = 0.15
        top_k=2 # choose top 2 as relevant query
        
        def get_irrevalent(sim,threshold=0):
            """
            # Similarity score <= threshold will be consider as irrelevant docs
            
            """
            m = sim.shape[0]
            
            ind = np.argsort(sim, axis=1)
            
            ire_ind = np.where(sim<=0)[1]
            
            return ire_ind
        
        irrelevant_docs_ids = get_irrevalent(sim)

        relevant_docs_ids = indices[0][:top_k] 
   
        modified_query_vec =   alpha * query_vec  \
                             + beta * np.sum(output_data[relevant_docs_ids],axis=0,keepdims=True)/len(relevant_docs_ids) \
                             - gamma * np.sum(output_data[relevant_docs_ids],axis=0,keepdims=True)/len(irrelevant_docs_ids) 
        
        
        query_vec = modified_query_vec
        sim = cosine_similarity(output_data,query_vec)

        indices,scores = findknn(sim,k)
        
        
    if boosting :
        boosted_score=0.2
        boosted_idx=get_boosted_index_from_summary(query,direction=direction,threshold=0.2)
        
        if boosted_idx is not None:
            for idx in boosted_idx:
                 sim[0][idx]=min(sim[0][idx]+boosted_score,1.0) 
    
    indices,scores = findknn(sim,k)   
    recomendations=[]
    recomendation_scores=[]
    top_tropes=[]
    for i in range(len(indices[0])):
        print ("{} \x1b[31m{:.3f}\x1b[0m".format(output_id2name[indices[0][i]], scores[0][i])) 
        # print(["".join(elem for elem in topNTropes(retrieval[1].get(entry[0]), 5))])
        recomendations.append(output_id2name[indices[0][i]])
        recomendation_scores.append(scores[0][i])
        dot=np.multiply(movie_by_trope[[indices[0][i]]], query_vec[0])
        tropes = top_tropes_from_vector(dot,n_tropes,col_to_trope_list)
        top_tropes.append(tropes)
        
        print(tropes)
    
    return recomendations,recomendation_scores,top_tropes

In [3]:
with open("app/irsystem/controllers/TVTropesScraper/Film/Film_tropes_dataset3.json", 'r') as f:
    movie_tropes_data = json.load(f)
with open("app/irsystem/controllers/TVTropesScraper/Literature/Literature_tropes_dataset3.json", 'r') as f:
    book_tropes_data = json.load(f)

In [4]:
with open("./app/irsystem/controllers/DatasetInfo/book_dataset.json", 'r', encoding='utf-8') as json_file:  
    alena_books = json.loads(json_file.read())
with open("./app/irsystem/controllers/DatasetInfo/movie_dataset.json", 'r', encoding='utf-8') as json_file:  
    alena_movies = json.loads(json_file.read())
movielens_reviews = pickle.load(open("./app/irsystem/controllers/DatasetInfo/movielens_reviews.p", "rb" ))

In [5]:
movie_id_to_summary=load_from_json("./app/irsystem/controllers/DatasetInfo/movie_summary.json")
book_id_to_summary=load_from_json("./app/irsystem/controllers/DatasetInfo/book_summary.json")
movie_summary_corpus= [" ".join(flattened_list(movie_id_to_summary[idx])) if movie_id_to_summary[idx] is not None else "" for idx in list(movie_id_to_summary.keys())]
book_summary_corpus= [" ".join(flattened_list(book_id_to_summary[idx]))  if book_id_to_summary[idx] is not None else "" for idx in list(book_id_to_summary.keys()) ]
# vecterize movie and book
movie_vectorizer = TfidfVectorizer(sublinear_tf =True,smooth_idf=True,stop_words=None)
movie_vectorizer.fit(movie_summary_corpus+book_summary_corpus)
movie_summary=movie_vectorizer.transform(movie_summary_corpus).toarray()
book_summary=movie_vectorizer.transform(book_summary_corpus).toarray()

In [6]:
inverted_index_books = defaultdict(list)
for book, trope_list in book_tropes_data.items():
    for trope in trope_list:
        inverted_index_books[trope].append(book)

inverted_index_movies = defaultdict(list)
for movie, trope_list in movie_tropes_data.items():
    for trope in trope_list:
        inverted_index_movies[trope].append(movie)

In [7]:
movie_titles = []
for k, v in alena_movies.items():
    movie_titles.append((k, v['idx']))
movie_titles.sort(key=lambda pair : pair[1])
movie_titles = [k[0] for k in movie_titles]

book_titles = []
for k, v in alena_books.items():
    book_titles.append((k, v['idx']))
book_titles.sort(key=lambda pair : pair[1])
book_titles = [k[0] for k in book_titles]

In [8]:
common_tropes = set(inverted_index_movies.keys()) | set(inverted_index_books.keys())
# common_tropes = {s.lower() for s in common_tropes}
tf_idf = TfidfVectorizer(min_df=3, lowercase=False, vocabulary = common_tropes, norm='l2', use_idf=True, binary=True)
movie_by_trope = tf_idf.fit_transform([' '.join(movie_tropes_data[movie_titles[i]]) for i in range(len(movie_titles))]).toarray()
book_by_trope = tf_idf.fit_transform([' '.join(book_tropes_data[book_titles[i]]) for i in range(len(book_titles))]).toarray()

trope_to_col = tf_idf.vocabulary_
col_to_trope_list = tf_idf.get_feature_names()

In [9]:
movie_name_to_id= {movie_titles[i]:i  for i in range(len(movie_titles))}
movie_id_to_name= {i:movie_titles[i]  for i in range(len(movie_titles))}
book_name_to_id= {book_titles[i]:i  for i in range(len(book_titles))}
book_id_to_name= {i:book_titles[i]  for i in range(len(book_titles))}


In [10]:


movies_popularity = np.zeros(len(movie_titles))
books_popularity = np.zeros(len(book_titles))

for j in range(len(movie_titles)):
    popularity_boost = 0
    if movie_titles[j] in movielens_reviews:
        z = (movielens_reviews[movie_titles[j]][0]-2000)/8000 # z-score of number of reviews
        popularity_boost += popularity_multiplier(z, strength=2)/5
        z = (movielens_reviews[movie_titles[j]][1]-3)/0.5  # z-score of 5-star rating
        popularity_boost += popularity_multiplier(z, strength=2)/5
    movies_popularity[j] = popularity_boost

for i in range(len(book_titles)):
    popularity_boost = 0
    if 'num_reviews' in alena_books[book_titles[i]]:
        z = (alena_books[book_titles[i]]['num_reviews']-54)/364
        popularity_boost += popularity_multiplier(z, strength=0.3)/2.2
    if 'rating' in alena_books[book_titles[i]]:
        z = (alena_books[book_titles[i]]['rating']-3)/0.5
        popularity_boost += popularity_multiplier(z, strength=0.3)/2.2
    books_popularity[i] = popularity_boost

In [11]:

titles = ["Harry Potter and the Chamber of Secrets", 'Heart Of Darkness', 'Romeo And Juliet', 'The Hunger Games']
for query in titles:
    print(query)
    print("-------------------------")
    print("bookTomovie")   
    names,recomendation_scores,top_tropes=recommendation(query,direction="bm",popularity_weight=0,boosting=True,relevance_feedback=True)
    print()
   
    

Harry Potter and the Chamber of Secrets
-------------------------
bookTomovie
Harry Potter and the Chamber of Secrets [31m0.593[0m
['GiantSpider', 'CoverIdentityAnomaly', 'BadassLongrobe', 'LiteralCliffHanger', 'LaughingAtYourOwnJokes']
Arachnophobia [31m0.399[0m
['OrificeEvacuation', 'SpiderSwarm', 'GiantSpider', 'FacePalm', 'AllForNothing']
Harry Potter and the Goblet of Fire [31m0.273[0m
['ArtisticLicenseBiology', 'EpicRocking', 'AdaptationInducedPlothole', 'INeverSaidItWasPoison', 'BalefulPolymorph']
Harry Potter and the Prisoner of Azkaban [31m0.261[0m
['EpicRocking', 'AdaptationInducedPlothole', 'BodyHorror', 'BalefulPolymorph', 'DarkerAndEdgier']
Harry Potter and the Half-Blood Prince [31m0.250[0m
['AdaptationInducedPlothole', 'AdaptationExplanationExtrication', 'NiceJobFixingItVillain', 'AscendedExtra', 'RedHerring']

Heart Of Darkness
-------------------------
bookTomovie
Tarzan [31m0.402[0m
['HollywoodNatives', 'HungryJungle', 'DarkestAfrica', 'JunglePrincess', 'S

In [12]:
names

['The Hunger Games',
 'The Hunger Games: Mockingjay - Part 2',
 'The Hunger Games: Mockingjay - Part 1',
 'The Hunger Games: Catching Fire',
 'Snowpiercer']

In [14]:
recomendation_scores

[0.6296697813833947,
 0.426556792160219,
 0.426556792160219,
 0.13597624654165127,
 0.12206583195133926]

In [15]:
top_tropes

[['PresidentEvil',
  'AfterActionHealingDrama',
  'ChildrenForcedToKill',
  'DeadlyGame',
  'EverythingTryingToKillYou'],
 ['PresidentEvil',
  'VoiceOfTheResistance',
  'EverythingTryingToKillYou',
  'HerHeartWillGoOn',
  'DeathCourse'],
 ['PresidentEvil',
  'VoiceOfTheResistance',
  'EverythingTryingToKillYou',
  'HerHeartWillGoOn',
  'DeathCourse'],
 ['PresidentEvil',
  'EverythingTryingToKillYou',
  'ForcedToWatch',
  'FogOfDoom',
  'ManiacMonkeys'],
 ['TheRevolutionWillNotBeCivilized',
  'FirstWorldProblems',
  'NotInThisForYourRevolution',
  'NeckSnap',
  'NecessarilyEvil']]

In [16]:

titles = ['The Hunger Games', "The Emperor's Club", 'Titanic']
for query in titles:  
    print(query)
    print("-------------------------")
    print("movieTobook")   
    names,recomendation_scores,top_tropes=recommendation(query,direction="mb",popularity_weight=0,boosting=True,relevance_feedback=True)
    print()
   

The Hunger Games
-------------------------
movieTobook
The Hunger Games [31m0.626[0m
['EvilGloating', 'DueToTheDead', 'Squick', 'TheDitz', 'JumpScare']
Miss Peregrine's Home for Peculiar Children [31m0.393[0m
['OohMeAccentsSlipping', 'NotSoDifferent', 'KarmicDeath', 'OhCrap', 'LargeHam']
Macbeth [31m0.097[0m
['FoodPorn', 'HeldGaze', 'AscendedExtra', 'LightIsNotGood', 'DemotedToExtra']
Vampire Huntress Legend [31m0.088[0m
['DiedInYourArmsTonight', 'RaceLift', 'AscendedExtra', 'AdaptationExpansion', 'SparedByTheAdaptation']
Codex Alera [31m0.088[0m
['BigNo', 'OhCrap', 'LargeHam', 'InterruptedSuicide', 'RunningGag']

The Emperor's Club
-------------------------
movieTobook
One Fine Day In The Middle Of The Night [31m0.451[0m
['AntiHero']
Bend Sinister [31m0.445[0m
['BerserkButton', 'MeaningfulName']
The Wish List [31m0.138[0m
['BrokenPedestal', 'ChekhovsGun', 'BerserkButton', 'MeaningfulName', 'ShoutOut']
The Kite Runner [31m0.104[0m
['AnAesop', 'Egopolis', 'ChekhovsGun'