In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import json
import math
import pickle
import re
from collections import defaultdict
from typing import Dict, List, Tuple

In [3]:
# Trope datasets: each JSON file maps a title to the list of its tropes
# (presumably scraped from TV Tropes, judging by the directory name).
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale, matching how the other JSON datasets are opened.
with open("./app/irsystem/controllers/TVTropesScraper/Film/Film_tropes_dataset3.json", 'r', encoding='utf-8') as f:
    movie_tropes_data = json.load(f)
with open("./app/irsystem/controllers/TVTropesScraper/Literature/Literature_tropes_dataset3.json", 'r', encoding='utf-8') as f:
    book_tropes_data = json.load(f)

In [4]:
# Metadata used for popularity weighting (books / movies keyed by title).
with open("./app/irsystem/controllers/DatasetInfo/tbwb_book_dataset.json", 'r', encoding='utf-8') as json_file:
    alena_books = json.load(json_file)
with open("./app/irsystem/controllers/DatasetInfo/tbwb_movie_dataset.json", 'r', encoding='utf-8') as json_file:
    alena_movies = json.load(json_file)

# NOTE(security): pickle executes arbitrary code while loading, so this file
# must come from a trusted source. The context manager makes sure the handle
# is closed once the data has been read.
with open("./app/irsystem/controllers/DatasetInfo/movielens_reviews.p", "rb") as pickle_file:
    movielens_reviews = pickle.load(pickle_file)

In [5]:
def _build_inverted_index(tropes_by_title):
    """Map every trope to the list of titles whose trope list contains it."""
    index = defaultdict(list)
    for title, tropes in tropes_by_title.items():
        for trope in tropes:
            index[trope].append(title)
    return index


# trope -> titles, one index per medium.
inverted_index_books = _build_inverted_index(book_tropes_data)
inverted_index_movies = _build_inverted_index(movie_tropes_data)

In [6]:
# Position matters: index 0 holds the movie data and index 1 the book data,
# which is the order find_relevant's {"movie": 0, "book": 1} lookup expects.
datasets = [movie_tropes_data, book_tropes_data]
inverted_indices = [inverted_index_movies, inverted_index_books]

In [7]:
def topNTropes(d, n):
    """
    Return display labels for the n highest-valued tropes in d.

    Parameters:
        d: mapping of CamelCase trope name -> numeric weight. It is read only;
           the caller's mapping is left untouched.
        n: maximum number of tropes to return. If d holds fewer than n entries,
           all of them are returned; n <= 0 yields an empty list.

    Returns:
        A list of strings ordered from highest to lowest weight (ties keep the
        mapping's own order). CamelCase names are split into words, and every
        label except the last carries a trailing ", " so the list can be
        concatenated directly.
    """
    # sorted() is stable even with reverse=True, so equal weights stay in
    # insertion order, the same order a repeated "take the first max" yields.
    ranked = sorted(d.items(), key=lambda item: item[1], reverse=True)[:max(n, 0)]

    top = []
    last = len(ranked) - 1
    for position, (trope, _) in enumerate(ranked):
        # First pass splits at a capital that follows a word character; the
        # second pass separates capitals the first pass left adjacent.
        label = re.sub(r"(\w)([A-Z])", r"\1 \2", trope)
        label = re.sub(r"([A-Z])([A-Z])", r"\1 \2", label)
        if position != last:
            label += ", "
        top.append(label)

    return top

In [8]:
def doc_norm(tropes_data,
             inverted_index,
             idf: str=None):
    """
    Compute a norm per document: the square root of the summed squared
    per-trope weights.

    The weighting mirrors the custom formulae used for scoring, which are
    chosen to avoid rewarding documents whose norm is small (e.g. <1).

    tropes_data maps document -> list of tropes; inverted_index maps
    trope -> list of documents. idf selects the weight of a single trope:
    "inverse" (1/df), "log" (1/(1+ln df)) or None (constant 1).
    Returns a defaultdict of document -> norm; raises Exception for any
    other idf value.
    """
    if idf not in ("inverse", "log", None):
        raise Exception("Invalid IDF")

    def squared_weight(trope):
        if idf == "inverse":
            return (1.0 / len(inverted_index[trope])) **2
        if idf == "log":
            return (1.0/(1+np.log(len(inverted_index[trope]))))**2
        return 1

    norms = defaultdict(int)
    for title, tropes in tropes_data.items():
        # Accumulate one trope at a time, in list order.
        running_total = 0
        for trope in tropes:
            running_total += squared_weight(trope)
        norms[title] = math.sqrt(running_total)
    return norms

def get_idf_func(input_inverted_index, result_inverted_index, idf: str):
    """
    Build the per-trope weighting function used when scoring.

    The weight is the product of one factor from each inverted index, based on
    how many documents list the trope there: 1/df for "inverse",
    1/(1+ln df) for "log". With idf=None every trope weighs 1.
    Raises Exception for any other idf value.
    """
    if idf is None:
        return lambda trope: 1

    if idf == "inverse":
        def inverse_weight(trope):
            input_factor = 1.0 / len(input_inverted_index[trope])
            result_factor = 1.0 / len(result_inverted_index[trope])
            return input_factor * result_factor
        return inverse_weight

    if idf == "log":
        def log_weight(trope):
            input_factor = 1.0/(1+np.log(len(input_inverted_index[trope])))
            result_factor = 1.0/(1+np.log(len(result_inverted_index[trope])))
            return input_factor * result_factor
        return log_weight

    raise Exception("Invalid IDF")

def filter_with_num_tropes(doc_scores: List[Tuple],
                           trope_contributions: Dict[str, Dict[str, int]],
                           num_tropes: int):
    """
    Keep only the documents that share at least [num_tropes] tropes with the
    query, i.e. drop those whose contribution dict has fewer entries.

    doc_scores is a list of tuples whose first element is the document name;
    the input order is preserved in the returned list.
    """
    kept = []
    for entry in doc_scores:
        shared = trope_contributions[entry[0]]
        if len(shared) >= num_tropes:
            kept.append(entry)
    return kept

def find_relevant(datasets: List[Dict],
                  inverted_indices: List[Dict],
                  query: str,
                  input_category: str,
                  result_category: str,
                  normalize: bool=True,
                  idf:str=None,
                  min_df:int=0,
                  popularity_weight = 0
                 ):
    """
    THE main TF-IDF function: rank the titles of one medium by how many
    (weighted) tropes they share with a query title of the other medium.

    Parameters:
        datasets: [movie data, book data], each mapping title -> trope list.
        inverted_indices: [movie index, book index], each mapping
            trope -> list of titles.
        query: title to look up in the input dataset (exact match).
        input_category / result_category: "movie" or "book".
        normalize: divide each score by the result document's norm
            (see doc_norm).
        idf: trope weighting scheme, "inverse", "log" or None.
        min_df: skip tropes listed by fewer than this many titles in either
            index.
        popularity_weight: exponent applied to every popularity multiplier.
            0 leaves the scores untouched, 1 applies the multipliers as they
            are, larger values emphasise popularity more.

    Returns:
        (top_results, trope_contributions) where top_results holds at most 5
        (title, score) tuples with a positive score and at least 5 shared
        tropes, best first, and trope_contributions maps
        title -> {trope: weight}. Returns None (after printing a message) if
        the query title is not in the input dataset.
    """
    idx = {"movie": 0, "book": 1}

    input_idx = idx[input_category]
    result_idx = idx[result_category]

    input_dataset = datasets[input_idx]
    result_dataset = datasets[result_idx]
    input_index = inverted_indices[input_idx]
    result_index = inverted_indices[result_idx]

    f = get_idf_func(input_inverted_index=input_index,
                     result_inverted_index=result_index,
                     idf=idf)

    # The query must match a database title exactly
    if query not in input_dataset:
        print("Could not find title: {}".format(query))
        return

    query_vec = input_dataset[query]

    doc_scores = defaultdict(int)

    # record weightage of each trope contributions
    trope_contributions = defaultdict(dict)

    # Update accumulators
    for trope in query_vec:
        # .get() instead of [] so that probing for a trope which one index
        # lacks does not insert an empty posting list into a defaultdict.
        input_postings = input_index.get(trope, ())
        postings = result_index.get(trope, ())
        if len(input_postings) < min_df or len(postings) < min_df:
            continue
        if not postings:
            # Nothing to score; also avoids evaluating the weight for a
            # trope with zero document frequency.
            continue

        # The weight depends only on the trope, so compute it once.
        weight_update = f(trope)
        for doc in postings:
            doc_scores[doc] += weight_update
            trope_contributions[doc][trope] = weight_update

    # Normalize
    if normalize:
        # Only the documents that received a score need a norm.
        scored_docs = {d: result_dataset[d] for d in doc_scores if d in result_dataset}
        norms = doc_norm(scored_docs,
                         result_index,
                         idf=idf)
        for d in doc_scores:
            norm = norms.get(d, 0)
            if norm != 0:
                doc_scores[d] /= norm

    ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! WEIGHT BY POPULARITY !!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ##
    def popularity_multiplier(z):
        """A multiplier between 1 and 1 + ln(3.5) (about 2.25) based on a z-score."""
        z += 4.5
        z = min(z, 7)
        z = max(z, 2)
        return math.log(z/2.0)+1

    # Each multiplier is raised to popularity_weight rather than multiplied by
    # it: a weight of 0 then turns the multiplier into 1 (no popularity effect)
    # instead of zeroing the score, while a weight of 1 applies it unchanged.
    if popularity_weight != 0:
        if result_category == 'book':
            for doc in doc_scores:
                doc_ = doc.lower()
                if doc_ in alena_books and 'num_reviews' in alena_books[doc_]:
                    # presumably 54 / 364 are the mean / spread of review counts - TODO confirm
                    z = (alena_books[doc_]['num_reviews']-54)/364
                    doc_scores[doc] *= popularity_multiplier(z) ** popularity_weight
        else:
            for doc in doc_scores:
                if doc in movielens_reviews:
                    z = (movielens_reviews[doc][0]-2000)/8000 # z-score of number of reviews
                    doc_scores[doc] *= popularity_multiplier(z) ** popularity_weight
                    z = (movielens_reviews[doc][1]-3)/0.5  # z-score of 5-star rating
                    doc_scores[doc] *= popularity_multiplier(z) ** popularity_weight
    ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! WEIGHT BY POPULARITY !!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ##

    doc_idx_scores = sorted(doc_scores.items(), key=lambda x:x[1], reverse=True)
    doc_scores = [(doc, score) for doc, score in doc_idx_scores if score > 0]

    doc_scores = filter_with_num_tropes(doc_scores, trope_contributions, num_tropes=5)

    return doc_scores[:5], trope_contributions


In [9]:
def _print_regular_query(title, input_category, result_category, popularity_weight):
    """Run find_relevant with the notebook's standard settings and print each result with its score."""
    retrieval = find_relevant(datasets = datasets, inverted_indices = inverted_indices, query = title,
                              input_category = input_category, result_category = result_category,
                              min_df = 3, normalize = True, idf = "log",
                              popularity_weight = popularity_weight)
    if retrieval is None:
        # Unknown title: find_relevant has already printed a message and
        # returned None, so there is nothing to list.
        return
    for entry in retrieval[0]:
        # Title followed by the score in red (ANSI escape codes).
        print("{} \x1b[31m{:.3f}\x1b[0m".format(entry[0], entry[1]))


def regular_query_book_without_popularity(title):
    """Print the movies most similar to the book `title`, ignoring popularity."""
    _print_regular_query(title, 'book', 'movie', 0)


def regular_query_book_with_popularity(title, pop_multiplier=1):
    """Print the movies most similar to the book `title`, passing pop_multiplier as the popularity weight."""
    _print_regular_query(title, 'book', 'movie', pop_multiplier)


def regular_query_movie_without_popularity(title):
    """Print the books most similar to the movie `title`, ignoring popularity."""
    _print_regular_query(title, 'movie', 'book', 0)


def regular_query_movie_with_popularity(title, pop_multiplier=1):
    """Print the books most similar to the movie `title`, passing pop_multiplier as the popularity weight."""
    _print_regular_query(title, 'movie', 'book', pop_multiplier)

In [10]:
movie_titles = list(movie_tropes_data)
book_titles = list(book_tropes_data)
combined_titles = movie_titles + book_titles

n = len(movie_titles)
N = len(combined_titles)

# One space-joined "document" per title: all movies first, then all books, so
# the first n rows of the matrix are movies and the remaining rows are books.
trope_documents = [' '.join(movie_tropes_data[title]) for title in movie_titles]
trope_documents += [' '.join(book_tropes_data[title]) for title in book_titles]

tfidf_vec = TfidfVectorizer(min_df=5, max_df=0.95, norm = 'l2', use_idf=True)
doc_by_trope = tfidf_vec.fit_transform(trope_documents).toarray()
movie_by_trope, book_by_trope = doc_by_trope[:n], doc_by_trope[n:]
# tfidf_vec.vocabulary_

In [11]:
# Knobs for the modified similarity:
#   x - magnitude of the negative value written where a trope is absent
#   y - scale applied to the TF-IDF value where a trope is present
#   c - extra factor applied to the book matrix only
x = 1
y = 200
c = 10
mod_mbt = np.where(movie_by_trope != 0, movie_by_trope*y, -x)
mod_bbt = np.where(book_by_trope != 0, book_by_trope*y*c, -x*c)

Taking a nonzero TF-IDF entry to be roughly $0.1$ (the value these figures assume), you are adding $(0.1y)^2 = 0.01y^2$ for agreeing on a trope, subtracting $0.1yx$ for disagreeing on a trope, and adding $x^2$ for mutually not having a trope.

Both ratios, agreeing on a trope vs. disagreeing on a trope and disagreeing on a trope vs. mutually not having a trope, equal $\frac{y}{10x}$. With only $x$ and $y$ to choose, we can't make these two ratios differ.

We can make it asymmetric: I care $c$ times more about my query tropes appearing in the result than the other way around.

The `test` function below is meant to rebuild both of these matrices for whatever `x`, `y`, `c` you pass in, so the values in the cell above only act as initial defaults.

In [12]:
def modified_query_book(title):
    """
    Print the 7 movies most similar to the book `title`.

    Scores are dot products between the book's row of mod_bbt and every row
    of mod_mbt, so mutually absent tropes add to the score as well.
    """
    book_row = book_titles.index(title)
    similarities = np.matmul(mod_mbt, mod_bbt[book_row])
    ranking = np.argsort(similarities)[::-1]  # best match first
    for position in range(7):
        movie_row = ranking[position]
        print("{} \x1b[31m{:.3f}\x1b[0m".format(movie_titles[movie_row], similarities[movie_row]))

def match_modified_query_book(title, x=0.02, y=1, c=5):
    """
    Print up to 7 movies most similar to the book `title`, without rewarding
    tropes that both the book and the movie lack.

    Parameters:
        title: book title; must be present in book_titles (ValueError otherwise).
        x: magnitude of the negative value used where a trope is absent.
        y: scale applied to the TF-IDF value where a trope is present.
        c: extra factor applied to the query (book) side.

    For every movie the score is the dot product of the transformed query and
    movie vectors, restricted to the tropes that at least one of them has.
    """
    i = book_titles.index(title)
    # Start from the raw TF-IDF row. The pre-built mod_bbt row has already had
    # its zeros replaced and its values rescaled, so the ==0 test below would
    # never match and x, y, c would not take effect as intended.
    query_vec = book_by_trope[i]
    # The query transform does not depend on the movie, so build it once.
    mod_query_vec = np.where(query_vec==0, -x*c, query_vec*y*c)

    similarities = []
    for movie_vec in movie_by_trope:
        # 1 where either side has the trope, 0 where both lack it.
        zero_out = np.where(query_vec + movie_vec > 0, 1, 0)
        mod_movie_vec = np.where(movie_vec==0, -x, movie_vec*y)
        similarities.append(np.dot(np.multiply(mod_query_vec, zero_out), mod_movie_vec))

    sorted_titles = np.flip(np.argsort(similarities), axis=0)
    # Slicing copes with corpora that hold fewer than 7 movies.
    for rank in sorted_titles[:7]:
        print("{} \x1b[31m{:.3f}\x1b[0m".format(movie_titles[rank], similarities[rank]))
        
def modified_query_movie(title):
    """
    Print the 7 books most similar to the movie `title`.

    Scores are dot products between the movie's row of mod_mbt and every row
    of mod_bbt, so mutually absent tropes add to the score as well.
    """
    movie_row = movie_titles.index(title)
    similarities = np.matmul(mod_bbt, mod_mbt[movie_row])
    ranking = np.argsort(similarities)[::-1]  # best match first
    for position in range(7):
        book_row = ranking[position]
        print("{} \x1b[31m{:.3f}\x1b[0m".format(book_titles[book_row], similarities[book_row]))
        
def match_modified_query_movie(title, x=0.02, y=1, c=5):
    """
    Print up to 7 books most similar to the movie `title`, without rewarding
    tropes that both the movie and the book lack.

    Parameters:
        title: movie title; must be present in movie_titles (ValueError otherwise).
        x: magnitude of the negative value used where a trope is absent.
        y: scale applied to the TF-IDF value where a trope is present.
        c: extra factor applied to the query (movie) side.

    For every book the score is the dot product of the transformed query and
    book vectors, restricted to the tropes that at least one of them has.
    """
    i = movie_titles.index(title)
    # Start from the raw TF-IDF row. The pre-built mod_mbt row has already had
    # its zeros replaced and its values rescaled, so the ==0 test below would
    # never match and x, y, c would not take effect as intended.
    query_vec = movie_by_trope[i]
    # The query transform does not depend on the book, so build it once.
    mod_query_vec = np.where(query_vec==0, -x*c, query_vec*y*c)

    similarities = []
    for book_vec in book_by_trope:
        # 1 where either side has the trope, 0 where both lack it.
        zero_out = np.where(query_vec + book_vec > 0, 1, 0)
        mod_book_vec = np.where(book_vec==0, -x, book_vec*y)
        similarities.append(np.dot(np.multiply(mod_query_vec, zero_out), mod_book_vec))

    sorted_titles = np.flip(np.argsort(similarities), axis=0)
    # Slicing copes with corpora that hold fewer than 7 books.
    for rank in sorted_titles[:7]:
        print("{} \x1b[31m{:.3f}\x1b[0m".format(book_titles[rank], similarities[rank]))

In [13]:
def test(f, x=0.02, y=1, c=5, pop_multiplier=1):
    """
    This function takes as input one of the functions defined above:
            - regular_query_book_without_popularity      (current implementation)
            - regular_query_book_with_popularity
            - regular_query_movie_without_popularity     (current implementation)
            - regular_query_movie_with_popularity
            - modified_query_book                        (Adds for mutual absence of trope)
            - match_modified_query_book
            - modified_query_movie                       (Adds for mutual absence of trope)
            - match_modified_query_movie

    and tests them on the titles defined within this function. Parameters x, y, c only mean anything for
    the last 4 functions.

    pop_multiplier only means anything for the 2 regular_query functions that use it; it is passed on
    as their popularity weight, so 1 is a "normal" weight, 0 means not weighting by popularity, and 2
    is caring a lot about popularity.

    Side effect: for modified_query_book / modified_query_movie the module-level matrices mod_mbt and
    mod_bbt are rebuilt from x, y, c, because those two functions read them as globals.
    """
    # Without this declaration the assignments below would create locals that
    # the modified_query_* functions never see.
    global mod_mbt, mod_bbt

    book_methods = [regular_query_book_without_popularity, regular_query_book_with_popularity, modified_query_book, match_modified_query_book]
    if f in book_methods:
        titles = ["Harry Potter and the Chamber of Secrets", 'Heart Of Darkness', 'Romeo And Juliet', 'The Hunger Games']
    else:
        titles = ['The Hunger Games', 'Blade Runner 2049', 'Titanic', '(500) Days of Summer']

    # Work out which extra keyword arguments the chosen function understands.
    if f == modified_query_book or f == modified_query_movie:
        mod_mbt = np.where(movie_by_trope==0, -x, movie_by_trope*y)
        mod_bbt = np.where(book_by_trope==0, -x*c, book_by_trope*y*c)
        extra_args = {}
    elif f == match_modified_query_book or f == match_modified_query_movie:
        extra_args = {"x": x, "y": y, "c": c}
    elif f == regular_query_book_with_popularity or f == regular_query_movie_with_popularity:
        extra_args = {"pop_multiplier": pop_multiplier}
    else:
        extra_args = {}

    for title in titles:
        print(title)
        print('-'*25)
        f(title, **extra_args)
        print("\n")

In [15]:
# Book -> movie retrieval with the default popularity weight (pop_multiplier=1).
test(regular_query_book_with_popularity)

Harry Potter and the Chamber of Secrets
-------------------------
Harry Potter and the Chamber of Secrets [31m1.207[0m
Arachnophobia [31m1.014[0m
Home Alone [31m0.989[0m
Monsters, Inc. [31m0.934[0m
Pan's Labyrinth [31m0.915[0m


Heart Of Darkness
-------------------------
Apocalypse Now [31m0.788[0m
The Last Samurai [31m0.747[0m
The Man Who Would Be King [31m0.708[0m
Seven [31m0.447[0m
The Conversation [31m0.419[0m


Romeo And Juliet
-------------------------
Romeo and Juliet [31m9.141[0m
William Shakespeare's Romeo + Juliet [31m1.333[0m
West Side Story [31m1.274[0m
The Godfather: Part II [31m1.025[0m
The Godfather [31m1.025[0m


The Hunger Games
-------------------------
The Hunger Games [31m2.632[0m
The Hunger Games: Mockingjay - Part 1 [31m2.629[0m
The Hunger Games: Mockingjay - Part 2 [31m2.528[0m
Snowpiercer [31m2.118[0m
The Truman Show [31m2.097[0m


