In [1]:
from nltk.corpus import wordnet as wn
import os
import json
import numpy as np

In [8]:
# test data, free typing and movie id
# 1068044-jumanji, godfather, titanic, mission_impossible
free_typing = ['gun', 'crime']
input_movie_id = 'godfather'

In [3]:
DATA_DIR = "/Users/changwei/Documents/GitHub/cs4300sp2021-cw887-qh75-rz92-yc687-yl698/app/data/pca_svd"
GAME_INFO_DIR = "/Users/changwei/Documents/GitHub/cs4300sp2021-cw887-qh75-rz92-yc687-yl698/app/data"
EIGENVECTORS_PCA_COLUMNS = "game_movie_eigenvectors_column.json"
TOKEN_LST_BEFORE_PCA = "token_list_before_pca.json"
DICT_TOKEN_TO_IDX_BEFORE_PCA = "dict_token_to_id_before_pca.json"
MOVIE_VECTORS_PCA = "dict_movieid_to_vector_pca.json"
GAME_VECTORS_PCA = "dict_gameid_to_vector_pca.json"
GAME_INFO_FILENAME = 'game_info.json'

with open(os.path.join(DATA_DIR, EIGENVECTORS_PCA_COLUMNS), 'r', encoding='utf8') as in_json:
    eigenvectors_column = json.load(in_json)
with open(os.path.join(DATA_DIR, TOKEN_LST_BEFORE_PCA), 'r', encoding='utf8') as in_json:
    token_list = json.load(in_json)
with open(os.path.join(DATA_DIR, DICT_TOKEN_TO_IDX_BEFORE_PCA), 'r', encoding='utf8') as in_json:
    dict_token_to_idx = json.load(in_json)
with open(os.path.join(DATA_DIR, MOVIE_VECTORS_PCA), 'r', encoding='utf8') as in_json:
    movie_vectors = json.load(in_json)
with open(os.path.join(DATA_DIR, GAME_VECTORS_PCA), 'r', encoding='utf8') as in_json:
    game_vectors = json.load(in_json)
with open(os.path.join(GAME_INFO_DIR, GAME_INFO_FILENAME), 'r', encoding='utf8') as in_json_file:
    GAME_INFO = json.load(in_json_file)

In [20]:
eigenvectors = np.array(eigenvectors_column)
game_id_list = np.array(list(game_vectors.keys()))
dict_gameid_to_idx = {gid:i for i, gid in enumerate(game_id_list)}
game_vectors_2d = np.array([game_vectors[key] for key in game_id_list])
print(eigenvectors.shape)
print(game_vectors_2d.shape)

(9716, 4731)
(2734, 4731)


In [18]:
# return a list of syns of given token
def return_syns(token)->list:
    res = []
    syn_n = wn.synsets(token, pos=wn.NOUN)
    syn_adj = wn.synsets(token, pos=wn.ADJ)
    syns = syn_n + syn_adj
    for syn in syns:
        for lem in syn.lemmas():
            res.append(lem.name())
    return res

#
def ranking_by_cosine_similarity(free_list, movie_id)->list:
    """
    free_list is the list of free typing strs
    """
    mvec = np.array(movie_vectors[movie_id])
    if not free_list:
        qvec = mvec
    else:
        qlist = [entry.lower() for entry in free_list]
        for entry in qlist:
            if ' ' in entry:
                qlist.extend(entry.split())
        for term in free_list:
            qlist.extend(return_syns(term))
        qlist = list(set(qlist))
    
        qvec = np.zeros(len(token_list))
        for term in qlist:
            if term in token_list:
                if term in free_list:
                    qvec[dict_token_to_idx[term]] += 5
                else:
                    qvec[dict_token_to_idx[term]] += 2
        qvec = np.matmul(qvec, eigenvectors)
        qvec += mvec
    
    game_norms = np.linalg.norm(game_vectors_2d, axis=1)
    
    scores = np.dot(game_vectors_2d, qvec) / game_norms
    rank_idx_20 = np.flip(np.argsort(scores))[:20]
    rank_gameid_20 = game_id_list[rank_idx_20]
    rank_name_20 = [GAME_INFO[key]['name'] for key in rank_gameid_20]
    return [(rank_gameid_20[i], rank_name_20[i], scores[rank_idx_20[i]]) for i in range(20)], qvec
    

In [6]:
return_syns('touching')

['touch', 'touching', 'touch', 'touching', 'affecting', 'poignant', 'touching']

In [28]:
# test data, free typing and movie id
# movieid example: 1068044-jumanji, godfather, titanic, mission_impossible, world-war-z, star_wars_episode_i_the_phantom_menace
free_typing = ['crime', 'chase']
input_movie_id = 'godfather'
ranking_by_cosine_similarity(free_typing, input_movie_id)

([('233370', 'The Raven - Legacy of a Master Thief', 1.6628290082864796),
  ('1030830', 'Mafia II: Definitive Edition', 1.365286977378788),
  ('40990', 'Mafia', 1.2799347210157626),
  ('21780', 'Driver Parallel Lines', 1.2621576142280475),
  ('354500', 'PAYDAY: The Web Series', 1.2549795793052239),
  ('1030840', 'Mafia: Definitive Edition', 1.2193679499878438),
  ('443810', 'This Is the Police', 1.1889673609984395),
  ('605740',
   'Flashing Lights - Police, Firefighting, Emergency Services Simulator',
   1.142025827721868),
  ('318220', 'Enforcer: Police Crime Action', 1.1323869329965712),
  ('584980', 'Late Shift', 1.1252952916915862),
  ('255280', '1954 Alcatraz', 1.1007059144879268),
  ('50130', 'Mafia II (Classic)', 1.0987729579285255),
  ('17430', 'Need for Speed Undercover', 1.0619964781831053),
  ('208520', 'Omerta - City of Gangsters', 1.0471394484528243),
  ('704210', 'BLOCK WARRIORS: Open World Game', 1.0080758674197225),
  ('358150', "PAYDAY 2: The Butcher's BBQ Pack", 0.96

In [27]:
# if not useful, it indicates that the distance between the game vector and the query vector 
# should not be as close as they are originally. Conversely, if useful, it indicates that the
# distance between two vectors should be closer.
def vector_update(useful, gameid, qvec):
    if useful == True:
        game_vectors_2d[dict_gameid_to_idx[gameid]] += 1/100 * qvec
    else:
        game_vectors_2d[dict_gameid_to_idx[gameid]] -= 1/50 * qvec    

In [26]:
game_vectors_2d[dict_gameid_to_idx['289300']]

array([-0.09051644, -0.72745856,  0.19479312, ..., -0.02940112,
       -0.02888876,  0.00465072])

In [16]:
'high-strength' in token_list

False