In [29]:
import json
import os
import numpy as np
from nltk.corpus import wordnet as wn

In [2]:
G_REV_KEYWORD_VEC_FILENAME = 'game_rev_keyword_vec.json'
G_REV_WORD_TO_SYNPHRASES_FILENAME = 'game_rev_word_to_synphrase.json'
GAME_INFO_FILENAME = 'game_info.json'
DATA_DIR = "/Users/changwei/Documents/GitHub/cs4300sp2021-cw887-qh75-rz92-yc687-yl698/app/data"
MOVIE_INFO_FILENAME = 'movie_info.json'

In [3]:
with open(os.path.join(DATA_DIR, GAME_INFO_FILENAME), 'r', encoding='utf8') as in_json_file:
    GAME_INFO = json.load(in_json_file)
with open(os.path.join(DATA_DIR, G_REV_WORD_TO_SYNPHRASES_FILENAME), 'r', encoding='utf8') as in_json_file:
    G_REV_WORD_TO_SYNPHRASES = json.load(in_json_file)
with open(os.path.join(DATA_DIR, G_REV_KEYWORD_VEC_FILENAME), 'r', encoding='utf8') as in_json_file:
    G_REV_KEYWORD_VEC = json.load(in_json_file)
with open(os.path.join(DATA_DIR, MOVIE_INFO_FILENAME), 'r', encoding='utf8') as in_json_file:
    MOVIE_INFO = json.load(in_json_file)

In [5]:
token_dict = dict()
for key in GAME_INFO:
    for entry in GAME_INFO[key]['tags']:
        token_dict[entry] = token_dict.get(entry, 0) + 1
    for entry in GAME_INFO[key]['desc_keywords']:
        token_dict[entry] = token_dict.get(entry, 0) + 1
    if key in G_REV_KEYWORD_VEC:
        for entry in G_REV_KEYWORD_VEC[key]['vector'].keys():
            token_dict[entry] = token_dict.get(entry, 0) + 1  
for key in MOVIE_INFO:
    for entry in MOVIE_INFO[key]['desc_keywords']:
        token_dict[entry] = token_dict.get(entry, 0) + 1
token_lst = np.array(sorted(list(set(token_dict.keys()))))
token_count = np.array([token_dict[e] for e in token_lst])
num_token = len(token_lst)

In [6]:
print(len(token_lst))
print(token_lst[:10])
print(token_lst[-10:])
print(token_count)
print(np.sum(token_count>3))

130148
['-' '0' '000 french' '000 mark payment' '000 miles'
 '000 mischievous ghouls descend' '000 pounds' '000 sleeping passengers'
 '000 years' '001 robo']
['zula' 'zuniga' 'zup' 'zurer' 'zus' 'zuwanie' 'zyon' 'zytron' 'édith'
 'édith piaf']
[1 1 1 ... 1 1 1]
9716


In [7]:
selected_idx = np.nonzero(token_count > 3)[0]
selected_token_lst = token_lst[selected_idx]
selected_num_token = len(selected_token_lst)
dict_selected_token_to_idx = {t:i for i,t in enumerate(selected_token_lst)}

In [8]:
movie_vectors = dict()
for key in MOVIE_INFO:
    vector = np.zeros(selected_num_token)
    for entry in MOVIE_INFO[key]['desc_keywords']:
        if entry in selected_token_lst:
            vector[dict_selected_token_to_idx[entry]] += 3
    movie_vectors[key] = vector.copy()
movie_ids = list(MOVIE_INFO.keys())
movie_vectors_2d = np.array([movie_vectors[i] for i in movie_ids])
print(len(movie_ids), 'movies in total')
print(movie_vectors_2d.shape)

3000 movies in total
(3000, 9716)


In [9]:
game_vectors = dict()
for key in GAME_INFO:
    vector = np.zeros(selected_num_token)
    for entry in GAME_INFO[key]['tags']:
        if entry in selected_token_lst:
            vector[dict_selected_token_to_idx[entry]] += 5
    for entry in GAME_INFO[key]['desc_keywords']:
        if entry in selected_token_lst:
            vector[dict_selected_token_to_idx[entry]] += 3
    if key in G_REV_KEYWORD_VEC:
        for entry in G_REV_KEYWORD_VEC[key]['vector'].keys():
            if entry in selected_token_lst:
                vector[dict_selected_token_to_idx[entry]] += G_REV_KEYWORD_VEC[key]['vector'][entry]
    game_vectors[key] = vector.copy()

In [10]:
game_id_lst = list(game_vectors.keys())
game_vectors_2d = list()
for i in game_id_lst:
    game_vectors_2d.append(game_vectors[i])
game_vectors_2d = np.array(game_vectors_2d)

In [11]:
print(game_vectors_2d.shape)
print(game_vectors_2d.shape[0], "games in total")
print(game_vectors_2d.shape[1], "tokens in total")

(2734, 9716)
2734 games in total
9716 tokens in total


In [12]:
game_movie_vectors_2d = np.vstack((game_vectors_2d, movie_vectors_2d))
game_movie_vectors_2d_normalized = game_movie_vectors_2d - np.mean(game_movie_vectors_2d, axis=0)
print(game_movie_vectors_2d.shape)

(5734, 9716)


In [13]:
u, s, vh = np.linalg.svd(game_movie_vectors_2d_normalized)

In [14]:
cutoff_idx = np.nonzero((np.cumsum(s)/np.sum(s)) > 0.95)[0][0] + 1
print(cutoff_idx)
print(vh.shape)

4525
(9716, 9716)


In [21]:
v_pca = vh[:cutoff_idx].T
print(v_pca.shape)

(9716, 4525)


In [18]:
game_movie_vectors_2d_pca = game_movie_vectors_2d_normalized.dot(v_pca)
game_vectors_2d_pca = game_movie_vectors_2d_pca[:2734]
movie_vectors_2d_pca = game_movie_vectors_2d_pca[2734:]
print(game_movie_vectors_2d_pca.shape)
print(game_vectors_2d_pca.shape)
print(movie_vectors_2d_pca.shape)
dict_game_vectors_2d_pca = {i:vec.tolist() for i, vec in zip(game_id_lst, game_vectors_2d_pca)}
dict_movie_vectors_2d_pca = {i:vec.tolist() for i, vec in zip(movie_ids, movie_vectors_2d_pca)}

(5734, 4525)
(2734, 4525)
(3000, 4525)


In [25]:
STORE_DIR = "/Users/changwei/Documents/GitHub/cs4300sp2021-cw887-qh75-rz92-yc687-yl698/app/data/pca_svd"
GAME_VECTORS_PCA = "dict_gameid_to_vector_pca.json"
MOVIE_VECTORS_PCA = "dict_movieid_to_vector_pca.json"
EIGENVECTORS_PCA_COLUMNS = "game_movie_eigenvectors_column.json"
TOKEN_LST_BEFORE_PCA = "token_list_before_pca.json"
DICT_TOKEN_TO_IDX_BEFORE_PCA = "dict_token_to_id_before_pca.json"

with open(os.path.join(STORE_DIR, GAME_VECTORS_PCA), 'w', encoding='utf8') as to_json:
    json.dump(dict_game_vectors_2d_pca, to_json)
with open(os.path.join(STORE_DIR, MOVIE_VECTORS_PCA), 'w', encoding='utf8') as to_json:
    json.dump(dict_movie_vectors_2d_pca, to_json)
with open(os.path.join(STORE_DIR, EIGENVECTORS_PCA_COLUMNS), 'w', encoding='utf8') as to_json:
    json.dump(v_pca.tolist(), to_json)
with open(os.path.join(STORE_DIR, TOKEN_LST_BEFORE_PCA), 'w', encoding='utf8') as to_json:
    json.dump(selected_token_lst.tolist(), to_json)
with open(os.path.join(STORE_DIR, DICT_TOKEN_TO_IDX_BEFORE_PCA), 'w', encoding='utf8') as to_json:
    json.dump(dict_selected_token_to_idx, to_json)

In [26]:
movie_ids

['0814255',
 '1000013-12_angry_men',
 '1000079-20000_leagues_under_the_sea',
 '10000_bc',
 '10004209-tristan_and_isolde',
 '10004288-running_scared',
 '10004504-ultraviolet',
 '10004659-arthur',
 '10004925-matador',
 '10006007-the_second_chance',
 '1000617-aliens',
 '10006370-stay_alive',
 '10007415-amazing_grace',
 '10007598-cocaine_cowboys',
 '10007947-ten',
 '10007985-happening',
 '10008502-christmas_carol',
 '10008587-beverly_hills_chihuahua',
 '10008606-eye',
 '10008611-my_best_friends_girl',
 '10008617-midnight_meat_train',
 '10008621-run_fat_boy_run',
 '10008655-whiteout',
 '10008760-what_happens_in_vegas',
 '10008785-appaloosa',
 '10008820-visitor',
 '10008954-untraceable',
 '10009063-perfect_holiday',
 '10009083-land_of_the_lost',
 '10009151-box',
 '10009192-21',
 '10009225-body_of_lies',
 '10009254-shutter',
 '10009274-priest',
 '10009460-the_road',
 '10009462-g_force',
 '10009493-duchess',
 '10009516-women',
 '10009526-public_enemies',
 '10009596-old_dogs',
 '10009598-surrog

In [42]:
free_typing = ['monster', 'geralt', 'storyline', 'touching','poignant','affecting']
for entry in free_typing:
    print(entry in selected_token_lst)

True
True
True
False
False
False
