In [16]:
import json
import os
import numpy as np
from nltk.corpus import wordnet as wn

In [17]:
# File names of the JSON inputs; all of them are read from DATA_DIR.
G_REV_KEYWORD_VEC_FILENAME = 'game_rev_keyword_vec.json'
G_REV_WORD_TO_SYNPHRASES_FILENAME = 'game_rev_word_to_synphrase.json'
GAME_INFO_FILENAME = 'game_info.json'
MOVIE_INFO_FILENAME = 'movie_info.json'
# Directory holding the input data. The default is the original absolute path;
# set the GAME_MOVIE_DATA_DIR environment variable to run the notebook from
# another checkout without editing this cell.
DATA_DIR = os.environ.get(
    "GAME_MOVIE_DATA_DIR",
    "/Users/changwei/Documents/GitHub/cs4300sp2021-cw887-qh75-rz92-yc687-yl698/app/data",
)

In [18]:
# NOTE(review): within this notebook GAME_INFO is first assigned in the loading
# cell that follows, so on a fresh kernel (Restart & Run All) this cell raises
# NameError. It should be run after the loading cell.
len(GAME_INFO)

2734

In [19]:
def _read_json(filename):
    """Parse and return the UTF-8 encoded JSON file `filename` found in DATA_DIR."""
    with open(os.path.join(DATA_DIR, filename), 'r', encoding='utf8') as handle:
        return json.load(handle)


# Load the four inputs: game metadata, review word -> synonym phrases,
# per-game review keyword vectors, and movie metadata.
GAME_INFO = _read_json(GAME_INFO_FILENAME)
G_REV_WORD_TO_SYNPHRASES = _read_json(G_REV_WORD_TO_SYNPHRASES_FILENAME)
G_REV_KEYWORD_VEC = _read_json(G_REV_KEYWORD_VEC_FILENAME)
MOVIE_INFO = _read_json(MOVIE_INFO_FILENAME)

In [20]:
# Count token occurrences over every source: game tags, game description
# keywords, game review keywords and movie description keywords.
token_dict = {}


def _tally(tokens):
    """Add one occurrence to token_dict for every item of `tokens`."""
    for token in tokens:
        token_dict[token] = token_dict.get(token, 0) + 1


for game_id in GAME_INFO:
    game = GAME_INFO[game_id]
    _tally(game['tags'])
    _tally(game['desc_keywords'])
    # Not every game has a review keyword vector.
    if game_id in G_REV_KEYWORD_VEC:
        _tally(G_REV_KEYWORD_VEC[game_id]['vector'])
for movie_id in MOVIE_INFO:
    _tally(MOVIE_INFO[movie_id]['desc_keywords'])

# Alphabetically sorted vocabulary plus a parallel array with each token's count.
token_lst = np.array(sorted(token_dict))
token_count = np.array([token_dict[token] for token in token_lst])
num_token = len(token_lst)

In [37]:
# Sanity checks: vocabulary size, the first and last ten tokens, the raw
# counts, and how many tokens occur more than 30 times.
print(token_lst.size)
for preview in (token_lst[:10], token_lst[-10:], token_count):
    print(preview)
print(np.count_nonzero(token_count > 30))

130148
['-' '0' '000 french' '000 mark payment' '000 miles'
 '000 mischievous ghouls descend' '000 pounds' '000 sleeping passengers'
 '000 years' '001 robo']
['zula' 'zuniga' 'zup' 'zurer' 'zus' 'zuwanie' 'zyon' 'zytron' 'édith'
 'édith piaf']
[1 1 1 ... 1 1 1]
2079


In [38]:
# Keep only the tokens whose total occurrence across all sources is strictly
# greater than MIN_TOKEN_OCCURRENCES (30).
MIN_TOKEN_OCCURRENCES = 30
selected_idx = np.nonzero(token_count > MIN_TOKEN_OCCURRENCES)[0]
selected_token_lst = token_lst[selected_idx]
selected_num_token = len(selected_token_lst)
# Token -> column index in the selected vocabulary.
dict_selected_token_to_idx = {t: i for i, t in enumerate(selected_token_lst)}

In [39]:
# Build raw movie vectors over the selected token list; every occurrence of a
# selected description keyword adds a weight of 3.
movie_vectors = dict()
for key in MOVIE_INFO:
    vector = np.zeros(selected_num_token)
    for entry in MOVIE_INFO[key]['desc_keywords']:
        # O(1) dict lookup instead of scanning the NumPy token array with `in`.
        idx = dict_selected_token_to_idx.get(entry)
        if idx is not None:
            vector[idx] += 3
    # `vector` is freshly allocated on each iteration, so no copy is needed.
    movie_vectors[key] = vector
movie_ids = list(MOVIE_INFO.keys())
# Row i of the matrix is the vector of movie_ids[i].
movie_vectors_2d = np.array([movie_vectors[i] for i in movie_ids])
print(len(movie_ids), 'movies in total')
print(movie_vectors_2d.shape)

3000 movies in total
(3000, 2079)


In [40]:
# Build raw game vectors over the selected token list. Each tag occurrence adds
# 5, each description keyword adds 3, and each review keyword adds the weight
# stored for it in G_REV_KEYWORD_VEC (max weight 2 per the original note).
game_vectors = dict()
for key in GAME_INFO:
    vector = np.zeros(selected_num_token)
    for entry in GAME_INFO[key]['tags']:
        # O(1) dict lookup instead of scanning the NumPy token array with `in`.
        idx = dict_selected_token_to_idx.get(entry)
        if idx is not None:
            vector[idx] += 5
    for entry in GAME_INFO[key]['desc_keywords']:
        idx = dict_selected_token_to_idx.get(entry)
        if idx is not None:
            vector[idx] += 3
    # Not every game has a review keyword vector.
    if key in G_REV_KEYWORD_VEC:
        for entry, weight in G_REV_KEYWORD_VEC[key]['vector'].items():
            idx = dict_selected_token_to_idx.get(entry)
            if idx is not None:
                vector[idx] += weight
    # `vector` is freshly allocated on each iteration, so no copy is needed.
    game_vectors[key] = vector

In [41]:
# Stack the per-game vectors into one 2-D array; row order follows game_id_lst.
game_id_lst = list(game_vectors)
game_vectors_2d = np.array([game_vectors[game_id] for game_id in game_id_lst])

In [42]:
# Report the matrix shape, then its two dimensions (rows = games, columns = tokens).
print(game_vectors_2d.shape)
print(f"{game_vectors_2d.shape[0]} games in total")
print(f"{game_vectors_2d.shape[1]} tokens in total")

(2734, 2079)
2734 games in total
2079 tokens in total


In [43]:
# Combine both matrices row-wise: game rows first, movie rows after them.
stacked_parts = [game_vectors_2d, movie_vectors_2d]
game_movie_vectors_2d = np.vstack(stacked_parts)
print(game_movie_vectors_2d.shape)

(5734, 2079)


In [44]:
# Compute the idf-style weight of each selected term as 1 / log2(df + 1), where
# df is the number of documents (games + movies) with a positive entry for it.
term_count_by_doc = np.sum(game_movie_vectors_2d > 0, axis=0)
print(term_count_by_doc)
# Clamp df to at least 1: with df == 0 the denominator log2(1) is 0, which would
# give an infinite weight and NaNs once multiplied into the matrix. Terms with
# df >= 1 are unaffected.
idf_array = 1 / np.log2(np.maximum(term_count_by_doc, 1) + 1)
print(idf_array)
print(len(idf_array))

[ 42  38 641 ... 137 210  45]
[0.18428883 0.18920036 0.10722217 ... 0.14067617 0.12951524 0.1810426 ]
2079


In [45]:
# Weight every column by its idf value, then centre each column on its mean.
# The product is written to a new array rather than applied in place, so that
# re-running this cell does not apply the idf weights a second time.
game_movie_vectors_2d_weighted = game_movie_vectors_2d * idf_array
game_movie_vectors_2d_normalized = (
    game_movie_vectors_2d_weighted - np.mean(game_movie_vectors_2d_weighted, axis=0)
)

In [46]:
# Singular value decomposition of the centred document-term matrix.
# full_matrices=False computes the reduced SVD, so `u` has only min(M, N)
# columns instead of a square M x M matrix; `s` and the rows of `vh` that the
# following cells use are the same.
u, s, vh = np.linalg.svd(game_movie_vectors_2d_normalized, full_matrices=False)

In [49]:
# Keep the smallest number of leading components whose singular values add up
# to more than SINGULAR_VALUE_SHARE (70%) of the sum of all singular values.
# NOTE(review): this is a share of singular-value mass, not of explained
# variance (which would be based on s**2) — confirm which one is intended.
SINGULAR_VALUE_SHARE = 0.7
cumulative_share = np.cumsum(s) / np.sum(s)
# argmax on a boolean array returns the first index where the share is exceeded.
cutoff_idx = np.argmax(cumulative_share > SINGULAR_VALUE_SHARE) + 1
print(cutoff_idx)
print(vh.shape)

933
(2079, 2079)


In [50]:
# Take the first cutoff_idx rows of vh and transpose them, so that each kept
# right singular vector becomes one column of the projection matrix.
leading_rows = vh[:cutoff_idx]
v_pca = np.transpose(leading_rows)
print(v_pca.shape)

(2079, 933)


In [51]:
# Project the centred game/movie vectors onto the reduced basis, split them
# back into games and movies, and store each set as {id: vector-as-list}.
game_movie_vectors_2d_pca = game_movie_vectors_2d_normalized.dot(v_pca)
# Games occupy the top rows of the stacked matrix; derive the split point from
# the id list rather than hard-coding the current number of games.
n_games = len(game_id_lst)
game_vectors_2d_pca = game_movie_vectors_2d_pca[:n_games]
movie_vectors_2d_pca = game_movie_vectors_2d_pca[n_games:]
print(game_movie_vectors_2d_pca.shape)
print(game_vectors_2d_pca.shape)
print(movie_vectors_2d_pca.shape)
dict_game_vectors_2d_pca = {i: vec.tolist() for i, vec in zip(game_id_lst, game_vectors_2d_pca)}
dict_movie_vectors_2d_pca = {i: vec.tolist() for i, vec in zip(movie_ids, movie_vectors_2d_pca)}

(5734, 933)
(2734, 933)
(3000, 933)


In [52]:
# Output directory: the "pca_svd" sub-directory of DATA_DIR (the same location
# as the previously hard-coded absolute path when DATA_DIR is unchanged).
STORE_DIR = os.path.join(DATA_DIR, "pca_svd")
GAME_VECTORS_PCA = "dict_gameid_to_vector_pca.json"
MOVIE_VECTORS_PCA = "dict_movieid_to_vector_pca.json"
EIGENVECTORS_PCA_COLUMNS = "game_movie_eigenvectors_column.json"
TOKEN_LST_BEFORE_PCA = "token_list_before_pca.json"
DICT_TOKEN_TO_IDX_BEFORE_PCA = "dict_token_to_id_before_pca.json"
IDF_ARRAY = "idf_array_before_pca.json"

# Create the output directory if it does not exist yet, so open() cannot fail on it.
os.makedirs(STORE_DIR, exist_ok=True)

# (file name, JSON-serialisable payload) pairs; NumPy arrays are converted to
# plain lists because json cannot serialise ndarrays.
outputs_to_store = (
    (GAME_VECTORS_PCA, dict_game_vectors_2d_pca),
    (MOVIE_VECTORS_PCA, dict_movie_vectors_2d_pca),
    (EIGENVECTORS_PCA_COLUMNS, v_pca.tolist()),
    (TOKEN_LST_BEFORE_PCA, selected_token_lst.tolist()),
    (DICT_TOKEN_TO_IDX_BEFORE_PCA, dict_selected_token_to_idx),
    (IDF_ARRAY, idf_array.tolist()),
)
for out_filename, payload in outputs_to_store:
    with open(os.path.join(STORE_DIR, out_filename), 'w', encoding='utf8') as to_json:
        json.dump(payload, to_json)