In [1]:
import re
import pandas as pd
import numpy as np

movies = []
with open('./dataset/movielens/movies.dat', encoding='latin1') as f:
    for l in f:
        id_, title, genres = l.strip().split('::')
        genres_set = set(genres.split('|'))

        # extract year
        assert re.match(r'.*\([0-9]{4}\)$', title)
        year = title[-5:-1]
        title = title[:-6].strip()

        data = {'movie_id': int(id_), 'title': title, 'year': year}
        for g in genres_set:
            data[g] = True
        movies.append(data)
movies = pd.DataFrame(movies).astype({'year': 'category'})

ratings = []
with open('./dataset/movielens/ratings.dat', encoding='latin1') as f:
    for l in f:
        user_id, movie_id, rating, timestamp = [int(_) for _ in l.split('::')]
        ratings.append({
            'user_id': user_id,
            'movie_id': movie_id,
            'rating': rating,
            'timestamp': timestamp,
            })
ratings = pd.DataFrame(ratings)

# ratings에 등장한 적 있는 유저와 영화만 필터링
distinct_users_in_ratings = ratings['user_id'].unique()
distinct_movies_in_ratings = ratings['movie_id'].unique()
movies = movies[movies['movie_id'].isin(distinct_movies_in_ratings)]

# 제목 제거하고, genre categorical comlumns 정리
genre_columns = movies.columns.drop(['movie_id', 'title', 'year'])
movies[genre_columns] = movies[genre_columns].fillna(False).astype('bool')
movies_categorical = movies.drop('title', axis=1)

In [2]:
entities = movies['movie_id'].astype('category')
entities = entities.cat.reorder_categories(movies['movie_id'].values)

index_id_to_movie_id = {}
movie_id_to_index_id = {}
for idx, movie_id in enumerate(entities):
    index_id_to_movie_id[idx] = movie_id
    movie_id_to_index_id[movie_id] = idx

In [3]:
from scipy import spatial

saved_npz = np.load('./pinsage/' + 'h_items.npz')
h_item = saved_npz['movie_vectors']
tree = spatial.KDTree(h_item.tolist())
tree.query(h_item[movie_id_to_index_id[1]].tolist(), 10)

(array([0.        , 0.7739352 , 0.77502719, 0.77830732, 0.82051339,
        1.00335428, 1.03126933, 1.03271216, 1.14840752, 1.16873208]),
 array([   0,   12,  237,  829,  232,  996, 2162, 2161,   52, 2898]))

In [4]:
index_ids = tree.query(h_item[movie_id_to_index_id[1]].tolist(), 10)[1]
movie_ids = [index_id_to_movie_id[idx] for idx in index_ids]
for mid in movie_ids:
    print(movies[movies['movie_id']==mid]['title'].values)

['Toy Story']
['Balto']
['Gumby: The Movie']
['Land Before Time III: The Time of the Great Giving']
['Goofy Movie, A']
['Aladdin and the King of Thieves']
["Bug's Life, A"]
['Rugrats Movie, The']
['Big Green, The']
['Toy Story 2']


In [6]:
movies

Unnamed: 0,movie_id,title,year,Comedy,Animation,Children's,Adventure,Fantasy,Romance,Drama,...,Action,Crime,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1,Toy Story,1995,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2,Jumanji,1995,False,False,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3,Grumpier Old Men,1995,True,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,4,Waiting to Exhale,1995,True,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
4,5,Father of the Bride Part II,1995,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents,2000,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3879,3949,Requiem for a Dream,2000,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
3880,3950,Tigerland,2000,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
3881,3951,Two Family House,2000,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
