# MovieLens nearest-neighbor query


In [414]:
import struct 
import pickle
import numpy as np

# Integer type of the keys in the sparse model file: 'I64' or 'I32' (default is 'I32').
key_type = 'I64'
# key type -> [struct format character, width in bytes]
key_type_map = {"I32": ["I", 4], "I64": ["q", 8]}

# Number of float values stored per embedding vector.
embedding_vec_size = 128

# Set HugeCTR version here, 2.2 for v2.2, 2.21 for v2.21.
HUGE_CTR_VERSION = 3.0

# Bytes per record: key + slot id + embedding vector (4 bytes per value).
# Up to v2.2 the slot id has the same width as the key; later versions use 8 bytes.
each_key_size = (
    key_type_map[key_type][1]
    + (key_type_map[key_type][1] if HUGE_CTR_VERSION <= 2.2 else 8)
    + 4 * embedding_vec_size
)

In [415]:
# Size of every slot, in slot order: one slot of 137, twenty slots of 3, then the two large slots.
slot_size_array = [137] + [3] * 20 + [162542, 56586]

In [416]:
import numpy as np
# Sum of all slot sizes.
np.sum(slot_size_array)

219325

In [417]:
# Exclusive running total of the slot sizes: entry s is the sum of the sizes of
# all slots before s, so the first entry is 0.
offset = np.cumsum(slot_size_array) - slot_size_array
offset

array([     0,    137,    140,    143,    146,    149,    152,    155,
          158,    161,    164,    167,    170,    173,    176,    179,
          182,    185,    188,    191,    194,    197, 162739])

In [418]:
# One independent, initially empty key -> embedding dictionary per slot (23 slots).
embedding_table = [{} for _ in range(23)]
embedding_table

[{},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {}]

In [419]:
# Parse the sparse model file one fixed-size record at a time and store every
# embedding vector in embedding_table[slot_id][key].
#
# Record layout (each_key_size bytes):
#   HugeCTR <= 2.2 : key, slot id (same integer type as the key), embedding_vec_size floats
#   later versions : key, 8-byte unsigned slot id, embedding_vec_size floats
key_count = 0
key_format, key_width = key_type_map[key_type]   # struct format char / byte width of a key
vector_format = str(embedding_vec_size) + "f"
with open('./0_sparse_10000.model', 'rb') as file:
    try:
        while True:
            buffer = file.read(each_key_size)
            if len(buffer) == 0:
                # Clean end of file.
                break
            if len(buffer) < each_key_size:
                # A partial trailing record means the file is truncated or
                # each_key_size does not match the file's layout.
                print("Truncated record: expected %d bytes, got %d"
                      % (each_key_size, len(buffer)))
                break
            if HUGE_CTR_VERSION <= 2.2:
                key, slot_id = struct.unpack("2" + key_format, buffer[0: 2*key_width])
                values = struct.unpack(vector_format, buffer[2*key_width: ])
            else:
                key = struct.unpack(key_format, buffer[0 : key_width])[0]
                slot_id = struct.unpack("Q", buffer[key_width : key_width + 8])[0]
                values = struct.unpack(vector_format, buffer[key_width + 8: ])

            embedding_table[slot_id][key] = values
            key_count +=1
    # Best effort: report the problem and keep what was parsed so far.
    # Catch Exception rather than BaseException so that interrupting the kernel
    # (KeyboardInterrupt) is not swallowed and mistaken for a finished parse.
    except Exception as error:
        print(error)
print(key_count)

215204


In [420]:
# Bytes accounted for by the parsed records; should equal the on-disk file size
# if every record was read.
file_size = each_key_size * key_count
file_size

113627712

In [422]:
import os
# Actual size of the model file on disk, in bytes, for comparison with file_size.
os.path.getsize('./0_sparse_10000.model')

113627712

In [423]:
# Print the number of keys loaded for each of the 23 slots.
for i in range(23):
    print(len(embedding_table[i]))

136
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
162541
52487


In [424]:
# Total number of keys across all slots.
sum(len(table) for table in embedding_table)

215204

In [425]:
# Extracting item embedding
# Dense (56586, embedding_vec_size) matrix for the last slot. Every row starts at
# the sentinel 1e99 and keeps it unless a key for that row exists in the table
# (presumably so rows without an embedding end up far from every real vector).
item_embedding = np.full((56586, embedding_vec_size), 1e99, dtype='float')
for i, vector in embedding_table[-1].items():
    # Keys of the last slot are shifted by that slot's offset.
    item_embedding[i-offset[-1]] = vector



In [426]:
# Number of keys loaded for slot 1.
len(embedding_table[1])

2

In [427]:
# Keys loaded for slot 1, in ascending order.
sorted(embedding_table[1])

[138, 139]

In [428]:
# Ten largest keys loaded for slot 21.
sorted(embedding_table[21])[-10:]

[162729,
 162730,
 162731,
 162732,
 162733,
 162734,
 162735,
 162736,
 162737,
 162738]

In [429]:
# Ten smallest keys loaded for slot 22.
sorted(embedding_table[22])[:10]

[162740,
 162741,
 162742,
 162743,
 162744,
 162745,
 162746,
 162747,
 162748,
 162749]

In [430]:
# Ten largest keys loaded for slot 22.
sorted(embedding_table[22])[-10:]

[219312,
 219313,
 219314,
 219315,
 219317,
 219318,
 219321,
 219322,
 219323,
 219324]

# Read the NVTabular movie mapping table

In [431]:
import pandas as pd

# Load the unique-movieId category table written by the preprocessing workflow.
movies_mapping = pd.read_parquet('./data/ml-25m/NVTab_preprocessing/workflow/categories/unique.movieId.parquet')

In [432]:
# Preview the first rows of the mapping table.
movies_mapping.head()

Unnamed: 0,movieId
0,
1,1.0
2,2.0
3,3.0
4,4.0


In [433]:
# Preview the last rows of the mapping table.
movies_mapping.tail()

Unnamed: 0,movieId
56581,209155.0
56582,209157.0
56583,209159.0
56584,209169.0
56585,209171.0


In [434]:
# Two-way lookup between the row label of movies_mapping and the movieId stored in that row:
#   nn_to_movies: row label -> movieId
#   movies_to_nn: movieId   -> row label (for a repeated movieId the last row wins)
# Built directly from the column rather than with DataFrame.iterrows(), which
# constructs a Series object for every row.
nn_to_movies = movies_mapping["movieId"].to_dict()
movies_to_nn = {movie_id: nn_index for nn_index, movie_id in nn_to_movies.items()}

import pandas as pd
# Load the movie metadata, indexed by movieId so rows can be looked up with movies.loc[movieId].
movies = pd.read_csv("./data/ml-25m/movies.csv", index_col="movieId")

In [435]:
# Preview the first rows of the movie metadata.
movies.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [436]:
from scipy.spatial.distance import cdist

def find_similar_movies(nn_movie_id, item_embedding, k=10, metric="euclidean"):
    """Return the row indices of the k embeddings closest to one query row.

    Args:
        nn_movie_id: Row index of the query vector inside ``item_embedding``.
        item_embedding: 2-D array with one embedding vector per row.
        k: Number of indices to return.
        metric: Distance metric name passed to ``scipy.spatial.distance.cdist``,
            e.g. "euclidean" or "cosine".

    Returns:
        1-D array of row indices ordered from most to least similar, where the
        similarity score of a row is ``1 - distance`` to the query row. The
        query row is part of the candidate set, so it can appear in the result.
    """
    query_vector = item_embedding[nn_movie_id].reshape(1, -1)
    distances = cdist(item_embedding, query_vector, metric=metric)
    scores = (1 - distances).squeeze()

    # argsort is ascending: keep the last k entries and flip them so the best comes first.
    ranked = scores.argsort()
    return ranked[-k:][::-1]

In [437]:
# For movie IDs 1..99, print the query movie followed by its nearest neighbours
# in the item-embedding space.
for movie_ID in range(1,100):
    # Skip IDs that are absent from movies.csv, and IDs that have no entry in the
    # mapping table (looking those up in movies_to_nn would raise KeyError).
    if movie_ID not in movies.index or movie_ID not in movies_to_nn:
        continue

    query_movie = movies.loc[movie_ID]
    print("Query: ", query_movie["title"], query_movie["genres"])

    print("Similar movies: ")
    similar_movies = find_similar_movies(movies_to_nn[movie_ID], item_embedding)

    for i in similar_movies:
        if i ==0: #missing movie
            continue
        neighbour_id = nn_to_movies[i]
        neighbour = movies.loc[neighbour_id]
        print(neighbour_id, neighbour["title"], neighbour["genres"])
    print("=================================\n")
    

Query:  Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
Similar movies: 
1.0 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
196493.0 Vampires of Geona (1991) Animation|Sci-Fi
158.0 Casper (1995) Adventure|Children
189711.0 Pop World (2005) Comedy|Drama
74456.0 Babysitter Wanted (2008) Horror|Thriller
83927.0 Lucky Jordan (1942) Comedy|Crime|Drama
137315.0 Hyena (2014) Crime|Drama
104795.0 Backlash (1956) Mystery|Romance|Western
158268.0 My Big Night (2015) Comedy
86770.0 Porto of My Childhood (Porto da Minha Infância) (2001) Drama

Query:  Jumanji (1995) Adventure|Children|Fantasy
Similar movies: 
2.0 Jumanji (1995) Adventure|Children|Fantasy
80500.0 Fear Me Not (Den du frygter) (2008) Drama|Thriller
145120.0 Monopol (1996) Comedy
66437.0 Man in the Chair (2007) Comedy|Drama
157631.0 A Man Who Was Superman (2008) Comedy|Drama
177651.0 The Florida Project (2017) Drama
55620.0 For the Bible Tells Me So (2007) Documentary
194090.0 Nat Turner: A Troublesome Prope

In [438]:
# Neighbour indices returned for the last movie processed by the loop above.
similar_movies

array([   98, 46350,  7580, 33733, 41084,  1503, 31062, 39624,  8713,
        6326])