# Embedding similarity with faiss


This notebook performs cosine similarity calculation with the faiss library.

### Parameters for invoking the notebook

- `cskg_embedding_path`: a .gz file containing the embeddings for all cskg entities
- `query_ent_mat`: the query entities in matrix form; here we mimic some query data from the training data

In [1]:
# Parameters
# Path to the gzip-compressed, tab-separated embedding file
# (one entity per line: entity name followed by its vector components).
cskg_embedding_path = "../output/embeddings/comp_log_dot_0.05.tsv.gz"
# Placeholder for the query matrix: declared here, assigned later in the search cell.
query_ent_mat = None # declare the variable; it is assigned later

In [2]:
import faiss
import gzip
import numpy as np

###  Prepare data
- read embeddings from cskg_embedding_path to construct training data (expressed in matrix form)
- build a bidirectional entity name/index dictionary for later lookups
- build a faiss index that stores the vectors

In [3]:
%%time
# Load every entity embedding from the gzip-compressed TSV at cskg_embedding_path.
# Each non-empty line is expected to look like: <entity name> TAB <v1> TAB <v2> ...
entity_dict = {}        # bidirectional lookup: entity name -> row index AND row index -> entity name
entity_embeddings = []  # one float32 vector per entity, in file order

with gzip.open(cskg_embedding_path, 'rt') as f:
    for line_no, line in enumerate(f, start=1):
        line = line.rstrip('\r\n')
        if not line:
            # Skip blank/trailing lines so they neither add an empty vector
            # nor shift the row numbering used by entity_dict.
            continue
        fields = line.split('\t')
        entity_name = fields[0]
        # Convert each row to float32 right away: keeping millions of rows as
        # Python float lists (and then a float64 matrix) costs far more memory.
        entity_vec = np.array([float(v) for v in fields[1:]], dtype=np.float32)
        if entity_embeddings and entity_vec.shape != entity_embeddings[0].shape:
            raise ValueError(
                f'line {line_no} of {cskg_embedding_path} has {entity_vec.shape[0]} '
                f'values, expected {entity_embeddings[0].shape[0]}'
            )
        # Number rows by their position in the matrix (not by file line) so the
        # dictionary always agrees with the row order of X.
        row = len(entity_embeddings)
        entity_embeddings.append(entity_vec)
        entity_dict[entity_name] = row
        entity_dict[row] = entity_name

if not entity_embeddings or entity_embeddings[0].shape[0] == 0:
    raise ValueError(f'no embedding vectors found in {cskg_embedding_path}')

# entity_embeddings => 2-D float32 matrix (one row per entity)
X = np.array(entity_embeddings, dtype=np.float32)
dimension = X.shape[1]


# build a flat (exhaustive) index scored by inner product
vec_index = faiss.index_factory(dimension, "Flat", faiss.METRIC_INNER_PRODUCT)
# L2-normalize the rows of X in place so that inner product equals cosine similarity
faiss.normalize_L2(X)
# add the normalized vectors to the index
vec_index.add(X)
print(f'number of vectors in the index: {vec_index.ntotal}')

number of vectors in the index: 2160968
CPU times: user 2min 23s, sys: 15.9 s, total: 2min 39s
Wall time: 2min 22s


### Search topk neighbors



    normal case:
    query_set = [[...],[...],[...]]
    query_mat = np.array(query_set).astype(np.float32)
    faiss.normalize_L2(query_mat) 

In [4]:
# Stand-in queries: reuse the first 10 stored entity vectors as the query set
query_ent_indices = list(range(10))
query_ent_vecs = [X[ent_idx] for ent_idx in query_ent_indices]
# Stacking copies the selected rows into a fresh matrix, so the in-place
# normalization below does not touch X.
query_ent_mat = np.stack(query_ent_vecs)
faiss.normalize_L2(query_ent_mat)

topk = 5
# search() yields two matrices with one row per query:
# the similarity scores and the matching row ids in the index
cos_sim, index = vec_index.search(query_ent_mat, topk)
print(f'Similarity by FAISS:\n {cos_sim}')
print(f'Index by FAISS:\n {index}')

Similarity by FAISS:
 [[1.0000001  0.9826269  0.980425   0.97756755 0.9767041 ]
 [1.         0.96249706 0.9542226  0.9423557  0.94022256]
 [1.         0.8458103  0.84088576 0.8378214  0.83243716]
 [1.0000001  0.9761975  0.9707729  0.9688461  0.9074847 ]
 [1.         0.8186028  0.81521285 0.8092427  0.80634975]
 [1.0000001  0.9248386  0.80462617 0.79793155 0.7964016 ]
 [1.         0.973209   0.96717876 0.9661685  0.9611064 ]
 [1.0000001  0.9695602  0.967589   0.90732634 0.8127198 ]
 [1.         0.74222136 0.7317495  0.7272495  0.7266457 ]
 [1.0000001  0.75190324 0.7302896  0.72995096 0.7257391 ]]
Index by FAISS:
 [[      0 1138350  599086 1463561 1807656]
 [      1  179432 1474619 1716895  168821]
 [      2  800835 1643610  841396 1981023]
 [      3  575003  720266  592886 2133432]
 [      4 1621135 1850318   84768 1496129]
 [      5 1413990  605709  938719  798085]
 [      6  634081  890617  414098 1731982]
 [      7 1471507 2059336  107527 1779741]
 [      8 2075453 1434461  755631  4

In [5]:
# Pair every returned row id with its entity name and similarity score,
# keeping one list of (name, score) tuples per query.
res = [
    [(entity_dict[ent_id], score) for ent_id, score in zip(id_row, sim_row)]
    for id_row, sim_row in zip(index, cos_sim)
]

# Report each query: the top hit is shown as the query itself,
# followed by the remaining neighbours.
for neighbours in res:
    print('Query:', neighbours[0][0])
    print(neighbours[1:])
    print()

Query: wd:Q419890
[('wd:Q409192', 0.9826269), ('wd:Q47521603', 0.980425), ('wd:Q47521613', 0.97756755), ('wd:Q47521440', 0.9767041)]

Query: /c/en/xanthiosite/n
[('/c/en/triplite/n', 0.96249706), ('/c/en/liversidgeite/n', 0.9542226), ('/c/en/ulrichite/n', 0.9423557), ('/c/en/dufrenite/n', 0.94022256)]

Query: /c/en/form_of_pasta
[('/c/en/tube_shaped_noodle', 0.8458103), ('/c/en/common_dish', 0.84088576), ('/c/en/food_orignially_from_italy', 0.8378214), ('/c/en/long_noodles', 0.83243716)]

Query: /c/en/stargateverse/n
[('/c/en/shatnerverse/n', 0.9761975), ('/c/en/xenaverse/n', 0.9707729), ('/c/en/disneyverse/n', 0.9688461), ('/c/en/duneverse/n', 0.9074847)]

Query: /c/en/knd_of_insect
[("/c/en/mother's_brother", 0.8186028), ("/c/en/father's_brother", 0.81521285), ('/c/en/extremely_severe_storm', 0.8092427), ('/c/en/so_basic', 0.80634975)]

Query: /c/en/chatfield
[('/c/en/chatfields/n', 0.9248386), ('/c/en/schuldt', 0.80462617), ('/c/en/cousinette', 0.79793155), ('/c/en/zymad', 0.7964016