In [1]:
import numpy as np
d = 768                           # dimension
nb = 1_000_000                      # database size
nq = 10                       # number of queries
np.random.seed(1234)             # make reproducible
# NOTE: np.random.random returns float64, so this line transiently holds
# nb * d * 8 bytes (~6.1 GB) before the float32 copy (~3.1 GB) is produced.
xb = np.random.random((nb, d)).astype('float32')
# Shift the first coordinate of row i by i / 1000, so the first coordinate
# grows with the row number.
xb[:, 0] += np.arange(nb) / 1000.
# The queries are drawn after xb from the same seeded global stream, so the
# statement order in this cell determines their values.
xq = np.random.random((nq, d)).astype('float32')
xq[:, 0] += np.arange(nq) / 1000.

In [2]:
import faiss                   # make faiss available
index = faiss.IndexFlatL2(d)   # exact (brute-force) L2 index over d-dimensional vectors
print(index.is_trained)        # prints True in the recorded run, before any vectors are added
index.add(xb)                  # add vectors to the index
print(index.ntotal)            # number of vectors stored in the index so far

True
1000000


In [3]:
k = 4                          # we want to see 4 nearest neighbors
D, I = index.search(xb[:5], k) # sanity check: query with the first 5 database vectors
print(I)                       # each row should start with its own row number...
print(D)                       # ...at distance 0 (FAISS reports squared L2 distances)
D, I = index.search(xq, k)     # actual search; rebinds D and I to the query results
print(I[:5])                   # neighbors of the 5 first queries
print(I[-5:])                  # neighbors of the 5 last queries

[[   0   64  802  403]
 [   1  529   23 1013]
 [   2  295 1736  336]
 [   3  193   94 1152]
 [   4 2081   20 1495]]
[[  0.       115.78073  116.944664 117.295944]
 [  0.       109.616135 111.056984 113.481445]
 [  0.       112.658844 115.38965  116.00951 ]
 [  0.       113.80682  116.79196  116.82472 ]
 [  0.       110.88393  111.97954  112.40068 ]]
[[ 274  719 2145  472]
 [ 298  634  938  206]
 [ 299  447  979  652]
 [ 395  548  111  715]
 [ 284 1789  409  247]]
[[ 369  587  312 1715]
 [1309   39  529   34]
 [ 223  191  634  944]
 [ 579  772  777  391]
 [1495  731  507 2457]]


# Apply to geocoding

In [35]:
import torch

def get_entity_embedding(entity: str):
    """Embed a piece of text as the mean of the model's final hidden states.

    Uses the notebook-level ``tokenizer`` and ``model`` objects, which must
    exist by the time this function is called.

    Args:
        entity: Text to embed, e.g. a place name.

    Returns:
        ``last_hidden_state`` averaged over dim 1. Every position the
        tokenizer produced is included in the average. For a single string
        this is presumably a (1, hidden_size) tensor -- callers index ``[0]``.
    """
    encoded = tokenizer(entity, return_tensors="pt")
    # Inference only, so autograd tracking is switched off for the forward pass.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    return hidden_states.mean(dim=1)

In [34]:
import xml.etree.ElementTree as ET
from transformers import AutoTokenizer, AutoModel
import numpy as np
# Accumulators filled by func(): node_info[i] and embeddings[i] describe the same OSM node.
node_info = []
embeddings = []

# Tokenizer and encoder are loaded from the same checkpoint.
MODEL_NAME = "tohoku-nlp/bert-base-japanese-whole-word-masking"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Parse the OSM extract once; func() walks its <node> elements.
input_file = "../data/nodes_with_names_no_duplicates.osm"
tree = ET.parse(input_file)

In [None]:

def func():
    """Embed the name of every OSM ``<node>`` and record it next to the node.

    Reads the module-level ``tree`` and appends to the module-level lists
    ``node_info`` (the node element) and ``embeddings`` (the L2-normalised
    name vector as a NumPy array). Entry ``i`` of both lists always refers to
    the same node, which is what lets an index row id be mapped back to a node.

    Nodes without a ``name`` tag are skipped. Because the function only
    appends, calling it twice on the same lists duplicates every entry.

    Prints the running count after every 10,000 embedded nodes.
    """
    root = tree.getroot()
    processed = 0
    for node in root.findall('node'):
        name_tag = node.find('tag[@k="name"]')
        if name_tag is None:
            # Nothing to embed; skipping (rather than crashing on None)
            # keeps node_info and embeddings the same length.
            continue
        name = name_tag.attrib.get('v', '')
        vector = get_entity_embedding(name)[0].numpy()
        normalized_embedding = vector / np.linalg.norm(vector)
        # Append to both lists only once the embedding has succeeded, so an
        # exception above can never leave node_info one entry ahead.
        node_info.append(node)
        embeddings.append(normalized_embedding)
        processed += 1
        if processed % 10_000 == 0:
            print(processed)
# Fill node_info / embeddings; this runs one model forward pass per <node> in the parsed file.
func()

In [16]:
# Stack the per-node vectors into a single ndarray for FAISS.
# This rebinds `embeddings` from a list to an ndarray, so func() (which calls
# embeddings.append) cannot be re-run afterwards unless the list is reset first.
embeddings = np.array(embeddings)

In [ ]:
# save to pkl
import pickle
# Write each artefact to its own pickle file, embeddings first.
for pkl_path, payload in (
    ("../data/embeddings.pkl", embeddings),
    ("../data/node_info.pkl", node_info),
):
    with open(pkl_path, "wb") as f:
        pickle.dump(payload, f)

In [17]:
# import line_profiler
# 
# 
# 
# profiler = line_profiler.LineProfiler()
# profiler.add_function(func)
# profiler.run('func()')
# profiler.print_stats()

In [18]:
import faiss                   # make faiss available
d = 768                        # vector width; must equal the row width of `embeddings`
# Exact inner-product index. The rows were scaled to unit length in func(),
# so the inner product equals cosine similarity. This rebinds `index`,
# replacing the L2 demo index built earlier in the notebook.
index = faiss.IndexFlatIP(d)   # build the index
print(index.is_trained)
index.add(embeddings)                  # add vectors to the index
print(index.ntotal)            # number of vectors now stored

True
16


In [20]:
# Display names in the same order as node_info, so an index row id can be
# used directly as a position in this list.
node_names = [
    osm_node.find('tag[@k="name"]').attrib.get('v', '')
    for osm_node in node_info
]

In [31]:
# Embed the query text and scale it to unit length, the same preparation
# applied to the indexed vectors.
entity = "台場"
query_tensor = get_entity_embedding(entity)
entity_embedding = query_tensor.numpy() / np.linalg.norm(query_tensor)

In [32]:
k = 3                                     # number of neighbours to retrieve
D, I = index.search(entity_embedding, k)  # D: inner-product scores, I: row ids in the index
print(I)
print(D)

[[ 0 15  1]]
[[0.9999999  0.82408404 0.81893873]]


In [33]:
# Print each hit's name with its similarity score, in the order FAISS returned them.
# FAISS fills unused result slots with id -1 when fewer than k vectors are
# available. Those slots are skipped, because node_names[-1] would otherwise
# silently print the last node's name as if it were a match.
for node_id, score in zip(I[0][:k], D[0][:k]):
    if node_id < 0:
        continue
    print(f"{node_names[node_id]}: {score}")
    

台場: 0.9999998807907104
新橋: 0.8240840435028076
芝浦ふ頭: 0.8189387321472168
