In [1]:

import json
import torch
from blink.biencoder.biencoder import BiEncoderRanker, load_biencoder, BertEncoder
from pytorch_transformers.modeling_bert import (
    BertPreTrainedModel,
    BertConfig,
    BertModel,
)
from pytorch_transformers.tokenization_bert import BertTokenizer


# Paths to the bi-encoder's JSON parameter file and its weights file.
biencoder_config = "models/biencoder_wiki_large.json"
biencoder_model = "models/biencoder_wiki_large.bin"

# Parse the parameter file, then record the weights path in the same dict
# under "path_to_model" so later code can read it from one place.
with open(biencoder_config) as config_file:
    biencoder_params = json.load(config_file)
biencoder_params["path_to_model"] = biencoder_model

# candidate encoder
# Only the candidate-side encoder is rebuilt here; the alternative of loading the
# whole bi-encoder is kept for reference:
# biencoder = load_biencoder(biencoder_params)
cand_bert = BertModel.from_pretrained(biencoder_params['bert_model'])
cand_encoder = BertEncoder(
    cand_bert,
    biencoder_params["out_dim"],
    layer_pulled=biencoder_params["pull_from_layer"],
    add_linear=biencoder_params["add_linear"],
)

# map_location="cpu" lets the checkpoint load on machines without a GPU even if
# it was saved from CUDA tensors; the encoder itself is never moved off the CPU here.
state_dict = torch.load(biencoder_params["path_to_model"], map_location="cpu")

# Keep only the candidate encoder's weights and strip the prefix so the keys
# match the standalone BertEncoder's own parameter names.
cand_prefix = "cand_encoder."
cand_encoder.load_state_dict(
    {
        key[len(cand_prefix):]: value
        for key, value in state_dict.items()
        if key.startswith(cand_prefix)
    }
)

# eval() switches the module AND all of its sub-modules to inference mode.
# Assigning `.training = False` only changes the top-level flag, which leaves
# the dropout layers inside BERT active and makes the embeddings non-deterministic.
cand_encoder.eval()

# Tokenizer for the BERT checkpoint named in the parameter file; lower-casing
# follows the parameter file's "lowercase" entry.
tokenizer = BertTokenizer.from_pretrained(
    biencoder_params["bert_model"],
    do_lower_case=biencoder_params["lowercase"],
)


100%|██████████| 434/434 [00:00<00:00, 118595.87B/s]
100%|██████████| 1344997306/1344997306 [03:10<00:00, 7074277.15B/s] 
100%|██████████| 231508/231508 [00:01<00:00, 203690.74B/s]


In [2]:
from blink.biencoder.data_process import get_candidate_representation
from blink.biencoder.biencoder import to_bert_input

# Load the title -> description mapping. JSON text is UTF-8 by specification, so
# the encoding is set explicitly instead of relying on the platform default.
with open("title2desc.json", "r", encoding="utf-8") as f:
    title2desc = json.load(f)

# Encode every (title, description) pair with the candidate encoder and keep
# the resulting vector on the CPU, keyed by title.
title2vecs = {}
# Inference only: no_grad() avoids building an autograd graph for each forward
# pass, and the returned tensors carry no gradient history.
with torch.no_grad():
    for title, desc in title2desc.items():
        print(title)
        tokens = get_candidate_representation(
            desc, tokenizer, biencoder_params["max_cand_length"], title
        )

        # Add a leading batch dimension of size 1 to the token ids.
        token_ids = torch.tensor(tokens['ids'], dtype=torch.long).unsqueeze(0)
        # NOTE(review): the second argument (0) is presumably the padding/null
        # token id used to build the mask — confirm against to_bert_input.
        token_idx_cands, segment_idx_cands, mask_cands = to_bert_input(token_ids, 0)
        embedding_cands = cand_encoder(
            token_idx_cands, segment_idx_cands, mask_cands
        )

        # Drop the size-1 dimensions so each title maps to a flat vector.
        title2vecs[title] = embedding_cands.to("cpu").squeeze()

# cos = torch.nn.CosineSimilarity(dim=0)
# 
# e1 = title2vecs['Laptop']
# e2 = title2vecs['Mouse']
# e3 = title2vecs['Cup']
# e4 = title2vecs['Elephant']
# e5 = title2vecs['Burger']
# 
# print(cos(torch.tensor(e1), torch.tensor(e2)).item())
# print(cos(torch.tensor(e1), torch.tensor(e3)).item())
# print(cos(torch.tensor(e1), torch.tensor(e4)).item())
# print(cos(torch.tensor(e1), torch.tensor(e5)).item())

# 0.8001438975334167
# 0.7863001823425293
# 0.7561507821083069
# 0.7998831868171692


Keyboard
Mouse
Laptop
Desktop


In [15]:
import torch
# Reload the saved title -> embedding mapping; as the last expression of the
# cell, the loaded object is what gets displayed.
# NOTE(review): torch.load unpickles its input — only load files from a trusted source.
# NOTE(review): no visible cell writes this file; presumably it was saved from
# `title2vecs` — confirm, otherwise a fresh Restart & Run All cannot reproduce it.
semantics_path = 'semantics_blink.pt'
torch.load(semantics_path)

{'Keyboard': tensor([ 0.0187, -0.1631,  0.1399,  ...,  0.0399, -0.2025,  0.1039]),
 'Mouse': tensor([ 0.4275, -0.1971, -0.3478,  ..., -0.2199, -0.3707,  0.3632]),
 'Laptop': tensor([ 0.1120, -0.5406, -0.2654,  ..., -0.1065, -0.3009,  0.0345]),
 'Desktop': tensor([-0.0888, -0.1752,  0.0979,  ...,  0.0107, -0.3411,  0.0434])}