In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell executes, so edits to local package sources
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from canlpy.core.models.knowbert.model import KnowBert
from canlpy.core.util.knowbert_tokenizer.tokenizer import KnowBertBatchifier
from canlpy.core.util.knowbert_tokenizer.vocabulary import Vocabulary

# Remote locations of the pretrained KnowBert model archives (.tar.gz).
# NOTE(review): the name mentions TACRED, but the URL targets the generic
# wiki+wordnet model archive -- confirm the intended archive. Not used in the
# cells visible here.
TACRED_WORDNET_WIKI = "https://allennlp.s3-us-west-2.amazonaws.com/knowbert/models/knowbert_wiki_wordnet_model.tar.gz"
# WordNet-only model archive (not used in the cells visible here).
WORDNET_ARCHIVE = "https://allennlp.s3-us-west-2.amazonaws.com/knowbert/models/knowbert_wordnet_model.tar.gz"
# Wikipedia-only model archive; passed to KnowBert.from_pretrained below.
WIKI_ARCHIVE = "https://allennlp.s3-us-west-2.amazonaws.com/knowbert/models/knowbert_wiki_model.tar.gz"

# Entity vocabulary archive for the Wikipedia entity linker; passed to
# Vocabulary.from_files below.
WIKI_VOCAB_FILE = "https://allennlp.s3-us-west-2.amazonaws.com/knowbert/wiki_entity_linking/vocabulary_wiki.tar.gz"

In [3]:
# Load the pretrained Wikipedia KnowBert model from its remote archive.
wiki_model = KnowBert.from_pretrained(WIKI_ARCHIVE)

# The entity vocabulary is handed to the batchifier so that it can build the
# candidate-entity inputs for the model.
entity_vocabulary = Vocabulary.from_files(WIKI_VOCAB_FILE)
wiki_batcher = KnowBertBatchifier.get_wiki_batchifier(
    entity_vocab=entity_vocabulary,
)

Reading pretrained embeddings from file


470105it [00:49, 9466.95it/s] 


duplicate_mentions_cnt:  6777
end of p_e_m reading. wall time: 1.219716235001882  minutes
p_e_m_errors:  0
incompatible_ent_ids:  0


In [5]:
# Sample inputs in the different shapes seen in this notebook: a bare string,
# or a two-element list (presumably a sentence-A / sentence-B pair -- TODO confirm).
# NOTE(review): `sentences` is not consumed in this cell; only `sentence` is batched.
sentences = [
    ["Paris is located in [MASK].", "Michael [MASK] is a great music singer"],
    ["The Louvre contains the Mona Lisa", "The Amazon river is in Brazil"],
    "Donald Duck is a cartoon character",
    ["Hayao Miyazaki is the co-founder of Studio Ghibli and a renowned anime film maker",
     "The Alpine ibex is one of Switzerland's most famous animal along its grazing cows"],
]

sentence = ["Paris is located in [MASK]."]

# Inference-only walkthrough: put the model in evaluation mode (disables
# dropout-style layers) so that repeated runs give the same result.
wiki_model.eval()

for batch in wiki_batcher.iter_batches(sentence, verbose=False):

    print("\nInput\n")
    # tokens: tensor of token indices (used to index an embedding). Because a
    # batch holds sentences with different token counts, every row is padded with zeros.
    # shape: (batch_size (#sentences), max_seq_len)
    print(f"Tokens shape {batch['tokens']['tokens'].shape}")
    print(f"Tokenized sentence {batch['tokens']['tokens']}")
    # segment_ids: 0 for the first segment and 1 for the second (usable for NSP).
    # shape: (batch_size, max_seq_len)
    print(f"Segment ids shape: {batch['segment_ids'].shape}")

    # candidates: one entry per knowledge base (KB), holding the entities
    # detected in the text with that KB.
    for kb_key in batch['candidates'].keys():
        kb_input = batch['candidates'][kb_key]

        # For each detected mention, a list of candidate KB entities.
        # Priors: correctness probabilities estimated by the entity linker
        # (sum to 1 on axis 2, or to 0 for padding).
        # Axis 1 is zero-padded when a sentence has fewer detected mentions than the batch maximum;
        # axis 2 is zero-padded when a mention has fewer candidates than the batch maximum.
        # shape: (batch_size, max # detected entities, max # KB candidate entities)
        print(f"Candidate entity_priors shape: {kb_input['candidate_entity_priors'].shape}")
        # Ids of the candidate KB entities, zero-padded on axis 1 or 2 where needed.
        # shape: (batch_size, max # detected entities, max # KB candidate entities)
        print(f"Candidate entities ids shape: {kb_input['candidate_entities']['ids'].shape}")
        # Token spans of each detected mention, e.g. [1, 2] for "Michael Jackson"
        # (both bounds inclusive); padded with [-1, -1] when there are no more mentions.
        # shape: (batch_size, max # detected entities, 2)
        print(f"Candidate span shape: {kb_input['candidate_spans'].shape}")

        # Segment id that each detected mention belongs to.
        # shape: (batch_size, max # detected entities)
        print(f"Candidate segments_ids shape: {kb_input['candidate_segment_ids'].shape}")

    # No gradients are needed for this demo; skipping autograd bookkeeping
    # saves memory and time.
    with torch.no_grad():
        model_output = wiki_model(**batch)

    print("\nOutput\n")
    for kb_key in model_output.keys():
        # Only the per-KB entries carry linking information; other keys
        # (loss, pooled_output, ...) are reported further down.
        if kb_key in ('wiki', 'wordnet'):
            kb_output = model_output[kb_key]
            # Span-attention scores for this KB.
            # shape: (batch_size, ?, max_seq_len, max # detected entities)
            # (second axis is presumably the number of attention heads -- TODO confirm)
            print(f"Output {kb_key} entity_attention_probs shape: {kb_output['entity_attention_probs'].shape}")
            # Entity-linker score for every (mention, candidate KB entity) pair;
            # padded with -1.0000e+04 where there is no score.
            # shape: (batch_size, max # detected entities, max # KB candidate entities)
            print(f"Output {kb_key} linking_scores shape: {kb_output['linking_scores'].shape}")

    # Scalar loss over this batch (presumably 0 when no labels are supplied -- TODO confirm).
    print(f"Output loss: {model_output['loss']}")

    # Final [CLS] embedding for each sentence of the batch.
    # shape: (batch_size, hidden_size)
    print(f"Pooled output shape: {model_output['pooled_output'].shape}")
    # Final embedding of every token.
    # NOTE: embeddings are also produced for zero-padded positions; ignore them downstream.
    # TODO confirm that padding id 0 is never treated as a [MASK] token by the model.
    print(f"Contextual embeddings: {model_output['contextual_embeddings'].shape}")
    



Input

Tokens shape torch.Size([1, 8])
Tokenized sentence tensor([[ 101, 3000, 2003, 2284, 1999,  103, 1012,  102]])
Segment ids shape: torch.Size([1, 8])
Candidate entity_priors shape: torch.Size([1, 3, 30])
Candidate entities ids shape: torch.Size([1, 3, 30])
Candidate span shape: torch.Size([1, 3, 2])
Candidate segments_ids shape: torch.Size([1, 3])

Output

Output wordnet entity_attention_probs shape: torch.Size([1, 4, 8, 3])
Output wordnet linking_scores shape: torch.Size([1, 3, 30])
Output loss: 0.0
Pooled output shape: torch.Size([1, 768])
Contextual embeddings: torch.Size([1, 8, 768])
