In [1]:
import torch
import transformers
import numpy as np

In [2]:
# Prefer a CUDA device when one is visible; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_gpu = torch.cuda.device_count()
# Querying the device name raises when no CUDA device is usable, so only ask
# for it on a GPU host; the cell still ends in an expression so the notebook
# displays which backend was selected.
torch.cuda.get_device_name(0) if device.type == 'cuda' else 'cpu'

'GeForce 940MX'

In [31]:
# Label vocabulary of the NER model: tag name -> class index.
tag2idx = {
    'B-art': 13, 'B-eve': 4, 'B-geo': 8, 'B-gpe': 7, 'B-nat': 10, 'B-org': 12,
    'B-per': 2, 'B-tim': 1,
    'I-art': 14, 'I-eve': 0, 'I-geo': 16, 'I-gpe': 3, 'I-nat': 5, 'I-org': 9,
    'I-per': 11, 'I-tim': 15,
    'O': 6, 'PAD': 17,
}

# Rebuild the mapping ordered by class index so that the position of a tag in
# `tag_values` equals its index (the indices above are exactly 0..17).
tag2idx = dict(sorted(tag2idx.items(), key=lambda pair: pair[1]))
tag_values = list(tag2idx)

In [32]:
# Restore the fine-tuned tokenizer and token-classification model from the
# directories they were saved to (paths are relative to this notebook).
tokenizer = transformers.BertTokenizer.from_pretrained(
    '../../Resources/models/ner/bert_ner_tokenizer'
)
model = transformers.BertForTokenClassification.from_pretrained(
    '../../Resources/models/ner/bert_ner_model'
)

In [33]:
# Move the weights to the device chosen in the setup cell rather than forcing
# CUDA, so the notebook also runs on CPU-only hosts. `.to()` returns the module,
# so the cell still displays the model architecture.
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [40]:
def ner(sentence, tokenizer, model, labels=None):
    """Tag the words of ``sentence`` with a token-classification model.

    The sentence is encoded with ``tokenizer``, pushed through ``model`` and
    the highest-scoring label is picked for every token. WordPiece
    continuation pieces (tokens starting with ``##``) are glued back onto the
    preceding token, and the merged word keeps the label predicted for its
    first piece. Every ``label<TAB>token`` pair is printed.

    Args:
        sentence: Raw text to analyse.
        tokenizer: Object providing ``encode(text)`` and
            ``convert_ids_to_tokens(ids)``.
        model: ``torch.nn.Module`` whose first output element holds the scores
            indexed as ``[batch, token, label]``.
        labels: Optional sequence mapping a label index to its tag name.
            Defaults to the notebook-level ``tag_values`` list.

    Returns:
        A ``(indices, tokens)`` tuple: ``tokens`` is the list of merged
        tokens and ``indices`` the positions in that list whose predicted
        label is not ``'O'``.
    """
    if labels is None:
        # Backward-compatible default: the label list defined in the notebook.
        labels = tag_values

    # Build the batch on whatever device the model already lives on instead of
    # assuming CUDA, so the function works for CPU and GPU models alike.
    model_device = next(model.parameters()).device
    input_ids = torch.tensor([tokenizer.encode(sentence)], device=model_device)

    with torch.no_grad():
        scores = model(input_ids)[0]
    # Best label per token of the single sentence in the batch, as Python ints.
    label_indices = scores.argmax(dim=2)[0].tolist()

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
    new_tokens, new_labels = [], []
    for token, label_idx in zip(tokens, label_indices):
        if token.startswith("##") and new_tokens:
            # Continuation piece: append it (without the marker) to the
            # previous token and keep that token's label.
            new_tokens[-1] = new_tokens[-1] + token[2:]
        else:
            new_labels.append(labels[label_idx])
            new_tokens.append(token)

    for token, label in zip(new_tokens, new_labels):
        print("{}\t{}".format(label, token))

    return [i for i, lab in enumerate(new_labels) if lab != 'O'], new_tokens

In [41]:
# Demo input for the tagger (the surrounding spaces are part of the string).
sentence = " Google recently launched a new Pixel device, unveiled on stage by CEO Sundar Pichai "

In [43]:
# Tag the demo sentence: `idx` receives the positions of the non-'O' tokens,
# `tokens` the merged token list; the per-token labels are printed by `ner`.
idx, tokens = ner(sentence, tokenizer=tokenizer, model=model)

O	[CLS]
B-org	Google
O	recently
O	launched
O	a
O	new
B-org	Pixel
O	device
O	,
O	unveiled
O	on
O	stage
O	by
O	CEO
B-per	Sundar
I-per	Pichai
O	[SEP]


In [44]:
# Positions, within `tokens`, of every token whose predicted label is not 'O'.
idx

[1, 6, 14, 15]

In [46]:
# Look up each flagged position in the merged token list and echo its text.
for i in idx:
    entity_token = tokens[i]
    print(entity_token)

Google
Pixel
Sundar
Pichai
