In [None]:
!pip3 install transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import torch

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 12.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 48.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 60.4 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 68.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 5.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    F

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Maps a predicted class id to its BIO tag. The order is intended to follow the
# label set of the checkpoint used below (dslim/bert-base-NER), whose
# miscellaneous-entity tags are spelled "MISC" (not "MIS").
ner_dict = dict(enumerate([
    'O',
    'B-MISC', 'I-MISC',
    'B-PER', 'I-PER',
    'B-ORG', 'I-ORG',
    'B-LOC', 'I-LOC',
]))

In [None]:
# Central run configuration read by the dataset, model and inference cells.
params = dict(
    debug=False,
    checkpoint='dslim/bert-base-NER',  # Hugging Face model id
    max_len=256,                       # token length every sentence is padded/truncated to
    batch_size=32,
    device=device,
)

In [None]:
# Example sentences to run the tagger on.
text = [
    "My name is New York and I live in Berlin",
    "I am going to visit Washington DC",
]

In [None]:
class NerDataset(Dataset):
    """Map-style dataset that tokenises raw sentences for token classification.

    Args:
        text: Indexable collection of sentences; each item is converted with
            ``str()`` before tokenisation.
        max_len: Length every encoded sentence is padded or truncated to.
        checkpoint: Identifier passed to ``AutoTokenizer.from_pretrained``.

    Each item is a dict with two ``torch.long`` tensors of length ``max_len``:
    ``'ids'`` (input ids) and ``'mask'`` (attention mask).
    """

    def __init__(self, text, max_len=params['max_len'], checkpoint=params['checkpoint']):
        self.text = text
        self.max_len = max_len
        self.checkpoint = checkpoint
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.num_examples = len(self.text)

    def __len__(self):
        """Return the number of sentences."""
        return self.num_examples

    def __getitem__(self, idx):
        """Tokenise sentence ``idx`` and return its ids and attention mask."""
        sentence = str(self.text[idx])

        encoded = self.tokenizer(
            sentence,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_attention_mask=True,
            # Segment ids are not part of the returned item, so skip building them.
            return_token_type_ids=False,
        )

        return {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
        }

In [None]:
class NerModel:
    """Holds a pretrained token-classification network and the id it came from.

    Args:
        checkpoint: Identifier passed to
            ``AutoModelForTokenClassification.from_pretrained``.

    Attributes:
        checkpoint: The identifier that was loaded.
        model: The loaded model instance.
    """

    def __init__(self, checkpoint=params['checkpoint']):
        self.checkpoint = checkpoint
        self.model = AutoModelForTokenClassification.from_pretrained(checkpoint)

In [None]:
# Load the pretrained tagger and place it on the configured device in one step.
# (Custom weights could be restored before the move with
#  model.load_state_dict(torch.load(model_name)).)
model = NerModel().model.to(params['device'])

Downloading:   0%|          | 0.00/829 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/413M [00:00<?, ?B/s]

In [None]:
# Switch off dropout etc. for deterministic inference.
model.eval()

test_dataset = NerDataset(
    text=text
)

test_loader = DataLoader(
    test_dataset,
    batch_size=params['batch_size'],
    shuffle=False,
    # Pinned host memory only helps host->GPU copies; enabling it without CUDA
    # just triggers a warning.
    pin_memory=torch.cuda.is_available(),
)

temp_preds = None  # not read in this cell; kept in case another cell relies on the name
# One list of predicted class ids per sentence, each of length max_len
# (position 0 is the leading special token, trailing positions are padding).
l = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Predicting. '):
        ids = batch['ids'].to(device)
        mask = batch['mask'].to(device)

        # Sentences are padded to max_len, so the attention mask must be passed;
        # otherwise the model treats every [PAD] position as real input.
        predictions = model(ids, attention_mask=mask)

        logits = predictions.logits.detach().cpu().numpy()
        # argmax over the class axis -> (batch, seq_len) array of class ids
        l.extend([list(p) for p in np.argmax(logits, axis=2)])

print(*l, sep='\n')


Predicting. : 100%|██████████| 1/1 [00:04<00:00,  4.28s/it]

[0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,




# **Inference**

In [None]:
# Print one (word, tag) pair per whitespace-separated word of every sentence.
# A word may be split into several word-pieces, so predictions cannot be paired
# with words by position. Instead each sentence is re-tokenised as pre-split
# words and word_ids() is used to find the first word-piece of every word; that
# piece's prediction is reported as the word's tag.
# NOTE(review): this assumes tokenising pre-split words yields the same
# word-pieces as tokenising the raw sentence (expected for BERT tokenizers) and
# that the tokenizer is a "fast" one, which word_ids() requires - confirm.
for sentence, label_ids in zip(text, l):
    words = sentence.split()
    encoding = test_dataset.tokenizer(
        words,
        is_split_into_words=True,
        add_special_tokens=True,
        truncation=True,
        max_length=test_dataset.max_len,
    )

    tagged_words = []
    seen_words = set()
    for position, word_index in enumerate(encoding.word_ids()):
        # None marks special tokens; later pieces of an already-seen word are skipped.
        if word_index is None or word_index in seen_words:
            continue
        seen_words.add(word_index)
        tagged_words.append((words[word_index], ner_dict[label_ids[position]]))

    print(*tagged_words, end='\n\n', sep='\n')

('My', 'O')
('name', 'O')
('is', 'O')
('New', 'B-LOC')
('York', 'I-LOC')
('and', 'O')
('I', 'O')
('live', 'O')
('in', 'O')
('Berlin', 'B-LOC')

('I', 'O')
('am', 'O')
('going', 'O')
('to', 'O')
('visit', 'O')
('Washington', 'B-LOC')
('DC', 'I-LOC')

