# Named Entity Recognition

In [8]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch


# Load the pretrained NER checkpoint together with its tokenizer, then pick
# the device that inference will run on.

model_id = "dslim/bert-base-NER"

tokenizer = AutoTokenizer.from_pretrained(model_id)
ner_model = AutoModelForTokenClassification.from_pretrained(model_id)

# Use the index of the current CUDA device when a GPU is available;
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.cuda.current_device()
else:
    device = "cpu"


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
from transformers import pipeline

# Wrap the model and tokenizer in a ready-to-call NER pipeline.
# With an aggregation strategy set, results come back as grouped entities
# (one `entity_group` entry per span) rather than one entry per token.
nlp = pipeline(
    task="ner",
    model=ner_model,
    tokenizer=tokenizer,
    aggregation_strategy="max",
    device=device,
)

Device set to use cpu


In [16]:
# Baseline: an English sentence. In the recorded output the pipeline finds
# one PER group (the name) and two LOC groups (city and country).
nlp("My name is Vinicius Finger and I live in Canoas, Brazil.")

[{'entity_group': 'PER',
  'score': 0.99606293,
  'word': 'Vinicius Finger',
  'start': 11,
  'end': 26},
 {'entity_group': 'LOC',
  'score': 0.9980527,
  'word': 'Canoas',
  'start': 41,
  'end': 47},
 {'entity_group': 'LOC',
  'score': 0.9995635,
  'word': 'Brazil',
  'start': 49,
  'end': 55}]

In [18]:
# The same sentence in Portuguese gives a different result: in the recorded
# output the person name is not detected and the verb "moro" is labelled LOC.
# NOTE(review): presumably because this checkpoint was trained on English
# text only — confirm against the model card.
nlp("Eu sou o Vinicius Finger e moro em Canoas, Brasil.")

[{'entity_group': 'LOC',
  'score': 0.93247026,
  'word': 'moro',
  'start': 27,
  'end': 31},
 {'entity_group': 'LOC',
  'score': 0.9914705,
  'word': 'Canoas',
  'start': 35,
  'end': 41},
 {'entity_group': 'LOC',
  'score': 0.9995523,
  'word': 'Brasil',
  'start': 43,
  'end': 49}]