<a href="https://colab.research.google.com/github/uk-zash/HuggingFace-Transformers/blob/main/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
!pip install datasets



In [51]:
from datasets import load_dataset

In [52]:
dataset = load_dataset("conll2003")

In [53]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [54]:
dataset["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [55]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")




In [68]:
def tokenize(example):
  tokenize_inputs = tokenizer(
        example['tokens'],
        padding='max_length',  # Pad to the maximum length of the batch
        truncation=True,       # Truncate to the maximum length
        is_split_into_words=True
    )

  labels = []

  for i , label in enumerate(example["ner_tags"]):
    word_ids = tokenize_inputs.word_ids(batch_index = i)
    label_ids = []
    previous_word = None
    for word_idx in word_ids:
      if word_idx is None:
        label_ids.append(-100)
      elif word_idx != previous_word:
        label_ids.append(label[word_idx])
        # print(label_ids)
      else:
        label_ids.append(-100)
      previous_word = word_idx

    labels.append(label_ids)

  tokenize_inputs["labels"] = labels
  return tokenize_inputs






tokenized_data = dataset.map(tokenize , batched = True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [69]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [70]:
tokenized_data["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0],
 'input_ids': [101,
  7270,
  22961,
  1528,
  1840,
  1106,
  21423,
  1418,
  2495,
  12913,
  119,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


In [71]:
len(dataset["train"].features["ner_tags"].feature.names)

9

In [72]:
from transformers import BertForTokenClassification
model = BertForTokenClassification.from_pretrained("bert-base-cased" , num_labels = len(dataset["train"].features["ner_tags"].feature.names))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [83]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir = "./ner_model",
    evaluation_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 32,
    num_train_epochs = 3,
    weight_decay = 0.01,
    push_to_hub = False
)



In [62]:
!pip install evaluate



In [63]:
!pip install seqeval




In [74]:
import numpy as np
# from datasets import load_metric
import evaluate

metric = evaluate.load("seqeval")

label_list = dataset["train"].features["ner_tags"].feature.names

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [label_list[l] for l in label if l != -100]
        for label in labels
    ]
    true_predictions = [
        [label_list[pred] for (pred, label) in zip(prediction, label) if label != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return metric.compute(predictions=true_predictions, references=true_labels)


In [84]:
from transformers import Trainer

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_data["train"],
    eval_dataset = tokenized_data["validation"],
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
    )

In [85]:
trainer.train()

Epoch,Training Loss,Validation Loss,Loc,Misc,Org,Per,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,No log,0.041755,"{'precision': 0.9256933542647828, 'recall': 0.9629831246597713, 'f1': 0.9439701173959446, 'number': 1837}","{'precision': 0.8366445916114791, 'recall': 0.8221258134490239, 'f1': 0.8293216630196938, 'number': 922}","{'precision': 0.9005891016200295, 'recall': 0.9120059656972409, 'f1': 0.9062615783623564, 'number': 1341}","{'precision': 0.9701573521432447, 'recall': 0.9706840390879479, 'f1': 0.9704206241519674, 'number': 1842}",0.920239,0.932009,0.926087,0.988396
2,0.093500,0.037509,"{'precision': 0.960741548527808, 'recall': 0.9591725639629831, 'f1': 0.9599564151457368, 'number': 1837}","{'precision': 0.8280450358239508, 'recall': 0.8774403470715835, 'f1': 0.8520273828330699, 'number': 922}","{'precision': 0.9210134128166915, 'recall': 0.9217002237136466, 'f1': 0.9213566902720834, 'number': 1341}","{'precision': 0.9713668287412209, 'recall': 0.9761129207383279, 'f1': 0.9737340915245059, 'number': 1842}",0.933544,0.943285,0.938389,0.989934
3,0.028500,0.036646,"{'precision': 0.9629831246597713, 'recall': 0.9629831246597713, 'f1': 0.9629831246597712, 'number': 1837}","{'precision': 0.8694736842105263, 'recall': 0.89587852494577, 'f1': 0.8824786324786325, 'number': 922}","{'precision': 0.9108838568298028, 'recall': 0.9299030574198359, 'f1': 0.9202952029520295, 'number': 1341}","{'precision': 0.972885032537961, 'recall': 0.9739413680781759, 'f1': 0.973412913727618, 'number': 1842}",0.939333,0.948502,0.943895,0.990577


Trainer is attempting to log a value of "{'precision': 0.9256933542647828, 'recall': 0.9629831246597713, 'f1': 0.9439701173959446, 'number': 1837}" of type <class 'dict'> for key "eval/LOC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.8366445916114791, 'recall': 0.8221258134490239, 'f1': 0.8293216630196938, 'number': 922}" of type <class 'dict'> for key "eval/MISC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9005891016200295, 'recall': 0.9120059656972409, 'f1': 0.9062615783623564, 'number': 1341}" of type <class 'dict'> for key "eval/ORG" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9701573521432447, 'recall': 0.9706840390879479

TrainOutput(global_step=1317, training_loss=0.05067843239508349, metrics={'train_runtime': 4242.6806, 'train_samples_per_second': 9.928, 'train_steps_per_second': 0.31, 'total_flos': 1.1007299854181376e+16, 'train_loss': 0.05067843239508349, 'epoch': 3.0})

In [140]:
input = "Welcome to America , we will meet at 2am in Financial Apartment"

tokenized_input = tokenizer(input , return_tensors = "pt")

In [141]:
tokenizer.tokenize(input)

['Welcome',
 'to',
 'America',
 ',',
 'we',
 'will',
 'meet',
 'at',
 '2',
 '##am',
 'in',
 'Financial',
 'Apart',
 '##ment']

In [142]:

tokenized_input

{'input_ids': tensor([[  101, 12050,  1106,  1738,   117,  1195,  1209,  2283,  1120,   123,
          2312,  1107,  7748, 10342,  1880,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [143]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [144]:
tokenized_input = {key: value.to(device) for key, value in tokenized_input.items()}


In [145]:
output = model(**tokenized_input)

In [146]:
output

TokenClassifierOutput(loss=None, logits=tensor([[[ 8.8921, -1.1858, -1.6924, -0.7534, -1.7402, -0.8299, -1.8654,
          -0.4262, -1.6921],
         [ 7.8770, -0.6384, -2.1359,  0.1926, -2.2867, -0.0311, -2.3004,
          -0.6461, -1.9000],
         [ 9.1592, -1.3372, -2.0376, -1.4427, -1.0769, -1.4987, -1.1165,
          -1.6187, -1.2330],
         [-0.5665, -1.3176, -1.9163, -0.9842, -1.3861,  7.0135,  0.1725,
          -0.4729, -1.5798],
         [ 9.7646, -1.1794, -1.7494, -1.4484, -1.0442, -1.3602, -1.2440,
          -1.4432, -1.5818],
         [ 9.6575, -1.1239, -1.8639, -1.2505, -1.2255, -1.0884, -1.4050,
          -1.4049, -1.8750],
         [ 9.7936, -1.1120, -1.9515, -1.3908, -1.0437, -1.4131, -1.3461,
          -1.2127, -1.4562],
         [ 9.6656, -1.0401, -1.8948, -1.4726, -1.0367, -1.2072, -1.5202,
          -1.4056, -1.4030],
         [ 9.7392, -0.9472, -1.7869, -1.5958, -1.1705, -1.4179, -1.3617,
          -1.4075, -1.6114],
         [ 8.2752, -0.8020, -2.0745, -0.85

In [147]:
import torch.nn.functional as F

# Apply softmax to the logits
probs = F.softmax(output.logits, dim=-1)


In [148]:
# Get the predicted class for each token
predicted_classes = torch.argmax(output.logits, dim=-1)


In [149]:
predicted_classes

tensor([[0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 6, 0]], device='cuda:0')

In [150]:
class_label = dataset["train"].features["ner_tags"].feature.names

In [151]:
class_label

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [152]:
label_map = {0: "O", 1: "B-ORG", 2: "I-ORG", 3: "B-PER", 4: "I-PER", 5: "B-LOC", 6: "I-LOC"}


In [153]:
predicted_labels = [label_map[int(cls)] for cls in predicted_classes[0]]
print(predicted_labels)


['O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O']
