<a href="https://colab.research.google.com/github/ujjalkumarmaity/NLP/blob/main/NER/Named_Entity_Recognition_with_BERT_wikiann.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install datasets

In [1]:
import transformers
from datasets import load_dataset,load_metric
from transformers import AutoTokenizer,BertForTokenClassification,AdamW
from transformers import Trainer,TrainingArguments
from pprint import pprint
import numpy as np

In [2]:
dataset = load_dataset('wikiann','en')



  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
print(dataset.shape)
print(dataset['train'])
print(dataset['train'][4])

{'validation': (10000, 4), 'test': (10000, 4), 'train': (20000, 4)}
Dataset({
    features: ['tokens', 'ner_tags', 'langs', 'spans'],
    num_rows: 20000
})
{'tokens': ['Her', 'daughter', 'from', 'the', 'second', 'marriage', 'was', 'Marie', "d'Agoult", '(', '1805–1876', ')', ',', 'who', 'in', 'turn', 'gave', 'birth', 'to', 'several', 'children', ',', 'among', 'them—from', 'her', 'liaison', 'to', 'Franz', 'Liszt', '–-', 'Cosima', 'Wagner', '(', '1837–1930', ')', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 2, 0, 0, 0, 0], 'langs': ['en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en'], 'spans': ["PER: Marie d'Agoult", 'PER: Franz Liszt', 'PER: Cosima Wagner']}


In [4]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [5]:
def tokenization(x):
    return tokenizer(x['tokens'],padding='max_length',truncation=True,is_split_into_words=True)
tok_data = dataset.map(tokenization)



Map:   0%|          | 0/10000 [00:00<?, ? examples/s]



In [6]:
# Here input_ids length and ner_tags length not same
len(tok_data['train'][0]['input_ids']),len(tok_data['train'][0]['ner_tags'])

(512, 11)

In [7]:
pprint(dataset["train"][0]['tokens'])
pprint(tokenizer(dataset["train"][0]['tokens'])['input_ids'])

['R.H.',
 'Saunders',
 '(',
 'St.',
 'Lawrence',
 'River',
 ')',
 '(',
 '968',
 'MW',
 ')']
[[101, 1054, 1012, 1044, 1012, 102],
 [101, 15247, 102],
 [101, 1006, 102],
 [101, 2358, 1012, 102],
 [101, 5623, 102],
 [101, 2314, 102],
 [101, 1007, 102],
 [101, 1006, 102],
 [101, 5986, 2620, 102],
 [101, 12464, 102],
 [101, 1007, 102]]


In [8]:
z = tokenizer.batch_encode_plus(dataset['train']["tokens"], is_split_into_words=True, truncation=True)
print(z.word_ids(batch_index=0))
print(z['input_ids'][0])

[None, 0, 0, 0, 0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10, None]
[101, 1054, 1012, 1044, 1012, 15247, 1006, 2358, 1012, 5623, 2314, 1007, 1006, 5986, 2620, 12464, 1007, 102]


In [23]:
def adjusted_tokenized_label(data):
    tokenized_data = tokenizer.batch_encode_plus(data["tokens"], is_split_into_words=True, truncation=True)
    tokenized_label = []
    for ind in range(len(tokenized_data['input_ids'])):
        prev_id = -1
        word_id = tokenized_data.word_ids(batch_index=ind)
        ner_label =  data['ner_tags'][ind]
        temp = []
        i = -1
        for w in word_id:
            if w is None:
                temp.append(-100)
            elif w==prev_id:
                temp.append(ner_label[i])
            else:
                i += 1
                temp.append(ner_label[i])
                prev_id = w
        tokenized_label.append(temp)
    tokenized_data['labels'] = tokenized_label
    return tokenized_data

In [24]:
tokenized_data = dataset.map(adjusted_tokenized_label,batched=True,remove_columns=['tokens', 'ner_tags', 'langs', 'spans'] )

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [25]:
tokenized_data

DatasetDict({
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 20000
    })
})

In [26]:
# Pad token for having same length
from transformers import DataCollatorForTokenClassification
data_collector = DataCollatorForTokenClassification(tokenizer)

In [27]:
label_names = dataset["train"].features["ner_tags"].feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [28]:
model = BertForTokenClassification.from_pretrained('distilbert-base-uncased',num_labels = len(label_names))

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertForTokenClassification: ['distilbert.transformer.layer.3.attention.q_lin.weight', 'distilbert.transformer.layer.4.attention.q_lin.bias', 'distilbert.transformer.layer.4.ffn.lin2.weight', 'distilbert.transformer.layer.5.attention.k_lin.bias', 'distilbert.transformer.layer.1.attention.out_lin.bias', 'distilbert.transformer.layer.1.ffn.lin1.bias', 'distilbert.transformer.layer.2.attention.v_lin.bias', 'distilbert.transformer.layer.0.ffn.lin2.weight', 'distilbert.transformer.layer.3.attention.v_lin.weight', 'distilbert.transformer.layer.2.ffn.lin2.bias', 'vocab_transform.bias', 'distilbert.transformer.layer.5.ffn.lin1.weight', 'distilbert.transformer.layer.5.attention.k_lin.weight', 'distilbert.embeddings.word_embeddings.weight', 'distilbe

In [None]:
!pip install seqeval

In [15]:
# matrics
metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


  metric = load_metric("seqeval")



### Install accelerate
https://discuss.huggingface.co/t/trainingargument-does-not-work-on-colab/43372/2
1. Run `pip install accelerate -U` in a cell
2. In the top menu click Runtime → Restart Runtime
3. Do not rerun any cells with !pip install in them
4. Rerun all the other code cells

In [29]:
# Fine Tuning Model
logging_steps = len(tokenized_data['train']) // 16

training_arg = TrainingArguments(
    output_dir = '/content/',
    num_train_epochs = 5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    evaluation_strategy = 'epoch',
    disable_tqdm = False,
    logging_steps=logging_steps
)

In [30]:
trainer = Trainer(model = model,
                args = training_arg,
                train_dataset=tokenized_data["train"],
                eval_dataset=tokenized_data["validation"],
                data_collator=data_collector,
                tokenizer=tokenizer,
                compute_metrics=compute_metrics
                  )

In [31]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,1.0001,0.746538,0.373526,0.350086,0.361426,0.739795
2,0.5954,0.61901,0.460179,0.50388,0.481039,0.787441
3,0.4166,0.601483,0.519704,0.537851,0.528621,0.810848
4,0.2945,0.602664,0.554131,0.58283,0.568118,0.824803
5,0.2088,0.627321,0.559154,0.602138,0.579851,0.829298


TrainOutput(global_step=6250, training_loss=0.503081162109375, metrics={'train_runtime': 915.3263, 'train_samples_per_second': 109.251, 'train_steps_per_second': 6.828, 'total_flos': 1544456128675680.0, 'train_loss': 0.503081162109375, 'epoch': 5.0})

In [37]:
pred,label,mat = trainer.predict(tokenized_data['test'])

In [39]:
pred = np.argmax(pred, axis=2)
true_pred = [
    [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(pred,label)
]
true_label = [
    [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(pred,label)
]

In [41]:
true_pred[0]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'B-LOC',
 'O',
 'O',
 'O']

In [40]:
true_label[0]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'B-LOC',
 'O',
 'O',
 'O']