<a href="https://colab.research.google.com/github/ujjalkumarmaity/NLP/blob/main/Huggingface-NLP-Course%20/7-Token_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook we will implement **NER** (named entity recognition), a token classification task.



In [1]:
%%capture
!pip install datasets seqeval evaluate transformers[torch] accelerate

### Prepare Data

In [None]:
from datasets import load_dataset
# Load the "conll2003" dataset; its splits are accessed by name in later cells (e.g. data['train']).
data = load_dataset("conll2003")
# Bare expression so the notebook displays the loaded object.
data

In [3]:
# Show the training example at index 2 (its tokens and the tag ids attached to them).
data['train'][2]

{'id': '2',
 'tokens': ['BRUSSELS', '1996-08-22'],
 'pos_tags': [22, 11],
 'chunk_tags': [11, 12],
 'ner_tags': [5, 0]}

In [4]:
# Show the feature schema of the train split, including the class names behind each tag id.
data["train"].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [5]:
# Load the tokenizer that belongs to the 'bert-base-cased' checkpoint.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
# The example is already a list of words, so the tokenizer is told the input is pre-split.
inp = tokenizer(data['train'][2]['tokens'],is_split_into_words=True)
print(inp)
# tokens() lists the sub-word pieces, including the special tokens added around the sequence.
print(inp.tokens())

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

{'input_ids': [101, 26660, 13329, 12649, 15928, 1820, 118, 4775, 118, 1659, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'BR', '##US', '##SE', '##LS', '1996', '-', '08', '-', '22', '[SEP]']


In [6]:
# word_ids() maps every sub-word token to the index of the word it came from (None for special tokens).
inp.word_ids()

[None, 0, 0, 0, 0, 1, 1, 1, 1, 1, None]

In [7]:
# For tokens inside a word but not at the beginning, we replace the B- with I-
def align_label_with_tokens(labels,word_ids):
    """Expand word-level labels to one label per sub-word token.

    Relies on the `ner_tags` id layout shown earlier in this notebook, where
    every B-XXX id is odd and the matching I-XXX id is the next integer.

    Args:
        labels: Label ids, one per word.
        word_ids: For each token, the index of the word it belongs to, or
            None when the token belongs to no word (special tokens).

    Returns:
        A list with one label id per token. Tokens without a word get -100,
        the first token of a word gets the word's label, and the following
        tokens of the same word get the I- variant of that label.
    """
    aligned = []
    previous = None
    for current in word_ids:
        if current is None:
            # Token that belongs to no word.
            aligned.append(-100)
        elif current != previous:
            # First piece of a new word keeps the word's own label.
            aligned.append(labels[current])
        else:
            # Continuation piece: an odd id (B-XXX) is bumped to the next id (I-XXX).
            tag = labels[current]
            aligned.append(tag + 1 if tag % 2 == 1 else tag)
        previous = current
    return aligned

In [8]:
# Word-level NER tag ids of the same example that was tokenized into `inp`.
labels = data["train"][2]["ner_tags"]
word_ids = inp.word_ids()
print(labels)
# Expand the word-level labels to one label per sub-word token.
align_label_with_tokens(labels, word_ids)

[5, 0]


[-100, 5, 6, 6, 6, 0, 0, 0, 0, 0, -100]

In [9]:
def tokenize_and_assign_label(x):
    """Tokenize a batch of pre-split examples and attach token-aligned NER labels.

    Args:
        x: Batch with a 'tokens' entry (one word list per example) and an
            'ner_tags' entry (one word-level label list per example).

    Returns:
        The tokenizer output for the batch, extended with a 'labels' entry
        holding one aligned label list per example.
    """
    encoding = tokenizer(x['tokens'],truncation=True, is_split_into_words=True)
    # word_ids(row) gives the token -> word mapping of the row-th example in the batch.
    encoding['labels'] = [
        align_label_with_tokens(word_labels, encoding.word_ids(row))
        for row, word_labels in enumerate(x["ner_tags"])
    ]
    return encoding


In [10]:
# Tokenize every split in batches; the original columns are removed so only the columns returned by the function remain.
tokenized_data = data.map(tokenize_and_assign_label,batched=True,remove_columns=data['train'].column_names)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [11]:
# Display the tokenized splits and the columns they now contain.
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

### Data Collator

In [12]:
from transformers import DataCollatorForTokenClassification
# Collator that turns a list of tokenized examples into one batch; later passed to the Trainer and the DataLoaders.
data_collector = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [13]:
# Collate the first two training examples to inspect how their labels are batched together.
batch = data_collector([tokenized_data['train'][0],tokenized_data['train'][1]])
batch['labels']

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])

### Define Model

In [14]:
from transformers import AutoModelForTokenClassification

# Class names of the NER tags, in id order.
list_of_ner_tag = data['train'].features['ner_tags'].feature.names
# Two-way mapping between label ids and label names, handed to the model config.
id2label = dict(enumerate(list_of_ner_tag))
label2id = {name: idx for idx, name in id2label.items()}
model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-cased',
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Check that the id -> label mapping was stored on the model config.
model.config.id2label

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}

In [16]:
# Number of labels recorded in the model config.
model.config.num_labels

9

### Fine-tuning model with the Trainer API

#### Define Training Arguments

In [17]:
import inspect

from transformers import TrainingArguments

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# Pick whichever name the installed version accepts, so this cell runs on both
# older and current releases (the install cell does not pin a version).
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Evaluate and save a checkpoint once per epoch.
training_argument = TrainingArguments(
    output_dir='/content/bert-base-ner',
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_eval_batch_size=16,
    per_device_train_batch_size=16,
    weight_decay=0.01,
    **{_eval_strategy_key: "epoch"},
)

In [18]:
from transformers import Trainer

# Wire the model, training arguments, collator and the train/validation splits together.
trainer = Trainer(
    model=model,
    args=training_argument,
    data_collator=data_collector,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data["validation"],
    tokenizer=tokenizer,
)

In [19]:
# Run fine-tuning; the training arguments request evaluation and a checkpoint once per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2189,0.0697
2,0.0442,0.053032
3,0.027,0.052584


TrainOutput(global_step=2634, training_loss=0.07601643613367798, metrics={'train_runtime': 486.4268, 'train_samples_per_second': 86.597, 'train_steps_per_second': 5.415, 'total_flos': 1054683418795902.0, 'train_loss': 0.07601643613367798, 'epoch': 3.0})

### Fine-tuning model with a custom PyTorch training loop

In [20]:
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [21]:
from torch.utils.data import DataLoader

# Both loaders batch with the token-classification collator; only the training loader shuffles.
train_dataloader = DataLoader(
    tokenized_data["train"], batch_size=8, shuffle=True, collate_fn=data_collector
)
eval_dataloader = DataLoader(
    tokenized_data["validation"],
    batch_size=8,
    collate_fn=data_collector,
)

In [22]:
from torch.optim import AdamW
from transformers import get_scheduler

# transformers' own AdamW was deprecated and later removed, so the PyTorch
# implementation is used instead. eps and weight_decay are set explicitly to
# the values the transformers class used by default, because the defaults of
# torch.optim.AdamW differ.
optimizer = AdamW(model.parameters(), lr=1e-05, eps=1e-6, weight_decay=0.0)
epoch = 3
# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
num_training_steps = len(train_dataloader) * epoch
# Linear decay of the learning rate over the whole run, without warmup.
lr_schedular = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)



In [23]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))
# NOTE: `model` is the same object the Trainer cells already fine-tuned, so this
# loop continues training from those weights rather than from the base checkpoint.
# Move the model to the target device explicitly instead of relying on an earlier
# cell having done it; otherwise the batches and the weights can end up on
# different devices.
model.to(device)
model.train()
for ep in range(epoch):
    total_loss = 0
    for batch in train_dataloader:
        # Move every tensor of the batch to the same device as the model.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        out = model(**batch)
        loss = out.loss
        total_loss += loss.item()
        # loss.backward() computes the gradient of the loss w.r.t. every parameter
        # that has requires_grad = True and stores it in that parameter's .grad.
        loss.backward()
        # optimizer.step() updates all the parameters based on parameter.grad
        optimizer.step()
        # adjust the learning rate after every optimizer step
        lr_schedular.step()
        # clear the gradients so they do not accumulate into the next batch
        optimizer.zero_grad()
        progress_bar.update(1)
    print(f"At {ep} epoch, training loss - {total_loss/len(train_dataloader)}")



  0%|          | 0/5268 [00:00<?, ?it/s]

At 0 epoch, training loss - 0.027129581082451643
At 1 epoch, training loss - 0.01593207352835346
At 2 epoch, training loss - 0.009713803820064343


In [24]:
# Save the model and the tokenizer into the same directory (the path also used as the Trainer's output_dir).
output_dir = '/content/bert-base-ner'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


('/content/bert-base-ner/tokenizer_config.json',
 '/content/bert-base-ner/special_tokens_map.json',
 '/content/bert-base-ner/vocab.txt',
 '/content/bert-base-ner/added_tokens.json',
 '/content/bert-base-ner/tokenizer.json')