In [1]:
import transformers
import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Data Extraction

In [2]:
data = load_dataset('conllpp')

In [3]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [4]:
data['train'].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [5]:
pd.DataFrame(data['train'][:])[['tokens','ner_tags']].iloc[0]

tokens      [EU, rejects, German, call, to, boycott, Briti...
ner_tags                          [3, 0, 7, 0, 0, 0, 7, 0, 0]
Name: 0, dtype: object

In [6]:
tags = data['train'].features['ner_tags']
indextotag = {idx:tag for idx,tag in enumerate(tags.feature.names)}
tagtoindex = {tag:idx for idx,tag in enumerate(tags.feature.names)}

In [7]:
def create_tag_names(batch):
    tag_name = {'ner_tags_str': [tags.feature.int2str(idx) for idx in batch['ner_tags']]}
    return tag_name

In [8]:
data = data.map(create_tag_names)

In [9]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'ner_tags_str'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'ner_tags_str'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'ner_tags_str'],
        num_rows: 3453
    })
})

## Model Building

### Tokenization

In [10]:
from transformers import AutoTokenizer

model_checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [11]:
tokenizer.is_fast

True

In [12]:
inputs = data['train'][0]['tokens']
inputs = tokenizer(inputs, is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [13]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [14]:
def align_labels_with_tokens(labels,word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id!=current_word:
            current_word=word_id
            label=-100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            new_labels.append(-100)
        else:
            label = labels[word_id]

            if label%2==1:
                label+=1
            new_labels.append(label)
    return new_labels

In [15]:
labels = data['train'][0]['ner_tags']
word_ids = inputs.word_ids()
print(labels,word_ids)

[3, 0, 7, 0, 0, 0, 7, 0, 0] [None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]


In [16]:
align_labels_with_tokens(labels,word_ids)

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]

In [17]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'],truncation=True,is_split_into_words=True)
    
    all_labels = examples['ner_tags']

    new_labels = []
    for i, labels in enumerate(all_labels):
        
        word_ids = tokenized_inputs.word_ids(i)

        new_labels.append(align_labels_with_tokens(labels,word_ids))
    
    tokenized_inputs['labels'] = new_labels
    return tokenized_inputs


In [18]:
tokenized_dataset = data.map(tokenize_and_align_labels,batched=True,remove_columns=data['train'].column_names)

In [19]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

### Data Collation

In [20]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [21]:
batch = data_collator([tokenized_dataset['train'][i] for i in range(2)])
batch

{'input_ids': tensor([[  101,  7270, 22961,  1528,  1840,  1106, 21423,  1418,  2495, 12913,
            119,   102],
         [  101,  1943, 14428,   102,     0,     0,     0,     0,     0,     0,
              0,     0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'labels': tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
         [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])}

### Metrics

In [22]:
import evaluate

metrics = evaluate.load('seqeval')

In [23]:
ner_feature = data['train'].features['ner_tags']
ner_feature

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [24]:
label_names = ner_feature.feature.names

In [25]:
labels = data['train'][0]['ner_tags']
labels = [label_names[i] for i in labels]
labels

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [26]:
predictions = labels.copy()
predictions[2] = 'O'

metrics.compute(predictions=[predictions],references=[labels])

{'MISC': {'precision': 1.0,
  'recall': 0.5,
  'f1': 0.6666666666666666,
  'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 0.6666666666666666,
 'overall_f1': 0.8,
 'overall_accuracy': 0.8888888888888888}

In [27]:
import numpy as np

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits,axis=-1)
    true_labels = [[label_names[l] for l in label if l!=-100] for label in labels]

    true_predictions = [[label_names[p] for p,l in zip(prediction,label) if l!=-100] for prediction,label in zip(predictions,labels)]

    all_metrics = metrics.compute(predictions=[true_predictions],references=[true_labels])

    return {"precision": all_metrics['overall_precision'],
            "recall": all_metrics['overall_recall'],
            "f1": all_metrics['overall_f1'],
            "accuracy": all_metrics['overall_accuracy']}

### Model Training

In [28]:
id2label = {i:label for i,label in enumerate(label_names)}
label2id = {label:i for i,label in enumerate(label_names)}

In [29]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint,id2label = id2label,label2id=label2id)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this 

In [30]:
model.config.num_labels

9

In [31]:
from transformers import TrainingArguments

args = TrainingArguments("distillbert-finetuned-ner",
                         evaluation_strategy= "epoch",
                         save_strategy= "epoch",
                         learning_rate=2e-5,
                         num_train_epochs=3,
                         weight_decay=0.01)


In [32]:
from transformers import Trainer

trainer = Trainer(model=model, args=args,
                  train_dataset=tokenized_dataset['train'], 
                  eval_dataset=tokenized_dataset['validation'], 
                  data_collator=data_collator, 
                  compute_metrics=compute_metrics,
                  tokenizer=tokenizer)

In [33]:
trainer.train()

***** Running training *****
  Num examples = 14041
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5268
 10%|▉         | 505/5268 [00:21<03:19, 23.90it/s]

{'loss': 0.2797, 'learning_rate': 1.810174639331815e-05, 'epoch': 0.28}


 19%|█▉        | 1003/5268 [00:44<03:33, 19.99it/s]

{'loss': 0.1201, 'learning_rate': 1.6203492786636296e-05, 'epoch': 0.57}


 29%|██▊       | 1504/5268 [01:08<03:10, 19.80it/s]

{'loss': 0.1035, 'learning_rate': 1.4305239179954442e-05, 'epoch': 0.85}


 33%|███▎      | 1755/5268 [01:20<02:55, 19.99it/s]***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
                                                   
 33%|███▎      | 1756/5268 [01:26<02:55, 19.99it/s]Saving model checkpoint to distillbert-finetuned-ner/checkpoint-1756
Configuration saved in distillbert-finetuned-ner/checkpoint-1756/config.json


{'eval_loss': 0.08148389309644699, 'eval_precision': 0.8543256997455471, 'eval_recall': 0.8543256997455471, 'eval_f1': 0.8543256997455471, 'eval_accuracy': 0.8581538461538462, 'eval_runtime': 5.2301, 'eval_samples_per_second': 621.403, 'eval_steps_per_second': 77.819, 'epoch': 1.0}


Model weights saved in distillbert-finetuned-ner/checkpoint-1756/pytorch_model.bin
tokenizer config file saved in distillbert-finetuned-ner/checkpoint-1756/tokenizer_config.json
Special tokens file saved in distillbert-finetuned-ner/checkpoint-1756/special_tokens_map.json
 38%|███▊      | 2003/5268 [01:39<02:35, 20.99it/s]

{'loss': 0.0729, 'learning_rate': 1.240698557327259e-05, 'epoch': 1.14}


 48%|████▊     | 2503/5268 [02:03<02:11, 21.06it/s]

{'loss': 0.0583, 'learning_rate': 1.0508731966590738e-05, 'epoch': 1.42}


 57%|█████▋    | 3004/5268 [02:28<01:48, 20.85it/s]

{'loss': 0.0555, 'learning_rate': 8.610478359908885e-06, 'epoch': 1.71}


 67%|██████▋   | 3504/5268 [02:53<01:28, 19.87it/s]

{'loss': 0.0431, 'learning_rate': 6.712224753227031e-06, 'epoch': 1.99}


 67%|██████▋   | 3511/5268 [02:53<01:29, 19.57it/s]***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
                                                   
 67%|██████▋   | 3512/5268 [02:58<01:29, 19.57it/s]Saving model checkpoint to distillbert-finetuned-ner/checkpoint-3512
Configuration saved in distillbert-finetuned-ner/checkpoint-3512/config.json


{'eval_loss': 0.07168446481227875, 'eval_precision': 0.8832325803372574, 'eval_recall': 0.8829516539440203, 'eval_f1': 0.8830920947987911, 'eval_accuracy': 0.8867692307692308, 'eval_runtime': 5.0496, 'eval_samples_per_second': 643.614, 'eval_steps_per_second': 80.6, 'epoch': 2.0}


Model weights saved in distillbert-finetuned-ner/checkpoint-3512/pytorch_model.bin
tokenizer config file saved in distillbert-finetuned-ner/checkpoint-3512/tokenizer_config.json
Special tokens file saved in distillbert-finetuned-ner/checkpoint-3512/special_tokens_map.json
 76%|███████▌  | 4002/5268 [03:25<01:13, 17.29it/s]

{'loss': 0.0292, 'learning_rate': 4.8139711465451785e-06, 'epoch': 2.28}


 85%|████████▌ | 4504/5268 [03:51<00:36, 20.72it/s]

{'loss': 0.0313, 'learning_rate': 2.9157175398633257e-06, 'epoch': 2.56}


 95%|█████████▍| 5004/5268 [04:15<00:13, 19.50it/s]

{'loss': 0.0253, 'learning_rate': 1.0174639331814731e-06, 'epoch': 2.85}


100%|█████████▉| 5267/5268 [04:28<00:00, 20.57it/s]***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
                                                   
100%|██████████| 5268/5268 [04:34<00:00, 20.57it/s]Saving model checkpoint to distillbert-finetuned-ner/checkpoint-5268
Configuration saved in distillbert-finetuned-ner/checkpoint-5268/config.json


{'eval_loss': 0.06921835988759995, 'eval_precision': 0.8867324212535794, 'eval_recall': 0.8864503816793893, 'eval_f1': 0.8865913790361062, 'eval_accuracy': 0.8898461538461538, 'eval_runtime': 5.1498, 'eval_samples_per_second': 631.097, 'eval_steps_per_second': 79.033, 'epoch': 3.0}


Model weights saved in distillbert-finetuned-ner/checkpoint-5268/pytorch_model.bin
tokenizer config file saved in distillbert-finetuned-ner/checkpoint-5268/tokenizer_config.json
Special tokens file saved in distillbert-finetuned-ner/checkpoint-5268/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 5268/5268 [04:35<00:00, 19.14it/s]

{'train_runtime': 275.2164, 'train_samples_per_second': 153.054, 'train_steps_per_second': 19.141, 'train_loss': 0.07925557728146819, 'epoch': 3.0}





TrainOutput(global_step=5268, training_loss=0.07925557728146819, metrics={'train_runtime': 275.2164, 'train_samples_per_second': 153.054, 'train_steps_per_second': 19.141, 'train_loss': 0.07925557728146819, 'epoch': 3.0})

In [1]:
from transformers import pipeline

checkpoint = "distillbert-finetuned-ner/checkpoint-5268"
token_classifier = pipeline(
    'token-classification',
    model=checkpoint,
    aggregation_strategy = 'simple'
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(token_classifier("I am tejas and I work at Augrade"))

: 