### Transformers Applications for Tokens (NER/POS tags datasets) 
1. Created models for prediction on a per token basis (NER/POS tags datasets provided);
2. Tokenize words into subwords through Transformers library;
3. Pre-trained transformers model ("distilbert-base-cased") is faster than BERT;
4. To learn structure of datasets and pre-trained model, and study how to make predictions by using transformers models.

In [2]:
# !pip install transformers datasets
from datasets import load_dataset
data = load_dataset('conll2003')
print(data)
data['train']
data['train'].features

Found cached dataset conll2003 (C:/Users/Sealion/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})


{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [3]:
data['train']

Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
    num_rows: 14041
})

In [4]:
data['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [5]:
label_names = data['train'].features['ner_tags'].feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [6]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [7]:
idx = 0
t = tokenizer(data['train'][idx]["tokens"], is_split_into_words=True)
t

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [8]:
t.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [9]:
t.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [10]:
begin2inside = {1:2, 3:4, 5:6, 7:8}

In [11]:
def align_targets(labels, word_ids):
    aligned_labels = []
    last_word = None
    for word in word_ids:
        if word is None:
            label = -100
        elif word != last_word:
            label = labels[word]
        else:
            label = labels[word]
            if label in begin2inside:
                label = begin2inside[label]
        aligned_labels.append(label)
        
        last_word = word
    return aligned_labels      
                

In [12]:
## test
labels = data['train'][idx]['ner_tags']
word_ids = t.word_ids()
aligned_targets = align_targets(labels, word_ids)
aligned_targets

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]

In [13]:
aligned_labels = [label_names[t] if t>= 0 else None for t in aligned_targets]
for x, y in zip(t.tokens(), aligned_labels):
    print(f"{x}\{y}")    

[CLS]\None
EU\B-ORG
rejects\O
German\B-MISC
call\O
to\O
boycott\O
British\B-MISC
la\O
##mb\O
.\O
[SEP]\None


In [14]:
# Tokenize both inputs and targets
def tokenize_fn(batch):
    tokenized_inputs = tokenizer(
    batch['tokens'], truncation=True, is_split_into_words=True)
    
    labels_batch = batch['ner_tags']
    aligned_labels_batch = []
    for i, labels in enumerate(labels_batch):
        word_ids = tokenized_inputs.word_ids(i)
        aligned_labels_batch.append(align_targets(labels, word_ids))
        
    tokenized_inputs['labels'] = aligned_labels_batch
    return tokenized_inputs 

In [15]:
data['train'].column_names

['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags']

In [16]:
tokenized_datasets = data.map(tokenize_fn, batched=True, remove_columns=data['train'].column_names,)

Loading cached processed dataset at C:\Users\Sealion\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98\cache-166759d3ce38d03c.arrow


Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Loading cached processed dataset at C:\Users\Sealion\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98\cache-7a5d616a76ade34a.arrow


In [17]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [18]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [19]:
tokenized_datasets['train'][0:2]

{'input_ids': [[101,
   7270,
   22961,
   1528,
   1840,
   1106,
   21423,
   1418,
   2495,
   12913,
   119,
   102],
  [101, 1943, 14428, 102]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]],
 'labels': [[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100], [-100, 1, 2, -100]]}

In [20]:
[tokenized_datasets['train'][i] for i in range(2)]

[{'input_ids': [101,
   7270,
   22961,
   1528,
   1840,
   1106,
   21423,
   1418,
   2495,
   12913,
   119,
   102],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]},
 {'input_ids': [101, 1943, 14428, 102],
  'attention_mask': [1, 1, 1, 1],
  'labels': [-100, 1, 2, -100]}]

In [21]:
# example
batch = data_collator([tokenized_datasets['train'][i] for i in range(2)])
batch['labels']

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])

In [22]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [23]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
data_collator

DataCollatorForTokenClassification(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100, return_tensors='pt')

In [24]:
tokenized_datasets['train'][0:2]

{'input_ids': [[101,
   7270,
   22961,
   1528,
   1840,
   1106,
   21423,
   1418,
   2495,
   12913,
   119,
   102],
  [101, 1943, 14428, 102]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]],
 'labels': [[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100], [-100, 1, 2, -100]]}

In [25]:
[tokenized_datasets['train'][i] for i in range(2)]

[{'input_ids': [101,
   7270,
   22961,
   1528,
   1840,
   1106,
   21423,
   1418,
   2495,
   12913,
   119,
   102],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]},
 {'input_ids': [101, 1943, 14428, 102],
  'attention_mask': [1, 1, 1, 1],
  'labels': [-100, 1, 2, -100]}]

In [26]:
# example
batch = data_collator([tokenized_datasets['train'][i] for i in range(2)])
batch

{'input_ids': tensor([[  101,  7270, 22961,  1528,  1840,  1106, 21423,  1418,  2495, 12913,
           119,   102],
        [  101,  1943, 14428,   102,     0,     0,     0,     0,     0,     0,
             0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])}

In [27]:
# !pip install seqeval 
from datasets import load_metric
metric = load_metric('seqeval')

  metric = load_metric('seqeval')


In [28]:
metric.compute(predictions=[['O', 'O', 'I-ORG', 'B-MISC']],
              references=[['O', 'B-ORG', 'I-ORG', 'B-MISC']])

{'MISC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'overall_precision': 0.5,
 'overall_recall': 0.5,
 'overall_f1': 0.5,
 'overall_accuracy': 0.75}

In [29]:
def compute_metrics(logits_and_labels):
    logits, labels = logits_and_labels
    preds = np.argmax(logits, axis=-1)
    
    str_labels = [[label_names[t] for t in label if t != -100] for label in labels]
    
    str_preds=[
        [label_names[p] for p, t in zip(pred, targ) if t != -100]
        for pred, targ in zip(preds, labels)
    ]
    
    the_metrics = metric.compute(predictions=str_preds, references=str_labels)
    return {
        'precision': the_metrics['overall_precision'],
        'recall': the_metrics['overall_recall'],
        'f1': the_metrics['overall_f1'],
        'accuracy': the_metrics['overall_accuracy'],        
    }

In [30]:
id2label={k:v for k, v in enumerate(label_names)}
label2id={v:k for k, v in id2label.items()}

In [31]:
from transformers import AutoModelForTokenClassification
checkpoint = "distilbert-base-cased"
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label, 
    label2id=label2id,
)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this 

In [32]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    "distilbert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy='epoch',
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)
training_args

Failure while loading azureml_run_type_providers. Failed to load entrypoint azureml.scriptrun = azureml.core.script_run:ScriptRun._from_run_dto with exception (pywin32 228 (c:\users\sealion\anaconda3\lib\site-packages), Requirement.parse('pywin32==227; sys_platform == "win32"'), {'docker'}).


TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
jit_mode_eval=False,
label_n

In [34]:
import numpy as np
from transformers import Trainer
trainer = Trainer(
     model=model,
     args=training_args,
     train_dataset=tokenized_datasets['train'],
     eval_dataset=tokenized_datasets['validation'],
     data_collator=data_collator,
     compute_metrics=compute_metrics,
     tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 14041
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5268
  Number of trainable parameters = 65197833


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0391,0.090033,0.890897,0.920734,0.90557,0.978645
2,0.0261,0.076998,0.916585,0.939414,0.927859,0.983826
3,0.0127,0.080032,0.921122,0.939414,0.930178,0.983723


***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8
Saving model checkpoint to distilbert-finetuned-ner\checkpoint-1756
Configuration saved in distilbert-finetuned-ner\checkpoint-1756\config.json
Model weights saved in distilbert-finetuned-ner\checkpoint-1756\pytorch_model.bin
tokenizer config file saved in distilbert-finetuned-ner\checkpoint-1756\tokenizer_config.json
Special tokens file saved in distilbert-finetuned-ner\checkpoint-1756\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8
Saving model checkpoint to distilbert-finetuned-ner\checkpoint-3512
Configuration saved in distilbert-finetuned-ner\checkpoint-3512\config.json
Model weights saved in distilbert-finetuned-ner\checkpoint-3512\pytorch_model.bin
tokenizer config file saved in distilbert-finetuned-ner\checkpoint-3512\tokenizer_config.json
Special tokens file saved in distilbert-finetuned-ner\checkpoint-3512\special_tokens_map.json
***** Running Evaluation *****
 

TrainOutput(global_step=5268, training_loss=0.030547335461042283, metrics={'train_runtime': 387.4245, 'train_samples_per_second': 108.726, 'train_steps_per_second': 13.597, 'total_flos': 462023079274890.0, 'train_loss': 0.030547335461042283, 'epoch': 3.0})

In [36]:
trainer.save_model('my_saved_model')
from transformers import pipeline
ner = pipeline(
   "token-classification",
    model="my_saved_model",
    aggregation_strategy="simple",
    device=0,)
ner('Bill Gates was the CEO of Microsoft in Seattle, Washington.')

Saving model checkpoint to my_saved_model
Configuration saved in my_saved_model\config.json
Model weights saved in my_saved_model\pytorch_model.bin
tokenizer config file saved in my_saved_model\tokenizer_config.json
Special tokens file saved in my_saved_model\special_tokens_map.json
loading configuration file my_saved_model\config.json
Model config DistilBertConfig {
  "_name_or_path": "my_saved_model",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-MISC",
    "8": "I-MISC"
  },
  "initializer_range": 0.02,
  "label2id": {
    "B-LOC": 5,
    "B-MISC": 7,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-MISC": 8,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "max_position_embeddings": 512,
  "model_type": "

[{'entity_group': 'PER',
  'score': 0.9997233,
  'word': 'Bill Gates',
  'start': 0,
  'end': 10},
 {'entity_group': 'ORG',
  'score': 0.9989698,
  'word': 'Microsoft',
  'start': 26,
  'end': 35},
 {'entity_group': 'LOC',
  'score': 0.9994497,
  'word': 'Seattle',
  'start': 39,
  'end': 46},
 {'entity_group': 'LOC',
  'score': 0.99881625,
  'word': 'Washington',
  'start': 48,
  'end': 58}]

## Exercise

In [43]:
# !pip install transformers datasets
# !pip install nltk
import nltk
from nltk.corpus import brown

nltk.download('brown')
nltk.download('universal_tagset')

corpus = brown.tagged_sents(tagset='universal')
corpus

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Sealion\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Sealion\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\universal_tagset.zip.


[[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('City

In [45]:
inputs = []
targets = []
for sentence_tag_pairs in corpus:
    tokens = []
    target = []
    for token, tag in sentence_tag_pairs:
        tokens.append(token)
        target.append(tag)
    inputs.append(tokens)
    targets.append(target)

In [47]:
import json
with open('data.json', 'w') as f:
    for x,y in zip(inputs, targets):
        j = {'inputs':x, 'targets':y}
        s = json.dumps(j)
        f.write(f'{s}\n')    

In [48]:
from datasets import load_dataset
data = load_dataset('json', data_files='data.json')
data

Downloading and preparing dataset json/default to C:/Users/Sealion/.cache/huggingface/datasets/json/default-9ec1cf0132016143/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to C:/Users/Sealion/.cache/huggingface/datasets/json/default-9ec1cf0132016143/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 57340
    })
})

In [49]:
small = data['train'].shuffle(seed=42).select(range(20_000))
small

Dataset({
    features: ['inputs', 'targets'],
    num_rows: 20000
})

In [50]:
data = small.train_test_split(seed=42)
data['train'][0]

{'inputs': ['Ulyate',
  'and',
  'Kearton',
  'climbed',
  'on',
  'toward',
  'the',
  'sound',
  'of',
  'the',
  'barking',
  'of',
  'the',
  'dogs',
  'and',
  'the',
  'sporadic',
  'roaring',
  'of',
  'the',
  'lion',
  ',',
  'till',
  'they',
  'came',
  ',',
  'out',
  'of',
  'breath',
  ',',
  'to',
  'the',
  'crest',
  ',',
  'and',
  'peering',
  'through',
  'the',
  'branches',
  'of',
  'a',
  'bush',
  ',',
  'this',
  'is',
  'what',
  'Ulyate',
  'saw',
  ':',
  'Jones',
  'who',
  'had',
  'apparently',
  '(',
  'and',
  'actually',
  'had',
  ')',
  'ridden',
  'up',
  'the',
  'nearly',
  'impassable',
  'hillside',
  ',',
  'sitting',
  'calmly',
  'on',
  'his',
  'horse',
  'within',
  'forty',
  'feet',
  'of',
  'a',
  'full-grown',
  'young',
  'lioness',
  ',',
  'who',
  'was',
  'crouched',
  'on',
  'a',
  'flat',
  'rock',
  'and',
  'seemed',
  'just',
  'about',
  'to',
  'charge',
  'him',
  ',',
  'while',
  'the',
  'dogs',
  'whirled',
  'aroun

In [52]:
data['train'].features

{'inputs': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'targets': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [59]:
# Map tagets to ints
target_set = set()
for target in targets:
    target_set = target_set.union(target)
target_set

{'.',
 'ADJ',
 'ADP',
 'ADV',
 'CONJ',
 'DET',
 'NOUN',
 'NUM',
 'PRON',
 'PRT',
 'VERB',
 'X'}

In [60]:
target_list = list(target_set)
id2label = {k:v for k,v in enumerate(target_list)}
label2id = {v:k for k,v in id2label.items()}

In [63]:
from transformers import AutoTokenizer   # bert
checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")

loading configuration file config.json from cache at C:\Users\Sealion/.cache\huggingface\hub\models--distilbert-base-cased\snapshots\4dc145c5bd4fdb672dcded7fdc1efd6c2bc55992\config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.1",
  "vocab_size": 28996
}

loading file vocab.txt from cache at C:\Users\Sealion/.cache\huggingface\hub\models--distilbert-base-cased\snapshots\4dc145c5bd4fdb672dcded7fdc1efd6c2bc55992\vocab.txt
loading file tokenizer.json from cache at C:\Users\Sealion/.cache\huggingface\hub\models--distilbert-base-cased\snapshots\4d

In [67]:
data['train'][idx]['inputs'][:10]

['Ulyate',
 'and',
 'Kearton',
 'climbed',
 'on',
 'toward',
 'the',
 'sound',
 'of',
 'the']

In [64]:
idx = 0
t = tokenizer(data['train'][idx]['inputs'], is_split_into_words=True)
t

{'input_ids': [101, 158, 25928, 1566, 1105, 26835, 9349, 1320, 5998, 1113, 1755, 1103, 1839, 1104, 1103, 26635, 1104, 1103, 6363, 1105, 1103, 188, 27695, 23041, 1104, 1103, 11160, 117, 6174, 1152, 1338, 117, 1149, 1104, 2184, 117, 1106, 1103, 13468, 117, 1105, 19205, 1194, 1103, 5020, 1104, 170, 13771, 117, 1142, 1110, 1184, 158, 25928, 1566, 1486, 131, 2690, 1150, 1125, 4547, 113, 1105, 2140, 1125, 114, 17698, 1146, 1103, 2212, 24034, 11192, 1895, 25068, 117, 2807, 13285, 1113, 1117, 3241, 1439, 5808, 1623, 1104, 170, 1554, 118, 4215, 1685, 11160, 5800, 117, 1150, 1108, 15062, 1113, 170, 3596, 2067, 1105, 1882, 1198, 1164, 1106, 2965, 1140, 117, 1229, 1103, 6363, 18370, 1213, 1123, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [69]:
print(type(t))
t.tokens()

<class 'transformers.tokenization_utils_base.BatchEncoding'>


['[CLS]',
 'U',
 '##lya',
 '##te',
 'and',
 'Ke',
 '##art',
 '##on',
 'climbed',
 'on',
 'toward',
 'the',
 'sound',
 'of',
 'the',
 'barking',
 'of',
 'the',
 'dogs',
 'and',
 'the',
 's',
 '##poradic',
 'roaring',
 'of',
 'the',
 'lion',
 ',',
 'till',
 'they',
 'came',
 ',',
 'out',
 'of',
 'breath',
 ',',
 'to',
 'the',
 'crest',
 ',',
 'and',
 'peering',
 'through',
 'the',
 'branches',
 'of',
 'a',
 'bush',
 ',',
 'this',
 'is',
 'what',
 'U',
 '##lya',
 '##te',
 'saw',
 ':',
 'Jones',
 'who',
 'had',
 'apparently',
 '(',
 'and',
 'actually',
 'had',
 ')',
 'ridden',
 'up',
 'the',
 'nearly',
 'imp',
 '##ass',
 '##able',
 'hillside',
 ',',
 'sitting',
 'calmly',
 'on',
 'his',
 'horse',
 'within',
 'forty',
 'feet',
 'of',
 'a',
 'full',
 '-',
 'grown',
 'young',
 'lion',
 '##ess',
 ',',
 'who',
 'was',
 'crouched',
 'on',
 'a',
 'flat',
 'rock',
 'and',
 'seemed',
 'just',
 'about',
 'to',
 'charge',
 'him',
 ',',
 'while',
 'the',
 'dogs',
 'whirled',
 'around',
 'her',
 '.',
 

In [74]:
t.word_ids()[:20]

[None, 0, 0, 0, 1, 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

In [80]:
def align_targets(labels, word_ids):    
    aligned_labels = []
    for word in word_ids:
        if word is None: 
            label = -100
        else:
            label = label2id[labels[word]]
        aligned_labels.append(label)
    return aligned_labels

labels = data['train'][idx]['targets']
word_idx = t.word_ids()
aligned_targets = align_targets(labels, word_ids)
aligned_targets

[-100, 1, 7, 1, 2, 8, 3, 4, 1, 1, 3, -100]

In [108]:
data['train'][idx]['targets'][:10]

['NOUN', 'CONJ', 'NOUN', 'VERB', 'PRT', 'ADP', 'DET', 'NOUN', 'ADP', 'DET']

In [84]:
aligned_labels = [id2label[i] if i >= 0 else None for i in aligned_targets]
for x,y in zip(t.tokens(), aligned_labels):
    print(f"{x}\t{y}")

[CLS]	None
U	NOUN
##lya	CONJ
##te	NOUN
and	VERB
Ke	PRT
##art	ADP
##on	DET
climbed	NOUN
on	NOUN
toward	ADP
the	None


In [86]:
# tokenize both inputs and targets
def tokenize_fn(batch):
    tokenized_inputs = tokenizer(batch['inputs'], truncation=True, is_split_into_words=True)
    labels_batch = batch['targets']
    aligned_labels_batch = []
    for i, labels in enumerate(labels_batch):
        word_ids = tokenized_inputs.word_ids(i)
        aligned_labels_batch.append(align_targets(labels,word_ids))
        tokenized_inputs['labels'] = aligned_labels_batch
    return tokenized_inputs

tokenized_datasets = data.map(tokenize_fn, batched=True, remove_columns=data['train'].column_names,)
tokenized_datasets        

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 15000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
})

In [104]:
tokenized_datasets['train'][0]['labels']

[-100,
 1,
 1,
 1,
 7,
 1,
 1,
 1,
 2,
 8,
 3,
 4,
 1,
 3,
 4,
 1,
 3,
 4,
 1,
 7,
 4,
 0,
 0,
 1,
 3,
 4,
 1,
 9,
 3,
 5,
 2,
 9,
 8,
 3,
 1,
 9,
 3,
 4,
 1,
 9,
 7,
 2,
 3,
 4,
 1,
 3,
 4,
 1,
 9,
 4,
 2,
 4,
 1,
 1,
 1,
 2,
 9,
 1,
 5,
 2,
 10,
 9,
 7,
 10,
 2,
 9,
 2,
 3,
 4,
 10,
 0,
 0,
 0,
 1,
 9,
 2,
 10,
 3,
 4,
 1,
 3,
 11,
 1,
 3,
 4,
 0,
 0,
 0,
 0,
 1,
 1,
 9,
 5,
 2,
 2,
 3,
 4,
 0,
 1,
 7,
 2,
 10,
 10,
 8,
 2,
 5,
 9,
 3,
 4,
 1,
 2,
 3,
 5,
 9,
 -100]

In [111]:
def flatten(list_of_lists):
    flattened = [val for sublist in list_of_lists for val in sublist]
    return flattened

In [112]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
data_collator

DataCollatorForTokenClassification(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100, return_tensors='pt')

In [None]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(logits_and_labels):
    logits, labels = logits_and_labels
    preds = np.argmax(logitx,axis=-1)
    labels_jagged = [[t for t in nlabel if t != -100] for label in labels]
    preds_jaggles = [[p for p, t in zip(ps,ts) if t != -100] for ps, ts in zip(preds, labels)]

    labels_flat = flatten(labels_jagged)
    preds_flat = flatten(preds_jagged)

    acc = accuracy_score(labels_flat, preds_flat)
    f1 = f1_score(labels_flat, preds_flat, average='macro')
    
    return {'f1':f1, 'accuracy':acc,}

In [116]:
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments

model = AutoModelForTokenClassification.from_pretrained("distilbert-base-cased", id2label=id2label, label2id=label2id,)
trainer = Trainer(model=model,
                 args=training_args,
                 train_dataset=tokenized_datasets['train'],
                 eval_dataset=tokenized_datasets['test'],
                 data_collator=data_collator,
                 compute_metrics=compute_metrics,
                 tokenizer=tokenizer,)
trainer.train()

loading configuration file config.json from cache at C:\Users\Sealion/.cache\huggingface\hub\models--distilbert-base-cased\snapshots\4dc145c5bd4fdb672dcded7fdc1efd6c2bc55992\config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "ADJ",
    "1": "NOUN",
    "2": "VERB",
    "3": "ADP",
    "4": "DET",
    "5": "PRON",
    "6": "X",
    "7": "CONJ",
    "8": "PRT",
    "9": ".",
    "10": "ADV",
    "11": "NUM"
  },
  "initializer_range": 0.02,
  "label2id": {
    ".": 9,
    "ADJ": 0,
    "ADP": 3,
    "ADV": 10,
    "CONJ": 7,
    "DET": 4,
    "NOUN": 1,
    "NUM": 11,
    "PRON": 5,
    "PRT": 8,
    "VERB": 2,
    "X": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sin

Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 5000
  Batch size = 8


IndexError: list index out of range

In [119]:
trainer.save_model('my_saved_model')
from transformers import pipeline
pipe = pipeline(
    'token-classification',
     model='my_saved_model',
     device=0,)

Saving model checkpoint to my_saved_model
Configuration saved in my_saved_model\config.json
Model weights saved in my_saved_model\pytorch_model.bin
tokenizer config file saved in my_saved_model\tokenizer_config.json
Special tokens file saved in my_saved_model\special_tokens_map.json
loading configuration file my_saved_model\config.json
Model config DistilBertConfig {
  "_name_or_path": "my_saved_model",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "ADJ",
    "1": "NOUN",
    "2": "VERB",
    "3": "ADP",
    "4": "DET",
    "5": "PRON",
    "6": "X",
    "7": "CONJ",
    "8": "PRT",
    "9": ".",
    "10": "ADV",
    "11": "NUM"
  },
  "initializer_range": 0.02,
  "label2id": {
    ".": 9,
    "ADJ": 0,
    "ADP": 3,
    "ADV": 10,
    "CONJ": 7,
    "DET": 4,
    "NOUN": 1,
    "NUM": 11,
    "PRON": 5,
    "PRT": 8,
    "VERB": 2,
    "

In [120]:
pipe('Bill Gates was the CEO of Microsoft in Seattle, Washington.')

[{'entity': 'NOUN',
  'score': 0.9982545,
  'index': 1,
  'word': 'Bill',
  'start': 0,
  'end': 4},
 {'entity': 'NOUN',
  'score': 0.99915516,
  'index': 2,
  'word': 'Gates',
  'start': 5,
  'end': 10},
 {'entity': 'VERB',
  'score': 0.9996705,
  'index': 3,
  'word': 'was',
  'start': 11,
  'end': 14},
 {'entity': 'DET',
  'score': 0.99968576,
  'index': 4,
  'word': 'the',
  'start': 15,
  'end': 18},
 {'entity': 'NOUN',
  'score': 0.9986363,
  'index': 5,
  'word': 'CEO',
  'start': 19,
  'end': 22},
 {'entity': 'ADP',
  'score': 0.999655,
  'index': 6,
  'word': 'of',
  'start': 23,
  'end': 25},
 {'entity': 'NOUN',
  'score': 0.9978656,
  'index': 7,
  'word': 'Microsoft',
  'start': 26,
  'end': 35},
 {'entity': 'ADP',
  'score': 0.9994708,
  'index': 8,
  'word': 'in',
  'start': 36,
  'end': 38},
 {'entity': 'NOUN',
  'score': 0.9992939,
  'index': 9,
  'word': 'Seattle',
  'start': 39,
  'end': 46},
 {'entity': '.',
  'score': 0.9997249,
  'index': 10,
  'word': ',',
  'star

#### Acknowledges:

1. Dataset: NER/POS tags datasets
2. Courses of Lazy Programmer includes practical exercise.
All above is for practice purposes only.