### Imports and Version

In [1]:
import transformers, datasets
transformers.__version__, datasets.__version__

('4.39.3', '2.18.0')

### Dataset

In [2]:
import nltk
from nltk.corpus import brown

In [3]:
# Download the Brown corpus and the universal tagset resource; the next cell
# reads the corpus with tagset='universal'.
nltk.download('brown')
nltk.download('universal_tagset')

[nltk_data] Downloading package brown to /usr/share/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [4]:
# Brown corpus sentences as sequences of (word, tag) pairs, requested with
# the 'universal' tagset.
corpus = brown.tagged_sents(tagset='universal')

# Peek at the words (tags dropped) of one sample sentence.
[word for word, _tag in corpus[5]]

['It',
 'recommended',
 'that',
 'Fulton',
 'legislators',
 'act',
 '``',
 'to',
 'have',
 'these',
 'laws',
 'studied',
 'and',
 'revised',
 'to',
 'the',
 'end',
 'of',
 'modernizing',
 'and',
 'improving',
 'them',
 "''",
 '.']

In [5]:
# Split every tagged sentence into two parallel lists: its words and its tags.
# inputs[k] and targets[k] always describe the same sentence.
inputs, targets = [], []

for tagged_sentence in corpus:
    inputs.append([word for word, _ in tagged_sentence])
    targets.append([tag for _, tag in tagged_sentence])

In [6]:
import json

# Dump the dataset as JSON Lines: one {"inputs": [...], "targets": [...]}
# object per sentence, one object per line.
with open('data.json', 'w') as out_file:
    for words, tags in zip(inputs, targets):
        record = json.dumps({'inputs': words, 'targets': tags})
        out_file.write(record + "\n")

In [7]:
from datasets import load_dataset

# Read data.json back through the `datasets` JSON loader. As the output below
# shows, the result is a DatasetDict with a single 'train' split.
data = load_dataset("json", data_files='data.json')
data

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 57340
    })
})

In [8]:
# Work on a seeded random subset of 20,000 sentences.
small = data["train"].shuffle(seed=42).select(range(20_000))
# NOTE: `data` is rebound here from the DatasetDict to this single Dataset.
data = small

In [9]:
# 70/30 train/test split with a fixed seed.
# Split from `small` (the 20,000-sentence Dataset) instead of rebinding through
# `data`: after the first run `data` holds the resulting DatasetDict, which has
# no `train_test_split` method, so the previous `data = data.train_test_split(...)`
# raised when the cell was executed a second time.
data = small.train_test_split(test_size=0.3, seed=42)
data

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 14000
    })
    test: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 6000
    })
})

In [10]:
data["train"].features

{'inputs': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'targets': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [11]:
# Every distinct tag that occurs in the training split, in sorted order.
# A tag's position in label_names is used as its integer class id.
unique_targets = {
    tag
    for sentence_tags in data["train"]["targets"]
    for tag in sentence_tags
}
label_names = sorted(unique_targets)
label_names

['.',
 'ADJ',
 'ADP',
 'ADV',
 'CONJ',
 'DET',
 'NOUN',
 'NUM',
 'PRON',
 'PRT',
 'VERB',
 'X']

In [12]:
# Two-way lookup between integer class ids and tag names.
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in enumerate(label_names)}

### Tokenizer

In [13]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint; the same name is reused further down
# when the token-classification model is loaded.
checkpoint = "distilbert-base-cased"

# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [14]:
idx = 3
t = tokenizer(data["train"][idx]["inputs"], is_split_into_words=True)  # is_split_into_words=True because each example is already a list of words
t

{'input_ids': [101, 21086, 1181, 185, 23629, 1116, 117, 1112, 4029, 1111, 1543, 185, 23629, 13742, 1137, 1111, 5497, 1112, 13937, 117, 1132, 187, 20219, 1174, 8365, 2133, 6478, 17518, 1116, 1138, 1151, 6987, 1228, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [15]:

len(data["train"][idx]["targets"]), data["train"][idx]["targets"]

(25,
 ['VERB',
  'NOUN',
  '.',
  'ADP',
  'VERB',
  'ADP',
  'VERB',
  'NOUN',
  'NOUN',
  'CONJ',
  'ADP',
  'VERB',
  'ADP',
  'NOUN',
  '.',
  'VERB',
  'VERB',
  'NOUN',
  'DET',
  'NOUN',
  'VERB',
  'VERB',
  'VERB',
  'PRT',
  '.'])

In [16]:
len(t.tokens()), t.tokens()

(35,
 ['[CLS]',
  'Blanche',
  '##d',
  'p',
  '##eanut',
  '##s',
  ',',
  'as',
  'prepared',
  'for',
  'making',
  'p',
  '##eanut',
  'butter',
  'or',
  'for',
  'eating',
  'as',
  'nuts',
  ',',
  'are',
  'r',
  '##oast',
  '##ed',
  'seeds',
  'whose',
  'seed',
  '##coat',
  '##s',
  'have',
  'been',
  'rubbed',
  'off',
  '.',
  '[SEP]'])

In [17]:
t.word_ids()

[None,
 0,
 0,
 1,
 1,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 16,
 16,
 17,
 18,
 19,
 19,
 19,
 20,
 21,
 22,
 23,
 24,
 None]

#### Tokens and targets are no longer aligned when the tokenizer breaks a word into multiple tokens

In [18]:
# code to align targets
def align_targets(label_names, word_ids, ner_tags):
    """Expand word-level tags to one integer label per tokenizer token.

    Args:
        label_names: Sequence of tag names; a tag's position is its class id.
        word_ids: For each token, the index of the word it came from, or
            None for a token that belongs to no word (e.g. [CLS] / [SEP]).
        ner_tags: Tag name of every word, indexed by word position.

    Returns:
        A list as long as ``word_ids``. Tokens without a word get -100;
        every other token gets the class id of its word's tag, so all
        sub-word pieces of one word share that word's label.

    Raises:
        ValueError: If a tag is not present in ``label_names``.
    """
    # Build the tag -> id table once instead of scanning label_names with
    # .index() for every single token. setdefault keeps the first position
    # of a repeated name, which is what list.index() would return.
    tag_to_id = {}
    for class_id, name in enumerate(label_names):
        tag_to_id.setdefault(name, class_id)

    new_ner = []
    for i in word_ids:
        if i is None:
            new_ner.append(-100)
            continue

        tag = ner_tags[i]
        if tag not in tag_to_id:
            # Same exception type list.index() raised for an unknown tag.
            raise ValueError(f"{tag!r} is not in label_names")
        new_ner.append(tag_to_id[tag])

    return new_ner

In [19]:
# Worked example: align the word-level tags of one training sentence with
# the tokens the tokenizer produced for it.
idx = 3
labels = data['train'][idx]['targets']
t = tokenizer(data["train"][idx]["inputs"], is_split_into_words=True)
# One entry per token: the index of the originating word, or None.
word_ids = t.word_ids()

aligned_targets = align_targets(label_names, word_ids, labels)
aligned_targets

[-100,
 10,
 10,
 6,
 6,
 6,
 0,
 2,
 10,
 2,
 10,
 6,
 6,
 6,
 4,
 2,
 10,
 2,
 6,
 0,
 10,
 10,
 10,
 10,
 6,
 5,
 6,
 6,
 6,
 10,
 10,
 10,
 9,
 0,
 -100]

In [20]:
# Show each token next to the tag name it was aligned to
# (None for negative label ids, i.e. the -100 placeholders).
aligned_labels = [
    None if label_id < 0 else label_names[label_id]
    for label_id in aligned_targets
]
for token, tag_name in zip(t.tokens(), aligned_labels):
    print(f"{token}\t{tag_name}")

[CLS]	None
Blanche	VERB
##d	VERB
p	NOUN
##eanut	NOUN
##s	NOUN
,	.
as	ADP
prepared	VERB
for	ADP
making	VERB
p	NOUN
##eanut	NOUN
butter	NOUN
or	CONJ
for	ADP
eating	VERB
as	ADP
nuts	NOUN
,	.
are	VERB
r	VERB
##oast	VERB
##ed	VERB
seeds	NOUN
whose	DET
seed	NOUN
##coat	NOUN
##s	NOUN
have	VERB
been	VERB
rubbed	VERB
off	PRT
.	.
[SEP]	None


### Tokenize and align 

In [21]:
# tokenize both inputs and targets
def tokenize_fn(batch):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        batch: Mapping with 'inputs' (one list of words per example) and
            'targets' (one list of tag names per example).

    Returns:
        The tokenizer output for the batch with an additional 'labels'
        entry holding, per example, the output of ``align_targets``.
    """
    # Tokenizing first populates input_ids, attention_mask, etc.
    encoded = tokenizer(
        batch['inputs'], truncation=True, is_split_into_words=True)

    # word_ids(row) gives the token -> word mapping of example `row`, which
    # align_targets uses to spread the word-level tags over the tokens.
    # The result must be stored under the key 'labels'.
    encoded['labels'] = [
        align_targets(label_names, encoded.word_ids(row), word_tags)
        for row, word_tags in enumerate(batch['targets'])
    ]

    return encoded

In [22]:
# want to remove these from model inputs - they are neither inputs nor targets
data["train"][0].keys()

dict_keys(['inputs', 'targets'])

In [23]:
# Run tokenize_fn over both splits in batches. The original 'inputs' and
# 'targets' columns are dropped, leaving only what tokenize_fn returns
# (input_ids, attention_mask, labels — see the output of the next cell).
tokenized_datasets = data.map(
  tokenize_fn,
  batched=True,
  remove_columns=data["train"].column_names,
)

Map:   0%|          | 0/14000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [24]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6000
    })
})

### Data Collator

In [25]:
from transformers import DataCollatorForTokenClassification

# Batches examples of different lengths; the example cells below show that
# input_ids/attention_mask are padded with 0 and labels with -100.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


2024-06-07 06:50:52.674714: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-07 06:50:52.674818: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-07 06:50:52.792506: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [26]:
data_collator

DataCollatorForTokenClassification(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token

In [27]:
tokenized_datasets["train"][0:2]

{'input_ids': [[101,
   1109,
   1444,
   1111,
   9629,
   1105,
   4287,
   2228,
   9301,
   1103,
   1554,
   1329,
   1104,
   1155,
   4069,
   1104,
   18261,
   119,
   102],
  [101,
   1556,
   3406,
   1105,
   1143,
   117,
   1175,
   112,
   188,
   1309,
   1251,
   2463,
   119,
   102]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'labels': [[-100, 5, 6, 2, 6, 4, 10, 10, 1, 5, 1, 6, 2, 9, 6, 2, 6, 0, -100],
  [-100, 2, 6, 4, 8, 0, 9, 9, 9, 3, 5, 6, 0, -100]]}

In [28]:
[tokenized_datasets["train"][i] for i in range(2)]

[{'input_ids': [101,
   1109,
   1444,
   1111,
   9629,
   1105,
   4287,
   2228,
   9301,
   1103,
   1554,
   1329,
   1104,
   1155,
   4069,
   1104,
   18261,
   119,
   102],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'labels': [-100, 5, 6, 2, 6, 4, 10, 10, 1, 5, 1, 6, 2, 9, 6, 2, 6, 0, -100]},
 {'input_ids': [101,
   1556,
   3406,
   1105,
   1143,
   117,
   1175,
   112,
   188,
   1309,
   1251,
   2463,
   119,
   102],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'labels': [-100, 2, 6, 4, 8, 0, 9, 9, 9, 3, 5, 6, 0, -100]}]

In [29]:
# Example: collate the first two training examples into one batch. The
# shorter example's labels are padded with -100 up to the longer one's length.
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    5,    6,    2,    6,    4,   10,   10,    1,    5,    1,    6,
            2,    9,    6,    2,    6,    0, -100],
        [-100,    2,    6,    4,    8,    0,    9,    9,    9,    3,    5,    6,
            0, -100, -100, -100, -100, -100, -100]])

In [30]:
batch

{'input_ids': tensor([[  101,  1109,  1444,  1111,  9629,  1105,  4287,  2228,  9301,  1103,
          1554,  1329,  1104,  1155,  4069,  1104, 18261,   119,   102],
        [  101,  1556,  3406,  1105,  1143,   117,  1175,   112,   188,  1309,
          1251,  2463,   119,   102,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]), 'labels': tensor([[-100,    5,    6,    2,    6,    4,   10,   10,    1,    5,    1,    6,
            2,    9,    6,    2,    6,    0, -100],
        [-100,    2,    6,    4,    8,    0,    9,    9,    9,    3,    5,    6,
            0, -100, -100, -100, -100, -100, -100]])}

### Metric

In [31]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(logits_and_labels):
    """Score token-level predictions with macro F1 and accuracy.

    Args:
        logits_and_labels: Pair ``(logits, labels)``. ``logits`` holds the
            per-class scores along its last axis; ``labels`` holds the true
            class ids, with -100 marking positions that are left out.

    Returns:
        Dict with the keys 'f1' (macro-averaged F1) and 'accuracy'.
    """
    logits, labels = logits_and_labels
    preds = np.argmax(logits, axis=-1)

    # True labels with every -100 position dropped, flattened over all rows.
    labels_flat = [
        true_id
        for true_row in labels
        for true_id in true_row
        if true_id != -100
    ]

    # Predictions at exactly those positions whose true label is not -100.
    preds_flat = [
        pred_id
        for pred_row, true_row in zip(preds, labels)
        for pred_id, true_id in zip(pred_row, true_row)
        if true_id != -100
    ]

    acc = accuracy_score(labels_flat, preds_flat)
    f1 = f1_score(labels_flat, preds_flat, average='macro')

    return {'f1': f1, 'accuracy': acc}

In [32]:
def flatten(list_of_lists):
    """Concatenate the items of every sub-iterable into one flat list.

    Only one level of nesting is removed and the original order is kept.
    """
    flattened = []
    for sublist in list_of_lists:
        flattened.extend(sublist)
    return flattened

In [33]:
# Sanity check on a toy batch of one sequence with 7 positions and 3 classes.
# After dropping the two -100 positions the true labels are [0, 0, 1, 2, 1]
# and the argmax predictions are [0, 0, 1, 1, 1], i.e. 4 of 5 are correct.
labels = [[-100, 0, 0, 1, 2, 1, -100]]
logits = np.array([[
  [0.8, 0.1, 0.1],
  [0.8, 0.1, 0.1],
  [0.8, 0.1, 0.1],
  [0.1, 0.8, 0.1],
  [0.1, 0.8, 0.1],
  [0.1, 0.8, 0.1],
  [0.1, 0.8, 0.1],
]])
compute_metrics((logits, labels))

{'f1': 0.6, 'accuracy': 0.8}

### Model and Trainer

In [34]:
from transformers import AutoModelForTokenClassification

# Load the pretrained checkpoint with a token-classification head. The
# id2label/label2id mappings are passed in; the inference output at the end
# of the notebook reports tag names such as 'NOUN' rather than integer ids.
# The warning below reports that the classifier weights are newly initialized.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
from transformers import TrainingArguments
# Training configuration. The training log below shows an evaluation every
# 263 steps out of 1314 in total, which matches the 0.2 given for
# logging_steps/save_steps being read as a fraction of all training steps.
training_args = TrainingArguments(
    'pos_tag',  # first positional argument (output_dir per the TrainingArguments docs — confirm)
    num_train_epochs = 3,
    learning_rate = 2e-5,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 64, 
    evaluation_strategy='steps',
    save_strategy='steps',
    logging_steps = 0.2, 
    logging_dir="./logs", 
    save_steps = 0.2, 
    load_best_model_at_end = True,
    fp16 = False,  
    save_total_limit = 2, report_to="none")

In [36]:
from transformers import Trainer

# Wire model, arguments, tokenized splits, collator and metric function
# together, then fine-tune. The 'test' split serves as the evaluation set
# during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,F1,Accuracy
263,0.4036,0.088317,0.929531,0.974473
526,0.0749,0.067975,0.937519,0.981045
789,0.0546,0.062024,0.94171,0.98202
1052,0.042,0.06122,0.942707,0.982731


TrainOutput(global_step=1314, training_loss=0.12275469793032294, metrics={'train_runtime': 193.4067, 'train_samples_per_second': 217.159, 'train_steps_per_second': 6.794, 'total_flos': 734754380956416.0, 'train_loss': 0.12275469793032294, 'epoch': 3.0})

In [37]:
# Save the trained model under the relative path 'ner_model_1' (despite the
# name, this is the POS-tagging model trained above).
trainer.save_model('ner_model_1')

### Inference

In [38]:
from transformers import pipeline

# Load the fine-tuned model from the directory written by
# trainer.save_model('ner_model_1'). The same relative path is used here so
# the cell works from whatever working directory the notebook runs in; the
# previous absolute '/kaggle/working/ner_model_1' only resolved on Kaggle.
ner = pipeline(
  "token-classification",
  model='ner_model_1',
  device=0,  # NOTE(review): device index 0 presumably requires a GPU — confirm before running on CPU
)

In [39]:
# Tag a raw sentence; the pipeline returns one dict per token with the
# predicted tag ('entity'), its score and the character span.
s = "Bill Gates was the CEO of Microsoft in Seattle, Washington."
ner(s)

[{'entity': 'NOUN',
  'score': 0.9989718,
  'index': 1,
  'word': 'Bill',
  'start': 0,
  'end': 4},
 {'entity': 'NOUN',
  'score': 0.99927956,
  'index': 2,
  'word': 'Gates',
  'start': 5,
  'end': 10},
 {'entity': 'VERB',
  'score': 0.99934095,
  'index': 3,
  'word': 'was',
  'start': 11,
  'end': 14},
 {'entity': 'DET',
  'score': 0.99937963,
  'index': 4,
  'word': 'the',
  'start': 15,
  'end': 18},
 {'entity': 'NOUN',
  'score': 0.9986297,
  'index': 5,
  'word': 'CEO',
  'start': 19,
  'end': 22},
 {'entity': 'ADP',
  'score': 0.99949825,
  'index': 6,
  'word': 'of',
  'start': 23,
  'end': 25},
 {'entity': 'NOUN',
  'score': 0.9987631,
  'index': 7,
  'word': 'Microsoft',
  'start': 26,
  'end': 35},
 {'entity': 'ADP',
  'score': 0.9992536,
  'index': 8,
  'word': 'in',
  'start': 36,
  'end': 38},
 {'entity': 'NOUN',
  'score': 0.9993579,
  'index': 9,
  'word': 'Seattle',
  'start': 39,
  'end': 46},
 {'entity': '.',
  'score': 0.9995254,
  'index': 10,
  'word': ',',
  's