In [None]:
!pip install transformers datasets torchinfo

# Download dataset from nltk

In [2]:
import nltk
from nltk.corpus import brown

In [3]:
nltk.download('brown')
nltk.download('universal_tagset')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [4]:
corpus = brown.tagged_sents(tagset='universal')
corpus

[[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('City

In [5]:
# Write inputs and labels to json file for datasets object
import json
targets = []

with open('data.json', 'w') as f:
  for sentence_tag_pairs in corpus:
    tokens = []
    target = []
    for token, tag in sentence_tag_pairs:
      tokens.append(token)
      target.append(tag)
    targets.append(target)
    j = {'input': tokens, 'target': target}
    s = json.dumps(j)
    f.write(f"{s}\n")

In [6]:
from datasets import load_dataset

# Load dataset from json file
data = load_dataset('json', data_files='data.json')
data

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-36b36c79db87ab75/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-36b36c79db87ab75/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'target'],
        num_rows: 57340
    })
})

In [7]:
# Select 20.000 examples 
small_ds = data['train'].shuffle(seed=42).select(range(0,20000))
small_ds

Dataset({
    features: ['input', 'target'],
    num_rows: 20000
})

In [8]:
# Split train and test datasets
ds = small_ds.train_test_split(seed=42)
ds

DatasetDict({
    train: Dataset({
        features: ['input', 'target'],
        num_rows: 15000
    })
    test: Dataset({
        features: ['input', 'target'],
        num_rows: 5000
    })
})

In [9]:
print(ds['train'][0])

{'input': ['Ulyate', 'and', 'Kearton', 'climbed', 'on', 'toward', 'the', 'sound', 'of', 'the', 'barking', 'of', 'the', 'dogs', 'and', 'the', 'sporadic', 'roaring', 'of', 'the', 'lion', ',', 'till', 'they', 'came', ',', 'out', 'of', 'breath', ',', 'to', 'the', 'crest', ',', 'and', 'peering', 'through', 'the', 'branches', 'of', 'a', 'bush', ',', 'this', 'is', 'what', 'Ulyate', 'saw', ':', 'Jones', 'who', 'had', 'apparently', '(', 'and', 'actually', 'had', ')', 'ridden', 'up', 'the', 'nearly', 'impassable', 'hillside', ',', 'sitting', 'calmly', 'on', 'his', 'horse', 'within', 'forty', 'feet', 'of', 'a', 'full-grown', 'young', 'lioness', ',', 'who', 'was', 'crouched', 'on', 'a', 'flat', 'rock', 'and', 'seemed', 'just', 'about', 'to', 'charge', 'him', ',', 'while', 'the', 'dogs', 'whirled', 'around', 'her', '.'], 'target': ['NOUN', 'CONJ', 'NOUN', 'VERB', 'PRT', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'CONJ', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', '.', 'ADP', 

In [10]:
ds['train'].features

{'input': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'target': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [11]:
# Get label names 
target_set = set()
for target in targets:
  target_set = target_set.union(target)
target_set

{'.',
 'ADJ',
 'ADP',
 'ADV',
 'CONJ',
 'DET',
 'NOUN',
 'NUM',
 'PRON',
 'PRT',
 'VERB',
 'X'}

In [12]:
# Config for model
target_list = list(target_set)
id2label = {k:v for k,v in enumerate(target_list)}
label2id = {v:k for k,v in id2label.items()}
label2id

{'X': 0,
 'ADP': 1,
 'PRON': 2,
 'ADJ': 3,
 'ADV': 4,
 '.': 5,
 'NOUN': 6,
 'PRT': 7,
 'VERB': 8,
 'CONJ': 9,
 'DET': 10,
 'NUM': 11}

In [13]:
from transformers import AutoTokenizer

# Load tokenizer from checkpoint
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [14]:
idx = 0
example = data['train'][idx]['input']
t = tokenizer(example, is_split_into_words=True)
t

{'input_ids': [101, 1109, 18196, 1391, 2224, 15169, 1163, 5286, 1126, 4449, 1104, 5161, 112, 188, 2793, 2425, 1728, 1666, 169, 169, 1185, 2554, 112, 112, 1115, 1251, 12692, 4233, 1261, 1282, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [15]:
print(example)

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']


In [16]:
print(t.tokens())

['[CLS]', 'The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', 'Atlanta', "'", 's', 'recent', 'primary', 'election', 'produced', '`', '`', 'no', 'evidence', "'", "'", 'that', 'any', 'irregular', '##ities', 'took', 'place', '.', '[SEP]']


In [17]:
print(t.word_ids())

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 11, 12, 13, 14, 15, 15, 16, 17, 18, 18, 19, 20, 21, 21, 22, 23, 24, None]


In [18]:
len(data['train'][idx]['target']) == len(t.tokens())

False

In [19]:
len(t.tokens()) == len(t.word_ids())

True

In [20]:
# align and encode labels for tokens
def align_target(labels, word_ids):
  aligned_labels = []
  for word in word_ids:
    if word is None:
      label = -100
    else:
      label = label2id[labels[word]]
    aligned_labels.append(label)
  return aligned_labels

In [21]:
labels = data['train'][idx]['target']
word_ids = t.word_ids()
aligned_targets = align_target(labels, word_ids)
print(aligned_targets)

[-100, 10, 6, 6, 3, 6, 8, 6, 10, 6, 1, 6, 6, 6, 3, 6, 6, 8, 5, 5, 10, 6, 5, 5, 1, 10, 6, 6, 8, 6, 5, -100]


In [22]:
aligned_labels = [id2label[i] if i >= 0 else None for i in aligned_targets]
for x, y in zip(t.tokens(), aligned_labels):
  print(f"{x}\t{y}")

[CLS]	None
The	DET
Fulton	NOUN
County	NOUN
Grand	ADJ
Jury	NOUN
said	VERB
Friday	NOUN
an	DET
investigation	NOUN
of	ADP
Atlanta	NOUN
'	NOUN
s	NOUN
recent	ADJ
primary	NOUN
election	NOUN
produced	VERB
`	.
`	.
no	DET
evidence	NOUN
'	.
'	.
that	ADP
any	DET
irregular	NOUN
##ities	NOUN
took	VERB
place	NOUN
.	.
[SEP]	None


In [23]:
# Define tokenize function
def tokenize_fn(batch):
  tokenized_inputs = tokenizer(
      batch['input'], truncation=True, is_split_into_words=True
  )
  labels_batch = batch['target']
  aligned_labels_batch = []
  for i, labels in enumerate(labels_batch):
    word_ids = tokenized_inputs.word_ids(i)
    aligned_labels_batch.append(align_target(labels, word_ids))
  tokenized_inputs['labels'] = aligned_labels_batch
  return tokenized_inputs

In [24]:
# Apply tokenize_fn for train and test dataset
tokenized_ds = ds.map(
    tokenize_fn,
    batched=True,
    remove_columns=ds['train'].column_names
)

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [25]:
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 15000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
})

In [26]:
from transformers import DataCollatorForTokenClassification

# Create data_collator for padding to tokens each batch
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [27]:
# flatten the list of labels for compute metrics
def flatten(lists_of_lists):
  flattened = [val for sublist in lists_of_lists for val in sublist]
  return flattened

In [28]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

# Define compute_metrics function
def compute_metrics(logits_and_labels):
  logits, labels = logits_and_labels
  preds = np.argmax(logits, axis=-1)
  # Remove -100 from labels and predictions
  labels_jagged = [[t for t in label if t != -100] for label in labels]
  preds_jagged = [[p for p,t in zip(ps, ts) if t != -100] for ps, ts in zip(preds, labels)]
  # Flatten labels_jagged and preds_jagged 
  labels_flat = flatten(labels_jagged)
  preds_flat = flatten(preds_jagged)
  # Compute metrics
  acc = accuracy_score(labels_flat, preds_flat)
  f1 = f1_score(labels_flat, preds_flat, average='macro')

  return {
      'accuracy': acc,
      'f1': f1
  }

# Test compute_metrics
labels = [[-100, 0, 0, 1, 2, 1, -100]]
logits = np.array([[
    [0.8, 0.1, 0.1],
    [0.8, 0.1, 0.1],
    [0.8, 0.1, 0.1],
    [0.1, 0.8, 0.1],
    [0.1, 0.8, 0.1],
    [0.1, 0.8, 0.1],
    [0.1, 0.8, 0.1],
]])

compute_metrics((logits, labels))

{'accuracy': 0.8, 'f1': 0.6}

In [29]:
from transformers import AutoModelForTokenClassification
import torch
# Load model from checkpoint
model = AutoModelForTokenClassification.from_pretrained(checkpoint, id2label=id2label, label2id=label2id)

Downloading pytorch_model.bin:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this 

In [30]:
from torchinfo import summary
summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForTokenClassification                        --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              22,268,928
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Dropout: 1-2                                          --
├─Linear: 1-3                                           9,228
Total params: 65,200,140
Trainable params: 65,200,140
Non-trainable params: 0

In [31]:
model

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
          

In [32]:
from transformers import TrainingArguments

# Create args object
checkpoint_path = '/content/drive/MyDrive/udemy_course/saved_model/pos_tagger/checkpoint/'
training_args = TrainingArguments(
    checkpoint_path,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=3
)

In [33]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [34]:
trainer.train()

***** Running training *****
  Num examples = 15000
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2814
  Number of trainable parameters = 65200140
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1793,0.057443,0.983513,0.940488
2,0.0319,0.055071,0.98498,0.949715
3,0.0153,0.055687,0.985802,0.959624


***** Running Evaluation *****
  Num examples = 5000
  Batch size = 32
Saving model checkpoint to /content/drive/MyDrive/udemy_course/saved_model/pos_tagger/checkpoint/checkpoint-938
Configuration saved in /content/drive/MyDrive/udemy_course/saved_model/pos_tagger/checkpoint/checkpoint-938/config.json
Model weights saved in /content/drive/MyDrive/udemy_course/saved_model/pos_tagger/checkpoint/checkpoint-938/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/udemy_course/saved_model/pos_tagger/checkpoint/checkpoint-938/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/udemy_course/saved_model/pos_tagger/checkpoint/checkpoint-938/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 5000
  Batch size = 32
Saving model checkpoint to /content/drive/MyDrive/udemy_course/saved_model/pos_tagger/checkpoint/checkpoint-1876
Configuration saved in /content/drive/MyDrive/udemy_course/saved_model/pos_tagger/checkpoint/checkpoint-1876/co

TrainOutput(global_step=2814, training_loss=0.05724446182142527, metrics={'train_runtime': 371.2814, 'train_samples_per_second': 121.202, 'train_steps_per_second': 7.579, 'total_flos': 684880792622208.0, 'train_loss': 0.05724446182142527, 'epoch': 3.0})

In [35]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 5000
  Batch size = 32


{'eval_loss': 0.055687446147203445,
 'eval_accuracy': 0.9858019563632665,
 'eval_f1': 0.9596239447005989,
 'eval_runtime': 12.2478,
 'eval_samples_per_second': 408.237,
 'eval_steps_per_second': 12.819,
 'epoch': 3.0}

In [None]:
save_path = '/content/drive/MyDrive/udemy_course/saved_model/pos_tagger/' 
trainer.save_model(save_path)

In [37]:
from transformers import pipeline

pos = pipeline(
    'token-classification',
    model=save_path,
)

loading configuration file /content/drive/MyDrive/udemy_course/saved_model/pos_tagger/config.json
Model config DistilBertConfig {
  "_name_or_path": "/content/drive/MyDrive/udemy_course/saved_model/pos_tagger/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "X",
    "1": "ADP",
    "2": "PRON",
    "3": "ADJ",
    "4": "ADV",
    "5": ".",
    "6": "NOUN",
    "7": "PRT",
    "8": "VERB",
    "9": "CONJ",
    "10": "DET",
    "11": "NUM"
  },
  "initializer_range": 0.02,
  "label2id": {
    ".": 5,
    "ADJ": 3,
    "ADP": 1,
    "ADV": 4,
    "CONJ": 9,
    "DET": 10,
    "NOUN": 6,
    "NUM": 11,
    "PRON": 2,
    "PRT": 7,
    "VERB": 8,
    "X": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropou

In [38]:
pos("Bill Gate was the CEO of Microsoft in Seattle, Washington.")

[{'entity': 'NOUN',
  'score': 0.99970764,
  'index': 1,
  'word': 'Bill',
  'start': 0,
  'end': 4},
 {'entity': 'NOUN',
  'score': 0.99967885,
  'index': 2,
  'word': 'Gate',
  'start': 5,
  'end': 9},
 {'entity': 'VERB',
  'score': 0.9998385,
  'index': 3,
  'word': 'was',
  'start': 10,
  'end': 13},
 {'entity': 'DET',
  'score': 0.99988854,
  'index': 4,
  'word': 'the',
  'start': 14,
  'end': 17},
 {'entity': 'NOUN',
  'score': 0.9997856,
  'index': 5,
  'word': 'CEO',
  'start': 18,
  'end': 21},
 {'entity': 'ADP',
  'score': 0.9998041,
  'index': 6,
  'word': 'of',
  'start': 22,
  'end': 24},
 {'entity': 'NOUN',
  'score': 0.9996451,
  'index': 7,
  'word': 'Microsoft',
  'start': 25,
  'end': 34},
 {'entity': 'ADP',
  'score': 0.9998512,
  'index': 8,
  'word': 'in',
  'start': 35,
  'end': 37},
 {'entity': 'NOUN',
  'score': 0.99984956,
  'index': 9,
  'word': 'Seattle',
  'start': 38,
  'end': 45},
 {'entity': '.',
  'score': 0.9998996,
  'index': 10,
  'word': ',',
  'sta