<a href="https://colab.research.google.com/github/ucheokechukwu/zero_to_mastery_courses/blob/main/HuggingFace_NLP_Course/6_The_Tokenizer_Library.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction

https://huggingface.co/learn/nlp-course/chapter6/1?fw=tf

- how to train a new tokenizer similar to the one used by a given checkpoint on a new corpus of texts

- special features of fast tokenizers

- differences between the 3 main subword tokenizer algorithms

- how to build a tokenizer from scratch with the Tokenizer lirbary and train on some data.

# Training A New Tokenizer from an old one

* not the same as training a model.

* models use gradient descent to reduce loss

* tokenizer training is statistical - identifying the subwords that are best to pick from a corpus

* it is deterministic - the results are always the same

## assembling a corpus
`AutoTokenizer.train_new_from_iterator()`

In [None]:
!pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# get the corpus
from datasets import load_dataset
raw_datasets = load_dataset("code_search_net", "python")

# examine the columns in training split
raw_datasets['train']

Downloading builder script:   0%|          | 0.00/8.44k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/18.5k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/941M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/412178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/22176 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23107 [00:00<?, ? examples/s]

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 412178
})

In [None]:
print(raw_datasets['train'][123456]['whole_func_string'])

def check_result(running, recurse=False, highstate=None):
    '''
    Check the total return value of the run and determine if the running
    dict has any issues
    '''
    if not isinstance(running, dict):
        return False

    if not running:
        return False

    ret = True
    for state_id, state_result in six.iteritems(running):
        expected_type = dict
        # The __extend__ state is a list
        if "__extend__" == state_id:
            expected_type = list
        if not recurse and not isinstance(state_result, expected_type):
            ret = False
        if ret and isinstance(state_result, dict):
            result = state_result.get('result', _empty)
            if result is False:
                ret = False
            # only override return value if we are not already failed
            elif result is _empty and isinstance(state_result, dict) and ret:
                ret = check_result(
                    state_result, recurse=True, highstate=highstate

### creating a generator

**Note**: We need to transform the dataset into an iterator of lists of texts. It will enable the tokenizer to go faster (training on batches) and avoid having everything in ememory

In [None]:
# create a generator
training_corpus = (
    raw_datasets['train'][i: i + 1000]['whole_func_string']
    for i in range(0, len(raw_datasets['train']), 1000)
)

In [None]:
gen = (i for i in range(10))
print(list(gen))
print(list(gen))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[]


In [None]:
# a generator can only be used once, so we have to define a function instead
def get_training_corpus():
    return(
    raw_datasets['train'][i : i+1000]['whole_func_string']
    for i in range(0, len(raw_datasets['train']), 1000)
)

In [None]:
# does the same as a above but uses a list and a yield statement
def get_training_corpus():
    dataset = raw_datasets['train']
    for start_idx in range(0, len(dataset), 1000):
        samples = dataset[start_idx : start_idx + 1000]
        yield samples['whole_func_string']

In [None]:
training_corpus = get_training_corpus()
type(training_corpus)

generator

# Training a new tokenizer

In [None]:
from transformers import AutoTokenizer
old_tokenizer = AutoTokenizer.from_pretrained('gpt2')

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
example = '''def add_numbers(a,b):
    """Add the two numbers `a` and `b`."""
    return a + b'''

tokens = old_tokenizer.tokenize(example)
tokens

['def',
 'Ġadd',
 '_',
 'n',
 'umbers',
 '(',
 'a',
 ',',
 'b',
 '):',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ"""',
 'Add',
 'Ġthe',
 'Ġtwo',
 'Ġnumbers',
 'Ġ`',
 'a',
 '`',
 'Ġand',
 'Ġ`',
 'b',
 '`',
 '."',
 '""',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġreturn',
 'Ġa',
 'Ġ+',
 'Ġb']

In [None]:
# train a new tokenizer to see if it's more efficient
tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 52000)

In [None]:
tokens = tokenizer.tokenize(example)
tokens

['def',
 'Ġadd',
 '_',
 'numbers',
 '(',
 'a',
 ',',
 'b',
 '):',
 'ĊĠĠĠ',
 'Ġ"""',
 'Add',
 'Ġthe',
 'Ġtwo',
 'Ġnumbers',
 'Ġ`',
 'a',
 '`',
 'Ġand',
 'Ġ`',
 'b',
 '`."""',
 'ĊĠĠĠ',
 'Ġreturn',
 'Ġa',
 'Ġ+',
 'Ġb']

In [None]:
len(tokens), len(old_tokenizer.tokenize(example))

(27, 36)

In [None]:
# another example
example = """class LinearLayer():
    def __init__(self, input_size, output_size):
        self.weight = torch.randn(input_size, output_size)
        self.bias = torch.zeros(output_size)

        def __call__(self, x):
            return x @ self.weights + self.bias
        """
tokenizer.tokenize(example)

['class',
 'ĠLinear',
 'Layer',
 '():',
 'ĊĠĠĠ',
 'Ġdef',
 'Ġ__',
 'init',
 '__(',
 'self',
 ',',
 'Ġinput',
 '_',
 'size',
 ',',
 'Ġoutput',
 '_',
 'size',
 '):',
 'ĊĠĠĠĠĠĠĠ',
 'Ġself',
 '.',
 'weight',
 'Ġ=',
 'Ġtorch',
 '.',
 'randn',
 '(',
 'input',
 '_',
 'size',
 ',',
 'Ġoutput',
 '_',
 'size',
 ')',
 'ĊĠĠĠĠĠĠĠ',
 'Ġself',
 '.',
 'bias',
 'Ġ=',
 'Ġtorch',
 '.',
 'zeros',
 '(',
 'output',
 '_',
 'size',
 ')',
 'ĊĊĠĠĠĠĠĠĠ',
 'Ġdef',
 'Ġ__',
 'call',
 '__(',
 'self',
 ',',
 'Ġx',
 '):',
 'ĊĠĠĠĠĠĠĠĠĠĠĠ',
 'Ġreturn',
 'Ġx',
 'Ġ@',
 'Ġself',
 '.',
 'weights',
 'Ġ+',
 'Ġself',
 '.',
 'bias',
 'ĊĠĠĠĠĠĠĠĠ']

In [None]:
from pathlib import Path
dirpath = "/content/drive/MyDrive/MLOPs_Projects/HuggingFace/6_Tokenizer/"
Path(dirpath).mkdir(parents=True, exist_ok=True)
tokenizer.save_pretrained(dirpath+'code-search-net-tokenizer')

('code-search-net-tokenizer/tokenizer_config.json',
 'code-search-net-tokenizer/special_tokens_map.json',
 'code-search-net-tokenizer/vocab.json',
 'code-search-net-tokenizer/merges.txt',
 'code-search-net-tokenizer/added_tokens.json',
 'code-search-net-tokenizer/tokenizer.json')

In [None]:
help(Path.mkdir)

Help on function mkdir in module pathlib:

mkdir(self, mode=511, parents=False, exist_ok=False)
    Create a new directory at this given path.



In [None]:
ls /content/drive/MyDrive/MLOPs_Projects/

ls: cannot access '/content/drive/MyDrive/MLOPs_Projects/': No such file or directory


In [None]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
tokenizer.push_to_hub('code-search-net-tokenizer')

RepositoryNotFoundError: ignored

# fast tokenizer special powers

## batch encoding

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
encoding = tokenizer(example)
print(type(encoding))

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

<class 'transformers.tokenization_utils_base.BatchEncoding'>


In [None]:
# checking if the tokenizer is fast or slow
tokenizer.is_fast, encoding.is_fast

(True, True)

In [None]:
# fast tokenizer allows us to access teh encodings without having to re-convert
encoding.tokens()

['[CLS]',
 'My',
 'name',
 'is',
 'S',
 '##yl',
 '##va',
 '##in',
 'and',
 'I',
 'work',
 'at',
 'Hu',
 '##gging',
 'Face',
 'in',
 'Brooklyn',
 '.',
 '[SEP]']

In [None]:
# get the index of the word of each token
encoding.word_ids()

[None, 0, 1, 2, 3, 3, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, None]

In [None]:
robert_tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [None]:
robert_tokenizer("81s").tokens(), tokenizer("81s").tokens()

(['<s>', '81', 's', '</s>'], ['[CLS]', '81', '##s', '[SEP]'])

In [None]:
robert_tokenizer("81s").word_ids(), tokenizer("81s").word_ids()

([None, 0, 1, None], [None, 0, 0, None])

In [72]:
start, end = encoding.word_to_chars(3)
example[start:end]

'Sylvain'

## inside the `token-classification` pipeline

identfy which parts of the text correspond to entities like persons, locations or organizations.

### getting the base results with the `pipeline`

In [73]:
from transformers import pipeline
token_classifier = pipeline('token-classification')
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

[{'entity': 'I-PER',
  'score': 0.99938285,
  'index': 4,
  'word': 'S',
  'start': 11,
  'end': 12},
 {'entity': 'I-PER',
  'score': 0.99815494,
  'index': 5,
  'word': '##yl',
  'start': 12,
  'end': 14},
 {'entity': 'I-PER',
  'score': 0.99590707,
  'index': 6,
  'word': '##va',
  'start': 14,
  'end': 16},
 {'entity': 'I-PER',
  'score': 0.99923277,
  'index': 7,
  'word': '##in',
  'start': 16,
  'end': 18},
 {'entity': 'I-ORG',
  'score': 0.9738931,
  'index': 12,
  'word': 'Hu',
  'start': 33,
  'end': 35},
 {'entity': 'I-ORG',
  'score': 0.976115,
  'index': 13,
  'word': '##gging',
  'start': 35,
  'end': 40},
 {'entity': 'I-ORG',
  'score': 0.9887976,
  'index': 14,
  'word': 'Face',
  'start': 41,
  'end': 45},
 {'entity': 'I-LOC',
  'score': 0.9932106,
  'index': 16,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

In [74]:
# grouping together tokens that correspond ot the same entitiy
from transformers import pipeline
token_classifier = pipeline("token-classification", aggregation_strategy="simple")
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity_group': 'PER',
  'score': 0.9981694,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.9796019,
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.9932106,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

### from inputs to predictions

we want to obtain the above results without using the `pipeline()` function.

step 1. tokenize the inputs and pass it through the model.

step 2. get the prediction by applying softmax and argmax to the output logits, and get the labels using the `id2label` dictionary.

step 3. get the `start` and `end` of each entity with offset mapping

In [80]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
inputs = tokenizer(example, return_tensors='pt')
outputs = model(**inputs)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [90]:
print(inputs["input_ids"].shape)
print(outputs.logits.shape) # Note that the model is a 9-class multiclassifier

torch.Size([1, 19])
torch.Size([1, 19, 9])


In [91]:
import torch

probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist()
predictions = outputs.logits.argmax(dim=-1)[0].tolist()
print(predictions)

[0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 6, 6, 6, 0, 8, 0, 0]


In [92]:
# mapping of indicies to labels
# B means the beginning of an entity(word) and I means inside or end of the entity
model.config.id2label

{0: 'O',
 1: 'B-MISC',
 2: 'I-MISC',
 3: 'B-PER',
 4: 'I-PER',
 5: 'B-ORG',
 6: 'I-ORG',
 7: 'B-LOC',
 8: 'I-LOC'}

In [96]:
# grab the score and label

results = []
tokens = inputs.tokens()

for idx, pred in enumerate(predictions):
    label = model.config.id2label[pred]
    if label != 'O':
        results.append(
            {"entity": label,
             "score": probabilities[idx][pred], # get the probability of the predicted label
             "word": tokens[idx]}
        )
print(results)

[{'entity': 'I-PER', 'score': 0.9993828535079956, 'word': 'S'}, {'entity': 'I-PER', 'score': 0.9981548190116882, 'word': '##yl'}, {'entity': 'I-PER', 'score': 0.995907187461853, 'word': '##va'}, {'entity': 'I-PER', 'score': 0.9992327690124512, 'word': '##in'}, {'entity': 'I-ORG', 'score': 0.9738931059837341, 'word': 'Hu'}, {'entity': 'I-ORG', 'score': 0.9761149883270264, 'word': '##gging'}, {'entity': 'I-ORG', 'score': 0.9887974858283997, 'word': 'Face'}, {'entity': 'I-LOC', 'score': 0.99321049451828, 'word': 'Brooklyn'}]


In [97]:
# to get offset mapping
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
inputs_with_offsets.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'])

In [98]:
inputs_with_offsets.offset_mapping

[(0, 0),
 (0, 2),
 (3, 7),
 (8, 10),
 (11, 12),
 (12, 14),
 (14, 16),
 (16, 18),
 (19, 22),
 (23, 24),
 (25, 29),
 (30, 32),
 (33, 35),
 (35, 40),
 (41, 45),
 (46, 48),
 (49, 57),
 (57, 58),
 (0, 0)]

In [99]:
test = {'a':'b'}

In [112]:
results = []
inputs_with_offsets = tokenizer(example,
                                # return_tensor='pt',
                                return_offsets_mapping=True
                                )
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets['offset_mapping']

for idx, pred in enumerate(predictions):
    label = model.config.id2label[pred]

    if label !='O':
        start, end = offsets[idx]
        results.append(
            {
                "entity": label,
                "score": probabilities[idx][pred],
                "word": tokens[idx],
                "start": start,
                "end": end
            }
        )
results

[{'entity': 'I-PER',
  'score': 0.9993828535079956,
  'word': 'S',
  'start': 11,
  'end': 12},
 {'entity': 'I-PER',
  'score': 0.9981548190116882,
  'word': '##yl',
  'start': 12,
  'end': 14},
 {'entity': 'I-PER',
  'score': 0.995907187461853,
  'word': '##va',
  'start': 14,
  'end': 16},
 {'entity': 'I-PER',
  'score': 0.9992327690124512,
  'word': '##in',
  'start': 16,
  'end': 18},
 {'entity': 'I-ORG',
  'score': 0.9738931059837341,
  'word': 'Hu',
  'start': 33,
  'end': 35},
 {'entity': 'I-ORG',
  'score': 0.9761149883270264,
  'word': '##gging',
  'start': 35,
  'end': 40},
 {'entity': 'I-ORG',
  'score': 0.9887974858283997,
  'word': 'Face',
  'start': 41,
  'end': 45},
 {'entity': 'I-LOC',
  'score': 0.99321049451828,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

In [106]:
inputs_with_offsets.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'])

In [109]:
predictions

[0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 6, 6, 6, 0, 8, 0, 0]

### grouping entities

offsets are useful for grouping entities

In [113]:
example[33:45]

'Hugging Face'

In [118]:
import numpy as np

results = []
inputs_with_offsets = tokenizer(example,
                                return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets['offset_mapping']



idx = 0
while idx < len(predictions):
    pred = predictions[idx]
    label = model.config.id2label[pred]

    if label !='O':
        # remove the B- and I-
        label = label[2:]
        start, _ = offsets[idx]

        # grab al the tokens with the I-label
        all_scores = []
        while (
            idx < len(predictions)
            and model.config.id2label[predictions[idx]] ==f"I-{label}"
        ):
            all_scores.append(probabilities[idx][pred])
            _, end = offsets[idx]

            idx += 1

        # the score is the mean of all the scores of the tokens in that grouped entity
        score = np.mean(all_scores).item()
        word = example[start:end]
        results.append(
            {
                "entity-group": label,
                "score": score,
                "word": word,
                "start": start,
                "end": end
                }
        )
    idx +=1

results


[{'entity-group': 'PER',
  'score': 0.998169407248497,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity-group': 'ORG',
  'score': 0.9796018600463867,
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity-group': 'LOC',
  'score': 0.99321049451828,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

# Fast tokenizers in the QA pipeline

- model is trained to predict the index of the token starting the answer and ending the answer. So it doesn't return one tensor of logits but two.

- questions are not truncated, but are 'chunkified' with overlap.

In [161]:
from transformers import pipeline

question_answerer = pipeline("question-answering")
context = """
🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch, and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
question = "Which deep learning libraries back 🤗 Transformers?"
question_answerer(question=question, context=context)


No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


{'score': 0.9802603125572205,
 'start': 78,
 'end': 106,
 'answer': 'Jax, PyTorch, and TensorFlow'}

In [160]:
long_context = """
🤗 Transformers: State of the Art NLP

🤗 Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction,
question answering, summarization, translation, text generation and more in over 100 languages.
Its aim is to make cutting-edge NLP easier to use for everyone.

🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets and
then share them with the community on our model hub. At the same time, each python module defining an architecture is fully standalone and
can be modified to enable quick research experiments.

Why should I use transformers?

1. Easy-to-use state-of-the-art models:
  - High performance on NLU and NLG tasks.
  - Low barrier to entry for educators and practitioners.
  - Few user-facing abstractions with just three classes to learn.
  - A unified API for using all our pretrained models.
  - Lower compute costs, smaller carbon footprint:

2. Researchers can share trained models instead of always retraining.
  - Practitioners can reduce compute time and production costs.
  - Dozens of architectures with over 10,000 pretrained models, some in more than 100 languages.

3. Choose the right framework for every part of a model's lifetime:
  - Train state-of-the-art models in 3 lines of code.
  - Move a single model between TF2.0/PyTorch frameworks at will.
  - Seamlessly pick the right framework for training, evaluation and production.

4. Easily customize a model or an example to your needs:
  - We provide examples for each architecture to reproduce the results published by its original authors.
  - Model internals are exposed as consistently as possible.
  - Model files can be used independently of the library for quick experiments.

🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
question_answerer(question=question, context=long_context)

{'score': 0.9714871048927307,
 'start': 1892,
 'end': 1919,
 'answer': 'Jax, PyTorch and TensorFlow'}

## Using a model for question answering

In [163]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
model_checkpoint = "distilbert-base-cased-distilled-squad"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

inputs = tokenizer(question, context, return_tensors="pt")
outputs = model(**inputs)

In [164]:
outputs.keys()

odict_keys(['start_logits', 'end_logits'])

In [165]:
start_logits = outputs.start_logits
end_logits = outputs.end_logits
start_logits.shape, end_logits.shape

(torch.Size([1, 67]), torch.Size([1, 67]))

In [166]:
import torch
sequence_ids = inputs.sequence_ids()
# masking everything except the tokens of the context
mask = [i !=1 for i in sequence_ids]
# unmask the [CLS] token
mask[0] = False
mask = torch.tensor(mask)[None]
start_logits[mask] = -10000 # large negative number that will softmax to 0
end_logits[mask] = -10000


# now that we've masked, we can apply softmax...

start_probabilities= torch.nn.functional.softmax(start_logits, dim=-1)[0]
end_probabilities = torch.nn.functional.softmax(end_logits, dim=-1)[0]

Assuming the events “The answer starts at `start_index`” and “The answer ends at `end_index`” to be independent, the probability that the answer starts at start_index and ends at end_index is:


s
t
a
r
t
_
p
r
o
b
a
b
i
l
i
t
i
e
s
[
s
t
a
r
t
_
i
n
d
e
x
]
×
e
n
d
_
p
r
o
b
a
b
i
l
i
t
i
e
s
[
e
n
d
_
i
n
d
e
x
]
start_probabilities[start_index]×end_probabilities[end_index]

So, to compute all the scores, we just need to compute all the products

s
t
a
r
t
_
p
r
o
b
a
b
i
l
i
t
i
e
s
[
s
t
a
r
t
_
i
n
d
e
x
]
×
e
n
d
_
p
r
o
b
a
b
i
l
i
t
i
e
s
[
e
n
d
_
i
n
d
e
x
]
start_probabilities[start_index]×end_probabilities[end_index] where start_index <= end_index.



In [128]:
scores = start_probabilities[:, None] * end_probabilities[None, :]
scores.shape

torch.Size([67, 67])

In [130]:
# mask the values where start_index > end_index by setting to 0
# using torch.trui()

scores = torch.triu(scores)
scores.shape

torch.Size([67, 67])

In [131]:
# get the index of the maximum....
max_index = scores.argmax().item()
start_index = max_index // scores.shape[1]
end_index = max_index % scores.shape[1]

# alternative way to the argmax of a 2-D vector
(scores == torch.max(scores)).nonzero()


print(scores[start_index, end_index])

tensor(0.9803, grad_fn=<SelectBackward0>)


In [182]:
# convert to the character indices using offset mapping

inputs_with_offsets = tokenizer(question, context, return_offsets_mapping=True)
offsets = inputs_with_offsets['offset_mapping']
print(len(offsets))


start_char, _ = offsets[start_index]
_, end_char = offsets[end_index]
answer = context[start_char:end_char]

67


In [183]:
result = {
    'answer': answer,
    'start': start_char,
    'end': end_char,
    'score': scores[start_index, end_index],
}
print(result)

{'answer': 'Jax, PyTorch, and TensorFlow', 'start': 78, 'end': 106, 'score': tensor(0.9803, grad_fn=<SelectBackward0>)}


## Handling long contexts

`return_overflowing_tokens=True`

if the long_context is split into N chunks, we will haev N sets of start and end logits

In [184]:
inputs = tokenizer(question, long_context)
len(inputs['input_ids'])

461

In [185]:
# we need to chunk out input

sentence = "This sentence is not too long but we will split it anyway"

inputs = tokenizer(
    sentence, truncation=True, return_overflowing_tokens=True, max_length=6, stride=2
)

for ids in inputs.input_ids:
    print(tokenizer.decode(ids))

[CLS] This sentence is not [SEP]
[CLS] is not too long [SEP]
[CLS] too long but we [SEP]
[CLS] but we will split [SEP]
[CLS] will split it anyway [SEP]


In [186]:
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'overflow_to_sample_mapping'])

In [187]:
inputs.overflow_to_sample_mapping

[0, 0, 0, 0, 0]

In [188]:
sentences = [
    "This sentence is not too long but we are going to split it anyway.",
    "This sentence is shorter but will still get split.",
]
inputs = tokenizer(
    sentences, truncation=True, return_overflowing_tokens=True, max_length=6, stride=2
)

print(inputs["overflow_to_sample_mapping"])

[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]


In [189]:
for ids in inputs.input_ids:
    print(tokenizer.decode(ids))

[CLS] This sentence is not [SEP]
[CLS] is not too long [SEP]
[CLS] too long but we [SEP]
[CLS] but we are going [SEP]
[CLS] are going to split [SEP]
[CLS] to split it anyway [SEP]
[CLS] it anyway. [SEP]
[CLS] This sentence is shorter [SEP]
[CLS] is shorter but will [SEP]
[CLS] but will still get [SEP]
[CLS] still get split. [SEP]


In [194]:
# tokenizing our long_context

inputs = tokenizer(
    question,
    long_context,
    stride=128,
    max_length=384,
    padding="longest",
    truncation='only_second',
    return_overflowing_tokens=True,
    return_offsets_mapping=True
)


# offsets and overflow_to_sampling are not used by the model so we'll pop them out

_ = inputs.pop('overflow_to_sample_mapping')
offsets = inputs.pop('offset_mapping')

inputs = inputs.convert_to_tensors('pt')
print(inputs.input_ids.shape)


torch.Size([2, 384])


In [195]:
# 2 sets of start ane end logits are expected
outputs = model(**inputs)

start_logits = outputs.start_logits
end_logits = outputs.end_logits
start_logits.shape, end_logits.shape

(torch.Size([2, 384]), torch.Size([2, 384]))

In [196]:
# following the previous steps:

# 1. mask the tokens that are not part of the context, and the padding tokens

sequence_ids = inputs.sequence_ids()
mask = [i !=1 for i in sequence_ids]
# unmask the CLS token
mask[0] = False
# mask the PAD tokens
mask = torch.logical_or(torch.tensor(mask)[None],
                        (inputs['attention_mask'] == 0))

start_logits[mask] = -10000
end_logits[mask] = -10000


# then use softmax to conver logits to probabilities
start_probabilities = torch.nn.functional.softmax(start_logits, dim=-1)
end_probabilities = torch.nn.functional.softmax(end_logits, dim=-1)

In [223]:
# now multiply the probabilities,
# select only the start < end
# find the argmax

candidates = []
for start_probs, end_probs in zip(start_probabilities, end_probabilities):
    scores = start_probs[:, None] * end_probs[None, :]
    scores = scores.triu()

    start_idx, end_idx = (scores == scores.max()).nonzero().squeeze()
    score = scores.max().item()
    candidates.append((start_idx, end_idx, score))
candidates

[(tensor(0), tensor(18), 0.3386707305908203),
 (tensor(173), tensor(184), 0.9714868664741516)]

In [218]:
a, b = torch.tensor([[ 0, 18]]).squeeze()
a, b

(tensor(0), tensor(18))

In [198]:
torch.max(scores)

tensor(0.9803, grad_fn=<MaxBackward1>)

In [224]:
# map the start and end index to th echaracters

for candidate, offset in zip(candidates, offsets):
    start_token, end_token, score = candidate
    start_char, _ = offset[start_token]
    _, end_char = offset[end_token]
    answer = long_context[start_char:end_char]
    result = {"answer": answer, "start": start_char, "end": end_char, "score": score}
    print(result)

{'answer': '\n🤗 Transformers: State of the Art NLP', 'start': 0, 'end': 37, 'score': 0.3386707305908203}
{'answer': 'Jax, PyTorch and TensorFlow', 'start': 1892, 'end': 1919, 'score': 0.9714868664741516}
