# Train a new tokenizer from an old one

## Load the code corpus 

In [2]:
from datasets import load_dataset

# Load the 'python' configuration of the 'code_search_net' dataset.
raw_datasets = load_dataset('code_search_net', 'python')
# Display the train split; the notebook shows its feature names and row count.
raw_datasets['train']

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 412178
})

In [10]:
# Dump every field of the fifth training example, one "name value" pair per line.
fifth_example = raw_datasets["train"][4]
for field_name in fifth_example:
    print(field_name, fifth_example[field_name])

repository_name ageitgey/face_recognition
func_path_in_repository face_recognition/api.py
func_name _trim_css_to_bounds
whole_func_string def _trim_css_to_bounds(css, image_shape):
    """
    Make sure a tuple in (top, right, bottom, left) order is within the bounds of the image.

    :param css:  plain tuple representation of the rect in (top, right, bottom, left) order
    :param image_shape: numpy shape of the image array
    :return: a trimmed plain tuple representation of the rect in (top, right, bottom, left) order
    """
    return max(css[0], 0), min(css[1], image_shape[1]), min(css[2], image_shape[0]), max(css[3], 0)
language python
func_code_string def _trim_css_to_bounds(css, image_shape):
    """
    Make sure a tuple in (top, right, bottom, left) order is within the bounds of the image.

    :param css:  plain tuple representation of the rect in (top, right, bottom, left) order
    :param image_shape: numpy shape of the image array
    :return: a trimmed plain tuple repr

## Create a Python generator to load only batches to memory

In [15]:
# These two ways of defining generators are equivalent
def get_training_corpus():
    """Yield the training functions' source code in batches of 1000 strings.

    Slices `raw_datasets["train"]` one window at a time and yields that
    window's "whole_func_string" column, so only one batch is sliced per step.
    """
    train_split = raw_datasets["train"]
    n_rows = len(train_split)
    batch_start = 0
    while batch_start < n_rows:
        batch = train_split[batch_start : batch_start + 1000]
        yield batch["whole_func_string"]
        batch_start += 1000
        
def get_training_corpus():
    """Return a generator over 1000-example batches of training source code.

    Each item is the "whole_func_string" column of one slice of
    `raw_datasets['train']`. The batch start offsets are computed when the
    function is called; the slices themselves are taken lazily on iteration.
    """
    batch_starts = range(0, len(raw_datasets['train']), 1000)
    return (
        raw_datasets['train'][lo : lo + 1000]['whole_func_string']
        for lo in batch_starts
    )

# Instantiate the corpus generator; no batch is sliced out of the dataset until it is iterated.
training_corpus = get_training_corpus()
# Confirm that the call returned a generator object (a lazy, single-pass iterator).
type(training_corpus)

generator

## Training

In [17]:
from transformers import AutoTokenizer
# Start from the pretrained GPT-2 tokenizer; a new tokenizer is trained from it below.
old_tokenizer = AutoTokenizer.from_pretrained('gpt2')

# A small Python function used to compare tokenizations before and after retraining.
example = '''def add_numbers(a, b):
    """Add the two numbers `a` and `b`."""
    return a + b'''

tokens = old_tokenizer.tokenize(example)
# In the printed tokens, Ġ marks a space and Ċ marks a newline.
print(tokens)

['def', 'Ġadd', '_', 'n', 'umbers', '(', 'a', ',', 'Ġb', '):', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ"""', 'Add', 'Ġthe', 'Ġtwo', 'Ġnumbers', 'Ġ`', 'a', '`', 'Ġand', 'Ġ`', 'b', '`', '."', '""', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġreturn', 'Ġa', 'Ġ+', 'Ġb']


In [20]:
# Train a new tokenizer from the old one on the batched code corpus; %time reports how long it takes.
# NOTE(review): 52000 is presumably the target vocabulary size — confirm against the transformers docs.
# training_corpus is a generator, so this call consumes it; recreate it before re-running this cell.
%time tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 52000)

In [22]:
# Tokenize the same example with the newly trained tokenizer, for comparison with the GPT-2 output.
tokens = tokenizer.tokenize(example)
print(tokens)

['def', 'Ġadd', '_', 'numbers', '(', 'a', ',', 'Ġb', '):', 'ĊĠĠĠ', 'Ġ"""', 'Add', 'Ġthe', 'Ġtwo', 'Ġnumbers', 'Ġ`', 'a', '`', 'Ġand', 'Ġ`', 'b', '`."""', 'ĊĠĠĠ', 'Ġreturn', 'Ġa', 'Ġ+', 'Ġb']


## Saving

In [25]:
# Write the trained tokenizer's files under the relative directory '.model/code-search-net-tokenizer'.
# The call returns the paths of the files it wrote, which the notebook displays.
tokenizer.save_pretrained('.model/code-search-net-tokenizer')

('.model/code-search-net-tokenizer\\tokenizer_config.json',
 '.model/code-search-net-tokenizer\\special_tokens_map.json',
 '.model/code-search-net-tokenizer\\vocab.json',
 '.model/code-search-net-tokenizer\\merges.txt',
 '.model/code-search-net-tokenizer\\added_tokens.json',
 '.model/code-search-net-tokenizer\\tokenizer.json')

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()
# tokenizer.push_to_hub("code-search-net-tokenizer")

# Fast tokenizers

## Fast tokenizers: batch encoding, offset mapping

In [15]:
from transformers import AutoTokenizer

# From here on `tokenizer` and `example` are rebound: they no longer refer to the code tokenizer above.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
example = 'My name is Sylvain and I work at Hugging Face in Brooklyn.'
# Calling the tokenizer returns a BatchEncoding object (see the printed type).
encoding = tokenizer(example)
print(type(encoding))
# List the attributes and methods available on the encoding.
print(dir(encoding))

<class 'transformers.tokenization_utils_base.BatchEncoding'>
['_MutableMapping__marker', '__abstractmethods__', '__class__', '__class_getitem__', '__contains__', '__copy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__ior__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__or__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__ror__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_encodings', '_n_sequences', 'char_to_token', 'char_to_word', 'clear', 'convert_to_tensors', 'copy', 'data', 'encodings', 'fromkeys', 'get', 'is_fast', 'items', 'keys', 'n_sequences', 'pop', 'popitem', 'sequence_ids', 'setdefault', 'to', 'token_to_chars', 'token_to_sequence', 'token_to_word', 'tokens', '

### Mapping between token, ids and words

In [22]:
# Token strings produced for the sentence, including the [CLS] and [SEP] special tokens.
print(encoding.tokens())
# Index of the source word for each token; special tokens map to None.
print(encoding.word_ids())
print(encoding.token_type_ids)  # all 0 here: the input is a single sentence

['[CLS]', 'My', 'name', 'is', 'S', '##yl', '##va', '##in', 'and', 'I', 'work', 'at', 'Hu', '##gging', 'Face', 'in', 'Brooklyn', '.', '[SEP]']
[None, 0, 1, 2, 3, 3, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, None]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [92]:
start, end = encoding.word_to_chars(3)  # word index -> character span in the original string
print(example[start:end])

start, end = encoding.token_to_chars(5)  # token index -> character span in the original string
print(encoding.tokens()[5], example[start:end])

ind = encoding.char_to_token(0)  # character position -> index of the token covering it
print(example[0], encoding.tokens()[ind])

ind = encoding.char_to_word(11)  # character position -> index of the word covering it
# Collect the positions of every token that belongs to that word.
word_ind = [i for i, j in enumerate(encoding.word_ids()) if j == ind]
print(example[11], [encoding.tokens()[i] for i in word_ind])

Sylvain
##yl yl
M My
S ['S', '##yl', '##va', '##in']


## Token-classification 

### As a pipeline

In [97]:
from transformers import pipeline

# The sentence reused by every token-classification cell below.
sentence = "My name is Sylvain and I work at Hugging Face in Brooklyn."
# No model is passed, so the pipeline falls back to its default checkpoint:
# 'dbmdz/bert-large-cased-finetuned-conll03-english'
token_classifier = pipeline('token-classification')
# Without an aggregation strategy, one prediction is returned per (sub)token.
print(token_classifier(sentence))

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity': 'I-PER', 'score': 0.99938285, 'index': 4, 'word': 'S', 'start': 11, 'end': 12}, {'entity': 'I-PER', 'score': 0.99815494, 'index': 5, 'word': '##yl', 'start': 12, 'end': 14}, {'entity': 'I-PER', 'score': 0.99590707, 'index': 6, 'word': '##va', 'start': 14, 'end': 16}, {'entity': 'I-PER', 'score': 0.99923277, 'index': 7, 'word': '##in', 'start': 16, 'end': 18}, {'entity': 'I-ORG', 'score': 0.9738931, 'index': 12, 'word': 'Hu', 'start': 33, 'end': 35}, {'entity': 'I-ORG', 'score': 0.976115, 'index': 13, 'word': '##gging', 'start': 35, 'end': 40}, {'entity': 'I-ORG', 'score': 0.9887976, 'index': 14, 'word': 'Face', 'start': 41, 'end': 45}, {'entity': 'I-LOC', 'score': 0.9932106, 'index': 16, 'word': 'Brooklyn', 'start': 49, 'end': 57}]


In [99]:
# Rebuild the pipeline so that consecutive tokens of the same entity are merged into one
# result carrying an 'entity_group', a single score and the full character span.
token_classifier = pipeline('token-classification', 
                            aggregation_strategy='simple',  
                            # other strategies: 'first', 'max', 'average'
)
token_classifier(sentence)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity_group': 'PER',
  'score': 0.9981694,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.9796019,
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.9932106,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

### Step by step - From inputs to predictions

In [119]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
# The same checkpoint the token-classification pipeline selected by default.
checkpoint = 'dbmdz/bert-large-cased-finetuned-conll03-english'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

# Tokenize the sentence into PyTorch tensors and run one forward pass.
inputs = tokenizer(sentence, return_tensors='pt')
outputs = model(**inputs)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [120]:
# One sentence of 19 tokens goes in ...
print(inputs['input_ids'].shape)
# ... and one logit per label (9 labels) comes out for each of those tokens.
print(outputs['logits'].shape)

torch.Size([1, 19])
torch.Size([1, 19, 9])


In [121]:
import torch

# The batch holds a single sentence, so work on its (tokens, labels) logits directly.
token_logits = outputs.logits[0]
probabilities = torch.softmax(token_logits, dim=-1).tolist()
predictions = torch.argmax(token_logits, dim=-1).tolist()
print(predictions)

# Meaning of the labels:
# - O marks tokens that are outside any named entity.
# - In the IOB1 format, B- labels are only used to separate two adjacent entities of the same type;
#   everywhere else entity tokens carry an I- label.

print(model.config.id2label)

[0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 6, 6, 6, 0, 8, 0, 0]
{0: 'O', 1: 'B-MISC', 2: 'I-MISC', 3: 'B-PER', 4: 'I-PER', 5: 'B-ORG', 6: 'I-ORG', 7: 'B-LOC', 8: 'I-LOC'}


In [122]:
tokens = inputs.tokens()

# Keep one record per token whose predicted label is not 'O' (outside any entity),
# pairing the label with its probability and the token text.
results = [
    {
        'entity': model.config.id2label[pred],
        'score': probabilities[idx][pred],
        'word': tokens[idx],
    }
    for idx, pred in enumerate(predictions)
    if model.config.id2label[pred] != 'O'
]

results

[{'entity': 'I-PER', 'score': 0.9993828535079956, 'word': 'S'},
 {'entity': 'I-PER', 'score': 0.9981549382209778, 'word': '##yl'},
 {'entity': 'I-PER', 'score': 0.995907187461853, 'word': '##va'},
 {'entity': 'I-PER', 'score': 0.9992327690124512, 'word': '##in'},
 {'entity': 'I-ORG', 'score': 0.9738931059837341, 'word': 'Hu'},
 {'entity': 'I-ORG', 'score': 0.9761149883270264, 'word': '##gging'},
 {'entity': 'I-ORG', 'score': 0.9887976050376892, 'word': 'Face'},
 {'entity': 'I-LOC', 'score': 0.9932106137275696, 'word': 'Brooklyn'}]

In [126]:
# Ask the tokenizer for the offset mapping: the (start, end) character span
# of every token in the original sentence.
inputs_with_offsets = tokenizer(sentence, return_offsets_mapping=True)
offsets = inputs_with_offsets['offset_mapping']
print(offsets)

tokens = inputs_with_offsets.tokens()

# Same per-token records as before, now extended with each token's character span.
results = [
    {
        'entity': model.config.id2label[pred],
        'score': probabilities[idx][pred],
        'word': tokens[idx],
        'start': offsets[idx][0],
        'end': offsets[idx][1],
    }
    for idx, pred in enumerate(predictions)
    if model.config.id2label[pred] != 'O'
]

print(results)

[(0, 0), (0, 2), (3, 7), (8, 10), (11, 12), (12, 14), (14, 16), (16, 18), (19, 22), (23, 24), (25, 29), (30, 32), (33, 35), (35, 40), (41, 45), (46, 48), (49, 57), (57, 58), (0, 0)]
[{'entity': 'I-PER', 'score': 0.9993828535079956, 'word': 'S', 'start': 11, 'end': 12}, {'entity': 'I-PER', 'score': 0.9981549382209778, 'word': '##yl', 'start': 12, 'end': 14}, {'entity': 'I-PER', 'score': 0.995907187461853, 'word': '##va', 'start': 14, 'end': 16}, {'entity': 'I-PER', 'score': 0.9992327690124512, 'word': '##in', 'start': 16, 'end': 18}, {'entity': 'I-ORG', 'score': 0.9738931059837341, 'word': 'Hu', 'start': 33, 'end': 35}, {'entity': 'I-ORG', 'score': 0.9761149883270264, 'word': '##gging', 'start': 35, 'end': 40}, {'entity': 'I-ORG', 'score': 0.9887976050376892, 'word': 'Face', 'start': 41, 'end': 45}, {'entity': 'I-LOC', 'score': 0.9932106137275696, 'word': 'Brooklyn', 'start': 49, 'end': 57}]


### Group the entities with offset mapping

In [145]:
import numpy as np

# Group consecutive tokens of one entity into a single result, using the offset
# mapping to recover the entity's text and character span from the sentence.
results = []
inputs_with_offsets = tokenizer(sentence, return_offsets_mapping=True)
offsets = inputs_with_offsets['offset_mapping']
tokens = inputs_with_offsets.tokens()
id2label = model.config.id2label

idx = 0
while idx < len(predictions):
    pred = predictions[idx]
    label = id2label[pred]
    if label == 'O':
        # Not part of any entity: move on to the next token.
        idx += 1
        continue

    # Remove the B- or I- prefix
    label = label[2:]
    # The first token always belongs to the entity, whether it is tagged B- or I-,
    # so it seeds the span and the score list before the continuation loop.
    start, end = offsets[idx]
    all_scores = [probabilities[idx][pred]]
    idx += 1

    # Absorb every following token tagged I-<label>, extending the span as we go.
    while idx < len(predictions) and id2label[predictions[idx]] == f'I-{label}':
        # Score each token with the probability of its own predicted class.
        all_scores.append(probabilities[idx][predictions[idx]])
        _, end = offsets[idx]
        idx += 1

    # The score is the mean of all the scores of the tokens in that grouped entity
    score = np.mean(all_scores).item()
    word = sentence[start:end]
    results.append(
        {'entity': label,
         'score': score,
         'word': word,
         'start': start,
         'end': end,
        }
    )
    # No extra increment here: idx already points at the first token after the
    # entity, which must be examined (it may start an adjacent entity).

print(results)


[{'entity': 'PER', 'score': 0.9981694370508194, 'word': 'Sylvain', 'start': 11, 'end': 18}, {'entity': 'ORG', 'score': 0.9796018997828165, 'word': 'Hugging Face', 'start': 33, 'end': 45}, {'entity': 'LOC', 'score': 0.9932106137275696, 'word': 'Brooklyn', 'start': 49, 'end': 57}]


# Fast tokenizers in the QA pipeline

## Pipeline

In [3]:
from transformers import pipeline

# Build a question-answering pipeline; no model is given, so the library's default checkpoint is used.
question_answerer = pipeline('question-answering')
# Short context that contains the answer.
context = """
Hugging Face Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch, and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
question = "Which deep learning libraries back Hugging Face Transformers?"

# The result holds the answer text, its character span in the context, and a score.
question_answerer(question=question, context=context)

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


{'score': 0.9779755473136902,
 'start': 89,
 'end': 117,
 'answer': 'Jax, PyTorch, and TensorFlow'}

In [4]:
# The pipeline can also deal with a very long context.
# Here the answer sits in the final paragraph of the text.
long_context = """
🤗 Transformers: State of the Art NLP

🤗 Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction,
question answering, summarization, translation, text generation and more in over 100 languages.
Its aim is to make cutting-edge NLP easier to use for everyone.

🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets and
then share them with the community on our model hub. At the same time, each python module defining an architecture is fully standalone and
can be modified to enable quick research experiments.

Why should I use transformers?

1. Easy-to-use state-of-the-art models:
  - High performance on NLU and NLG tasks.
  - Low barrier to entry for educators and practitioners.
  - Few user-facing abstractions with just three classes to learn.
  - A unified API for using all our pretrained models.
  - Lower compute costs, smaller carbon footprint:

2. Researchers can share trained models instead of always retraining.
  - Practitioners can reduce compute time and production costs.
  - Dozens of architectures with over 10,000 pretrained models, some in more than 100 languages.

3. Choose the right framework for every part of a model's lifetime:
  - Train state-of-the-art models in 3 lines of code.
  - Move a single model between TF2.0/PyTorch frameworks at will.
  - Seamlessly pick the right framework for training, evaluation and production.

4. Easily customize a model or an example to your needs:
  - We provide examples for each architecture to reproduce the results published by its original authors.
  - Model internals are exposed as consistently as possible.
  - Model files can be used independently of the library for quick experiments.

🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""

# Same question as before, asked against the long context.
question_answerer(question=question, context=long_context)

{'score': 0.98023921251297,
 'start': 1892,
 'end': 1919,
 'answer': 'Jax, PyTorch and TensorFlow'}

## Step by step

### Run the model

In [5]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# The checkpoint the question-answering pipeline selected by default above.
model_checkpoint = 'distilbert-base-cased-distilled-squad'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

# Encode the question and the context together as one pair and run a forward pass.
inputs = tokenizer(question, context, return_tensors='pt')
outputs = model(**inputs)

In [6]:
# The tokenizer concatenates the question and the context as: [CLS] question [SEP] context [SEP]
print(tokenizer.decode(inputs['input_ids'].flatten().tolist()))

# The model returns one start logit and one end logit per input token; they are used
# below to pick the token positions where the answer starts and ends.
start_logits = outputs.start_logits
end_logits = outputs.end_logits
print(start_logits.shape, end_logits.shape)

[CLS] Which deep learning libraries back Hugging Face Transformers? [SEP] Hugging Face Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch, and TensorFlow — with a seamless integration between them. It's straightforward to train your models with one before loading them for inference with the other. [SEP]
torch.Size([1, 71]) torch.Size([1, 71])


### Softmax and mask

In [7]:
# Before the softmax, rule out every position that cannot be part of the answer:
# the question tokens and the [SEP] tokens. [CLS] stays selectable.
# sequence_ids() is None for special tokens, 0 for the question and 1 for the context.

import torch

sequence_ids = inputs.sequence_ids()
# True marks a position to be masked out.
mask = [seq_id != 1 for seq_id in sequence_ids]
mask[0] = False  # keep [CLS]
print(sequence_ids)
print(mask)
# Add a leading batch dimension (same as indexing with [None]) so the mask lines up with the (1, seq_len) logits.
mask = torch.tensor(mask).unsqueeze(dim=0)

# A large negative logit turns into a probability of ~0 once the softmax is applied.
for logits in (start_logits, end_logits):
    logits[mask] = -10000

[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, None]
[False, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True]


In [8]:
# Turn the masked logits into probabilities over token positions and drop the batch dimension.
start_proba = start_logits.softmax(dim=-1)[0]
end_proba = end_logits.softmax(dim=-1)[0]

### Get the start, end indices

In [None]:
# Assuming the events "the answer starts at start_index" and "the answer ends at end_index"
# are independent, the probability that the answer starts at start_index and ends at
# end_index is the product of the two probabilities. Rows index the start, columns the end.
scores = start_proba[:, None] * end_proba[None, :]
# or start_proba.unsqueeze(dim=1) * end_proba.unsqueeze(dim=0)

# Zero out the pairs where start_index > end_index, i.e. the strictly lower triangle of the matrix.
# torch.triu keeps the upper triangle (including the diagonal); the result must be assigned
# back to `scores` so that the argmax in the next step cannot pick an invalid span.
scores = torch.triu(scores)

# import pandas as pd
# import numpy as np
# import seaborn as sns
# df = pd.DataFrame(scores.detach().numpy()) == 0
# sns.heatmap(df)

In [33]:
# argmax without a dim returns the index of the maximum in the flattened matrix.
max_index = scores.argmax().item()
# or top 3:  torch.topk(scores.flatten(), 3)

# Recover (row, column) from the flat row-major index: divide by the number of
# columns, scores.shape[1]. Rows are start positions, columns are end positions.
start_index, end_index = divmod(max_index, scores.shape[1])
print(max_index, start_index, end_index, scores[start_index, end_index].item())

1956 27 39 0.9779755473136902


### Map to the original text

In [80]:
# Method 1 - map the token indices back to character positions
inputs_with_offsets = tokenizer(question, context, return_offsets_mapping=True)
# offset_mapping gives each token's (start, end) character span within the text it
# came from; the spans restart from the beginning for the context tokens.
offsets = inputs_with_offsets['offset_mapping']
# Note that the special tokens ([CLS] and both [SEP]) are all reported as (0, 0)
print(offsets)

# Slice the answer out of the original context using the start of the first
# answer token and the end of the last one
start_char, _ = offsets[start_index]
_, end_char = offsets[end_index]
answer = context[start_char:end_char]

[(0, 0), (0, 5), (6, 10), (11, 19), (20, 29), (30, 34), (35, 37), (37, 42), (43, 47), (48, 60), (60, 61), (0, 0), (1, 3), (3, 8), (9, 13), (14, 26), (27, 29), (30, 36), (37, 39), (40, 43), (44, 49), (50, 54), (55, 62), (63, 67), (68, 76), (77, 86), (87, 88), (89, 92), (92, 93), (94, 95), (95, 96), (96, 97), (97, 99), (99, 101), (101, 102), (103, 106), (107, 110), (110, 113), (113, 114), (114, 117), (118, 119), (120, 124), (125, 126), (127, 130), (130, 131), (131, 135), (136, 147), (148, 155), (156, 160), (160, 161), (162, 164), (164, 165), (165, 166), (167, 182), (183, 185), (186, 191), (192, 196), (197, 203), (204, 208), (209, 212), (213, 219), (220, 227), (228, 232), (233, 236), (237, 239), (239, 246), (247, 251), (252, 255), (256, 261), (261, 262), (0, 0)]


In [85]:
# Method 2 - decode the answer's token ids directly (end_index is inclusive, hence the +1)
answer = tokenizer.decode(inputs['input_ids'].flatten()[start_index:end_index+1])

'Jax, PyTorch, and TensorFlow'

In [87]:
# Assemble the answer in the same layout the question-answering pipeline returns.
# The score is read straight from the score matrix, so it is a 0-dim tensor here.
result = dict(
    answer=answer,
    start=start_char,
    end=end_char,
    score=scores[start_index, end_index],
)
print(result)

{'answer': 'Jax, PyTorch, and TensorFlow', 'start': 89, 'end': 117, 'score': tensor(0.9780, grad_fn=<SelectBackward0>)}


## Handling long contexts


In [105]:
max_length = 384  # maximum number of tokens per chunk, as used by the question-answering pipeline
stride = 128  # the pipeline's default overlap between consecutive chunks (see below)
# Without truncation the question + long context pair is longer than max_length.
inputs = tokenizer(question, long_context)
print(len(inputs["input_ids"]))

463


In [91]:
# Method 1 - truncate the context
# Drawback: if the answer sits toward the end of the context, truncation cuts it off.
# truncation="only_second" truncates only the second sequence (the context), never the question.
# Uses the max_length constant defined at the start of this section instead of repeating 384.
inputs = tokenizer(question, long_context, max_length=max_length, truncation="only_second")
print(tokenizer.decode(inputs["input_ids"]))

[CLS] Which deep learning libraries back Hugging Face Transformers? [SEP] [UNK] Transformers : State of the Art NLP [UNK] Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction, question answering, summarization, translation, text generation and more in over 100 languages. Its aim is to make cutting - edge NLP easier to use for everyone. [UNK] Transformers provides APIs to quickly download and use those pretrained models on a given text, fine - tune them on your own datasets and then share them with the community on our model hub. At the same time, each python module defining an architecture is fully standalone and can be modified to enable quick research experiments. Why should I use transformers? 1. Easy - to - use state - of - the - art models : - High performance on NLU and NLG tasks. - Low barrier to entry for educators and practitioners. - Few user - facing abstractions with just three classes to learn. - A un

In [106]:
# Method 2 (preferred)
# Split the text into overlapping chunks, so that an answer falling on a chunk
# boundary is still fully contained in one of the chunks.
# Demonstrated here on two short sentences with a tiny max_length.
sentences = [
    "This sentence is not too long but we are going to split it anyway.",
    "This sentence is shorter but will still get split.",
]
inputs = tokenizer(
    sentences, 
    truncation=True, 
    return_overflowing_tokens=True,  # return the truncated remainder as additional chunks
    max_length=6,   # length of each chunk, including the [CLS] and [SEP] special tokens
    stride=2,  # number of tokens shared between consecutive chunks
)

In [107]:
# Show every chunk as text, one per line, to make the overlap between chunks visible.
print("\n".join(tokenizer.decode(chunk_ids) for chunk_ids in inputs['input_ids']))

[CLS] This sentence is not [SEP]
[CLS] is not too long [SEP]
[CLS] too long but we [SEP]
[CLS] but we are going [SEP]
[CLS] are going to split [SEP]
[CLS] to split it anyway [SEP]
[CLS] it anyway. [SEP]
[CLS] This sentence is shorter [SEP]
[CLS] is shorter but will [SEP]
[CLS] but will still get [SEP]
[CLS] still get split. [SEP]


In [104]:
# For each chunk, the index of the input sentence it was cut from
print(inputs['overflow_to_sample_mapping'])

[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]


### Step by step - with long_context split

In [108]:
# Tokenize the question with the long context, splitting the context into overlapping chunks.
inputs = tokenizer(
    question,
    long_context,
    stride=stride,  # overlap between consecutive chunks
    max_length=max_length,  # length of each chunk
    padding="longest",  # pad the chunks to a common length so they can be stacked into one tensor
    truncation="only_second",  # only the context is split, never the question
    return_overflowing_tokens=True,  # keep the overflow as extra chunks
    return_offsets_mapping=True,  # keep each token's character span
)

In [110]:
# Remove the fields the model does not accept; the offsets are kept aside in `offsets`.
# NOTE: pop() makes this cell non-idempotent — re-running it without re-creating `inputs` raises KeyError.
_ = inputs.pop("overflow_to_sample_mapping")
offsets = inputs.pop('offset_mapping')

# Convert the remaining fields to PyTorch tensors: one row per chunk.
inputs = inputs.convert_to_tensors('pt')
print(inputs['input_ids'].shape)

torch.Size([2, 384])


In [111]:
# Run the model on all chunks at once; the logits have one row per chunk.
outputs = model(**inputs)
start_logits = outputs.start_logits
end_logits = outputs.end_logits
print(start_logits.shape, end_logits.shape)

torch.Size([2, 384]) torch.Size([2, 384])


In [115]:
# Mask everything that is not context (sequence id 1), except [CLS] at position 0.
sequence_ids = inputs.sequence_ids()
mask = [seq_id != 1 for seq_id in sequence_ids]
mask[0] = False

# Padding positions (attention_mask == 0) must be masked as well; the 1-row mask
# broadcasts against the per-chunk padding mask.
pad_positions = inputs['attention_mask'] == 0
mask = torch.tensor(mask)[None] | pad_positions

for logits in (start_logits, end_logits):
    logits[mask] = -10000

# Per-chunk probability distributions over token positions.
start_probabilities = start_logits.softmax(dim=-1)
end_probabilities = end_logits.softmax(dim=-1)

In [123]:
# Get the best score and answer
# Since there are multiple chunks of context, we need to go over each one
candidates = []
for start_probs, end_probs in zip(start_probabilities, end_probabilities):
    # scores[i, j] = P(answer starts at token i) * P(answer ends at token j)
    scores = start_probs[:, None] * end_probs[None, :]
    # Only spans with start <= end are valid, hence the upper triangle.
    idx = torch.triu(scores).argmax().item()

    # Unravel the flat row-major index with the number of columns, scores.shape[1].
    start_idx, end_idx = divmod(idx, scores.shape[1])
    score = scores[start_idx, end_idx].item()
    candidates.append((start_idx, end_idx, score))
print(candidates)

# Decode each chunk's best span back to text (end index is inclusive, hence the +1).
for candidate, input_ids in zip(candidates, inputs['input_ids']):
    start_token, end_token, score = candidate
    answer = tokenizer.decode(input_ids[start_token:end_token+1])
    result = {'answer': answer, 'score': score}
    print(result)

[(0, 0, 0.867302656173706), (177, 188, 0.9802390933036804)]
{'answer': '[CLS]', 'score': 0.867302656173706}
{'answer': 'Jax, PyTorch and TensorFlow', 'score': 0.9802390933036804}


In [154]:
# Get the top 2 scores and answers
candidates_topk = []
for start_probs, end_probs in zip(start_probabilities, end_probabilities):
    # A separate name is used so the `candidates` list from the previous cell is not clobbered.
    chunk_candidates = []
    # Upper triangle only: a valid span needs start <= end.
    scores = torch.triu(start_probs[:, None] * end_probs[None, :])
    topk = scores.flatten().topk(2)
    for score, idx in zip(topk.values, topk.indices):
        # Unravel the flat row-major index with the number of columns, scores.shape[1].
        start_idx, end_idx = divmod(idx.item(), scores.shape[1])
        chunk_candidates.append((start_idx, end_idx, score.item()))
    candidates_topk.append(chunk_candidates)
print(candidates_topk)

# Decode every candidate span of every chunk (end index is inclusive, hence the +1).
for chunk_candidates, input_ids in zip(candidates_topk, inputs['input_ids']):
    for start_token, end_token, score in chunk_candidates:
        answer = tokenizer.decode(input_ids[start_token:end_token+1])
        result = {'answer': answer, 'score': score}
        print(result)

[[(0, 0, 0.867302656173706), (0, 187, 0.0064314608462154865)], [(177, 188, 0.9802390933036804), (177, 189, 0.013365774415433407)]]
{'answer': '[CLS]', 'score': 0.867302656173706}
{'answer': '[CLS] Which deep learning libraries back Hugging Face Transformers? [SEP] [UNK] Transformers : State of the Art NLP [UNK] Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction, question answering, summarization, translation, text generation and more in over 100 languages. Its aim is to make cutting - edge NLP easier to use for everyone. [UNK] Transformers provides APIs to quickly download and use those pretrained models on a given text, fine - tune them on your own datasets and then share them with the community on our model hub. At the same time, each python module defining an architecture is fully standalone and can be modified to enable quick research experiments. Why should I use transformers? 1. Easy - to - use state - of 

# Next

In [None]:
# Next: https://huggingface.co/learn/nlp-course/chapter6/4?fw=pt