## [ Chapter 14 - Question Answering with a Fine-tuned Large Language Model ] 

## Extractive Question Answering

In [10]:
import sys
sys.path.append('../..')
from aips import *
from IPython.display import display,HTML

In [11]:
# Depends on the outdoors collection built in Chapter 13; uncomment the next line to build it first if needed.
#%run ../ch13/1.setting-up-the-outdoors-dataset.ipynb

In [12]:
def normalize(logits):
    """Min-max scale an array of scores into the range [0, 1].

    The smallest element maps to 0.0 and the largest to exactly 1.0,
    whatever the sign of the inputs. The previous formula added
    abs(min) instead of subtracting min, which only matched min-max
    scaling when the minimum was negative.

    Args:
        logits: Array-like object exposing .min() and .max() and supporting
            elementwise arithmetic (the notebook passes NumPy arrays).

    Returns:
        An array of the same shape with values in [0, 1]. If every element
        is identical there is no range to scale by; each element is then
        tied for the maximum and all ones are returned.
    """
    lowest = logits.min()
    spread = logits.max() - lowest
    shifted = logits - lowest
    if spread == 0:
        # Avoid 0/0: shifted is all zeros here, so this yields all ones.
        return shifted + 1.0
    return shifted / spread

#Thanks to this answer: https://stackoverflow.com/questions/28907480/convert-0-1-floating-point-value-to-hex-color#28907772
def blend(color, alpha, base=(255, 255, 255)):
    """Mix an RGB color over a base color and return it as a hex string.

    Args:
        color: Sequence of three channel values (red, green, blue).
        alpha: Blend strength; 0 gives the base color, 1 gives `color`.
            Values outside [0, 1] are clamped so that the result is always
            a valid six-digit hex color.
        base: Sequence of three channel values to blend over
            (white by default).

    Returns:
        A string of the form "#rrggbb".
    """
    # Clamp instead of extrapolating: an out-of-range alpha used to produce
    # negative or oversized channels, which "%02x" renders as invalid CSS.
    alpha = min(1.0, max(0.0, alpha))
    channels = [
        int(round((alpha * color[i]) + ((1 - alpha) * base[i])))
        for i in range(3)
    ]
    # Channels can still fall outside 0..255 if the inputs themselves do.
    return "#" + "".join("%02x" % min(255, max(0, value)) for value in channels)

def clean_token(token):
    """Make a tokenizer token readable and safe to embed in HTML text.

    The "Ġ" marker that appears at the start of some tokens is shown as an
    underscore, and the HTML-significant characters &, < and > are replaced
    by their entities. Escaping "&" as well keeps tokens that contain an
    ampersand (or text that looks like an entity) from being rendered as
    something else.

    Args:
        token: Token string (or several tokens already joined into one string).

    Returns:
        The cleaned, HTML-escaped string.
    """
    import html

    # quote=False: quotes are left untouched, as before.
    return html.escape(token.replace("Ġ", "_"), quote=False)

def stylize(term, colors, logit, probs=True):
    """Render one token as an HTML span shaded by its normalized score.

    Args:
        term: Raw token text; passed through clean_token before being embedded.
        colors: RGB triple handed to blend as the highlight color.
        logit: Normalized score for the token, used as the blend strength.
            A value of exactly 1.0 is drawn in pure green (#00ff00).
        probs: When true, append the score as a <sub> suffix.

    Returns:
        An HTML fragment string for the token.
    """
    term = clean_token(term)
    # The top score gets a fixed green so it stands out from the gradient.
    color = "#00ff00" if logit == 1. else blend(colors, logit)
    token = f"<span style='background-color:{color}'>{term}</span>"
    if probs:
        # Fixed-point text cut to four characters ("0.45", "1.00").
        # str() switches to scientific notation for small scores, and slicing
        # that ("1e-05" -> "1e-0") produced a misleading label.
        prob = f"{logit:.10f}"[:4]
        token += f"<sub>{prob}</sub>"
    return token

## Listing 14.1

In [13]:
# Load the tokenizer and the question-answering model from the same checkpoint
# name, so both are resolved from one identifier.
# from_pretrained may download the files on first use.
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
model_name = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

## Listing 14.2

In [14]:
# The question to ask, and the passage that is encoded together with it below.
question = "What are minimalist shoes"
context = """There was actually a project done on the definition of what a minimalist shoe is and the result was "Footwear providing minimal interference with the natural movement of the foot due to its high flexibility, low heel to toe drop, weight and stack height, and the absence of motion control and stability devices". If you are looking for a simpler definition, this is what Wikipedia says, "Minimalist shoes are shoes intended to closely approximate barefoot running conditions. They have reduced cushioning, thin soles, and are of lighter weight than other running shoes, allowing for more sensory contact for the foot on the ground while simultaneously providing the feet with some protection from ground hazards and conditions (such as pebbles and dirt). One example of minimalistic shoes would be the Vibram FiveFingers shoes which look like this."""

import torch

# Encode the question and context together as one sequence of token ids.
inputs = tokenizer(question, context, add_special_tokens=True, return_tensors="pt")
input_ids = inputs["input_ids"].tolist()[0]

# Inference only: skip autograd bookkeeping during the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

# Read the scores through their named fields rather than positional indices,
# then scale each set so it can be used as a highlight strength.
start_logits_norm = normalize(outputs.start_logits.numpy())
end_logits_norm = normalize(outputs.end_logits.numpy())

# There is one start score and one end score per input token.
print(f"Total number of tokens: {len(input_ids)}")
print(f"Total number of start probabilities: {start_logits_norm.shape[1]}")
print(f"Total number of end probabilities: {end_logits_norm.shape[1]}")

Total number of tokens: 172
Total number of start probabilities: 172
Total number of end probabilities: 172


## Listing 14.3

In [15]:
terms = tokenizer.convert_ids_to_tokens(input_ids)
start_scores = start_logits_norm[0]
end_scores = end_logits_norm[0]

# Pick the best start and end positions with argmax instead of testing each
# score for exact equality with 1.0, which silently fell back to the whole
# sequence whenever no score compared equal.
start_token_id = int(start_scores.argmax())
end_token_id = int(end_scores.argmax()) + 1  # slice end is exclusive

# One highlighted HTML fragment per token, for the start and the end scores.
start_tokens = [stylize(term, [0, 127, 255], score) for term, score in zip(terms, start_scores)]
end_tokens = [stylize(term, [255, 0, 255], score) for term, score in zip(terms, end_scores)]

# NOTE: start and end are chosen independently; if the best end precedes the
# best start the slice (and the displayed answer) is empty.
answer = terms[start_token_id:end_token_id]
display(HTML(f'<h3>{clean_token(" ".join(answer))}</h3>'))
display(HTML(f'<pre>{" ".join(start_tokens)}</pre>'))
display(HTML(f'<pre>{" ".join(end_tokens)}</pre>'))

In [16]:
# Print the tokens in rows of `window` tokens each.
window = 9
maximum = 58  # NOTE(review): not referenced anywhere else in the visible notebook
# Stepping by `window` visits each row start once and stops at the last
# non-empty row, so no stray blank line is printed when the token count is an
# exact multiple of `window`.
for start in range(0, len(terms), window):
    print(clean_token(" ".join(terms[start:start + window])))

&lt;s&gt; What _are _minimalist _shoes &lt;/s&gt; &lt;/s&gt; There _was
_actually _a _project _done _on _the _definition _of _what
_a _minimalist _shoe _is _and _the _result _was _"
Foot wear _providing _minimal _interference _with _the _natural _movement
_of _the _foot _due _to _its _high _flexibility ,
_low _heel _to _toe _drop , _weight _and _stack
_height , _and _the _absence _of _motion _control _and
_stability _devices ". _If _you _are _looking _for _a
_simpler _definition , _this _is _what _Wikipedia _says ,
_" Min imal ist _shoes _are _shoes _intended _to
_closely _approximate _bare foot _running _conditions . _They _have
_reduced _cushion ing , _thin _sol es , _and
_are _of _lighter _weight _than _other _running _shoes ,
_allowing _for _more _sensory _contact _for _the _foot _on
_the _ground _while _simultaneously _providing _the _feet _with _some
_protection _from _ground _hazards _and _conditions _( such _as
_pe b bles _and _dirt ). _One _example _of
_minimal istic _shoes _w

In [17]:
# NOTE(review): scratch arithmetic; the result (58.0) matches `maximum` in the previous cell, but the intent is not stated. Confirm or remove.
174 / 3

58.0

In [18]:
# NOTE(review): scratch arithmetic halving the 58 from the previous cell; the intent is not stated. Confirm or remove.
58 / 2

29.0

In [19]:
from transformers import RobertaTokenizerFast,PreTrainedTokenizerFast
# A second tokenizer, loaded from the 'roberta-base' checkpoint rather than from model_name.
tokenizer2 = RobertaTokenizerFast.from_pretrained('roberta-base')
# Fail early if this is not a fast tokenizer. Presumably required because tokenize_dataset
# below asks for offset mappings -- TODO confirm.
assert isinstance(tokenizer2, PreTrainedTokenizerFast)

def tokenize_dataset(examples, maximum_tokens=384, document_overlap=128):
    """Tokenize question/context pairs into fixed-length, overlapping windows.

    Args:
        examples: Mapping with "question" and "context" entries.
        maximum_tokens: Length of each window; this budget covers BOTH the
            question and the context, and every window is padded to it.
        document_overlap: Stride passed to the tokenizer, so that consecutive
            windows of a long context overlap.

    Returns:
        The tokenizer output for tokenizer2. A long context yields several
        windows, with overflowing tokens and offset mappings requested.
    """
    # Read the padding side from the tokenizer that actually does the encoding
    # (this previously consulted the unrelated global `tokenizer`).
    pad_on_right = tokenizer2.padding_side == "right"

    # Tokenize with truncation and padding, but keep the overflow using a stride.
    # One example may therefore produce several features when its context is long,
    # each feature's context overlapping a little with that of the previous one.
    # Only the context side is truncated; which argument that is depends on the
    # padding side.
    tokenized_examples = tokenizer2(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=maximum_tokens,
        stride=document_overlap,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    return tokenized_examples

Downloading tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [20]:
# Tokenize the example with a deliberately tiny window so the overlap between
# consecutive windows is easy to see.
example = {"question" : question, "context" : context}
tokenized_examples = tokenize_dataset(example, maximum_tokens=16, document_overlap=3)
windowed_inputs = tokenized_examples["input_ids"]

# Build one table row per window and one cell per token. The loop variable is
# named `window_ids` so it does not overwrite the `window` row size set in an
# earlier cell.
windows = ["<table cellpadding=0 cellspacing=0>"]
for window_ids in windowed_inputs:
    cells = "".join(
        f'<td style="font-size:0.9em;font-family:courier;margin:0;padding:0;">{clean_token(token)}</td>'
        for token in tokenizer2.convert_ids_to_tokens(window_ids)
    )
    windows.append(f"<tr>{cells}</tr>")
windows.append("</table>")
display(HTML("\n".join(windows)))

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
<s>,What,_are,_minimalist,_shoes,</s>,</s>,There,_was,_actually,_a,_project,_done,_on,_the,</s>
<s>,What,_are,_minimalist,_shoes,</s>,</s>,_done,_on,_the,_definition,_of,_what,_a,_minimalist,</s>
<s>,What,_are,_minimalist,_shoes,</s>,</s>,_what,_a,_minimalist,_shoe,_is,_and,_the,_result,</s>
<s>,What,_are,_minimalist,_shoes,</s>,</s>,_and,_the,_result,_was,"_""",Foot,wear,_providing,</s>
<s>,What,_are,_minimalist,_shoes,</s>,</s>,Foot,wear,_providing,_minimal,_interference,_with,_the,_natural,</s>
<s>,What,_are,_minimalist,_shoes,</s>,</s>,_with,_the,_natural,_movement,_of,_the,_foot,_due,</s>
<s>,What,_are,_minimalist,_shoes,</s>,</s>,_the,_foot,_due,_to,_its,_high,_flexibility,",",</s>
<s>,What,_are,_minimalist,_shoes,</s>,</s>,_high,_flexibility,",",_low,_heel,_to,_toe,_drop,</s>
<s>,What,_are,_minimalist,_shoes,</s>,</s>,_to,_toe,_drop,",",_weight,_and,_stack,_height,</s>
<s>,What,_are,_minimalist,_shoes,</s>,</s>,_and,_stack,_height,",",_and,_the,_absence,_of,</s>


Up next: [Question Answering Data Preparation](2.question-answering-data-preparation.ipynb)