In [1]:
from datasets import load_dataset
from termcolor import colored
import textwrap
import random
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

dataset = load_dataset("squad_v2")

In [3]:
dataset['train'][0]

{'id': '56be85543aeaaa14008c9063',
 'title': 'Beyoncé',
 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'question': 'When did Beyonce start becoming popular?',
 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}}

In [4]:
def color_answer_start(example):
    if(len(example['answers'])==0):
        return "Unanswerable"
    answer_start = example['answers']['answer_start'][0]
    context = example['context']
    return colored(context[:answer_start], 'white') + colored(context[answer_start:answer_start+2], 'red') + colored(context[answer_start+5:], 'white')

In [6]:
sample = dataset['train'][random.randint(0, len(dataset['train']))]
print("Question: ", sample['question'])
print("Answer: ")
for wrap in textwrap.wrap(color_answer_start(sample), 100):
    print(wrap)

Question:  What effect did the 1936 Soviet Constitution have on the size of the Russia?
Answer: 
[97mMany regions in Russia were affected by the Soviet famine of 1932–1933: Volga; Central Black
Soil Region; North Caucasus; the Urals; the Crimea; part of Western Siberia; and the Kazak ASSR.
With the adoption of the 1936 Soviet Constitution on December 5, 1936, the size of the RSFSR was
significantly [0m[31mre[0m[97med. The Kazakh ASSR and Kirghiz ASSR were transformed into the
Kazakh and Kirghiz Soviet Socialist Republics. The Karakalpak Autonomous Socialist Soviet Republic
was transferred to the Uzbek SSR.[0m


In [7]:
# preprocess

# have at most 1 answer for each question

def preprocess_function(example):
    if(len(example['answers']['answer_start'])==0):
        example['answers']['answer_start'] = -1
        example['answers']['text'] = 'Unanswerable'
    else:
        example['answers']['answer_start'] = example['answers']['answer_start'][0]
        example['answers']['text'] = example['answers']['text'][0]
    return example

In [8]:
dataset = dataset.map(preprocess_function)

### Tokenisation

In [9]:
MODEL_NAME = 'google/flan-t5-base'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [10]:
sample_encoding = tokenizer(sample['question'], sample['context'], truncation=True, padding='max_length', max_length=512)

In [11]:
print(sample_encoding.keys())
print(sample_encoding['input_ids'])
print(sample_encoding['attention_mask'])

dict_keys(['input_ids', 'attention_mask'])
[363, 1504, 410, 8, 27598, 12873, 11378, 43, 30, 8, 812, 13, 8, 4623, 58, 1, 1404, 6266, 16, 4623, 130, 4161, 57, 8, 12873, 3, 89, 8721, 13, 957, 2668, 104, 2294, 4201, 10, 4969, 122, 9, 117, 2808, 1589, 264, 173, 6163, 117, 1117, 16371, 6769, 302, 117, 8, 4575, 5405, 117, 8, 16923, 9, 117, 294, 13, 3782, 925, 115, 4476, 117, 11, 8, 26094, 1639, 6157, 6857, 5, 438, 8, 9284, 13, 8, 27598, 12873, 11378, 30, 1882, 7836, 27598, 6, 8, 812, 13, 8, 391, 7016, 6857, 47, 4019, 3915, 5, 37, 26094, 18965, 6157, 6857, 11, 10976, 5649, 172, 6157, 6857, 130, 13421, 139, 8, 26094, 18965, 11, 10976, 5649, 172, 12873, 2730, 343, 5750, 7, 5, 37, 17422, 4766, 16864, 2040, 3114, 1162, 2730, 343, 12873, 5750, 47, 10250, 12, 8, 412, 172, 346, 157, 180, 6857, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [12]:
preds = [
    tokenizer.decode(input_id, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    for input_id in sample_encoding['input_ids']
]
" ".join(preds)

'What effect did the 1936 Soviet Constitution have on the size of the Russia ?  Many regions in Russia were affected by the Soviet  f amine of 19 32 – 19 33 : Vol g a ; Central Black So il Region ; North Cau cas us ; the Ur als ; the Crime a ; part of Western Si b eria ; and the Kaz ak AS SR . With the adoption of the 1936 Soviet Constitution on December 5, 1936 , the size of the R SF SR was significantly reduced . The Kaz akh AS SR and Kir ghi z AS SR were transformed into the Kaz akh and Kir ghi z Soviet Social ist Republic s . The Kara kal pak Auto nom ous Social ist Soviet Republic was transferred to the U z be k S SR .                                                                                                                                                                                                                                                                                                                                                                               '

In [13]:
encoding = tokenizer(
    sample['question'],
    sample['context'],
    truncation='only_second',
    padding='max_length',
    max_length=512,
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors='pt'
)

In [14]:
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [15]:
tokenizer.special_tokens_map

Using bos_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.


{'eos_token': '</s>',
 'unk_token': '<unk>',
 'pad_token': '<pad>',
 'additional_special_tokens': ['<extra_id_0>',
  '<extra_id_1>',
  '<extra_id_2>',
  '<extra_id_3>',
  '<extra_id_4>',
  '<extra_id_5>',
  '<extra_id_6>',
  '<extra_id_7>',
  '<extra_id_8>',
  '<extra_id_9>',
  '<extra_id_10>',
  '<extra_id_11>',
  '<extra_id_12>',
  '<extra_id_13>',
  '<extra_id_14>',
  '<extra_id_15>',
  '<extra_id_16>',
  '<extra_id_17>',
  '<extra_id_18>',
  '<extra_id_19>',
  '<extra_id_20>',
  '<extra_id_21>',
  '<extra_id_22>',
  '<extra_id_23>',
  '<extra_id_24>',
  '<extra_id_25>',
  '<extra_id_26>',
  '<extra_id_27>',
  '<extra_id_28>',
  '<extra_id_29>',
  '<extra_id_30>',
  '<extra_id_31>',
  '<extra_id_32>',
  '<extra_id_33>',
  '<extra_id_34>',
  '<extra_id_35>',
  '<extra_id_36>',
  '<extra_id_37>',
  '<extra_id_38>',
  '<extra_id_39>',
  '<extra_id_40>',
  '<extra_id_41>',
  '<extra_id_42>',
  '<extra_id_43>',
  '<extra_id_44>',
  '<extra_id_45>',
  '<extra_id_46>',
  '<extra_id_47>',
 

In [16]:
tokenizer.eos_token, tokenizer.eos_token_id

('</s>', 1)

In [17]:
tokenizer.decode(encoding['input_ids'].squeeze())

'What effect did the 1936 Soviet Constitution have on the size of the Russia?</s> Many regions in Russia were affected by the Soviet famine of 1932–1933: Volga; Central Black Soil Region; North Caucasus; the Urals; the Crimea; part of Western Siberia; and the Kazak ASSR. With the adoption of the 1936 Soviet Constitution on December 5, 1936, the size of the RSFSR was significantly reduced. The Kazakh ASSR and Kirghiz ASSR were transformed into the Kazakh and Kirghiz Soviet Socialist Republics. The Karakalpak Autonomous Socialist Soviet Republic was transferred to the Uzbek SSR.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pa

In [18]:
answer_encoding = tokenizer(
    sample['answers']['text'][0],
    truncation=True,
    padding='max_length',
    max_length=32,
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors='pt'
)

In [19]:
tokenizer.decode(answer_encoding['input_ids'].squeeze())

'reduced</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [20]:
labels = answer_encoding['input_ids']
labels

tensor([[3915,    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]])

In [21]:
labels[labels == tokenizer.pad_token_id] = -100
labels

tensor([[3915,    1, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100]])

In [22]:
# convert dataset to pytorch tensors using the tokenizer
def convert_to_features(example_batch):
    # Tokenize contexts and questions (as pairs of inputs)
    encodings = tokenizer(
        example_batch['question'],
        example_batch['context'],
        truncation='only_second',
        padding='max_length',
        max_length=512,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )

    answer_texts=[]
    for answers in example_batch['answers']:
            answer_texts.append(answers['text'])
            
    print(answer_texts)
    # Tokenize answers
    answers = tokenizer(
        answer_texts,
        truncation=True,
        padding='max_length',
        max_length=32,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )

    # replace -100 in the labels as we can't decode them.
    labels = answers['input_ids']
    labels[labels == tokenizer.pad_token_id] = -100

    encodings['labels'] = labels

    return encodings

In [23]:
# get encodings
sample_encodings = convert_to_features(dataset['train'][:5])
sample_encodings.keys()

['in the late 1990s', 'singing and dancing', '2003', 'Houston, Texas', 'late 1990s']


dict_keys(['input_ids', 'attention_mask', 'labels'])

In [24]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)



In [25]:
# forward pass
loss = model(input_ids=sample_encodings['input_ids'], attention_mask=sample_encodings['attention_mask'], labels=sample_encodings['labels']).loss
print(loss.item())  # 0.0001

0.40218377113342285


In [26]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# define training args
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    overwrite_output_dir=True,
    save_total_limit=2,
    fp16=True,
    learning_rate=1e-4,
    num_train_epochs=1,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    evaluation_strategy='epoch',
    save_strategy='epoch'
)


In [27]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=None,
    train_dataset=dataset['train'].map(convert_to_features, batched=True),
    eval_dataset=dataset['validation'].map(convert_to_features, batched=True)
)

trainer.train()
trainer.evaluate()

Epoch,Training Loss,Validation Loss


In [None]:
question_answerer = pipeline('question-answering', model=model, tokenizer=tokenizer)

# create a question and 2 contexts where 1 of the context contains the answer to the question while the other doesn't

question = "who painted the mona lisa?"
context1 = "The Mona Lisa is a 16th century oil painting created by Leonardo da Vinci. It's held at the Louvre in Paris."
context2 = "The Mona Lisa is a 16th century oil painting. It's held at the Louvre in Paris."

# get the answer
res1 = question_answerer(question=question, context=context1)
res2 = question_answerer(question=question, context=context2)

print(f"Question: {question}")
print(f"Answer 1: {res1['answer']}")
print(f"Answer 2: {res2['answer']}")