# Question Answering Fine-Tuning

### Installation

## 1. Load and process data

In [1]:
!pip install transformers datasets evaluate transformers[torch]




[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: C:\Users\Hussein Menkam\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


### 1.1. Load data

In [1]:
from datasets import load_dataset

squad = load_dataset('squad', split='train[:500]')

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset squad (C:/Users/Hussein Menkam/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


In [8]:
import json

with open('test.json', 'w') as file:
   json.dump(squad[:4], file, indent = 4)

In [3]:
squad

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 500
})

In [4]:
## Create train and test sets

squad = squad.train_test_split(test_size=0.2)

In [5]:
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 400
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 100
    })
})

In [6]:
squad['train'][0]

{'id': '5733a7bd4776f41900660f6c',
 'title': 'University_of_Notre_Dame',
 'context': 'The university first offered graduate degrees, in the form of a Master of Arts (MA), in the 1854–1855 academic year. The program expanded to include Master of Laws (LL.M.) and Master of Civil Engineering in its early stages of growth, before a formal graduate school education was developed with a thesis not required to receive the degrees. This changed in 1924 with formal requirements developed for graduate degrees, including offering Doctorate (PhD) degrees. Today each of the five colleges offer graduate education. Most of the departments from the College of Arts and Letters offer PhD programs, while a professional Master of Divinity (M.Div.) program also exists. All of the departments in the College of Science offer PhD programs, except for the Department of Pre-Professional Studies. The School of Architecture offers a Master of Architecture, while each of the departments of the College of Engineeri

### 1.2. Process data

In [7]:
checkpoint = 'distilbert-base-uncased'

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [9]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples['question']]
    inputs = tokenizer(
        questions,
        examples['context'],
        max_length=384,
        truncation='only_second',
        return_offsets_mapping=True,
        padding='max_length'
    )

    offset_mapping = inputs.pop('offset_mapping')
    answers = examples['answers']
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer['answer_start'][0]
        end_char = answer['answer_start'][0] + len(answer['text'][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx

        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        #If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs


In [10]:
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

                                                              

## Train

In [11]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [12]:
from transformers import  AutoModelForQuestionAnswering, TrainingArguments

model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to

In [13]:
output_dir = 'my_awesome_qa_model'

In [14]:
training_args = TrainingArguments(
    output_dir = output_dir,
    evaluation_strategy = 'epoch',
    learning_rate = 5e-5,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 5,
    weight_decay = 0.01,
)


In [15]:
from transformers import Trainer

In [28]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_squad['train'],
    eval_dataset = tokenized_squad['test'],
    tokenizer = tokenizer,
    data_collator = data_collator
)

trainer.train()

 20%|██        | 50/250 [19:30<1:18:46, 23.63s/it]
 20%|██        | 50/250 [21:30<1:18:46, 23.63s/it]

{'eval_loss': 3.014359951019287, 'eval_runtime': 120.2604, 'eval_samples_per_second': 0.832, 'eval_steps_per_second': 0.108, 'epoch': 1.0}


 40%|████      | 100/250 [41:12<59:24, 23.76s/it] 
 40%|████      | 100/250 [43:14<59:24, 23.76s/it]

{'eval_loss': 3.4219489097595215, 'eval_runtime': 121.118, 'eval_samples_per_second': 0.826, 'eval_steps_per_second': 0.107, 'epoch': 2.0}


 60%|██████    | 150/250 [1:03:20<40:46, 24.46s/it]
 60%|██████    | 150/250 [1:05:20<40:46, 24.46s/it]

{'eval_loss': 3.800827980041504, 'eval_runtime': 119.9294, 'eval_samples_per_second': 0.834, 'eval_steps_per_second': 0.108, 'epoch': 3.0}


 80%|████████  | 200/250 [1:25:17<19:44, 23.69s/it]  
 80%|████████  | 200/250 [1:27:18<19:44, 23.69s/it]

{'eval_loss': 4.324705123901367, 'eval_runtime': 120.7309, 'eval_samples_per_second': 0.828, 'eval_steps_per_second': 0.108, 'epoch': 4.0}


100%|██████████| 250/250 [1:47:20<00:00, 24.18s/it]
100%|██████████| 250/250 [1:49:21<00:00, 26.25s/it]

{'eval_loss': 4.444410800933838, 'eval_runtime': 121.4395, 'eval_samples_per_second': 0.823, 'eval_steps_per_second': 0.107, 'epoch': 5.0}
{'train_runtime': 6561.6276, 'train_samples_per_second': 0.305, 'train_steps_per_second': 0.038, 'train_loss': 0.4207970886230469, 'epoch': 5.0}





TrainOutput(global_step=250, training_loss=0.4207970886230469, metrics={'train_runtime': 6561.6276, 'train_samples_per_second': 0.305, 'train_steps_per_second': 0.038, 'train_loss': 0.4207970886230469, 'epoch': 5.0})

In [17]:
trainer.save_model(output_dir)

In [18]:
print('ok')

ok


In [19]:
tokenizer = AutoTokenizer.from_pretrained(f'./{output_dir}')


In [20]:
model = AutoModelForQuestionAnswering.from_pretrained(f'./{output_dir}')

In [21]:
question = "How many programming languages does BLOOM support?"
context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."

In [22]:
from transformers import pipeline
question_answerer = pipeline("question-answering", model="my_awesome_qa_model")
question_answerer(question=question, context=context)

{'score': 0.2862240970134735,
 'start': 58,
 'end': 95,
 'answer': '46 languages natural languages and 13'}

#### Tokenize the text and return PyTorch tensors:

In [23]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./my_awesome_qa_model")
inputs = tokenizer(question, context, return_tensors="pt")

#### Pass your inputs to the model and return the logits:

In [24]:
import torch
from transformers import AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model")
with torch.no_grad():
    outputs = model(**inputs)

#### Get the highest probability from the model output for the start and end positions:

In [25]:
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

Decode the predicted tokens to get the answer:

In [29]:
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

'46 languages natural languages and 13'