<a href="https://colab.research.google.com/github/xhavien/CCDEPLRL_EXERCISES_COM222ML/blob/main/Exercise8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise 8

Instructions: Fine-tune a question-answering model (here, DistilBERT on a subset of SQuAD) so that it can answer the questions in the Question Set below.

In [1]:
!pip install --upgrade transformers datasets evaluate



In [2]:
from datasets import load_dataset

# Work with a 5,000-example slice of the SQuAD training split so that
# fine-tuning stays short enough for a notebook session.
squad = load_dataset(
    path="squad",
    split="train[:5000]",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## A. Model finetuning

In [3]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint; reused below when loading the QA model.
checkpoint = "distilbert-base-uncased"
# Tokenizer matching the checkpoint. preprocess_function reads this global.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def preprocess_function(examples):
    """Tokenize a batch of SQuAD-style examples and attach answer-span labels.

    Each (question, context) pair is encoded with the module-level
    ``tokenizer``. Only the context is truncated (``truncation="only_second"``);
    overflowing tokens are returned as additional features with a stride of
    128 tokens, so one example may produce several features. Every feature is
    padded to ``max_length=384``.

    Args:
        examples: Batch mapping with the keys ``"question"`` and ``"context"``
            (lists of strings) and ``"answers"`` (list of mappings holding
            ``"answer_start"`` and ``"text"`` lists). Only the first answer of
            each example is used.

    Returns:
        The tokenizer output with ``"overflow_to_sample_mapping"`` and
        ``"offset_mapping"`` removed and two lists added:
        ``"start_positions"`` and ``"end_positions"``, one token index per
        feature. Both point at the CLS token when the example has no answer
        or when the answer is not fully contained in that feature.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Feature index -> index of the example it was produced from.
    sample_mapping = inputs.pop("overflow_to_sample_mapping")
    # Per feature: (start_char, end_char) of every token.
    offset_mapping = inputs.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = inputs["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sequence_ids = inputs.sequence_ids(i)
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        if len(answers["answer_start"]) == 0:
            # No answer: label the feature with the CLS token.
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First and last token of the context (sequence id 1) in this feature.
        context_start = 0
        while sequence_ids[context_start] != 1:
            context_start += 1
        context_end = len(input_ids) - 1
        while sequence_ids[context_end] != 1:
            context_end -= 1

        answer_in_feature = (
            offsets[context_start][0] <= start_char
            and offsets[context_end][1] >= end_char
        )
        if not answer_in_feature:
            # The answer lies (partly) outside this window of the context.
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Both searches stay inside [context_start, context_end]. The special
        # and padding tokens after the context carry (0, 0) offsets, which
        # always satisfy `<= start_char`; an unbounded scan would run through
        # them and label the last padding position whenever the answer begins
        # in the final context token.
        token_start_index = context_start
        while token_start_index <= context_end and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        start_positions.append(token_start_index - 1)

        token_end_index = context_end
        while token_end_index >= context_start and offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [4]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import TrainingArguments


# Pretrained DistilBERT encoder plus a question-answering head. The head
# weights (qa_outputs) are newly initialised and are learned during fine-tuning.
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

# Fine-tuning hyperparameters. Checkpoints are written to output_dir every
# 500 steps, the loss is logged every 10 steps, and external experiment
# trackers are disabled.
args = TrainingArguments(
    output_dir="qa-finetuned-model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_steps=10,
    save_steps=500,
    report_to="none",
)

# Turn the raw examples into model features. The original columns are dropped
# because preprocess_function may return more rows than it receives.
tokenized_squad = squad.map(
    preprocess_function,
    batched=True,
    remove_columns=squad.column_names,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_squad,
    # `tokenizer=` is deprecated in Trainer.__init__ and scheduled for removal
    # in transformers v5; `processing_class=` is its replacement.
    processing_class=tokenizer,
)

trainer.train()

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
10,5.8568
20,5.6005


KeyboardInterrupt: 

## B. Question Set

In [None]:
# Write the fine-tuned weights and the tokenizer files to the same directory
# so the pipeline below can load both from one path.
model.save_pretrained(save_directory="qa-finetuned-model")
tokenizer.save_pretrained(save_directory="qa-finetuned-model")

In [None]:
from transformers import pipeline

# Build an extractive question-answering pipeline from the directory the
# fine-tuned model and tokenizer were saved to.
qa_pipeline = pipeline(
    task="question-answering",
    model="qa-finetuned-model",
    tokenizer="qa-finetuned-model",
)

1. Question 1

In [None]:
# Question 1: the expected answer span is "Pacific Ocean".
result = qa_pipeline(
    context="The Pacific Ocean is the largest and deepest of Earth's oceanic divisions. It extends from the Arctic Ocean in the north to the Southern Ocean in the south",
    question="What is the largest ocean on Earth?",
)
print(result["answer"])

2. Question 2

In [None]:
# Question 2: the expected answer span is "English".
result = qa_pipeline(
    context="Isaac Newton was an English mathematician, physicist, astronomer, and author who is widely recognised as one of the greatest mathematicians and physicists of all time.",
    question="What was Isaac Newton's nationality?",
)
print(result["answer"])

3. Question 3

In [None]:
# Question 3: the expected answer span is
# "in the Himalayas on the border between Nepal and the Tibet Autonomous Region of China".
result = qa_pipeline(
    context="Mount Everest is Earth's highest mountain above sea level, located in the Himalayas on the border between Nepal and the Tibet Autonomous Region of China.",
    question="where is Mount Everest located?",
)
print(result["answer"])

4. Question 4

In [None]:
# Question 4: the expected answer span is "sunlight" (lower-case in the context).
result = qa_pipeline(
    context="Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize foods from carbon dioxide and water",
    question="What do plants use to perform photosynthesis?",
)
print(result["answer"])

5. Question 5

In [None]:
# Question 5: the expected answer span is "from 2009 to 2017".
result = qa_pipeline(
    context="Barack Obama served as the 44th president of the United States from 2009 to 2017.",
    question="When did Barack Obama serve as president?",
)
print(result["answer"])