# Question Answering Fine-Tuning

### Installation

## 1. Load and process data

### 1.1. Load data

In [1]:
#Load context
context = ''
with open('./diabete_1.txt', 'r', encoding='utf8') as context_file:
    context = " ".join([line.lower().strip() for line in context_file.readlines()])

#Load questions answer
questions = []
responses = []
with open('./question_reponse.txt', 'r', encoding='utf8') as qa_file :
    lines = qa_file.readlines() + ['']
    acc = []
    i = 0
    while i < len(lines):
        fline = lines[i].strip().lower().replace('.', '') 
        if fline == '':
            if len(acc) == 2 and acc[-1].startswith('r:'):
                response = acc[1].replace('r:', '').strip()
                if context.find(response) != -1:
                    questions.append(acc[0])
                    responses.append(response)
                else:
                    print(acc[0], acc[1])
            acc.clear()
        else:
            acc.append(fline)
        i += 1

In [2]:
print(len(questions))
print(len(responses))

52
52


In [4]:
titles = ['diabete'] * len (questions)
ids = [str(i) for i in range(len(questions))]
contexts = [context] * len (questions)
answers = [{'text': [answer], "answer_start":[context.find(answer)]} for answer in responses]

In [5]:
import pandas as pd
from datasets import DatasetDict, Dataset

dataset = pd.DataFrame(list(zip(ids, titles, contexts, questions, answers)), columns = ['id', 'title', 'context', 'question', 'answers'])

squad = DatasetDict({'train': Dataset.from_pandas(dataset)})

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 41
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11
    })
})

In [11]:
squad['train']

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 41
})

In [12]:
## Create train and test sets

squad = squad['train'].train_test_split(test_size=0.2)

In [13]:
pd.DataFrame(squad['test'])

Unnamed: 0,id,title,context,question,answers
0,27,diabete,le diabète correspond à une élévation anormale...,le diabète de type 2 est lié a quoi ?,"{'answer_start': [4603], 'text': ['le diabète ..."
1,45,diabete,le diabète correspond à une élévation anormale...,quel sont les symptome du diabete ?,"{'answer_start': [8315], 'text': ['sensation d..."
2,1,diabete,le diabète correspond à une élévation anormale...,que peut provoquercette glycemie ?,"{'answer_start': [182], 'text': ['des lésions ..."
3,42,diabete,le diabète correspond à une élévation anormale...,comment les medicament sont pris ?,"{'answer_start': [7574], 'text': ['il peut s'a..."
4,32,diabete,le diabète correspond à une élévation anormale...,qui contacter pour traiter le diabet ?,"{'answer_start': [5741], 'text': ['le médecin ..."
5,51,diabete,le diabète correspond à une élévation anormale...,quel sont les autre facteur risque de maladie ...,"{'answer_start': [10072], 'text': ['la surchar..."
6,12,diabete,le diabète correspond à une élévation anormale...,quel cellules sont detruit ?,"{'answer_start': [1452], 'text': ['des cellule..."
7,40,diabete,le diabète correspond à une élévation anormale...,en quoi visent le traitement contre diabete ?,"{'answer_start': [7404], 'text': ['les traitem..."
8,4,diabete,le diabète correspond à une élévation anormale...,quand est evoquer le diabete selon l'oms ?,"{'answer_start': [500], 'text': ['lorsque la g..."


In [14]:
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 32
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 9
    })
})

### 1.2. Process data

In [15]:
checkpoint = "Nadav/camembert-base-squad-fr"

In [16]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [17]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples['question']]
    inputs = tokenizer(
        questions,
        examples['context'],
        max_length=384,
        truncation='only_second',
        return_offsets_mapping=True,
        padding='max_length'
    )

    offset_mapping = inputs.pop('offset_mapping')
    answers = examples['answers']
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer['answer_start'][0]
        end_char = answer['answer_start'][0] + len(answer['text'][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx

        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1 

        #If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs


In [18]:
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

                                                           

## Train

In [19]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [16]:
from transformers import  AutoModelForQuestionAnswering, TrainingArguments

model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

In [17]:
output_dir = 'my_customized_model'

In [18]:
training_args = TrainingArguments(
    output_dir = output_dir,
    evaluation_strategy = 'epoch',
    learning_rate = 5e-5,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 10,
    weight_decay = 0.01,
)


In [19]:
from transformers import Trainer

In [None]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_squad['train'],
    eval_dataset = tokenized_squad['test'],
    tokenizer = tokenizer,
    data_collator = data_collator
)

trainer.train()

: 

In [16]:
trainer.save_model(output_dir)

In [17]:
tokenizer = AutoTokenizer.from_pretrained(f'./{output_dir}')


In [18]:
model = AutoModelForQuestionAnswering.from_pretrained(f'./{output_dir}')

In [19]:
question = "Quelle est la premiere cause de cecité entre 20 et  60ans ?"
# context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."

In [20]:
from transformers import pipeline
question_answerer = pipeline("question-answering", model=output_dir)
question_answerer(question=question, context=context)

{'score': 0.001121675013564527,
 'start': 5684,
 'end': 5691,
 'answer': 'diabète'}

In [23]:
question = 'quels sont les signes pouvant annoncer un diabète ?'

In [28]:
question_answerer(question=question, context=context)

{'score': 0.0005180313601158559,
 'start': 8738,
 'end': 8744,
 'answer': 'mal...'}

#### Tokenize the text and return PyTorch tensors:

In [25]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(f"./{output_dir}")
inputs = tokenizer(question, context, return_tensors="pt")

Token indices sequence length is longer than the specified maximum sequence length for this model (2218 > 512). Running this sequence through the model will result in indexing errors


#### Pass your inputs to the model and return the logits:

In [None]:
import torch
from transformers import AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained(output_dir)
with torch.no_grad():
    outputs = model(**inputs)

#### Get the highest probability from the model output for the start and end positions:

In [None]:
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

Decode the predicted tokens to get the answer:

In [None]:
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)