In [None]:
import json

Dans ce notebook, on utilise le modèle pré-entraîné à base de transformers `BertForQuestionAnswering`. Cela nous permet de nous familiariser avec les métriques classiquement utilisées pour cette tâche de NLP (Exact Match et ROUGE).


In [None]:
pip install transformers



In [None]:
# Download the dev file for a change... (it is also lighter)
! wget -c https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json

--2022-04-24 15:45:34--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.110.153, 185.199.108.153, 185.199.109.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.110.153|:443... connected.
HTTP request sent, awaiting response... 416 Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [None]:
# Load the raw SQuAD dev file. The context manager closes the file handle
# deterministically (the bare open(...).read() left it open), and the encoding
# is stated explicitly because JSON files are UTF-8.
with open('dev-v2.0.json', encoding='utf-8') as raw_file:
    squad = json.load(raw_file)

In [None]:
# Structure du fichier json : une clef 'data' (442 items)
# dans chaque item de 'data' : un "title" + des "paragraphs" (plusieurs dizaines).
# dans 'paragraphs' : un contexte ('context') + des questions/réponses ('qas').
# dans 'qas' : "question" + une clef 'answers'
# dans 'answers' : "text" de la réponse + "answer_start" ; 'is_impossible' vaut False (on peut répondre).
# Variante pour les derniers items : dans 'qas', 'is_impossible' vaut True, et on donne des "plausible_answers"

In [None]:
# Example: the first context and the different questions attached to it

squad['data'][0]['paragraphs'][0]

{'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
 'qas': [{'answers': [{'answer_start': 159, 'text': 'France'},
    {'answer_start': 159, 'text': 'France'},
    {'answer_start': 159, 'text': 'France'},
    {'answer_start': 159, 'text': 'France'}],
   'id': '56ddde6b9a695914005b9628',
   'is

In [None]:
# Extract the individual items (context, question, answer) so they can be
# passed to the BERT model afterwards.
# Note: answers come in two formats ('answers' or 'plausible_answers').
# The result is one record per (question, distinct answer text, context).

new_squad = []

for group in squad['data']:
    for paragraph in group['paragraphs']:
        context = paragraph['context']
        for qa_pair in paragraph['qas']:
            question = qa_pair['question']
            # Prefer the real answers; fall back to the plausible answers when
            # 'answers' is missing or empty; otherwise there is nothing to emit.
            answer_items = qa_pair.get('answers') or qa_pair.get('plausible_answers') or []
            # De-duplicate while keeping first-seen order. list(set(...)) ordered
            # the texts by string hash, which changes between interpreter runs,
            # so the content of new_squad was not reproducible.
            answer_list = list(dict.fromkeys(item['text'] for item in answer_items))
            for answer in answer_list:
                new_squad.append({
                    'question': question,
                    'answer': answer,
                    'context': context})

In [None]:
# Save the flattened records in JSON format

with open("/content/dev.json", 'w') as out_file:
    json.dump(new_squad, out_file)

In [None]:
# Load the prepared dev file that will be used as input to the model

with open('/content/dev.json', 'r') as in_file:
    squad = json.load(in_file)

In [None]:
# Simplified json file: example of the first two records
squad[:2]

[{'answer': 'France',
  'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
  'question': 'In what country is Normandy located?'},
 {'answer': 'in the 10th and 11th centuries',
  'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 1

In [None]:
# Import the BERT tokenizer and question-answering model classes

from transformers import BertTokenizer, BertForQuestionAnswering

# Choose a pre-trained model from Hugging Face

modelname = 'deepset/bert-base-cased-squad2'

tokenizer = BertTokenizer.from_pretrained(modelname)
model = BertForQuestionAnswering.from_pretrained(modelname)

In [None]:
from transformers import pipeline

In [None]:
# Build the question-answering pipeline from the model and tokenizer objects
# that are already loaded. Passing the checkpoint name instead made pipeline()
# load the same weights a second time and left `model` unused.
qa = pipeline('question-answering', model=model, tokenizer=tokenizer)

In [None]:
# Test on a single context + question pair (taken from SQuAD)

qa({ 'question': 'In what country is Normandy located?',
    'context' : 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.'
   })

{'answer': 'France.', 'end': 166, 'score': 0.999527633190155, 'start': 159}

In [None]:
# Try predictions on several examples of the dataset

answers = []

for example in squad[:5]:
    # feed the question and its context to the pipeline
    prediction = qa({'question': example['question'], 'context': example['context']})
    answers.append({
        'predicted': prediction['answer'],  # the model's answer
        'true': example['answer'],          # the reference answer
    })

In [None]:
# Display the predicted / reference answer pairs
answers

[{'predicted': 'France.', 'true': 'France'},
 {'predicted': '10th and 11th centuries',
  'true': 'in the 10th and 11th centuries'},
 {'predicted': '10th and 11th centuries', 'true': '10th and 11th centuries'},
 {'predicted': 'Denmark, Iceland and Norway',
  'true': 'Denmark, Iceland and Norway'},
 {'predicted': 'Rollo,', 'true': 'Rollo'}]

In [None]:
# Metric: Exact Match (EM)
# EM is strict down to a comma: "Rollo," and "Rollo" are not considered equal.
# Likewise "France." is counted as wrong, because the expected text is "France" without the period.

em = [1 if answer['predicted'] == answer['true'] else 0 for answer in answers]

sum(em) / len(em)

# this gives an EM score of 40% on the 5 Q&A pairs

0.4

In [None]:
# Relax the comparison a little (lower case, etc.):
# remove every character that is NOT a digit, a letter a-z or a space

import re


def _normalize(text):
    """Lower-case `text`, then drop every character outside 0-9, a-z and space."""
    return re.sub('[^0-9a-z ]', '', text.lower())


em = [
    1 if _normalize(answer['predicted']) == _normalize(answer['true']) else 0
    for answer in answers
]

sum(em) / len(em)

# The score rises to 80%. One mismatch remains: the pair whose expected answer contains "in the"

0.8

In [None]:
# Regard une autre métrique : ROUGE
# ROUGE metric (Recall Oriented Understudy for Gisting Evaluation)
# a set of metrics : N, L, S en particulier, regarde match entre prédiction et réf texte
# N: number of matching n-grams (unigram, bigram). ROUGE-1, ROUGE-2
! pip install rouge



In [None]:
from rouge import Rouge

In [None]:
rouge = Rouge()

# turn the list of answer dicts into two parallel lists
model_out = [ans['predicted'] for ans in answers]
reference = [ans['true'] for ans in answers]

rouge.get_scores(model_out, reference) # iterates over the two lists
# gives f: F1-score, p: precision and r: recall
# for each of the 5 (predicted, true) pairs
# the last one scores 0 because of the comma

[{'rouge-1': {'f': 0.999999995, 'p': 1.0, 'r': 1.0},
  'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0},
  'rouge-l': {'f': 0.999999995, 'p': 1.0, 'r': 1.0}},
 {'rouge-1': {'f': 0.7999999952000001, 'p': 1.0, 'r': 0.6666666666666666},
  'rouge-2': {'f': 0.7499999953125, 'p': 1.0, 'r': 0.6},
  'rouge-l': {'f': 0.7999999952000001, 'p': 1.0, 'r': 0.6666666666666666}},
 {'rouge-1': {'f': 0.999999995, 'p': 1.0, 'r': 1.0},
  'rouge-2': {'f': 0.999999995, 'p': 1.0, 'r': 1.0},
  'rouge-l': {'f': 0.999999995, 'p': 1.0, 'r': 1.0}},
 {'rouge-1': {'f': 0.999999995, 'p': 1.0, 'r': 1.0},
  'rouge-2': {'f': 0.999999995, 'p': 1.0, 'r': 1.0},
  'rouge-l': {'f': 0.999999995, 'p': 1.0, 'r': 1.0}},
 {'rouge-1': {'f': 0.0, 'p': 0.0, 'r': 0.0},
  'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0},
  'rouge-l': {'f': 0.0, 'p': 0.0, 'r': 0.0}}]

In [None]:
# built-in option to get the average over the list

rouge.get_scores(model_out, reference, avg=True)

{'rouge-1': {'f': 0.7599999960400001, 'p': 0.8, 'r': 0.7333333333333333},
 'rouge-2': {'f': 0.5499999970625, 'p': 0.6, 'r': 0.52},
 'rouge-l': {'f': 0.7599999960400001, 'p': 0.8, 'r': 0.7333333333333333}}

In [None]:
# Evaluation over the dataset: empty accumulators for predictions and references.

from tqdm import tqdm

model_out = []
reference = []

In [None]:
# The full set takes too long, so test on the first 50 questions.
# The accumulators are reset here so that re-running this cell does not append
# the same 50 predictions a second time.
model_out = []
reference = []

for pair in tqdm(squad[:50], leave=True):
    ans = qa({'question': pair['question'],
              'context': pair['context']})
    model_out.append(ans['answer'])
    reference.append(pair['answer'])

100%|██████████| 50/50 [01:30<00:00,  1.81s/it]


In [None]:
# Average ROUGE scores over the collected predictions
rouge.get_scores(model_out, reference, avg = True)

{'rouge-1': {'f': 0.40930158480384604,
  'p': 0.45285714285714285,
  'r': 0.4152380952380952},
 'rouge-2': {'f': 0.21939626912222163, 'p': 0.24696969696969695, 'r': 0.2245},
 'rouge-l': {'f': 0.40930158480384604,
  'p': 0.45285714285714285,
  'r': 0.4152380952380952}}

In [None]:
# Not great; look at the per-pair details
scores = rouge.get_scores(model_out, reference)
# The pair at index 4 ("Rollo,") is known to be a problem: show its ROUGE-1 F-score
print(f"{model_out[4]} | {reference[4]} | {scores[4]['rouge-1']['f']}")
    

Rollo, | Rollo | 0.0


In [None]:
# Punctuation problem!
# Apply the same clean-up as for Exact Match
import re

clean = re.compile('(?i)[^0-9a-z ]')  # the (?i) modifier makes the pattern case-insensitive


def _strip_symbols(text):
    """Remove every character that is not a digit, a letter a-z/A-Z or a space."""
    return clean.sub('', text)


model_out = list(map(_strip_symbols, model_out))
reference = list(map(_strip_symbols, reference))

In [None]:
# re-run ROUGE on the cleaned lists
rouge.get_scores(model_out, reference, avg = True)

{'rouge-1': {'f': 0.5337460286760682,
  'p': 0.5608658008658007,
  'r': 0.5919047619047618},
 'rouge-2': {'f': 0.2943962687784716, 'p': 0.3198268398268398, 'r': 0.3145},
 'rouge-l': {'f': 0.5337460286760682,
  'p': 0.5608658008658007,
  'r': 0.5919047619047618}}

In [None]:
# Recompute the per-pair scores and show the pair at index 4 again
scores = rouge.get_scores(model_out, reference)
print(f"{model_out[4]} | {reference[4]} | {scores[4]['rouge-1']['f']}")

Rollo | Rollo | 0.999999995


In [None]:
# POUR INFO :
# recall : nb de n-grams de la prédiction qui matchent la référence / nb de n-grams dans la référence.
# permet de voir si le modèle capture toute l'info de la référence, mais il peut aussi donner une tonne de mots
# juste pour augmenter le recall.
# 100% de recall si tous les mots de la référence sont dans la prédiction, même s'il y a d'autres mots inutiles.
# pour éviter cela, on regarde la précision : nb de n-grams qui matchent la référence / nb de n-grams dans la prédiction
# combinaison des deux avec le F1-score : 2*(precision*recall)/(precision + recall)

In [None]:
# ROUGE-L metric : longest common subsequence (tokens partagés, dans le même ordre) entre la prédiction et la référence
# ex : une sous-séquence commune de 2 mots. On calcule le recall en divisant sa longueur par le nb de mots dans la référence
# et la précision p en divisant par le nb de mots dans la prédiction