In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize
from gensim import models
from tqdm.notebook import tqdm

import pandas as pd
import numpy as np
import csv
import nltk
import spacy
import string
import math

# Ask spaCy to run on a GPU when one is available.
spacy.prefer_gpu()
# NOTE(review): the 'en' shortcut may not resolve on recent spaCy releases — confirm the
# installed version. `sp` is not referenced by any cell visible in this notebook.
sp = spacy.load('en')

# NLTK resources fetched for the cells below (stopword list, WordNet, punkt models).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Clone the repository that holds the CSV files read under question-answer-AI/data/.
!git clone https://github.com/talbio/question-answer-AI

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Cloning into 'question-answer-AI'...
remote: Enumerating objects: 8, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 8 (delta 0), reused 8 (delta 0), pack-reused 0[K
Unpacking objects: 100% (8/8), done.


## **Présentation des données**

In [None]:
# Load the paragraph corpus and the train / validation / test question files.
corpus = pd.read_csv('question-answer-AI/data/corpus.csv')  
X_train = pd.read_csv('question-answer-AI/data/train_ids.csv')
X_val = pd.read_csv('question-answer-AI/data/val_ids.csv')
X_test = pd.read_csv('question-answer-AI/data/test.csv')
# `docs` is the 'paragraph' column of the corpus; most retrieval cells below work on it.
docs = corpus['paragraph'] 

# Report the size of each dataset.
print("Le corpus est composé de %d textes" %len(corpus['paragraph']))
print("Les paires de questions réponses pour l'ensemble d'entrainement contiennent " + str(X_train.shape[0]) + " entrées")
print("Les paires de questions réponses pour l'ensemble de validation contiennent " + str(X_val.shape[0]) + " entrées")
print("Les paires de questions pour l'ensemble de test contiennent " + str(X_test.shape[0]) + " entrées")

Le corpus est composé de 83327 textes
Les paires de questions réponses pour l'ensemble d'entrainement contiennent 106176 entrées
Les paires de questions réponses pour l'ensemble de validation contiennent 10000 entrées
Les paires de questions pour l'ensemble de test contiennent 10000 entrées


## **Preprocessing**

### Tokenisation de phrases: ###

In [None]:
import nltk.data

# Pre-trained punkt sentence splitter for English (downloaded in the setup cell).
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

# Split every paragraph exactly once. The previous version ran the tokenizer twice
# per document (once to keep the sentences, once more only to count them).
doc_sentences = [sent_detector.tokenize(doc.strip()) for doc in docs]
sentence_num = [len(sentences) for sentences in doc_sentences]

print("Il y a en moyenne " + str(sum(sentence_num) / len(sentence_num)) + " phrases par document")

Il y a en moyenne 5.407250951072282 phrases par document


### Pipeline pour tokeniser en mots le corpus, enlever les stopwords, et lemmatizer les tokens ###

In [None]:
class Preprocess(object):
    """Text-cleaning helpers built on NLTK: tokenising, stopword and punctuation
    filtering, stemming and lemmatising."""

    def __init__(self, lemmatize=True):
        # `lemmatize` switches the lemmatisation step of preprocess_pipeline on or off.
        self.lemmatize = lemmatize
        self.stopwords = set(nltk.corpus.stopwords.words("english"))
        self.stemmer = nltk.PorterStemmer()
        self.lemmatizer = nltk.stem.WordNetLemmatizer()

    def preprocess_pipeline(self, corpus):
        """Clean every document of `corpus` into a list of tokens and, when the
        instance was built with lemmatize=True, lemmatise those tokens."""
        cleaned = self._clean_docs(corpus)
        return self._lemmatize(cleaned) if self.lemmatize else cleaned

    def word_tokenize(self, corpus):
        """Lower-case each document and split it with NLTK's word_tokenize."""
        return [word_tokenize(text.lower()) for text in corpus]

    def remove_punk_stop_stem(self, data):
        """Tokenise `data`, drop stopwords and punctuation, stem what is left and
        return one space-joined string per document."""
        cleaned_docs = []
        for tokens in self.word_tokenize(data):
            kept = [
                tok
                for tok in tokens
                if tok not in self.stopwords and tok not in string.punctuation
            ]
            cleaned_docs.append(" ".join(self.stemmer.stem(tok) for tok in kept))
        return cleaned_docs

    def _clean_docs(self, corpus):
        """Split each document into word-character runs and keep the lower-cased
        tokens that are alphabetic, longer than one character and not stopwords."""
        tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
        cleaned = []
        for doc in corpus:
            kept = []
            for token in tokenizer.tokenize(doc):
                lowered = token.lower()
                if lowered in self.stopwords:
                    continue
                if len(token) <= 1 or not token.isalpha() or token == "br]":
                    continue
                kept.append(lowered)
            cleaned.append(kept)
        return cleaned

    def _lemmatize(self, corpus):
        """Lemmatise every token of every tokenised document."""
        lemmatise = self.lemmatizer.lemmatize
        return [[lemmatise(token) for token in tokens] for tokens in corpus]

    def lemmatize_text(self, text):
        """Lower-case and tokenise one string, lemmatise each token and join the
        result back with single spaces."""
        tokens = word_tokenize(text.lower())
        return " ".join(self.lemmatizer.lemmatize(token) for token in tokens)

    def convert_to_doc(self, tokenized_docs):
        """Join each token list back into a single space-separated string."""
        return [" ".join(tokens) for tokens in tokenized_docs]

Autres méthodes utiles

In [None]:
from scipy.spatial import distance


def cosine(u, v):
    """Cosine similarity between two vectors of the same length.

    Returns 0.0 when either vector has zero norm, instead of dividing by zero
    and handing a NaN score to the ranking code.

    Raises:
        ValueError: propagated from numpy.dot when the shapes are incompatible.
    """
    # Compute the dot product first so a shape mismatch still raises, exactly as before.
    dot = np.dot(u, v)
    denominator = np.linalg.norm(u) * np.linalg.norm(v)
    if denominator == 0:
        return 0.0
    return dot / denominator

def dot_product(u, v):
    """Inner product of `u` and `v`, delegated to numpy.dot."""
    product = np.dot(u, v)
    return product

def euclidean(u, v):
    """Euclidean distance between `u` and `v`, computed by scipy.spatial.distance."""
    dist = distance.euclidean(u, v)
    return dist

In [None]:
def get_ranking_accuracy(correct_ids, topn_ids, top10_ids):
    """Print how often the gold paragraph appears in the top-n and top-10 rankings.

    Args:
        correct_ids: gold paragraph id per question; NaN marks a question without answer.
        topn_ids: per question, the ids retained in the top-n ranking.
        top10_ids: per question, the ids retained in the top-10 ranking.

    Questions whose gold id is NaN are excluded from every count. Nothing is
    returned; the figures are only printed.
    """
    total_topn = 0
    total_top10 = 0
    total_lengths = 0
    total_non_nans = 0

    for correct_id, topn, top10 in zip(correct_ids, topn_ids, top10_ids):
        # A NaN id stringifies to 'nan': the question has no answer paragraph.
        if str(correct_id) == 'nan':
            continue

        total_non_nans += 1
        # Used below to report the average size of the top-n lists.
        total_lengths += len(set(topn))
        if correct_id in topn:
            total_topn += 1
        if correct_id in top10:
            total_top10 += 1

    print('Excluding questions with no answers...')
    if total_non_nans == 0:
        # Every question was unanswerable (or the input was empty): the ratios
        # below would divide by zero.
        print('No question with a known answer paragraph: nothing to score.')
        return

    print(f'prediciton is: {total_topn} out of {total_non_nans} predictions have the correct paragraph in their top {total_lengths/total_non_nans} or {total_topn/total_non_nans}')
    print(f'prediciton is: {total_top10} out of {total_non_nans} predictions have the correct paragraph in their top 10 or {total_top10/total_non_nans}')

In [None]:
def create_passage_submission(ids, topns, top10s):
    """Write ./passage_submission.csv with one row per question: its id, then the
    top-n and top-10 paragraph ids, each list joined with ';'."""
    rows = [
        (question_id, ";".join(map(str, ranked_n)), ";".join(map(str, ranked_10)))
        for question_id, ranked_n, ranked_10 in zip(ids, topns, top10s)
    ]

    frame = pd.DataFrame(rows, columns=["id", "top_n", "top_10"])
    frame.to_csv("./passage_submission.csv", encoding='utf-8', index=False)

# **Représentation de passages et ordonnancement**

## TF-IDF des bigrams sur le corpus prétraité avec la classe Preprocess() ##

In [None]:
pre = Preprocess()
# Clean and lemmatise the corpus, then glue the tokens back into one string per paragraph.
corpus_texts = pre.convert_to_doc(pre.preprocess_pipeline(docs))
print('preprocess done')

# Bigram TF-IDF; `matrix` holds one sparse row per paragraph.
vectorizer = TfidfVectorizer(ngram_range=(2,2), min_df=5, max_df=.5)
matrix = vectorizer.fit_transform(corpus_texts)

preprocess done


In [None]:
matrix.shape

(83327, 151788)

### Essayons de répondre à une question ###

In [None]:
sample = X_train.iloc[2, :]
question = sample['question']
paragraph_id = sample['paragraph_id']
answer = sample['answer']

print('Question: ' + str(question))
print('ID du document réponse: ' + str(paragraph_id))
print('Réponse: ' + str(answer))

# The vectorizer was fitted on text produced by preprocess_pipeline (stopwords and
# non-alphabetic tokens removed, then lemmatised). The query has to go through the
# same pipeline: with lemmatize_text alone the stopwords stay in, so the query's
# bigrams ("how many", "many other", ...) never line up with the fitted vocabulary.
query_text = pre.convert_to_doc(pre.preprocess_pipeline([question]))
query = vectorizer.transform(query_text)
# Number of matched bigrams and which ones (previously computed but never displayed).
print((query > 0).sum(), vectorizer.inverse_transform(query))
query.shape

Question: How many other cities had populations larger than 40,000 by 1500?
ID du document réponse: 11041.0
Réponse: Twenty-two


(1, 151788)

In [None]:
%%time
# Dot product of every TF-IDF document row with the query vector, densified to one column.
scores = (matrix * query.T).toarray()
# Document indices ordered from the highest to the lowest score.
results = (np.flip(np.argsort(scores, axis=0)))
# Is the gold paragraph among the 40,000 best-scored documents?
print((paragraph_id in results[:40000, 0]))

# Show the three best-scored paragraphs.
print( [corpus['paragraph'][i] for i in results[:3, 0]] )

True
["<P> Jacob Basil Anderson ( born 18 June 1990 ) is an English actor , singer - songwriter , rapper , and record producer . As an actor , he is known for his role as Grey Worm in the television series Game of Thrones , and his recurring appearances in the first seasons of Episodes and Broadchurch . As a musician , he uses the alias Raleigh Ritchie ; his debut album , You 're a Man Now , Boy , was released in 2016 to very positive reviews . </P>", "<P> For centuries the Forum was the center of day - to - day life in Rome : the site of triumphal processions and elections ; the venue for public speeches , criminal trials , and gladiatorial matches ; and the nucleus of commercial affairs . Here statues and monuments commemorated the city 's great men . The teeming heart of ancient Rome , it has been called the most celebrated meeting place in the world , and in all history . Located in the small valley between the Palatine and Capitoline Hills , the Forum today is a sprawling ruin of 

En conclusion, TF-IDF n'est pas très efficace : pour cette question, il a fallu retenir les 40 000 documents les plus similaires pour que le bon paragraphe réponse figure parmi eux !

Testons notre baseline sur 10000 questions du dataset d'entrainement

In [None]:
# Evaluate on the first 10,000 training questions and their gold paragraph ids.
train_questions = X_train["question"].tolist()[:10000]
train_par_ids = X_train["paragraph_id"].tolist()[:10000]

In [None]:
topn = []
top10 = []
n = 5

for test_question in tqdm(train_questions):
    # Same preprocessing as the corpus the vectorizer was fitted on; lemmatize_text
    # alone keeps the stopwords, so the query bigrams would not match the vocabulary.
    query_text = pre.convert_to_doc(pre.preprocess_pipeline([test_question]))
    query = vectorizer.transform(query_text)

    scores = (matrix * query.T).toarray()
    results = (np.flip(np.argsort(scores, axis=0)))

    # Only the leading ranks are used: slice the array instead of turning the
    # whole ranking (one entry per corpus paragraph) into a Python list.
    top = results[:max(n, 10), 0].tolist()

    topn.append(top[:n])
    top10.append(top[:10])

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [None]:
get_ranking_accuracy(train_par_ids, topn, top10)

Excluding questions with no answers...
prediciton is: 1141 out of 6489 predictions have the correct paragraph in their top 5.0 or 0.17583603020496225%
prediciton is: 1492 out of 6489 predictions have the correct paragraph in their top 10 or 0.22992756973339498


## **Vers des représentations plus sophistiquées !**

Code inspired from https://www.analyticsvidhya.com/blog/2020/08/top-4-sentence-embedding-techniques-using-python/


En raison de contraintes de temps, nous allons exécuter le reste des méthodes sur un petit sous-ensemble du jeu d'entraînement, afin de montrer que l'exécution se fait sans erreur.

## Modèles pré-entraînés

### GloVe ###




Nous utilisons les plongements pré-entraînés sur Wikipedia 2014 et Gigaword 5, en dimension 300.

In [None]:
!mkdir GloVe
! curl -Lo GloVe/glove.6B.zip http://nlp.stanford.edu/data/glove.6B.zip
! unzip GloVe/glove.6B.zip -d GloVe/

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0   308    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0   345    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  822M  100  822M    0     0  2155k      0  0:06:30  0:06:30 --:--:-- 2479k
Archive:  GloVe/glove.6B.zip
  inflating: GloVe/glove.6B.50d.txt  
  inflating: GloVe/glove.6B.100d.txt  
  inflating: GloVe/glove.6B.200d.txt  
  inflating: GloVe/glove.6B.300d.txt  


Il faut lire tous les plongements de mots (word embeddings) dans un dictionnaire.

In [None]:
# word -> 300-d float32 vector, one entry per line of the GloVe text file.
embeddings_dict = {}
# Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
with open("GloVe/glove.6B.300d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        # First field is the word, the remaining fields are the vector components.
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [None]:
def generate_document_embedding(par, embeddings=None, dim=None):
    """Average the word vectors of `par` into one document vector.

    Args:
        par: sequence of tokens.
        embeddings: mapping token -> vector. Defaults to the notebook-level
            `embeddings_dict`.
        dim: shape of a single word vector, e.g. (300,). Inferred from the first
            vector of `embeddings` when omitted, so the function no longer relies
            on a `shape` global defined in a later cell.

    Returns:
        The sum of the vectors of the known tokens divided by len(par): unknown
        tokens add nothing to the sum but still count in the denominator. An
        all-zero vector of shape `dim` when `par` is empty or has no known token.

    Raises:
        ValueError: if `embeddings` is empty and `dim` is not given.
    """
    if embeddings is None:
        embeddings = embeddings_dict
    if dim is None:
        sample_vector = next(iter(embeddings.values()), None)
        if sample_vector is None:
            raise ValueError("embeddings is empty: cannot infer the vector shape")
        dim = np.shape(sample_vector)

    # np.zeros(dim), not np.zeros(np.shape(dim)): the latter created a length-1
    # vector that only worked through broadcasting and was returned with the
    # wrong shape whenever no token of `par` was in the vocabulary.
    par_embedding = np.zeros(dim)

    for token in par:
        vector = embeddings.get(token)
        if vector is not None:
            par_embedding = par_embedding + vector

    # Nothing to average for an empty token list; avoid dividing by zero.
    if len(par) == 0:
        return par_embedding
    return par_embedding / len(par)

In [None]:
# Read the shape of one word vector from the first entry of the embedding dictionary.
first_key = list(embeddings_dict.keys())[0]
shape = np.shape(embeddings_dict[first_key])

Ici, nous créons nos doc embeddings à partir des word embeddings

In [None]:
preprocess = Preprocess()
# Lower-case and word-tokenise every paragraph of the corpus.
tokenized_pars = preprocess.word_tokenize(docs)

print('finished tokenizing')

# One averaged word-vector embedding per paragraph.
par_embeddings = [generate_document_embedding(par) for par in tqdm(tokenized_pars)]



In [None]:
# Small subset (first 100 training questions) to keep the run time short.
train_questions = X_train["question"].tolist()[:100]
train_par_ids = X_train["paragraph_id"].tolist()[:100]

# From here on train_questions holds token lists rather than raw strings.
train_questions = preprocess.word_tokenize(train_questions)

Finalement, on génère notre topn et top10

In [None]:
%%time
topn = []
top10 = []
n = 5

for test_question in tqdm(train_questions):
    try:
        question_embedding = generate_document_embedding(test_question)
        doc_scores = np.array([cosine(question_embedding, x) for x in par_embeddings])

        topn.append(np.argsort(-1*doc_scores)[:n])
        top10.append(np.argsort(-1*doc_scores)[:10])
        
    except ValueError:
        topn.append([-1])
        top10.append([-1])

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


CPU times: user 1min 49s, sys: 824 ms, total: 1min 50s
Wall time: 1min 50s


In [None]:
get_ranking_accuracy(train_par_ids, topn, top10)

Excluding questions with no answers...
prediciton is: 20 out of 63 predictions have the correct paragraph in their top 5.0 or 0.31746031746031744%
prediciton is: 22 out of 63 predictions have the correct paragraph in their top 10 or 0.3492063492063492


### Doc2Vec

In [None]:
%%time

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

#preprocess = Preprocess()
#tokenized_pars = preprocess.word_tokenize(train_ids_sample["paragraph"])
#print(tokenized_pars[0])
#tokenized_pars = preprocess.word_tokenize(doc_sentences[:1000])

tagged_data = [TaggedDocument(doc, [idx]) for idx, doc in enumerate(tokenized_docs)]
model = Doc2Vec(tagged_data, vector_size = 200, window = 4, min_count = 10, epochs = 100)

CPU times: user 1h 20min 36s, sys: 6min 4s, total: 1h 26min 41s
Wall time: 49min 19s


In [None]:
# Colab only: mount Google Drive under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# NOTE(review): this stores the Doc2Vec model trained above at the filesystem root under a
# "word2vec" name, not on the mounted Drive — confirm the intended location and file name.
model.save("/word2vec.model")

In [None]:
from gensim.models import Word2Vec, KeyedVectors
from gensim.test.utils import datapath 
# Keep the pre-trained word vectors under their own name. Binding them to `model`
# replaced the Doc2Vec model trained above, which the inference cell still needs
# (KeyedVectors offers neither infer_vector nor docvecs).
word2vec_vectors = KeyedVectors.load_word2vec_format(datapath("/content/gdrive/MyDrive/w2vec.bin"), binary=True)

In [None]:
# NOTE(review): `train_ids_sample` is not defined in any cell visible in this notebook —
# presumably a sample of the training file created in a removed cell; restore it before running.
test_questions = train_ids_sample["question"].tolist()
test_par_ids = train_ids_sample["paragraph_id"].tolist()
test_answers = train_ids_sample["answer"].tolist()

In [None]:
%%time

result_par_ids = []

for test_question in test_questions:
    test_doc = word_tokenize(test_question.lower())
    test_doc_vector = model.infer_vector(test_doc)

    result = model.docvecs.most_similar(positive = [test_doc_vector])
    # get top 5
    result = result[:5]
    result = [x[0] for x in result]
    result_par_ids.append(result)

  if np.issubdtype(vec.dtype, np.int):


CPU times: user 22 s, sys: 8.76 s, total: 30.7 s
Wall time: 15.8 s


In [None]:
# Count the questions whose gold paragraph id is among the five retrieved ids.
total = sum(
    1
    for gold_id, retrieved_ids in zip(test_par_ids, result_par_ids)
    if gold_id in retrieved_ids
)

print(f'prediciton is: {total} out of {len(test_par_ids)} predictions have the correct paragraph in their top 5')

prediciton is: 161 out of 873 predictions have the correct paragraph in their top 5


## BM25

Le code pour la fonction de score BM25 a été tiré du repo Github https://github.com/dorianbrown/rank_bm25

In [None]:
class BM25:
    """BM25-style scoring of a query against a corpus of pre-tokenised documents.

    `corpus` is a sized collection of token lists. Construction gathers the
    per-document term frequencies, the document lengths and an IDF value per term;
    `get_scores` then returns one score per document for a tokenised query.
    """

    def __init__(self, corpus):
        # Term-frequency saturation (k1) and length-normalisation strength (b).
        self.k1 = 1.5
        self.b = 0.75
        # Fraction of the average IDF assigned to terms whose IDF is negative.
        self.epsilon = 0.25
        self.corpus_size = len(corpus)
        self.avgdl = 0
        self.doc_freqs = []
        self.idf = {}
        self.doc_len = []

        nd = self._initialize(corpus)
        self._calc_idf(nd)

    def _initialize(self, corpus):
        """Collect document lengths and term frequencies; return, for every word,
        the number of documents that contain it."""
        nd = {}
        num_doc = 0
        for document in corpus:
            self.doc_len.append(len(document))
            num_doc += len(document)

            frequencies = {}
            for word in document:
                frequencies[word] = frequencies.get(word, 0) + 1
            self.doc_freqs.append(frequencies)

            for word in frequencies:
                nd[word] = nd.get(word, 0) + 1

        # An empty corpus keeps avgdl at 0 instead of raising ZeroDivisionError.
        self.avgdl = num_doc / self.corpus_size if self.corpus_size else 0
        return nd

    def _calc_idf(self, nd):
        """Fill self.idf with log(corpus_size) - log(document frequency) per word;
        negative values are replaced by epsilon * average IDF."""
        idf_sum = 0

        negative_idfs = []
        for word, freq in nd.items():
            idf = math.log(self.corpus_size) - math.log(freq)
            self.idf[word] = idf
            idf_sum += idf
            if idf < 0:
                negative_idfs.append(word)
        # No vocabulary (empty corpus or only empty documents): average IDF is 0.
        self.average_idf = idf_sum / len(self.idf) if self.idf else 0

        eps = self.epsilon * self.average_idf
        for word in negative_idfs:
            self.idf[word] = eps

    def get_scores(self, query):
        """Return a numpy array with one score per corpus document for the
        tokenised `query` (an iterable of terms)."""
        score = np.zeros(self.corpus_size)
        doc_len = np.array(self.doc_len)
        # avgdl is 0 only when every document is empty; every term frequency is
        # then 0 as well, so any non-zero stand-in avoids 0/0 without changing scores.
        avgdl = self.avgdl or 1
        # Length normalisation does not depend on the query term: compute it once.
        length_norm = self.k1 * (1 - self.b + self.b * doc_len / avgdl)
        for q in query:
            idf = self.idf.get(q)
            if not idf:
                # Unknown term or zero IDF: contributes nothing to any document.
                continue
            q_freq = np.array([doc.get(q, 0) for doc in self.doc_freqs])
            score += idf * (q_freq * (self.k1 + 1) / (q_freq + length_norm))
        return score

In [None]:
%%time

# Filter and stem the corpus into strings, then tokenise those strings again so that
# BM25 receives one token list per paragraph.
preprocess = Preprocess()
cleaned_pars = preprocess.remove_punk_stop_stem(docs)
tokenized_paragraphs = preprocess.word_tokenize(cleaned_pars)

bm25 = BM25(tokenized_paragraphs)

CPU times: user 3min 36s, sys: 1.46 s, total: 3min 37s
Wall time: 3min 38s


In [None]:
# First 100 training questions and their gold paragraph ids.
train_questions = X_train["question"].tolist()[:100]
train_par_ids = X_train["paragraph_id"].tolist()[:100]

# Same filtering and stemming as the corpus; each question becomes one cleaned string.
train_questions = preprocess.remove_punk_stop_stem(train_questions)

On génère notre topn et top10

In [None]:
top10 = []
topn = []
n = 3

# train_questions is a list of cleaned strings: iterate over the strings directly.
# The former `for train_question, id in ...` tried to unpack each string into two
# names (which fails for any string that is not two characters long) and shadowed
# the builtin `id`.
for train_question in tqdm(train_questions):
    test_doc = word_tokenize(train_question)
    doc_scores = bm25.get_scores(test_doc)

    # Paragraph indices from best to worst score, computed once for both lists.
    ranking = np.argsort(-1*doc_scores)
    topn.append(ranking[:n])
    top10.append(ranking[:10])


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [None]:
get_ranking_accuracy(train_par_ids, topn, top10)

Excluding questions with no answers...
prediciton is: 50 out of 63 predictions have the correct paragraph in their top 3.0 or 0.7936507936507936%
prediciton is: 59 out of 63 predictions have the correct paragraph in their top 10 or 0.9365079365079365


Voici la section pour générer le fichier passage_submission.csv

In [None]:
# Questions and ids of the test file used to build the submission.
test_questions = X_test["question"].tolist()
test_ids = X_test["id"].tolist()

# Same filtering and stemming as the corpus indexed by BM25.
test_questions = preprocess.remove_punk_stop_stem(test_questions)


In [None]:
n = 2
topn, top10 = [], []

for test_question in tqdm(test_questions):
    test_doc = word_tokenize(test_question)
    doc_scores = bm25.get_scores(test_doc)

    # Paragraph indices from best to worst score, computed once for both lists.
    ranking = np.argsort(-1*doc_scores)
    topn.append(ranking[:n])
    top10.append(ranking[:10])

In [None]:
create_passage_submission(test_ids, topn, top10)

# **Extraction de réponse**

Code taken from https://colab.research.google.com/drive/1uSlWtJdZmLrI3FCNIlUHFxwAJiSu2J0-

In [None]:
%%capture

!pip install transformers

In [None]:
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

# Load the question-answering BERT checkpoint and its matching tokenizer.
# NOTE(review): this rebinds `model`, which earlier cells used for the Doc2Vec model.
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=443.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1340675298.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




## BERT tokenizer et normalisation ##

In [None]:
from scipy.special import softmax

def argmax(l):
  """Index of the largest element of the sequence `l` (the first one on ties)."""
  return max(range(len(l)), key=l.__getitem__)

def global_normalize(question, answer_texts, qa_model=None, qa_tokenizer=None):
  """Score every candidate paragraph with the QA model and select the paragraph
  that holds the highest start score across all paragraphs taken together.

  Args:
    question: question string.
    answer_texts: non-empty sequence of candidate paragraph strings.
    qa_model: callable returning (start_scores, end_scores) for the encoded input.
      Defaults to the notebook-level `model`.
    qa_tokenizer: object exposing `encode_plus`. Defaults to the notebook-level
      `tokenizer`.

  Returns:
    (paragraph index, input ids, start scores, end scores) of the selected paragraph.

  Raises:
    ValueError: if `answer_texts` is empty.
  """
  # Resolve the defaults at call time: `model` is rebound several times in this notebook.
  if qa_model is None:
    qa_model = model
  if qa_tokenizer is None:
    qa_tokenizer = tokenizer
  if len(answer_texts) == 0:
    raise ValueError("answer_texts must contain at least one paragraph")

  paragraph_start_scores = []
  paragraph_end_scores = []
  paragraph_input_ids = []
  flat_start_scores = []

  for answer_text in answer_texts:

    # Encode the input ids and segment ids with the built-in function from the huggingface library,
    # truncating passages larger than 512 tokens (the limit for our pretrained BERT model)
    encoded = qa_tokenizer.encode_plus(text=question, text_pair=answer_text, add_special_tokens=True, truncation=True, max_length=512)
    input_ids = encoded["input_ids"]
    segment_ids = encoded['token_type_ids']

    # There should be a segment_id for every input token.
    assert len(segment_ids) == len(input_ids)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
      model_outputs = qa_model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))
    start_scores = model_outputs[0]
    end_scores = model_outputs[1]

    flat_start_scores.append(start_scores.detach().cpu().numpy().flatten())

    # Store necessary information for easier string extraction later in correct paragraph
    paragraph_start_scores.append(start_scores)
    paragraph_end_scores.append(end_scores)
    paragraph_input_ids.append(input_ids)

  # A softmax over the concatenated scores is monotonic, so its argmax is the
  # argmax of the raw scores; take it directly and avoid rounding-induced ties.
  lengths = [len(scores) for scores in flat_start_scores]
  best_token = int(np.argmax(np.concatenate(flat_start_scores)))

  # Map the flat position back to its paragraph. `boundary` is the first flat
  # index AFTER the current paragraph, so a position equal to it already belongs
  # to the next paragraph (the former `pointer < arg_max` test got this wrong).
  best_paragraph = 0
  boundary = lengths[0]
  while best_token >= boundary:
    best_paragraph += 1
    boundary += lengths[best_paragraph]

  # Return the paragraph containing the best answer, along with its start and end scores and input ids (for reconstruction)
  return best_paragraph, paragraph_input_ids[best_paragraph], paragraph_start_scores[best_paragraph], paragraph_end_scores[best_paragraph]

## Reconstruction du passage à partir des scores et le texte original ##


In [None]:
import collections
import regex
from difflib import SequenceMatcher

def reconstruct_answer(answer_text, input_ids, start_score, end_score):
    """Turn start/end logits into an answer string for one question/paragraph pair.

    Args:
        answer_text: paragraph text in which the decoded span is searched with a
            fuzzy regex, so the answer can be returned as written in the source.
        input_ids: token ids of the encoded input; converted back to tokens here.
        start_score, end_score: tensors whose first row holds the per-token logits.

    Returns:
        The text of the highest-scoring candidate among up to five distinct spans
        and the "<No Answer>" candidate, whose score is the sum of the start and
        end logits at position 0.
    """
    # preliminary predictions based on start and end indexes
    PrelimPrediction = collections.namedtuple( 
        "PrelimPrediction", ["start_index", "end_index", "score"]
    )
    # best predictions containing the original text (if possible) obtained with fuzzy regex string matching
    BestPrediction = collections.namedtuple(
        "BestPrediction", ["text", "score"]
    )

    # Internal method to convert a tensor to a list
    def to_list(tensor):
        return tensor.detach().cpu().tolist()

    # Convert our start and end logit tensors to lists
    start_logits = to_list(start_score)[0]
    end_logits = to_list(end_score)[0]

    # Sort our start and end logits from largest to smallest, keeping track of the index
    start_idx_and_logit = sorted(enumerate(start_logits), key=lambda x: x[1], reverse=True)
    end_idx_and_logit = sorted(enumerate(end_logits), key=lambda x: x[1], reverse=True)

    # Only the five best start positions and the five best end positions are combined
    start_indexes = [idx for idx, logit in start_idx_and_logit[:5]]
    end_indexes = [idx for idx, logit in end_idx_and_logit[:5]]

    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # question tokens are the positions after the first token (position 0) and before the first "[SEP]" token
    question_indexes = [i+1 for i, token in enumerate(tokens[1:tokens.index("[SEP]")])]

    # Sanity check so that the answer span we are looking for does not start before the end and does not include any part of the question
    prelim_preds = []
    for start_index in start_indexes:
        for end_index in end_indexes:
            # throw out invalid predictions
            if start_index in question_indexes:
                continue
            if end_index in question_indexes:
                continue
            if end_index < start_index:
                continue
            prelim_preds.append(
                PrelimPrediction(
                    start_index = start_index,
                    end_index = end_index,
                    score = start_logits[start_index] + end_logits[end_index]
                )
            )

    # Sort so that the highest scores are first
    prelim_preds = sorted(prelim_preds, key=lambda x: (x.score), reverse=True)

    nbest = []
    seen_predictions = []

    for pred in prelim_preds:
        # for now we only care about the top 5 best predictions
        if len(nbest) >= 5: 
            break
            
        # spans starting at position 0 are skipped here; the null answer is added separately below
        if pred.start_index > 0: # non-null answers have start_index > 0
            text = tokenizer.convert_tokens_to_string(tokens[pred.start_index:pred.end_index+1])

            # Normalise the decoded span: trim it and collapse whitespace runs to single spaces
            text = text.strip()
            text = " ".join(text.split())

            # Use regex to try to match the naive text to the original source document (for better EM results)
            # Escape the text so that special symbols does not interfere with our regex search
            escaped_text = regex.escape(text)
            ids = tokenizer.encode(escaped_text, add_special_tokens=False, truncation=True, max_length=512)
            # The error budget of the fuzzy match is the bit length of the token count of the
            # escaped text, i.e. it grows logarithmically with the length of the answer
            max_error = len(ids).bit_length()
            # Allow at least 2 errors
            max_error = 2 if max_error < 2 else max_error
            max_error_string = f"{{e<={max_error}}}"
            fuzzy_constraint = "("+escaped_text+")"+max_error_string
            regex_text = regex.findall(fuzzy_constraint,answer_text,flags=regex.BESTMATCH)
            # Without any regex match the decoded text is kept as is
            if not len(regex_text) == 0:
                regex_text = regex_text[0]
                # The text matched by the regex in the source text must resemble the original answer
                if SequenceMatcher(None, text, regex_text).ratio() >= 0.75:
                    text = regex_text

            if text in seen_predictions:
                continue

            # flag this text as being seen -- if we see it again, don't add it to the nbest list
            seen_predictions.append(text) 

            # add this text prediction to a pruned list of the top 5 best predictions
            nbest.append(BestPrediction(text=text, score=pred.score))

    # Add null answer to our predictions
    nbest.append(BestPrediction(text="<No Answer>", score = start_logits[0] + end_logits[0]))

    # The best answer span is the one with the highest combined logit start and end score
    absolute_best = sorted(nbest, key=lambda x: (x.score), reverse=True)[0]

    return absolute_best.text

## Answer Extraction ##

In [None]:
import math
import random

local_train_ids = pd.read_csv('question-answer-AI/data/train_ids.csv')
number_tests = 100

test_ids = local_train_ids[:number_tests]
test_questions = test_ids["question"].tolist()
test_par_ids = test_ids["paragraph_id"].tolist()
# Questions without an answer have a NaN paragraph id: pair them with a random paragraph.
# randrange(len(corpus)) stays within 0 .. len(corpus)-1, whereas randint(0, len(corpus))
# includes len(corpus) itself, one past the last row, and makes corpus.iloc fail.
test_par_ids = [[random.randrange(len(corpus)) if math.isnan(par_id) else int(par_id)] for par_id in test_par_ids]
test_answers = test_ids["answer"].tolist()
test_ids = test_ids["id"].tolist()
paragraphs = corpus['paragraph']

In [None]:
%%time
result_answers = []
result_paragraph_id = []

for test_question, result_par_id in tqdm(zip(test_questions, test_par_ids)):
    
    answer_texts = []
    for id in result_par_id:
        answer_texts.append(corpus.iloc[id, 1])

    answer_paragraph_index, input_ids, start_logits, end_logits = global_normalize(test_question, answer_texts)
    answer_string = reconstruct_answer(answer_texts[answer_paragraph_index], input_ids, start_logits, end_logits)

    result_answers.append(answer_string)
    result_paragraph_id.append(result_par_id[answer_paragraph_index])

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


CPU times: user 3min 34s, sys: 1.89 s, total: 3min 36s
Wall time: 3min 36s


## Métriques ##
Code tiré de: https://qa.fastforwardlabs.com/no%20answer/null%20threshold/bert/distilbert/exact%20match/f1/robust%20predictions/2020/06/09/Evaluating_BERT_on_SQuAD.html

In [None]:
def normalize_text(s):
    """Lower-case `s`, strip punctuation, drop the articles a/an/the and collapse
    whitespace — the usual normalisation before comparing answers."""
    import string, re

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, truth):
    """Return 1 when prediction and truth are equal after normalisation, else 0."""
    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
    """Token-level F1 between the normalised prediction and truth.

    Returns 1 or 0 when either side has no tokens (1 only if both are empty),
    0 when no token is shared, and the harmonic mean of precision and recall
    otherwise.
    """
    from collections import Counter

    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    # Count shared tokens with their multiplicity. A plain set intersection counts a
    # repeated token once while the denominators count every occurrence, so two
    # identical answers containing a repeated word scored below 1.
    num_common = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())

    # if there are no common tokens then f1 = 0
    if num_common == 0:
        return 0

    prec = num_common / len(pred_tokens)
    rec = num_common / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)

### Métriques pour notre extraction de réponses ###

In [None]:
number_answers = len(test_answers)

# Score every (predicted answer, reference answer) pair once, then split the results.
scored_pairs = [
    (compute_exact_match(given_answer, ground_truth), compute_f1(given_answer, ground_truth))
    for given_answer, ground_truth in tqdm(zip(result_answers, test_answers))
]
EM_scores = [em for em, _ in scored_pairs]
F1_scores = [f1 for _, f1 in scored_pairs]
print("EM score: " + str(np.sum(EM_scores)/number_answers))
print("F1 score: " + str(np.sum(F1_scores)/number_answers))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


EM score: 0.51
F1 score: 0.5689152274003438


### Métriques pour notre baseline ###

In [None]:
def baseline_answer_question(question, answer_text):
    '''
    Locate the answer to `question` inside `answer_text` with the QA model and
    return it as a string. The span runs from the token with the highest start
    score to the token with the highest end score; sub-word pieces (tokens
    beginning with "##") are glued to the token before them.
    '''
    # Encode question and passage together, truncated to the 512-token limit of the model.
    encoding = tokenizer.encode_plus(text=question, text_pair=answer_text, add_special_tokens=True, truncation=True, max_length=512)
    input_ids = encoding["input_ids"]
    segment_ids = encoding['token_type_ids']

    # One segment id per input token.
    assert len(segment_ids) == len(input_ids)

    outputs = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

    # Positions of the best start and end scores.
    first = int(torch.argmax(outputs[0]))
    last = int(torch.argmax(outputs[1]))

    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Begin with the start token, then append the following tokens up to `last`:
    # sub-word pieces without a space, every other token after a space.
    pieces = [tokens[first]]
    for token in tokens[first + 1:last + 1]:
        pieces.append(token[2:] if token[0:2] == '##' else ' ' + token)

    return "".join(pieces)

In [None]:
%%time
baseline_result_answers = []

for test_question, result_par_id in tqdm(zip(test_questions, test_par_ids)):
    answer_string = baseline_answer_question(test_question, corpus.iloc[result_par_id[0], 1])
    baseline_result_answers.append(answer_string)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


CPU times: user 2min 22s, sys: 1.86 s, total: 2min 24s
Wall time: 2min 24s


In [None]:
baseline_number_answers = len(baseline_result_answers)
baseline_EM_scores = []
baseline_F1_scores = []

for prediction, answer in tqdm(zip(baseline_result_answers, test_answers)):
    baseline_EM_scores.append(compute_exact_match(prediction, answer))
    baseline_F1_scores.append(compute_f1(prediction, answer))
# Average over this cell's own count; the previous code divided by `number_answers`,
# a variable owned by the other metrics cell, and left baseline_number_answers unused.
print("EM score: " + str(np.sum(baseline_EM_scores)/baseline_number_answers))
print("F1 score: " + str(np.sum(baseline_F1_scores)/baseline_number_answers))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


EM score: 0.48
F1 score: 0.5374697247197248
