In [1]:
import json
import nltk
import random
import requests
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
import numpy as np

from rank_bm25 import BM25Okapi

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/U4VN/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/U4VN/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def load_squad_dataset(file_path):
    with open(file_path, 'r') as f:
        dataset = json.load(f)
    return dataset['data']

squad_data = load_squad_dataset('./train-v2.0.json')
squad_data.extend(load_squad_dataset('./dev-v2.0.json'))
print(len(squad_data))

458


<pre>
[
    {'title': TITLE,
     'paragraphs' : 
     [{
         'qas': [
             {'question':, 'answers': [{'text': }], 'is_impossible':}
          ],
          'context' : 
      }]
     }
]
</pre>

Contexts have several sentences. Questions have answer in some sentence.
<pre>
    sentences = []
    questions = []
    q2sent[question] = sent_with_answer #dict
</pre>

In [36]:
sentences = []
questions = []
q2s = {}
last_sent_idx = 0
cur_q_idx = -1
for data in squad_data:
    for paragraph in data['paragraphs']:
        last_sent_idx = len(sentences)
        context = paragraph['context']
        context_sentences = sent_tokenize(context)
        context_sentences_len = [len(sent) for sent in context_sentences]
        sentences.extend(context_sentences)
        
        for qa in paragraph['qas']:
            question = qa['question']
            if qa['is_impossible']:
                continue #next question with answer
            
            answer_start = qa['answers'][0]['answer_start']
            answer_text = qa['answers'][0]['text']
            cur_q_idx += 1
            questions.append(question)
            q2s[cur_q_idx] = None
            
            # Find the sentence containing the answer span
            for idx, s in enumerate(context_sentences):
                start_index = context.find(s)
                end_index = start_index + len(s)
                if start_index <= answer_start < end_index:
                    q2s[cur_q_idx] = idx + last_sent_idx
                    break



In [37]:
print(len(sentences), len(questions), len(q2s))

97870 89731 89731


In [40]:
qid = 1000
questions[qid]

"What nationality were the two friends who served as a pivotal influence in Frédéric's life while in Paris?"

In [41]:
sentences[q2s[qid]]

"Two Polish friends in Paris were also to play important roles in Chopin's life there."

In [23]:
stop_words = set(stopwords.words('english'))

In [24]:
def get_words(sentence, remove_stop_words=True):
    words = word_tokenize(sentence.lower())
    filtered_sentence = [w for w in words if not w in stop_words]
    return filtered_sentence

In [42]:
tokenized_corpus = [get_words(sent) for sent in sentences]
bm25 = BM25Okapi(tokenized_corpus)

In [26]:
def get_best_sents(query, bm25, k=5):
    tokenized_query = get_words(query)
    scores = bm25.get_scores(tokenized_query)
    results = np.argpartition(scores, -k)[-k:]
    return results

In [43]:
i = 1000
tq = get_words(questions[i])
print(questions[i], sentences[q2s[i]])
results = get_best_sents(questions[i], bm25, 10)
[sentences[i] for i in results]

What nationality were the two friends who served as a pivotal influence in Frédéric's life while in Paris? Two Polish friends in Paris were also to play important roles in Chopin's life there.


['His influence on the brotherhood permeated nearly every aspect of Dominican life.',
 'Famous musicians include Władysław Szpilman and Frédéric Chopin.',
 "The thoughts, ideas and concepts developed at this period of life greatly influence one's future life, playing a major role in character and personality formation.",
 'Just in the last two decades,[when?]',
 "Two Polish friends in Paris were also to play important roles in Chopin's life there.",
 "It first gained influence in England and France; in England, Sir William Hamilton's excavations at Pompeii and other sites, the influence of the Grand Tour and the work of William Chambers and Robert Adam, was pivotal in this regard.",
 'On his way back to Paris, he met old friends from Warsaw, the Wodzińskis.',
 "[note 4] He served in Valence and Auxonne until after the outbreak of the Revolution in 1789, and took nearly two years' leave in Corsica and Paris during this period.",
 "The CPY's influence on the political life of the Kingdom

In [None]:
import json
from tqdm import tqdm
import concurrent.futures

N_NEG_SAMPLES = 10
triples = []

def process_question(q_idx, question, q2s, sentences, bm25):
    if q_idx in q2s:
        correct_sentence_idxs = q2s[q_idx]
        for correct_sentence_idx in correct_sentence_idxs:
            correct_sentence = sentences[correct_sentence_idx]
            neg_sentences_idx = get_best_sents(question, bm25, N_NEG_SAMPLES)
            neg_sentences = []
            for neg_s in neg_sentences_idx:
                if sentences[neg_s] != correct_sentence:
                    neg_sentences.append(sentences[neg_s])
            neg_sentences = list(set(neg_sentences))

            for neg_sent in neg_sentences:
                triples.append({'question': question, 'pos_sentence': correct_sentence, 'neg_sentence': neg_sent})


# Use ThreadPoolExecutor for parallel processing
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    # tqdm doesn't work well with multithreading, so remove it if needed
    futures = [executor.submit(process_question, q_idx, question, q2s, sentences, bm25) for q_idx, question in tqdm(enumerate(questions))]

    # Wait for all threads to finish
    concurrent.futures.wait(futures)

# Dump the result to a file
with open('pt-br-squad-1.1-hard-neg.json', 'w') as fout:
    json.dump(triples, fout)

 36%|███▌      | 31877/89731 [1:28:14<2:53:12,  5.57it/s]

In [110]:
triples[:5]

[{'question': 'When did Beyonce start becoming popular?',
  'pos_sentence': "Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child.",
  'neg_sentence': 'About the time of Damascene, the public celebration of the "Conception of St. Ann [i.e., of the Theotokos in her womb]" was becoming popular.'},
 {'question': 'When did Beyonce start becoming popular?',
  'pos_sentence': "Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child.",
  'neg_sentence': 'His original name is unknown, but it seems that he was absorbed by the more popular Apollo, who stood by the virgin "Mistress of the Animals", becoming her brother.'},
 {'question': 'When did Beyonce start becoming popular?',
  'pos_sentence': "Born and raised in Houston, Texas,