In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os 
import json 
import pickle
import random

import spacy
import numpy as np
import pandas as pd 
from tqdm import tqdm
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from settings import *

# Load the spaCy pipeline once at start-up; `lemmatize` reuses this object on every call.
spy = spacy.load('en_core_web_sm')

In [3]:
# Extract paragraphs and answerable questions from SQuAD.
# JSON is UTF-8 by specification and the corpus contains non-ASCII text, so the
# encoding is given explicitly instead of relying on the platform default.
with open(SQuAD_train, encoding='utf-8') as fr:
    doc = json.load(fr)

paragraphs = []
questions = []
for topic in doc['data']:
    for pgraph in topic['paragraphs']:
        # Every paragraph is kept as a retrieval candidate.
        paragraphs.append(pgraph['context'])
        for qa in pgraph['qas']:
            # Records without an `is_impossible` key are kept (treated as
            # answerable) instead of raising KeyError.
            if not qa.get('is_impossible', False):
                questions.append(qa['question'])

len(paragraphs), len(questions)

(19035, 86821)

In [4]:
# Map words to lemmas

def lemmatize(text):
    """Return `text` with every spaCy token replaced by its lemma, joined by single spaces."""
    tokens = spy(text)
    return ' '.join(token.lemma_ for token in tokens)

In [5]:
%%time

# First run only: lemmatize every paragraph (slow) and persist the result.
if not os.path.isfile(LEMMA_CACHE):
    lemmas = list(map(lemmatize, tqdm(paragraphs)))
    df = pd.DataFrame({'context': paragraphs, 'lemmas': lemmas})
    df.to_feather(LEMMA_CACHE)

# Always read back from disk so `paragraphs` and `lemmas` end up as pandas
# Series whether or not the cache had to be built in this run.
df = pd.read_feather(LEMMA_CACHE)
paragraphs, lemmas = df['context'], df['lemmas']

CPU times: user 121 ms, sys: 59.7 ms, total: 181 ms
Wall time: 256 ms


In [6]:
# Preview up to 10 (paragraph, lemmas) pairs, each truncated to 80 characters.
# A locally seeded RNG makes the preview identical on every "Restart & Run All"
# without touching the global `random` state. `sample` draws distinct rows, and
# the `min` guard avoids an exception when fewer than 10 rows are available.
rand_idx = random.Random(42).sample(range(len(lemmas)), min(10, len(lemmas)))

[(paragraphs[i][:80], lemmas[i][:80]) for i in rand_idx]

[('Euro1080, a division of the former and now bankrupt Belgian TV services company ',
  'Euro1080 , a division of the former and now bankrupt belgian tv service company '),
 ('The term child labour can be misleading when it confuses harmful work with emplo',
  'the term child labour can be mislead when -PRON- confuse harmful work with emplo'),
 ('On September 21, 1529, Álvaro de Saavedra Cerón commanded the Spanish ship Flori',
  'on September 21 , 1529 , Álvaro de Saavedra Cerón command the spanish ship Flori'),
 ('From an institutional perspective, the rules of the House assign a number of spe',
  'from an institutional perspective , the rule of the House assign a number of spe'),
 ('The Recommended Exposure Index (REI) technique, new in the 2006 version of the s',
  'the Recommended Exposure Index ( REI ) technique , new in the 2006 version of th'),
 ('Arsenal was the first club from the south of England to join The Football League',
  'Arsenal be the first club from the south of En

In [7]:
%%time

# Reuse the fitted vectorizer and TF-IDF matrix when a cache file exists;
# otherwise fit on the lemmatized paragraphs and persist both objects together.
if os.path.isfile(VECTOR_CACHE):
    # NOTE: unpickling executes arbitrary code. Only load a cache file that
    # this notebook wrote itself.
    with open(VECTOR_CACHE, 'rb') as fr:
        cache = pickle.load(fr)
    vectorizer, tfidf = cache['vectorizer'], cache['tfidf']
else:
    vectorizer = TfidfVectorizer(
        stop_words='english',
        min_df=5,
        max_df=.5,
        ngram_range=(1, 3),
    )
    tfidf = vectorizer.fit_transform(lemmas)
    with open(VECTOR_CACHE, 'wb') as fw:
        pickle.dump({'vectorizer': vectorizer, 'tfidf': tfidf}, fw)

CPU times: user 673 ms, sys: 156 ms, total: 828 ms
Wall time: 846 ms


In [8]:
# Fetch contexts related to question

question = "Who is a notable exponent of pluralistic idealism?"

# The question goes through the same lemmatization as the paragraphs before it
# is projected onto the fitted vocabulary.
query = vectorizer.transform(
    [lemmatize(question)]
)

# Number of vocabulary terms hit by the question, and which terms they are.
(
    (query > 0).sum(),
    vectorizer.inverse_transform(query),
)

(4, [array(['exponent', 'idealism', 'notable', 'pluralistic'], dtype='<U42')])

In [9]:
%%time
# Compare the vectorized query against all paragraphs in the corpus.
# `@` is always a matrix product. `*` only means that for scipy's legacy
# sparse-matrix classes and is element-wise for sparse arrays.
scores = (tfidf @ query.T).toarray()
# Rank paragraph indices by descending score. The flip is restricted to axis 0
# so that only the row order is reversed; a bare np.flip reverses every axis,
# which is only harmless while `scores` has a single column.
results = np.flip(np.argsort(scores, axis=0), axis=0)
# Show the three best-matching paragraphs for the (single) query column.
[paragraphs[i] for i in results[:3, 0]]

CPU times: user 4.39 ms, sys: 691 µs, total: 5.08 ms
Wall time: 5.05 ms


['Pluralistic idealism such as that of Gottfried Leibniz takes the view that there are many individual minds that together underlie the existence of the observed world and make possible the existence of the physical universe. Unlike absolute idealism, pluralistic idealism does not assume the existence of a single ultimate mental reality or "Absolute". Leibniz\' form of idealism, known as Panpsychism, views "monads" as the true atoms of the universe and as entities having perception. The monads are "substantial forms of being",elemental, individual, subject to their own laws, non-interacting, each reflecting the entire universe. Monads are centers of force, which is substance while space, matter and motion are phenomenal and their form and existence is dependent on the simple and immaterial monads. There is a pre-established harmony established by God, the central monad, between the world in the minds of the monads and the external world of objects. Leibniz\'s cosmology embraced traditi

In [10]:
# Extract answers from contexts

# The model weights and the tokenizer are both resolved from BERT_MODEL.
qa_pipe = pipeline(
    'question-answering',
    model=BERT_MODEL,
    tokenizer=BERT_MODEL,
)

In [11]:
%%time 

# Minimum retrieval score a paragraph needs to be kept as a context.
THRESH = 0.01

# Top-10 ranked paragraph indices paired with their score. `scores` is a
# single-column 2-D array, so index the column explicitly to get a scalar
# rather than a 1-element array.
candicate_idxs = [(i, scores[i, 0]) for i in results[0:10, 0]]
contexts = [(paragraphs[i], s) for (i, s) in candicate_idxs if s > THRESH]

# One record per retained paragraph, in ranking order.
question_df = pd.DataFrame.from_records([{
    'question': question,
    'context': ctx
} for (ctx, s) in contexts])

# The target directory is not guaranteed to exist on a fresh checkout.
os.makedirs('cache', exist_ok=True)
question_df.to_feather('cache/question_context.feather')

CPU times: user 3.02 ms, sys: 3.54 ms, total: 6.56 ms
Wall time: 24.8 ms


In [12]:
%%time 

# Run the reader over every (question, context) record.
preds = qa_pipe(question_df.to_dict(orient='records'))
# The transformers question-answering pipeline returns a bare dict instead of a
# list when there is exactly one input; normalize so from_records always
# receives a list of records.
if isinstance(preds, dict):
    preds = [preds]
answer_df = pd.DataFrame.from_records(preds)
# Both frames are built with from_records and share the default positional
# index, so the contexts line up row-for-row with the predictions.
answer_df['context'] = question_df['context']
# Most confident answers first.
answer_df = answer_df.sort_values(by='score', ascending=False)
answer_df.head()

CPU times: user 12.1 s, sys: 364 ms, total: 12.5 s
Wall time: 3.27 s


Unnamed: 0,score,start,end,answer,context
0,0.98691,37,54,Gottfried Leibniz,Pluralistic idealism such as that of Gottfried...
4,0.889905,0,4,Kant,Kant argued against all three forms of materia...
5,0.879589,200,216,Woodrow Wilson's,Idealism is a term with several related meanin...
1,0.699285,106,117,Descartes's,The 2nd edition (1787) contained a Refutation ...
7,0.696889,342,359,Platonic idealism,Any philosophy that assigns crucial importance...
