In [40]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os 
import json 
import pickle
import random

import spacy
import numpy as np
import pandas as pd 
from tqdm import tqdm
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from settings import *

# Load the spaCy pipeline 'en_core_web_sm' once; `lemmatize` below runs every
# text through this shared `spy` object.
spy = spacy.load('en_core_web_sm')

In [3]:
# Extract paragraphs and answerable questions from SQuAD.
# The JSON file is opened explicitly as UTF-8 so decoding does not depend on
# the platform's default text encoding.
with open(SQuAD_train, encoding='utf-8') as fr:
    doc = json.load(fr)

paragraphs = []
questions = []
for topic in doc['data']:
    for pgraph in topic['paragraphs']:
        paragraphs.append(pgraph['context'])
        for qa in pgraph['qas']:
            # SQuAD v1.1 files carry no 'is_impossible' flag (every question
            # is answerable there), so a missing key is treated as False.
            if not qa.get('is_impossible', False):
                questions.append(qa['question'])

# Displayed as the cell result: (number of paragraphs, number of questions).
len(paragraphs), len(questions)

(19035, 86821)

In [4]:
# Map words to lemmas

def lemmatize(text):
    """Return `text` with every token replaced by its lemma.

    The text is run through the module-level `spy` pipeline and the
    `lemma_` of each resulting token is joined with single spaces.
    """
    parsed = spy(text)
    return ' '.join(token.lemma_ for token in parsed)

In [10]:
%%time

# Lemmatizing the whole corpus is slow (see the recorded timing), so the
# result is persisted to a feather file and only recomputed when it is missing.
if not os.path.isfile(LEMMA_CACHE):
    lemmas = list(map(lemmatize, tqdm(paragraphs)))
    df = pd.DataFrame({'context': paragraphs, 'lemmas': lemmas})
    df.to_feather(LEMMA_CACHE)

# Always reload from the cache so both code paths leave the same objects
# behind: `paragraphs` and `lemmas` become pandas Series from here on.
df = pd.read_feather(LEMMA_CACHE)
paragraphs = df['context']
lemmas = df['lemmas']

100%|██████████| 19035/19035 [07:02<00:00, 45.06it/s]
CPU times: user 6min 38s, sys: 29.7 s, total: 7min 8s
Wall time: 7min 2s


In [11]:
# Spot-check the lemmatization on 10 randomly drawn paragraphs. Indices are
# drawn independently, so the same paragraph may appear more than once.
rand_idx = [random.randrange(0, len(lemmas)) for _ in range(10)]

# Show the first 80 characters of each original paragraph next to its lemmas.
[(paragraphs[idx][:80], lemmas[idx][:80]) for idx in rand_idx]

[('The Ancient Near East is a term of the 20th century intended to stabilize the ge',
  'the Ancient Near East be a term of the 20th century intend to stabilize the geog'),
 ('Nearly two-thirds of all murders in Michigan in 2011 occurred in Detroit. Althou',
  'nearly two - third of all murder in Michigan in 2011 occur in Detroit . although'),
 ('Bern has a population of 140,634 people and 34% of the population are resident f',
  'Bern have a population of 140,634 people and 34 % of the population be resident '),
 ('The most significant event between the 7th and 11th century was the Tripartite s',
  'the most significant event between the 7th and 11th century be the Tripartite st'),
 ('Nanjing is endowed with rich natural resources, which include more than 40 kinds',
  'Nanjing be endow with rich natural resource , which include more than 40 kind of'),
 ('In 525 BC, the powerful Achaemenid Persians, led by Cambyses II, began their con',
  'in 525 BC , the powerful Achaemenid Persians ,

In [33]:
%%time

# Reuse the fitted vectorizer and TF-IDF matrix when a cache file exists;
# otherwise fit on the lemmatized paragraphs and pickle both objects together.
if os.path.isfile(VECTOR_CACHE):
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # were written by this notebook.
    with open(VECTOR_CACHE, 'rb') as fr:
        cache = pickle.load(fr)
    vectorizer = cache['vectorizer']
    tfidf = cache['tfidf']
else:
    vectorizer = TfidfVectorizer(
        stop_words='english',
        min_df=5,
        max_df=.5,
        ngram_range=(1, 3),
    )
    tfidf = vectorizer.fit_transform(lemmas)
    with open(VECTOR_CACHE, 'wb') as fw:
        pickle.dump({'vectorizer': vectorizer, 'tfidf': tfidf}, fw)

CPU times: user 7.27 s, sys: 400 ms, total: 7.67 s
Wall time: 7.24 s


In [59]:
# Fetch contexts related to question

question = "Who is a notable exponent of pluralistic idealism?"
# The question goes through the same lemmatize() step that produced the text
# the vectorizer was fitted on, then is projected into the fitted vocabulary.
query = vectorizer.transform([lemmatize(question)])
# Cell result: number of positive entries in the query vector, and the
# vocabulary terms they correspond to.
(query>0).sum(), vectorizer.inverse_transform(query)

(4, [array(['exponent', 'idealism', 'notable', 'pluralistic'], dtype='<U42')])

In [60]:
%%time
# Score every paragraph against the query: the product of the TF-IDF matrix
# with the transposed query vector gives one score per paragraph row.
scores = (tfidf * query.T).toarray()
# Ascending argsort along the rows, then reversed on both axes so the
# best-scoring paragraph index comes first.
results = np.argsort(scores, axis=0)[::-1, ::-1]
# Cell result: the three highest-scoring paragraphs.
[paragraphs[idx] for idx in results[:3, 0]]

CPU times: user 4.19 ms, sys: 815 µs, total: 5 ms
Wall time: 4.37 ms


['Pluralistic idealism such as that of Gottfried Leibniz takes the view that there are many individual minds that together underlie the existence of the observed world and make possible the existence of the physical universe. Unlike absolute idealism, pluralistic idealism does not assume the existence of a single ultimate mental reality or "Absolute". Leibniz\' form of idealism, known as Panpsychism, views "monads" as the true atoms of the universe and as entities having perception. The monads are "substantial forms of being",elemental, individual, subject to their own laws, non-interacting, each reflecting the entire universe. Monads are centers of force, which is substance while space, matter and motion are phenomenal and their form and existence is dependent on the simple and immaterial monads. There is a pre-established harmony established by God, the central monad, between the world in the minds of the monads and the external world of objects. Leibniz\'s cosmology embraced traditi

In [61]:
# Extract answers from contexts

# Build a question-answering pipeline; BERT_MODEL (presumably defined in
# settings) is used for both the model weights and the tokenizer.
# NOTE(review): the warning recorded below this cell shows the checkpoint path
# ends in `bert-base-uncased` and reports that some BertForQuestionAnswering
# weights were not initialized from it. If that is a plain pre-trained BERT,
# the QA head is untrained and answer scores will be near-random -- confirm
# that BERT_MODEL should point at a SQuAD-fine-tuned checkpoint instead.
qa_pipe = pipeline('question-answering', model=BERT_MODEL, tokenizer=BERT_MODEL)

Some weights of the model checkpoint at /Users/liuzhi/models/bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint a

In [62]:
%%time 

# Keep those of the 10 best-ranked paragraphs whose retrieval score exceeds
# THRESH and store them next to the question for the answer-extraction step.
THRESH = 0.01
# `scores` has a single column, so index it as [i, 0] to get a plain float
# instead of a 1-element array (the comparison below previously relied on the
# truth value of such an array).
candicate_idxs = [(i, float(scores[i, 0])) for i in results[0:10, 0]]
contexts = [(paragraphs[i], s) for (i, s) in candicate_idxs if s > THRESH]
# Passing `columns` keeps the 'question'/'context' schema even when no
# paragraph passes the threshold, so later cells can still select the columns.
question_df = pd.DataFrame(
    [{'question': question, 'context': ctx} for (ctx, s) in contexts],
    columns=['question', 'context'],
)
# to_feather does not create missing directories, so make sure it exists.
os.makedirs('cache', exist_ok=True)
question_df.to_feather('cache/question_context.feather')

CPU times: user 3.61 ms, sys: 5.27 ms, total: 8.88 ms
Wall time: 8.94 ms


In [63]:
%%time 

# Run the QA model on every (question, context) record and rank the answers.
preds = qa_pipe(question_df.to_dict(orient='records'))
# The question-answering pipeline returns a bare dict instead of a list when
# it is given a single example; wrap it so the DataFrame is always built from
# a list of records.
if isinstance(preds, dict):
    preds = [preds]
answer_df = pd.DataFrame.from_records(preds)
# Index-aligned assignment: both frames are built without an explicit index.
# Assumes the pipeline returns predictions in input order -- TODO confirm.
answer_df['context'] = question_df['context']
answer_df = answer_df.sort_values(by='score', ascending=False)
# Cell result: the highest-scoring answers.
answer_df.head()

CPU times: user 19.5 s, sys: 516 ms, total: 20 s
Wall time: 4.56 s


Unnamed: 0,score,start,end,answer,context
4,0.000222,172,182,change and,Kant argued against all three forms of materia...
9,0.000222,237,272,"""ideal"" character of all phenomena,","Beginning with Immanuel Kant, German idealists..."
2,0.000182,80,96,or reality as we,"In philosophy, idealism is the group of philos..."
3,0.00016,187,242,"idealism"" of Berkeley and the ""transcendental ...",Absolute idealism is G. W. F. Hegel's account ...
5,0.000142,89,144,"(ἰδεῖν), meaning ""to see"". The term entered th...",Idealism is a term with several related meanin...
