In [1]:
# Imports 

import numpy as np 
import pandas as pd 
import os

In [2]:
# Dataset Construction

folder = '/Users/takumakoide/Downloads/archive/' # subject to user's directory

df = pd.DataFrame()
for file in ['S10_question_answer_pairs.txt','S09_question_answer_pairs.txt','S08_question_answer_pairs.txt']:
    filename = os.path.join(folder, file)
    df_tmp = pd.read_csv(filename, encoding='latin1', sep='\t').drop_duplicates(subset="Question")
    print(filename, len(df_tmp))
    df = pd.concat([df,df_tmp])

/Users/takumakoide/Downloads/archive/S10_question_answer_pairs.txt 832
/Users/takumakoide/Downloads/archive/S09_question_answer_pairs.txt 598
/Users/takumakoide/Downloads/archive/S08_question_answer_pairs.txt 1033


In [3]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 2463 entries, 0 to 1714
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   ArticleTitle              832 non-null    object
 1   Question                  2461 non-null   object
 2   Answer                    2188 non-null   object
 3   DifficultyFromQuestioner  1570 non-null   object
 4   DifficultyFromAnswerer    2185 non-null   object
 5   ArticleFile               2461 non-null   object
 6   ï»¿ArticleTitle           1631 non-null   object
dtypes: object(7)
memory usage: 153.9+ KB


In [4]:
# normalize the name of columns
df.columns = ['articleTitle','question','answer','difficultyFromQuestioner',
              'difficultyFromAnswerer','articleFile',"articleTitle_to_drop"]

# drop the last column
df.drop('articleTitle_to_drop', axis=1, inplace=True)

In [5]:
# nb of na values in each column
df.isna().sum()

articleTitle                1631
question                       2
answer                       275
difficultyFromQuestioner     893
difficultyFromAnswerer       278
articleFile                    2
dtype: int64

In [6]:
# drop na values
print('original df length: ',len(df))
df.dropna(subset=['question'], inplace=True)
df.dropna(subset=['answer'], inplace=True)
df.dropna(subset=['articleTitle'], inplace=True)
df = df[~df.question.str.contains('#')] # remove badly formatted questions
df = df[~df.answer.isin(['no','yes','Yes','No','No,','Yes,','No.','Yes.','yes.','no.'])] # remove yes/no questions
print('new df length: ',len(df))

original df length:  2463
new df length:  511


In [7]:
pd.options.display.max_colwidth=None
df.head()

Unnamed: 0,articleTitle,question,answer,difficultyFromQuestioner,difficultyFromAnswerer,articleFile
0,Alessandro_Volta,Was Alessandro Volta a professor of chemistry?,Alessandro Volta was not a professor of chemistry.,easy,easy,S10_set4_a10
2,Alessandro_Volta,Did Alessandro Volta invent the remotely operated pistol?,Alessandro Volta did invent the remotely operated pistol.,easy,easy,S10_set4_a10
4,Alessandro_Volta,Was Alessandro Volta taught in public schools?,Volta was taught in public schools.,easy,easy,S10_set4_a10
6,Alessandro_Volta,Who did Alessandro Volta marry?,Alessandro Volta married Teresa Peregrini.,medium,medium,S10_set4_a10
8,Alessandro_Volta,What did Alessandro Volta invent in 1800?,"In 1800, Alessandro Volta invented the voltaic pile.",medium,easy,S10_set4_a10


In [8]:
# load question-answering model

from transformers import pipeline
qa_model = pipeline("question-answering")

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [9]:
def answer_question_given_article(question, article_name):
    '''
    given the question, read the article and use it as a context and return a dictionary with an 
    answer and a score and the position of the answer in the article
    '''
    folder_name = '/Users/takumakoide/Downloads/archive/text_data'
    article_file = str(df[df.articleTitle==article_name].articleFile.unique()[0]) + '.txt.clean'
    article_path = os.path.join(folder_name, article_file)
    
    with open(article_path, 'r') as file:
        context = file.read().replace('\n', '')
        
    return qa_model(question, context)

In [10]:
%%time

# question example and the time it takes to answer it
question_example = """In 1602, the British East India Company's first voyage, commanded by Sir who, arrived in Aceh and sailed on to Banten where they were allowed to build a trading post?"""
answer_question_given_article(question_example, "Jakarta")

CPU times: user 8.6 s, sys: 1.58 s, total: 10.2 s
Wall time: 5.62 s


{'score': 0.8335656523704529,
 'start': 3262,
 'end': 3281,
 'answer': 'Sir James Lancaster'}

In [11]:
# let's compare with what the answerer provided 

df[df.question==question_example]

Unnamed: 0,articleTitle,question,answer,difficultyFromQuestioner,difficultyFromAnswerer,articleFile
663,Jakarta,"In 1602, the British East India Company's first voyage, commanded by Sir who, arrived in Aceh and sailed on to Banten where they were allowed to build a trading post?",James Lancaster,,medium,S10_set3_a5


In [12]:
# Apply the answering function on a small sample of 20 questions
df_sample = df.sample(30)

# complete answer
df_sample['answer_from_model'] = df_sample.apply(lambda x: answer_question_given_article(x.question, x.articleTitle), axis=1)

# extract the exact answer 
df_sample['models_answer'] = df_sample['answer_from_model'].map(lambda x:x['answer'])

# extract the score (confidence of the model)
df_sample['models_score'] = df_sample['answer_from_model'].map(lambda x:x['score'])


In [13]:
cols_of_interst = ['question','answer','models_answer','models_score']
df_sample.head()[cols_of_interst]

Unnamed: 0,question,answer,models_answer,models_score
464,What is the earliest extant transverse flute?,it dates from 433 BC,a chi,0.455581
1436,Did the golden age of xylophones come before or after the first usage of the European-derived orchestral?,after,since the Middle Ages,0.814323
889,Where is Melbourne situated?,boundary of the very hot inland areas and the cold southern ocean,Derbyshire,0.669771
725,What is the official language of North Korea?,Korean,Korean,0.86495
333,What are the three heaviest cats in the world?,"The three heaviest cats in the world are the tiger, lion and jaguar.",bobcat and Canadian lynx,0.854326


In [14]:
# pip install -Uq sentence-transformers


In [15]:
def how_similar(sent1, sent2, model):
    '''
    compute sentence similarity. 
    '''
    #Compute embedding for both lists
    embedding_1= model.encode(str(sent1), convert_to_tensor=True)
    embedding_2 = model.encode(str(sent2), convert_to_tensor=True)

    return util.pytorch_cos_sim(embedding_1, embedding_2).item()

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
df_sample['answers_similarity'] = df_sample.apply(lambda x: how_similar(x.answer, x.models_answer, model=model), axis=1)

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [16]:
# let's inspect 
df_sample[['question','answer',
           'models_answer','answers_similarity','models_score']].sort_values(by='answers_similarity', ascending=False)

Unnamed: 0,question,answer,models_answer,answers_similarity,models_score
1073,Give an example of the most comprehensive records extant of any major artists beginnings.,Museu Picasso,Museu Picasso,1.0,0.471382
725,What is the official language of North Korea?,Korean,Korean,1.0,0.86495
887,How many sister cities does the City of Melbourne have?,six,six,1.0,0.894697
781,What is the city's population?,1.6 million,1.6 million,1.0,0.870069
794,"When did Baldassare Castiglione, author of Il Cortegiano ( "" The Courtier "" ), write?",1528,1528,1.0,0.961219
843,What is the basic word order in Malay?,Subject Verb Object,Subject Verb Object,1.0,0.957562
458,For how long has the classification of dialects spoken outside of Finland been a controversial issue?,Since Finland's independence in 1917,since Finland's independence in 1917,1.0,0.837275
1024,When did Tesla receive his last patent?,1928,1928,1.0,0.97432
982,Which embassy in Nairobi was bombed in August 1998?,The United States Embassy,United States Embassy,0.972272,0.374236
309,Do linguists often view Chinese as a language family?,"Yes, linguists often view Chinese as a language family.",Linguists often view Chinese as a language family,0.943564,0.337964
