In [30]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [14]:
gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(gpus[0], True)

In [31]:
texts=pd.read_csv('wikiQA/WikiQA.tsv', sep='\t')
train=pd.read_csv('wikiQA/WikiQA-train.tsv', sep='\t')
dev=pd.read_csv('wikiQA/WikiQA-dev.tsv', sep='\t')
test=pd.read_csv('wikiQA/WikiQA-test.tsv', sep='\t')

In [32]:
def clean(df):
  # delete questions without correct answers
  ls=df.groupby(['QuestionID'])['Label'].sum()
  ind=ls[ls==0].index
  df=df.set_index('QuestionID').drop(ind)
  return df.reset_index()

In [33]:
texts=clean(texts)
dev=clean(dev)
test=clean(test)
train=clean(train)

In [18]:
# generate all documents
questions=pd.unique(texts['Question']).tolist()
answers=texts['Sentence'].to_list()
corpus=questions+answers

In [19]:
t=Tokenizer()
t.fit_on_texts(corpus)

In [20]:
tfidf=t.texts_to_matrix(corpus, mode='tfidf')

In [21]:
ct={}
qlen=len(questions)
alen=len(answers)

qids=pd.unique(texts['QuestionID']).tolist()
aids=texts['SentenceID'].to_list()

for i in range(qlen):
  ct[qids[i]]=i
  
for i in range(alen):
  ct[aids[i]]=i+qlen

In [22]:
class ExampleGenerator():
  def __init__(self, nCandidate):
    self.nCandidate=nCandidate
    
  def sample_neg_answers(self, df, QuestionID, nNegs):
    neg_as=[]
    negs=df[(df['QuestionID']==QuestionID) & (df['Label']==0)]['SentenceID'].to_list()
    diff=nNegs-len(negs)
    if diff<=0:
      ind=np.random.choice(len(negs), nNegs, replace=False)
      for i in ind:
        neg_as.append(negs[i])
    else:
      neg_as=neg_as+negs
      answer_pool=df[df['QuestionID']!=QuestionID]['SentenceID'].to_list()
      ind=np.random.choice(len(answer_pool),diff,replace=False)
      for i in ind:
        neg_as.append(answer_pool[i]) 
    return neg_as
        
  def create_test_group(self, df):
    '''
    treat each question as a group 
    '''
    qids=pd.unique(df['QuestionID']).tolist()
    test_gs=[]
    for qid in qids:
      examples=[]
      records=df[(df['QuestionID']==qid) & (df['Label']==1)]
      pos_as=records['SentenceID'].tolist()
      pos_examples=list(map(lambda p: [qid, p, int(1)], pos_as))
      examples.extend(pos_examples)
      neg_as=self.sample_neg_answers(df, qid, self.nCandidate-len(pos_as))
      neg_examples=list(map(lambda n: [qid, n, int(0)], neg_as))
      examples.extend(neg_examples)
      test_gs.append(examples)
    return test_gs

In [24]:
G=ExampleGenerator(50)
# gps=G.create_test_group(texts)

dev_gs_for_eval=G.create_test_group(dev)
test_gs=G.create_test_group(test)

In [25]:
def cosine_sim(a,b):
  return np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b))

In [27]:
def evaluate(gps):
  ranks=[]
  aps=[]
  for g in gps:
    sims=[]
    recall_pos=[]

    qs, ans, labels = list(zip(*g))

    # get similarities for each group
    for i in range(len(g)):
      ind1, ind2=ct[qs[i]], ct[ans[i]]
      sim=cosine_sim(tfidf[ind1],tfidf[ind2])
      sims.append(sim)

    # sort the similarities desceding
    sorted_index=np.argsort(sims)[::-1]

    for i in range(len(sorted_index)):
      if labels[sorted_index[i]]==1:
        # record the postion of correct answers
        recall_pos.append(i+1)
    recall_pos=np.array(recall_pos)

    # calculate the average precision
    ap=np.mean((np.arange(len(recall_pos))+1)/recall_pos)
    aps.append(ap)

    # record the postion of first correct answer
    ranks.append(recall_pos[0])

  # calculate the mean average precision
  MAP=sum(aps)/len(aps)

  # calculate the MRR
  ranks=np.array(ranks)
  MRR=np.mean(1/ranks)

  # calculate accuracy (precision@1)
  ACC=sum(ranks==1)/len(ranks)
  return MAP,MRR,ACC

In [28]:
evaluate(dev_gs_for_eval)

(0.5048147410808926, 0.5113042641615886, 0.3253968253968254)

In [29]:
evaluate(test_gs)

(0.5220296515084796, 0.5398578609386567, 0.3817427385892116)

In [20]:
print("MAP=%.5f"%(MAP))
print("MRR=%.5f"%(MRR))
print("ACC=%.5f"%(ACC))

MAP=0.51899
MRR=0.53657
ACC=0.36672


In [43]:
from gensim.utils import tokenize

def count_word(sentence):
  tokens=list(tokenize(sentence))
  return len(tokens)

In [44]:
train_count=train['Sentence'].apply(count_word).tolist()
dev_count=dev['Sentence'].apply(count_word).tolist()
test_count=test['Sentence'].apply(count_word).tolist()

In [45]:
avg_train=sum(train_count)/len(train_count)
avg_dev=sum(dev_count)/len(dev_count)
avg_test=sum(test_count)/len(test_count)

In [46]:
print(avg_train)
print(avg_dev)
print(avg_test)

21.952804061850912
21.15575221238938
21.561691113028473


In [49]:
len(test_count)

2318