In [33]:
import os

# numpy is imported explicitly: the notebook uses `np` throughout and should
# not rely on it leaking in through `from gensim.utils import *`.
import numpy as np
import pandas as pd
from gensim.utils import *
from gensim.parsing.preprocessing import remove_stopwords
import gensim.downloader as api
from gensim.models import KeyedVectors

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [34]:
# Let TensorFlow allocate GPU memory on demand instead of grabbing it all up front.
# The loop (rather than gpus[0]) keeps the notebook runnable on CPU-only machines
# and applies the same setting to every GPU, which TensorFlow requires.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
  tf.config.experimental.set_memory_growth(gpu, True)

In [35]:
# Read the three WikiQA splits (tab-separated files) into DataFrames,
# in the order train, dev, test.
train, dev, test = (
  pd.read_csv(tsv_path, sep='\t')
  for tsv_path in (
    'wikiQA/WikiQA-train.tsv',
    'wikiQA/WikiQA-dev.tsv',
    'wikiQA/WikiQA-test.tsv',
  )
)

In [36]:
# Directory holding the pre-downloaded gensim-data models. It can be overridden
# with the GENSIM_DATA_DIR environment variable so the notebook is not tied to
# one user's home directory; the original location remains the default.
folder=os.environ.get("GENSIM_DATA_DIR", "/vol/home/s2465922/gensim-data/")
file="glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz"

# Load the GloVe vectors stored in word2vec text format.
# os.path.join works whether or not `folder` ends with a path separator.
model_embed = KeyedVectors.load_word2vec_format(os.path.join(folder, file), binary=False)

In [37]:
def clean(df):
  """Return a copy of `df` without the questions that have no correct answer.

  A question is dropped when the sum of its 'Label' values is 0. The result
  has a fresh 0..n-1 index and 'QuestionID' as its first column (a side
  effect of the set_index/reset_index round trip). `df` itself is not modified.
  """
  positives_per_question = df.groupby(['QuestionID'])['Label'].sum()
  unanswered = positives_per_question.loc[positives_per_question.eq(0)].index
  answered_only = df.set_index('QuestionID').drop(unanswered)
  return answered_only.reset_index()

In [38]:
class Word2Seq():
  """Turns sentences into padded word-id sequences and builds the matching embedding matrix.

  model: pre-trained word vectors (indexable by word, exposing `vector_size`).
  docs: iterable of texts used to fit the Keras Tokenizer vocabulary.
  max_sentence_length: length every encoded sentence is padded to.
  """

  def __init__(self, model, docs, max_sentence_length):
    self.model=model
    self.docs=docs
    self.t=self.tokenizer(docs)
    self.maxLen=max_sentence_length

  def tokenizer(self,docs):
    """Fit and return a Keras Tokenizer on `docs`."""
    t=Tokenizer()
    t.fit_on_texts(docs)
    return t

  def create_padded_sentence(self,sentence):
    """Encode one sentence as a 1-D array of `maxLen` word ids, zero-padded at the end."""
    encoded_sentence=self.t.texts_to_sequences([sentence])
    padded_sentence=pad_sequences(encoded_sentence, maxlen=self.maxLen, padding='post')
    return padded_sentence[0]

  def _known_words(self):
    """Return the container of words the embedding model has vectors for.

    gensim >= 4 exposes it as `key_to_index`; older releases as `vocab`
    (which gensim 4 removed), so the new name is tried first.
    """
    known=getattr(self.model, 'key_to_index', None)
    if known is None:
      known=self.model.vocab
    return known

  def create_embedding_matrix(self):
    """Build a (vocab_size, vector_size) matrix whose row i is the vector of word id i.

    Row 0 (the padding id) and the rows of words missing from the embedding
    model stay all-zero.
    """
    # +1 because Tokenizer word ids start at 1; id 0 is used for padding
    vocab_size=len(self.t.word_index)+1
    embedding_matrix=np.zeros((vocab_size, self.model.vector_size))
    known_words=self._known_words()
    for word,i in self.t.word_index.items():
      if word in known_words:
        embedding_matrix[i]=self.model[word]
    return embedding_matrix

In [39]:
class ExampleGenerator():
  """Builds ranking groups and (question, positive, negative) triplets from a QA frame.

  The frames passed to the methods must carry the columns 'QuestionID',
  'Label', 'q_vec' and 'a_vec'. Sampling uses the global NumPy RNG
  (`np.random.choice`), so results depend on its state.
  """

  def __init__(self, nCandidate):
    # negatives sampled per training group / total candidates per test group
    self.nCandidate=nCandidate

  def sample_neg_answers(self, df, QuestionID, nNegs):
    """Return up to `nNegs` wrong answers ('a_vec' values) for one question.

    The question's own wrong answers are used first. If there are not
    enough of them, the remainder is drawn without replacement from the
    answers of all other questions. A negative `nNegs` is treated as 0, and
    when the other questions cannot supply the shortfall, as many answers
    as are available are returned instead of raising.
    """
    # callers pass nCandidate - #positives, which can go below zero
    nNegs=max(nNegs, 0)
    negs=df[(df['QuestionID']==QuestionID) & (df['Label']==0)]['a_vec'].to_list()
    diff=nNegs-len(negs)
    if diff<=0:
      if not negs:
        # nothing requested and nothing available
        return []
      ind=np.random.choice(len(negs), nNegs, replace=False)
      return [negs[i] for i in ind]

    # not enough in-question negatives: top up from other questions' answers
    answer_pool=df[df['QuestionID']!=QuestionID]['a_vec'].to_list()
    nExtra=min(diff, len(answer_pool))
    if nExtra==0:
      return list(negs)
    ind=np.random.choice(len(answer_pool), nExtra, replace=False)
    return negs+[answer_pool[i] for i in ind]

  def create_train_group(self,df):
    '''
    Build one group per correct answer, so a question with multiple
    ground truths yields multiple groups.

    Each group is a list of [q_vec, a_vec, label] rows: the positive
    example first, followed by `nCandidate` sampled negatives.
    '''
    pos_ind=df.loc[df['Label']==1].index
    train_gs=[]
    for i in pos_ind:
      qid, q, p=df.loc[i,['QuestionID','q_vec','a_vec']].to_list()
      examples=[[q, p, int(1)]]
      neg_as=self.sample_neg_answers(df,qid, self.nCandidate)
      examples.extend([q, n, int(0)] for n in neg_as)
      train_gs.append(examples)
    return train_gs

  def create_examples(self,gs):
    """Flatten training groups into [q_vec, positive a_vec, negative a_vec] triplets.

    The first row of each group supplies the question and the positive
    answer; every following row supplies one negative answer.
    """
    examples=[]
    for g in gs:
      q=g[0][0]
      p=g[0][1]
      examples.extend([q, p, row[1]] for row in g[1:])
    return examples

  def create_test_group(self, df):
    '''
    Build one group per question: all of its correct answers first, then
    sampled negatives to fill the group up to `nCandidate` rows.

    Every question must have at least one correct answer (the question
    vector is read from its first positive row); otherwise IndexError is
    raised. A question with more than `nCandidate` correct answers keeps
    all of them and gets no negatives.
    '''
    qids=pd.unique(df['QuestionID']).tolist()
    test_gs=[]
    for qid in qids:
      records=df[(df['QuestionID']==qid) & (df['Label']==1)]
      q=records['q_vec'].tolist()[0]
      pos_as=records['a_vec'].tolist()
      examples=[[q, p, int(1)] for p in pos_as]
      neg_as=self.sample_neg_answers(df, qid, self.nCandidate-len(pos_as))
      examples.extend([q, n, int(0)] for n in neg_as)
      test_gs.append(examples)
    return test_gs
      
    

In [40]:
# Drop the questions that have no correct answer from every split.
train, dev, test = map(clean, (train, dev, test))

In [41]:
# Collect the tokenizer corpus: for each split (train, dev, test) the unique
# questions followed by all candidate sentences.
docs=[
  text
  for frame in (train, dev, test)
  for texts in (pd.unique(frame['Question']).tolist(), frame['Sentence'].to_list())
  for text in texts
]

In [42]:
# Fit the tokenizer on all documents, then encode every question and answer
# as a padded sequence of 40 word ids stored in new 'q_vec' / 'a_vec' columns.
ws=Word2Seq(model_embed, docs, 40)

for frame in (train, dev, test):
  frame['q_vec']=frame['Question'].apply(ws.create_padded_sentence)
  frame['a_vec']=frame['Sentence'].apply(ws.create_padded_sentence)

In [43]:
# Build the training triplets and the evaluation groups with 50 candidates per group.
# NOTE(review): negatives are sampled with the global NumPy RNG and no seed is set
# in the visible cells, so these groups differ from run to run and the statement
# order below determines which samples each split receives.
G=ExampleGenerator(50)
train_gs=G.create_train_group(train)     
trainExamples=G.create_examples(train_gs)

# dev split: triplets for the validation loss, plus per-question groups for ranking metrics
dev_gs=G.create_train_group(dev)    
devExamples=G.create_examples(dev_gs)
dev_gs_for_eval=G.create_test_group(dev)  # for evaluation

test_gs=G.create_test_group(test)

# rows are indexed by tokenizer word id; read as a global by nnet.__init__
embedding_matrix=ws.create_embedding_matrix()

In [44]:
# Input length of the network (read as a global by nnet). It must equal the
# padding length passed to Word2Seq when the sentences were encoded (40).
max_sentence_length=40

In [49]:
class nnet:
  """Siamese sentence encoder trained with a triplet margin loss for answer ranking.

  Question, positive answer and negative answer all go through one shared
  network (frozen pre-trained embedding -> recurrent layer -> global max
  pooling). The model outputs, per triplet, the two normalized dot products
  [sim(q, pos), sim(q, neg)].

  Reads the module-level `embedding_matrix` and `max_sentence_length`, which
  must be defined before the class is instantiated.
  """

  _RECURRENT_TYPES=('gru', 'rnn')

  def __init__(self, rnn_type='gru'):
    """
    rnn_type: 'gru' (default) encodes sentences with a GRU layer, 'rnn' with
      a SimpleRNN layer using the same hyper-parameters.

    Raises ValueError for any other value.
    """
    if rnn_type not in self._RECURRENT_TYPES:
      raise ValueError("rnn_type must be one of %s, got %r" % (self._RECURRENT_TYPES, rnn_type))
    self.rnn_type=rnn_type
    self.embedding_matrix=embedding_matrix
    self.vocab_size, self.vec_size=embedding_matrix.shape
    self.model=self.build_model()

  def create_base_network(self):
    """Build the shared sentence encoder: word ids -> fixed-size sentence vector."""
    in_sentence=Input(shape=(max_sentence_length,))
    # pre-trained vectors are kept frozen
    embedding=Embedding(self.vocab_size, self.vec_size,
                        weights=[self.embedding_matrix],
                        input_length=max_sentence_length,
                        trainable=False)(in_sentence)
    recurrent_layer=GRU if self.rnn_type=='gru' else SimpleRNN
    encoded=recurrent_layer(100, dropout=0.5,
                            return_sequences=True,
                            kernel_initializer='glorot_normal',
                            kernel_regularizer=keras.regularizers.l2(0.005))(embedding)
    # max over time steps gives one vector per sentence
    out=GlobalMaxPooling1D()(encoded)
    return Model(in_sentence, out)

  def build_model(self):
    """Assemble and compile the three-input triplet model around the shared encoder."""
    in_q = Input(shape=(max_sentence_length,), name='in_q')
    in_pos=Input(shape=(max_sentence_length,), name='in_pos')
    in_neg=Input(shape=(max_sentence_length,),name='in_neg')

    # one encoder instance, so the three branches share weights
    self.base_network=self.create_base_network()
    q=self.base_network(in_q)
    pos=self.base_network(in_pos)
    neg=self.base_network(in_neg)

    q_pos = Dot(axes=1, normalize=True)([q,pos])
    q_neg = Dot(axes=1, normalize=True)([q,neg])
    sims=concatenate([q_pos,q_neg])

    model = Model(inputs=[in_q, in_pos, in_neg], outputs=sims)
    model.compile(optimizer=Adam(learning_rate=0.00025),
                  loss=self.triplet_loss)

    return model

  def triplet_loss(self,y_true,y_pred):
    """Mean hinge loss max(0, margin - sim(q,pos) + sim(q,neg)); `y_true` is ignored."""
    margin=0.2
    q_pos_sim, q_neg_sim=y_pred[:,0], y_pred[:,1]
    loss = tf.maximum(tf.constant(0.0),tf.constant(margin) - q_pos_sim + q_neg_sim)
    return tf.reduce_mean(loss)

  @staticmethod
  def _unzip_triplets(examples):
    """Split a list of [q, pos, neg] rows into three stacked arrays."""
    qs, pos_as, neg_as = list(zip(*examples))
    return np.array(qs), np.array(pos_as), np.array(neg_as)

  def train(self, trainExamples, devExamples, batch_size=64, epochs=15, patience=3):
    """Fit the model on triplets with early stopping on the validation loss.

    trainExamples / devExamples: lists of [q_vec, pos_a_vec, neg_a_vec] rows.
    batch_size, epochs: passed to `fit`.
    patience: epochs without `val_loss` improvement before training stops.

    Returns the Keras History object produced by `fit`.
    """
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience)

    qs_train, pos_as_train, neg_as_train = self._unzip_triplets(trainExamples)
    qs_dev, pos_as_dev, neg_as_dev = self._unzip_triplets(devExamples)

    # the loss ignores the targets; Keras only needs arrays of the output shape
    out_train=np.zeros((len(qs_train),2))
    out_dev=np.zeros((len(qs_dev),2))
    history=self.model.fit(x=[qs_train,pos_as_train,neg_as_train], y=[out_train],
                           validation_data=([qs_dev,pos_as_dev,neg_as_dev], [out_dev]),
                           batch_size=batch_size,
                           epochs=epochs,
                           shuffle=True,
                           callbacks=[callback])
    return history

  def predict(self, examples):
    """Return the (n, 2) array [sim(q,pos), sim(q,neg)] for a list of triplets."""
    qs, pos_as, neg_as = self._unzip_triplets(examples)
    sims=self.model.predict([qs,pos_as,neg_as])
    return sims

  def evaluate(self,qa_groups):
    """Rank the candidates of every group and return (MAP, MRR, ACC).

    qa_groups: list of groups, each a list of [q_vec, a_vec, label] rows
      with label 1 for correct answers.

    Groups without any correct answer have no defined rank and are skipped.
    Raises ValueError if no group contains a correct answer.
    """
    ranks=[]
    aps=[]
    for g in qa_groups:
      qs, ans, labels = list(zip(*g))
      if not any(label==1 for label in labels):
        continue
      qs=np.array(qs)
      ans=np.array(ans)
      # only the first output (q vs. candidate) is used for scoring, so the
      # negative branch is fed zeros of the same shape
      dummy_neg=np.zeros((qs.shape[0],qs.shape[1]))
      sims=self.model.predict([qs,ans,dummy_neg])[:,0]

      # sort the similarities descending
      sorted_index=np.argsort(sims)[::-1]

      # 1-based rank positions of the correct answers
      recall_pos=np.array([pos+1 for pos, idx in enumerate(sorted_index)
                           if labels[idx]==1])

      # average precision: mean of (k-th hit / its rank position)
      ap=np.mean((np.arange(len(recall_pos))+1)/recall_pos)
      aps.append(ap)

      # record the position of the first correct answer
      ranks.append(recall_pos[0])

    if not aps:
      raise ValueError("evaluate() needs at least one group containing a correct answer")

    # calculate the mean average precision
    MAP=sum(aps)/len(aps)

    # calculate the MRR
    ranks=np.array(ranks)
    MRR=np.mean(1/ranks)

    # calculate accuracy (precision@1)
    ACC=sum(ranks==1)/len(ranks)
    return MAP, MRR, ACC

In [50]:
# Build a fresh model and train it on the triplets, validating on the dev triplets.
# NOTE(review): nnet() with no arguments builds the GRU encoder, so despite its
# name `rnn` is not a SimpleRNN model as the code stands — confirm.
# NOTE(review): the recorded run of this cell was interrupted (KeyboardInterrupt).
rnn=nnet()
rnn.train(trainExamples, devExamples)

Epoch 1/15


KeyboardInterrupt: 

In [47]:
# Ranking metrics on the dev groups; evaluate() returns (MAP, MRR, ACC).
# NOTE(review): this cell's execution count (47) is lower than that of the cells
# defining and training `rnn` above (49, 50), so the recorded output comes from
# an earlier kernel state and will not be reproduced by Restart & Run All.
rnn.evaluate(dev_gs_for_eval)

(0.5650921325623169, 0.5706768124740137, 0.42063492063492064)

In [48]:
# Ranking metrics (MAP, MRR, ACC) on the test groups.
# NOTE(review): execution count 48 predates the model cells above (49, 50); the
# recorded output belongs to an earlier kernel state.
rnn.evaluate(test_gs)

(0.5478913951680726, 0.5629212286038376, 0.4024896265560166)

In [51]:
# Build a second, freshly initialised model (GRU encoder).
gru=nnet()

In [52]:
# Train on the triplets; the early-stopping callback monitors the dev-set loss.
gru.train(trainExamples, devExamples)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15


In [53]:
# Ranking metrics (MAP, MRR, ACC) of the GRU model on the dev groups.
gru.evaluate(dev_gs_for_eval)

(0.6217317770172716, 0.63310834434784, 0.4603174603174603)

In [54]:
# Ranking metrics (MAP, MRR, ACC) of the GRU model on the test groups.
gru.evaluate(test_gs)

(0.6169315958735563, 0.6380337694330435, 0.4896265560165975)