In [32]:
import tensorflow as tf
import pandas as pd
import numpy as np
import json
import nltk
import gensim
import itertools
import tflearn as tl

In [33]:
lmap=lambda func,it:list(map(func,it))

In [34]:
with open('documents.json','r+') as f:
    documents=json.loads(f.read())

In [35]:
with open('training.json','r+') as f:
    query=json.loads(f.read())

In [36]:
with open('testing.json','r+') as f:
    test=json.loads(f.read())

In [38]:
tokenizer=nltk.tokenize.SpaceTokenizer()
sentence_tokenizer=nltk.tokenize.PunktSentenceTokenizer()

In [39]:
paragraphs={}
for d in documents:
    doc_id=d['docid']
    for pi,p in enumerate(d['text']):
        text=p.lower().replace('"','').replace('`','').replace('``','').replace("''",'').replace('``','')
        text=list(filter(lambda x:len(x.strip())>0,tokenizer.tokenize(text)))
        paragraphs[(doc_id,pi)]=text

In [46]:
processed_query=[]
for q in query:
    docid=q['docid']
    paraid=q['answer_paragraph']
    question_text=q['question'].lower().replace('"','').replace('`','').replace('``','').replace("''",'').replace('``','')
    question_token=list(filter(lambda x:len(x.strip())>0,tokenizer.tokenize(question_text)))
    answer_text=q['text'].lower().replace('"','').replace('`','').replace('``','').replace("''",'').replace('``','')
    answer_token=list(filter(lambda x:len(x.strip())>0,tokenizer.tokenize(answer_text)))
    processed_query.append({'dp_id':(docid,paraid),'question':question_token,'answer':answer_token})

In [47]:
# documents=lmap(lambda x:' '.join(x['text']).lower().replace('"','').replace('`','').replace('``','').replace("''",'').replace('``',''),documents)
# query=lmap(lambda x:{'dpid':(x['docid'],x['answer_paragraph']),
#                      'question':x['question'].lower().replace('``','').replace("''",'').replace('``',''),
#                      'text':x['text'].lower().replace('``','').replace("''",'').replace('``','')},query)

In [48]:
# document_sentences=lmap(lambda x:sentence_tokenizer.tokenize(x),documents)

In [49]:
# document_sentences=list(itertools.chain.from_iterable(document_sentences))

In [50]:
# doc_tokens=lmap(lambda x:list(filter(lambda x:len(x)>0,tokenizer.tokenize(x))),documents)
# query_tokens=lmap(lambda x:list(filter(lambda x:len(x)>0,tokenizer.tokenize(x['question']))),query)
# answer_tokens=lmap(lambda x:list(filter(lambda x:len(x)>0,tokenizer.tokenize(x['text']))),query)

In [51]:
doc_tokens=list(paragraphs.values())

In [52]:
query_tokens=lmap(lambda x: x['question'], processed_query)

In [53]:
answer_tokens=lmap(lambda x: x['answer'], processed_query)

In [54]:
all_tokens=doc_tokens+query_tokens+answer_tokens

In [55]:
w2c=gensim.models.Word2Vec(all_tokens,min_count=0,size=100)

In [56]:
%%time
w2c.train(all_tokens,epochs=5,total_examples=w2c.corpus_count)

CPU times: user 36.2 s, sys: 156 ms, total: 36.3 s
Wall time: 13.1 s


(10703726, 13582995)

In [57]:
# doc_embeddings={}
# for i,d in enumerate(documents):
#     tokens=list(filter(lambda x:len(x)>0,tokenizer.tokenize(d.lower())))
# #     sentences=sentence_tokenizer.tokenize(d)
# #     tokens=lmap(lambda x:tokenizer.tokenize(x),sentences)
# #     tokens=list(itertools.chain.from_iterable(tokens))
#     doc_embeddings[i]=(tokens,np.array(lmap(lambda x: w2c.wv[x.lower()] ,tokens)))

In [58]:
for k,v in paragraphs.items():
    paragraphs[k]=(v,np.array(lmap(lambda x: w2c.wv[x.lower()] ,v)))

In [59]:
def lookup_answer_index(answer_tokens,doc_tokens):
    doc_length=len(doc_tokens)
    answer_length=len(answer_tokens)
    for i in range(doc_length):
        if doc_length-i<answer_length:
            return (0,0)
        found=True
        for j in range(answer_length):
            found=answer_tokens[j] in doc_tokens[i+j]
            if not found: break
        if found:
            return (i,i+answer_length-1)

In [60]:
def crop_pad_question(max_leng,embedding):
    if embedding.shape[0]>max_leng:
        return embedding[:max_leng]
    dim=embedding.shape[1]
    pad_leng=max_leng - embedding.shape[0]
    padded_embedding=np.concatenate((embedding,np.zeros((pad_leng,dim))))
    assert padded_embedding.shape[0]==max_leng
    return padded_embedding

In [61]:
max_question_length=max(lmap(lambda x:len(x['question']),processed_query))
for q in processed_query:
#     {'dp_id':(doc_id,paraid),'question':question_token,'answer':answer_token}
    doc_t=paragraphs[q['dp_id']][0]
    answer_t=q['answer']
    query_t=q['question']
    start,end=lookup_answer_index(answer_t,doc_t)
    start_vec=np.zeros((len(doc_t),1),dtype=np.float32)
    start_vec[start]=1.0
    end_vec=np.zeros((len(doc_t),1),dtype=np.float32)
    end_vec[end]=1.0
    q['answer_span']=(start_vec,end_vec)
    query_embedding=np.array(lmap(lambda x:w2c.wv[x],query_t))
    q['query_embedding']=crop_pad_question(max_question_length,query_embedding)

In [62]:
# query_embeddings={}
# for i,q in enumerate(query):
#     query_tokens=list(filter(lambda x:len(x)>0,tokenizer.tokenize(q['question'].lower()))) 
#     answer_tokens=list(filter(lambda x:len(x)>0,tokenizer.tokenize(q['text'].lower()))) 
#     doc_tokens=doc_embeddings[q['docid']][0]
#     start,end=lookup_answer_index(answer_tokens,doc_tokens)
#     query_matrix=np.array(lmap(lambda x:w2c.wv[x],query_tokens))
# #     answer_matrix=np.array(lmap(lambda x:w2c.wv[x],answer_tokens))
#     start_vec=np.zeros(len(doc_tokens),dtype=np.float32)
#     start_vec[start]=1.0
#     end_vec=np.zeros(len(doc_tokens),dtype=np.float32)
#     end_vec[end]=1.0
#     query_embeddings[i]=(q['docid'],i,query_matrix,start_vec,end_vec,answer_tokens)

In [93]:
class RNNQANet():
    def __init__(self,rnn_units_number=[100,50],rnn_attention_length=5,learning_rate=0.001):
        tf.reset_default_graph()
        self.question=tf.placeholder(shape=[None,40,100],dtype=tf.float32,name='question')
        self.context=tf.placeholder(shape=[1,None,100],dtype=tf.float32,name='context')
        self.y_start=tf.placeholder(shape=[None,None,1],dtype=tf.float32,name='y_start')
        self.y_end=tf.placeholder(shape=[None,None,1],dtype=tf.float32,name='y_end')
        self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, shape=[], name='dropout_keep_prob')
        self.question_input=self.question
        self.context_input=self.context
        with tf.variable_scope('rnn_context_encoder',initializer=tf.contrib.layers.xavier_initializer()):
            fcell=self._add_layerd_RNN_attention_cell(units_number=rnn_units_number,attention_length=rnn_attention_length,activation=tf.nn.relu)
            bcell=self._add_layerd_RNN_attention_cell(units_number=rnn_units_number,attention_length=rnn_attention_length,activation=tf.nn.relu)
            self.context_output,self.c_state=tf.nn.bidirectional_dynamic_rnn(inputs=self.context_input,cell_fw=fcell,cell_bw=bcell,dtype=tf.float32)
#             self.context_output,self.c_state=tf.nn.dynamic_rnn(inputs=self.context_input,cell=cell,dtype=tf.float32)
            self.context_output=tf.concat(self.context_output,2)
            self.context_output=tf.unstack(self.context_output,axis=0)[0]
            self.context_output=tl.layers.batch_normalization(self.context_output)
        with tf.variable_scope('rnn_question_encoder',initializer=tf.contrib.layers.xavier_initializer()):
#             cell=self._add_layerd_RNN_attention_cell(units_number=rnn_units_number,attention_length=rnn_attention_length,activation=tf.nn.relu)
#             bcell=self._add_layerd_RNN_attention_cell(units_number=rnn_units_number,attention_length=rnn_attention_length,activation=tf.nn.relu)
#             self.qoutput,self.q_state=tf.nn.bidirectional_dynamic_rnn(inputs=self.question_input,cell_fw=fcell,cell_bw=bcell,dtype=tf.float32)
#             print(self.q_state[0])
            _,self.q_state=self._encoder_block(inputs=self.question_input,units_number= rnn_units_number[-1]*2,attention_length=rnn_attention_length)
            self.question_output=self.q_state[0]
            
            self.question_output=tl.layers.batch_normalization(self.question_output)
        with tf.variable_scope('context_query_similarity',initializer=tf.contrib.layers.xavier_initializer()):
            self.context_query_output=tf.map_fn(lambda x:tf.multiply(x,self.context_output),self.question_output)
            self.context_query_output=tf.nn.softmax(self.context_query_output,axis=1)
            self.cq_encoder1,_=self._encoder_block(inputs=self.context_query_output,units_number=10,attention_length=5)
            self.cq_encoder2,_=self._encoder_block(inputs=self.cq_encoder1,units_number=10,attention_length=5)
            self.cq_encoder3,_=self._encoder_block(inputs=self.cq_encoder2,units_number=10,attention_length=5)
#             self.context_query_output=tf.nn.softmax(self.context_query_output,axis=0)
        with tf.variable_scope('start_decoder',initializer=tf.contrib.layers.xavier_initializer()):
            self.y_predict_start=tf.concat([self.cq_encoder1,self.cq_encoder2],axis=2)
            self.y_predict_start=self._add_dense_layer(inputs=self.y_predict_start,output_shape=1,drop_keep_prob=self.dropout_keep_prob,act=None)
            self.y_predict_start_softmax=tf.nn.softmax(self.y_predict_start,axis=1)
            self.y_start_loss=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.y_predict_start,labels=self.y_start,dim=1))
        with tf.variable_scope('end_decoder',initializer=tf.contrib.layers.xavier_initializer()):
            self.y_predict_end=tf.concat([self.cq_encoder1,self.cq_encoder3],axis=2)
            self.y_predict_end=self._add_dense_layer(inputs=self.y_predict_end,output_shape=1,drop_keep_prob=self.dropout_keep_prob,act=None)
            self.y_predict_end_softmax=tf.nn.softmax(self.y_predict_end,axis=1)
            self.y_end_loss=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.y_predict_end,labels=self.y_end,dim=1))
        with tf.variable_scope('train'):
            self.optimizier=tf.train.AdamOptimizer(learning_rate=learning_rate)
            self.total_loss=self.y_start_loss+self.y_end_loss
            self.train_op=self.optimizier.minimize(self.total_loss)
        self.init_op=tf.global_variables_initializer()
        self.session=tf.Session()
        self.session.run(self.init_op)

    def _add_dense_layer(self, inputs, output_shape, drop_keep_prob, act=tf.nn.tanh):
        output = tf.contrib.layers.fully_connected(activation_fn=act, num_outputs=output_shape, inputs=inputs)
        output = tf.nn.dropout(output, drop_keep_prob)
        return output
                
    def _add_RNN_attention_cell(self,units_number,activation=tf.nn.tanh,attention_length=2,use_attention=False):
        rnn=tf.contrib.rnn.GRUCell(activation=activation,num_units=units_number)
        if use_attention:
            rnn=tf.contrib.rnn.AttentionCellWrapper(rnn,attn_length=attention_length)
        rnn = tf.contrib.rnn.DropoutWrapper(rnn,
                                             input_keep_prob=self.dropout_keep_prob,
                                             output_keep_prob=self.dropout_keep_prob,
                                             state_keep_prob=self.dropout_keep_prob,
                                             )
        return rnn
    
    def _add_layerd_RNN_attention_cell(self,units_number,activation=tf.nn.tanh,attention_length=2,use_attention=False):
        rnn=[self._add_GRU(units_number=n,attention_length=attention_length) for n in units_number ]
        rnn=tf.contrib.rnn.MultiRNNCell(cells=rnn,state_is_tuple=True)
#         if use_attention:
#             rnn=tf.contrib.rnn.AttentionCellWrapper(rnn,attn_length=attention_length)
        rnn= tf.contrib.rnn.DropoutWrapper(rnn,
                                             input_keep_prob=self.dropout_keep_prob,
                                             output_keep_prob=self.dropout_keep_prob,
                                             state_keep_prob=self.dropout_keep_prob,
                                             )
        return rnn
    
    def _add_GRU(self,units_number,attention_length,activation=tf.nn.tanh,use_attention=False):
        gru=tf.contrib.rnn.GRUCell(activation=activation,num_units=units_number)
        if use_attention:
            gru=tf.contrib.rnn.AttentionCellWrapper(gru,attn_length=attention_length)
        return gru
    
    def _encoder_block(self,inputs,units_number,attention_length,activation=tf.nn.relu,use_attention=True,normalize=True):
        gru=self._add_GRU(units_number=units_number,activation=activation,use_attention=use_attention,attention_length= attention_length)
        output,state=tf.nn.dynamic_rnn(inputs=inputs,cell=gru,dtype=tf.float32)
        if normalize:
            output=tl.layers.batch_normalization(output)
        return output,state
    
    def build_feed_dict(self,context,question,y_start,y_end,drop_keep_prob=0.8):
        feed_dict={
            self.question:question,
            self.context:context,
            self.y_start:y_start,
            self.y_end:y_end,
            self.dropout_keep_prob:drop_keep_prob
        }
        return feed_dict
    
    def train(self,context,question,y_start,y_end,drop_keep_prob=0.8):
        feed_dict={
            self.question:question,
            self.context:context,
            self.y_start:y_start,
            self.y_end:y_end,
            self.dropout_keep_prob:drop_keep_prob
        }
        _,loss,y_startloss,y_endloss=self.session.run([self.train_op,self.total_loss,self.y_start_loss,self.y_end_loss],feed_dict=feed_dict)
        return loss,y_startloss,y_endloss
#     def predict(context,question):
#          feed_dict={
#             self.question:question,
#             self.context:context,
#             self.y_start:y_start,
#             self.y_end:y_end,
#             self.dropout_keep_prob:dropout_keep_prob
#         }

In [94]:
qanet=RNNQANet()

ValueError: Variable context_query_similarity/rnn/attention_cell_wrapper/kernel already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:

  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/rnn/python/ops/core_rnn_cell.py", line 99, in __init__
    initializer=kernel_initializer)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/rnn/python/ops/rnn_cell.py", line 1203, in call
    self._linear1 = _Linear([inputs, attns], input_size, True)
  File "<ipython-input-93-164b05951de7>", line 89, in _encoder_block
    output,state=tf.nn.dynamic_rnn(inputs=inputs,cell=gru,dtype=tf.float32)


In [None]:
epoch=1
batch_index=list(paragraphs.keys())
for e in range(epoch):
    np.random.shuffle(batch_index)
    batch_loss=[]
    count=0
    for bk in batch_index:
        context=np.expand_dims(paragraphs[bk][1],axis=0)
        para_questions=list(filter(lambda x:x['dp_id']==bk,processed_query))
        if len(para_questions)==0:
            continue
        question=np.array(lmap(lambda x:x['query_embedding'],para_questions))
        y_start=np.array(lmap(lambda x:x['answer_span'][0],para_questions))
        y_end=np.array(lmap(lambda x:x['answer_span'][1],para_questions))
        loss=qanet.train(context,question,y_start,y_end)
        batch_loss.append(loss)
        if count%1000==0:
            print('batch loss',np.mean(batch_loss))
            batch_loss=[]
        count+=1
#     for d_i,question,start,end,_ in known_questions:
#         context=doc_embeddings[d_i][1]
#         loss=qanet.train(context,question,start.reshape((start.shape[0],1)),end.reshape((end.shape[0],1)))

batch loss 7.218927
batch loss 6.508499
batch loss 9.934282
batch loss 33230.87


KeyboardInterrupt: 

In [579]:
query_embeddings[0][2][0]

0.0

In [580]:
query_embeddings[0][3][0]

0.0

In [583]:
len(list(filter(lambda x:x[2][0]!=1 and x[3][0]!=1,query_embeddings.values())))/len(query_embeddings)

0.9353373752276447

In [588]:
unknow_questions=list(filter(lambda x:x[3][0]==1 and x[3][0]==1,query_embeddings.values()))

In [589]:
unknow_questions[0]

(0, 10, array([[ 1.55454040e+00,  3.20265794e+00,  7.38620102e-01,
          5.14726043e-01,  3.70297395e-02, -1.31857932e+00,
         -2.70212245e+00, -8.23388994e-01, -5.47107719e-02,
         -6.97586536e-01,  3.15777600e-01,  1.21209550e+00,
         -7.64047086e-01, -2.48067474e+00,  1.15331352e+00,
         -1.68379140e+00, -8.56794894e-01,  6.74513802e-02,
         -1.52620971e-01,  2.14040041e+00,  1.36375082e+00,
          1.50497705e-01, -8.61967921e-01, -3.85789752e+00,
          4.05764580e-01, -3.59003216e-01, -1.42243373e+00,
          1.70370197e+00,  1.64122891e+00, -9.52514470e-01,
         -3.78340268e+00,  1.85256565e+00,  1.93335228e-02,
         -1.61799312e+00,  1.15964508e+00, -1.02517354e+00,
          2.72995770e-01,  1.23844337e+00, -5.92374206e-01,
         -7.27590859e-01,  9.59439799e-02,  1.94971573e+00,
          1.20297706e+00, -3.28392911e+00, -3.05100501e-01,
          1.26752877e+00, -9.06824052e-01, -8.26224267e-01,
          1.26848447e+00, -5.5687

In [590]:
query[10]

{'docid': 0,
 'question': 'what is frequency also known as in science?',
 'text': 'f'}

In [592]:
documents[0]

"first recognized in 1900 by max planck, it was originally the proportionality constant between the minimal increment of energy, e, of a hypothetical electrically charged oscillator in a cavity that contained black body radiation, and the frequency, f, of its associated electromagnetic wave. in 1905 the value e, the minimal energy increment of a hypothetical oscillator, was theoretically associated by einstein with a quantum or minimal element of the energy of the electromagnetic wave itself. the light quantum behaved in some respects as an electrically neutral particle, as opposed to an electromagnetic wave. it was eventually called the photon. classical statistical mechanics requires the existence of h (but does not define its value). eventually, following upon planck's discovery, it was recognized that physical action cannot take on an arbitrary value. instead, it must be some multiple of a very small quantity, the quantum of action, now called the planck constant. classical physics