In [1]:
import numpy as np
import tensorflow as tf
from nltk.tokenize import word_tokenize
from tensorflow.python.layers.core import Dense
import input_utils
import os
from tensorflow.python.tools import inspect_checkpoint as chkp
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data_d=os.getcwd()+'/model_data/'

# Word Embedding

In [3]:
FILE_TO_EMBEDDINGS = '../embeddings.5k.txt'
emb_dict, _ = input_utils.load_embeddings(FILE_TO_EMBEDDINGS)

Transform the embeddings as numpy array, only keep regular words

In [4]:
emb_dict.update((x, np.array(list(y)).astype(np.float32)) for x, y in emb_dict.items())
large_tag_list=['<loc>','<org>','<per>','<num>','<unk>','<s>','</s>']
for tag_key in large_tag_list:
    emb_dict.pop(tag_key)

Look up table between words an indices

In [5]:
word2ind={word: idx for idx, word in enumerate(emb_dict.keys())}
ind2word={idx: word for idx, word in enumerate(emb_dict.keys())}

Construct embedding matrix for reguar words and special tags

In [6]:
emb_mat=np.zeros((len(emb_dict.keys()),57))
for ind in ind2word.keys():
    word=ind2word[ind]
    emb_mat[ind,:]=emb_dict[word]
emb_mat=emb_mat[:,:50]
#for special tags
emb_mat_tag=np.reshape(np.random.normal(scale=1.0,size=7*50),(7,50))
i=0
for tag_key in large_tag_list:
    emb_dict[tag_key]=emb_mat_tag[i,:]
    i+=1
#put special tags back in the dictionary    
word2ind={word: idx for idx, word in enumerate(emb_dict.keys())}
ind2word={idx: word for idx, word in enumerate(emb_dict.keys())}

Embedding dimension is 50, dictionary size is 5000

In [7]:
emb_mat.shape

(4993, 50)

In [8]:
emb_mat_tag.shape

(7, 50)

Construct indices for the text file, which is the input of the seq2seq model

In [9]:
'''
with open('../data/train_input.txt', 'r') as f:
    train_in_data = f.read()
with open('../data/train_output.txt', 'r') as f:
    train_out_data = f.read()
with open('../data/train200000_input.txt', 'r') as f:
    train_in_data = f.read()
with open('../data/train200000_output.txt', 'r') as f:
    train_out_data = f.read()
'''
with open('../data/test_input.txt', 'r') as f:
    test_in_data = f.read()
with open('../data/test_output.txt', 'r') as f:
    test_out_data = f.read()


## Ignore the following before the loading step, except for the first run

Get the indexed text file

In [10]:
def paragraph2ind(source_data,word2ind,ind2word):
    #input: text file
    #output: indexed text with padding, max lenghth; length of each sentence
    #length counts END token, don't count START token
    sentence_len=[]
    text_ind=[]
    #handle ' \n'
    articles=source_data.split('</s>')
    
    del articles[-1]
    for i in range(1,len(articles)):
        articles[i]=articles[i][2:]
    #get index
    for a in articles:
        article_idx=[]
        sentence_len.append(len(a))
        for words in a.split():
            article_idx.append(word2ind[words])
        article_idx.append(word2ind['</s>'])
        text_ind.append(article_idx)
        
        
    
    max_len=max(sentence_len)
    min_len=min(sentence_len)
        
    #padding
    padded=np.zeros((len(text_ind),max_len))
    i=0
    for article_id in text_ind:
        padded[i,:]=np.array(list(article_id + [word2ind['</s>']] *(max_len-len(article_id))))
        i+=1
    
    return padded,max_len,min_len, np.array(sentence_len)

load training data

In [None]:
for i in range(2):
    file_ind=str(int((i+20)*10000))
    with open('../data/train'+file_ind+'_input.txt', 'r') as f:
        train_in_data = f.read()
    with open('../data/train'+file_ind+'_output.txt', 'r') as f:
        train_out_data = f.read()
    #input in training set
    [train_in_ind_i,max_in_len,min_in_len,train_in_len_i]=paragraph2ind(train_in_data,word2ind,ind2word)
    #cut off
    train_in_ind_i=train_in_ind_i[:,1:201] #no start token for input
    train_in_ind_i[:,-1]=word2ind['</s>']
    max_in_len=200
    train_in_len_i[train_in_len_i>200]=200
    
    [train_out_ind_i,max_out_len,min_out_len,train_out_len_i]=paragraph2ind(train_out_data,word2ind,ind2word)
    #cut off
    train_out_ind_i=train_out_ind_i[:,0:21]
    train_out_ind_i[:,-1]=word2ind['</s>']
    max_out_len=20
    train_out_len_i[train_out_len_i>20]=20
    
    
    if i==0:
        train_in_ind=train_in_ind_i
        train_in_len=train_in_len_i
        train_out_ind=train_out_ind_i
        train_out_len=train_out_len_i
        
    else:
        train_in_ind=np.concatenate((train_in_ind,train_in_ind_i),axis=0)
        train_in_len=np.concatenate((train_in_len,train_in_len_i),axis=0)
        train_out_ind=np.concatenate((train_out_ind,train_out_ind_i),axis=0)
        train_out_len=np.concatenate((train_out_len,train_out_len_i),axis=0)

In [None]:
#[train_in_ind,max_in_len,min_in_len,train_in_len]=paragraph2ind(train_in_data,word2ind,ind2word)

In [None]:
#[max_in_len,min_in_len]

Cutoff input len to 200

In [None]:
#train_in_ind=train_in_ind[:,1:201] #no start token for input
#train_in_ind[:,-1]=word2ind['</s>']
#max_in_len=200
#train_in_len[train_in_len>200]=200

In [None]:
#[train_out_ind,max_out_len,min_out_len,train_out_len]=paragraph2ind(train_out_data,word2ind,ind2word)

In [None]:
#[max_out_len,min_out_len]

Cutoff output len to 20

In [None]:
#train_out_ind=train_out_ind[:,0:21]
#train_out_ind[:,-1]=word2ind['</s>']
#max_out_len=20
#train_out_len[train_out_len>20]=20

Test file

In [None]:
[test_in_ind,max_test_in_len,min_test_in_len,test_in_len]=paragraph2ind(test_in_data,word2ind,ind2word)

In [None]:
test_in_ind=test_in_ind[:,1:201] #no start token for input
test_in_ind[:,-1]=word2ind['</s>']
max_test_in_len=200
test_in_len[test_in_len>200]=200

In [None]:
[test_out_ind,_,_,test_out_len]=paragraph2ind(test_out_data,word2ind,ind2word)

In [None]:
test_out_ind=test_out_ind[:,0:21]
test_out_ind[:,-1]=word2ind['</s>']

In [None]:
np.save(data_d+'train_in_ind.npy',train_in_ind)
np.save(data_d+'train_in_len.npy',train_in_len)
np.save(data_d+'train_out_ind.npy',train_out_ind)
np.save(data_d+'train_out_len.npy',train_out_len)
np.save(data_d+'test_in_ind.npy',test_in_ind)
np.save(data_d+'test_in_len.npy',test_in_len)
np.save(data_d+'test_out_ind.npy',test_out_ind)

## Load data

In [9]:
train_in_ind=np.load(data_d+'train_in_ind.npy')
train_in_len=np.load(data_d+'train_in_len.npy')
train_out_ind=np.load(data_d+'train_out_ind.npy')
train_out_len=np.load(data_d+'train_out_len.npy')
test_in_ind=np.load(data_d+'test_in_ind.npy')
test_in_len=np.load(data_d+'test_in_len.npy')
test_out_ind=np.load(data_d+'test_out_ind.npy')

In [10]:
train_in_len.shape

(14000,)

In [11]:
max_in_len=200
max_out_len=20
max_test_in_len=200

# Training

In [12]:
start_token=word2ind['<s>']
end_token=word2ind['</s>']

In [13]:
vocab_size=emb_mat.shape[0]+emb_mat_tag.shape[0]
embedding_dim=emb_mat.shape[1]
train_size=train_in_ind.shape[0]
batch_size=400

In [14]:
embedding_tf_reg=tf.Variable(emb_mat.astype(np.float32),name='embedding_tf_reg',trainable=True)
embedding_tf_tag=tf.Variable(emb_mat_tag.astype(np.float32),name='embedding_tf_tag')
embedding_all=tf.concat([embedding_tf_reg,embedding_tf_tag],0)

In [15]:
#embedding_all=tf.Variable(emb_mat.astype(np.float32), name='embedding_all')

Ids of input sequence. The first dimension is batch size.

In [16]:
X=tf.placeholder(tf.int32,[None,max_in_len],name='X')
X_emb=tf.nn.embedding_lookup(embedding_all, X)

Record the length of each sentence

In [17]:
seq_len=tf.placeholder(tf.int32,[None],name='seq_len')

Define encoder

In [18]:
forward_cell=tf.contrib.rnn.GRUCell(num_units=80)
backward_cell=tf.contrib.rnn.GRUCell(num_units=80)
[bi_outputs_tr,encoder_states_tr]=tf.nn.bidirectional_dynamic_rnn(
    forward_cell,backward_cell,X_emb,dtype=tf.float32,sequence_length=seq_len,time_major=False)
encoder_outputs_tr=tf.concat(bi_outputs_tr,-1)
#encoder_outputs_tr=tf.transpose(encoder_outputs_tr0, [1, 0, 2])

Decoder helper

In [19]:
decoder_out=tf.placeholder(tf.int32,[None,max_out_len],name='decoder_out')
decoder_inp=tf.placeholder(tf.int32,[None,max_out_len],name='decoder_inp')
decoder_emb_inp=tf.nn.embedding_lookup(embedding_all, decoder_inp,name='decoder_emb_inp')
decoder_lengths=tf.placeholder(tf.int32,[None],name='decoder_lengths')
train_helper = tf.contrib.seq2seq.TrainingHelper(decoder_emb_inp, decoder_lengths)

Define target weights for loss function

In [20]:
target_weight=np.zeros((train_size,max_out_len))
for i in range(train_size):
    target_weight[i,:train_out_len[i]]=1
target_weight.astype(np.float32)
target_weight_tf=tf.placeholder(tf.float32,[None,max_out_len],name='target_weight_batch')

Decoding network

In [21]:

def decoder(helper, scope, ouputs_enc,states_enc,batch_decoder,seq_len_decoder,reuse=None,max_iter=None):
    with tf.variable_scope(scope, reuse=reuse):
        attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
            num_units=80, memory=ouputs_enc,
            memory_sequence_length=seq_len_decoder)
        decoder_cell = tf.contrib.rnn.GRUCell(num_units=80)
        attn_cell = tf.contrib.seq2seq.AttentionWrapper(
            decoder_cell, attention_mechanism, attention_layer_size=40,alignment_history=True)
        init_st=attn_cell.zero_state(dtype=tf.float32,batch_size=batch_decoder).clone(cell_state=states_enc)
        #init_st=states_enc
        projection_layer = Dense(units=vocab_size,use_bias=False)
        decoder = tf.contrib.seq2seq.BasicDecoder(
            attn_cell, helper, init_st, output_layer=projection_layer)
        outputs, states ,_= tf.contrib.seq2seq.dynamic_decode(
            decoder,impute_finished=True,maximum_iterations=max_iter)
        return outputs,states


Decoder output

In [22]:
#train_outputs, _ ,_= tf.contrib.seq2seq.dynamic_decode(decoder,impute_finished=True)
train_outputs,train_states=decoder(train_helper,'decode',encoder_outputs_tr,encoder_states_tr[1],batch_size,seq_len)
logits = train_outputs.rnn_output
Y_train=train_outputs.sample_id

In [23]:
crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=decoder_out, logits=logits)

Define Training op

In [24]:
saver = tf.train.Saver()
learning_rate=0.005
max_gradient_norm=3
loss = tf.reduce_sum(crossent*target_weight_tf)#target_weight
params = tf.trainable_variables()
gradients = tf.gradients(loss, params)
clipped_gradients, _ = tf.clip_by_global_norm(
    gradients, max_gradient_norm)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
update_step = optimizer.apply_gradients(
    zip(clipped_gradients, params))
#training_op = optimizer.minimize(loss)

# Inference

Define inference helper

In [25]:
infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding_all, tf.fill([100], start_token), end_token)

Testing example, which is the first sentence in the training set.

In [26]:
X_test=tf.placeholder(tf.int32,[100,max_in_len],name='X_test')
x_test=test_in_ind[:100,:]
seq_len_test=tf.placeholder(tf.int32,[100],name='seq_len_test')
seq_len_test_in=test_in_len[:100]

x_train0=train_in_ind[:100,:]
seq_len_train0_in=train_in_len[:100]

In [27]:
X_emb_test=tf.nn.embedding_lookup(embedding_all, X_test)

Inference network

In [28]:
#[encoder_outputs_inf,encoder_states_inf]=tf.nn.dynamic_rnn(
#    gru_cell,X_emb_test,dtype=tf.float32,sequence_length=seq_len_test)
[bi_outputs_inf,encoder_states_inf]=tf.nn.bidirectional_dynamic_rnn(
    forward_cell,backward_cell,X_emb_test,dtype=tf.float32,sequence_length=seq_len_test)
encoder_outputs_inf=tf.concat(bi_outputs_inf,-1)
batch_test=X_test.get_shape()[0].value
outputs_inf,states_inf=decoder(
    infer_helper,'decode',encoder_outputs_inf, encoder_states_inf[1],batch_test,seq_len_test,reuse=True, max_iter=15)

Output id

In [29]:
Y_test= outputs_inf.sample_id

In [30]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver.restore(sess, data_d+"model.ckpt")
    
    #random shuffle
    sample_size=train_in_ind.shape[0]
    rand_index=np.random.shuffle(np.arange(sample_size))
    train_in_ind_rand=train_in_ind[rand_index,:]
    train_out_ind_rand=train_out_ind[rand_index,:]
    train_in_len_rand=train_in_len[rand_index]
    train_out_len_rand=train_out_len[rand_index]
    target_weight_rand=target_weight[rand_index,:]
    
    '''
    for i in range(31):
        loss_value=0
        for j in range(int(train_size/batch_size)):
            x_in_batch=train_in_ind[j*batch_size:j*batch_size+batch_size]
            x_out_batch=train_out_ind[j*batch_size:j*batch_size+batch_size]
            x_in_length_batch=train_in_len[j*batch_size:j*batch_size+batch_size]
            x_out_length_batch=train_out_len[j*batch_size:j*batch_size+batch_size]
            target_weight_batch=target_weight[j*batch_size:j*batch_size+batch_size,:]
            sess.run(update_step,feed_dict={X:x_in_batch, seq_len:x_in_length_batch,\
                                            decoder_inp:x_out_batch[:,:-1], target_weight_tf:target_weight_batch, \
                                            decoder_lengths:x_out_length_batch,decoder_out:x_out_batch[:,1:]})
            loss_value+=loss.eval(feed_dict={X:x_in_batch, seq_len:x_in_length_batch,\
                                             decoder_inp:x_out_batch[:,:-1], target_weight_tf:target_weight_batch, \
                                             decoder_lengths:x_out_length_batch,decoder_out:x_out_batch[:,1:]})
        if np.mod(i,5)==0:
            print(i,loss_value)
    '''
    # get attention matrix
    attention_images_test = sess.run(states_inf.alignment_history.stack(), feed_dict={X_test:x_test,seq_len_test:seq_len_test_in})
    attention_images_test = np.transpose(attention_images_test, [1, 2, 0])
    attention_images_train0 = sess.run(states_inf.alignment_history.stack(), feed_dict={X_test:x_train0,seq_len_test:seq_len_train0_in})
    attention_images_train0 =np.transpose(attention_images_train0, [1, 2, 0])
    #id of test samples
    test_output_ind=(Y_test.eval(feed_dict={X_test:x_test,seq_len_test:seq_len_test_in}))
    train0_output_ind=(Y_test.eval(feed_dict={X_test:x_train0,seq_len_test:seq_len_train0_in}))
    save_path = saver.save(sess, data_d+"model.ckpt")

INFO:tensorflow:Restoring parameters from /Users/zxo/Dropbox/Machine_Learning/course/10701/Project_10701/10701-text-summarization-project/src/model_data/model.ckpt


Read text from indices

In [31]:
def read_output(output_ind):
    out_text=[]
    for i in range(output_ind.shape[0]):
        out_article=[]
        for j in range(output_ind.shape[1]):
            out_article.append(ind2word[output_ind[i,j]])
        out_text.append(out_article)
    return out_text

In [32]:
train_out_t=read_output(train0_output_ind)

## Compute the f-score

In [57]:
f=0.0
for i in range(100):
    model_out=[]
    correct_out=[]
    for j in range(len(train_out_t[i])):
        if train_out_t[i][j]=='</s>':
            break
        model_out.append(train_out_t[i][j])
    #print(model_out)
    for j in range(len(train_out_correct[i])):
        if test_out_correct[i][j]=='</s>':
            break
        correct_out.append(train_out_correct[i][j])
    f_i=input_utils.fscore(model_out,correct_out)
    #print(f_i)
    f+=f_i
f/=100

In [58]:
f

0.39465906605339