In [None]:
from distutils.version import LooseVersion
import tensorflow as tf
from tensorflow.python.layers.core import Dense
import os 
import json
import pandas as pd
import nltk
import numpy as np
from sklearn.cross_validation import train_test_split

# Check TensorFlow Version
# The cells below use tf.contrib.seq2seq, so stop right away on anything older
# than TensorFlow 1.1 and print the version that is actually installed.
assert LooseVersion(tf.__version__) >= LooseVersion('1.1'), 'Please use TensorFlow version 1.1 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))

In [None]:
# Both input files are looked up directly inside the current working directory.
file_path = "".join((os.getcwd(), "/idebate.json"))
Glove_path = "".join((os.getcwd(), "/glove.6B.50d.txt"))
def load_data(file_path):
    """Load the debate corpus and return it as a text/summary DataFrame.

    The file must contain a JSON array of records. Two keys of every record
    are read: ``"_argument_sentences"`` (iterated and then indexed with the
    yielded keys, i.e. used like a dict whose values are sentence strings)
    and ``"_claim"`` (the summary).

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A pandas DataFrame with one row per record and the columns
        ``"text"`` (the lower-cased sentences joined by single spaces) and
        ``"summary"`` (the claim, unchanged).
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, encoding="utf-8") as file:
        json_data = json.load(file)

    records = []
    for entry in json_data:
        sentences = entry["_argument_sentences"]
        # Join with a space: the text is later tokenised with str.split(), and
        # plain concatenation would fuse the last word of one sentence with
        # the first word of the next.
        # TODO: stop words and punctuation could probably be dropped here too.
        text = " ".join(sentences[key].lower() for key in sentences)
        records.append({"text": text, "summary": entry["_claim"]})

    # Build the frame in one go; appending row by row copies the whole frame
    # on every iteration and DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(records, columns=["text", "summary"])
##########################################
# Read the corpus once; the "text" and "summary" columns of this frame are
# tokenised into integer ids in a later cell.
dataframe=load_data(file_path)

In [None]:
def loadGloveModel(gloveFile):
    """Read whitespace-separated word embeddings into a dictionary.

    Every non-empty line is split on whitespace: the first field is the
    token, all remaining fields are parsed as floats.

    Args:
        gloveFile: Path of the embeddings text file.

    Returns:
        dict mapping each token (str) to a 1-D numpy array of floats. When a
        token appears on several lines, the last line wins.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    model = {}
    # The context manager closes the file even when a line fails to parse.
    # The encoding is explicit so the result does not depend on the platform
    # default (assumes the file is UTF-8 text - confirm for custom files).
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line (for example a trailing newline): nothing to read.
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    print ("Done.",len(model)," words loaded!")
    return model
##################################################
# Load the pre-trained vectors; word_vec maps every token to its embedding array.
word_vec=loadGloveModel(Glove_path)

In [None]:
# Constant 50-dimensional vectors for the three special tokens.
PAD_vector = [0] * 50
GO_vector = [1] * 50
EOS_vector = [2] * 50

# Vocabulary layout: <PAD> sits at index 0, the loaded words follow in the
# iteration order of word_vec, and <GO> / <EOS> take the last two slots.
words_list = ["<PAD>"]
vectors_list = [PAD_vector]
for word, vector in word_vec.items():
    words_list.append(word)
    vectors_list.append(vector)
words_list.extend(["<GO>", "<EOS>"])
vectors_list.extend([GO_vector, EOS_vector])

# Row k of this matrix is the embedding of words_list[k].
vectors_array = np.array([np.array(row) for row in vectors_list])

In [None]:
def build_projection(word_list):
    """Create the index <-> word lookup tables for a vocabulary list.

    Args:
        word_list: Sequence of vocabulary entries; the position of an entry
            is its integer id.

    Returns:
        A pair ``(int_to_vocab, vocab_to_int)`` of dictionaries. If an entry
        occurs more than once, ``vocab_to_int`` keeps its highest position.
    """
    int_to_vocab = dict(enumerate(word_list))
    vocab_to_int = {entry: position for position, entry in enumerate(word_list)}
    return int_to_vocab, vocab_to_int

In [None]:
# Source and target use the same vocabulary: both pairs of lookup tables are
# built from the one words_list. Despite "letter" in the names, the entries of
# words_list are whole tokens, not characters.
source_int_to_letter, source_letter_to_int=build_projection(words_list)
target_int_to_letter, target_letter_to_int = build_projection(words_list)

In [None]:
# Turn every article and every summary into a list of vocabulary ids. Tokens
# are whitespace-separated; a token that is not in the vocabulary is replaced
# by the id of the "unk" entry.
source_int = [[source_letter_to_int.get(word, source_letter_to_int["unk"])
               for word in line.split()] for line in list(dataframe["text"])]
# load_data lower-cases the article text but leaves the claim as it is, so the
# summaries are lower-cased here to get the same treatment. Presumably the
# glove.6B vocabulary is lower-case only (confirm), in which case every cased
# summary word would otherwise be mapped to "unk".
# Each summary is terminated with the <EOS> id.
target_int = [[target_letter_to_int.get(word, target_letter_to_int['unk'])
               for word in line.lower().split()] + [target_letter_to_int['<EOS>']]
              for line in list(dataframe["summary"])]

In [None]:
def get_encoder_layer(input_data,
                      rnn_size,
                      num_layers,
                      source_sequence_length,
                      source_vocab_size,
                      encoding_embedding_size):
    """Build a stacked bidirectional LSTM encoder and return its final state.

    The token ids are looked up in the module-level ``Vectors_Array``
    embedding variable, which must exist before this function is called.

    Args:
        input_data: Integer tensor of source token ids.
        rnn_size: Total width of one encoder layer. Each direction gets
            ``rnn_size // 2`` units, so the concatenated state is
            ``2 * (rnn_size // 2)`` wide.
        num_layers: Number of stacked bidirectional layers (at least 1).
        source_sequence_length: Integer vector with the unpadded length of
            every source sequence in the batch.
        source_vocab_size: Unused; kept so existing callers keep working.
        encoding_embedding_size: Unused; kept so existing callers keep working.

    Returns:
        A tuple of ``num_layers`` LSTMStateTuple objects. Every entry is the
        same object: the final forward and backward states of the last
        encoder layer, concatenated along the feature axis.

    Raises:
        ValueError: if ``num_layers`` is smaller than 1.
    """
    if num_layers < 1:
        raise ValueError("num_layers must be at least 1, got {}".format(num_layers))

    # Encoder embedding: fixed pre-trained vectors instead of a learned table.
    encoder_embed_input = tf.nn.embedding_lookup(Vectors_Array, input_data)
    encoder_out = tf.cast(encoder_embed_input, tf.float32)

    # RNN cell
    def get_lstm_cell(rnn_size):
        return tf.contrib.rnn.LSTMCell(
            rnn_size,
            initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))

    for n in range(num_layers):
        # Every layer needs its own variable scope; with the default scope
        # the second layer would try to re-create the first layer's weights.
        (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=get_lstm_cell(rnn_size // 2),
            cell_bw=get_lstm_cell(rnn_size // 2),
            inputs=encoder_out,
            sequence_length=source_sequence_length,
            dtype=tf.float32,
            scope='bidirectional_rnn_' + str(n))
        # The concatenated outputs of this layer feed the next one.
        encoder_out = tf.concat((out_fw, out_bw), 2)

    # Only the states of the last layer survive the loop; they are merged
    # into one state and repeated once per decoder layer.
    bi_state_c = tf.concat((state_fw.c, state_bw.c), -1)
    bi_state_h = tf.concat((state_fw.h, state_bw.h), -1)
    bi_lstm_state = tf.nn.rnn_cell.LSTMStateTuple(c=bi_state_c, h=bi_state_h)
    encoder_state = tuple([bi_lstm_state] * num_layers)

    return encoder_state

In [None]:
def process_decoder_input(data, vocab_to_int, batch_size):
    """Shift the target batch right by one step to obtain the decoder input.

    Args:
        data: 2-D tensor of target token ids.
        vocab_to_int: Mapping that contains the id of the '<GO>' token.
        batch_size: Number of rows in ``data``.

    Returns:
        Tensor in which every row starts with the '<GO>' id followed by the
        original row without its last element.
    """
    # Drop the last column of every row.
    without_last = tf.strided_slice(data, [0, 0], [batch_size, -1], [1, 1])
    # One '<GO>' id per row, as a column of shape [batch_size, 1].
    go_column = tf.fill([batch_size, 1], vocab_to_int['<GO>'])
    return tf.concat([go_column, without_last], 1)

In [None]:
def decoding_layer(target_letter_to_int,
                   decoding_embedding_size,
                   num_layers,
                   Rnn_size,
                   target_sequence_length,
                   max_target_sequence_length,
                   encoder_state,
                   decoder_input):
    """Build the training decoder and the greedy inference decoder.

    Both decoders share the same LSTM stack and output projection through
    the "decode" variable scope. Embeddings come from the module-level
    ``Vectors_Array`` variable, which must exist before this is called.

    Args:
        target_letter_to_int: Mapping from token to id; must contain
            '<GO>' and '<EOS>'. Its length is the size of the output layer.
        decoding_embedding_size: Unused; kept so existing callers keep working.
        num_layers: Number of stacked decoder LSTM cells.
        Rnn_size: Number of units of every decoder LSTM cell.
        target_sequence_length: Integer vector with the length of every
            target sequence in the batch.
        max_target_sequence_length: Upper bound on the number of decoding steps.
        encoder_state: Initial state for the decoder stack.
        decoder_input: Integer tensor of decoder input ids (targets shifted
            right with a leading '<GO>').

    Returns:
        ``(training_decoder_output, predicting_decoder_output)``: the first
        value returned by ``dynamic_decode`` for the teacher-forced decoder
        and for the greedy decoder respectively.
    """
    # 1. Embedding
    target_vocab_size = len(target_letter_to_int)
    decoder_embed_input = tf.nn.embedding_lookup(Vectors_Array, decoder_input)

    # 2. RNN cells of the decoder
    def get_decoder_cell(rnn_size):
        return tf.contrib.rnn.LSTMCell(
            rnn_size,
            initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
    cell = tf.contrib.rnn.MultiRNNCell([get_decoder_cell(Rnn_size) for _ in range(num_layers)])

    # 3. Fully connected output layer projecting onto the vocabulary
    output_layer = Dense(target_vocab_size,
                         kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))

    # 4. Training decoder
    with tf.variable_scope("decode"):
        # The helper feeds the ground-truth tokens at every step.
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_embed_input,
                                                            sequence_length=target_sequence_length,
                                                            time_major=False)
        training_decoder = tf.contrib.seq2seq.BasicDecoder(cell,
                                                           training_helper,
                                                           encoder_state,
                                                           output_layer)
        training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
            training_decoder,
            impute_finished=True,
            maximum_iterations=max_target_sequence_length)

    # 5. Predicting decoder, sharing its parameters with the training decoder
    with tf.variable_scope("decode", reuse=True):
        # One '<GO>' id per batch row. The row count is read from the input
        # tensor itself instead of a module-level batch_size, so the function
        # does not depend on a global defined in another cell.
        dynamic_batch_size = tf.shape(decoder_input)[0]
        start_tokens = tf.tile(tf.constant([target_letter_to_int['<GO>']], dtype=tf.int32),
                               [dynamic_batch_size],
                               name='start_tokens')
        # The helper feeds back the embedding of the most likely token and
        # stops a sequence once '<EOS>' is produced.
        predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(Vectors_Array,
                                                                     start_tokens,
                                                                     target_letter_to_int['<EOS>'])
        predicting_decoder = tf.contrib.seq2seq.BasicDecoder(cell,
                                                             predicting_helper,
                                                             encoder_state,
                                                             output_layer)
        predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
            predicting_decoder,
            impute_finished=True,
            maximum_iterations=max_target_sequence_length)

    return training_decoder_output, predicting_decoder_output

In [None]:
def seq2seq_model(input_data, targets, target_sequence_length,
                  max_target_sequence_length, source_sequence_length,
                  source_vocab_size, target_vocab_size,
                  encoder_embedding_size, decoder_embedding_size,
                  rnn_size,Rnn_size, num_layers):
    """Wire the encoder and the decoder together.

    Besides its arguments, this function reads two module-level names:
    ``target_letter_to_int`` (the target vocabulary) and ``batch_size``
    (used to shift the targets into decoder inputs).

    Args:
        input_data: Integer tensor of source token ids.
        targets: Integer tensor of target token ids.
        target_sequence_length: Lengths of the target sequences.
        max_target_sequence_length: Upper bound on the decoding steps.
        source_sequence_length: Lengths of the source sequences.
        source_vocab_size: Forwarded to the encoder.
        target_vocab_size: Unused; kept so existing callers keep working.
        encoder_embedding_size: Forwarded to the encoder.
        decoder_embedding_size: Forwarded to the decoder.
        rnn_size: Width of the encoder layers.
        Rnn_size: Width of the decoder cells.
        num_layers: Number of layers of both encoder and decoder.

    Returns:
        ``(training_decoder_output, predicting_decoder_output)`` as returned
        by ``decoding_layer``.
    """
    # Final state of the encoder. The embedding size comes from the argument;
    # previously the parameter was ignored in favour of a global of a
    # similar name.
    encoder_state = get_encoder_layer(input_data,
                                      rnn_size,
                                      num_layers,
                                      source_sequence_length,
                                      source_vocab_size,
                                      encoder_embedding_size)

    # Decoder input: the targets shifted right, starting with '<GO>'.
    decoder_input = process_decoder_input(targets, target_letter_to_int, batch_size)

    # Hand the encoder state and the prepared input to the decoder.
    training_decoder_output, predicting_decoder_output = decoding_layer(target_letter_to_int,
                                                                        decoder_embedding_size,
                                                                        num_layers,
                                                                        Rnn_size,
                                                                        target_sequence_length,
                                                                        max_target_sequence_length,
                                                                        encoder_state,
                                                                        decoder_input)

    return training_decoder_output, predicting_decoder_output
    

In [None]:
# Hyperparameters
# Number of Epochs
epochs = 2
# Batch Size
# Also read as a module-level global inside seq2seq_model.
batch_size = 30
# RNN Size
# rnn_size is handed to the encoder, Rnn_size to the decoder; the encoder
# state is passed to the decoder as its initial state.
rnn_size = 500
Rnn_size=500
# Number of Layers
num_layers = 3
# Embedding Size
encoding_embedding_size = 50
decoding_embedding_size = 50
# Learning Rate
learning_rate = 0.001

In [None]:
train_graph = tf.Graph()

with train_graph.as_default():

    # Frozen (non-trainable) embedding matrix. It starts as zeros and receives
    # its real values when `embedding_init` is run with the numpy matrix fed
    # through `embedding_placeholder`. The shape is taken from `vectors_array`
    # (the matrix that is fed) rather than hard-coded, so the graph stays
    # valid when a different embedding file is loaded.
    embedding_shape = list(vectors_array.shape)
    Vectors_Array = tf.Variable(tf.constant(0.0, shape=embedding_shape), trainable=False, name="vectors_array")
    embedding_placeholder = tf.placeholder(tf.float32, embedding_shape)
    embedding_init = Vectors_Array.assign(embedding_placeholder)

    # Model inputs
    input_data = tf.placeholder(tf.int32, [None, None], name='inputs')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    target_sequence_length = tf.placeholder(tf.int32, (None,), name='target_sequence_length')
    max_target_sequence_length = tf.reduce_max(target_sequence_length, name='max_target_len')
    source_sequence_length = tf.placeholder(tf.int32, (None,), name='source_sequence_length')

    training_decoder_output, predicting_decoder_output = seq2seq_model(input_data,
                                                                       targets,
                                                                       target_sequence_length,
                                                                       max_target_sequence_length,
                                                                       source_sequence_length,
                                                                       len(source_letter_to_int),
                                                                       len(target_letter_to_int),
                                                                       encoding_embedding_size,
                                                                       decoding_embedding_size,
                                                                       rnn_size,
                                                                       Rnn_size,
                                                                       num_layers)

    # Named handles so the tensors can be fetched by name from the graph.
    training_logits = tf.identity(training_decoder_output.rnn_output, 'logits')
    predicting_logits = tf.identity(predicting_decoder_output.sample_id, name='predictions')

    # 1.0 for positions inside a target sequence, 0.0 for padding, so padded
    # positions do not contribute to the loss.
    masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):

        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(lr)

        # Gradient Clipping: every gradient element is limited to [-5, 5];
        # variables without a gradient are skipped.
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

In [None]:
def pad_sentence_batch(sentence_batch, pad_int):
    """Right-pad every sentence of a batch to the length of the longest one.

    Args:
        sentence_batch: Non-empty list of lists of token ids.
        pad_int: Id appended as padding.

    Returns:
        A new list of lists, all of the same length; the inputs are not modified.
    """
    longest = max(map(len, sentence_batch))
    padded_batch = []
    for tokens in sentence_batch:
        missing = longest - len(tokens)
        padded_batch.append(tokens + [pad_int] * missing)
    return padded_batch

In [None]:
def get_batches(targets, sources, batch_size, source_pad_int, target_pad_int):
    '''
    Generator that yields padded mini-batches.

    Only full batches are produced; trailing examples that do not fill a
    whole batch are skipped. Each item is the tuple
    (padded targets array, padded sources array, target lengths, source lengths),
    where the lengths are the unpadded lengths as plain lists.
    '''
    full_batches = len(sources) // batch_size
    for start_i in range(0, full_batches * batch_size, batch_size):
        stop_i = start_i + batch_size
        sources_batch = sources[start_i:stop_i]
        targets_batch = targets[start_i:stop_i]

        # Pad every sequence to the longest one of its batch.
        pad_sources_batch = np.array(pad_sentence_batch(sources_batch, source_pad_int))
        pad_targets_batch = np.array(pad_sentence_batch(targets_batch, target_pad_int))

        # Remember the true length of every example.
        targets_lengths = [len(target) for target in targets_batch]
        source_lengths = [len(source) for source in sources_batch]

        yield pad_targets_batch, pad_sources_batch, targets_lengths, source_lengths

In [None]:
# Split the data set into a training part and a validation part.
train_source = source_int[batch_size:]
train_target = target_int[batch_size:]
# The first batch_size examples are held out as one validation batch.
valid_source = source_int[:batch_size]
valid_target = target_int[:batch_size]
(valid_targets_batch, valid_sources_batch, valid_targets_lengths, valid_sources_lengths) = next(get_batches(valid_target, valid_source, batch_size,
                           source_letter_to_int['<PAD>'],
                           target_letter_to_int['<PAD>']))

display_step = 1 # report the losses every `display_step` batches

checkpoint = "trained_model.ckpt" 
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    # Copy the pre-trained vectors into the embedding variable; this must run
    # after the initializer, which resets the variable to zeros.
    sess.run(embedding_init, feed_dict={embedding_placeholder: vectors_array})
        
    for epoch_i in range(1, epochs+1):
        for batch_i, (targets_batch, sources_batch, targets_lengths, sources_lengths) in enumerate(
                get_batches(train_target, train_source, batch_size,
                           source_letter_to_int['<PAD>'],
                           target_letter_to_int['<PAD>'])):
            
            # One optimisation step on the current training batch.
            _, loss = sess.run(
                [train_op, cost],
                {input_data: sources_batch,
                 targets: targets_batch,
                 lr: learning_rate,
                 target_sequence_length: targets_lengths,
                 source_sequence_length: sources_lengths})

            if batch_i % display_step == 0:
                
                # Validation loss: only `cost` is fetched, so no weights are updated.
                validation_loss = sess.run(
                [cost],
                {input_data: valid_sources_batch,
                 targets: valid_targets_batch,
                 lr: learning_rate,
                 target_sequence_length: valid_targets_lengths,
                 source_sequence_length: valid_sources_lengths})
                
                print('Epoch {:>3}/{} Batch {:>4}/{} - Training Loss: {:>6.3f}  - Validation loss: {:>6.3f}'
                      .format(epoch_i,
                              epochs, 
                              batch_i, 
                              len(train_source) // batch_size, 
                              loss, 
                              validation_loss[0]))

    # Persist the trained variables of the session's graph.
    saver = tf.train.Saver()
    saver.save(sess, checkpoint)
    print('Model Trained and Saved')