# lstm

In [None]:
import tensorflow as tf
import numpy as np
import collections

In [None]:
# --- Data locations ---
train_path = './train-sentences.txt'  # text corpus the vocabulary is built from
word2vec_path = './GoogleNews-vectors-negative300.bin'  # pre-trained word2vec binary

# --- Network shape ---
num_layers = 2        # number of stacked RNN layers
num_units = 300       # hidden units per LSTM cell
use_peepholes = True  # whether the LSTM cells use peephole connections

# --- Dropout (values are keep probabilities) ---
input_keep_prob = 0.8   # applied to inputs
output_keep_prob = 1.0  # applied to outputs

# --- Training schedule ---
batch_size = 32    # examples per mini-batch
max_epoch = 500    # number of epochs
isTraining = True  # training-mode switch

# Read the corpus as a flat token list (duplicates are kept)

def _read_words(filename):
    """Return every whitespace-delimited token of ``filename`` in file order."""
    with tf.gfile.GFile(filename, "r") as corpus:
        text = corpus.read()
    # str.split() without arguments already treats newlines as separators.
    return text.split()

# build word_to_id
def _build_vocab(filename):
    """Map every distinct token found in ``filename`` to an integer id.

    Ids are assigned by descending frequency, ties broken by the token's
    natural sort order, so the most frequent token receives id 0.

    Args:
        filename: path of the whitespace-tokenised corpus.

    Returns:
        dict mapping token -> id, with ids covering
        ``range(number_of_distinct_tokens)``. An empty corpus yields ``{}``.
    """
    counter = collections.Counter(_read_words(filename))

    # Sort key (-count, token): most frequent first, deterministic on ties.
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    # Enumerating the pairs (instead of unpacking zip(*count_pairs)) keeps
    # this working when the corpus is empty and there is nothing to unpack.
    return {word: idx for idx, (word, _) in enumerate(count_pairs)}

# load Embedding matrix (only contains the words of our own vocabulary)
def loadEmbedding(word_to_id,word2vec_name):
    """Build an embedding matrix for ``word_to_id`` from a word2vec binary file.

    The file is parsed as: one header line ``b"<vocab_size> <vector_size>"``,
    followed by ``vocab_size`` records, each being the word, a single space,
    and ``vector_size`` raw float32 values. Newline bytes in front of a word
    are ignored.

    Args:
        word_to_id: dict mapping word -> row index of the returned matrix.
        word2vec_name: path of the binary word2vec file.

    Returns:
        numpy array of shape ``(len(word_to_id), vector_size)`` (numpy's
        default float dtype). Rows of words that do not occur in the file
        are left as zeros.

    Raises:
        EOFError: if the file ends before all records announced in the
            header have been read.
    """
    float_dtype = np.dtype('float32')

    # Default buffered I/O: words are scanned one byte at a time, which on an
    # unbuffered handle costs one system call per byte and allows short reads.
    with open(word2vec_name, "rb") as f:
        header = f.readline()
        vocab_size, vector_size = map(int, header.split())
        binary_len = float_dtype.itemsize * vector_size

        # Words missing from the pre-trained file keep an all-zero vector.
        initW = np.zeros((len(word_to_id), vector_size))
        for _ in range(vocab_size):
            chars = []
            while True:
                ch = f.read(1)
                if not ch:
                    # read() returns b'' at end of file; without this check
                    # a truncated file would loop forever.
                    raise EOFError(
                        "unexpected end of word2vec file while reading a word")
                if ch == b' ':
                    break
                if ch != b'\n':
                    chars.append(ch)
            word = b''.join(chars).decode('utf-8')

            row = word_to_id.get(word)
            if row is None:
                # Not in our vocabulary: skip the vector without copying it.
                f.seek(binary_len, 1)
                continue

            raw = f.read(binary_len)
            if len(raw) != binary_len:
                raise EOFError(
                    "unexpected end of word2vec file while reading a vector")
            initW[row] = np.frombuffer(raw, dtype=float_dtype)
    return initW


# Build the vocabulary and its matching pre-trained embedding matrix.
def get_word_to_id_word2vec(train_path):
    """Return the vocabulary of ``train_path`` together with its embeddings.

    The embedding file is taken from the module-level ``word2vec_path``.

    Returns:
        A 4-tuple ``(word_to_id, embedding_matrix, n_rows, n_cols)`` where the
        last two items are the first and second dimension of the matrix.
    """
    vocab = _build_vocab(train_path)
    embedding_matrix = loadEmbedding(vocab, word2vec_path)

    n_rows = embedding_matrix.shape[0]
    n_cols = embedding_matrix.shape[1]
    return vocab, embedding_matrix, n_rows, n_cols

# Data type
def data_type():
    """Return the TensorFlow float dtype ``tf.float32``."""
    dtype = tf.float32
    return dtype




In [None]:
# Build the vocabulary from the training corpus and load its word2vec rows.
(
    word_to_id,
    word2vec,
    vocab_size,
    embedding_dim,
) = get_word_to_id_word2vec(train_path)

In [None]:

def add_brnn(inputs,num_units,seq_lens,input_keep_prob,output_keep_prob,use_peepholes):
    """Run one shared bidirectional LSTM layer over each tensor in ``inputs``.

    A single forward cell and a single backward cell (each an ``LSTMCell``
    wrapped in a ``DropoutWrapper``) are created and applied to every
    (input, length) pair inside the variable scope ``'bi_dy_rnn'``. After the
    first pair the scope is switched to reuse, so later pairs (e.g. the right
    context) share the variables of the first one (e.g. the left context).

    Args:
        inputs: list of input tensors, e.g. ``[L_context, R_context]``, each
            documented by the original author as
            ``(batch_size, time_step, num_units)`` float32.
        num_units: number of units of each LSTM cell.
        seq_lens: list of int32 length tensors, one per entry of ``inputs``,
            each documented as shape ``(batch_size,)``.
        input_keep_prob: keep probability passed to ``DropoutWrapper`` for the
            cell inputs.
        output_keep_prob: keep probability passed to ``DropoutWrapper`` for the
            cell outputs.
        use_peepholes: forwarded to ``LSTMCell``.

    Returns:
        A pair of lists, each with one entry per element of ``inputs``:

        * the forward and backward outputs concatenated along axis 2
          (documented as ``(batch_size, time_step, 2*num_units)``); the ops
          are named ``'outputs'``;
        * the state pair returned by ``tf.nn.bidirectional_dynamic_rnn``,
          i.e. ``(forward_state, backward_state)``, each documented as an
          ``LSTMStateTuple(c, h)`` with members of shape
          ``(batch_size, num_units)``.
    """

    def _make_cell():
        # LSTM cell with dropout applied to its inputs/outputs.
        lstm = tf.nn.rnn_cell.LSTMCell(num_units=num_units, use_peepholes=use_peepholes)
        return tf.nn.rnn_cell.DropoutWrapper(
            lstm,
            input_keep_prob=input_keep_prob,
            output_keep_prob=output_keep_prob)

    # One cell per direction, shared by every input tensor.
    cell_fw = _make_cell()
    cell_bw = _make_cell()

    rnn_results = []
    with tf.variable_scope(name_or_scope='bi_dy_rnn') as rnn_scope:
        for sentences, lengths in zip(inputs, seq_lens):
            rnn_result = tf.nn.bidirectional_dynamic_rnn(
                cell_fw,
                cell_bw,
                sentences,
                sequence_length=lengths,
                dtype=tf.float32)
            rnn_results.append(rnn_result)

            # Every pass after the first one must reuse the same variables.
            rnn_scope.reuse_variables()

    # Built outside the variable scope so the concat ops keep their names.
    merged_outputs = []
    final_states = []
    for direction_outputs, direction_states in rnn_results:
        merged_outputs.append(tf.concat(direction_outputs, 2, name='outputs'))
        final_states.append(direction_states)

    return merged_outputs, final_states




In [None]:
# Reset the default graph first, otherwise re-running this cell raises an error
tf.reset_default_graph()

# Non-trainable embedding table; embedding_init copies the word2vec matrix
# fed through embedding_placeholder into it.
embedding = tf.Variable(tf.constant(0.0, shape=[vocab_size, embedding_dim]),trainable=False, name="embedding")
embedding_placeholder = tf.placeholder(tf.float32, [vocab_size, embedding_dim])
embedding_init = embedding.assign(embedding_placeholder)

# input data
L_context_id = tf.placeholder(tf.int32, shape=[None,None])  # left context word ids
L_context_length = tf.placeholder(tf.int32, shape=[None])   # left context lengths
R_context_id = tf.placeholder(tf.int32, shape=[None,None])  # right context word ids
R_context_length = tf.placeholder(tf.int32, shape=[None])   # right context lengths

# look up the word vectors
L_context_vec_before_drop = tf.nn.embedding_lookup(embedding, L_context_id,name='L_context_before_drop')
R_context_vec_before_drop = tf.nn.embedding_lookup(embedding, R_context_id,name='R_context_before_drop')

# Input dropout is only applied while training. The else branch is required:
# without it L_context_vec / R_context_vec would be undefined below.
if isTraining and input_keep_prob < 1:
    L_context_vec = tf.nn.dropout(L_context_vec_before_drop, input_keep_prob,name='L_context_vec')
    R_context_vec = tf.nn.dropout(R_context_vec_before_drop, input_keep_prob,name='R_context_vec')
else:
    L_context_vec = L_context_vec_before_drop
    R_context_vec = R_context_vec_before_drop

# The RNN cells must not drop activations outside of training either.
rnn_input_keep_prob = input_keep_prob if isTraining else 1.0
rnn_output_keep_prob = output_keep_prob if isTraining else 1.0

with tf.variable_scope('LSTM_1'):
    [L_outputs,R_outputs], _ = add_brnn(
        [L_context_vec,R_context_vec],
        num_units,
        [L_context_length,R_context_length],
        rnn_input_keep_prob,
        rnn_output_keep_prob,
        use_peepholes)

with tf.variable_scope('LSTM_2'):
    [L_outputs_2,R_outputs_2],[L_final_state,R_final_state] = add_brnn(
        [L_outputs,R_outputs],
        num_units,
        [L_context_length,R_context_length],
        rnn_input_keep_prob,
        rnn_output_keep_prob,
        use_peepholes)

# Left and right context features of the same example, concatenated along
# the time axis.
# NOTE(review): this uses the LSTM_1 outputs; L_outputs_2 / R_outputs_2 from
# LSTM_2 are never used - confirm whether the second layer was intended here.
outputs = tf.concat([L_outputs,R_outputs], 1)

# Encoding vector: concatenate the forward and backward final hidden states
# (h) of LSTM_2, then average the left and right context.
final_state = (
    tf.concat([L_final_state[0].h, L_final_state[1].h], 1)
    + tf.concat([R_final_state[0].h, R_final_state[1].h], 1))/2


# Mask for the padded positions. It is meant to be applied before a softmax,
# so padded positions get -1e12 added instead of being multiplied by 0.
# maxlen is the padded width of the id matrix so that the mask always lines up
# with `outputs`, even when no sequence in the batch fills the full width.
L_context_mask = (1-tf.sequence_mask(
    L_context_length, maxlen=tf.shape(L_context_id)[1], dtype=tf.float32))*(-1e12)

R_context_mask = (1-tf.sequence_mask(
    R_context_length, maxlen=tf.shape(R_context_id)[1], dtype=tf.float32))*(-1e12)

context_mask = tf.concat([L_context_mask,R_context_mask], 1)

In [None]:
# Toy batch of two examples: word ids plus the length fed for each context.
L_id = np.array([[71,72,73,70],[71,72,73,74]])
R_id = np.array([[75,76,77,78,70],[75,76,77,78,79]])
L_len = np.array([1,4])
R_len = np.array([1,3])

with tf.Session() as sess:

    init = tf.global_variables_initializer()
    sess.run(init)

    # Copy the word2vec matrix into the embedding variable.
    sess.run(embedding_init, feed_dict={embedding_placeholder: word2vec})

    demo_feed = {
        L_context_id: L_id,
        R_context_id: R_id,
        L_context_length: L_len,
        R_context_length: R_len,
    }

    # Fetch the left-context mask and the embedding table.
    ts, matrix = sess.run([L_context_mask, embedding], feed_dict=demo_feed)
ts