-
Notifications
You must be signed in to change notification settings - Fork 0
/
base_model.py
420 lines (362 loc) · 21.2 KB
/
base_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
"""Definition of the base model class, which is inherited by concrete model classes."""
import os
import time
import codecs
import numpy as np
import tensorflow as tf
def load_embed_txt(embed_file, vocab):
    """Read a text embedding file, keeping only words present in the vocabulary.

    Each line is stripped and split on single spaces: the first field is the
    word and the remaining fields are parsed as floats. Lines whose word is
    not a key of `vocab._word_to_id` are ignored.

    Args:
      embed_file: path of the UTF-8 text embedding file (opened through
        `tf.gfile.GFile`).
      vocab: object exposing a `_word_to_id` container used for membership
        tests.

    Returns:
      A tuple `(vectors, dim)` where `vectors` maps each kept word to its list
      of floats and `dim` is the length of those lists (`None` when no word
      was kept).

    Raises:
      AssertionError: if two kept vectors have different lengths.
    """
    vectors = {}
    dim = None
    utf8_reader = codecs.getreader("utf-8")
    with utf8_reader(tf.gfile.GFile(embed_file, 'rb')) as stream:
        for raw_line in stream:
            fields = raw_line.strip().split(" ")
            token = fields[0]
            if token in vocab._word_to_id:
                vector = [float(value) for value in fields[1:]]
                vectors[token] = vector
                if not dim:
                    # First kept vector fixes the expected width.
                    dim = len(vector)
                else:
                    assert dim == len(vector), "All embedding size should be same."
    return vectors, dim
def _create_pretrained_emb_from_vocab(vocab, embed_file, dtype=tf.float32, name=None):
    """Create a trainable embedding matrix initialised from a pretrained file.

    Words found in `embed_file` start from their pretrained vector; every
    other vocabulary word starts from a standard-normal random vector of the
    same width.

    Fixes over the previous version:
      * The variable is now initialised with the loaded matrix. Previously it
        was created without an initializer and concatenated with an empty
        slice, so the pretrained vectors were never used.
      * Random fallback vectors use the width read from the file instead of a
        hard-coded 200.
      * Rows are ordered by word id rather than by dict iteration order.

    Args:
      vocab: object exposing a `_word_to_id` mapping from word to integer id.
      embed_file: path of the text embedding file, read by `load_embed_txt`.
      dtype: TensorFlow dtype of the embedding matrix.
      name: name of the variable to create.

    Returns:
      A trainable variable of shape [len(vocab._word_to_id), emb_size].
    """
    word_to_id = vocab._word_to_id
    emb_dict, emb_size = load_embed_txt(embed_file, vocab)
    if not emb_size:
        # No vocabulary word was found in the file, so the width cannot be
        # inferred; fall back to the width the code historically assumed.
        emb_size = 200

    for token in word_to_id:
        if token not in emb_dict:
            emb_dict[token] = np.random.normal(size=emb_size)

    # Row i must hold the vector of the word whose id is i, because the ids
    # are used as row indices by the embedding lookup.
    # NOTE(review): assumes ids are the contiguous range 0..N-1 - confirm
    # against the vocabulary class.
    ordered_tokens = sorted(word_to_id, key=word_to_id.get)
    emb_mat = np.array([emb_dict[token] for token in ordered_tokens],
                       dtype=dtype.as_numpy_dtype())

    return tf.get_variable(name,
                           shape=emb_mat.shape,
                           dtype=dtype,
                           initializer=tf.constant_initializer(emb_mat))
class baseModel(object):
    """Base class for seq2seq models.

    Builds the parts of the TensorFlow graph shared by the concrete models:
    placeholders, embeddings, a two-layer bidirectional LSTM encoder, the
    output projection, the loss (train/eval) or top-k ops (decode) and the
    training op. Subclasses must implement `_add_decoder`, which is expected
    to populate attributes such as `arg_dec_outputs`, `_dec_out_state` and
    `attn_dists` (and the `kp_*` counterparts for the "sep_dec"/"shd_dec"
    models).

    NOTE(review): the block nesting below was reconstructed from a source
    whose indentation had been stripped; spots where the nesting was
    ambiguous are marked with NOTE(review) comments.
    """

    def __init__(self, hps, src_vocab, tgt_vocab):
        """Store the hyper-parameters and vocabularies.

        Args:
          hps: parsed command-line arguments holding the hyper-parameters.
          src_vocab: source-side vocabulary.
          tgt_vocab: target-side vocabulary.
        """
        # Declare here the attributes that the rest of the class relies on.
        self.hps = hps
        self._src_vocab = src_vocab
        self._tgt_vocab = tgt_vocab
        self.arg_dec_outputs = None
        self._dec_out_state = None
        self.attn_dists = None

    def __call__(self):
        """Build the whole graph: placeholders, model, train op and summaries."""
        print("[*] Building graph...")
        t0 = time.time()
        self._add_placeholder()
        self._add_model()
        print(" Time to add model: %i seconds" % (time.time() - t0))
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        if self.hps.mode == "train":
            self._add_train_op()
        self.summaries = tf.summary.merge_all()
        print(" Time to build graph: %i seconds" % (time.time() - t0))
        return

    def _add_placeholder(self):
        """Create the input placeholders for the encoder and decoder(s)."""
        batch_size = self.hps.batch_size

        # encoder part
        self._enc_batch = tf.placeholder(tf.int32, [batch_size, None], name='enc_batch')
        self._enc_lens = tf.placeholder(tf.int32, [batch_size], name='enc_lens')
        self._enc_padding_mask = tf.placeholder(tf.float32, [batch_size, None],
                                                name='enc_padding_mask')

        # decoder part
        arg_shape = [batch_size, self.hps.arg_max_dec_steps]
        self._arg_dec_batch = tf.placeholder(tf.int32, arg_shape, name='arg_dec_batch')
        self._arg_target_batch = tf.placeholder(tf.int32, arg_shape,
                                                name='arg_target_batch')
        self._arg_dec_padding_mask = tf.placeholder(tf.float32, arg_shape,
                                                    name='arg_dec_padding_mask')

        # add placeholder for kp decoder if necessary
        if self.hps.model in ["sep_dec", "shd_dec"]:
            kp_shape = [batch_size, self.hps.kp_max_dec_steps]
            self._kp_dec_batch = tf.placeholder(tf.int32, kp_shape, name='kp_dec_batch')
            self._kp_target_batch = tf.placeholder(tf.int32, kp_shape,
                                                   name='kp_target_batch')
            self._kp_dec_padding_mask = tf.placeholder(tf.float32, kp_shape,
                                                       name='kp_dec_padding_mask')

        # Fed unconditionally by decode_onestep, so it is created for every model.
        self._initial_attention = tf.placeholder_with_default(tf.constant(False), shape=[])

    def _add_model(self):
        """Assemble embeddings, encoder, decoder, projection and loss/top-k ops."""
        with tf.variable_scope("seq2seq_model"):
            # Some initializers
            self.rand_unif_init = tf.random_uniform_initializer(
                -self.hps.rand_unif_init_mag, self.hps.rand_unif_init_mag, seed=123)
            self.trunc_norm_init = tf.truncated_normal_initializer(
                stddev=self.hps.trunc_norm_init_std)

            # Add the embedding matrices. Pretrained word vectors
            # (glove/glove.6B.200d.txt in the original setup) are loaded when
            # the file exists; otherwise randomly initialised matrices are used.
            with tf.variable_scope('embedding'):
                self._add_embedding()

            # Run the encoder over the embedded input words, then reduce its
            # final states to the decoder's initial state.
            # NOTE(review): _reduce_states is assumed to sit inside the
            # "encoder" scope - confirm, since this affects variable names.
            with tf.variable_scope("encoder"):
                self._add_encoder()
                self._reduce_states()

            with tf.variable_scope("decoder"):
                self._add_decoder()

            with tf.variable_scope("output_projection"):
                self._add_output_projection()

            # NOTE(review): the loss / top-k section is assumed to be nested
            # in "seq2seq_model" - confirm (only op and summary names depend on it).
            if self.hps.mode in ["train", "eval"]:
                with tf.variable_scope("loss"):
                    self._loss_arg = tf.contrib.seq2seq.sequence_loss(
                        tf.stack(self.arg_dec_vocab_scores, axis=1),
                        self._arg_target_batch, self._arg_dec_padding_mask)
                    self._loss = self._loss_arg
                    if self.hps.model in ["sep_dec", "shd_dec"]:
                        self._loss_kp = tf.contrib.seq2seq.sequence_loss(
                            tf.stack(self.kp_dec_vocab_scores, axis=1),
                            self._kp_target_batch, self._kp_dec_padding_mask)
                        self._loss += self._loss_kp
                        tf.summary.scalar('loss_kp', self._loss_kp)
                    tf.summary.scalar('loss', self._loss)
                    tf.summary.scalar('loss_arg', self._loss_arg)
            else:
                # Decode mode: the graph runs exactly one decoder step.
                assert len(self.arg_dec_vocab_dists) == 1
                topk_probs_arg, self._topk_ids_arg = tf.nn.top_k(
                    self.arg_dec_vocab_dists[0], self.hps.batch_size * 2 + 1)
                self._topk_log_probs_arg = tf.log(topk_probs_arg)
                if self.hps.model in ["sep_dec", "shd_dec"]:
                    assert len(self.kp_dec_vocab_dists) == 1
                    topk_probs_kp, self._topk_ids_kp = tf.nn.top_k(
                        self.kp_dec_vocab_dists[0], self.hps.batch_size * 2 + 1)
                    self._topk_log_probs_kp = tf.log(topk_probs_kp)

    def _add_embedding(self):
        """Create the source/target embedding matrices and embed the inputs."""
        if os.path.exists(self.hps.embed_path):
            # Load the pretrained vectors (glove/glove.6B.200d.txt in the
            # original setup).
            embedding_encoder = _create_pretrained_emb_from_vocab(
                self._src_vocab, self.hps.embed_path, name="embedding_src")
            embedding_decoder = _create_pretrained_emb_from_vocab(
                self._tgt_vocab, self.hps.embed_path, name="embedding_tgt")
        else:
            # No pretrained file: create randomly initialised matrices.
            embedding_encoder = tf.get_variable(
                'embedding_src', [self._src_vocab.size(), self.hps.emb_dim],
                dtype=tf.float32, initializer=self.trunc_norm_init)
            embedding_decoder = tf.get_variable(
                'embedding_tgt', [self._tgt_vocab.size(), self.hps.emb_dim],
                dtype=tf.float32, initializer=self.trunc_norm_init)

        # Map every word id to its embedding row.
        self.emb_enc_inputs = tf.nn.embedding_lookup(embedding_encoder, self._enc_batch)
        # Decoder inputs are kept as a list with one tensor per decoding step.
        self.emb_arg_dec_inputs = [tf.nn.embedding_lookup(embedding_decoder, x)
                                   for x in tf.unstack(self._arg_dec_batch, axis=1)]
        if self.hps.model in ["sep_dec", "shd_dec"]:
            self.emb_kp_dec_inputs = [tf.nn.embedding_lookup(embedding_decoder, x)
                                      for x in tf.unstack(self._kp_dec_batch, axis=1)]

    def _add_train_op(self):
        """Add the op that clips the gradients and applies one optimizer step."""
        loss_to_minimize = self._loss
        tvars = tf.trainable_variables()
        gradients = tf.gradients(
            loss_to_minimize, tvars,
            aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)

        # Clip the gradients
        grads, global_norm = tf.clip_by_global_norm(gradients, self.hps.max_grad_norm)

        # Add a summary
        tf.summary.scalar('global_norm', global_norm)

        # Apply optimizer
        if self.hps.optimizer == "adam":
            optimizer = tf.train.AdamOptimizer(self.hps.learning_rate)
        else:
            # NOTE(review): Adagrad uses a fixed 0.15 learning rate rather
            # than hps.learning_rate - confirm this is intended.
            optimizer = tf.train.AdagradOptimizer(
                0.15, initial_accumulator_value=self.hps.adagrad_init_acc)
        self._train_op = optimizer.apply_gradients(
            zip(grads, tvars), global_step=self.global_step, name='train_step')

    def _add_output_projection(self):
        """Project decoder outputs onto the target vocabulary.

        Sets `arg_dec_vocab_scores` / `arg_dec_vocab_dists` (and the `kp_*`
        counterparts for "sep_dec"/"shd_dec") as lists with one tensor per
        decoding step. The kp projection reuses the arg projection weights.
        """
        vocab_size = self._tgt_vocab.size()
        with tf.variable_scope('arg_dec_output_projection'):
            w = tf.get_variable('w1', [self.hps.hidden_dim, vocab_size],
                                dtype=tf.float32, initializer=self.trunc_norm_init)
            v = tf.get_variable('v1', [vocab_size],
                                dtype=tf.float32, initializer=self.trunc_norm_init)
            # Flatten all steps so a single matmul covers every time step.
            arg_dec_flattened = tf.reshape(
                tf.stack(self.arg_dec_outputs), [-1, self.hps.hidden_dim])
            arg_dec_vocab_scores = tf.nn.xw_plus_b(arg_dec_flattened, w, v)
            arg_dec_vocab_scores = tf.reshape(
                arg_dec_vocab_scores, [-1, self.hps.batch_size, vocab_size])
            arg_dec_vocab_dists = tf.nn.softmax(arg_dec_vocab_scores)
            self.arg_dec_vocab_scores = tf.unstack(arg_dec_vocab_scores)
            self.arg_dec_vocab_dists = tf.unstack(arg_dec_vocab_dists)

        if self.hps.model in ["sep_dec", "shd_dec"]:
            with tf.variable_scope('kp_dec_output_projection'):
                kp_dec_flattened = tf.reshape(
                    tf.stack(self.kp_dec_outputs), [-1, self.hps.hidden_dim])
                kp_dec_vocab_scores = tf.nn.xw_plus_b(kp_dec_flattened, w, v)
                kp_dec_vocab_scores = tf.reshape(
                    kp_dec_vocab_scores, [-1, self.hps.batch_size, vocab_size])
                kp_dec_vocab_dists = tf.nn.softmax(kp_dec_vocab_scores)
                self.kp_dec_vocab_scores = tf.unstack(kp_dec_vocab_scores)
                self.kp_dec_vocab_dists = tf.unstack(kp_dec_vocab_dists)

    def _add_encoder(self):
        """Add a two-layer bidirectional LSTM encoder.

        Sets `encoder_outputs` (forward and backward outputs concatenated on
        the last axis) and `bi_fw_st_conc` / `bi_bw_st_conc` (per-direction
        final states with both layers concatenated).

        Original author's note: compare with `class Encoder(tf.keras.Model)`
        in https://www.tensorflow.org/beta/tutorials/text/nmt_with_attention ;
        the reason for stacking two LSTM layers was not clear to the author.
        """
        def make_cell():
            # One LSTM layer, optionally wrapped with input dropout.
            cell = tf.contrib.rnn.LSTMCell(self.hps.hidden_dim,
                                           initializer=self.rand_unif_init,
                                           state_is_tuple=True)
            if self.hps.dropout > 0.0:
                cell = tf.contrib.rnn.DropoutWrapper(
                    cell=cell, input_keep_prob=(1 - self.hps.dropout))
            return cell

        cell_fw = tf.contrib.rnn.MultiRNNCell([make_cell(), make_cell()])
        cell_bw = tf.contrib.rnn.MultiRNNCell([make_cell(), make_cell()])

        (bi_outputs, (bi_fw_st, bi_bw_st)) = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, self.emb_enc_inputs, dtype=tf.float32,
            sequence_length=self._enc_lens, swap_memory=True)

        # concatenate state of two layers
        # Presumably each layer's LSTMStateTuple is converted to a tensor whose
        # leading axis indexes (c, h), which is why index 0 / 1 is read below
        # as c / h - TODO confirm.
        bi_fw_st_conc = tf.concat(axis=2, values=[bi_fw_st[0], bi_fw_st[1]])
        bi_bw_st_conc = tf.concat(axis=2, values=[bi_bw_st[0], bi_bw_st[1]])
        self.bi_fw_st_conc = tf.contrib.rnn.LSTMStateTuple(c=bi_fw_st_conc[0],
                                                           h=bi_fw_st_conc[1])
        self.bi_bw_st_conc = tf.contrib.rnn.LSTMStateTuple(c=bi_bw_st_conc[0],
                                                           h=bi_bw_st_conc[1])
        self.encoder_outputs = tf.concat(axis=2, values=bi_outputs)

    def _reduce_states(self):
        """Reduce the encoder final states to the decoder's initial state.

        The forward and backward states (width 4 * hidden_dim once
        concatenated) go through a linear layer followed by a ReLU down to
        2 * hidden_dim, then are split into one LSTMStateTuple per decoder
        layer. The result is stored in `_dec_in_state`.
        """
        hidden_dim = self.hps.hidden_dim
        w_reduce_c = tf.get_variable('w_reduce_c', [hidden_dim * 4, hidden_dim * 2],
                                     dtype=tf.float32, initializer=self.trunc_norm_init)
        w_reduce_h = tf.get_variable('w_reduce_h', [hidden_dim * 4, hidden_dim * 2],
                                     dtype=tf.float32, initializer=self.trunc_norm_init)
        bias_reduce_c = tf.get_variable('bias_reduce_c', [hidden_dim * 2],
                                        dtype=tf.float32,
                                        initializer=self.trunc_norm_init)
        bias_reduce_h = tf.get_variable('bias_reduce_h', [hidden_dim * 2],
                                        dtype=tf.float32,
                                        initializer=self.trunc_norm_init)

        # Apply linear layer
        # The encoder final states are read from the attributes set by
        # _add_encoder.
        old_c = tf.concat(axis=1, values=[self.bi_fw_st_conc.c, self.bi_bw_st_conc.c])
        old_h = tf.concat(axis=1, values=[self.bi_fw_st_conc.h, self.bi_bw_st_conc.h])
        new_c = tf.nn.relu(tf.matmul(old_c, w_reduce_c) + bias_reduce_c)
        new_h = tf.nn.relu(tf.matmul(old_h, w_reduce_h) + bias_reduce_h)

        # One (c, h) pair per decoder layer.
        new_c_1, new_c_2 = tf.split(new_c, [hidden_dim, hidden_dim], 1)
        new_h_1, new_h_2 = tf.split(new_h, [hidden_dim, hidden_dim], 1)
        self._dec_in_state = tuple([tf.contrib.rnn.LSTMStateTuple(new_c_1, new_h_1),
                                    tf.contrib.rnn.LSTMStateTuple(new_c_2, new_h_2)])

    def _add_decoder(self):
        """Add the decoder(s) to the graph; must be provided by subclasses."""
        raise NotImplementedError("Subclasses should implement this!")

    def _make_feed_dict(self, batch, just_enc=False):
        """Map the placeholders to the arrays of `batch`.

        Args:
          batch: object carrying the `enc_*`, `arg_*` and (for
            "sep_dec"/"shd_dec") `kp_*` arrays.
          just_enc: when True only the encoder placeholders are fed.

        Returns:
          The feed dictionary.
        """
        # Original author's observations from one run (batch size 32):
        # enc_batch was [32, 250], enc_lens was [32] with every value 250,
        # and enc_padding_mask was a [32, 250] matrix of ones.
        feed_dict = {
            self._enc_batch: batch.enc_batch,
            self._enc_lens: batch.enc_lens,
            self._enc_padding_mask: batch.enc_padding_mask,
        }
        if not just_enc:
            feed_dict[self._arg_dec_batch] = batch.arg_dec_batch
            feed_dict[self._arg_target_batch] = batch.arg_target_batch
            feed_dict[self._arg_dec_padding_mask] = batch.arg_dec_padding_mask
            if self.hps.model in ["sep_dec", "shd_dec"]:
                feed_dict[self._kp_dec_batch] = batch.kp_dec_batch
                feed_dict[self._kp_target_batch] = batch.kp_target_batch
                feed_dict[self._kp_dec_padding_mask] = batch.kp_dec_padding_mask
        return feed_dict

    def run_step(self, sess, batch):
        """Run one train/eval step.

        Returns:
          The dict returned by `sess.run` with keys 'summaries', 'loss',
          'global_step' and, in train mode, 'train_op'.
        """
        feed_dict = self._make_feed_dict(batch)
        to_return = {
            'summaries': self.summaries,
            'loss': self._loss,
            'global_step': self.global_step,
        }
        if self.hps.mode == "train":
            to_return["train_op"] = self._train_op
        return sess.run(to_return, feed_dict)

    def run_encoder(self, sess, batch):
        """Run the encoder on `batch` (used for decoding).

        Returns:
          A tuple `(enc_states, dec_in_state)` where `dec_in_state` holds, for
          each of the two decoder layers, the initial state of the first
          example of the batch only.
        """
        feed_dict = self._make_feed_dict(batch, just_enc=True)
        enc_states, dec_in_state = sess.run(
            [self.encoder_outputs, self._dec_in_state], feed_dict)
        # Keep only the first example's state for each layer.
        dec_in_state = (
            tf.contrib.rnn.LSTMStateTuple(dec_in_state[0].c[0], dec_in_state[0].h[0]),
            tf.contrib.rnn.LSTMStateTuple(dec_in_state[1].c[0], dec_in_state[1].h[0]))
        return enc_states, dec_in_state

    def decode_onestep(self, sess, batch, latest_tokens, enc_states,
                       kp_dec_states, dec_init_states, arm, first_step=False):
        """Run a single decoder step for every hypothesis of the beam.

        Args:
          sess: TensorFlow session.
          batch: batch providing `enc_padding_mask`.
          latest_tokens: last token id of each hypothesis.
          enc_states: encoder outputs, as returned by `run_encoder`.
          kp_dec_states: pair indexed as `[0]` (fed to `self.kp_states`) and
            `[1]` (fed to `self.kp_dec_padding_mask`); only used for dual
            attention.
          dec_init_states: one `(layer0_state, layer1_state)` pair of
            LSTMStateTuples per hypothesis.
          arm: 2 selects the kp decoder (when the model has one); any other
            value selects the arg decoder.
          first_step: whether this is the first decoding step.

        Returns:
          A tuple `(ids, probs, new_states, attn_dists, dec_states)`.
          `attn_dists` is a list; with dual attention it is a list of
          `(attn_dist, dual_attn_dist)` pairs. `dec_states` is
          `[None] * beam_size` unless the kp decoder was run.
        """
        beam_size = len(dec_init_states)

        def stack_layer(layer):
            # Gather one layer's per-hypothesis states into
            # [beam_size, hidden_dim] arrays.
            cells = np.stack([state[layer].c for state in dec_init_states], axis=0)
            hiddens = np.stack([state[layer].h for state in dec_init_states], axis=0)
            return tf.contrib.rnn.LSTMStateTuple(cells, hiddens)

        new_dec_in_state = (stack_layer(0), stack_layer(1))
        tokens_column = np.transpose(np.array([latest_tokens]))

        feed = {
            self.encoder_outputs: enc_states,
            self._enc_padding_mask: batch.enc_padding_mask,
            self._dec_in_state: new_dec_in_state,
            self._initial_attention: not first_step,
        }

        if arm == 2 and self.hps.model in ["sep_dec", "shd_dec"]:  # aux task
            feed[self._kp_dec_batch] = tokens_column
            to_return = {
                "ids": self._topk_ids_kp,
                "probs": self._topk_log_probs_kp,
                "last_states": self._dec_out_state[1],
                "dec_states": self.kp_dec_states,
                "attn_dists": self.attn_dists[1]
            }
        else:
            feed[self._arg_dec_batch] = tokens_column
            if self.hps.model == "sep_dec":
                feed[self._kp_dec_batch] = tokens_column
            elif self.hps.model == "shd_dec":
                feed[self._dec_out_state[1]] = new_dec_in_state
            to_return = {
                "ids": self._topk_ids_arg,
                "probs": self._topk_log_probs_arg,
                "last_states": self._dec_out_state[0],
                "attn_dists": self.attn_dists[0]
            }
            # NOTE(review): dual attention is assumed to belong to the arg
            # branch only, since the kp branch does not feed _arg_dec_batch -
            # confirm.
            # NOTE(review): `self.kp_states` and `self.kp_dec_padding_mask`
            # (no leading underscore) are not defined in this class;
            # presumably subclasses create them - confirm.
            if self.hps.attention == "dual" and self.hps.model in ["sep_dec", "shd_dec"]:
                feed[self.kp_states] = kp_dec_states[0]
                feed[self.kp_dec_padding_mask] = kp_dec_states[1]
                to_return['dual_attn_dists'] = self.attn_dists[2]

        results = sess.run(to_return, feed_dict=feed)

        # Split the batched output states back into one pair per hypothesis.
        last_states = results['last_states']
        new_states = [
            (tf.contrib.rnn.LSTMStateTuple(last_states[0].c[i, :],
                                           last_states[0].h[i, :]),
             tf.contrib.rnn.LSTMStateTuple(last_states[1].c[i, :],
                                           last_states[1].h[i, :]))
            for i in range(beam_size)]

        if "dec_states" in results:
            dec_states = results['dec_states'].tolist()
        else:
            dec_states = [None] * beam_size

        assert len(results['attn_dists']) == 1
        attn_dists = results['attn_dists'][0].tolist()
        if "dual_attn_dists" in results:
            assert len(results['dual_attn_dists']) == 1
            dual_attn_dists = results['dual_attn_dists'][0].tolist()
            # Materialise the pairs: under Python 3 a bare zip() is a one-shot
            # iterator, unlike the list returned on the other path.
            attn_dists = list(zip(attn_dists, dual_attn_dists))

        return results['ids'], results['probs'], new_states, attn_dists, dec_states