# Copyright 2019 TensorLayer. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Vector Representations of Words.
This is the minimalistic reimplementation of
tensorflow/examples/tutorials/word2vec/word2vec_basic.py
This basic example contains the code needed to download some data,
train on it a bit and visualize the result by using t-SNE.
Once you get comfortable with reading and running the basic version,
you can graduate to
tensorflow/models/embedding/word2vec.py
which is a more serious implementation that showcases some more advanced
TensorFlow principles about how to efficiently use threads to move data
into a text model, how to checkpoint during training, etc.
If your model is no longer I/O bound but you want still more performance, you
can take things further by writing your own TensorFlow Ops, as described in
Adding a New Op. Again we've provided an example of this for the Skip-Gram case
tensorflow/models/embedding/word2vec_optimized.py.
Link
------
https://www.tensorflow.org/versions/r0.9/tutorials/word2vec/index.html#vector-representations-of-words
"""
import argparse
import os
import time

import numpy as np
import tensorflow as tf
from six.moves import xrange  # pylint: disable=redefined-builtin

import tensorlayer as tl
import wget
parser = argparse.ArgumentParser()
parser.add_argument(
    "--model", default='one', type=str, required=False,
    help="The model name. It can be 'one', 'two', 'three' or 'four'."
)
FLAGS = parser.parse_args()
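
# Example invocation (assumed; adjust to your environment):
#   python tutorial_word2vec_basic.py --model one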


def main_word2vec_basic():
    # Step 1: Download the data and read the corpus into a list of strings.
    # Set hyperparameters.
    words = tl.files.load_matt_mahoney_text8_dataset()
    data_size = len(words)
    print(data_size)  # 17005207
    print(words[0:10])  # ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']

    resume = False  # load existing model, data and dictionaries
    _UNK = "_UNK"

    if FLAGS.model == "one":
        # toy setting (tensorflow/examples/tutorials/word2vec/word2vec_basic.py)
        vocabulary_size = 50000  # maximum number of words in the vocabulary
        batch_size = 128
        embedding_size = 128  # dimension of the embedding vector (hidden layer)
        skip_window = 1  # how many words to consider to the left and right
        num_skips = 2  # how many times to reuse an input to generate a label
        # (should be 2 * skip_window, so that both the left and right context words are used)
        num_sampled = 64  # number of negative examples to sample
        # (more negative samples, higher loss)
        learning_rate = 1.0
        n_epoch = 20
        model_file_name = "model_word2vec_50k_128"
        # Eval 2084/15851 accuracy = 15.7%
elif FLAGS.model == "two":
# (tensorflow/models/embedding/word2vec.py)
vocabulary_size = 80000
batch_size = 20 # Note: small batch_size need more steps for a Epoch
embedding_size = 200
skip_window = 5
num_skips = 10
num_sampled = 100
learning_rate = 0.2
n_epoch = 15
model_file_name = "model_word2vec_80k_200"
# 7.9%
elif FLAGS.model == "three":
# (tensorflow/models/embedding/word2vec_optimized.py)
vocabulary_size = 80000
batch_size = 500
embedding_size = 200
skip_window = 5
num_skips = 10
num_sampled = 25
learning_rate = 0.025
n_epoch = 20
model_file_name = "model_word2vec_80k_200_opt"
# bad 0%
elif FLAGS.model == "four":
# see: Learning word embeddings efficiently with noise-contrastive estimation
vocabulary_size = 80000
batch_size = 100
embedding_size = 600
skip_window = 5
num_skips = 10
num_sampled = 25
learning_rate = 0.03
n_epoch = 200 * 10
model_file_name = "model_word2vec_80k_600"
# bad
else:
raise Exception("Invalid model: %s" % FLAGS.model)
    num_steps = int((data_size / batch_size) * n_epoch)  # total number of iterations
    print('%d Steps in an Epoch, total Epochs %d' % (int(data_size / batch_size), n_epoch))
    print('   learning_rate: %f' % learning_rate)
    print('   batch_size: %d' % batch_size)
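
    # Rough scale check for the default model "one": 17,005,207 words with
    # batch_size=128 gives about 132,853 steps per epoch, so n_epoch=20 means
    # roughly 2.66 million parameter updates in total.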
    # Step 2: Build the dictionary and replace rare words with the 'UNK' token.
    print()
    if resume:
        print("Load existing data and dictionaries" + "!" * 10)
        all_var = tl.files.load_npy_to_any(name=model_file_name + '.npy')
        data = all_var['data']
        count = all_var['count']
        dictionary = all_var['dictionary']
        reverse_dictionary = all_var['reverse_dictionary']
    else:
        data, count, dictionary, reverse_dictionary = tl.nlp.build_words_dataset(words, vocabulary_size, True, _UNK)

    print(
        '5 most common words (+UNK)', count[:5]
    )  # [['UNK', 418391], (b'the', 1061396), (b'of', 593677), (b'and', 416629), (b'one', 411764)]
    print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
    # [5243, 3081, 12, 6, 195, 2, 3135, 46, 59, 156] [b'anarchism', b'originated', b'as', b'a', b'term', b'of', b'abuse', b'first', b'used', b'against']
    del words  # hint to reduce memory
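
    # The dictionary maps each word to an integer id assigned by frequency
    # rank, e.g. 'UNK' -> 0, b'the' -> 1, b'of' -> 2, b'and' -> 3, b'one' -> 4
    # (consistent with count[:5] printed above); reverse_dictionary maps the
    # ids back to words.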
    # Step 3: Generate training batches for the Skip-Gram model (sanity check).
    print()
    batch, labels, data_index = tl.nlp.generate_skip_gram_batch(
        data=data, batch_size=8, num_skips=4, skip_window=2, data_index=0
    )
    for i in range(8):
        print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

    batch, labels, data_index = tl.nlp.generate_skip_gram_batch(
        data=data, batch_size=8, num_skips=2, skip_window=1, data_index=0
    )
    for i in range(8):
        print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])
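
    # To make the pairs above concrete: with the corpus starting
    # "anarchism originated as a term of ...", skip_window=1 and num_skips=2
    # should produce (input -> label) pairs such as
    #   originated -> anarchism, originated -> as, as -> originated, as -> a
    # i.e. each centre word predicts the words inside its context window.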
    # Step 4: Build the Skip-Gram model.
    print()
    # We pick a random validation set to sample nearest neighbors. Here we
    # limit the validation samples to the words that have a low numeric ID,
    # which by construction are also the most frequent.
    valid_size = 16  # random set of words to evaluate similarity on
    valid_window = 100  # only pick dev samples in the head of the distribution
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)
    # a list of 'valid_size' integers smaller than 'valid_window'
    # print(valid_examples)  # e.g. [90 85 20 33 35 62 37 63 88 38 82 58 83 59 48 64]
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # 'inputs' is a row vector and 'labels' a column vector; each element is
    # the integer id of a single word.
    inputs = tl.layers.Input([batch_size], dtype=tf.int32)
    labels = tl.layers.Input([batch_size, 1], dtype=tf.int32)
    # Look up embeddings for the inputs and compute the NCE loss in one layer.
    emb_net = tl.layers.Word2vecEmbedding(
        vocabulary_size=vocabulary_size,
        embedding_size=embedding_size,
        num_sampled=num_sampled,
        activate_nce_loss=True,  # the layer also returns the NCE loss
        nce_loss_args={},
        E_init=tl.initializers.random_uniform(minval=-1.0, maxval=1.0),
        nce_W_init=tl.initializers.truncated_normal(stddev=float(1.0 / np.sqrt(embedding_size))),
        nce_b_init=tl.initializers.constant(value=0.0),
        name='word2vec_layer',
    )
    emb, nce = emb_net([inputs, labels])
    model = tl.models.Model(inputs=[inputs, labels], outputs=[emb, nce], name="word2vec_model")

    # The layer computes the average NCE loss for the batch; tf.nn.nce_loss
    # automatically draws a new sample of the negative labels each time the
    # loss is evaluated.
    # Construct the optimizer. Note: Adam is very slow in this case.
    optimizer = tf.optimizers.Adagrad(learning_rate, initial_accumulator_value=0.1)
    # embedding matrix with rows scaled to unit L2 norm (for cosine similarity)
    normalized_embeddings = emb_net.normalized_embeddings
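
    # Intuition for the NCE objective used above: instead of a full softmax
    # over all 'vocabulary_size' words, each (input, label) pair is scored
    # against 'num_sampled' randomly drawn negative words, and the model is
    # trained to rank the true context word above the sampled noise words.
    # This reduces the per-step cost from O(vocabulary_size) to O(num_sampled).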
    # Step 5: Start training.
    model.train()
    if resume:
        print("Load existing model" + "!" * 10)
        model.load_weights(filepath=model_file_name + '.hdf5')

    # save the vocabulary to a txt file
    tl.nlp.save_vocab(count, name='vocab_text8.txt')

    average_loss = 0
    step = 0
    print_freq = 2000
    while step < num_steps:
        start_time = time.time()
        batch_inputs, batch_labels, data_index = tl.nlp.generate_skip_gram_batch(
            data=data, batch_size=batch_size, num_skips=num_skips, skip_window=skip_window, data_index=data_index
        )

        # We perform one update step: compute the NCE loss under a gradient
        # tape, then apply the gradients to the trainable weights.
        with tf.GradientTape() as tape:
            outputs, nce_cost = model([batch_inputs, batch_labels])
        grad = tape.gradient(nce_cost, model.trainable_weights)
        optimizer.apply_gradients(zip(grad, model.trainable_weights))
        average_loss += float(nce_cost)  # accumulate as a Python float

        if step % print_freq == 0:
            if step > 0:
                average_loss /= print_freq
            print(
                "Average loss at step %d/%d. loss: %f took: %fs per step" %
                (step, num_steps, average_loss, time.time() - start_time)
            )
            average_loss = 0
        # Print out nearby words for the validation words.
        # Note that this is expensive (~20% slowdown if computed every 500 steps).
        if step % (print_freq * 5) == 0:
            # Compute the similarity between the validation examples and all
            # embeddings. Because every row of 'normalized_embeddings' has unit
            # L2 norm, this matrix product is exactly the cosine similarity.
            valid_embed = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
            sim = tf.matmul(valid_embed, normalized_embeddings, transpose_b=True).numpy()
            # (transpose_b=True transposes normalized_embeddings before the multiplication)
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors to print
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]  # skip index 0, the word itself
                log_str = "Nearest to %s:" % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)
        if (step % (print_freq * 20) == 0) and (step != 0):
            print("Save model, data and dictionaries" + "!" * 10)
            model.save_weights(filepath=model_file_name + ".hdf5")
            tl.files.save_any_to_npy(
                save_dict={
                    'data': data,
                    'count': count,
                    'dictionary': dictionary,
                    'reverse_dictionary': reverse_dictionary
                }, name=model_file_name + '.npy'
            )

        step += 1
    # Step 6: Visualize the normalized embedding matrix by t-SNE.
    print()
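    # t-SNE projects the 'embedding_size'-dimensional vectors into 2-D so that
    # words with similar embeddings land near each other; only the most
    # frequent 'plot_only' words are drawn to keep the figure readable.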
    final_embeddings = normalized_embeddings.numpy()  # materialize as a numpy array for plotting
    tl.visualize.tsne_embedding(
        final_embeddings, reverse_dictionary, plot_only=500, second=5, saveable=False, name='word2vec_basic'
    )
    # Step 7: Evaluate by analogy questions.
    # See tensorflow/models/embedding/word2vec_optimized.py
    print()
    model.eval()

    # from tensorflow/models/embedding/word2vec.py
    if not os.path.exists("questions-words.txt"):
        print("Downloading file 'questions-words.txt'")
        wget.download('http://download.tensorflow.org/data/questions-words.txt')
    analogy_questions = tl.nlp.read_analogies_file(eval_file='questions-words.txt', word2id=dictionary)

    # For each question (row in dist), find the top 'n_answer' words.
    n_answer = 4
    def predict(analogy):
        """Predict the top 'n_answer' candidate words for a batch of analogies.

        The eval feeds three vectors of word ids for a, b, c, each of which is
        of size N, where N is the number of analogies we want to evaluate in
        one batch.
        """
        analogy_a = analogy[:, 0]  # [N]
        analogy_b = analogy[:, 1]  # [N]
        analogy_c = analogy[:, 2]  # [N]
        # Each row of a_emb, b_emb, c_emb is a word's embedding vector.
        # They all have the shape [N, emb_dim].
        a_emb = tf.gather(normalized_embeddings, analogy_a)  # a's embeddings
        b_emb = tf.gather(normalized_embeddings, analogy_b)  # b's embeddings
        c_emb = tf.gather(normalized_embeddings, analogy_c)  # c's embeddings
        # We expect that d's embedding vector on the unit hyper-sphere is
        # near: c_emb + (b_emb - a_emb), which has the shape [N, emb_dim].
        # e.g. Bangkok:Thailand :: Tokyo:? gives Thailand - Bangkok = ? - Tokyo,
        # so ? = Tokyo + (Thailand - Bangkok), i.e. d = c + (b - a).
        target = c_emb + (b_emb - a_emb)
        # Compute the cosine distance between each pair of target and vocab.
        # dist has shape [N, vocab_size].
        dist = tf.matmul(target, normalized_embeddings, transpose_b=True)
        _, pred_idx = tf.nn.top_k(dist, n_answer)
        return pred_idx
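
    # A classic worked example of this arithmetic (for well-trained embeddings;
    # the small models above may not reproduce it): with a='man', b='woman',
    # c='king', the target c + (b - a) should land near 'queen', i.e.
    # vec('king') - vec('man') + vec('woman') ~ vec('queen').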
    # Evaluate the analogy questions and report accuracy, i.e. how many
    # questions we answer correctly at precision@1.
    correct = 0
    total = analogy_questions.shape[0]
    start = 0
    while start < total:
        limit = start + 2500
        sub = analogy_questions[start:limit, :]  # a batch of questions
        idx = predict(sub).numpy()  # top 4 candidate answers for each question
        # print('question:', tl.nlp.word_ids_to_words(sub[0], reverse_dictionary))
        # print('answers:', tl.nlp.word_ids_to_words(idx[0], reverse_dictionary))
        start = limit
        for question in xrange(sub.shape[0]):
            for j in xrange(n_answer):
                # if one of the top 4 answers is correct, we win!
                if idx[question, j] == sub[question, 3]:
                    # Bingo! We predicted correctly, e.g. [italy, rome, france, paris].
                    print(
                        j + 1, tl.nlp.word_ids_to_words([idx[question, j]], reverse_dictionary), ':',
                        tl.nlp.word_ids_to_words(sub[question, :], reverse_dictionary)
                    )
                    correct += 1
                    break
                elif idx[question, j] in sub[question, :3]:
                    # We need to skip words that already appear in the question.
                    continue
                else:
                    # The correct label is not the top prediction.
                    break
    print("Eval %4d/%d accuracy = %4.1f%%" % (correct, total, correct * 100.0 / total))


if __name__ == '__main__':
    main_word2vec_basic()