In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import sys
import argparse
import random
from tempfile import gettempdir
import zipfile

import numpy as np
import pandas as pd
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf
from tqdm import tqdm

import pickle

from tensorflow.contrib.tensorboard.plugins import projector

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
# os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [3]:
zh_wiki_id = open("data/zh_wiki_id_w_d").readline()
word_to_id = pickle.load(open("data/word_to_id_w_d.pkl", "rb"))
id_to_word = pickle.load(open("data/id_to_word_w_d.pkl", "rb"))

In [4]:
len(id_to_word), len(word_to_id), len(zh_wiki_id)

(507260, 507260, 862838467)

In [5]:
def getWord(data, num, data_index):
    sub_data_string = data[data_index:data_index+num*(6+1)]
    result = []
    for index, item in enumerate(sub_data_string.split()):
        if index == num: break
        data_index += len(item) + 1
        result.append(int(item))
    if len(result) < num:
        return getWord(data, num, 0)
    assert len(result) == num
    return result, data_index

In [6]:
def generate_batch(batch_size, skip_window, num_skips):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size, num_skips), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    assert batch_size >= span
    buffer = collections.deque(maxlen=span)  # pylint: disable=redefined-builti
    
    result, data_index = getWord(zh_wiki_id, span, data_index)
    buffer.extend(result)
    
    for i in range(batch_size):
        context_words = [w for w in range(span) if w != skip_window]
        batch[i, :] = [buffer[token] for idx, token in enumerate(context_words)]
        labels[i, 0] = buffer[skip_window]
        result, data_index = getWord(zh_wiki_id, 1, data_index)
        buffer.append(result[0])
        if data_index > len(zh_wiki_id):
            result, data_index = getWord(zh_wiki_id, span-1, 0)
            buffer.extend(result)
        if i == batch_size - span:
            last_index = data_index
            
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = last_index
    return batch, labels

In [9]:
# data_index = 0

batch, labels = generate_batch(batch_size=8, skip_window=2, num_skips=2*2)
for i in range(8):
    print(batch[i, 0], id_to_word[batch[i, 0]],
          batch[i, 1], id_to_word[batch[i, 1]],
          '->', labels[i, 0], id_to_word[labels[i, 0]])
data_index

331 变化 73 以及 -> 651 空间
73 以及 651 空间 -> 38 等
651 空间 38 等 -> 904 概念
38 等 904 概念 -> 2 的
904 概念 2 的 -> 5871 一门
2 的 5871 一门 -> 2348 学科
5871 一门 2348 学科 -> 1 ，
2348 学科 1 ， -> 68 从


66

In [28]:
word_max_len = 16
vocabulary_size = len(id_to_word)
def get_char_batch(word_batch):
    char_batch = []
    char_num_batch = []
    for win in word_batch:
        char_win = []
        char_num_win = []
        for word in win:
            char_win_w = [vocabulary_size] * word_max_len
            w = id_to_word[word]
            for i, c in enumerate(w):
                if i >= word_max_len: break
                if c in word_to_id:
                    char_win_w[i] = word_to_id[c]
            char_win.append(char_win_w)
            char_num_win.append([len(w)])
        char_batch.append(char_win)
        char_num_batch.append(char_num_win)
    return np.array(char_batch), np.array(char_num_batch)

In [29]:
char_batch, char_num_batch = get_char_batch(batch)

for win in char_batch:
    for w in win:
        for c in w:
            if c == vocabulary_size:
                print("0", end=" ")
                continue
            print(id_to_word[c], end=" ")
        print(" - ", end="")
    print()

数 学 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 是 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 符 号 语 言 0 0 0 0 0 0 0 0 0 0 0 0  - 研 究 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 
是 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 利 用 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 研 究 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 数 量 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 
利 用 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 符 号 语 言 0 0 0 0 0 0 0 0 0 0 0 0  - 数 量 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 、 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 
符 号 语 言 0 0 0 0 0 0 0 0 0 0 0 0  - 研 究 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 、 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 结 构 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 
研 究 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 数 量 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 结 构 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 、 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 
数 量 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 、 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 、 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 变 化 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 
、 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 结 构 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 变 化 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 以 及 0 0 0 0 0 0 0 0 0 0 0 0 0 0  - 
结 构 0 0 0 0 0

In [12]:
len(id_to_word)

507260

In [13]:
batch_size = 200
batch_size = 100 # 0509 change
# batch_size = 256
embedding_size = 100    # Dimension of the embedding vector.
skip_window = 5    # How many words to consider left and right.
num_skips = 2*skip_window    # How many times to reuse an input to generate a label.
num_sampled = 100    # Number of negative examples to sample.
# num_sampled = 128    # Number of negative examples to sample.
vocabulary_size = len(id_to_word)

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. These 3 variables are used only for
# displaying model accuracy, they don't affect calculation.
# valid_size = 16    # Random set of words to evaluate similarity on.
# valid_window = 100    # Only pick dev samples in the head of the distribution.
# valid_examples = np.random.choice(valid_window, valid_size, replace=False)
# valid_examples = list(range(1, 10))
valid_examples = list(range(280, 291))
valid_size = len(valid_examples)

graph = tf.Graph()

with graph.as_default():

    # Input data.
    with tf.name_scope('inputs'):
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size, num_skips])
        train_inputs_char = tf.placeholder(tf.int32, shape=[batch_size, num_skips, word_max_len])
        train_inputs_char_num = tf.placeholder(tf.float32, shape=[batch_size, num_skips, 1])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/gpu:0'):
#     with tf.device('/gpu:0'):
    # Look up embeddings for inputs.
        with tf.name_scope('embeddings'):
            embeddings = tf.Variable(
                    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)
            print("embed:", embed)
            
            embeddings_concat = tf.concat([embeddings, [[0.0]*embedding_size]], 0)
            print("embeddings_concat：", embeddings_concat)
            embed_char = tf.nn.embedding_lookup(embeddings_concat, train_inputs_char)
            print("embed_char:", embed_char)
            embed_mean_char = tf.div(tf.reduce_sum(embed_char, 2), train_inputs_char_num)
            
            # take mean of embeddings of context words for context embedding
            embed_mean = (embed + embed_mean_char) / 2
            embed_context = tf.reduce_mean(embed_mean, 1)

    with tf.device('/gpu:0'):
        # Construct the variables for the NCE loss
        with tf.name_scope('weights'):
            nce_weights = tf.Variable(
                    tf.truncated_normal(
                            [vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
        with tf.name_scope('biases'):
            nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Compute the average NCE loss for the batch.
    # tf.nce_loss automatically draws a new sample of the negative labels each
    # time we evaluate the loss.
    # Explanation of the meaning of NCE loss:
    #     http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
    
#     with tf.device('/gpu:2'):
    with tf.name_scope('loss'):
#         loss = tf.reduce_mean(
#             tf.nn.nce_loss(nce_weights, nce_biases, embed_context, train_labels,
#                            num_sampled, vocabulary_size))
#         print(train_labels, embed_context)
        loss = tf.reduce_mean(
                tf.nn.nce_loss(
                        weights=nce_weights,
                        biases=nce_biases,
                        labels=train_labels,
                        inputs=embed_context,
#                         labels=embed_context,
#                         inputs=train_labels,
                        num_sampled=num_sampled,
                        num_classes=vocabulary_size))

    # Add the loss value as a scalar to summary.
    tf.summary.scalar('loss', loss)

    # Construct the SGD optimizer using a learning rate of 1.0.
    with tf.name_scope('optimizer'):
#         optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
        optimizer = tf.train.GradientDescentOptimizer(1).minimize(loss)
#         optimizer = tf.train.AdamOptimizer(0.01).minimize(loss)

    # Compute the cosine similarity between minibatch examples and all embeddings.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
    print(valid_embeddings, normalized_embeddings, similarity)

    # Merge all summaries.
    merged = tf.summary.merge_all()

    # Add variable initializer.
    init = tf.global_variables_initializer()

    # Create a saver.
    saver = tf.train.Saver()

embed: Tensor("embeddings/embedding_lookup:0", shape=(100, 10, 100), dtype=float32, device=/device:GPU:0)
embeddings_concat： Tensor("embeddings/concat:0", shape=(507261, 100), dtype=float32, device=/device:GPU:0)
embed_char: Tensor("embeddings/embedding_lookup_1:0", shape=(100, 10, 16, 100), dtype=float32, device=/device:GPU:0)
Tensor("embedding_lookup:0", shape=(11, 100), dtype=float32) Tensor("truediv:0", shape=(507260, 100), dtype=float32) Tensor("MatMul:0", shape=(11, 507260), dtype=float32)


In [None]:
num_steps = 20000010
# log_dir = "./log_002_baseline_cbow/"
log_dir = "./log_005_cwe/"

tfconfig = tf.ConfigProto()
tfconfig.gpu_options.allow_growth = True

with tf.Session(graph=graph, config=tfconfig) as session:
    # Open a writer to write summaries.
    writer = tf.summary.FileWriter(log_dir, session.graph)

    # We must initialize all variables before we use them.
#     init.run()
#     saver = tf.train.import_meta_graph('./checkpoint_dir/MyModel-1000.meta')
    saver.restore(session, tf.train.latest_checkpoint(log_dir))
    print('Initialized')

    average_loss = 0
    start_index = 880000
    for step in xrange(start_index, num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, skip_window=skip_window, num_skips=num_skips)
        char_batch, char_num_batch = get_char_batch(batch_inputs)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels, \
                     train_inputs_char: char_batch, train_inputs_char_num: char_num_batch}

        # Define metadata variable.
        run_metadata = tf.RunMetadata()

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
        # Also, evaluate the merged op to get all summaries from the returned "summary" variable.
        # Feed metadata variable to session for visualizing the graph in TensorBoard.
        _, summary, loss_val = session.run(
                [optimizer, merged, loss],
                feed_dict=feed_dict,
                run_metadata=run_metadata)
        average_loss += loss_val
        
        # Add returned summaries to writer in each step.
        writer.add_summary(summary, step)
        # Add metadata to visualize the graph for the last run.
        if step == (num_steps - 1):
            writer.add_run_metadata(run_metadata, 'step%d' % step)

        if step % 2000 == 0:
            if step > 0 and step != start_index:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = id_to_word[valid_examples[i]]
                top_k = 8    # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in xrange(top_k):
                    close_word = id_to_word[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
                
            # Save the model for checkpoints.
            saver.save(session, os.path.join(log_dir, 'model.ckpt'), global_step=step)
        
        if step % 1000000 == 0:
            word2vec = embeddings.eval()
            print(word2vec.shape, type(word2vec))
            np.save("result/003#cwe_win5_"+str(step), word2vec)
            
    final_embeddings = normalized_embeddings.eval()

    # Write corresponding labels for the embeddings.
    with open(log_dir + '/metadata.tsv', 'w') as f:
        for i in xrange(vocabulary_size):
            f.write(id_to_word[i] + '\n')

    # Save the model for checkpoints.
    saver.save(session, os.path.join(log_dir, 'model.ckpt'), global_step=step)

    # Create a configuration for visualizing embeddings with the labels in TensorBoard.
    config = projector.ProjectorConfig()
    embedding_conf = config.embeddings.add()
    embedding_conf.tensor_name = embeddings.name
    embedding_conf.metadata_path = os.path.join(log_dir, 'metadata.tsv')
    projector.visualize_embeddings(writer, config)

writer.close()