In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import sys
import argparse
import random
from tempfile import gettempdir
import zipfile

import numpy as np
import pandas as pd
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf
from tqdm import tqdm

import pickle

from tensorflow.contrib.tensorboard.plugins import projector

In [2]:
import os
# Expose only the GPU with index 2 to CUDA-based libraries in this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
# Alternative: an empty value hides every GPU.
# os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [3]:
# The corpus is kept as one long string of whitespace-separated integer word
# ids (only the first line of the file is read); getWord() walks through it
# by character offset.
with open("data/zh_wiki_id_w_d") as corpus_file:
    zh_wiki_id = corpus_file.readline()

# NOTE: pickle can execute arbitrary code while loading; only load files that
# were produced by a trusted preprocessing step.
with open("data/word_to_id_w_d.pkl", "rb") as vocab_file:
    word_to_id = pickle.load(vocab_file)
with open("data/id_to_word_w_d.pkl", "rb") as vocab_file:
    id_to_word = pickle.load(vocab_file)
# word_count = pickle.load(open("data/count.pkl", "rb"))

In [4]:
# Sanity check: size of both vocabulary mappings and the corpus length in characters.
len(id_to_word), len(word_to_id), len(zh_wiki_id)

(507260, 507260, 862838467)

In [5]:
def getWord(data, num, data_index, max_token_len=6):
    """Read `num` integer word ids from `data`, starting at offset `data_index`.

    Args:
      data: String of integer ids. The offset bookkeeping assumes every id is
        followed by exactly one separator character.
      num: Number of ids to read.
      data_index: Character offset in `data` at which reading starts; it is
        expected to point at the first character of an id.
      max_token_len: Upper bound on the number of characters of one id. Only
        `num * (max_token_len + 1)` characters are sliced out of `data`, so
        the (potentially huge) corpus string is never split as a whole.

    Returns:
      A tuple `(ids, next_index)`: the list of `num` ids and the offset right
      after the last id that was consumed (including its separator).

    Raises:
      ValueError: if `data` holds fewer than `num` ids even when read from the
        very beginning (the original code recursed without bound here).
    """
    start = data_index
    window = data[start:start + num * (max_token_len + 1)]
    result = []
    for item in window.split()[:num]:
        # +1 skips the separator that follows the token.
        data_index += len(item) + 1
        result.append(int(item))
    if len(result) < num:
        if start == 0:
            raise ValueError(
                "data contains fewer than %d tokens; cannot fill the request" % num)
        # Ran off the end of the corpus: wrap around and read from the start.
        return getWord(data, num, 0, max_token_len)
    return result, data_index

In [6]:
def generate_batch(batch_size, skip_window, num_skips):
    """Build one CBOW-style training batch from the global corpus string.

    Reads word ids from the module-level `zh_wiki_id` via `getWord`, starting
    at the module-level `data_index`, and updates `data_index` in place.

    Args:
      batch_size: Number of (context, target) rows to produce.
      skip_window: Number of context words taken on each side of the target.
      num_skips: Width of the context matrix. Every row is filled with all
        `2 * skip_window` context words, so the row assignment only succeeds
        when `num_skips == 2 * skip_window`.

    Returns:
      A tuple `(batch, labels)` of int32 arrays with shapes
      `(batch_size, num_skips)` and `(batch_size, 1)`.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size, num_skips), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    # Window layout: [ skip_window words, target, skip_window words ].
    span = 2 * skip_window + 1
    assert batch_size >= span
    window = collections.deque(maxlen=span)

    # Fill the sliding window with the first `span` words.
    ids, data_index = getWord(zh_wiki_id, span, data_index)
    window.extend(ids)

    # Positions inside the window that hold context words (all but the centre).
    context_positions = [pos for pos in range(span) if pos != skip_window]

    for row in range(batch_size):
        batch[row, :] = [window[pos] for pos in context_positions]
        labels[row, 0] = window[skip_window]

        # Slide the window one word to the right.
        ids, data_index = getWord(zh_wiki_id, 1, data_index)
        window.append(ids[0])
        if data_index > len(zh_wiki_id):
            # Moved past the end of the corpus: refill from its beginning.
            ids, data_index = getWord(zh_wiki_id, span - 1, 0)
            window.extend(ids)

        if row == batch_size - span:
            # Offset to resume from, so the words at the end of this batch
            # are not skipped by the next one.
            last_index = data_index

    data_index = last_index
    return batch, labels

In [7]:
# Start reading the corpus from its first character.
data_index = 0

batch, labels = generate_batch(batch_size=8, skip_window=1, num_skips=2*1)
# One line per example: left context id/word, right context id/word -> target id/word.
for (left_id, right_id), (target_id,) in zip(batch, labels):
    print(left_id, id_to_word[left_id],
          right_id, id_to_word[right_id],
          '->', target_id, id_to_word[target_id])
data_index

1348 数学 501 利用 -> 9 是
9 是 237319 符号语言 -> 501 利用
501 利用 141 研究 -> 237319 符号语言
237319 符号语言 894 数量 -> 141 研究
141 研究 5 、 -> 894 数量
894 数量 499 结构 -> 5 、
5 、 5 、 -> 499 结构
499 结构 331 变化 -> 5 、


34

In [8]:
def positional_encoding(inputs,
                        num_units,
                        zero_pad=True,
                        scale=True,
                        scope="positional_encoding",
                        reuse=None):
    '''Sinusoidal positional encoding.

    Args:
      inputs: A 3d Tensor with shape (B, T, D). Only its batch size B and its
        static sequence length T are used; the values are never read.
      num_units: Output dimensionality.
      zero_pad: Boolean. If True, the encoding of position 0 is all zeros.
      scale: Boolean. If True, the output is multiplied by sqrt(num_units).
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A float32 Tensor with shape (B, T, num_units).
    '''

    B, T, _ = inputs.get_shape().as_list()
    with tf.variable_scope(scope, reuse=reuse):
        # One row of position indices [0, T) per batch example. The indices
        # must be tiled along the batch dimension B; tiling by the last
        # (embedding) dimension only works when it happens to equal B.
        batch_dim = B if B is not None else tf.shape(inputs)[0]
        position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [batch_dim, 1])

        # First part of the PE function: the argument of sin / cos.
        # NOTE(review): every column i gets its own frequency
        # 10000^(2i/num_units); a sin/cos pair does not share one frequency.
        # Left unchanged so the encoding seen by existing checkpoints stays
        # the same -- confirm whether this is intended.
        position_enc = np.array([
            [pos / np.power(10000, 2.*i/num_units) for i in range(num_units)]
            for pos in range(T)])

        # Second part: sine on the even columns, cosine on the odd columns.
        position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # dim 2i
        position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # dim 2i+1

        # Convert to a (float64) tensor.
        lookup_table = tf.convert_to_tensor(position_enc)

        if zero_pad:
            # The zero row must have the table's dtype, otherwise tf.concat
            # rejects the float32 / float64 mix.
            lookup_table = tf.concat((tf.zeros(shape=[1, num_units],
                                               dtype=lookup_table.dtype),
                                      lookup_table[1:, :]), 0)
        outputs = tf.nn.embedding_lookup(lookup_table, position_ind)
        print("lookup_table:", lookup_table)
        print("position_ind:", position_ind)
        print("outputs:", outputs)

        if scale:
            outputs = outputs * num_units**0.5

        # Cast so the result can be added to float32 embeddings.
        return tf.cast(outputs, dtype=tf.float32)

In [9]:
batch_size = 100
# batch_size = 256
embedding_size = 100    # Dimension of the embedding vector.
skip_window = 5    # How many words to consider left and right.
num_skips = 2*skip_window    # Number of context words per target (the whole window).
num_sampled = 100    # Number of negative examples to sample.
# num_sampled = 128    # Number of negative examples to sample.
vocabulary_size = len(id_to_word)

# Fixed validation words (ids 280..290) whose nearest neighbours are printed
# during training. They are used only for displaying model quality and don't
# affect the calculation.
# valid_size = 16    # Random set of words to evaluate similarity on.
# valid_window = 100    # Only pick dev samples in the head of the distribution.
# valid_examples = np.random.choice(valid_window, valid_size, replace=False)
valid_examples = list(range(280, 291))
valid_size = len(valid_examples)

graph = tf.Graph()

with graph.as_default():

    # Input data: B = batch_size rows of W = num_skips context ids plus one
    # target id per row.
    with tf.name_scope('inputs'):
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size, num_skips])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # No explicit device placement is requested for the ops below.
#     with tf.device('/gpu:0'):
    # Look up embeddings for inputs.
    with tf.name_scope('embeddings'):
        embeddings = tf.Variable(
                tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        print("embeddings:", embeddings)
        # B,W -> B,W,D
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        print("embed:", embed)
        # Plain CBOW would average the context embeddings instead:
#             embed_context = tf.reduce_mean(embed, 1)

    with tf.name_scope("position"):
        # Context embeddings with the position inside the window added.
        embed_pos = embed + positional_encoding(embed, embedding_size, zero_pad=False)
        print("position embde:", embed_pos)

    with tf.name_scope('global-attention'):
        attention_size = embedding_size - 0
        # The attention weights need a zero-mean random initialisation. With
        # all-ones weights every hidden unit computes the same large positive
        # sum (dominated by the scaled positional encoding), which appears to
        # saturate tanh at 1.0 for every position. All scores are then equal,
        # the softmax is uniform (the recorded training log reports variance
        # 0.0) and no gradient flows through the attention path.
        # The order of the tf.Variable calls is kept so that the automatically
        # generated variable names still match existing checkpoints.
        attention_w = tf.Variable(
                tf.truncated_normal(
                        [2*embedding_size, attention_size],
                        stddev=1.0 / math.sqrt(2*embedding_size)))
        attention_b = tf.Variable(tf.zeros(shape=[attention_size], dtype=tf.float32))

        # B,1 -> B,1,D
        target_embed = tf.nn.embedding_lookup(embeddings, train_labels)
        # B,1,D -> B,W,D
        target_embed = tf.tile(target_embed, multiples=[1, num_skips, 1])
        # B,W,D ++ B,W,D -> B,W,2D
        embed_concat = tf.concat([embed_pos, target_embed], 2)
        print("embed_concat:", embed_concat)

        # B,W,2D * 2D,A -> B,W,A
        attention_matmul = tf.tanh(tf.tensordot(embed_concat, attention_w, axes=[[2], [0]]) + attention_b)
        print("attention_matmul:", attention_matmul)
        attention_w_a = tf.Variable(
                tf.truncated_normal(
                        [attention_size],
                        stddev=1.0 / math.sqrt(attention_size)))
        # B,W,A * A -> B,W ; softmax over the W context positions.
        attention = tf.nn.softmax(tf.tensordot(attention_matmul, attention_w_a, axes=[[2],[0]]))
        print("attention:", attention)

        # B,W,D * B,W,1 -> B,W,D -> B,D
        # The weighted sum uses the embeddings without the positional term.
        attention_context = tf.reduce_sum(tf.multiply(embed, tf.expand_dims(attention, -1)), axis=1)
        print("attention_context:", attention_context)

        # Attention statistics, averaged over the batch. The mean of a softmax
        # over W positions is always 1/W, so only the variance is informative:
        # 0 means the attention is uniform.
        attention_mean, attention_var = tf.nn.moments(attention, axes=[-1])
        attention_mean = tf.reduce_mean(attention_mean)
        attention_var = tf.reduce_mean(attention_var)
        print("attention_mean:", attention_mean)
        print("attention_var:", attention_var)

    # Construct the variables for the NCE loss
    with tf.name_scope('weights'):
        nce_weights = tf.Variable(
                tf.truncated_normal(
                        [vocabulary_size, embedding_size],
                        stddev=1.0 / math.sqrt(embedding_size)))
    with tf.name_scope('biases'):
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Compute the average NCE loss for the batch.
    # tf.nce_loss automatically draws a new sample of the negative labels each
    # time we evaluate the loss.
    # Explanation of the meaning of NCE loss:
    #     http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(
                tf.nn.nce_loss(
                        weights=nce_weights,
                        biases=nce_biases,
                        labels=train_labels,
#                         inputs=embed_context,
                        inputs=attention_context,
                        num_sampled=num_sampled,
                        num_classes=vocabulary_size))

    # Add the loss value as a scalar to summary.
    tf.summary.scalar('loss', loss)

    # Construct the SGD optimizer using a learning rate of 0.1.
    with tf.name_scope('optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)
#         optimizer = tf.train.AdamOptimizer(0.01).minimize(loss)

    # Compute the cosine similarity between the validation words and all embeddings.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm

    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
    print("valid_embeddings:", valid_embeddings)
    print("normalized_embeddings:", normalized_embeddings)
    print("similarity:", similarity)

    # Merge all summaries.
    merged = tf.summary.merge_all()

    # Add variable initializer.
    init = tf.global_variables_initializer()

    # Create a saver for the whole model and one for the word embeddings only.
    saver = tf.train.Saver()
    saver_embed = tf.train.Saver([embeddings])

embeddings: <tf.Variable 'embeddings/Variable:0' shape=(507260, 100) dtype=float32_ref>
embed: Tensor("embeddings/embedding_lookup:0", shape=(100, 10, 100), dtype=float32)
lookup_table: Tensor("position/positional_encoding/Const:0", shape=(10, 100), dtype=float64)
position_ind: Tensor("position/positional_encoding/Tile:0", shape=(100, 10), dtype=int32)
outputs: Tensor("position/positional_encoding/embedding_lookup:0", shape=(100, 10, 100), dtype=float64)
position embde: Tensor("position/add:0", shape=(100, 10, 100), dtype=float32)
embed_concat: Tensor("global-attention/concat:0", shape=(100, 10, 200), dtype=float32)
attention_matmul: Tensor("global-attention/Tanh:0", shape=(100, 10, 100), dtype=float32)
attention: Tensor("global-attention/Softmax:0", shape=(100, 10), dtype=float32)
attention_context: Tensor("global-attention/Sum:0", shape=(100, 100), dtype=float32)
attention_mean: Tensor("global-attention/Mean:0", shape=(), dtype=float32)
attention_var: Tensor("global-attention/Mean_1:

In [None]:
# Corpus offset at which generate_batch() resumes reading.
data_index = 782363369
num_steps = 20000001
# num_steps = 1

base_dir = "/home/renxinzhang/renxingzhang/chineseword2vec/logs/"
log_dir = base_dir+"010_P_AWE_Global_02/"
log_embed_dir = base_dir + "word_embeddings_1.2kw/"
log_result_dir = "/home/renxinzhang/renxingzhang/chineseword2vec/result/"

tfconfig = tf.ConfigProto()
tfconfig.gpu_options.allow_growth = True

# This cell resumes training, so a checkpoint must already exist. Fail early
# with a clear message instead of passing None to saver.restore().
checkpoint_path = tf.train.latest_checkpoint(log_dir)
if checkpoint_path is None:
    raise IOError("No checkpoint found in %s; run init.run() in a session "
                  "instead of restoring to start from scratch." % log_dir)

with tf.Session(graph=graph, config=tfconfig) as session:
    # Open a writer to write summaries.
    writer = tf.summary.FileWriter(log_dir, session.graph)

    # All variables are restored from the latest checkpoint.
    # For a fresh run use `init.run()` instead.
#     init.run()
    saver.restore(session, checkpoint_path)
#     saver_embed.restore(session, tf.train.latest_checkpoint(log_embed_dir))
    print('Initialized')

    # Running sums over the current 2000-step reporting interval.
    average_loss = 0
    avetage_attention = 0
    std_attention = 0
    # First step of this (resumed) run.
    start_index = 2000001
    for step in xrange(start_index, num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, skip_window=skip_window, num_skips=num_skips)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # Define metadata variable.
        run_metadata = tf.RunMetadata()

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
        # Also, evaluate the merged op to get all summaries from the returned "summary" variable.
        # Feed metadata variable to session for visualizing the graph in TensorBoard.
        _, summary, loss_val, attention_val, std_val = session.run(
                [optimizer, merged, loss, attention_mean, attention_var],
                feed_dict=feed_dict,
                run_metadata=run_metadata)
        average_loss += loss_val
        avetage_attention += attention_val
        std_attention += std_val

        # Add returned summaries to writer in each step.
        writer.add_summary(summary, step)
        # Add metadata to visualize the graph for the last run.
        if step == (num_steps - 1):
            writer.add_run_metadata(run_metadata, 'step%d' % step)

        if step % 2000 == 0:
            if step > 0 and step != start_index:
                average_loss /= 2000
                avetage_attention /= 2000
                std_attention /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step ', step, ': ', average_loss, ': ', data_index)
            average_loss = 0
            print('Average attention at step ', step, ': ', avetage_attention)
            print('Variance attention at step ', step, ': ', std_attention)
            avetage_attention = 0
            std_attention = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = id_to_word[valid_examples[i]]
                top_k = 8    # number of nearest neighbors
                # Index 0 is skipped: the most similar entry is the word itself.
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in xrange(top_k):
                    close_word = id_to_word[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)

            # Save the model for checkpoints.
            saver.save(session, os.path.join(log_dir, 'model.ckpt'), global_step=step)

        # Export the raw embedding matrix every million steps.
        if step % 1000000 == 0 and step != start_index:
            word2vec = embeddings.eval()
            print(word2vec.shape, type(word2vec))
            np.save(log_result_dir+"010_P_AWE_Global_02_"+str(step), word2vec)

    final_embeddings = normalized_embeddings.eval()

    # Write corresponding labels for the embeddings.
    metadata_path = os.path.join(log_dir, 'metadata.tsv')
    with open(metadata_path, 'w') as f:
        for i in xrange(vocabulary_size):
            f.write(id_to_word[i] + '\n')

    # Save the model for checkpoints.
    saver.save(session, os.path.join(log_dir, 'model.ckpt'), global_step=step)

    # Create a configuration for visualizing embeddings with the labels in TensorBoard.
    config = projector.ProjectorConfig()
    embedding_conf = config.embeddings.add()
    embedding_conf.tensor_name = embeddings.name
    # Point the projector at the label file written above; without this the
    # points in TensorBoard carry no word labels.
    embedding_conf.metadata_path = metadata_path
    projector.visualize_embeddings(writer, config)

writer.close()

In [11]:
Average loss at step  2002000 :  4.30323612314 :  783150179
Average attention at step  2002000 :  0.100000007104
Variance attention at step  2002000 :  0.0
Average loss at step  2004000 :  4.12583158082 :  783935879
Average attention at step  2004000 :  0.100000007093
Variance attention at step  2004000 :  0.0
Average loss at step  2006000 :  3.88449565303 :  784718625
Average attention at step  2006000 :  0.100000007071
Variance attention at step  2006000 :  0.0
Average loss at step  2008000 :  3.92502414954 :  785495410
Average attention at step  2008000 :  0.100000007071
Variance attention at step  2008000 :  0.0
Average loss at step  2010000 :  4.12631542498 :  786279787
Average attention at step  2010000 :  0.100000007063
Variance attention at step  2010000 :  0.0
Nearest to 音乐: 艺术, 文化, 歌曲, 戏剧, 流行音乐, 乐队, 表演, 个人,
Nearest to 地方: 区域, 地区, 市, 政府, 处, 当地, 中央, 机构,
Nearest to 社会: 文化, 经济, 教育, 政府, 性, 宗教, 人民, 者,
Nearest to 服务: 计划, 工作, 业务, 活动, 免费, 系统, 公共, 商业,
Nearest to 均: 皆, 都, 仍, 亦, 则, 虽然, 便, 由于,
Nearest to 型: 式, 系, 系统, 部分, 级, -, 结构, 采用,
Nearest to 学生: 教师, 人士, 学校, 者, 民众, 人员, 市民, 师生,
Nearest to 今: 现, 今属, 今日, 现今, 北, 治今, 原, 城,
Nearest to 受到: 受, 遭到, 遭, 得到, 造成, 遭受, 获, 引起,
Nearest to 事件: 案, 行为, 行动, 问题, 组织, 计划, 结果, 运动,
Nearest to 经济: 社会, 商业, 农业, 金融, 政府, 工业, 文化, 军事,
Average loss at step  2012000 :  4.23256733602 :  787063596
Average attention at step  2012000 :  0.100000007056
Variance attention at step  2012000 :  0.0
Average loss at step  2014000 :  4.22357171029 :  787849084
Average attention at step  2014000 :  0.100000007134
Variance attention at step  2014000 :  0.0
Average loss at step  2016000 :  4.03940198737 :  788628274
Average attention at step  2016000 :  0.100000007067
Variance attention at step  2016000 :  0.0
Average loss at step  2018000 :  3.84731678396 :  789402733
Average attention at step  2018000 :  0.100000007022
Variance attention at step  2018000 :  0.0
Average loss at step  2020000 :  4.0805553498 :  790185197
Average attention at step  2020000 :  0.100000007097
Variance attention at step  2020000 :  0.0
Nearest to 音乐: 艺术, 文化, 歌曲, 戏剧, 流行音乐, 乐队, 表演, 个人,
Nearest to 地方: 区域, 地区, 市, 政府, 处, 当地, 中央, 机构,
Nearest to 社会: 文化, 经济, 教育, 政府, 性, 宗教, 人民, 者,
Nearest to 服务: 计划, 工作, 业务, 活动, 免费, 系统, 公共, 商业,
Nearest to 均: 皆, 都, 仍, 亦, 则, 虽然, 便, 由于,
Nearest to 型: 式, 系, 系统, 部分, -, 级, 结构, 采用,
Nearest to 学生: 教师, 人士, 学校, 者, 民众, 人员, 市民, 师生,
Nearest to 今: 现, 今属, 今日, 现今, 北, 治今, 原, 城,
Nearest to 受到: 受, 遭到, 遭, 得到, 造成, 遭受, 获, 引起,
Nearest to 事件: 案, 行为, 行动, 问题, 组织, 计划, 结果, 运动,
Nearest to 经济: 社会, 商业, 农业, 金融, 工业, 政府, 文化, 军事,
Average loss at step  2022000 :  4.11212456042 :  790967666
Average attention at step  2022000 :  0.100000007123
Variance attention at step  2022000 :  0.0
Average loss at step  2024000 :  4.13663935685 :  791750837
Average attention at step  2024000 :  0.100000007048
Variance attention at step  2024000 :  0.0
Average loss at step  2026000 :  3.77246125358 :  792519885
Average attention at step  2026000 :  0.100000007164
Variance attention at step  2026000 :  0.0
Average loss at step  2028000 :  3.9095029453 :  793293320
Average attention at step  2028000 :  0.100000007186
Variance attention at step  2028000 :  0.0
Average loss at step  2030000 :  4.07582554579 :  794071244
Average attention at step  2030000 :  0.100000007074
Variance attention at step  2030000 :  0.0
Nearest to 音乐: 艺术, 文化, 歌曲, 戏剧, 流行音乐, 乐队, 表演, 个人,
Nearest to 地方: 区域, 地区, 市, 政府, 处, 当地, 中央, 机构,
Nearest to 社会: 文化, 经济, 教育, 政府, 性, 宗教, 人民, 者,
Nearest to 服务: 计划, 工作, 业务, 活动, 免费, 系统, 公共, 行动,
Nearest to 均: 皆, 都, 仍, 亦, 则, 虽然, 便, 由于,
Nearest to 型: 式, 系, 系统, 部分, 级, -, 结构, 采用,
Nearest to 学生: 教师, 人士, 学校, 者, 民众, 人员, 市民, 师生,
Nearest to 今: 现, 今属, 今日, 现今, 治今, 北, 原, 城,
Nearest to 受到: 受, 遭到, 遭, 得到, 造成, 遭受, 获, 引起,
Nearest to 事件: 案, 行为, 行动, 问题, 组织, 计划, 结果, 运动,
Nearest to 经济: 社会, 商业, 农业, 金融, 工业, 政府, 文化, 军事,
Average loss at step  2032000 :  4.07024439839 :  794849633
Average attention at step  2032000 :  0.100000007082
Variance attention at step  2032000 :  0.0
Average loss at step  2034000 :  4.18060521185 :  795630228
Average attention at step  2034000 :  0.100000007037
Variance attention at step  2034000 :  0.0
Average loss at step  2036000 :  3.8465884088 :  796408533
Average attention at step  2036000 :  0.100000007108
Variance attention at step  2036000 :  0.0
Average loss at step  2038000 :  4.0682031998 :  797190824
Average attention at step  2038000 :  0.100000006989
Variance attention at step  2038000 :  0.0
Average loss at step  2040000 :  4.26512009424 :  797983670
Average attention at step  2040000 :  0.100000007033
Variance attention at step  2040000 :  0.0
Nearest to 音乐: 艺术, 文化, 歌曲, 戏剧, 流行音乐, 乐队, 表演, 个人,
Nearest to 地方: 区域, 地区, 市, 政府, 处, 当地, 中央, 机构,
Nearest to 社会: 文化, 经济, 教育, 政府, 性, 宗教, 人民, 者,
Nearest to 服务: 计划, 工作, 业务, 活动, 免费, 系统, 公共, 行动,
Nearest to 均: 皆, 都, 仍, 亦, 则, 虽然, 便, 由于,
Nearest to 型: 式, 系, 系统, 部分, -, 级, 结构, 采用,
Nearest to 学生: 教师, 人士, 学校, 者, 民众, 人员, 师生, 市民,
Nearest to 今: 现, 今属, 今日, 现今, 治今, 北, 城, 原,
Nearest to 受到: 受, 遭到, 遭, 得到, 造成, 遭受, 获, 引起,
Nearest to 事件: 案, 行为, 行动, 问题, 组织, 计划, 结果, 运动,
Nearest to 经济: 社会, 商业, 农业, 金融, 工业, 政府, 文化, 军事,
Average loss at step  2042000 :  4.11729718107 :  798763124
Average attention at step  2042000 :  0.100000007033
Variance attention at step  2042000 :  0.0
Average loss at step  2044000 :  4.277883555 :  799548276
Average attention at step  2044000 :  0.100000006981
Variance attention at step  2044000 :  0.0
Average loss at step  2046000 :  4.05011437172 :  800329424
Average attention at step  2046000 :  0.100000007045
Variance attention at step  2046000 :  0.0
Average loss at step  2048000 :  4.10984226292 :  801113430
Average attention at step  2048000 :  0.100000007011
Variance attention at step  2048000 :  0.0
Average loss at step  2050000 :  3.8505967482 :  801880791
Average attention at step  2050000 :  0.100000006963
Variance attention at step  2050000 :  0.0
Nearest to 音乐: 艺术, 文化, 歌曲, 戏剧, 流行音乐, 乐队, 表演, 个人,
Nearest to 地方: 区域, 地区, 市, 政府, 处, 当地, 中央, 机构,
Nearest to 社会: 文化, 经济, 教育, 政府, 性, 宗教, 人民, 者,
Nearest to 服务: 计划, 工作, 业务, 活动, 免费, 系统, 公共, 行动,
Nearest to 均: 皆, 都, 仍, 亦, 则, 虽然, 便, 由于,
Nearest to 型: 式, 系, 系统, 部分, -, 级, 结构, 采用,
Nearest to 学生: 教师, 人士, 学校, 者, 民众, 人员, 师生, 市民,
Nearest to 今: 现, 今属, 今日, 现今, 治今, 北, 城, 原,
Nearest to 受到: 受, 遭到, 遭, 得到, 造成, 遭受, 获, 引起,
Nearest to 事件: 案, 行为, 行动, 问题, 组织, 计划, 结果, 运动,
Nearest to 经济: 社会, 商业, 农业, 金融, 工业, 政府, 文化, 军事,
Average loss at step  2052000 :  4.21242675209 :  802657144
Average attention at step  2052000 :  0.100000007063
Variance attention at step  2052000 :  0.0
Average loss at step  2054000 :  4.26584884423 :  803440003
Average attention at step  2054000 :  0.10000000719
Variance attention at step  2054000 :  0.0
Average loss at step  2056000 :  3.84894037619 :  804209453
Average attention at step  2056000 :  0.100000006899
Variance attention at step  2056000 :  0.0
Average loss at step  2058000 :  4.27817510617 :  804995637
Average attention at step  2058000 :  0.100000007112
Variance attention at step  2058000 :  0.0
Average loss at step  2060000 :  3.90618189186 :  805759554
Average attention at step  2060000 :  0.100000007052
Variance attention at step  2060000 :  0.0
Nearest to 音乐: 艺术, 文化, 歌曲, 戏剧, 流行音乐, 乐队, 表演, 个人,
Nearest to 地方: 区域, 地区, 市, 政府, 处, 当地, 中央, 一带,
Nearest to 社会: 文化, 经济, 教育, 政府, 性, 宗教, 人民, 者,
Nearest to 服务: 计划, 工作, 业务, 活动, 免费, 系统, 公共, 商业,
Nearest to 均: 皆, 都, 仍, 亦, 则, 虽然, 便, 由于,
Nearest to 型: 式, 系, 系统, 部分, -, 级, 结构, 采用,
Nearest to 学生: 教师, 人士, 学校, 者, 民众, 人员, 师生, 市民,
Nearest to 今: 现, 今属, 今日, 现今, 治今, 北, 城, 原,
Nearest to 受到: 受, 遭到, 遭, 得到, 造成, 遭受, 获, 引起,
Nearest to 事件: 案, 行为, 行动, 问题, 组织, 计划, 结果, 案件,
Nearest to 经济: 社会, 商业, 农业, 金融, 工业, 政府, 文化, 军事,