In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import sys
import argparse
import random
from tempfile import gettempdir
import zipfile

import numpy as np
import pandas as pd
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf
from tqdm import tqdm

import pickle

from tensorflow.contrib.tensorboard.plugins import projector

In [2]:
import os

# Make only CUDA device 0 visible to this process.
# Set the value to "" instead to hide every GPU.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

## Step 1 - Get Data

In [3]:
# Load the corpus and the vocabulary mappings. Only the first line of the
# corpus file is read; downstream code treats it as one long string of
# space-separated integer word ids.
# Files are opened with `with` so the handles are closed deterministically.
with open("data/zh_wiki_id_w_d") as corpus_file:
    zh_wiki_id = corpus_file.readline()

# NOTE(review): pickle.load can execute arbitrary code -- only load pickles
# that were produced by a trusted preprocessing run.
with open("data/word_to_id_w_d.pkl", "rb") as mapping_file:
    word_to_id = pickle.load(mapping_file)
with open("data/id_to_word_w_d.pkl", "rb") as mapping_file:
    id_to_word = pickle.load(mapping_file)

# zh_wiki_id = open("data/zh_wiki_id").readline()
# word_to_id = pickle.load(open("data/word_to_id.pkl", "rb"))
# id_to_word = pickle.load(open("data/id_to_word.pkl", "rb"))
# word_count = pickle.load(open("data/count.pkl", "rb"))

In [4]:
# Sanity check: sizes of both vocabulary mappings and the corpus length in characters.
# Expected: (507260, 507260, 862838467)
tuple(map(len, (id_to_word, word_to_id, zh_wiki_id)))

(507260, 507260, 862838467)

In [5]:
def getWord(data, num, data_index, max_token_len=6):
    """Read the next ``num`` integer ids from a space-separated id string.

    Only a window of ``num * (max_token_len + 1)`` characters is sliced out of
    ``data`` so that the (potentially huge) corpus string is never split as a
    whole. If fewer than ``num`` tokens remain after ``data_index`` the read
    restarts from the beginning of ``data``.

    Args:
        data: String of integer tokens separated by single spaces.
        num: Number of ids to read.
        data_index: Character offset at which reading starts. It is expected
            to point at the first character of a token.
        max_token_len: Maximum number of characters of a single token. A
            longer token would be cut off by the window. Defaults to 6, the
            width that was previously hard-coded.

    Returns:
        A tuple ``(ids, next_index)``: ``ids`` is a list of ``num`` ints and
        ``next_index`` is the offset just past the last token read and its
        one-character separator.

    Raises:
        ValueError: If ``num`` ids cannot be read even when starting at
            offset 0 (previously this recursed without bound).
    """
    start = data_index
    while True:
        window = data[start:start + num * (max_token_len + 1)]
        tokens = window.split()[:num]
        if len(tokens) == num:
            result = [int(token) for token in tokens]
            # Each token is followed by exactly one separator character.
            next_index = start + sum(len(token) + 1 for token in tokens)
            return result, next_index
        if start == 0:
            # Already reading from the beginning: wrapping again cannot help.
            raise ValueError(
                "data contains fewer than %d tokens within the scan window" % num)
        # Not enough tokens left before the end of data: wrap around.
        start = 0

In [6]:
def generate_batch(batch_size, skip_window, num_skips):
    """Build one CBOW batch (context word ids -> centre word id).

    Reads consecutive word ids from the module-level corpus string
    ``zh_wiki_id`` starting at the module-level offset ``data_index`` and
    slides a window of ``2 * skip_window + 1`` words over them.

    Args:
        batch_size: Number of (context, target) rows to produce.
        skip_window: Number of context words on each side of the target.
        num_skips: Number of context columns per row. Every row stores the
            complete window, so this must equal ``2 * skip_window``.

    Returns:
        ``(batch, labels)``: ``batch`` is an int32 array of shape
        ``(batch_size, num_skips)`` with the context word ids and ``labels``
        an int32 array of shape ``(batch_size, 1)`` with the centre word ids.

    Side effects:
        Updates the global ``data_index``.
    """
    global data_index
    assert batch_size % num_skips == 0
    # The whole context window is written into each row, so a smaller
    # num_skips would only fail later with a numpy shape error.
    assert num_skips == 2 * skip_window
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    assert batch_size >= span
    batch = np.ndarray(shape=(batch_size, num_skips), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    buffer = collections.deque(maxlen=span)  # pylint: disable=redefined-builtin

    result, data_index = getWord(zh_wiki_id, span, data_index)
    buffer.extend(result)

    # Window positions of the context words (everything except the centre).
    context_positions = [w for w in range(span) if w != skip_window]

    for i in range(batch_size):
        batch[i, :] = [buffer[position] for position in context_positions]
        labels[i, 0] = buffer[skip_window]
        # Slide the window by one word; the deque drops the oldest entry.
        result, data_index = getWord(zh_wiki_id, 1, data_index)
        buffer.append(result[0])
        if data_index > len(zh_wiki_id):
            # Ran past the end of the corpus: refill from the beginning.
            result, data_index = getWord(zh_wiki_id, span-1, 0)
            buffer.extend(result)
        if i == batch_size - span:
            # Always reached because batch_size >= span is asserted above.
            last_index = data_index

    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = last_index
    return batch, labels

In [7]:
# Smoke test: start at the beginning of the corpus and print a small batch
# as "left context, right context -> centre word" (ids with their words).
data_index = 0

batch, labels = generate_batch(batch_size=8, skip_window=1, num_skips=2*1)
for left_id, right_id, target_id in zip(batch[:8, 0], batch[:8, 1], labels[:8, 0]):
    print(left_id, id_to_word[left_id],
          right_id, id_to_word[right_id],
          '->', target_id, id_to_word[target_id])

1348 数学 501 利用 -> 9 是
9 是 237319 符号语言 -> 501 利用
501 利用 141 研究 -> 237319 符号语言
237319 符号语言 894 数量 -> 141 研究
141 研究 5 、 -> 894 数量
894 数量 499 结构 -> 5 、
5 、 5 、 -> 499 结构
499 结构 331 变化 -> 5 、


## Step 2 - Build & Train Network

In [8]:
# ---- Training hyper-parameters ----
batch_size = 100    # 0509 change (200 and 256 were used before)
embedding_size = 200    # Dimension of the embedding vector.
skip_window = 10    # How many words to consider left and right.
num_skips = skip_window * 2    # Context words per target (width of one input row).
num_sampled = 100    # Number of negative examples to sample (128 was also tried).
vocabulary_size = len(id_to_word)

# ---- Validation words ----
# A fixed set of word ids whose nearest neighbours are printed during
# training. These variables are used only for displaying model quality,
# they don't affect the calculation.
valid_examples = list(range(280, 291))
valid_size = len(valid_examples)

# Build the CBOW training graph plus the evaluation ops (nearest neighbours,
# pairwise word similarity and analogical reasoning).
graph = tf.Graph()

with graph.as_default():

    # Input data.
    with tf.name_scope('inputs'):
        # One row per target word holding the ids of its num_skips context words.
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size, num_skips])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
        # Word ids fed at evaluation time. The number of ids depends on the
        # op that is evaluated (top-k: any, similarity: 2, analogy: 4).
        test_dataset = tf.placeholder(tf.int32, shape=None)

    # Embedding table and context lookup, placed on the first GPU.
    with tf.device('/gpu:0'):
        with tf.name_scope('embeddings'):
            embeddings = tf.Variable(
                    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)
            # take mean of embeddings of context words for context embedding
            embed_context = tf.reduce_mean(embed, 1)

    with tf.device('/gpu:0'):
        # Construct the variables for the NCE loss
        with tf.name_scope('weights'):
            nce_weights = tf.Variable(
                    tf.truncated_normal(
                            [vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
        with tf.name_scope('biases'):
            nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Compute the average NCE loss for the batch.
    # tf.nce_loss automatically draws a new sample of the negative labels each
    # time we evaluate the loss.
    # Explanation of the meaning of NCE loss:
    #     http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(
                tf.nn.nce_loss(
                        weights=nce_weights,
                        biases=nce_biases,
                        labels=train_labels,
                        inputs=embed_context,
                        num_sampled=num_sampled,
                        num_classes=vocabulary_size))

    # Add the loss value as a scalar to summary.
    tf.summary.scalar('loss', loss)

    # Construct the SGD optimizer using a learning rate of 1.0.
    with tf.name_scope('optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(1).minimize(loss)

    # L2-normalise every embedding row so that dot products are cosine similarities.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm

    # Cosine similarity between the validation words and all embeddings.
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
    print(valid_embeddings, normalized_embeddings, similarity)

    ### top k same word
    print("---- top k same word")
    test_embeddings = tf.nn.embedding_lookup(normalized_embeddings, test_dataset)
    print("test_embeddings", test_embeddings)
    print("normalized_embeddings", normalized_embeddings)
    similarity_test = tf.matmul(test_embeddings, normalized_embeddings, transpose_b=True)
    print("similarity_test:", similarity_test)
    similarity_test_top_k_value, similarity_test_top_k_index  = tf.nn.top_k(similarity_test, k=5)
    print("similarity_test_top_k_value:", similarity_test_top_k_value)
    print("similarity_test_top_k_index:", similarity_test_top_k_index)

    ### word similarity (feed exactly two word ids)
    # The lookup is repeated on purpose: dropping it would renumber the
    # auto-generated op names printed below.
    test_embeddings = tf.nn.embedding_lookup(normalized_embeddings, test_dataset)
    test_emb1, test_emb2 = tf.split(test_embeddings, [1, 1], 0)
    similarity_smi_test = tf.matmul(test_emb1, test_emb2, transpose_b=True)

    ### analogical reasoning (feed exactly four word ids a, b, c, d)
    print("---- analogical reasoning")
    test_embeddings = tf.nn.embedding_lookup(embeddings, test_dataset)
    print("test_embeddings:", test_embeddings)
    test_emb1, test_emb2, test_emb3, test_emb4 = tf.split(test_embeddings, [1, 1, 1, 1], 0)
    print("test_embs:", test_emb1, test_emb2)
    print("test_embs:", test_emb3, test_emb4)
    # Predicted vector for d in "a : b :: c : d" is b - a + c. This matches
    # the analogy-only graph defined later in this notebook.
    test_result = tf.add(tf.subtract(test_emb2, test_emb1), test_emb3)
    analogical_product = tf.matmul(test_result, test_emb4, transpose_b=True)
    # The split tensors have a leading dimension of 1, so the vector norm has
    # to be reduced over the last axis (axis=0 would only give |x| per dimension).
    analogical_norm_1 = tf.sqrt(tf.reduce_sum(tf.square(test_result), axis=-1))
    analogical_norm_2 = tf.sqrt(tf.reduce_sum(tf.square(test_emb4), axis=-1))
    # Cosine similarity between the predicted vector and the expected word d.
    analogical_smi_test = analogical_product / (analogical_norm_1 * analogical_norm_2)
    print("test_result", test_result)
    print("normalized_embeddings", normalized_embeddings)
    # Cosine similarity between the predicted vector and every vocabulary word.
    test_result_norm = test_result / analogical_norm_1
    analogical_similarity = tf.squeeze(tf.matmul(test_result_norm, normalized_embeddings, transpose_b=True))
    print("analogical_similarity", analogical_similarity)
    analogical_top_k_value, analogical_top_k_index  = tf.nn.top_k(tf.squeeze(analogical_similarity), k=5)
    print("analogical_top_k_value:", analogical_top_k_value)
    print("analogical_top_k_index:", analogical_top_k_index)

    # Merge all summaries.
    merged = tf.summary.merge_all()

    # Add variable initializer.
    init = tf.global_variables_initializer()

    # Create a saver.
    saver = tf.train.Saver()

Tensor("embedding_lookup:0", shape=(11, 200), dtype=float32) Tensor("truediv:0", shape=(507260, 200), dtype=float32) Tensor("MatMul:0", shape=(11, 507260), dtype=float32)
---- top k same word
test_embeddings Tensor("embedding_lookup_1:0", dtype=float32)
normalized_embeddings Tensor("truediv:0", shape=(507260, 200), dtype=float32)
similarity_test: Tensor("MatMul_1:0", shape=(?, 507260), dtype=float32)
similarity_test_top_k_value: Tensor("TopKV2:0", shape=(?, 5), dtype=float32)
similarity_test_top_k_index: Tensor("TopKV2:1", shape=(?, 5), dtype=int32)
---- analogical reasoning
test_embeddings: Tensor("embedding_lookup_3:0", dtype=float32, device=/device:GPU:0)
test_embs: Tensor("split_1:0", dtype=float32) Tensor("split_1:1", dtype=float32)
test_embs: Tensor("split_1:2", dtype=float32) Tensor("split_1:3", dtype=float32)
test_result Tensor("Add:0", dtype=float32)
normalized_embeddings Tensor("truediv:0", shape=(507260, 200), dtype=float32)
analogical_similarity Tensor("Squeeze:0", dtype=fl

In [None]:
num_steps = 20000001
# Step of the checkpoint that is being resumed. It is reset to 0 below when
# log_dir does not contain a checkpoint yet.
start_index = 14640000
log_dir = "./log_002_best_baseline/"

loss_report_every = 2000       # steps between average-loss reports
checkpoint_every = 10000       # steps between neighbour dumps + checkpoints
export_every = 1000000         # steps between raw embedding exports

tfconfig = tf.ConfigProto()
tfconfig.gpu_options.allow_growth = True

with tf.Session(graph=graph, config=tfconfig) as session:
    # Open a writer to write summaries.
    writer = tf.summary.FileWriter(log_dir, session.graph)
    try:
        # Resume from the latest checkpoint when there is one; otherwise
        # initialise the variables and start from scratch.
        # (latest_checkpoint returns None when nothing has been saved yet.)
        checkpoint_path = tf.train.latest_checkpoint(log_dir)
        if checkpoint_path is None:
            init.run()
            start_index = 0
        else:
            saver.restore(session, checkpoint_path)
        print('Initialized')
        # NOTE(review): the corpus offset `data_index` is not part of the
        # checkpoint, so a resumed run continues reading from wherever the
        # global currently points -- confirm this is intended.

        average_loss = 0
        for step in xrange(start_index, num_steps):
            batch_inputs, batch_labels = generate_batch(batch_size, skip_window=skip_window, num_skips=num_skips)
            feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

            # Define metadata variable.
            run_metadata = tf.RunMetadata()

            # We perform one update step by evaluating the optimizer op (including it
            # in the list of returned values for session.run()
            # Also, evaluate the merged op to get all summaries from the returned "summary" variable.
            # Feed metadata variable to session for visualizing the graph in TensorBoard.
            _, summary, loss_val = session.run(
                    [optimizer, merged, loss],
                    feed_dict=feed_dict,
                    run_metadata=run_metadata)
            average_loss += loss_val

            # Add returned summaries to writer in each step.
            writer.add_summary(summary, step)
            # Add metadata to visualize the graph for the last run.
            if step == (num_steps - 1):
                writer.add_run_metadata(run_metadata, 'step%d' % step)

            if step % loss_report_every == 0:
                # On the very first step only a single loss value has been
                # accumulated, so it is reported without averaging.
                if step > 0 and step != start_index:
                    average_loss /= loss_report_every
                # The average loss is an estimate of the loss over the last 2000 batches.
                print('Average loss at step ', step, ': ', average_loss, ":", data_index)
                average_loss = 0

            # Note that this is expensive (~20% slowdown if computed every 500 steps)
            if step % checkpoint_every == 0:
                sim = similarity.eval()
                for i in xrange(valid_size):
                    valid_word = id_to_word[valid_examples[i]]
                    top_k = 8    # number of nearest neighbors
                    # Position 0 of the ranking is skipped (the word itself).
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log_str = 'Nearest to %s:' % valid_word
                    for k in xrange(top_k):
                        close_word = id_to_word[nearest[k]]
                        log_str = '%s %s,' % (log_str, close_word)
                    print(log_str)

                # Save the model for checkpoints.
                saver.save(session, os.path.join(log_dir, 'model.ckpt'), global_step=step)

            if step % export_every == 0 and step != start_index:
                word2vec = embeddings.eval()
                print(word2vec.shape, type(word2vec))
                np.save("result/002#word_embedding_best_"+str(step), word2vec)

        final_embeddings = normalized_embeddings.eval()

        # Write corresponding labels for the embeddings.
        with open(os.path.join(log_dir, 'metadata.tsv'), 'w') as f:
            for i in xrange(vocabulary_size):
                f.write(id_to_word[i] + '\n')

        # Save the model for checkpoints.
        saver.save(session, os.path.join(log_dir, 'model.ckpt'), global_step=step)

        # Create a configuration for visualizing embeddings with the labels in TensorBoard.
        config = projector.ProjectorConfig()
        embedding_conf = config.embeddings.add()
        embedding_conf.tensor_name = embeddings.name
        embedding_conf.metadata_path = os.path.join(log_dir, 'metadata.tsv')
        projector.visualize_embeddings(writer, config)
    finally:
        # Flush and close the event file even when training is interrupted.
        writer.close()

In [None]:
# Restore the latest checkpoint from log_dir and dump the raw
# (un-normalised) embedding matrix as a .npy file.
log_dir = "./log_002_tmp/"

tfconfig = tf.ConfigProto()
tfconfig.gpu_options.allow_growth = True

with tf.Session(graph=graph, config=tfconfig) as session:
    latest_checkpoint = tf.train.latest_checkpoint(log_dir)
    saver.restore(session, latest_checkpoint)
    print('Initialized')
    word2vec = session.run(embeddings)
    print(word2vec.shape, type(word2vec))
    np.save("result/002#word_embedding_win5", word2vec)

## Step 3 - Evaluation - wordsim

In [None]:
# Load the wordsim-240 benchmark: every line is split on tabs into a list of
# fields (the evaluation below unpacks them as word1, word2, score).
with open("./data/240.txt") as f:
    wordsim_240 = [row.strip().split("\t") for row in f]
wordsim_240[:5]

In [None]:
# Load the wordsim-297 benchmark: every line is split on tabs into a list of
# fields (the evaluation below unpacks them as word1, word2, score).
with open("./data/297.txt") as f:
    wordsim_297 = [row.strip().split("\t") for row in f]
wordsim_297[:5]

In [None]:
# Score every wordsim-240 pair with the restored model and print the five
# nearest neighbours of both words. Results are collected in context_lt as
# [word1, word2, predicted similarity * 10, human score].
log_dir = "./log_002_baseline_cbow/"

tfconfig = tf.ConfigProto()
tfconfig.gpu_options.allow_growth = True

with tf.Session(graph=graph, config=tfconfig) as session:
    saver.restore(session, tf.train.latest_checkpoint(log_dir))
    print('Initialized')
    context_lt = []
    for item1, item2, score in wordsim_240:
        # Pairs containing an out-of-vocabulary word are reported and skipped.
        if item1 not in word_to_id:
            print(item1, item2)
            continue
        if item2 not in word_to_id:
            print(item2)
            continue
        pair_ids = [word_to_id[item1], word_to_id[item2]]
        sim = similarity_smi_test.eval({test_dataset: pair_ids})
        predicted = sim[0][0]*10
        context_lt.append([item1, item2, predicted, float(score)])
        print()
        print(item1, item2, predicted, score)
        # Five most similar vocabulary words for each word of the pair.
        for word_id in pair_ids:
            index, value = session.run(
                [similarity_test_top_k_index, similarity_test_top_k_value],
                feed_dict={test_dataset: [word_id]})
            for i, v in zip(index[0], value[0]):
                print(id_to_word[i], v, end=",")
            print()

In [None]:
# Score every wordsim-297 pair with the restored model. Results are collected
# in context_lt as [word1, word2, predicted similarity, human score].
log_dir = "./log_002_baseline_cbow/"

tfconfig = tf.ConfigProto()
tfconfig.gpu_options.allow_growth = True

with tf.Session(graph=graph, config=tfconfig) as session:
    saver.restore(session, tf.train.latest_checkpoint(log_dir))
    print('Initialized')
    context_lt = []
    for item1, item2, score in wordsim_297:
        # Pairs containing an out-of-vocabulary word are reported and skipped.
        if item1 not in word_to_id:
            print(item1, item2)
            continue
        if item2 not in word_to_id:
            print(item2)
            continue
        pair_ids = [word_to_id[item1], word_to_id[item2]]
        sim = similarity_smi_test.eval({test_dataset: pair_ids})
        context_lt.append([item1, item2, sim[0][0], float(score)])
#         print(item1, item2, sim[0][0], score)

In [None]:
# word_count['黄瓜']

In [None]:
# Spearman rank correlation between the predicted similarity (pscore) and the
# human judgement (score) of the pairs collected in context_lt.
context_pd = pd.DataFrame(context_lt, columns=['a', 'b', 'pscore', 'score'])
context_pd['pscore'] = [float(line) for line in context_pd['pscore'].values]
context_pd['score'] = [float(line) for line in context_pd['score'].values]
# Correlate the numeric columns only: pandas >= 2.0 no longer drops the string
# columns 'a' and 'b' silently and raises when it meets them. On older pandas
# versions the result is the same 2x2 matrix as before.
context_pd[['pscore', 'score']].corr("spearman")

In [None]:
# Same correlation as above but with the predicted similarity scaled by 5
# (Spearman correlation is rank based, so the scaling only affects the values
# stored in context_pd, not the coefficient).
context_pd = pd.DataFrame(context_lt, columns=['a', 'b', 'pscore', 'score'])
context_pd['pscore'] = [float(line*5) for line in context_pd['pscore'].values]
context_pd['score'] = [float(line) for line in context_pd['score'].values]
# Correlate the numeric columns only: pandas >= 2.0 no longer drops the string
# columns 'a' and 'b' silently and raises when it meets them. On older pandas
# versions the result is the same 2x2 matrix as before.
context_pd[['pscore', 'score']].corr("spearman")

In [None]:
# Persist the current context_pd frame without the index column.
# NOTE(review): the file name presumably encodes the number of scored pairs
# and the Spearman coefficient of that run -- confirm before reusing it.
context_pd.to_csv(
    "result/002#wordsim-296-0.547964.csv",
    index=False,
)

In [None]:
# Persist the current context_pd frame without the index column.
# NOTE(review): context_pd holds whichever wordsim evaluation ran last; make
# sure the wordsim-240 cells were executed before writing this file.
context_pd.to_csv(
    "result/002#wordsim-240-0.462907.csv",
    index=False,
)

## Step 4 - Evaluation - analogy


In [None]:
# batch_size = 100
# # batch_size = 256
# skip_window = 5    # How many words to consider left and right.
# num_skips = 2*skip_window    # How many times to reuse an input to generate a label.
# num_sampled = 100    # Number of negative examples to sample.
# # num_sampled = 128    # Number of negative examples to sample.

# Rebuild `graph` as a small inference-only graph that contains just the
# embedding table and the analogical-reasoning ops.
# NOTE(review): embedding_size is 100 here but 200 in the training graph
# earlier in this notebook; a checkpoint can presumably only be restored when
# the sizes match -- confirm which checkpoint this graph is meant for.
embedding_size = 100    # Dimension of the embedding vector.
vocabulary_size = len(id_to_word)

graph = tf.Graph()

with graph.as_default():

    # Input data: the four word ids (a, b, c, d) of one analogy question.
    with tf.name_scope('inputs'):
        test_dataset = tf.placeholder(tf.int32, shape=None)
        
    # Embedding table; the random values are placeholders until a checkpoint is restored.
    with tf.name_scope('embeddings'):
        embeddings = tf.Variable(
                tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

    # L2-normalise every embedding row so that dot products are cosine similarities.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    
    ### analogical reasoning
    print("---- analogical reasoning")
    test_embeddings = tf.nn.embedding_lookup(embeddings, test_dataset)
    print("test_embeddings:", test_embeddings)
    # Split the looked-up rows into one tensor per word (leading dimension 1).
    test_emb1, test_emb2, test_emb3, test_emb4 = tf.split(test_embeddings, [1, 1, 1, 1], 0)
    print("test_embs:", test_emb1, test_emb2)
    print("test_embs:", test_emb3, test_emb4)
#     test_result_tmp = tf.subtract(test_emb1, test_emb2)
    # Predicted vector for the fourth word: b - a + c.
    test_result = tf.add(tf.subtract(test_emb2, test_emb1), test_emb3)
#     assert_sum_1_shape = tf.shape(test_result)
#     assert_sum_1 = tf.reduce_sum(test_result, axis=-1)
    # Cosine similarity between the predicted vector and the expected word d.
    analogical_product = tf.matmul(test_result, test_emb4, transpose_b=True)
    analogical_norm_1 = tf.sqrt(tf.reduce_sum(tf.square(test_result), axis=-1))
    analogical_norm_2 = tf.sqrt(tf.reduce_sum(tf.square(test_emb4), axis=-1))
    analogical_smi_test = analogical_product / (analogical_norm_1 * analogical_norm_2)
    print("test_result", test_result)
    print("normalized_embeddings", normalized_embeddings)
#     analogical_similarity = tf.squeeze(tf.matmul(tf.reshape(test_result, [1,-1]), normalized_embeddings, transpose_b=True))
    # Cosine similarity between the predicted vector and every vocabulary
    # word, followed by the five best matches.
    test_result_norm = test_result / analogical_norm_1
    analogical_similarity = tf.squeeze(tf.matmul(test_result_norm, normalized_embeddings, transpose_b=True))
    print("analogical_similarity", analogical_similarity)
    analogical_top_k_value, analogical_top_k_index  = tf.nn.top_k(tf.squeeze(analogical_similarity), k=5)
    print("analogical_top_k_value:", analogical_top_k_value)
    print("analogical_top_k_index:", analogical_top_k_index)

    # Merge all summaries (no summary op is defined in this graph).
    merged = tf.summary.merge_all()

    # Add variable initializer.
    init = tf.global_variables_initializer()

    # Create a saver; the only variable in this graph is `embeddings`.
    saver = tf.train.Saver()

In [None]:
# Restore the latest checkpoint from log_dir and dump the raw
# (un-normalised) embedding matrix as a .npy file.
log_dir = "./log_002_tmp/"

tfconfig = tf.ConfigProto()
tfconfig.gpu_options.allow_growth = True

with tf.Session(graph=graph, config=tfconfig) as session:
    latest_checkpoint = tf.train.latest_checkpoint(log_dir)
    saver.restore(session, latest_checkpoint)
    print('Initialized')
    word2vec = session.run(embeddings)
    print(word2vec.shape, type(word2vec))
    np.save("result/002#word_embedding_win20", word2vec)

In [None]:
# Load the analogy benchmark. Lines whose first field is ":" are section
# headers; every other non-empty line is one analogy question.
with open("./data/analogy-1127.txt") as f:
    analogy_rows = [line.strip().split() for line in f]

# analogy_1127 keeps every question (headers removed); the three lists below
# receive the questions of the first, second and third section respectively.
analogy_1127 = []
analogy_1127_capital = []
analogy_1127_state = []
analogy_1127_family = []
section_rows = {
    1: analogy_1127_capital,
    2: analogy_1127_state,
    3: analogy_1127_family,
}

# Build new lists instead of calling analogy_1127.remove() while iterating:
# removing during iteration made the loop skip the row right after every
# header, so the first question of each section was never categorised.
flag = 0
for item in analogy_rows:
    if not item:
        # Blank line: nothing to categorise (item[0] would raise IndexError).
        continue
    if item[0] == ":":
        flag += 1
        continue
    analogy_1127.append(item)
    if flag in section_rows:
        section_rows[flag].append(item)
analogy_1127_capital[:5], analogy_1127_state[:5], analogy_1127_family[:5]

In [None]:
# Show the first analogy row (prints nothing when the list is empty).
for first_row in analogy_1127[:1]:
    print(first_row)

In [None]:
# Evaluate the analogy benchmark with the restored model. For every question
# the cosine similarity between the predicted vector and the expected word is
# stored in context_lt together with the five nearest vocabulary words.
log_dir = "./log_002_baseline_cbow/"

tfconfig = tf.ConfigProto()
tfconfig.gpu_options.allow_growth = True

with tf.Session(graph=graph, config=tfconfig) as session:
    saver.restore(session, tf.train.latest_checkpoint(log_dir))
    print('Initialized')
    context_lt = []
    count = 0
    for question in tqdm(analogy_1127):
        item1, item2, item3, item4 = question
        # Questions containing an out-of-vocabulary word are reported and skipped.
        if not all(word in word_to_id for word in question):
            print(item1, item2, item3, item4)
            continue
        feed = {test_dataset: [word_to_id[word] for word in question]}
        sim = analogical_smi_test.eval(feed)
        value, index = session.run(
            [analogical_top_k_value, analogical_top_k_index], feed_dict=feed)

        # "<word><similarity>" strings of the five nearest vocabulary words.
        sim_item = [id_to_word[i] + str(v) for i, v in zip(index, value)]
        # Counted as correct when the expected word scores at least as high
        # as the second-best candidate.
        if sim[0][0] >= value[1]:
            count += 1
        context_lt.append([item1, item2, item3, item4, sim[0][0]] + sim_item)
    # The denominator is the full question list, including skipped questions.
    print("accuracy:", count/len(analogy_1127)*100.0, "%")

In [None]:
# Persist the analogy evaluation rows collected in context_lt (no index column).
pd.DataFrame(context_lt).to_csv(
    "result/002#analogy_1127_0.0907.csv",
    index=False,
)