In [1]:
%pylab inline
from IPython.display import Image, display

import tensorflow as tf
sess = tf.InteractiveSession() 

Populating the interactive namespace from numpy and matplotlib


## Word2Vec

# Task


* Represent a word as a vector
  - Two related words should have two vectors close by.
* Data. 
  - 全唐诗 + 全宋词

## Data

In [2]:
words = open('./data/poem.txt').read().replace('。', '').replace('，', '').replace('\n', '')
print("Total chars = ", len(words))
words_as_set = set(words)
print("Unique chars = ", len(words_as_set))
id_to_word = sorted(words_as_set)
word_to_id = {w: i for i, w in enumerate(id_to_word)}
data = [word_to_id[w] for w in words]
print(words[100:120])
print(data[100:120])

Total chars =  3987351
Unique chars =  7955
惊雁落虚弦啼猿悲急箭阅赏诚多美于兹乃忘倦
[2066, 7208, 5632, 5795, 1858, 954, 3918, 2056, 1981, 4756, 7108, 6383, 6217, 1257, 5122, 171, 492, 141, 1939, 381]


In [3]:
def skipgram_generator(window=8):
    curr = 0
    while True:
        curr %= len(data)
        x = data[curr]
        left = data[max(0, curr - window):curr]
        right = data[curr+1:x+window]
        for y in left + right:
            yield (x, y)
        curr += 1
        
skipgram = skipgram_generator()

for _ in range(10):
    x, y = next(skipgram)
    print(x, y)


4562 1707
4562 7209
4562 1746
4562 1472
4562 552
4562 6301
4562 1242
4562 4228
4562 1565
4562 5013


In [4]:
def get_batch(batchsize=128):
    xs, ys = [], []
    for _ in range(batchsize):
        (x, y) = next(skipgram)
        xs += [x]
        ys += [y]
    return xs, ys

for _ in range(10):
    print(get_batch(4))

([4562, 4562, 4562, 4562], [3175, 689, 1532, 6413])
([4562, 4562, 4562, 4562], [4543, 1497, 4224, 7213])
([4562, 4562, 4562, 4562], [7458, 6659, 5750, 6730])
([4562, 4562, 4562, 4562], [2386, 3229, 7408, 6101])
([4562, 4562, 4562, 4562], [6663, 533, 5795, 173])
([4562, 4562, 4562, 4562], [2601, 7187, 1563, 7128])
([4562, 4562, 4562, 4562], [7396, 3717, 550, 5013])
([4562, 4562, 4562, 4562], [4139, 1608, 1820, 5099])
([4562, 4562, 4562, 4562], [2748, 630, 1636, 2553])
([4562, 4562, 4562, 4562], [5201, 7524, 6608, 3938])


# Model

In [5]:
NDIMS=128
NWORDS=len(id_to_word)
embedding = tf.Variable(tf.random_uniform([NWORDS, NDIMS], -0.02, 0.02))

In [6]:
inputs = tf.placeholder(tf.int64)
targets = tf.placeholder(tf.int64)

def model():
    x_emb = tf.nn.embedding_lookup(embedding, inputs)
    y_emb = tf.nn.embedding_lookup(embedding, targets)
    scores = tf.reduce_sum(x_emb * y_emb, [1])
    probs = tf.sigmoid(scores)
    logp = tf.log(probs)
    mean_logp = tf.reduce_mean(logp)
    return -mean_logp


In [7]:
loss = model()

tf.initialize_all_variables().run()
for _ in range(3):
    xs, ys = get_batch()
    print("loss = ", sess.run([loss], {inputs:xs, targets:ys}))

loss =  [0.69313043]
loss =  [0.69330263]
loss =  [0.69310367]


In [8]:
global_step = tf.Variable(0, trainable=False)

def train(learning_rate=1.0):
    loss = model()
    vars = tf.trainable_variables()
    grads = tf.gradients(loss, vars)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train_op = optimizer.apply_gradients(zip(grads, vars), global_step=global_step)
    return loss, train_op

loss, train_op = train()


In [9]:
tf.initialize_all_variables().run()
for steps in range(2000):
    xs, ys = get_batch(batchsize=1024)
    l, _ = sess.run([loss, train_op], feed_dict={inputs:xs, targets:ys})
    if steps % 1000 == 0:
        print('step %d: %.4f' % (steps, l))

step 0: 0.6931
step 1000: 0.6924


In [10]:
saver = tf.train.Saver(tf.all_variables())
saver.save(sess, './data/w2v_params', global_step=global_step.eval())

'./data/w2v_params-2000'

## Nearest neighbors

In [11]:
word_ids = tf.placeholder(tf.int64)
norm_embs = tf.nn.l2_normalize(embedding, 1)
word_embs = tf.nn.embedding_lookup(norm_embs, word_ids)
dist = tf.matmul(word_embs, norm_embs, transpose_b=True)
topk = tf.nn.top_k(dist, k=10)

def word_to_ids(words):
    ids = []
    for w in words:
        if w in word_to_id:
            ids += [word_to_id[w]]
        else:
            ids += [word_to_id['。']]
    return ids

def nearby(words):
    dist, ids = sess.run(topk, feed_dict={word_ids:word_to_ids(words)})
    for (p, r) in zip(dist, ids):
        print([id_to_word[w] for w in r])

In [12]:
tf.initialize_all_variables().run()
nearby('千里冰封万里雪飘')

['千', '霰', '瘵', '龃', '琫', '箫', '饪', '髫', '蒜', '鹉']
['里', '沌', '具', '镆', '宦', '韵', '畤', '玛', '辩', '鞙']
['冰', '畛', '榞', '酱', '怿', '蒻', '饘', '睫', '厦', '鉟']
['封', '鲤', '袓', '侬', '罕', '墝', '喙', '陨', '呱', '渑']
['万', '譬', '假', '燑', '贽', '荦', '晕', '峣', '鍧', '仕']
['里', '沌', '具', '镆', '宦', '韵', '畤', '玛', '辩', '鞙']
['雪', '唼', '悺', '筼', '冽', '亚', '臀', '岚', '递', '编']
['飘', '櫁', '矮', '潞', '呼', '稙', '绎', '自', '枮', '争']


In [13]:
saver.restore(sess, './data/w2v_params-100000')
nearby('千里冰封万里雪飘')

['千', '玉', '不', '香', '一', '里', '天', '高', '野', '云']
['里', '风', '日', '不', '长', '有', '见', '高', '云', '为']
['冰', '雁', '天', '不', '无', '长', '珠', '秋', '还', '为']
['封', '秋', '王', '有', '空', '风', '鸟', '水', '一', '马']
['万', '人', '风', '一', '不', '云', '香', '飞', '上', '何']
['里', '风', '日', '不', '长', '有', '见', '高', '云', '为']
['雪', '风', '霜', '上', '长', '不', '中', '飞', '一', '露']
['飘', '飞', '高', '阙', '道', '人', '不', '出', '衣', '风']


## Analogy

In [14]:
word_ids = tf.placeholder(tf.int64, shape=[3])
norm_embs = tf.nn.l2_normalize(embedding, 1)
word_embs = tf.nn.embedding_lookup(norm_embs, word_ids)
target = tf.expand_dims(word_embs[1, :] - word_embs[0, :] + word_embs[2, :], 0)
dist = tf.matmul(target, norm_embs, transpose_b=True)
topk = tf.nn.top_k(dist, k=10)

def analogy(a, b, x):
    dist, ids = sess.run(topk, feed_dict={word_ids:word_to_ids([a, b, x])})
    for (p, r) in zip(dist, ids):
        print([id_to_word[w] for w in r])

In [15]:
analogy('上', '下', '左')

['左', '下', '何', '枝', '疏', '无', '见', '玉', '烟', '临']
