In [1]:
%pylab inline
from IPython.display import Image, display

import tensorflow as tf
sess = tf.InteractiveSession() 

Populating the interactive namespace from numpy and matplotlib


## Word2Vec

# Task


* Represent a word as a vector
  - Two related words should have two vectors close by.
* Data. 
  - 100MB wikipedia. Available from http://http://mattmahoney.net/dc/text8.zip
  - 全唐诗。

## Data

In [2]:
words = open('./data/sc.utf8.cleaned.txt').read().replace('\n', '').replace(' ', '')
print("Total chars = ", len(words))
words_as_set = set(words)
print("Unique chars = ", len(words_as_set))
id_to_word = sorted(words_as_set)
word_to_id = {w: i for i, w in enumerate(id_to_word)}
data = [word_to_id[w] for w in words]
print(words[2005:2017])
print(data[2005:2017])

Total chars =  1635832
Unique chars =  5853
直恐好风光，尽随伊归去。
[3245, 1581, 1019, 5487, 363, 5846, 1237, 5337, 189, 1473, 606, 38]


In [3]:
def skipgram_generator(window=8):
    curr = 0
    while True:
        curr %= len(data)
        x = data[curr]
        left = data[max(0, curr - window):curr]
        right = data[curr+1:x+window]
        for y in left + right:
            yield (x, y)
        curr += 1
        
skipgram = skipgram_generator()

for _ in range(10):
    x, y = next(skipgram)
    print(x, y)


5241 1527
5241 1144
5241 1259
5241 5846
5241 1259
5241 865
5241 2732
5241 1524
5241 1021
5241 5778


In [4]:
def get_batch(batchsize=128):
    xs, ys = [], []
    for _ in range(batchsize):
        (x, y) = next(skipgram)
        xs += [x]
        ys += [y]
    return xs, ys

for _ in range(10):
    print(get_batch(4))

([5241, 5241, 5241, 5241], [3600, 38, 343, 1725])
([5241, 5241, 5241, 5241], [842, 5407, 656, 2732])
([5241, 5241, 5241, 5241], [1445, 38, 4879, 2336])
([5241, 5241, 5241, 5241], [606, 4934, 2192, 38])
([5241, 5241, 5241, 5241], [4079, 4159, 5545, 790])
([5241, 5241, 5241, 5241], [4939, 124, 5268, 38])
([5241, 5241, 5241, 5241], [5268, 58, 2705, 965])
([5241, 5241, 5241, 5241], [2417, 59, 5176, 38])
([5241, 5241, 5241, 5241], [465, 2192, 1224, 863])
([5241, 5241, 5241, 5241], [2523, 150, 4447, 38])


# Model

In [5]:
NDIMS=128
NWORDS=len(id_to_word)
embedding = tf.Variable(tf.random_uniform([NWORDS, NDIMS], -0.02, 0.02))

In [6]:
inputs = tf.placeholder(tf.int64)
targets = tf.placeholder(tf.int64)

def model():
    x_emb = tf.nn.embedding_lookup(embedding, inputs)
    y_emb = tf.nn.embedding_lookup(embedding, targets)
    scores = tf.reduce_sum(x_emb * y_emb, [1])
    probs = tf.sigmoid(scores)
    logp = tf.log(probs)
    mean_logp = tf.reduce_mean(logp)
    return -mean_logp


In [7]:
loss = model()

tf.initialize_all_variables().run()
for _ in range(3):
    xs, ys = get_batch()
    print("loss = ", sess.run([loss], {inputs:xs, targets:ys}))

loss =  [0.69295943]
loss =  [0.69311172]
loss =  [0.69327652]


In [8]:
global_step = tf.Variable(0, trainable=False)

def train(learning_rate=1.0):
    loss = model()
    vars = tf.trainable_variables()
    grads = tf.gradients(loss, vars)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train_op = optimizer.apply_gradients(zip(grads, vars), global_step=global_step)
    return loss, train_op

loss, train_op = train()


In [9]:
tf.initialize_all_variables().run()
for steps in range(10000):
    xs, ys = get_batch(batchsize=1024)
    l, _ = sess.run([loss, train_op], feed_dict={inputs:xs, targets:ys})
    if steps % 1000 == 0:
        print('step %d: %.4f' % (steps, l))

step 0: 0.6930


In [10]:
saver = tf.train.Saver(tf.all_variables())
saver.save(sess, './w2v_params', global_step=global_step.eval())

'./w2v_params-1000'

## Nearest neighbors

In [11]:
word_ids = tf.placeholder(tf.int64)
norm_embs = tf.nn.l2_normalize(embedding, 1)
word_embs = tf.nn.embedding_lookup(norm_embs, word_ids)
dist = tf.matmul(word_embs, norm_embs, transpose_b=True)
topk = tf.nn.top_k(dist, k=10)

def word_to_ids(words):
    ids = []
    for w in words:
        if w in word_to_id:
            ids += [word_to_id[w]]
        else:
            ids += [word_to_id['。']]
    return ids

def nearby(words):
    dist, ids = sess.run(topk, feed_dict={word_ids:word_to_ids(words)})
    for (p, r) in zip(dist, ids):
        print([id_to_word[w] for w in r])

In [12]:
tf.initialize_all_variables().run()
nearby('千里冰封万里雪飘')

['千', '揎', '沾', '觉', '剜', '疣', '痣', '辋', '缑', '学']
['里', '括', '狮', '帻', '匿', '攸', '醱', '呖', '犭', '戚']
['冰', '倏', '阻', '胪', '摴', '阙', '籀', '峙', '錝', '淆']
['封', '惰', '圳', '犒', '浚', '蚓', '产', '纪', '粪', '胜']
['万', '鵷', '共', '枥', '皑', '峥', '汲', '姿', '佛', '补']
['里', '括', '狮', '帻', '匿', '攸', '醱', '呖', '犭', '戚']
['雪', '堞', '鹠', '铒', '崇', '缵', '乌', '绥', '艗', '邺']
['飘', '咨', '粱', '佚', '昏', '俞', '蕤', '酒', '岨', '鼯']


In [13]:
saver.restore(sess, '../w2v/w2v_params-100000')
nearby('千里冰封万里雪飘')

['千', '，', '、', '。', '长', '风', '年', '不', '是', '花']
['里', '，', '长', '。', '、', '花', '雨', '风', '云', '金']
['冰', '。', '，', '、', '江', '春', '无', '天', '长', '梦']
['封', '，', '酒', '、', '一', '。', '阁', '醉', '风', '阳']
['万', '，', '。', '、', '长', '无', '酒', '春', '明', '风']
['里', '，', '长', '。', '、', '花', '雨', '风', '云', '金']
['雪', '，', '、', '。', '一', '去', '轻', '清', '长', '自']
['飘', '，', '。', '年', '、', '相', '云', '何', '天', '无']


## Analogy

In [14]:
word_ids = tf.placeholder(tf.int64, shape=[3])
norm_embs = tf.nn.l2_normalize(embedding, 1)
word_embs = tf.nn.embedding_lookup(norm_embs, word_ids)
target = tf.expand_dims(word_embs[1, :] - word_embs[0, :] + word_embs[2, :], 0)
dist = tf.matmul(target, norm_embs, transpose_b=True)
topk = tf.nn.top_k(dist, k=10)

def analogy(a, b, x):
    dist, ids = sess.run(topk, feed_dict={word_ids:word_to_ids([a, b, x])})
    for (p, r) in zip(dist, ids):
        print([id_to_word[w] for w in r])

In [15]:
analogy('上', '下', '左')

['左', '下', '连', '春', '百', '雨', '一', '，', '杨', '。']
