In [8]:
%pylab inline
from IPython.display import Image, display

import tensorflow as tf
import numpy as np

Populating the interactive namespace from numpy and matplotlib


## Word2Vec

# Task


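* Represent each word (here: each Chinese character) as a dense vector.
  - Related words should be mapped to vectors that lie close to each other.
* Data:
  - 全唐诗 (Complete Tang Poems) + 全宋词 (Complete Song Ci Poems)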

## Data

In [21]:
class TrainData(object):
    """Character-level skip-gram training data built from a text corpus.

    The corpus is read as one string; the characters '。', '，' and newlines
    are removed and every remaining character is treated as one token.
    Token ids are the positions of the characters in the sorted list of
    unique characters.

    Args:
        corpus: path of the text file to read.
        batch: number of (center, context) pairs returned by `get_batch`.
        windows: how many tokens on each side of a center token are used as
            its context.
        encoding: text encoding passed to `open`; `None` keeps the platform
            default.

    Attributes set by the constructor: `batch`, `windows`, `id_to_word`
    (list), `word_to_id` (dict), `data` (list of ids for the whole corpus)
    and `seqgen` (the shared pair generator consumed by `get_batch`).
    """

    def __init__(self, corpus, batch, windows, encoding=None):
        self.batch = batch
        self.windows = windows
        # `with` releases the file handle even if reading fails.
        with open(corpus, mode='r', encoding=encoding) as corpus_file:
            words = corpus_file.read()
        words = words.replace('。', '').replace('，', '').replace('\n', '')
        self.id_to_word = sorted(set(words))
        self.word_to_id = {w: i for i, w in enumerate(self.id_to_word)}
        self.data = [self.word_to_id[w] for w in words]
        print('Number of unique chars: ', len(self.id_to_word))
        print('Number of training chars: ', len(self.data))
        # Creating the generator is lazy: nothing runs until the first batch.
        self.seqgen = self.skipgram_generator()

    @property
    def vocab(self):
        """Number of distinct tokens in the corpus."""
        return len(self.id_to_word)

    def skipgram_generator(self):
        """Yield (center_id, context_id) pairs forever, cycling over `data`.

        For each position the context is up to `windows` ids to the left
        followed by up to `windows` ids to the right; positions near either
        end of the corpus simply have fewer neighbours.

        Raises:
            ValueError: on first iteration, if no position can ever have a
                neighbour (fewer than two tokens, or `windows` < 1). Without
                this check the loop would spin forever without yielding (or
                divide by zero on an empty corpus).
        """
        if len(self.data) < 2 or self.windows < 1:
            raise ValueError(
                'cannot build skip-gram pairs from %d token(s) with a window '
                'of %r' % (len(self.data), self.windows))
        curr = 0
        while True:
            curr %= len(self.data)  # wrap around at the end of the corpus
            x = self.data[curr]
            left = self.data[max(0, curr - self.windows):curr]
            right = self.data[curr + 1:curr + 1 + self.windows]
            for y in left + right:
                yield (x, y)
            curr += 1

    def get_batch(self):
        """Return the next `batch` pairs as two arrays: (inputs, targets).

        Consecutive calls continue where the previous call stopped because
        they all draw from the shared `seqgen` generator.
        """
        inputs, targets = [], []
        for _ in range(self.batch):
            x, y = next(self.seqgen)
            inputs.append(x)
            targets.append(y)
        return np.array(inputs), np.array(targets)

    def to_ids(self, words):
        """Map each item of `words` to its id, silently skipping unknown items."""
        return [self.word_to_id[w] for w in words if w in self.word_to_id]


In [None]:
class TrainData(object):
    """Character-level training data: corpus loading and vocabulary part.

    The corpus is read as one string; the characters '。', '，' and newlines
    are removed and every remaining character is treated as one token whose
    id is its position in the sorted list of unique characters.

    NOTE(review): this cell restates only part of the class. `__init__` calls
    `self.skipgram_generator()`, which this definition does not provide, so
    executing the cell as-is rebinds `TrainData` to a class whose constructor
    raises AttributeError -- confirm whether the cell is presentation-only.

    Args:
        corpus: path of the text file to read.
        batch: number of pairs per batch (stored as `self.batch`).
        windows: context window size (stored as `self.windows`).
        encoding: text encoding passed to `open`; `None` keeps the platform
            default.
    """

    def __init__(self, corpus, batch, windows, encoding=None):
        self.batch = batch
        self.windows = windows
        # `with` releases the file handle even if reading fails.
        with open(corpus, mode='r', encoding=encoding) as corpus_file:
            words = corpus_file.read()
        words = words.replace('。', '').replace('，', '').replace('\n', '')
        self.id_to_word = sorted(set(words))
        self.word_to_id = {w: i for i, w in enumerate(self.id_to_word)}
        self.data = [self.word_to_id[w] for w in words]
        print('Number of unique chars: ', len(self.id_to_word))
        print('Number of training chars: ', len(self.data))
        self.seqgen = self.skipgram_generator()

    @property
    def vocab(self):
        """Number of distinct tokens in the corpus."""
        return len(self.id_to_word)

    def to_ids(self, words):
        """Map each item of `words` to its id, silently skipping unknown items."""
        return [self.word_to_id[w] for w in words if w in self.word_to_id]

In [None]:
    def skipgram_generator(self):
        curr = 0
        while True:
            curr %= len(self.data)
            x = self.data[curr]
            left = self.data[max(0, curr - self.windows):curr]
            right = self.data[curr + 1:curr + 1 + self.windows]
            for y in left + right:
                yield (x, y)
            curr += 1
            
    def get_batch(self):
        input, target = [], []
        for _ in range(self.batch):
            x, y = next(self.seqgen)
            input.append(x)
            target.append(y)
        return np.array(input), np.array(target)
    

### Example

In [25]:
# Load the poem corpus: 2 (center, context) pairs per batch, and a context
# window of 2 tokens on each side of the center token.
data = TrainData(corpus='./data/poem.txt', batch=2, windows=2)

Number of unique chars:  7955
Number of training chars:  3987351


In [24]:
# Print ten consecutive (inputs, targets) batches of token ids.
# NOTE(review): the output recorded under this cell shows 5 pairs per batch
# while the cell above requests batch=2, and the execution counts are out of
# order -- the output looks stale; re-run both cells to confirm.
for _ in range(10):
    batch_arrays = data.get_batch()
    print(batch_arrays)

(array([4562, 4562, 1707, 1707, 1707]), array([1707, 7209, 4562, 7209, 1746]))
(array([7209, 7209, 7209, 7209, 1746]), array([4562, 1707, 1746, 1472, 1707]))
(array([1746, 1746, 1746, 1472, 1472]), array([7209, 1472,  552, 7209, 1746]))
(array([1472, 1472,  552,  552,  552]), array([ 552, 6301, 1746, 1472, 6301]))
(array([ 552, 6301, 6301, 6301, 6301]), array([1242, 1472,  552, 1242, 4228]))
(array([1242, 1242, 1242, 1242, 4228]), array([ 552, 6301, 4228, 1565, 6301]))
(array([4228, 4228, 4228, 1565, 1565]), array([1242, 1565, 5013, 1242, 4228]))
(array([1565, 1565, 5013, 5013, 5013]), array([5013, 3175, 4228, 1565, 3175]))
(array([5013, 3175, 3175, 3175, 3175]), array([ 689, 1565, 5013,  689, 1532]))
(array([ 689,  689,  689,  689, 1532]), array([5013, 3175, 1532, 6413, 3175]))


# Model

In [None]:
class Model(object):
    """Embedding model trained on (center, context) id pairs with TensorFlow.

    The constructor builds a private `tf.Graph` holding one embedding matrix
    of shape `[vocab, dims]`, a loss, a gradient-descent training op and two
    inference heads (nearest neighbours and analogy). It then opens a session
    on that graph and initializes all variables.

    NOTE(review): the loss is `-mean(log(sigmoid(x . y)))` over the fed pairs
    only; no negative samples appear in this graph -- confirm this is intended.

    Args:
        dims: embedding dimensionality.
        vocab: number of rows in the embedding matrix.
        lr: learning rate handed to the gradient-descent optimizer.
    """

    def __init__(self, dims, vocab, lr):
        # Configs.
        self.dims = dims
        self.vocab = vocab
        self.lr = lr

        self.graph = tf.Graph()
        with self.graph.as_default():
            # Var
            self.embedding = tf.Variable(
                tf.random_uniform([vocab, dims], -0.02, 0.02))

            # Feeds: two id tensors fed with the arrays from get_batch().
            self.inputs = tf.placeholder(tf.int64)
            self.targets = tf.placeholder(tf.int64)

            # Define forward: both sides share the same embedding matrix.
            x_emb = tf.nn.embedding_lookup(self.embedding, self.inputs)
            y_emb = tf.nn.embedding_lookup(self.embedding, self.targets)

            # Compute the loss: dot product per pair -> sigmoid -> mean
            # negative log-probability.
            scores = tf.reduce_sum(x_emb * y_emb, [1])
            probs = tf.sigmoid(scores)
            logp = tf.log(probs)
            self.loss = - tf.reduce_mean(logp)

            # Define training.
            self.global_step = tf.Variable(0, trainable=False, name='global_step')
            train_vars = tf.trainable_variables()
            grads = tf.gradients(self.loss, train_vars)
            optimizer = tf.train.GradientDescentOptimizer(lr)
            self.train_op = optimizer.apply_gradients(
                zip(grads, train_vars), global_step=self.global_step)

            # Summary
            tf.scalar_summary('loss', self.loss)
            self.summary = tf.merge_summary(tf.get_collection(tf.GraphKeys.SUMMARIES))

            # Inference

            # Nearest neighbors: dot products of L2-normalized rows, top 10.
            norm_embs = tf.nn.l2_normalize(self.embedding, 1)
            word_embs = tf.nn.embedding_lookup(norm_embs, self.inputs)
            distance = tf.matmul(word_embs, norm_embs, transpose_b=True)
            self.neighbors_topk = tf.nn.top_k(distance, k=10)

            # Analogy: for fed ids (w0, w1, w2) the query vector is
            # w0 - w1 + w2, built from the normalized embeddings.
            a, b, c = word_embs[1, :], word_embs[0, :], word_embs[2, :]
            d = b - a + c
            target = tf.reshape(d, [1, -1])
            dist = tf.matmul(target, norm_embs, transpose_b=True)
            self.analogy_topk = tf.nn.top_k(dist, k=10)

            # Init
            self.init = tf.initialize_all_variables()

            # Saver
            self.saver = tf.train.Saver(tf.all_variables())

        self.sess = tf.Session(graph=self.graph)
        self.sess.run(self.init)

    def train(self, data, logdir, total_steps):
        """Train until the step counter reaches `total_steps`.

        Restores the latest checkpoint found in `logdir` (if any), saves a
        checkpoint every 1000 steps, and writes the loss summary and a log
        line every 100 steps.

        Args:
            data: object whose `get_batch()` returns `(inputs, targets)`.
            logdir: directory used for both checkpoints and summaries.
            total_steps: step count at which training stops.
        """
        swriter = tf.train.SummaryWriter(logdir)
        try:
            # Recover.
            self.load(tf.train.latest_checkpoint(logdir))

            steps = self.sess.run(self.global_step)
            while steps < total_steps:
                if steps % 1000 == 0:
                    self.saver.save(
                        self.sess, logdir + '/wv_params', global_step=steps)
                x, y = data.get_batch()
                feed = {self.inputs: x, self.targets: y}
                if steps % 100 == 0:
                    # Run train_op together with the loss/summary so that
                    # logging steps still train on their batch and the local
                    # counter stays equal to the global_step variable.
                    loss, summary, _ = self.sess.run(
                        [self.loss, self.summary, self.train_op],
                        feed_dict=feed)
                    swriter.add_summary(summary, steps)
                    swriter.flush()
                    tf.logging.info('step %d: %.4f', steps, loss)
                else:
                    self.sess.run(self.train_op, feed_dict=feed)
                steps += 1
        finally:
            # Release the event file even if training is interrupted.
            swriter.close()

    def load(self, checkpoint):
        """Restore variables from `checkpoint`; do nothing when it is None."""
        if checkpoint is not None:
            tf.logging.info('restore %s', checkpoint)
            self.saver.restore(self.sess, checkpoint)

    def nearby(self, data, words):
        """Print the top-10 nearest neighbours of every known item in `words`.

        Items that `data.to_ids` does not recognise are skipped, and each
        remaining item is paired with its own neighbour row (previously an
        unknown item shifted the labels relative to the results).
        """
        known, ids = [], []
        for w in words:
            w_ids = data.to_ids([w])
            if w_ids:
                known.append(w)
                ids.append(w_ids[0])
        print('ids = %s' % ids)
        if not ids:
            return
        _, neighbors = self.sess.run(
            self.neighbors_topk, feed_dict={self.inputs: ids})
        for (w, n) in zip(known, neighbors):
            print('nearby  %s --> %s' % (w, ''.join(
                [data.id_to_word[x] for x in n])))

    def analogy(self, data, words):
        """Print the top-10 matches for `w0 - w1 + w2`.

        `words` must contain at least three items known to `data`; the first
        three known ids are used as (w0, w1, w2).

        Raises:
            ValueError: if fewer than three items of `words` are known.
        """
        ids = data.to_ids(words)
        if len(ids) < 3:
            raise ValueError(
                'analogy needs three known words, got %d from %r'
                % (len(ids), words))
        _, neighbors = self.sess.run(
            self.analogy_topk, feed_dict={self.inputs: ids})
        neighbors = [data.id_to_word[x] for x in neighbors[0, :]]
        print('analogy %s %s' % (words, ''.join(neighbors)))

## Nearest neighbors

## Analogy