In [1]:
%matplotlib inline
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE

In [2]:
# Base location that maybe_download prepends to the requested file name.
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Fetch `filename` from `url` when it is missing locally, then check its size.

  Args:
    filename: local path of the file; also appended to `url` to form the
      download address when the file does not exist yet.
    expected_bytes: exact size in bytes the file must have.

  Returns:
    The path of the verified file.

  Raises:
    Exception: if the size on disk differs from `expected_bytes` (the actual
      size is printed first).
  """
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  actual_bytes = os.stat(filename).st_size
  if actual_bytes != expected_bytes:
    print(actual_bytes)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  print('Found and verified %s' % filename)
  return filename

# Fetch the corpus archive unless it is already on disk; 31344016 is the exact
# byte size the file must have to pass verification.
filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [3]:
def read_data(filename):
  """Return the whitespace-separated tokens of the first member of a zip archive.

  Args:
    filename: path of the zip file to open.

  Returns:
    A list of strings obtained by splitting the member's text on whitespace.
  """
  with zipfile.ZipFile(filename) as archive:
    first_member = archive.namelist()[0]
    raw_bytes = archive.read(first_member)
  return tf.compat.as_str(raw_bytes).split()
  
# Load the whole corpus as one flat list of tokens, then preview the first ten.
words = read_data(filename)
words[:10]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against']

In [4]:
class WordNumericEncoder:
    """Encode a sequence of words as integer ids ordered by descending frequency.

    With a positive ``common``, id 0 is reserved for ``rare_word_token`` and the
    ``common`` most frequent words receive ids 1..common; every other word is
    encoded as 0. The vocabulary therefore holds ``common + 1`` ids.

    With ``common <= 0`` (the default) every distinct word gets its own id,
    starting at 0, and no rare-word token is added.
    """

    def __init__(self, words, common=0, rare_word_token="UNK"):
        """Count ``words``, build the vocabulary and encode the sequence.

        Args:
            words: sequence of hashable tokens (iterated twice).
            common: how many of the most frequent words keep their own id;
                zero or negative keeps all of them.
            rare_word_token: token that stands in for every word that was not
                kept (only used when ``common`` is positive).
        """
        self._words = words
        self._rare_word_token = rare_word_token
        self._counter = collections.Counter(words)

        self._set_items(common)
        self._build_dictionary()
        self._encode_words()

    def _set_items(self, common=0):
        """Build ``self._items``: ``(word, count)`` entries whose position is the word id."""
        self._has_rare_token = common > 0
        if self._has_rare_token:
            # Request one extra candidate so that dropping a literal occurrence
            # of the rare-word token from the text still leaves `common` words.
            # Without this filter the token would be inserted twice and two
            # different words would end up sharing one id.
            ranked = [item for item in self._counter.most_common(common + 1)
                      if item[0] != self._rare_word_token][:common]
            # Mutable entry: its count is filled in by _encode_words.
            self._items = [[self._rare_word_token, -1]]
        else:
            ranked = self._counter.most_common()
            self._items = []
        self._items.extend(ranked)

    def _build_dictionary(self):
        """Map each word in ``self._items`` to its position."""
        self._dictionary = dict()
        for word, _ in self._items:
            self._dictionary[word] = len(self._dictionary)

    def _encode_words(self):
        """Translate ``self._words`` into ids and record the rare-word count."""
        lookup = self._dictionary
        # 0 is the fallback id; when no rare-word token exists every word is in
        # the dictionary, so the fallback is never taken.
        self._data = [lookup.get(word, 0) for word in self._words]
        if self._has_rare_token:
            # Only touch entry 0 when it really is the rare-word placeholder;
            # otherwise it is an immutable (word, count) tuple or absent.
            self._items[0][1] = self._data.count(0)

    def get_data(self):
        """Return the encoded corpus as a list of integer ids."""
        return self._data

    def get_reverse_dictionary(self):
        """Return a new dict mapping each id back to its word."""
        return {index: word for word, index in self._dictionary.items()}

In [5]:
class ContextBatchGenerator:
    """Slide a fixed-width window over a token sequence, yielding (context, target) pairs.

    The window spans ``2 * window + 1`` tokens: ``window`` tokens of context on
    each side of the middle token, which is the target. Reading wraps around
    to the start of the sequence when the end is reached.

    Example with text ``["1", "2", "3", "4", "5", "6", "7"]`` and ``window=2``:
    the initial window is ``["1", "2", "3", "4", "5"]``, so the first pair is
    ``(["1", "2", "4", "5"], "3")``.
    """

    def __init__(self, text, window):
        """Remember the sequence and pre-fill the window with its first tokens."""
        self._text = text
        self._len = len(text)
        self._window = window
        self._cursor = 0
        self._span = 2 * window + 1
        # Window positions on either side of the middle (target) slot.
        self._non_window_idx = list(range(window)) + list(range(window + 1, self._span))
        self._buffer = collections.deque(maxlen=self._span)
        for _ in range(self._span):
            self.shift_buffer()

    def _batch(self, size):
        """Return ``size`` consecutive (context list, target) tuples, advancing one token each."""
        pairs = []
        for _ in range(size):
            current = self._buffer
            context = [current[pos] for pos in self._non_window_idx]
            pairs.append((context, current[self._window]))
            self.shift_buffer()
        return pairs

    def shift_buffer(self):
        """Push the token under the cursor into the window and step the cursor (with wrap-around)."""
        position = self._cursor
        # The deque has a fixed maxlen, so appending drops the oldest token.
        self._buffer.append(self._text[position])
        self._cursor = (position + 1) % self._len

class SkipGramGenerator(ContextBatchGenerator):
    """Produce training arrays in which each context word is paired with its target."""

    def next(self, size, dtype=np.int32):
        """Return ``(batch, labels)`` holding ``size`` (context word, target) rows.

        Every window position contributes ``window * 2`` rows: ``batch`` gets
        one context word per row and ``labels`` repeats the target. For
        example, with ``window=1`` over ``[10, 20, 30]``, ``next(2)`` gives
        ``[10, 30], [[20], [20]]``.

        Args:
            size: total number of rows; must be a multiple of ``window * 2``.
            dtype: numpy dtype of both returned arrays.

        Returns:
            ``batch`` with shape ``(size,)`` and ``labels`` with shape ``(size, 1)``.

        Raises:
            ValueError: if ``size`` is not a multiple of ``window * 2``.
        """
        rows_per_position = self._window * 2
        if size % rows_per_position != 0:
            raise ValueError("batch size should be devidable by window*2")

        batch = np.ndarray(shape=(size), dtype=dtype)
        labels = np.ndarray(shape=(size, 1), dtype=dtype)
        row = 0
        for context, target in self._batch(size // rows_per_position):
            for word in context:
                batch[row] = word
                labels[row] = target
                row += 1
        return batch, labels
    
class CBOWGenerator(ContextBatchGenerator):
    """Produce training arrays in which each target is paired with one of its context words."""

    def next(self, size, dtype=np.int32):
        """Return ``(batch, labels)`` holding ``size`` (target, context word) rows.

        Every window position contributes ``window * 2`` rows: ``batch``
        repeats the target and ``labels`` gets one context word per row.

        Args:
            size: total number of rows; must be a multiple of ``window * 2``.
            dtype: numpy dtype of both returned arrays.

        Returns:
            ``batch`` with shape ``(size,)`` and ``labels`` with shape ``(size, 1)``.

        Raises:
            ValueError: if ``size`` is not a multiple of ``window * 2``.
        """
        rows_per_position = self._window * 2
        if size % rows_per_position != 0:
            raise ValueError("batch size should be devidable by window*2")

        batch = np.ndarray(shape=(size), dtype=dtype)
        labels = np.ndarray(shape=(size, 1), dtype=dtype)
        row = 0
        for context, target in self._batch(size // rows_per_position):
            for word in context:
                # Input is the target word; the value to predict is the context word.
                batch[row] = target
                labels[row] = word
                row += 1
        return batch, labels

In [10]:
# Hyper-parameters shared by the graph-building and training cells.
# Number of rows in the embedding and softmax weight matrices (number of classes).
vocabulary_size = 50000
# Context words taken on each side of the target by the batch generator.
window = 2
# Rows per training step; the generators require a multiple of window*2.
batch_size = 128
# Length of each word vector.
embedding_size = 128
# Forwarded as num_sampled to tf.nn.sampled_softmax_loss.
num_sampled = 64



In [11]:
# Build the training graph: embedding lookup followed by a sampled-softmax loss.
graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):
    # Inputs fed at every step: one word id per row and one label id per row.
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels  = tf.placeholder(tf.int32, shape=[batch_size, 1])

    # Trainable parameters: the word vectors plus the softmax weights and biases.
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    nce_w = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
    nce_b = tf.Variable(tf.zeros([vocabulary_size]))

    # Vector of each input word.
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)
    # Keyword arguments are used on purpose: the positional order of `inputs`
    # and `labels` differs between TensorFlow releases, and passing them by
    # position silently swaps the two on the other ordering.
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(weights=nce_w, biases=nce_b, inputs=embed,
                                   labels=train_labels, num_sampled=num_sampled,
                                   num_classes=vocabulary_size))

    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
    
    

In [12]:
# The encoder reserves id 0 for the rare-word token and then adds `common`
# frequent words, so it emits common + 1 distinct ids. Requesting
# vocabulary_size - 1 words keeps every id inside [0, vocabulary_size), the
# range covered by the embedding and softmax matrices; passing vocabulary_size
# itself would produce an id one past their last row.
word_encoded = WordNumericEncoder(words, vocabulary_size - 1)
# Stateful generator of (target word, context word) training rows with
# `window` context words on each side.
batch = CBOWGenerator(word_encoded.get_data(), window)

In [17]:
# id -> word lookup, used below to print words for numeric ids.
reverse = word_encoded.get_reverse_dictionary()

In [31]:
# Number of optimisation steps to run.
num_steps = 10001

with tf.Session(graph=graph) as session:
    # Initialise every variable declared in the graph before training starts.
    tf.initialize_all_variables().run()
    
    for step in range(num_steps):
        # `batch` keeps its own cursor, so re-running this cell continues from
        # where the previous run stopped instead of restarting the corpus.
        batch_data, batch_labels = batch.next(batch_size)
        feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
        # The loss value `l` is fetched but not reported anywhere in this cell.
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        if step%100 == 0:
            print(step)
    
    # Word ids whose nearest neighbours are inspected after training.
    valid = [13,100]
    print("validate", [reverse[i] for i in valid])
    # Scale every embedding row to unit L2 norm so that the dot products below
    # are cosine similarities.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid)
    
    # One row per entry of `valid`, one column per vocabulary id.
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))
    sim = similarity.eval()
    for i in range(len(valid)):
        # argsort is ascending, so this keeps the ten highest-scoring ids with
        # the best match (the query word itself) last.
        closest = sim[i].argsort()[-10:]
        print("closest", closest, [reverse[i] for i in closest])

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
validate ['eight', 'where']
closest [10 17  4  8 21 16  9 22 23 13] ['two', 'three', 'one', 'zero', 'four', 'five', 'nine', 'six', 'seven', 'eight']
closest [31352   643 12330  8410 22340 16401   212 40955  1609   100] ['obituaries', 'jews', 'herb', 'embraced', 'consolidating', 'playwrights', 'include', 'amorites', 'description', 'where']
