# Word2vec basic

In [1]:
import collections
import math
import os
import random
import zipfile
import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline  
print ("Packages loaded")

Packages loaded


# 1. Download the text and make corpus (set of words)

## Download (or reuse) the text file that we will use 

In [2]:
folder_dir  = "data"
file_name   = "text8.zip"
file_path   = os.path.join(folder_dir, file_name)
url         = 'http://mattmahoney.net/dc/'
if not os.path.exists(file_path):
    print ("No file found. Start downloading")
    downfilename, _ = urllib.request.urlretrieve(
        url + file_name, file_path)
    print ("'%s' downloaded" % (downfilename))
else:
    print ("File already exists")

File already exists


## Check we have correct data 

In [3]:
statinfo = os.stat(file_path)
expected_bytes = 31344016
if statinfo.st_size == expected_bytes:
    print ("I guess we have correct file at '%s'" % (file_path))
else:
    print ("Something's wrong with the file at '%s'" % (file_path))

I guess we have correct file at 'data\text8.zip'


## Unzip the file

In [4]:
def read_data(filename):
    with zipfile.ZipFile(filename) as f:
        data = f.read(f.namelist()[0]).split()##return [data.filename for data in self.filelist]
    return data

In [5]:
words = read_data(file_path) 
print ("Type of 'words' is %s / Length is %d " 
       % (type(words), len(words)))
print ("'words' look like \n %s" %(words[0:100]))
print(type(words))
with zipfile.ZipFile(file_path) as f:
    data = f.read(f.namelist()[0]).split()
    data2 = f.read(f.namelist()[0])

Type of 'words' is <class 'list'> / Length is 17005207 
'words' look like 
 [b'anarchism', b'originated', b'as', b'a', b'term', b'of', b'abuse', b'first', b'used', b'against', b'early', b'working', b'class', b'radicals', b'including', b'the', b'diggers', b'of', b'the', b'english', b'revolution', b'and', b'the', b'sans', b'culottes', b'of', b'the', b'french', b'revolution', b'whilst', b'the', b'term', b'is', b'still', b'used', b'in', b'a', b'pejorative', b'way', b'to', b'describe', b'any', b'act', b'that', b'used', b'violent', b'means', b'to', b'destroy', b'the', b'organization', b'of', b'society', b'it', b'has', b'also', b'been', b'taken', b'up', b'as', b'a', b'positive', b'label', b'by', b'self', b'defined', b'anarchists', b'the', b'word', b'anarchism', b'is', b'derived', b'from', b'the', b'greek', b'without', b'archons', b'ruler', b'chief', b'king', b'anarchism', b'as', b'a', b'political', b'philosophy', b'is', b'the', b'belief', b'that', b'rulers', b'are', b'unnecessary', b'and', b'

In [6]:
#print(data.shape)
type(data2)

bytes

# 2. Make a dictionary with fixed length (using UNK token)

## Count the words 

In [7]:
vocabulary_size = 50000 
count = [['UNK', -1]] 
count.extend(collections.Counter(words)
             .most_common(vocabulary_size - 1)) # -1 is for UNK 
print ("Type of 'count' is %s / Length is %d " % (type(count), len(count)))
print ("'count' looks like \n %s" % (count[0:10]))

Type of 'count' is <class 'list'> / Length is 50000 
'count' looks like 
 [['UNK', -1], (b'the', 1061396), (b'of', 593677), (b'and', 416629), (b'one', 411764), (b'in', 372201), (b'a', 325873), (b'to', 316376), (b'zero', 264975), (b'nine', 250430)]


## Make a dictionary

In [11]:
dictionary = dict() 
for word, _ in count:
    dictionary[word] = len(dictionary)
print ("Type of 'dictionary' is %s / Length is %d " 
       % (type(dictionary), len(dictionary)))
dictionary

Type of 'dictionary' is <class 'dict'> / Length is 50000 


{'UNK': 0,
 b'the': 1,
 b'of': 2,
 b'and': 3,
 b'one': 4,
 b'in': 5,
 b'a': 6,
 b'to': 7,
 b'zero': 8,
 b'nine': 9,
 b'two': 10,
 b'is': 11,
 b'as': 12,
 b'eight': 13,
 b'for': 14,
 b's': 15,
 b'five': 16,
 b'three': 17,
 b'was': 18,
 b'by': 19,
 b'that': 20,
 b'four': 21,
 b'six': 22,
 b'seven': 23,
 b'with': 24,
 b'on': 25,
 b'are': 26,
 b'it': 27,
 b'from': 28,
 b'or': 29,
 b'his': 30,
 b'an': 31,
 b'be': 32,
 b'this': 33,
 b'which': 34,
 b'at': 35,
 b'he': 36,
 b'also': 37,
 b'not': 38,
 b'have': 39,
 b'were': 40,
 b'has': 41,
 b'but': 42,
 b'other': 43,
 b'their': 44,
 b'its': 45,
 b'first': 46,
 b'they': 47,
 b'some': 48,
 b'had': 49,
 b'all': 50,
 b'more': 51,
 b'most': 52,
 b'can': 53,
 b'been': 54,
 b'such': 55,
 b'many': 56,
 b'who': 57,
 b'new': 58,
 b'used': 59,
 b'there': 60,
 b'after': 61,
 b'when': 62,
 b'into': 63,
 b'american': 64,
 b'time': 65,
 b'these': 66,
 b'only': 67,
 b'see': 68,
 b'may': 69,
 b'than': 70,
 b'world': 71,
 b'i': 72,
 b'b': 73,
 b'would': 74,
 b'd

# Make a reverse dictionary

In [12]:
reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
print ("Type of 'reverse_dictionary' is %s / Length is %d " 
       % (type(reverse_dictionary), len(reverse_dictionary)))
reverse_dictionary

Type of 'reverse_dictionary' is <class 'dict'> / Length is 50000 


{0: 'UNK',
 1: b'the',
 2: b'of',
 3: b'and',
 4: b'one',
 5: b'in',
 6: b'a',
 7: b'to',
 8: b'zero',
 9: b'nine',
 10: b'two',
 11: b'is',
 12: b'as',
 13: b'eight',
 14: b'for',
 15: b's',
 16: b'five',
 17: b'three',
 18: b'was',
 19: b'by',
 20: b'that',
 21: b'four',
 22: b'six',
 23: b'seven',
 24: b'with',
 25: b'on',
 26: b'are',
 27: b'it',
 28: b'from',
 29: b'or',
 30: b'his',
 31: b'an',
 32: b'be',
 33: b'this',
 34: b'which',
 35: b'at',
 36: b'he',
 37: b'also',
 38: b'not',
 39: b'have',
 40: b'were',
 41: b'has',
 42: b'but',
 43: b'other',
 44: b'their',
 45: b'its',
 46: b'first',
 47: b'they',
 48: b'some',
 49: b'had',
 50: b'all',
 51: b'more',
 52: b'most',
 53: b'can',
 54: b'been',
 55: b'such',
 56: b'many',
 57: b'who',
 58: b'new',
 59: b'used',
 60: b'there',
 61: b'after',
 62: b'when',
 63: b'into',
 64: b'american',
 65: b'time',
 66: b'these',
 67: b'only',
 68: b'see',
 69: b'may',
 70: b'than',
 71: b'world',
 72: b'i',
 73: b'b',
 74: b'would',
 75:

## Make data

In [16]:
data = list()
unk_count = 0
for word in words:
    if word in dictionary:
        index = dictionary[word]
    else:
        index = 0  # dictionary['UNK']
        unk_count += 1
    data.append(index)
count[0][1] = unk_count
# del words  # Hint to reduce memory.

### 'dictionary' converts word to index 
### 'reverse_dictionary' converts index to word 

In [17]:
print ("Most common words (+UNK) are: %s" % (count[:5]))

Most common words (+UNK) are: [['UNK', 418391], (b'the', 1061396), (b'of', 593677), (b'and', 416629), (b'one', 411764)]


### Data (in indices)

In [18]:
print ("Sample data: %s" % (data[:10]))

Sample data: [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]


### Convert to char (which we can read)

In [19]:
print ("Sample data corresponds to\n__________________")
for i in range(10):
    print ("%d->%s" % (data[i], reverse_dictionary[data[i]]))

Sample data corresponds to
__________________
5234->b'anarchism'
3081->b'originated'
12->b'as'
6->b'a'
195->b'term'
2->b'of'
3134->b'abuse'
46->b'first'
59->b'used'
156->b'against'


# Batch-generating function for skip-gram model
## - Skip-gram (one word to one word) => Can generate more training data

<img src="images/etc/word2vec_desc.png">

In [17]:
data_index = 0
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch  = np.ndarray(shape=(batch_size),    dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1 # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips): # '//' makes the result an integer, e.g., 7//3 = 2
        target = skip_window
        targets_to_avoid = [ skip_window ]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels

### Examples for generating batch and labels 

In [18]:
data_index = 0
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
print ("Type of 'batch' is %s / Length is %d " 
       % (type(batch), len(batch))) 
print ("Type of 'labels' is %s / Length is %d " 
       % (type(labels), len(labels)))

Type of 'batch' is <class 'numpy.ndarray'> / Length is 8 
Type of 'labels' is <class 'numpy.ndarray'> / Length is 8 


In [19]:
print ("'batch' looks like \n %s" % (batch))

'batch' looks like 
 [3081 3081   12   12    6    6  195  195]


In [20]:
print ("'labels' looks like \n %s" % (labels))

'labels' looks like 
 [[5234]
 [  12]
 [   6]
 [3081]
 [  12]
 [ 195]
 [   2]
 [   6]]


In [21]:
for i in range(8):
    print ("%d -> %d" 
           % (batch[i], labels[i, 0])),
    print ("\t%s -> %s" 
           % (reverse_dictionary[batch[i]]
              , reverse_dictionary[labels[i, 0]]))

3081 -> 5234
	b'originated' -> b'anarchism'
3081 -> 12
	b'originated' -> b'as'
12 -> 6
	b'as' -> b'a'
12 -> 3081
	b'as' -> b'originated'
6 -> 12
	b'a' -> b'as'
6 -> 195
	b'a' -> b'term'
195 -> 2
	b'term' -> b'of'
195 -> 6
	b'term' -> b'a'


# 3. Build a Skip-Gram Model

In [22]:
batch_size     = 128
embedding_size = 128       # Dimension of the embedding vector.
skip_window    = 1         # How many words to consider left and right.
num_skips      = 2         # How many times to reuse an input 
print ("Parameters ready")

Parameters ready


In [23]:
# Random validation set to sample nearest neighbors.
valid_size     = 32        # Random set of words to evaluate similarity 
valid_window   = 200       # Only pick validation samples in the top 200
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

print (valid_examples)

[  6 181  77 104  40   5 166  56 155 138  39 127 195 192 143  86 123  80
  97  38  33 102   8   1 142  61 199 121 133  99 145 129]


# Define network

In [25]:
# Construct the word2vec model 
train_inputs   = tf.placeholder(tf.int32, shape=[batch_size])   
train_labels   = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset  = tf.constant(valid_examples, dtype=tf.int32)

# Look up embeddings for inputs. (vocabulary_size = 50,000)
with tf.variable_scope("EMBEDDING"):
    with tf.device('/cpu:0'):
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size]
                              , -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
    
# Construct the variables for the NCE loss
with tf.variable_scope("NCE_WEIGHT"):
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
print ("Network ready")

Network ready


# Define functions

In [27]:
with tf.device('/cpu:0'):
    # Loss function 
    num_sampled = 64        # Number of negative examples to sample. 
    loss = tf.reduce_mean(
        tf.nn.nce_loss(nce_weights, nce_biases
                       , train_labels, embed, num_sampled, vocabulary_size))
    # Optimizer
    optm = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    # Similarity measure (important)
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings
                    , valid_dataset)
    siml = tf.matmul(valid_embeddings, normalized_embeddings
                    , transpose_b=True)
    
print ("Functions Ready")

Instructions for updating:
keep_dims is deprecated, use keepdims instead


Functions Ready


# 4. Train a Skip-Gram Model

In [29]:
# Train! 
sess = tf.Session()
sess.run(tf.initialize_all_variables())
summary_writer = tf.summary.FileWriter('/tmp/tf_logs/word2vec', graph=sess.graph)
average_loss = 0

num_steps = 100001
for iter in xrange(num_steps):
    batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
    feed_dict = {train_inputs : batch_inputs, train_labels : batch_labels}
    _, loss_val = sess.run([optm, loss], feed_dict=feed_dict)
    average_loss += loss_val
    
    if iter % 2000 == 0:
        average_loss /= 2000
        print ("Average loss at step %d is %.3f" % (iter, average_loss)) 
    
    if iter % 10000 == 0:
        siml_val = sess.run(siml)
        for i in xrange(valid_size): # Among valid set 
            valid_word = reverse_dictionary[valid_examples[i]]
            top_k = 6 # number of nearest neighbors
            nearest = (-siml_val[i, :]).argsort()[1:top_k+1]
            log_str = "Nearest to '%s':" % valid_word
            for k in xrange(top_k):
                close_word = reverse_dictionary[nearest[k]] 
                log_str = "%s '%s'," % (log_str, close_word)
            print(log_str) 
            
# Final embeding 
final_embeddings = sess.run(normalized_embeddings)

Average loss at step 0 is 0.148
Nearest to 'b'a'': 'b'believer'', 'b'smoke'', 'b'widely'', 'b'amour'', 'b'quinn'', 'b'tens'',
Nearest to 'b'series'': 'b'firebird'', 'b'tremendously'', 'b'honeybees'', 'b'footsteps'', 'b'elicit'', 'b'renderer'',
Nearest to 'b'however'': 'b'longwave'', 'b'tansley'', 'b'pestilence'', 'b'provably'', 'b'risk'', 'b'jag'',
Nearest to 'b'then'': 'b'brutal'', 'b'fertilize'', 'b'steam'', 'b'quenched'', 'b'unrestricted'', 'b'peh'',
Nearest to 'b'were'': 'b'edinburgh'', 'b'replicators'', 'b'xlii'', 'b'template'', 'b'dividends'', 'b'truthfulness'',
Nearest to 'b'in'': 'b'kabbalistic'', 'b'cpl'', 'b'eprint'', 'b'monastic'', 'b'persuasion'', 'b'skits'',
Nearest to 'b'music'': 'b'tabular'', 'b'backward'', 'b'striping'', 'b'cambodians'', 'b'caeiro'', 'b'distrusted'',
Nearest to 'b'many'': 'b'administrators'', 'b'materialism'', 'b'logone'', 'b'rodgers'', 'b'helena'', 'b'hoare'',
Nearest to 'b't'': 'b'frantz'', 'b'attributing'', 'b'authorities'', 'b'resemblances'', 'b'war


Nearest to 'b'zero'': 'b'payable'', 'b'spit'', 'b'rubies'', 'b'justifiably'', 'b'mnp'', 'b'riots'',
Nearest to 'b'the'': 'b'oscar'', 'b'novi'', 'b'shades'', 'b'hershey'', 'b'ecu'', 'b'stationary'',
Nearest to 'b'although'': 'b'whiteness'', 'b'determinants'', 'b'ellipse'', 'b'unwind'', 'b'posthuman'', 'b'skiing'',
Nearest to 'b'after'': 'b'slick'', 'b'mayotte'', 'b'replace'', 'b'platoon'', 'b'festival'', 'b'shroud'',
Nearest to 'b'found'': 'b'biochemist'', 'b'sentenced'', 'b'roosevelt'', 'b'alde'', 'b'punjab'', 'b'geometrically'',
Nearest to 'b'name'': 'b'hoppers'', 'b'zapata'', 'b'ericales'', 'b'beti'', 'b'sublimation'', 'b'morally'',
Nearest to 'b'same'': 'b'loos'', 'b'liam'', 'b'opus'', 'b'instrument'', 'b'steamship'', 'b'commit'',
Nearest to 'b'while'': 'b'inaction'', 'b'majdanek'', 'b'lured'', 'b'alto'', 'b'started'', 'b'contend'',
Nearest to 'b'john'': 'b'bomb'', 'b'paxton'', 'b'stays'', 'b'ias'', 'b'arcseconds'', 'b'andros'',
Nearest to 'b'life'': 'b'shaughnessy'', 'b'lydia'', '

Average loss at step 2000 is 113.454


Average loss at step 4000 is 53.073


Average loss at step 6000 is 33.435


Average loss at step 8000 is 23.608


Average loss at step 10000 is 18.132
Nearest to 'b'a'': 'b'the'', 'b'austin'', 'b'reginae'', 'b'zulu'', 'b'phi'', 'b'it'',
Nearest to 'b'series'': 'b'matter'', 'b'footsteps'', 'b'home'', 'b'movies'', 'b'gland'', 'b'filippo'',
Nearest to 'b'however'': 'b'risk'', 'b'pestilence'', 'b'pka'', 'b'imaginary'', 'b'reginae'', 'b'analogy'',
Nearest to 'b'then'': 'b'steam'', 'b'brutal'', 'b'reginae'', 'b'guide'', 'b'androids'', 'b'thai'',
Nearest to 'b'were'': 'b'vs'', 'b'and'', 'b'canaris'', 'b'pneumonia'', 'b'happening'', 'b'work'',
Nearest to 'b'in'': 'b'of'', 'b'and'', 'b'basins'', 'b'by'', 'b'vs'', 'b'with'',
Nearest to 'b'music'': 'b'backward'', 'b'wilmot'', 'b'title'', 'b'christians'', 'b'nih'', 'b'computer'',
Nearest to 'b'many'': 'b'hoare'', 'b'administrators'', 'b'alchemists'', 'b'despite'', 'b'nfl'', 'b'materialism'',
Nearest to 'b't'': 'b'widely'', 'b'for'', 'b'authorities'', 'b'altenberg'', 'b'suffix'', 'b'mond'',
Nearest to 'b'each'': 'b'analogue'', 'b'farm'', 'b'vs'', 'b'box'', 'b'

Average loss at step 12000 is 13.910


Average loss at step 14000 is 11.736


Average loss at step 16000 is 9.781


Average loss at step 18000 is 8.591


Average loss at step 20000 is 7.749
Nearest to 'b'a'': 'b'the'', 'b'and'', 'b'this'', 'b'albuquerque'', 'b'msg'', 'b'it'',
Nearest to 'b'series'': 'b'matter'', 'b'footsteps'', 'b'movies'', 'b'home'', 'b'fictitious'', 'b'cinque'',
Nearest to 'b'however'': 'b'pestilence'', 'b'imaginary'', 'b'risk'', 'b'numa'', 'b'agouti'', 'b'reginae'',
Nearest to 'b'then'': 'b'agouti'', 'b'steam'', 'b'androids'', 'b'reginae'', 'b'guide'', 'b'provide'',
Nearest to 'b'were'': 'b'are'', 'b'was'', 'b'is'', 'b'vs'', 'b'canaris'', 'b'and'',
Nearest to 'b'in'': 'b'and'', 'b'at'', 'b'from'', 'b'on'', 'b'of'', 'b'with'',
Nearest to 'b'music'': 'b'backward'', 'b'solon'', 'b'wilmot'', 'b'alt'', 'b'christians'', 'b'distrusted'',
Nearest to 'b'many'': 'b'some'', 'b'administrators'', 'b'hoare'', 'b'despite'', 'b'alchemists'', 'b'dasyprocta'',
Nearest to 'b't'': 'b'widely'', 'b'suffix'', 'b'reasonably'', 'b'authorities'', 'b'altenberg'', 'b'proclaimed'',
Nearest to 'b'each'': 'b'their'', 'b'analogue'', 'b'farm'', 'b'b

Average loss at step 22000 is 7.153


Average loss at step 24000 is 7.035


Average loss at step 26000 is 6.719


Average loss at step 28000 is 6.243


Average loss at step 30000 is 6.144
Nearest to 'b'a'': 'b'the'', 'b'akita'', 'b'albuquerque'', 'b'trinomial'', 'b'this'', 'b'austin'',
Nearest to 'b'series'': 'b'cpc'', 'b'footsteps'', 'b'matter'', 'b'fictitious'', 'b'movies'', 'b'umayyad'',
Nearest to 'b'however'': 'b'jag'', 'b'pestilence'', 'b'imaginary'', 'b'and'', 'b'agouti'', 'b'analogy'',
Nearest to 'b'then'': 'b'agouti'', 'b'androids'', 'b'reginae'', 'b'unofficially'', 'b'guide'', 'b'that'',
Nearest to 'b'were'': 'b'are'', 'b'was'', 'b'is'', 'b'have'', 'b'canaris'', 'b'vs'',
Nearest to 'b'in'': 'b'at'', 'b'on'', 'b'and'', 'b'of'', 'b'from'', 'b'nine'',
Nearest to 'b'music'': 'b'backward'', 'b'trinomial'', 'b'solon'', 'b'compressibility'', 'b'wilmot'', 'b'thermometer'',
Nearest to 'b'many'': 'b'some'', 'b'the'', 'b'administrators'', 'b'despite'', 'b'hoare'', 'b'alchemists'',
Nearest to 'b't'': 'b'dewey'', 'b'amalthea'', 'b'widely'', 'b'authorities'', 'b'suffix'', 'b'altenberg'',
Nearest to 'b'each'': 'b'their'', 'b'agra'', 'b'pri

Average loss at step 32000 is 5.883


Average loss at step 34000 is 5.886


Average loss at step 36000 is 5.681


Average loss at step 38000 is 5.240


Average loss at step 40000 is 5.460
Nearest to 'b'a'': 'b'the'', 'b'akita'', 'b'johansen'', 'b'austin'', 'b'zulu'', 'b'trinomial'',
Nearest to 'b'series'': 'b'cpc'', 'b'footsteps'', 'b'fictitious'', 'b'umayyad'', 'b'matter'', 'b'rick'',
Nearest to 'b'however'': 'b'and'', 'b'imaginary'', 'b'pestilence'', 'b'jag'', 'b'adamantium'', 'b'agouti'',
Nearest to 'b'then'': 'b'agouti'', 'b'that'', 'b'reginae'', 'b'unofficially'', 'b'androids'', 'b'akita'',
Nearest to 'b'were'': 'b'are'', 'b'was'', 'b'is'', 'b'have'', 'b'be'', 'b'had'',
Nearest to 'b'in'': 'b'at'', 'b'and'', 'b'on'', 'b'from'', 'b'with'', 'b'crandall'',
Nearest to 'b'music'': 'b'backward'', 'b'trinomial'', 'b'compressibility'', 'b'solon'', 'b'thermometer'', 'b'recitative'',
Nearest to 'b'many'': 'b'some'', 'b'recitative'', 'b'these'', 'b'administrators'', 'b'despite'', 'b'hoare'',
Nearest to 'b't'': 'b'dewey'', 'b'warring'', 'b'reasonably'', 'b'widely'', 'b'amalthea'', 'b'suffix'',
Nearest to 'b'each'': 'b'their'', 'b'the'', 'b'a

Average loss at step 42000 is 5.324


Average loss at step 44000 is 5.424


Average loss at step 46000 is 5.286


Average loss at step 48000 is 5.035


Average loss at step 50000 is 5.165
Nearest to 'b'a'': 'b'the'', 'b'akita'', 'b'sotho'', 'b'albuquerque'', 'b'alpina'', 'b'anacharsis'',
Nearest to 'b'series'': 'b'cpc'', 'b'footsteps'', 'b'fictitious'', 'b'rick'', 'b'umayyad'', 'b'matter'',
Nearest to 'b'however'': 'b'and'', 'b'but'', 'b'that'', 'b'agouti'', 'b'imaginary'', 'b'reginae'',
Nearest to 'b'then'': 'b'agouti'', 'b'that'', 'b'hyperbolic'', 'b'unofficially'', 'b'reginae'', 'b'akita'',
Nearest to 'b'were'': 'b'are'', 'b'was'', 'b'is'', 'b'have'', 'b'had'', 'b'be'',
Nearest to 'b'in'': 'b'on'', 'b'at'', 'b'and'', 'b'from'', 'b'through'', 'b'agouti'',
Nearest to 'b'music'': 'b'trinomial'', 'b'backward'', 'b'compressibility'', 'b'recitative'', 'b'thermometer'', 'b'solon'',
Nearest to 'b'many'': 'b'some'', 'b'these'', 'b'recitative'', 'b'several'', 'b'administrators'', 'b'other'',
Nearest to 'b't'': 'b'dewey'', 'b'reasonably'', 'b'amalthea'', 'b'warring'', 'b'we'', 'b'also'',
Nearest to 'b'each'': 'b'their'', 'b'the'', 'b'armagedd

Average loss at step 52000 is 5.179


Average loss at step 54000 is 5.120


Average loss at step 56000 is 5.065


Average loss at step 58000 is 5.104


Average loss at step 60000 is 4.942
Nearest to 'b'a'': 'b'the'', 'b'akita'', 'b'sotho'', 'b'johansen'', 'b'anacharsis'', 'b'callithrix'',
Nearest to 'b'series'': 'b'callithrix'', 'b'michelob'', 'b'irt'', 'b'cpc'', 'b'footsteps'', 'b'rick'',
Nearest to 'b'however'': 'b'but'', 'b'and'', 'b'that'', 'b'although'', 'b'agouti'', 'b'which'',
Nearest to 'b'then'': 'b'that'', 'b'agouti'', 'b'hyperbolic'', 'b'akita'', 'b'ssbn'', 'b'androids'',
Nearest to 'b'were'': 'b'are'', 'b'was'', 'b'have'', 'b'had'', 'b'be'', 'b'is'',
Nearest to 'b'in'': 'b'at'', 'b'on'', 'b'from'', 'b'and'', 'b'through'', 'b'of'',
Nearest to 'b'music'': 'b'trinomial'', 'b'backward'', 'b'compressibility'', 'b'cebus'', 'b'recitative'', 'b'solon'',
Nearest to 'b'many'': 'b'some'', 'b'these'', 'b'several'', 'b'other'', 'b'recitative'', 'b'their'',
Nearest to 'b't'': 'b'dewey'', 'b'we'', 'b'reasonably'', 'b'warring'', 'b'also'', 'b'sparse'',
Nearest to 'b'each'': 'b'their'', 'b'the'', 'b'armageddon'', 'b'callithrix'', 'b'some''

Average loss at step 62000 is 4.791


Average loss at step 64000 is 4.803


Average loss at step 66000 is 4.974


Average loss at step 68000 is 4.929


Average loss at step 70000 is 4.779
Nearest to 'b'a'': 'b'akita'', 'b'the'', 'b'sotho'', 'b'mitral'', 'b'johansen'', 'b'callithrix'',
Nearest to 'b'series'': 'b'michelob'', 'b'callithrix'', 'b'irt'', 'b'fictitious'', 'b'rick'', 'b'footsteps'',
Nearest to 'b'however'': 'b'but'', 'b'although'', 'b'that'', 'b'which'', 'b'and'', 'b'though'',
Nearest to 'b'then'': 'b'agouti'', 'b'that'', 'b'hyperbolic'', 'b'akita'', 'b'it'', 'b'unofficially'',
Nearest to 'b'were'': 'b'are'', 'b'was'', 'b'have'', 'b'had'', 'b'be'', 'b'been'',
Nearest to 'b'in'': 'b'at'', 'b'from'', 'b'on'', 'b'during'', 'b'through'', 'b'since'',
Nearest to 'b'music'': 'b'backward'', 'b'trinomial'', 'b'compressibility'', 'b'cebus'', 'b'recitative'', 'b'thermometer'',
Nearest to 'b'many'': 'b'some'', 'b'these'', 'b'several'', 'b'other'', 'b'recitative'', 'b'thaler'',
Nearest to 'b't'': 'b'dewey'', 'b'we'', 'b'reasonably'', 'b'warring'', 'b'sparse'', 'b'suffix'',
Nearest to 'b'each'': 'b'the'', 'b'their'', 'b'armageddon'', 'b't

Average loss at step 72000 is 4.809


Average loss at step 74000 is 4.783


Average loss at step 76000 is 4.872


Average loss at step 78000 is 4.789


Average loss at step 80000 is 4.821
Nearest to 'b'a'': 'b'the'', 'b'mitral'', 'b'johansen'', 'b'akita'', 'b'albuquerque'', 'b'sotho'',
Nearest to 'b'series'': 'b'irt'', 'b'fictitious'', 'b'michelob'', 'b'footsteps'', 'b'rick'', 'b'matter'',
Nearest to 'b'however'': 'b'but'', 'b'although'', 'b'that'', 'b'though'', 'b'and'', 'b'agouti'',
Nearest to 'b'then'': 'b'agouti'', 'b'pontificia'', 'b'hyperbolic'', 'b'akita'', 'b'that'', 'b'cegep'',
Nearest to 'b'were'': 'b'are'', 'b'was'', 'b'have'', 'b'had'', 'b'be'', 'b'been'',
Nearest to 'b'in'': 'b'at'', 'b'during'', 'b'on'', 'b'from'', 'b'vec'', 'b'through'',
Nearest to 'b'music'': 'b'trinomial'', 'b'backward'', 'b'recitative'', 'b'cebus'', 'b'compressibility'', 'b'thermometer'',
Nearest to 'b'many'': 'b'some'', 'b'these'', 'b'several'', 'b'other'', 'b'all'', 'b'thaler'',
Nearest to 'b't'': 'b'dewey'', 'b'we'', 'b'warring'', 'b'spiritualist'', 'b'amalthea'', 'b'nobody'',
Nearest to 'b'each'': 'b'their'', 'b'the'', 'b'armageddon'', 'b'thaler'

Average loss at step 82000 is 4.819


Average loss at step 84000 is 4.786


Average loss at step 86000 is 4.807


Average loss at step 88000 is 4.688


Average loss at step 90000 is 4.764
Nearest to 'b'a'': 'b'akita'', 'b'the'', 'b'mitral'', 'b'johansen'', 'b'anacharsis'', 'b'sotho'',
Nearest to 'b'series'': 'b'irt'', 'b'michelob'', 'b'callithrix'', 'b'fictitious'', 'b'matter'', 'b'footsteps'',
Nearest to 'b'however'': 'b'but'', 'b'that'', 'b'although'', 'b'though'', 'b'and'', 'b'agouti'',
Nearest to 'b'then'': 'b'agouti'', 'b'hyperbolic'', 'UNK', 'b'pontificia'', 'b'akita'', 'b'that'',
Nearest to 'b'were'': 'b'are'', 'b'was'', 'b'have'', 'b'had'', 'b'be'', 'b'been'',
Nearest to 'b'in'': 'b'during'', 'b'at'', 'b'nine'', 'b'and'', 'b'cegep'', 'b'of'',
Nearest to 'b'music'': 'b'trinomial'', 'b'backward'', 'b'recitative'', 'b'cebus'', 'b'compressibility'', 'b'thermometer'',
Nearest to 'b'many'': 'b'some'', 'b'these'', 'b'several'', 'b'other'', 'b'all'', 'b'both'',
Nearest to 'b't'': 'b'dewey'', 'b'we'', 'b'frantz'', 'b'spiritualist'', 'b'reasonably'', 'b'nobody'',
Nearest to 'b'each'': 'b'the'', 'b'their'', 'b'armageddon'', 'b'thaler'', 

Average loss at step 92000 is 4.692


Average loss at step 94000 is 4.618


Average loss at step 96000 is 4.719


Average loss at step 98000 is 4.633


Average loss at step 100000 is 4.668
Nearest to 'b'a'': 'b'the'', 'b'akita'', 'b'sotho'', 'b'mitral'', 'b'albuquerque'', 'b'johansen'',
Nearest to 'b'series'': 'b'michelob'', 'b'irt'', 'b'callithrix'', 'b'matter'', 'b'fictitious'', 'b'footsteps'',
Nearest to 'b'however'': 'b'but'', 'b'that'', 'b'although'', 'b'and'', 'b'though'', 'b'thaler'',
Nearest to 'b'then'': 'UNK', 'b'agouti'', 'b'cleve'', 'b'pontificia'', 'b'hyperbolic'', 'b'akita'',
Nearest to 'b'were'': 'b'are'', 'b'have'', 'b'was'', 'b'had'', 'b'be'', 'b'been'',
Nearest to 'b'in'': 'b'at'', 'b'during'', 'b'on'', 'b'within'', 'b'from'', 'b'crandall'',
Nearest to 'b'music'': 'b'trinomial'', 'b'recitative'', 'b'compressibility'', 'b'backward'', 'b'cebus'', 'b'thermometer'',
Nearest to 'b'many'': 'b'some'', 'b'these'', 'b'several'', 'b'other'', 'b'all'', 'b'both'',
Nearest to 'b't'': 'b'dewey'', 'b'we'', 'b'frantz'', 'b'spiritualist'', 'b'nobody'', 'b'sparse'',
Nearest to 'b'each'': 'b'the'', 'b'their'', 'b'armageddon'', 'b'thale

# 5. Visualize the embeding

In [34]:
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  #in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i,:]
        plt.scatter(x, y)
        plt.annotate(label,
                 xy=(x, y),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')
    #plt.show()
    plt.savefig("examples.jpg")##保存到本地
# Plot
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
plot_only = 500
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only,:])
labels = [reverse_dictionary[i] for i in xrange(plot_only)]
plot_with_labels(low_dim_embs, labels)