# Preparations

In [0]:
from google.colab import drive

# Optional: mounts Google Drive at /content/gdrive.
drive.mount('/content/gdrive') # in case you want to save results to your Drive

In [0]:
import nltk
# Fetch the 'punkt' tokenizer data; the later cells call nltk.word_tokenize,
# which relies on it.
nltk.download('punkt')

In [0]:

!wget http://mattmahoney.net/dc/text8.zip
!unzip text8.zip

!wget https://raw.githubusercontent.com/uhh-lt/dl-seminar/master/code/wikipedia-corpus-2mb.txt

# Gensim

In [0]:
import nltk

corpus_path = './text8'

# Read the complete corpus file, tokenize it and lowercase every token.
with open(corpus_path) as f:
  corpus_raw = list(map(str.lower, nltk.word_tokenize(f.read())))

# Cut the flat token stream into consecutive pseudo-sentences of n tokens
# (the last one may be shorter).
n = 200
corpus_token = [corpus_raw[start:start + n]
                for start in range(0, len(corpus_raw), n)]


# It is also possible to use the following code:

# from gensim.models.word2vec import Text8Corpus
# corpus_token = Text8Corpus(corpus_path)

In [0]:
from gensim.models import Word2Vec


# Train word2vec on the 200-token chunks built above.
#   size=100     dimensionality of the word vectors
#   window=2     maximum distance between the current and the predicted word
#   min_count=3  words seen fewer than 3 times are ignored
#   workers=4    number of worker threads
#   iter=3       number of passes (epochs) over the corpus
# NOTE: `size` and `iter` are the gensim 3.x parameter names (renamed to
# `vector_size` and `epochs` in gensim 4).
model = Word2Vec(corpus_token, size=100, window=2, min_count=3, workers=4, iter=3)
# Persist the trained model so the next cell can reload it from disk.
model.save('gensim_word2vec.model')

In [0]:
# Reload the model that the training cell saved to disk.
model = Word2Vec.load('gensim_word2vec.model')

# TODO: find the words most similar to the word 'three'. You can use gensim's
# `most_similar` function (e.g. via `model.wv.most_similar(...)`).
# ...

# Utility code

In [0]:
import os
from collections import defaultdict
import nltk


def ensure_dir(f):
    """Create directory `f` (including missing parent directories) if needed.

    Args:
        f: Path of the directory that must exist after the call.

    Raises:
        FileExistsError: if `f` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of
    # `if not os.path.exists(f): os.makedirs(f)` and, unlike that check, does
    # not silently accept a regular file sitting at the requested path.
    os.makedirs(f, exist_ok=True)
      

def load_corpus(filename, lower_case=True, min_frequency=3):
    """Load a text file, tokenize it, count occurrences and build a word encoder.

    Args:
        filename: Path of the text corpus; it is read line by line.
        lower_case: If True, every line is lowercased before tokenization.
        min_frequency: Words occurring fewer than this many times receive no
            id of their own.

    Returns:
        A tuple `(corpus, word2index)`:
        corpus is the flat list of all tokens in file order;
        word2index is a `defaultdict(int)` mapping each sufficiently frequent
        word to a unique id, sorted by decreasing frequency and starting at 1.
        Id 0 is reserved for unknown words: it is what the defaultdict returns
        for any word that has no entry.
    """
    corpus = []

    with open(filename, 'r') as in_file:
        for line_num, line in enumerate(in_file):
            if line_num % 1000 == 0:
                print('Loading {} processing line {}'.format(filename, line_num))

            # strip() also removes the trailing newline.
            line = line.strip()
            if lower_case:
                line = line.lower()

            corpus.extend(nltk.word_tokenize(line))

    print('Compute word encoder...')
    word_counter = defaultdict(int)

    for word in corpus:
        word_counter[word] += 1

    frequent_words = [item for item in word_counter.items()
                      if item[1] >= min_frequency]
    # Stable sort: words with equal counts keep their first-seen order.
    frequent_words.sort(key=lambda item: item[1], reverse=True)

    word2index = defaultdict(int)

    # Ids start at 1. Starting at 0 would give the most frequent word the same
    # id as the defaultdict's default for unknown words, so it could not be
    # told apart from out-of-vocabulary tokens (and save_vocabulary, which
    # skips id 0, would drop it).
    for index, (word, _count) in enumerate(frequent_words, start=1):
        word2index[word] = index

    print('done')

    return corpus, word2index


def save_vocabulary(output_dir, word2index):
    """Write the vocabulary to `<output_dir>/vocabulary.tsv`, one word per line.

    The first line is always '<UNK>'. It is followed by every word whose id
    is greater than 0, ordered by ascending id; entries with id 0 are skipped.

    Args:
        output_dir: Directory the file is written into.
        word2index: Mapping from word to integer id.

    Returns:
        The path of the written vocabulary file.
    """
    vocab_fpath = os.path.join(output_dir, 'vocabulary.tsv')

    by_index = sorted(word2index.items(), key=lambda pair: pair[1])
    print(by_index[:100])

    lines = ['<UNK>']
    lines.extend(word for word, index in by_index if index > 0)

    with open(vocab_fpath, 'w') as vocab_file_out:
        vocab_file_out.writelines(entry + '\n' for entry in lines)

    print("Saved vocabulary to:", vocab_fpath)

    return vocab_fpath

# Model and training code

In [0]:
import tensorflow as tf
import math
import time
import numpy as np

from tensorflow.contrib.tensorboard.plugins import projector
from tensorflow.nn import sigmoid_cross_entropy_with_logits


def build_graph2(vocabulary_size, num_sampled, embedding_size,
                learning_rate, optimizer_type):
    """Build the word2vec graph with a hand-written NCE loss.

    Args:
        vocabulary_size: Number of distinct word ids (rows of the embedding matrix).
        num_sampled: Number of negative classes sampled per batch.
        embedding_size: Dimensionality of the embedding vectors.
        learning_rate: Learning rate handed to the optimizer.
        optimizer_type: "adam" selects AdamOptimizer; anything else selects
            GradientDescentOptimizer.

    Returns:
        A tuple `(embeddings, contexts, targets, optimizer, loss)`:
        the embedding variable, the int32 input placeholder of shape [batch],
        the int32 label placeholder of shape [batch, 1], the training op and
        the scalar loss tensor.
    """
    print('Using custom nce_loss function.')

    contexts = tf.placeholder(tf.int32, shape=[None])
    targets = tf.placeholder(tf.int32, shape=[None, 1])

    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))

    # Input vectors of the batch: [batch, embedding_size].
    embed = tf.nn.embedding_lookup(embeddings, contexts)

    # The candidate sampler requires int64 class ids.
    true_classes = tf.cast(targets, tf.int64)

    # Draw the negative classes from a log-uniform (Zipfian) distribution over
    # ids. This approximates the unigram distribution only if ids are ordered
    # by decreasing frequency (load_corpus in this notebook orders them so).
    sampled_ids, true_expected, sampled_expected = tf.nn.log_uniform_candidate_sampler(
        true_classes=true_classes,
        num_true=1,
        num_sampled=num_sampled,
        unique=True,
        range_max=vocabulary_size)

    # Sampling is not differentiable: keep gradients away from the sampled
    # indices and from the expected counts.
    sampled_ids = tf.stop_gradient(sampled_ids)
    true_expected = tf.stop_gradient(true_expected)
    sampled_expected = tf.stop_gradient(sampled_expected)

    # Output vectors of the true classes [batch, embedding_size] and of the
    # sampled negatives [num_sampled, embedding_size].
    true_weights = tf.nn.embedding_lookup(nce_weights, tf.reshape(true_classes, [-1]))
    sampled_weights = tf.nn.embedding_lookup(nce_weights, sampled_ids)

    # Scores of the true pairs ([batch, 1]) and of every input against every
    # sampled negative ([batch, num_sampled]).
    true_logits = tf.expand_dims(tf.reduce_sum(embed * true_weights, axis=1), 1)
    sampled_logits = tf.matmul(embed, sampled_weights, transpose_b=True)

    # NCE correction: subtract the log of the expected number of times each
    # class is drawn by the sampler.
    true_logits -= tf.log(true_expected)
    sampled_logits -= tf.log(sampled_expected)

    # Binary classification "real pair" (label 1) vs. "noise pair" (label 0).
    logits = tf.concat([true_logits, sampled_logits], axis=1)
    labels = tf.concat([tf.ones_like(true_logits),
                        tf.zeros_like(sampled_logits)], axis=1)
    per_example_loss = tf.reduce_sum(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits),
        axis=1)
    loss = tf.reduce_mean(per_example_loss)

    if optimizer_type == "adam":
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
    else:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)

    return embeddings, contexts, targets, optimizer, loss


def build_graph(vocabulary_size, num_sampled, embedding_size,
                learning_rate, optimizer_type):
    """Build the word2vec graph using TensorFlow's built-in `tf.nn.nce_loss`.

    Args:
        vocabulary_size: Number of distinct word ids (rows of the embedding matrix).
        num_sampled: Number of negative classes sampled per batch.
        embedding_size: Dimensionality of the embedding vectors.
        learning_rate: Learning rate handed to the optimizer.
        optimizer_type: "adam" selects AdamOptimizer; anything else selects
            GradientDescentOptimizer.

    Returns:
        A tuple `(embeddings, contexts, targets, optimizer, loss)`:
        the embedding variable, the int32 input placeholder of shape [batch],
        the int32 label placeholder of shape [batch, 1], the training op and
        the scalar loss tensor.
    """
    print('Using built-in TF nce_loss function.')

    # Placeholders for inputs
    contexts = tf.placeholder(tf.int32, shape=[None])
    targets = tf.placeholder(tf.int32, shape=[None, 1])

    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))

    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Look up the embedding vector of every input id: [batch, embedding_size].
    embed = tf.nn.embedding_lookup(embeddings, contexts)

    # nce_loss returns one loss value per example; average them over the
    # batch. A fresh set of `num_sampled` negative classes is drawn each time
    # the loss is evaluated.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=targets,
                       inputs=embed,
                       num_sampled=num_sampled,
                       num_classes=vocabulary_size))

    if optimizer_type == "adam":
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
    else:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)

    return embeddings, contexts, targets, optimizer, loss


def generate_batch(corpus_num, batch_size, skip_gram=True):
    """Generate a batch as two numpy arrays of (i) context and (ii) target word ids.

    `batch_size` positions are drawn uniformly at random from the interior of
    the corpus (never the first or last token). Every position contributes
    two training pairs, one with its left and one with its right neighbour.

    E.g. for "the quick brown fox jumped over the lazy dog":
      CBOW (skip_gram=False): (the, quick), (brown, quick), (quick, brown), ...
        => contexts are the neighbours, targets the centre words.
      Skip-gram (skip_gram=True): (quick, the), (quick, brown), (brown, quick), ...
        => contexts and targets reversed.

    Args:
        corpus_num: Sequence of word ids (list or 1-D numpy array) with at
            least 3 entries.
        batch_size: Number of positions to sample; the batch holds
            2 * batch_size pairs.
        skip_gram: If True produce skip-gram pairs, otherwise CBOW pairs.

    Returns:
        A tuple `(contexts, targets)` of int32 arrays with shapes
        (2 * batch_size,) and (2 * batch_size, 1).

    Raises:
        ValueError: if the corpus has fewer than 3 tokens, so that no position
            has both a left and a right neighbour.
    """
    corpus_num = np.asarray(corpus_num)
    if len(corpus_num) < 3:
        raise ValueError('corpus_num needs at least 3 tokens to build context pairs')

    # Centre positions in [1, len - 2], so both neighbours always exist.
    centers = np.random.randint(1, len(corpus_num) - 1, size=batch_size)

    # Interleave left and right neighbours: [left_0, right_0, left_1, right_1, ...]
    neighbour_ids = np.empty(batch_size * 2, dtype=np.int32)
    neighbour_ids[0::2] = corpus_num[centers - 1]
    neighbour_ids[1::2] = corpus_num[centers + 1]

    # Every centre word appears twice, once per neighbour.
    center_ids = np.repeat(corpus_num[centers], 2).astype(np.int32)

    if skip_gram:
        contexts, targets = center_ids, neighbour_ids
    else:
        contexts, targets = neighbour_ids, center_ids

    return contexts, targets.reshape(-1, 1)
   
  
def train(corpus_num, word2index, vocabulary_size, num_samples, steps,
          optimizer_type, learning_rate, embedding_size, skip_gram,
          batch_size, save_path, use_custom_loss):
    """Train word embeddings and store them together with the vocabulary.

    Builds the graph (`build_graph2` if `use_custom_loss` else `build_graph`),
    runs `steps` batches produced by `generate_batch`, writes TensorBoard
    summaries and checkpoints to a fresh `w2v_logs_<timestamp>` directory and
    saves `vocabulary.tsv` and `embeddings.npy` into `save_path`.

    Args:
        corpus_num: Corpus as a sequence of word ids.
        word2index: Mapping from word to id, written out via save_vocabulary.
        vocabulary_size: Number of rows of the embedding matrix.
        num_samples: Number of negative samples for the NCE loss.
        steps: Number of training steps.
        optimizer_type: "adam" or anything else for plain gradient descent.
        learning_rate: Learning rate of the optimizer.
        embedding_size: Dimensionality of the embeddings.
        skip_gram: True for skip-gram batches, False for CBOW batches.
        batch_size: Number of sampled positions per batch.
        save_path: Directory for vocabulary.tsv and embeddings.npy.
        use_custom_loss: Select the hand-written NCE loss graph.
    """
    with tf.device('/cpu'):
        with tf.Session() as sess:
            f_build_graph = build_graph2 if use_custom_loss else build_graph

            embeddings, contexts, targets, optimizer, loss = f_build_graph(
                vocabulary_size, num_samples, embedding_size, learning_rate,
                optimizer_type)

            # Save summary of the training process - can be analyzed with TensorBoard later
            timestamp = str(int(time.time()))
            logs_dir = 'w2v_logs_' + timestamp
            ensure_dir(logs_dir)
            # Make sure the output directory exists before writing into it
            # (an empty save_path means the current directory).
            if save_path:
                ensure_dir(save_path)
            vocab_fpath = save_vocabulary(save_path, word2index)

            print('Writing summaries and checkpoints to logdir:' + logs_dir)
            model_ckpt_fpath = os.path.join(logs_dir, 'model.ckpt')
            loss_summary = tf.summary.scalar('loss', loss)
            config = projector.ProjectorConfig()
            embedding = config.embeddings.add()
            embedding.tensor_name = embeddings.name
            # Use an absolute path: the projector resolves relative metadata
            # paths against the log directory, where the vocabulary file is
            # not stored.
            embedding.metadata_path = os.path.abspath(vocab_fpath)
            train_summary_op = tf.summary.merge_all()
            summary_writer = tf.summary.FileWriter(logs_dir, sess.graph)

            try:
                projector.visualize_embeddings(summary_writer, config)

                # Initialization
                saver = tf.train.Saver(tf.global_variables())
                sess.run(tf.global_variables_initializer())
                losses = []

                # Batched SGD training
                for current_step in range(steps):
                    inputs, labels = generate_batch(corpus_num, batch_size=batch_size, skip_gram=skip_gram)
                    feed_dict = {contexts: inputs, targets: labels}
                    _, cur_loss = sess.run([optimizer, loss], feed_dict=feed_dict)

                    losses.append(cur_loss)

                    if current_step % 100 == 0 and current_step != 0:
                        summary_str = sess.run(train_summary_op, feed_dict=feed_dict)
                        summary_writer.add_summary(summary_str, current_step)

                    if current_step % 1000 == 0:
                        print('step',current_step,'mean loss:', np.mean(np.asarray(losses)))
                        saver.save(sess, model_ckpt_fpath, current_step)
                        losses = []

                # Final checkpoint, so the state shown in TensorBoard matches
                # the embeddings exported below instead of lagging behind by
                # up to 999 steps.
                saver.save(sess, model_ckpt_fpath, steps)
            finally:
                # Flush pending events and release the event file, also when
                # training is interrupted.
                summary_writer.close()

            embeddings_np = sess.run(embeddings)
            np.save(os.path.join(save_path, 'embeddings.npy'), embeddings_np)

# Launch training

In [0]:
import pandas as pd


from types import SimpleNamespace

# Plain attribute container for the run configuration. (An empty pandas
# Series is not meant to be used as an attribute bag: option names can collide
# with Series attributes, and constructing it without a dtype warns in newer
# pandas versions.)
OPTIONS = SimpleNamespace(
    corpus="wikipedia-corpus-2mb.txt",  # Path to the input text corpus. Change to 'text8' to train good embeddings.
    num_neg_samples=2,      # Number of negative samples
    steps=100000,           # Number of training steps
    learning_rate=1.,       # The learning rate
    embedding_size=100,     # Size of the embedding
    lower_case=True,        # Whether the corpus should be lowercased
    skip_gram=False,        # Whether skip gram should be used or CBOW
    min_frequency=3,        # Words that occur less often than this are discarded as OOV
    optimizer_type="sgd",   # Optimizer type: 'adam' or 'sgd'
    batch_size=128,         # Batch size
    save_path='./',         # Path to the directory to save results (vocabulary, embedding matrix)
    use_custom_loss=False,  # Switch to True if you want to do an advanced exercise
                            # and implement nce loss by yourself in build_graph2 function
)

In [0]:
corpus, word2index = load_corpus(filename=OPTIONS.corpus,
                                 lower_case=OPTIONS.lower_case,
                                 min_frequency=OPTIONS.min_frequency)
# Encode the corpus as ids. .get() maps words without an entry to id 0
# without touching the mapping; plain indexing on the defaultdict returned by
# load_corpus would insert every out-of-vocabulary token as a new key.
corpus_num = [word2index.get(word, 0) for word in corpus]
print(len(corpus_num))

print('First few tokens of corpus:', corpus[:100])
print('First few tokens of corpus_num:', list(corpus_num[:100]))

# generate_batch and train work on a numpy array of ids.
corpus_num = np.asarray(corpus_num)

In [0]:
# Start from an empty default graph so the cell can be re-run.
tf.reset_default_graph()

# Collect the hyper-parameters first; the call below then only wires them up.
train_settings = dict(
    vocabulary_size=max(corpus_num) + 1,  # ids run from 0 to the largest id seen
    num_samples=OPTIONS.num_neg_samples,
    steps=OPTIONS.steps,
    optimizer_type=OPTIONS.optimizer_type,
    learning_rate=OPTIONS.learning_rate,
    embedding_size=OPTIONS.embedding_size,
    skip_gram=OPTIONS.skip_gram,
    batch_size=OPTIONS.batch_size,
    save_path=OPTIONS.save_path,
    use_custom_loss=OPTIONS.use_custom_loss,
)

train(corpus_num, word2index, **train_settings)

# Inspect embeddings

In [0]:
# Read back the vocabulary and the embedding matrix stored in OPTIONS.save_path

embeddings = np.load(os.path.join(OPTIONS.save_path, 'embeddings.npy'))
with open(os.path.join(OPTIONS.save_path, 'vocabulary.tsv')) as f:
  vocab = [line.strip() for line in f]

# One vocabulary entry per embedding row, otherwise the two files do not belong together.
assert len(vocab) == embeddings.shape[0]

# word -> embedding vector (row i of the matrix belongs to line i of the vocabulary)
embeddings_dict = dict(zip(vocab, embeddings))

In [0]:
from scipy.spatial.distance import cosine


def k_neighbors(vocab, embeddings, wv, word, k):
  """Return the `k` words whose embeddings are most similar to that of `word`.

  Similarity is the cosine similarity (1 - cosine distance).

  Args:
      vocab: List of words; `vocab[i]` belongs to `embeddings[i]`.
      embeddings: 2-D array with one embedding vector per row.
      wv: Mapping from word to its embedding vector.
      word: The query word.
      k: Number of neighbours to return.

  Returns:
      A list of at most `k` `(word, similarity)` tuples, most similar first.
      The query word itself is never part of the result.

  Raises:
      KeyError: if `word` is not contained in `wv`.
  """
  if word not in wv:
    raise KeyError('Word {!r} is not in the vocabulary'.format(word))

  query = wv[word]
  scored = [(1.0 - cosine(query, vector), candidate)
            for candidate, vector in zip(vocab, embeddings)
            if candidate != word]
  # Highest similarity first; sort on the score only so that ties never fall
  # back to comparing the words.
  scored.sort(key=lambda pair: pair[0], reverse=True)

  return [(candidate, similarity) for similarity, candidate in scored[:k]]


# Look up 5 words similar to 'three' using the embeddings loaded above.
k_neighbors(vocab, embeddings, embeddings_dict, 'three', 5)