# `Word2Vec`

In [1]:
# Vanilla PyLibraries
import os
import sys

# Third party Librarires
import nltk
import numpy as np
import tensorflow as tf

## Loading text

In [11]:
file_name = 'datasets/wiki.valid.raw'
corpus_text = open(file_name, mode='r', encoding='utf-8').read()[:100000]
corpus_text = corpus_text.lower()
print('{:,}'.format(len(corpus_text)))

100,000


## `word2id` & `id2word`

In [3]:
unique_words = set(nltk.word_tokenize(corpus_text))
vocab_size = len(unique_words)

word2id = {w: i for i, w in enumerate(unique_words)}
id2word = {i: w for i, w in enumerate(unique_words)}

## Sentences

In [4]:
raw_sentences = nltk.sent_tokenize(corpus_text)
sentences = [nltk.word_tokenize(sent) for sent in raw_sentences]

## Constructing the `word` by `context`

In [5]:
data = []
WINDOW_SIZE = 2

for s, sent in enumerate(sentences):
    for i, word in enumerate(sent):
        start = max(i - WINDOW_SIZE, 0)
        end = min(WINDOW_SIZE+i, len(sent)) + 1
        word_window = sent[start:end]
        for context in word_window:
            if context is not word:
                data.append([word, context])
    sys.stdout.write('\r{:,} of {:,} sentences.'.format(s+1, len(sentences)))

8,224 of 8,224 sentences.

## `one_hot` vectors

In [6]:
def one_hot(idx, vocab_size):
    temp = np.zeros(shape=[vocab_size])
    temp[idx] = 1.
    return temp

## Creating training vectors

In [7]:
print(data[:10])

[['=', 'homarus'], ['=', 'gammarus'], ['homarus', '='], ['homarus', 'gammarus'], ['homarus', '='], ['gammarus', '='], ['gammarus', 'homarus'], ['gammarus', '='], ['gammarus', 'homarus'], ['=', 'homarus']]


In [8]:
X_train = []
y_train = []

start = dt.datetime.now()
for i, word_data in enumerate(data):
    X_train.append(one_hot(word2id[ word_data[0] ], vocab_size))
    y_train.append(one_hot(word2id[ word_data[1] ], vocab_size))
    sys.stdout.write('\r{:,} of {:,}\tSo far = {}'.format(i+1, len(data)))

X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

236,592 of 815,298

KeyboardInterrupt: 

In [None]:
print(X_train.shape, y_train.shape)

In [None]:
np.savez('data.npz', X_train, y_train)

## Training with a `tensorflow` model

In [None]:
# Model's placeholders
X = tf.placeholder(tf.float32, shape=[None, vocab_size], name='X_palceholder')
y = tf.placeholder(tf.float32, shape=[None, vocab_size], name='y_placeholder')
y_true = tf.argmax(y, axis=1)

In [None]:
embedding_dim = 50
learning_rate = 1e-3

### Building the Network

In [None]:
# Input -> Hidden
W1 = tf.Variable(tf.truncated_normal(shape=[vocab_size, embedding_dim]))
b1 = tf.Variable(tf.zeros(shape=[embedding_dim]))
hidden = tf.matmul(X, W1) + b

In [None]:
# Hidden -> Output
W2 = tf.Variable(tf.truncated_normal(shape=[embedding_dim, vocab_size]))
b2 = tf.Variable(tf.zeros(shape=[vocab_size]))
y_hat = tf.matmul(hidden, W2) + b2
y_norm = tf.nn.softmax(y_hat)
y_pred = tf.argmax(y_norm, axis=1)

### Loss, training and accuracy

In [None]:
xentropy = tf.nn.softmax_cross_entropy_with_logits(logits=y_hat, labels=y, name='xentropy')
loss = tf.reduce_mean(xentropy, name='loss')
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_step = optimizer.minimize(train_step)

### Accuracy

In [None]:
correct = tf.equal(y_pred, y_true)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

## Initializing global variables and  `tf.Session()`

In [None]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

## Tensorboard

In [None]:
model_path = 'model/'  # Trained model
tensorboard_dir = 'tensorboard/'  # summary protobuf
logdir = os.path.join(tensorboard_dir, 'log')   # summary's file writer
save_path = os.path.join(model_path, 'saved_chkpt')  # tf.train.Saver

# Summaries
tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()



# saver & writer
saver = tf.train.Saver()
writer = tf.summary.FileWriter(logdir=logdir, graph=sess.graph)

## Maybe restore `sess`

In [None]:
if len(os.listdir(save_path)) > 1:
    saver.restore(sess=sess, save_path=save_path)

## Training

In [None]:
epochs = 10000
save_interval = 50
batch_size = 25

for epoch in range(epochs):
    