# `Word2Vec`

In [1]:
# Vanilla PyLibraries
import os
import sys
import datetime as dt

# Third party Librarires
import numpy as np
import tensorflow as tf

# Custom libraries
from dataset import TextDataset

## initializing `Word2Vec`

In [2]:
data_dir = 'datasets/wiki.valid.raw'
save_file = 'datasets/saved/data.pkl'

w2v = TextDataset(data_dir=data_dir, logging=True)
w2v.create()
# w2v.save(save_file, force=True)
# load saved object
# w2v = w2v.load(save_file)

Processing 8,224 of 8,224 sentences. Time taken: 0:00:32.243954

In [None]:
import pickle

In [10]:
f = open(save_file, 'wb')
pickle.dump(w2v, f)
# pickle.Pickler(f).dump(w2v)

OSError: [Errno 22] Invalid argument

### Hyperparameters

In [None]:
# Model/Network
embedding_dim = 50
learning_rate = 1e-3
vocab_size = w2v.vocab_size
# Training
epochs = 10000
save_interval = 50
batch_size = 25

## Training with a `tensorflow` model

In [None]:
# Model's placeholders
X = tf.placeholder(tf.float32, shape=[None, vocab_size], name='X_palceholder')
y = tf.placeholder(tf.float32, shape=[None, vocab_size], name='y_placeholder')
y_true = tf.argmax(y, axis=1)

### Building the Network

In [None]:
# Input -> Hidden
W1 = tf.Variable(tf.truncated_normal(shape=[vocab_size, embedding_dim]))
b1 = tf.Variable(tf.zeros(shape=[embedding_dim]))
hidden = tf.matmul(X, W1) + b1

In [None]:
# Hidden -> Output
W2 = tf.Variable(tf.truncated_normal(shape=[embedding_dim, vocab_size]))
b2 = tf.Variable(tf.zeros(shape=[vocab_size]))
y_hat = tf.matmul(hidden, W2) + b2
y_norm = tf.nn.softmax(y_hat)
y_pred = tf.argmax(y_norm, axis=1)

### Loss, training and accuracy

In [None]:
xentropy = tf.nn.softmax_cross_entropy_with_logits(logits=y_hat, labels=y, name='xentropy')
loss = tf.reduce_mean(xentropy, name='loss')
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_step = optimizer.minimize(loss)

### Accuracy

In [None]:
correct = tf.equal(y_pred, y_true)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

## Initializing global variables and  `tf.Session()`

In [None]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

## Tensorboard

In [None]:
# Files & directories
save_path = 'models/'  # Trained model
tensorboard_dir = 'tensorboard/'  # summary protobuf
logdir = os.path.join(tensorboard_dir, 'log')   # summary's file writer

# Summaries
tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()

# saver & writer
saver = tf.train.Saver()
writer = tf.summary.FileWriter(logdir=logdir, graph=sess.graph)

# Restore or create
if tf.gfile.Exists(save_path):
    if len(os.listdir(save_path)) > 1:
        saver.restore(sess=sess, save_path=save_path)
else:
    tf.gfile.MakeDirs(save_path)

## Training

In [None]:
train_start = dt.datetime.now()
for i in range(epochs):
    # Train
    X_batch, y_batch = w2v.next_batch(batch_size=batch_size, shuffle=True)
    sess.run(train_step, feed_dict={X: X_batch, y: y_batch})
    # Save at interval
    if i % save_interval == 0:
        # Tensorboard summary
        summary = sess.run(merged, feed_dict={X: X_batch, y: y_batch})
        writer.add_summary(summary=summary, global_step=i)
        # Save model
        saver.save(sess=sess, save_path=save_path)
    sys.stdout.write('\r{:,} of {:,} epochs\tTime taken: {}'.format(i+1, 
                                                                    epochs, 
                                                                    dt.datetime.now() - train_start))

## Word vectors

In [None]:
word_vectors = sess.run(W1+b1)
print(word_vectors.shape)

In [None]:
word_vectors[w2v.word2id['you']]