# `Word2Vec`

In [None]:
# Vanilla PyLibraries
import os
import sys

# Third-party libraries
import nltk
import numpy as np
import tensorflow as tf

## Loading text

In [None]:
file_name = 'wikitext/wiki.val.raw'

# Read the whole corpus inside a context manager so the file handle is
# closed as soon as the read finishes instead of being left open.
with open(file_name, mode='r', encoding='utf-8') as corpus_file:
    corpus_text = corpus_file.read()

# Lower-case the text so tokens that differ only in case collapse together.
corpus_text = corpus_text.lower()

## `word2id` & `id2word`

In [None]:
# Vocabulary: every distinct token produced by the NLTK word tokenizer.
unique_words = set(nltk.word_tokenize(corpus_text))
vocab_size = len(unique_words)

# Assign ids from a *sorted* copy of the vocabulary. Iteration order of a set
# of strings can differ between interpreter runs (string hash randomisation),
# which would make the ids baked into any saved training data impossible to
# reproduce. Sorting gives the same word <-> id mapping on every run.
ordered_vocab = sorted(unique_words)

word2id = {w: i for i, w in enumerate(ordered_vocab)}
id2word = dict(enumerate(ordered_vocab))

## Sentences

In [None]:
# Split the corpus into sentences first, then tokenize each sentence into
# words, giving a list of token lists (one inner list per sentence).
raw_sentences = nltk.sent_tokenize(corpus_text)
sentences = list(map(nltk.word_tokenize, raw_sentences))

## Constructing the (`word`, `context`) pairs

In [None]:
# Build the training pairs: for every token, emit one [word, context] pair for
# each neighbour that lies within WINDOW_SIZE positions on either side.
data = []
WINDOW_SIZE = 2

for s, sent in enumerate(sentences):
    for i, word in enumerate(sent):
        # Clamp the window to the sentence boundaries; `end` is exclusive.
        start = max(i - WINDOW_SIZE, 0)
        end = min(i + WINDOW_SIZE + 1, len(sent))
        word_window = sent[start:end]
        for j, context in enumerate(word_window, start=start):
            # Skip only the centre position itself. Comparing positions rather
            # than string identity (`is not`) keeps the result independent of
            # whether equal tokens happen to share one string object.
            if j != i:
                data.append([word, context])
    # Carriage return keeps the progress counter on a single line.
    sys.stdout.write('\r{:,} of {:,} sentences.'.format(s+1, len(sentences)))

## `one_hot` vectors

In [None]:
def one_hot(idx, vocab_size):
    """Return a one-hot vector of length `vocab_size`.

    Args:
        idx: Position to set to 1.
        vocab_size: Length of the returned vector.

    Returns:
        A 1-D float NumPy array that is 1.0 at `idx` and 0.0 elsewhere.
    """
    vector = np.zeros(shape=(vocab_size,))
    vector[idx] = 1.0
    return vector

## Creating training vectors

In [None]:
# Sanity check: show the first ten [word, context] pairs.
print(data[:10])

In [None]:
# Encode every [word, context] pair as two one-hot rows: the centre word goes
# into X_train and its context word into y_train.
X_train, y_train = [], []

for count, (center, neighbour) in enumerate(data, start=1):
    center_id = word2id[center]
    neighbour_id = word2id[neighbour]
    X_train.append(one_hot(center_id, vocab_size))
    y_train.append(one_hot(neighbour_id, vocab_size))
    # Carriage return keeps the progress counter on a single line.
    sys.stdout.write('\r{:,} of {:,}'.format(count, len(data)))

# Stack the per-pair vectors into arrays.
X_train, y_train = np.asarray(X_train), np.asarray(y_train)

In [None]:
# Show the shapes of the stacked input and target arrays.
print(X_train.shape, y_train.shape)

In [None]:
# Persist both training arrays to 'data.npz'. They are passed positionally,
# so NumPy stores them under its default keys ('arr_0', 'arr_1').
np.savez('data.npz', X_train, y_train)

## Training with a `tensorflow` model

In [None]:
# Model's placeholders
# Model's placeholders: X receives one-hot rows for the centre words and y the
# one-hot rows for their context words; the batch dimension is left open.
# The op name of X was misspelled ('X_palceholder'); it now matches the
# naming used for y so the graph is consistent.
X = tf.placeholder(tf.float32, shape=[None, vocab_size], name='X_placeholder')
y = tf.placeholder(tf.float32, shape=[None, vocab_size], name='y_placeholder')

### Hyperparameters

In [None]:
# Width of the hidden layer, i.e. the second dimension of the
# [vocab_size, embedding_dim] weight matrix W1.
embedding_dim = 50

In [None]:
# Hidden layer: a linear projection from the vocabulary-sized one-hot input
# down to `embedding_dim` units.
W1 = tf.Variable(tf.truncated_normal(shape=[vocab_size, embedding_dim]))
b1 = tf.Variable(tf.zeros(shape=[embedding_dim]))

# Use the bias defined above (`b1`); the previous `b` was never defined and
# raised a NameError when this cell ran.
hidden = tf.matmul(X, W1) + b1

In [None]:
# Output layer: project the hidden activations back to vocabulary size and
# normalise the scores with a softmax.
W2 = tf.Variable(tf.truncated_normal(shape=[embedding_dim, vocab_size]))
b2 = tf.Variable(tf.zeros(shape=[vocab_size]))

logits = tf.matmul(hidden, W2) + b2
pred = tf.nn.softmax(logits)