In [38]:
import math
import numpy as np
import tensorflow as tf

In [2]:
# Load the novel as a list of raw lines; iterating a text file yields one
# line per item with its trailing newline kept.
with open("./pride_and_prejudice.txt", "r") as f:
    corpus = list(f)

In [3]:
# First 100 raw lines, taken while `corpus` still holds untokenised text.
# NOTE(review): `sample` is not referenced by any later cell shown here.
sample = corpus[:100]

In [4]:
import re
def preprocess_text(corpus):
    """Turn raw text lines into a flat list of normalised tokens.

    Steps, applied in order:
      1. lowercase each line and split it on whitespace (lines that are
         just a newline are skipped);
      2. delete every character that is not an ASCII letter or digit;
      3. drop tokens that became empty;
      4. replace tokens made only of digits with the placeholder ``_NUM_``.

    Step 2 deletes punctuation rather than splitting on it, so words joined
    by punctuation merge into a single token (``"well-known"`` becomes
    ``"wellknown"``).

    Args:
        corpus: iterable of strings, one per line of text.

    Returns:
        list of str: the tokens, in their original order.
    """
    # Raw strings keep `\d` from being read as an (invalid) string escape.
    non_alnum = re.compile(r'[^A-Za-z0-9]+')
    all_digits = re.compile(r'^\d+$')

    # A nested comprehension flattens in linear time; sum(lists, []) copies
    # the accumulated list on every addition, which is quadratic.
    tokens = [
        token
        for line in corpus
        if line != "\n"
        for token in line.lower().split()
    ]
    tokens = [non_alnum.sub('', token) for token in tokens]
    tokens = [token for token in tokens if len(token) > 0]
    tokens = [all_digits.sub('_NUM_', token) for token in tokens]

    return tokens

# NOTE: this rebinds `corpus` from raw lines to tokens, so the cell is not
# idempotent: re-running it feeds tokens back in and the `_NUM_` placeholder is
# lowercased and stripped to `num`. Re-run the file-loading cell first.
corpus = preprocess_text(corpus)

In [5]:
# Number of tokens in the preprocessed corpus.
len(corpus)

124543

In [6]:
# Distinct tokens in sorted order; a word's position in this list is what the
# id mappings are built from. Print the size and the first 100 entries.
vocab = sorted(set(corpus))
print("Vocabulary size: ", len(vocab))
print(vocab[:100])

Vocabulary size:  7044
['13420txt', '13420zip', '15th', '18th', '1a', '1b', '1c', '1d', '1e', '1e1', '1e2', '1e3', '1e4', '1e5', '1e6', '1e7', '1e8', '1e9', '1f', '1f1', '1f2', '1f3', '1f4', '1f5', '1f6', '26th', '501c3', '_NUM_', 'a', 'abatement', 'abhorrence', 'abhorrent', 'abide', 'abiding', 'abilities', 'able', 'ablution', 'abode', 'abominable', 'abominably', 'abominate', 'abound', 'about', 'above', 'abroad', 'abrupt', 'abruptly', 'abruptness', 'absence', 'absent', 'absolute', 'absolutely', 'absurd', 'absurdities', 'absurdity', 'abundant', 'abundantly', 'abuse', 'abused', 'abusing', 'abusive', 'accede', 'acceded', 'acceding', 'accent', 'accents', 'accept', 'acceptable', 'acceptance', 'accepted', 'accepting', 'access', 'accessed', 'accessible', 'accident', 'accidental', 'accidentally', 'accompanied', 'accompany', 'accompanying', 'accomplished', 'accomplishedshe', 'accomplishment', 'accomplishments', 'accordance', 'according', 'accordingly', 'accosted', 'account', 'accounted', 'accou

- Total words: ~125,000 (124,543 tokens after preprocessing)
- Vocabulary size: ~7,100 (7,044 distinct tokens)


## Word-to-id representation

In [7]:
# Two-way lookup between words and integer ids. Ids are 1-based, following the
# sorted vocabulary order, so id 0 is never assigned to a word.
word_to_ids = {word: idx for idx, word in enumerate(vocab, start=1)}
id_to_words = {idx: word for idx, word in enumerate(vocab, start=1)}

In [8]:
# Encode the whole token stream as integer ids, then show the first 100.
word_ids = [word_to_ids[word] for word in corpus]
print(word_ids[:100])

[6267, 4911, 2887, 2025, 4332, 4852, 319, 4797, 859, 3545, 548, 6302, 2025, 3532, 2574, 6267, 6655, 4332, 365, 367, 504, 4222, 1409, 319, 6937, 269, 4222, 5367, 6867, 7026, 3931, 1390, 3534, 2754, 3534, 572, 4388, 5390, 3534, 6516, 6267, 6251, 4332, 6267, 4911, 2887, 3735, 3274, 6937, 6302, 2025, 4388, 4369, 504, 7002, 6347, 4852, 319, 4797, 550, 3545, 548, 4756, 1521, 545, 28, 28, 2025, 28, 5213, 1521, 3587, 28, 3662, 6645, 4326, 28, 28, 3655, 2146, 974, 5658, 2114, 6665, 5953, 4332, 6302, 4911, 2887, 2025, 4852, 319, 4797, 4889, 859, 344, 6757, 4852, 319, 4797]


In [10]:
def create_word_pair(word_ids, C):
    """Build (center, context) skip-gram pairs from a sequence of word ids.

    Every position with C neighbours on both sides is used as a center, so
    the first and last C positions only ever appear as context. For each
    center the pairs are emitted left context first (farthest to nearest),
    then right context (nearest to farthest).

    Args:
        word_ids: indexable sequence of word ids.
        C: context window radius (neighbours taken on each side).

    Returns:
        list of (center_id, context_id) tuples, 2*C per center position;
        empty when no position has C neighbours on both sides.
    """
    # Relative positions of the context words: -C..-1, then 1..C.
    offsets = [*range(-C, 0), *range(1, C + 1)]
    stop = len(word_ids) - C

    return [
        (word_ids[pos], word_ids[pos + shift])
        for pos in range(C, stop)
        for shift in offsets
    ]

In [24]:
# Small worked example: 8 ids with a context radius of 2.
example_pairs = create_word_pair([1,2,3,4,5,6,7,8], 2)
print(example_pairs)

[(3, 1), (3, 2), (3, 4), (3, 5), (4, 2), (4, 3), (4, 5), (4, 6), (5, 3), (5, 4), (5, 6), (5, 7), (6, 4), (6, 5), (6, 7), (6, 8)]


In [22]:
import random 
def create_batches(word_pairs, batch_size):
    """Shuffle the pairs and split them into consecutive mini-batches.

    The shuffle is applied to a copy, so the caller's ``word_pairs`` keeps
    its order (the previous version reordered the argument in place).

    Args:
        word_pairs: sequence of training pairs.
        batch_size: maximum number of pairs per batch; must be positive.

    Returns:
        list of lists: every batch holds ``batch_size`` pairs except
        possibly the last one, which holds the remainder. Empty input
        gives an empty list.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Work on a copy so the shuffle does not leak out to the caller.
    shuffled = list(word_pairs)
    random.shuffle(shuffled)

    return [
        shuffled[start:start + batch_size]
        for start in range(0, len(shuffled), batch_size)
    ]

In [25]:
# Split the example pairs into shuffled batches of 4.
create_batches(example_pairs, 4)

[[(5, 4), (4, 2), (6, 7), (5, 3)],
 [(6, 5), (4, 6), (3, 4), (6, 4)],
 [(6, 8), (4, 5), (4, 3), (3, 5)],
 [(3, 2), (5, 6), (3, 1), (5, 7)]]

In [29]:
# Scratch check: zip(*batch) regroups a batch of pairs into one tuple of first
# elements and one tuple of second elements.
list(zip(*[(5, 4), (4, 2), (6, 7), (5, 3)]))

[(5, 4, 6, 5), (4, 2, 7, 3)]

## Model

- Input: a batch of (id_word, id_context_word) pairs
- Two embedding matrices, each of size (V x D): P and Q
-


In [50]:
# --- Hyperparameters ---------------------------------------------------------
# Word ids start at 1, so one extra row is allocated and row 0 stays free
# (it could serve as an unknown-word id).
vocabulary_size = len(vocab) + 1
context_size = 3 # context radius: neighbours taken on each side of a center word
embedding_size = 20
batch_size = 128

# --- Trainable parameters ----------------------------------------------------
# Input-side word vectors, one row per word id: [vocabulary_size, embedding_size].
embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0) )

# Output-side class weights and biases consumed by the NCE loss.
nce_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))

nce_biases = tf.Variable(tf.zeros([vocabulary_size]))


# --- Placeholders for one batch ------------------------------------------------
train_inputs = tf.placeholder(tf.int32, shape=[None])     # fed with the first element of each pair
train_labels = tf.placeholder(tf.int32, shape=[None, 1])  # fed with the second element, as a column

# Look the input vectors up once: [batch_size, embedding_size].
train_embeddings = tf.nn.embedding_lookup(params=embeddings, ids=train_inputs)

# Same tensor under its second name. This used to be a duplicate lookup op that
# nothing consumed; the alias keeps the name available without the extra op.
train_input_vectors = train_embeddings

loss = tf.nn.nce_loss(
    weights = nce_weights,  #A Tensor of shape [num_classes, dim]. The (possibly-partitioned) class embeddings.
    biases = nce_biases, #biases: A Tensor of shape [num_classes]. The class biases.
    labels = train_labels, # A Tensor of type int64 and shape [batch_size, num_true]. The target classes.
    inputs = train_embeddings, # A Tensor of shape [batch_size, dim]. The forward activations of the input network.
    num_sampled = 5,  # negative classes sampled per batch
    num_classes = vocabulary_size, # An int. The number of possible classes.
    num_true=1,  # An int. The number of target classes per training example.
    sampled_values=None,
    remove_accidental_hits=False,
    partition_strategy='mod',
    name='nce_loss'
)

# nce_loss yields one value per example; average them into a scalar batch loss.
loss = tf.reduce_mean(loss)

optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)

In [51]:
# Pairs built from only the first 1000 ids, as a small subset.
# NOTE(review): neither name is used by the training cell shown in this notebook.
sample_word_ids = word_ids[:1000]
sample_pairs = create_word_pair(sample_word_ids, context_size)

In [52]:
# Full training set: every (center, context) pair from the whole id sequence.
word_pairs = create_word_pair(word_ids, context_size)

In [54]:
n_epoch = 10

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(n_epoch):
        # Reshuffle and re-batch the pairs at the start of every epoch.
        batches = create_batches(batch_size=batch_size, word_pairs=word_pairs)
        for (i, batch) in enumerate(batches):
            # Split the batch of pairs into a tuple of inputs and a tuple of labels.
            batch_inputs, batch_labels = list(zip(*batch))

            # reshape to [batch_size, 1] dimension
            batch_labels = np.expand_dims(batch_labels, axis=1)

            # Fetch the loss in the same run as the update step. This avoids a
            # second forward pass, and the logged value is the loss of the step
            # that was actually taken rather than one recomputed afterwards
            # with newly drawn negative samples.
            _step_out, batch_loss = sess.run(
                [optimizer, loss],
                feed_dict={train_inputs:batch_inputs, train_labels:batch_labels})
            if (i+1) % 1000 == 0:
                print("epoch: ", epoch, ", batch_number: ", i+1, ", batch loss: ", batch_loss)

    # Copy the trained word vectors out as a NumPy array while the session is
    # still open; variable values are not reachable once the session closes.
    final_embeddings = sess.run(embeddings)
        
            


epoch:  0 , batch_number:  1000 , batch loss:  29.7847
epoch:  0 , batch_number:  2000 , batch loss:  23.9917
epoch:  0 , batch_number:  3000 , batch loss:  24.8392
epoch:  0 , batch_number:  4000 , batch loss:  16.3176
epoch:  0 , batch_number:  5000 , batch loss:  21.3982
epoch:  1 , batch_number:  1000 , batch loss:  2.71581
epoch:  1 , batch_number:  2000 , batch loss:  8.54126
epoch:  1 , batch_number:  3000 , batch loss:  27.7271
epoch:  1 , batch_number:  4000 , batch loss:  16.753
epoch:  1 , batch_number:  5000 , batch loss:  0.664705
epoch:  2 , batch_number:  1000 , batch loss:  19.8797
epoch:  2 , batch_number:  2000 , batch loss:  8.65233
epoch:  2 , batch_number:  3000 , batch loss:  1.45254
epoch:  2 , batch_number:  4000 , batch loss:  0.7806
epoch:  2 , batch_number:  5000 , batch loss:  15.8002
epoch:  3 , batch_number:  1000 , batch loss:  0.230135
epoch:  3 , batch_number:  2000 , batch loss:  1.75615
epoch:  3 , batch_number:  3000 , batch loss:  1.1344
epoch:  3 ,

In [33]:
# Scratch check: enumerate yields (index, item) pairs starting at 0.
list(enumerate([1,2,3]))

[(0, 1), (1, 2), (2, 3)]

In [48]:
# Total number of (center, context) training pairs.
len(word_pairs)

747222