<a href="https://colab.research.google.com/github/thedatadj/natural-language-processing/blob/main/word_embeddings/continuous_bag_of_words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Create word embeddings using the continuous bag-of-words (CBOW) model.

# Preprocess the data

Load the data

In [None]:
# Read the whole corpus into a single string. The encoding is spelled out so
# that decoding does not depend on the locale of the machine running the cell.
with open("/content/shakespeare.txt", encoding="utf-8") as file:
    data = file.read()
# Preview the first 50 characters
data[:50]

'O for a Muse of fire, that would ascend\nThe bright'

The data is a continuous string.

Replace punctuation marks with a `.`

In [None]:
# Tool
import re

# Substitute a full stop for every comma, '!', '?', ':' and '-' in the text
punctuations = r"[,!?:-]"
data = re.compile(punctuations).sub('.', data)
data[:50]

'O for a Muse of fire. that would ascend\nThe bright'

Tokenize the data

In [None]:
# Tool
import nltk
from nltk.tokenize import word_tokenize
# Fetch the Punkt models that word_tokenize relies on. NLTK 3.9+ loads the
# 'punkt_tab' resource instead of 'punkt', so both are requested; a release
# that does not know one of the names only reports it and carries on.
nltk.download('punkt')
nltk.download('punkt_tab')
# Also search the working directory for NLTK resources
nltk.data.path.append('.')

# Tokenize: `data` changes from one long string to a list of token strings
data = word_tokenize(data)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Inspect the container type that word_tokenize produced
type(data)

list

`data` is now a list of tokens.

In [None]:
# Preview the first ten tokens
data[:10]

['O', 'for', 'a', 'Muse', 'of', 'fire', '.', 'that', 'would', 'ascend']

Lower case all words.

In [None]:
# Fold every token to lower case so that 'Muse' and 'muse' become one entry
data = list(map(str.lower, data))
data[:10]

['o', 'for', 'a', 'muse', 'of', 'fire', '.', 'that', 'would', 'ascend']

Drop non-alphabetical tokens, keeping only the `.` separator.

In [None]:
# Retain the '.' separator and purely alphabetic tokens; discard the rest
data = [tok for tok in data if tok == '.' or tok.isalpha()]
data[:10]

['o', 'for', 'a', 'muse', 'of', 'fire', '.', 'that', 'would', 'ascend']

In [None]:
# Number of tokens left after filtering
len(data)

60432

There is a total of 60,432 tokens in this dataset.

Build a frequency dictionary that also acts as the vocabulary of unique tokens in the dataset.

In [None]:
# Tally how often each token occurs; the keys double as the vocabulary
freq = {}
for token in data:
    freq[token] = freq.get(token, 0) + 1
print("Frequency of 'the' is", freq['the'])

Frequency of 'the' is 1521


Build two dictionaries:
* `word2Ind`: maps tokens to indices.
* `Ind2word`: maps indices to tokens.

In [None]:
# Sort the distinct tokens so that each one gets a reproducible index
tokens = sorted(set(data))
# token -> index
word2Ind = {token: idx for idx, token in enumerate(tokens)}
# index -> token (the inverse mapping)
Ind2word = dict(enumerate(tokens))

In [None]:
# Look up the index assigned to the token 'king'
word2Ind['king']

2744

In [None]:
# Map an index back to its token
Ind2word[2743]

'kinds'

Store the size of the vocabulary of tokens in a variable.

In [None]:
# Vocabulary size: `freq` holds one key per distinct token
V = len(freq)
V

5775

# Training the model

## Initialization


In [None]:
# Tool
import numpy as np

def initialize_model(N, V, random_seed=1):
    '''
    Create randomly initialised weights and biases for the two-layer network.

    Args:
        N: size of the hidden layer (rows of W1, columns of W2).
        V: size of the vocabulary (columns of W1, rows of W2).
        random_seed: seed handed to np.random.seed so that the
            initialisation is reproducible.

    Returns: W1, W2, b1, b2
        W1 has shape (N, V), W2 has shape (V, N), b1 has shape (N, 1) and
        b2 has shape (V, 1); all values are drawn with np.random.rand.
    '''
    # Honour the caller's seed instead of always seeding with 1
    np.random.seed(random_seed)
    W1 = np.random.rand(N, V)
    W2 = np.random.rand(V, N)
    b1 = np.random.rand(N, 1)
    b2 = np.random.rand(V, 1)
    return W1, W2, b1, b2

## Softmax activation function

In [None]:
def softmax(z):
    '''
    Apply the softmax function along axis 0 of `z`.

    Args:
        z: array of scores; each column (axis 0) is normalised on its own.

    Returns:
        Array with the same shape as `z` whose entries along axis 0 are
        non-negative and sum to 1.
    '''
    z = np.asarray(z)
    if z.size == 0:
        # Nothing to normalise; return an empty float array of the same shape
        return np.exp(z)
    # Subtracting the per-column maximum leaves the result unchanged
    # mathematically but keeps np.exp from overflowing on large scores.
    shifted = z - np.max(z, axis=0, keepdims=True)
    numerator = np.exp(shifted)
    denominator = np.sum(numerator, axis=0, keepdims=True)
    return numerator / denominator

## Forward propagation

In [None]:
def forward_prop(x, W1, W2, b1, b2):
    '''
    Run the network forward: a ReLU hidden layer followed by a linear
    output layer.

    Args:
        x: input array, multiplied on the left by W1.
        W1, b1: weights and bias of the hidden layer.
        W2, b2: weights and bias of the output layer.

    Returns: z, h
        z: output-layer scores (before softmax).
        h: hidden-layer activations after the ReLU.
    '''
    pre_activation = np.dot(W1, x) + b1
    # ReLU: clamp negative values to zero
    h = np.maximum(0, pre_activation)
    z = np.dot(W2, h) + b2
    return z, h

## Cost function

In [None]:
def compute_cost(y, yhat, batch_size):
    '''
    Cross-entropy cost of the predictions, averaged over the batch.

    Args:
        y: target array (one-hot columns in this notebook).
        yhat: predicted probabilities, broadcastable against `y`.
        batch_size: divisor used to average the summed loss.

    Returns:
        The cost as a 0-d numpy array: -1 / batch_size * sum(y * log(yhat)).
    '''
    y, yhat = np.broadcast_arrays(np.asarray(y, dtype=float),
                                  np.asarray(yhat, dtype=float))
    # Only evaluate the log where the target is non-zero. Entries with
    # y == 0 contribute nothing, and skipping them avoids 0 * log(0) = nan
    # when a non-target probability has underflowed to exactly 0.
    loss = np.zeros(y.shape, dtype=float)
    active = y != 0
    loss[active] = y[active] * np.log(yhat[active])
    cost = -1 / batch_size * np.sum(loss)
    return np.squeeze(cost)

## Backpropagation

In [None]:
def back_prop(x, yhat, y, h, W1, W2, b1, b2, batch_size):
    '''
    Compute the gradients of the cost with respect to W1, W2, b1 and b2.

    Args:
        x: input array fed to the network.
        yhat: predictions for `x`.
        y: targets for `x`.
        h: hidden-layer activations from the forward pass.
        W1, W2, b1: current parameters used in the computation.
        b2: accepted for symmetry with the other parameters; not read here.
        batch_size: divisor used to average the gradients.

    Returns: grad_W1, grad_W2, grad_b1, grad_b2
    '''
    scale = 1 / batch_size
    output_error = yhat - y

    # Recompute the hidden pre-activation to find where the ReLU was inactive
    pre_activation = np.dot(W1, x) + b1
    hidden_error = np.dot(W2.T, output_error)
    hidden_error[pre_activation < 0] = 0

    grad_W1 = scale * np.dot(hidden_error, x.T)
    grad_W2 = scale * np.dot(output_error, h.T)
    grad_b1 = scale * np.sum(hidden_error, axis=1, keepdims=True)
    grad_b2 = scale * np.sum(output_error, axis=1, keepdims=True)
    return grad_W1, grad_W2, grad_b1, grad_b2

## Gradient descent

Create a function that yields batches of the training data.

In [None]:
from collections import defaultdict

In [None]:
def get_idx(words, word2Ind):
    '''
    Translate a sequence of words into their vocabulary indices.

    Args:
        words: iterable of tokens.
        word2Ind: mapping from token to index.

    Returns:
        List with the index of each word, in the same order as `words`.

    Raises:
        KeyError: if a word is missing from `word2Ind`.
    '''
    return [word2Ind[word] for word in words]

In [None]:
def pack_idx_with_frequency(context_words, word2Ind):
    '''
    Pair each context word's index with its count inside the window.

    Args:
        context_words: sequence of tokens forming the context window.
        word2Ind: mapping from token to index.

    Returns:
        List of (index, count) tuples, one per entry of `context_words`
        (a word occurring twice therefore yields two identical tuples).
    '''
    counts = defaultdict(int)
    for word in context_words:
        counts[word] += 1
    indices = get_idx(context_words, word2Ind)
    return [(index, counts[word])
            for index, word in zip(indices, context_words)]

In [None]:
def get_vectors(data, word2Ind, V, C):
    '''
    Endlessly generate (x, y) training pairs by sliding a window over `data`.

    Args:
        data: sequence of tokens.
        word2Ind: mapping from token to index.
        V: length of the generated vectors.
        C: number of context words taken on each side of the center word.

    Yields:
        x: length-V vector holding, for each context word, its count in the
           window divided by the number of context words.
        y: length-V one-hot vector marking the center word.

    The generator never terminates: once the window reaches the end of
    `data` it prints a notice and restarts from position C.
    '''
    position = C
    while True:
        target = np.zeros(V)
        context_vec = np.zeros(V)
        target[word2Ind[data[position]]] = 1

        before = data[(position - C):position]
        after = data[(position + 1):(position + C + 1)]
        window = before + after
        window_size = len(window)
        for index, count in pack_idx_with_frequency(window, word2Ind):
            context_vec[index] = count / window_size

        yield context_vec, target

        position += 1
        # Wrap around when there is no room left for a full right-hand context
        if position >= len(data) - C:
            print("i is being set to", C)
            position = C

In [None]:
def get_batches(data, word2Ind, V, C, batch_size):
    '''
    Group the (x, y) pairs produced by get_vectors into batches.

    Args:
        data: sequence of tokens.
        word2Ind: mapping from token to index.
        V: length of each x / y vector.
        C: number of context words on each side of the center word.
        batch_size: number of examples per batch.

    Yields:
        (batch_x, batch_y): two arrays with one example per column.
        Because get_vectors never stops, neither does this generator.
    '''
    batch_x = []
    batch_y = []
    for x, y in get_vectors(data, word2Ind, V, C):
        # Store the example first, then emit once the batch is full, so the
        # example that completes a batch is never dropped.
        batch_x.append(x)
        batch_y.append(y)
        if len(batch_x) >= batch_size:
            yield np.array(batch_x).T, np.array(batch_y).T
            batch_x = []
            batch_y = []

In [None]:
def gradient_descent(data, word2Ind, N, V, num_iters, alpha=0.03,
                     random_seed=282, initialize_model=initialize_model,
                     get_batches=get_batches, forward_prop=forward_prop,
                     softmax=softmax, compute_cost=compute_cost,
                     back_prop=back_prop):
    '''
    Train the network with mini-batch gradient descent.

    Args:
        data: sequence of tokens.
        word2Ind: mapping from token to index.
        N: size of the hidden layer.
        V: size of the vocabulary.
        num_iters: number of parameter updates (batches) to perform.
        alpha: learning rate.
        random_seed: seed forwarded to initialize_model.
        initialize_model, get_batches, forward_prop, softmax, compute_cost,
        back_prop: the building blocks to use; overridable by the caller.

    Returns: W1, W2, b1, b2
        The parameters after `num_iters` updates (the initial parameters
        when `num_iters` is not positive).
    '''
    W1, W2, b1, b2 = initialize_model(N, V, random_seed=random_seed)
    batch_size = 128
    iters = 0
    C = 2
    if num_iters <= 0:
        return W1, W2, b1, b2
    for x, y in get_batches(data, word2Ind, V, C, batch_size):
        z, h = forward_prop(x, W1, W2, b1, b2)
        yhat = softmax(z)
        cost = compute_cost(y, yhat, batch_size)
        # Report progress on every tenth iteration
        if ((iters + 1) % 10 == 0):
            print(f"iters: {iters + 1} cost: {cost:.6f}")
        grad_W1, grad_W2, grad_b1, grad_b2 = back_prop(x, yhat, y,
                                                       h, W1, W2, b1,
                                                       b2, batch_size)
        W1 = W1 - alpha * grad_W1
        W2 = W2 - alpha * grad_W2
        b1 = b1 - alpha * grad_b1
        b2 = b2 - alpha * grad_b2
        # The batch generator is endless, so the loop has to count its own
        # iterations and stop once the requested number has been reached.
        iters += 1
        if iters >= num_iters:
            break
    return W1, W2, b1, b2

<table>
    <tr>
        <td>
            Based on
        </td>
        <td>
            Assignment from the Natural Language Processing Specialization in Coursera.
        </td>
    </tr>
</table>