In [43]:
import os
import numpy as np
from typing import List, Tuple, Union

import tensorflow as tf

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Layer, Flatten

# CBOW

Continuous Bag of Words implementation. I hope it makes sense.

## Preprocessing

Read in the Alice text (`alice.txt`) and then tokenize it into a flat sequence of word indices.

In [26]:
# NOTE(review): no explicit encoding is passed, so the platform default is used — confirm
# that this matches how alice.txt is stored.
with open("alice.txt", "r") as f:
    raw_corpus = f.readlines()

# Strip punctuation, tabs and newlines; the appended '\'' additionally removes apostrophes.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n' + '\'')
tokenizer.fit_on_texts(raw_corpus)

# Flatten the per-line lists of token ids into one long sequence.
# A nested comprehension is linear in the corpus size, whereas sum(lists, [])
# re-copies the accumulated list on every addition (quadratic). Empty lines
# contribute nothing, so no explicit filtering is needed.
corpus = [
    token
    for sequence in tokenizer.texts_to_sequences(raw_corpus)
    for token in sequence
]

In [27]:
# corpus_size: total number of tokens; voc_size: largest token id + 1, so that
# every id in the corpus is a valid row index of a (voc_size x dim) table.
corpus_size, voc_size = len(corpus), max(corpus) + 1

In [28]:
# Sanity check: (number of tokens in the corpus, largest token id + 1).
corpus_size, voc_size

(27330, 2568)

## Dataset generation

Our training task for CBOW is to guess a word based on the `window_size` words surrounding it on either side.
Let's generate a dataset of `(window_words, word)` pairs, where `window_words` has a fixed length of `2 * window_size` (windows at the edges of the corpus are padded).

In [29]:
def generate_dataset(
    corpus: List[int],
    window_size: int
) -> Tuple[np.ndarray, np.ndarray]:
    """Build CBOW training pairs from a tokenized corpus.

    For every position in ``corpus`` the context consists of up to
    ``window_size`` tokens before and up to ``window_size`` tokens after that
    position (the centre token itself is excluded); the target is the centre
    token.

    Args:
        corpus: Token ids in reading order.
        window_size: Number of context tokens taken on each side.

    Returns:
        ``(x, y)`` where ``x`` holds one padded context per row, always
        ``2 * window_size`` columns wide, and ``y`` holds the centre tokens.

    Raises:
        ValueError: If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError("window_size must be non-negative")

    # Slicing clamps at the end of the list, so only the lower bound needs an
    # explicit guard (a negative start would wrap around to the end).
    raw_x = [
        corpus[max(0, word_idx - window_size) : word_idx]
        + corpus[word_idx + 1 : word_idx + window_size + 1]
        for word_idx in range(len(corpus))
    ]

    # Pin the width explicitly: without maxlen the result is only as wide as
    # the longest context actually present, which is narrower than
    # 2 * window_size for a corpus shorter than one full window.
    x = pad_sequences(raw_x, maxlen=2 * window_size)
    y = np.array(corpus)
    return x, y

In [30]:
# Number of context words taken on each side of the target word.
window_size = 4
x, y = generate_dataset(corpus=corpus, window_size=window_size)

## Model definition and training

We then define the CBOW model and train it.

We need a custom `AverageLayer` to average out the embeddings of the window words output from the embedding layer.

In [31]:
class AverageLayer(Layer):
    """Average a sequence of vectors over the time axis (axis 1).

    Maps an input of shape ``(batch, steps, features)`` to
    ``(batch, features)``. When the previous layer supplies a mask (e.g.
    ``Embedding(mask_zero=True)``), masked steps are excluded from the mean so
    that padding does not dilute the average of the real context words.
    """

    def __init__(self, **kwargs):
        # Forward standard Layer options (name, dtype, ...) to the base class.
        super(AverageLayer, self).__init__(**kwargs)
        # Declare that call() consumes an incoming mask.
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is None:
            return tf.math.reduce_mean(inputs, axis=1)

        # (batch, steps) boolean mask -> (batch, steps, 1) weights of 0/1.
        weights = tf.expand_dims(tf.cast(mask, inputs.dtype), axis=-1)
        summed = tf.math.reduce_sum(inputs * weights, axis=1)
        counts = tf.math.reduce_sum(weights, axis=1)
        # A fully masked row has count 0; divide_no_nan yields 0 instead of NaN.
        return tf.math.divide_no_nan(summed, counts)

    def compute_mask(self, inputs, mask=None):
        # The time axis is reduced away, so there is no mask to pass on.
        return None

Then we can construct the model itself. We can vary the size of the embedding, the optimizer and the loss function.

In [32]:
# Dimensionality of the learned word vectors.
embedding_size = 100

cbow = Sequential()
# Token ids -> dense vectors; mask_zero=True marks id 0 (the padding value) as masked.
cbow.add(Embedding(voc_size, embedding_size, mask_zero=True, input_length=2 * window_size))
# Collapse the window of context vectors into a single averaged vector.
cbow.add(AverageLayer())
# Score every vocabulary entry as the candidate centre word.
cbow.add(Dense(voc_size, activation="softmax", use_bias=False))

# Targets are integer token ids, hence the sparse variant of cross-entropy.
cbow.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
cbow.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 8, 100)            256800    
_________________________________________________________________
average_layer_2 (AverageLaye (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2568)              256800    
Total params: 513,600
Trainable params: 513,600
Non-trainable params: 0
_________________________________________________________________


In [33]:
# Early-stopping callback on validation loss. It is NOT passed to fit() below,
# so it currently has no effect; to use it, add
# validation_split=0.1, callbacks=[early_stopping] to the fit() call.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)

# Assign the returned History object to `_` to keep it out of the cell output.
_ = cbow.fit(
    x,
    y,
    batch_size=1,
    epochs=3,
)

Train on 27330 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


## Evaluation of embeddings

We first get the embedding layer from the model

In [40]:
# The Embedding layer is the first layer added to the Sequential model above.
embedding_layer = cbow.layers[0]

We then define a function to retrieve the embedding of a word. A custom `tokenizer` or `embedding_layer` can be passed in; otherwise the ones defined
above are used.

In [41]:
def embed(
    word: str,
    tokenizer: Tokenizer = tokenizer,
    embedding_layer: Embedding = embedding_layer
) -> tf.Tensor:
    """Look up the embedding of a single word.

    Args:
        word: The word to embed; it must map to exactly one token id.
        tokenizer: Tokenizer used to turn ``word`` into a token id.
        embedding_layer: Layer that maps token ids to vectors.

    Returns:
        The embedding as a column tensor of shape ``(embedding_dim, 1)``.

    Raises:
        ValueError: If ``word`` is out of vocabulary or splits into several
            tokens. Previously such input silently produced an empty or a
            concatenated tensor, which only failed (or yielded NaN) later on.
    """
    token_ids = tokenizer.texts_to_sequences([word])[0]
    if len(token_ids) != 1:
        raise ValueError(
            "expected exactly one in-vocabulary token, but {!r} maps to {} tokens".format(
                word, len(token_ids)
            )
        )
    # Keep the (batch=1, steps=1) input shape for the layer call.
    output_tensor = embedding_layer(np.array([token_ids]))
    return tf.reshape(output_tensor, (-1, 1))

We also define a cosine-similarity helper and a function that retrieves the `n` closest words to a given word vector.

In [48]:
def cosine_similarity(
    v: Union[np.ndarray, tf.Tensor],
    w: Union[np.ndarray, tf.Tensor]) -> float:
    """Return the cosine of the angle between ``v`` and ``w``.

    Both arguments may be ``tf.Tensor`` or ``np.ndarray``. Inputs with more
    than one dimension (e.g. column vectors) are flattened before comparison.
    """
    accepted_types = (tf.Tensor, np.ndarray)
    assert isinstance(v, accepted_types), "v must be tf.Tensor or np.ndarray"
    assert isinstance(w, accepted_types), "w must be tf.Tensor or np.ndarray"

    def _as_vector(value):
        # Tensors are converted to NumPy; anything above 1-D is flattened.
        array = value.numpy() if isinstance(value, tf.Tensor) else value
        return array.flatten() if array.ndim > 1 else array

    left = _as_vector(v)
    right = _as_vector(w)

    norm_product = np.linalg.norm(left) * np.linalg.norm(right)
    return left @ right.T / norm_product
    
def closest_n_words(
    embedding: tf.Tensor,
    n: int = 1,
    tokenizer: Tokenizer = tokenizer,
    embedding_layer: Embedding = embedding_layer
) -> List[str]:
    """Return the ``n`` vocabulary words whose embeddings are closest to ``embedding``.

    Closeness is measured by cosine similarity; the result is ordered from
    most to least similar.

    Args:
        embedding: Query vector (any shape that flattens to the embedding size).
        n: Number of words to return; values <= 0 give an empty list.
        tokenizer: Supplies the vocabulary (``index_word``) to search.
        embedding_layer: Layer that maps token ids to vectors.

    Returns:
        Up to ``n`` words, most similar first.
    """
    # Guard n <= 0 explicitly: a slice such as [-0:] would select everything.
    if n <= 0 or not tokenizer.index_word:
        return []

    token_ids = np.array(list(tokenizer.index_word.keys()))
    all_words = np.array(list(tokenizer.index_word.values()))

    # Use the tokenizer/embedding_layer that were passed in (they used to be
    # ignored in favour of embed()'s defaults) and fetch all vectors with one
    # batched lookup of shape (vocab, 1) instead of one layer call per word.
    looked_up = embedding_layer(token_ids.reshape(-1, 1))
    all_embeddings = tf.reshape(looked_up, (len(token_ids), -1)).numpy()

    # Convert the query once rather than on every comparison.
    query = embedding.numpy() if isinstance(embedding, tf.Tensor) else np.asarray(embedding)
    similarities = np.array([cosine_similarity(row, query) for row in all_embeddings])

    # argsort is ascending; reverse it and keep the first n for "most similar first".
    best_first = np.argsort(similarities)[::-1][:n]
    return all_words[best_first].tolist()

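Ideally, we would expect $e_\text{king} - e_\text{man} \approx e_\text{queen} - e_\text{woman}$. Rearranging gives $e_\text{man} \approx e_\text{king} - e_\text{queen} + e_\text{woman}$, so if we look for the closest words to
$e_\text{king} - e_\text{queen} + e_\text{woman}$, "man" should be among them, and the vector itself should be close to $e_\text{man}$.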

In [45]:
# Right-hand side of the rearranged analogy; expected to lie near the embedding of "man".
analogy = embed("king") - embed("queen") + embed("woman")

In [46]:
# The ten vocabulary words most similar (by cosine similarity) to the analogy vector.
closest_n_words(analogy, 10)

['woman',
 'prize',
 'curious',
 'useful',
 'whiting',
 'askance',
 'grin',
 'measure',
 'pity',
 'plainly']

In [47]:
# How close the analogy vector is to the actual embedding of "man" (1.0 would be identical direction).
cosine_similarity(analogy, embed("man"))

0.441663