# Word Embeddings for Restricted Access Corpora

## Copyright notice

This version (c) 2018 Fabian Offert for the WE1S project, license: [CC-BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/)

## Resources

Original word2vec paper: https://arxiv.org/pdf/1301.3781v3.pdf

C implementation commentary: https://github.com/chrisjmccormick/word2vec_commented

Python-only implementation (without negative sampling): https://github.com/cbellei/word2veclite

Keras skipgram function: https://keras.io/preprocessing/sequence/#skipgrams

Implementation of Keras skipgram function: https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/sequence.py

Full Keras example: https://github.com/Hironsan/awesome-embedding-models/blob/master/examples/skip-gram_with_ns.py

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import string
import pickle
import gensim
import os
from collections import Counter, OrderedDict
from tqdm import tqdm_notebook
from scipy.sparse import coo_matrix

import tensorflow as tf
print('Tensorflow version: ' + tf.__version__)
from tensorflow.contrib.tensorboard.plugins import projector

import keras
print('Keras version: ' + keras.__version__)
from keras.models import Sequential, Model
from keras.layers import Embedding, Reshape, Activation, Input
from keras.layers.merge import Dot
from keras.utils import Sequence
from keras.preprocessing.sequence import skipgrams, make_sampling_table
from keras.callbacks import ModelCheckpoint, TensorBoard

Tensorflow version: 1.8.0
Keras version: 2.2.0


Using TensorFlow backend.


In [3]:
# Hyperparameters
# All tunable settings for the run live in this cell; the cells below read them as globals.
corpus = 'corpora/txtlab.txt' # Path of the plain-text corpus that is read in the next cell
dim = 300 # Embedding dimensionality - word2vec default: 300
window_size = 5 # Maximum distance between a word and its context - word2vec default: 5
# min_count = 5 # word2vec default: 5 - set to 0 to disable (currently unused: the vocabulary is capped by max_vocab instead)
max_vocab = 10000 # Only the max_vocab most frequent words are kept in the vocabulary
sampling_factor = 1e-03 # Subsampling of frequent words - word2vec default: 1e-03 - set to 0 to disable
positive_samples = 10000 # Passed to WordContextMatrixDataset, which adds it to negative_samples to form its batch size (gensim.word2vec.Text8Corpus default: 10000 words/sequence → Keras skipgram function)
negative_samples = 5 # Random negative contexts generated per positive pair - word2vec default: 5
epochs = 5 # Number of epochs passed to fit_generator
dynamic_windowing = True # Weight each co-occurrence by 1/distance instead of 1 - word2vec default: True

In [4]:
# Read corpus
# The encoding is given explicitly so the result does not depend on the
# platform's locale default (assumes the corpus is stored as UTF-8 - adjust
# here if it is not).
with open(corpus, 'r', encoding='utf-8') as f:
    text = f.read()

# Make it all lower case
text = text.lower()

# Remove all punctuation (every character in string.punctuation) in a single
# pass; str.translate does the filtering in C instead of a per-character
# Python generator, which matters for a corpus of ~10^8 characters
text = text.translate(str.maketrans('', '', string.punctuation))

print('Text size: '+ str(len(text)) + ' characters')

# Truncate to a million characters for quick testing
# text = text[:1000000]

# Make list: from here on `text` is a list of whitespace-separated tokens
print('Converting text to word list...')
text = text.split()

Text size: 99554089 characters
Converting text to word list...


In [5]:
# Count every token, rank by descending frequency and keep only the
# max_vocab most frequent ones (a min_count cut-off is currently disabled)
frequencies = OrderedDict(Counter(text).most_common()[:max_vocab])

# Report the rarest word that still made it into the vocabulary
min_frequency = min(frequencies.keys(), key=lambda word: frequencies[word])
print('Minimum frequency: ' + str(frequencies[min_frequency]) + ', word: ' + str(min_frequency))

# Two-way mapping between words and integer ids; id 0 is the most frequent word
token2id = dict(zip(frequencies, range(len(frequencies))))
id2token = dict(enumerate(frequencies))

# Length of dictionary is size of vocabulary
vocabulary_size = len(token2id)

# Replace every token by its id and silently drop out-of-vocabulary tokens;
# from here on `text` is a list of integers
text = [word_id for word_id in map(token2id.get, text) if word_id is not None]

# Subsampling probabilities by frequency rank. The table gets one extra entry
# because it is read with an offset of one (sampling_table[id + 1]).
sampling_table = make_sampling_table(size=vocabulary_size + 1,
                                     sampling_factor=sampling_factor)

print('Vocabulary size: ' + str(vocabulary_size) + ' words')

Minimum frequency: 97, word: corrected
Vocabulary size: 10000 words


In [6]:
# Dense square word-context-frequency (wcf) matrix:
# row = id of the centre word, column = id of a word seen in its window
print('Filling wcf matrix...')
mat = np.zeros((vocabulary_size, vocabulary_size))
for i, wi in enumerate(tqdm_notebook(text)):
    # Window of up to window_size positions on either side, clipped to the text
    window_start = max(0, i - window_size)
    window_end = min(len(text), i + window_size + 1)
    for j in range(window_start, window_end):
        if j == i:
            continue  # a word is not its own context
        wj = text[j]
        # With dynamic windowing a context at distance d counts 1/d, otherwise 1
        weight = (1 / abs(i - j)) if dynamic_windowing else 1
        mat[wi, wj] += weight
print('Total samples in matrix: ' + str(mat.sum()) + ' samples')

# Convert to a sparse integer matrix. astype(np.int32) truncates the fractional
# part of every cell, so the integer total printed below is smaller than the
# float total printed above whenever dynamic windowing is on.
print('Creating sparse integer matrix...')
coo_mat = coo_matrix(mat.astype(np.int32), copy=True)
del mat # Reclaim memory
# LIL format, which the batch generator indexes and updates cell by cell
lil_mat = coo_mat.tolil()
print(lil_mat.sum())

Filling wcf matrix...


HBox(children=(IntProgress(value=0, max=17336804), HTML(value='')))


Total samples in matrix: 79171394.93333432 samples
Creating sparse integer matrix...
74261431


In [7]:
# SOURCE: https://github.com/nzw0301/keras-examples/blob/master/Skip-gram-with-NS.ipynb
# Skip-gram with negative sampling (SGNS): two embedding lookups whose dot
# product, squashed by a sigmoid, scores a (word, context) pair.

print('Building model...')

# First input branch: one integer id per sample, looked up in a
# (vocabulary_size x dim) embedding. The batch generator feeds the centre
# words ("words") into this input.
t_inputs = Input(shape=(1, ), dtype=np.int32)
t = Embedding(vocabulary_size, dim)(t_inputs)
# Second input branch with its own, separate embedding matrix; the batch
# generator feeds the context ids ("contexts") into this input.
c_inputs = Input(shape=(1, ), dtype=np.int32)
c  = Embedding(vocabulary_size, dim)(c_inputs)
# Dot product along axis 2, i.e. the embedding axis of the (None, 1, dim) outputs
o = Dot(axes=2)([t, c])
# Collapse the result to a single score per sample
o = Reshape((1,), input_shape=(1, 1))(o)
# Sigmoid turns the score into a value in (0, 1), matched against the 0/1 labels
o = Activation('sigmoid')(o)
sgns = Model(inputs=[t_inputs, c_inputs], outputs=o)
sgns.summary()
# Labels are 1 for pairs drawn from the matrix and 0 for random negatives,
# hence binary cross-entropy.
# NOTE(review): ModelTest exports get_weights()[0] as the word vectors;
# presumably that is the first (centre-word) embedding - confirm the weight order.
sgns.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

Building model...
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 300)       3000000     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 300)       3000000     input_2[0][0]                    
___________________________________________________________________________________________

In [8]:
class ModelTest(keras.callbacks.Callback):
    """Keras callback that prints the nearest neighbours of a query word after every epoch.

    At the end of each epoch the first weight array of the model is exported
    in word2vec text format, loaded into gensim and queried. The most recent
    gensim model stays available as ``self.gensim_model``.

    Reads the module-level ``token2id`` mapping (word -> row index).
    """

    def __init__(self, query):
        """
        Args:
            query: word (or anything ``most_similar`` accepts) to look up after each epoch.
        """
        # Let the base class set up its own attributes before adding ours
        super(ModelTest, self).__init__()
        self.query = query

    def on_epoch_end(self, epoch, logs=None):
        """Rebuild the gensim model from the current weights and print the top 3 neighbours."""
        # `logs` defaults to None rather than a shared mutable dict; it is not used here
        self.gensim_model = self.make_gensim_model(self.model.get_weights()[0])
        try:
            print(self.gensim_model.most_similar(self.query, topn=3))
        except KeyError:
            # A query word that is not in the vocabulary must not abort a
            # training run at the end of an epoch
            print('Query word not in vocabulary: ' + str(self.query))

    def make_gensim_model(self, weights):
        """Write ``weights`` to ``vectors.txt`` in word2vec text format and load it with gensim.

        Args:
            weights: 2-D array indexed by the ids in ``token2id``; one row per word.

        Returns:
            The ``KeyedVectors`` instance loaded from the file that was just written.
        """
        # TO DO: There must be a way to do this without I/O
        # Written as UTF-8 explicitly because that is the encoding gensim's
        # loader assumes by default, whatever the platform locale is.
        with open('vectors.txt', 'w', encoding='utf-8') as f:
            # Header "<number of words> <vector size>", derived from what is
            # actually written below so the two can never disagree
            f.write('{} {}\n'.format(len(token2id), weights.shape[1]))
            for token, i in token2id.items():
                f.write('{} {}\n'.format(token, ' '.join(map(str, weights[i]))))
        return gensim.models.KeyedVectors.load_word2vec_format('vectors.txt', binary=False)

In [9]:
class WordContextMatrixDataset(Sequence):
    """Keras ``Sequence`` that draws skip-gram training batches from a word-context count matrix.

    Each nonzero cell ``(i, j)`` of the matrix stands for the pair
    (word ``i``, context ``j``) and its value for how many times that pair may
    still be emitted. A batch draws distinct cells at random, emits each once
    as a positive sample (label 1), adds ``negative_samples`` uniformly random
    contexts per positive (label 0) and decrements the drawn cells. A cell
    that reaches zero leaves the pool; once the pool is empty the original
    counts are restored.

    ``get_batch`` also reads the module-level names ``sampling_factor`` and
    ``sampling_table`` to subsample frequent words.
    """

    def __init__(self, lil_mat, positive_samples=10000, negative_samples=5):
        """
        Args:
            lil_mat: sparse word-context count matrix (anything offering ``tolil``).
            positive_samples: stored, and used to derive ``batch_size``.
            negative_samples: number of random negative contexts per positive pair.
        """
        # Work on a private copy; the caller's matrix is never modified
        self.lil_mat = lil_mat.tolil(copy=True)
        self.positive_samples = positive_samples
        self.negative_samples = negative_samples
        # NOTE(review): the number of cells drawn per batch is positive_samples
        # PLUS negative_samples (10005 with the defaults), while the negatives
        # are added on top of that in get_batch. This looks unintended but is
        # kept as-is because it determines the number of batches per epoch.
        self.batch_size = positive_samples + negative_samples
        self.total_samples = self.lil_mat.sum()
        self.batches = np.ceil(self.total_samples / self.batch_size).astype(np.int32)
        self.last_batch_size = np.mod(self.total_samples, self.batch_size)
        self.vocabulary_size = self.lil_mat.shape[0]

        # Pristine copy of the counts, used to refill the pool
        self.lil_mat_restore = self.lil_mat.tolil(copy=True)

        # nonzero_indices: one (row, column) pair per nonzero cell.
        # nonzero_indices_indices: the pool, i.e. indices into nonzero_indices
        # of all cells that can still be drawn.
        self.nonzero_indices = np.array(self.lil_mat.nonzero()).T # Transpose = zip!
        self.nonzero_indices_indices = np.arange(self.nonzero_indices.shape[0])

    def restore_mat(self):
        """Reset the working matrix to the original counts."""
        self.lil_mat = self.lil_mat_restore.tolil(copy=True)

    def get_batch(self, idx):
        """Draw one batch.

        Args:
            idx: batch index requested by Keras; ignored, every batch is a fresh random draw.

        Returns:
            ``([words, contexts], labels)``: three 1-D arrays of equal length.
            The first ``sample_size`` entries are the positive pairs, followed
            by ``negative_samples`` blocks of the same words paired with
            random contexts.
        """
        # If we already drew all samples, restore everything and restart
        if self.nonzero_indices_indices.shape[0] == 0:
            self.restore_mat()
            self.nonzero_indices_indices = np.arange(self.nonzero_indices.shape[0])

        # Get batch_size samples, or whatever remains
        sample_size = min(self.batch_size, self.nonzero_indices_indices.shape[0])

        # Shuffling takes place here! Drawing without replacement guarantees
        # that every cell occurs at most once per batch.
        draw = np.random.choice(self.nonzero_indices_indices, size=sample_size, replace=False)
        ii, pos_jj = self.nonzero_indices[draw].T

        # Subsampling: keep a pair only if the table entry of its word beats a
        # uniform random number (the table is read with an offset of one)
        if sampling_factor > 0:
            rr = np.random.random(size=ii.shape[0])
            mm = sampling_table[ii + 1] > rr
            # `draw` is filtered as well so it stays aligned with ii / pos_jj
            draw, ii, pos_jj = draw[mm], ii[mm], pos_jj[mm]
            sample_size = ii.shape[0]
        negative_sample_size = sample_size * self.negative_samples

        # Every word appears once for its positive context and once per negative
        words = np.tile(ii, 1 + self.negative_samples)
        contexts = np.zeros(sample_size + negative_sample_size, dtype=np.int32)
        labels = np.zeros(sample_size + negative_sample_size, dtype=np.int32)

        # Positive contexts and labels
        contexts[:sample_size] = pos_jj
        labels[:sample_size] = 1

        # Negative contexts: uniformly random word ids; their labels stay 0
        contexts[sample_size:] = np.random.randint(self.vocabulary_size, size=negative_sample_size)

        if sample_size > 0:
            # Deduct 1 from every positive sample. Indexing a sparse matrix
            # with two index arrays yields a 1 x n result, so it is flattened
            # to get one value per drawn pair. (Taking np.where(...)[0] on the
            # 2-D array, as before, returned row indices, which are all zero.)
            remaining = self.lil_mat[ii, pos_jj].toarray().ravel() - 1
            exhausted = remaining <= 0

            # Exhausted cells leave the pool. They are identified by their
            # pool VALUE (draw), not by their position within the batch.
            if exhausted.any():
                gone = np.zeros(self.nonzero_indices.shape[0], dtype=bool)
                gone[draw[exhausted]] = True
                self.nonzero_indices_indices = self.nonzero_indices_indices[
                    ~gone[self.nonzero_indices_indices]]

            # All other cells stay in the pool with their count reduced by one.
            # In-place `-=` is not supported on a fancy-indexed LIL matrix, but
            # plain assignment of the new values is.
            keep = ~exhausted
            if keep.any():
                self.lil_mat[ii[keep], pos_jj[keep]] = remaining[keep]

        # Prepare data for training
        x = [words, contexts]
        return x, labels

    def __len__(self):
        """Number of batches per epoch, as a plain Python int."""
        return int(self.batches)

    def __getitem__(self, idx):
        """Return batch ``idx`` as the ``(inputs, targets)`` tuple Keras unpacks."""
        # Returned as a tuple: wrapping the ragged ([words, contexts], labels)
        # structure in np.array is an error on current NumPy versions.
        return self.get_batch(idx)

In [10]:
# Batches are drawn at random inside the dataset itself, so Keras is told not
# to shuffle the batch order again
data = WordContextMatrixDataset(
    lil_mat,
    positive_samples=positive_samples,
    negative_samples=negative_samples,
)
# Prints the nearest neighbours of the query word after every epoch
tester = ModelTest(query='queen')
# Checkpoint of the Keras model; not the same file as the checkpoint that is
# written for the visualization
checkpointer = ModelCheckpoint(filepath='model.ckpt')
sgns.fit_generator(
    data,
    epochs=epochs,
    shuffle=False,
    callbacks=[tester, checkpointer],
)

Epoch 1/5
[('helene', 0.43006783723831177), ('princess', 0.41735637187957764), ('sovereign', 0.40093570947647095)]
Epoch 2/5
[('princess', 0.32642680406570435), ('march', 0.3052293658256531), ('ministry', 0.2952641248703003)]
Epoch 3/5
[('princess', 0.31089067459106445), ('ministry', 0.2957751154899597), ('priest', 0.28572994470596313)]
Epoch 4/5
[('princess', 0.3147178888320923), ('victoria', 0.2861301898956299), ('wench', 0.28555524349212646)]
Epoch 5/5
[('princess', 0.31343764066696167), ('victoria', 0.30550509691238403), ('ministry', 0.2916772663593292)]


<keras.callbacks.History at 0x7fb2d728ea20>

In [11]:
# Analogy probe on the vectors of the last epoch: woman + king - man
analogy = tester.gensim_model.most_similar(positive=['woman', 'king'], negative=['man'])
print(analogy)

[('prince', 0.2706182897090912), ('servant', 0.24704021215438843), ('sister', 0.24513643980026245), ('madam', 0.22692053020000458), ('wife', 0.22593748569488525), ('nurse', 0.22547906637191772), ('anne', 0.22380417585372925), ('maiden', 0.2230634242296219), ('briggs', 0.21479201316833496), ('girl', 0.2124357521533966)]


In [12]:
# Create files for Tensorboard embedding projector from internal gensim model
path = 'projector'
os.makedirs(path, exist_ok=True)

# One row per word that is actually in the gensim model. Sizing the array by
# the real vocabulary (rather than max_vocab) keeps the tensor and the
# metadata file the same length when the corpus has fewer than max_vocab words.
projector_words = tester.gensim_model.wv.index2word
vectors = np.zeros((len(projector_words), dim))
# metadata.tsv: one label per line, in the same order as the rows of `vectors`
with open(os.path.join(path, 'metadata.tsv'), 'w', encoding='utf-8') as f:
    for i, word in enumerate(projector_words):
        vectors[i] = tester.gensim_model[word]
        f.write(word + '\n')

# Plain Tensorflow because Keras gets confused by the double embedding layer
sess = tf.InteractiveSession()
with tf.device("/cpu:0"):
    embedding = tf.Variable(vectors, trainable=False, name='embedding')
tf.global_variables_initializer().run()

saver = tf.train.Saver()
writer = tf.summary.FileWriter(path, sess.graph)
# Tell the projector which tensor to show and where its labels are
# (metadata_path is given relative to the log directory)
config = projector.ProjectorConfig()
embed = config.embeddings.add()
embed.tensor_name = 'embedding'
embed.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(writer, config)
# Flush and release the event file before the checkpoint is written
writer.close()
saver.save(sess, os.path.join(path, 'model.ckpt'))

'./projector/model.ckpt'