https://arxiv.org/pdf/1301.3781v3.pdf

https://github.com/chrisjmccormick/word2vec_commented

http://www.claudiobellei.com/2018/01/07/backprop-word2vec-python/

https://github.com/cbellei/word2veclite

https://keras.io/preprocessing/sequence/#skipgrams

https://github.com/Hironsan/awesome-embedding-models/blob/master/examples/skip-gram_with_ns.py

https://github.com/nzw0301/keras-examples

https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/sequence.py

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import string
import pickle
import gensim
from collections import Counter, OrderedDict
from tqdm import tqdm_notebook
from scipy.sparse import coo_matrix

import keras
from keras.models import Sequential, Model
from keras.layers import Embedding, Reshape, Activation, Input
from keras.layers.merge import Dot
from keras.utils import Sequence
from keras.preprocessing.sequence import skipgrams, make_sampling_table

Using TensorFlow backend.


In [2]:
# Toy check of the subsampling mask: a token id i is kept when
# sampling_table[i + 1] exceeds a uniform random draw in [0, 1).
sampling_table = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

ii = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
rr = np.random.random(size=ii.shape[0])
print(rr, rr.shape)
# The np.bool alias was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin bool selects the same boolean dtype on every version.
mm = np.array(sampling_table[ii + 1] > rr, dtype=bool)
print(mm, mm.shape, mm.sum())
ii = ii[mm]
print(ii, ii.shape)

[0.73969958 0.83902403 0.44205325 0.78135131 0.93242846 0.36705956
 0.13969671 0.94913261 0.29242238 0.85364323] (10,)
[False False False False  True  True  True  True  True  True] (10,) 6
[4 5 6 7 8 9] (6,)


In [3]:
# TODO: implement loading of a pickled WCF matrix
# Load from pickle file
# with open('data.pic', 'rb') as f:
#     source = pickle.load(f)

In [4]:
# Hyperparameters (word2vec defaults are noted where one exists)

# Path of the plain-text corpus to train on
corpus = 'c/proust_ascii.txt'

# Number of training passes over the data
epochs = 20

# Embedding dimensionality (word2vec default: 300)
dim = 300

# Context words taken on each side of the target (word2vec default: 5)
window_size = 5

# Minimum word frequency to enter the vocabulary (word2vec default: 5, 0 disables)
min_count = 5

# Subsampling threshold for frequent words (word2vec default: 1e-03, 0 disables)
sampling_factor = 1e-3

# Negative samples drawn per positive sample (word2vec default: 5)
negative_samples = 5

# Positive samples per batch (gensim.word2vec.Text8Corpus default: 10000 words/sequence)
positive_samples = 10000

In [5]:
# Read corpus
with open(corpus, 'r') as f:
    text = f.read()

# Make it all lower case
text = text.lower()

# Remove all punctuation in a single pass; str.translate deletes every
# character listed in the third argument of str.maketrans
text = text.translate(str.maketrans('', '', string.punctuation))

print('Text size: '+ str(len(text)) + ' characters')

# Optional truncation for quick testing: set max_chars to e.g. 1000000.
# None keeps the whole text (slicing with None is a no-op).
max_chars = None
text = text[:max_chars]

# Make list (split on any run of whitespace)
print('Converting text to word list...')
text = text.split()

Text size: 7049641 characters
Converting text to word list...


In [6]:
# Get frequencies as (word, count) pairs, most frequent first
frequencies = Counter(text).most_common()

# Eliminate words that appear less than min_count times.
# The pairs are fed to OrderedDict directly: going through a dict
# comprehension first only keeps the frequency order on interpreters where
# plain dicts are ordered (guaranteed from Python 3.7), and the ids below
# must follow frequency rank for the sampling table to make sense.
frequencies = OrderedDict(
    (word, count) for word, count in frequencies if count >= min_count
)

# Map words to integers (id 0 = most frequent word) and back
token2id = {word: i for i, word in enumerate(frequencies)}
id2token = {i: word for word, i in token2id.items()}

# Length of dictionary is size of vocabulary
vocabulary_size = len(token2id)

# Encode the corpus as ids, dropping words that did not reach min_count
text = [token2id[token] for token in text if token in token2id]

# Make sampling table for subsampling (starts at 1!): the table is built one
# entry longer than the vocabulary and is indexed with id + 1 downstream
sampling_table = make_sampling_table(size=vocabulary_size+1, sampling_factor=sampling_factor)

print('Vocabulary size: ' + str(vocabulary_size) + ' words')

Vocabulary size: 11006 words


In [7]:
# Build the dense (vocabulary x vocabulary) word-context-frequency matrix:
# entry [w, c] counts how often id c occurs within window_size positions of id w
print('Filling WCF matrix...')
mat = np.zeros((vocabulary_size, vocabulary_size), dtype=np.int32)
n_tokens = len(text)
for pos, word in enumerate(tqdm_notebook(text)):
    lo = max(0, pos - window_size)
    hi = min(n_tokens, pos + window_size + 1)
    for ctx_pos in range(lo, hi):
        # The centre token is not its own context
        if ctx_pos == pos:
            continue
        mat[word, text[ctx_pos]] += 1
print('Total samples in matrix: ' + str(mat.sum()) + ' samples')

# Convert the dense counts to COO and from there to LIL
print('Creating sparse matrix...')
coo_mat = coo_matrix(mat)
lil_mat = coo_mat.tolil()

Filling WCF matrix...


HBox(children=(IntProgress(value=0, max=1278853), HTML(value='')))


Total samples in matrix: 12788500 samples
Creating sparse matrix...


In [8]:
# Skip-gram-with-negative-sampling model: two embedding tables (target word,
# context word) whose dot product is squashed by a sigmoid and trained as a
# binary classifier on (word, context) pairs.
print('Building model...')

# Target-word branch: one integer id per sample -> (batch, 1, dim) embedding
t_inputs = Input(shape=(1, ), dtype=np.int32)
t = Embedding(vocabulary_size, dim)(t_inputs)
# Context-word branch with its own, separate embedding table
c_inputs = Input(shape=(1, ), dtype=np.int32)
c  = Embedding(vocabulary_size, dim)(c_inputs)
# Dot product over the embedding axis, then flatten (1, 1) to a single score
o = Dot(axes=2)([t, c])
o = Reshape((1,), input_shape=(1, 1))(o)
# Sigmoid turns the score into a value in (0, 1) matched against the 0/1 labels
o = Activation('sigmoid')(o)
# NOTE(review): TestCallback reads sgns.get_weights()[0]; presumably that is the
# target embedding because it is created first -- confirm before reordering layers
sgns = Model(inputs=[t_inputs, c_inputs], outputs=o)
sgns.summary()
sgns.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

Building model...
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 300)       3301800     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 300)       3301800     input_2[0][0]                    
___________________________________________________________________________________________

In [40]:
class TestCallback(keras.callbacks.Callback):
    """Keras callback that prints the nearest neighbours of a probe word.

    After every epoch the first weight array of the notebook-level model
    ``sgns`` is dumped to ``vectors.txt`` in word2vec text format, reloaded
    with gensim, and the three tokens most similar to ``testword`` are printed.

    The class reads the notebook globals ``sgns`` and ``token2id``.
    """

    def __init__(self, testword):
        """Store the probe word; ``testword`` must be a token of ``token2id``."""
        # Let the Keras base class initialise its own attributes
        super(TestCallback, self).__init__()
        self.testword = testword

    def on_epoch_end(self, epoch, logs=None):
        """Print the top-3 neighbours of the probe word (``logs`` is unused)."""
        self.most_similar(positive=[self.testword], negative=[], topn=3)

    def most_similar(self, positive, negative=None, topn=10):
        """Export the current embedding and print a gensim similarity query.

        Args:
            positive: list of tokens contributing positively to the query.
            negative: list of tokens contributing negatively (default: none).
            topn: number of results to print.

        Prints ``positive``, ``negative`` and the gensim result; returns None.
        """
        # None instead of a shared mutable default list
        if negative is None:
            negative = []

        vectors = sgns.get_weights()[0]

        # Gensim word vector file format: a "<rows> <dim>" header followed by
        # one "<token> <v1> ... <vn>" line per token. The header is derived
        # from what is actually written so the two can never disagree, and the
        # file is written as UTF-8 because that is what gensim decodes by default.
        with open('vectors.txt', 'w', encoding='utf-8') as f:
            f.write('{} {}\n'.format(len(token2id), vectors.shape[1]))
            for token, i in token2id.items():
                f.write('{} {}\n'.format(token, ' '.join(map(str, vectors[i]))))

        # Load and get most similar vectors
        w2v = gensim.models.KeyedVectors.load_word2vec_format('vectors.txt', binary=False)
        out = w2v.most_similar(positive=positive, negative=negative, topn=topn)
        print(positive, negative, out)
        
        
class WordContextMatrixDataset(Sequence):
    """Keras ``Sequence`` that draws skip-gram batches from a word-context count matrix.

    Every non-zero cell ``[w, c]`` of the matrix holds the number of times the
    pair (word ``w``, context ``c``) is still available as a positive sample.
    A batch draws distinct cells at random, emits each drawn pair once with
    label 1, adds ``negative_samples`` uniformly random contexts per pair with
    label 0, and then consumes one count from every emitted cell. Cells that
    reach zero leave the pool; once the pool is empty the original counts are
    restored and sampling starts over.

    Subsampling of frequent words reads the notebook globals
    ``sampling_factor`` and ``sampling_table``.

    Args:
        lil_mat: square scipy sparse matrix of co-occurrence counts (copied).
        positive_samples: see ``batch_size`` note in ``__init__``.
        negative_samples: number of negative pairs generated per positive pair.
    """

    def __init__(self, lil_mat, positive_samples=10000, negative_samples=5):
        super(WordContextMatrixDataset, self).__init__()

        # Work on a private copy; the caller's matrix is never modified
        self.lil_mat = lil_mat.tolil(copy=True)
        self.positive_samples = positive_samples
        self.negative_samples = negative_samples
        # NOTE(review): batch_size is the number of positive cells drawn per
        # batch, yet it is positive_samples + negative_samples (negatives are
        # added on top as sample_size * negative_samples). Left unchanged --
        # confirm whether positive_samples alone was intended.
        self.batch_size = positive_samples + negative_samples
        self.total_samples = self.lil_mat.sum()
        # Plain int so that len() always gets a native integer
        self.batches = int(np.ceil(self.total_samples / self.batch_size))
        self.last_batch_size = np.mod(self.total_samples, self.batch_size)
        self.vocabulary_size = self.lil_mat.shape[0]

        # Pristine copy of the counts, used to start over once all are consumed
        self.lil_mat_restore = self.lil_mat.tolil(copy=True)

        # (n, 2) array of the (row, col) coordinates of every non-zero cell,
        # and the pool of row numbers of that array that are still drawable
        self.nonzero_indices = np.array(self.lil_mat.nonzero()).T
        self.nonzero_indices_indices = np.arange(self.nonzero_indices.shape[0])

    def restore_mat(self):
        """Reset the working counts to the values given at construction."""
        self.lil_mat = self.lil_mat_restore.tolil(copy=True)

    def get_batch(self, idx):
        """Draw one random batch; ``idx`` is ignored.

        Returns:
            ``([words, contexts], labels)``: three 1-D arrays of equal length.
            The first ``n`` entries are the positive pairs (label 1), followed
            by ``n * negative_samples`` negative pairs (label 0).
        """
        # If we already drew all samples, restore everything and restart
        if self.nonzero_indices_indices.shape[0] == 0:
            self.restore_mat()
            self.nonzero_indices_indices = np.arange(self.nonzero_indices.shape[0])

        # Get batch_size samples, or whatever remains
        sample_size = min(self.batch_size, self.nonzero_indices_indices.shape[0])

        # Shuffling takes place here: distinct cells are drawn at random
        draw = np.random.choice(self.nonzero_indices_indices, size=sample_size, replace=False)
        ii, pos_jj = self.nonzero_indices[draw].T

        # Subsampling: keep a pair only if the table entry of its word beats a
        # uniform draw. `draw` is masked too so it stays aligned with ii/pos_jj.
        if sampling_factor > 0:
            rr = np.random.random(size=ii.shape[0])
            mm = sampling_table[ii + 1] > rr
            draw, ii, pos_jj = draw[mm], ii[mm], pos_jj[mm]
            sample_size = ii.shape[0]
        negative_sample_size = sample_size * self.negative_samples

        # Each word appears once for its positive and once per negative context
        words = np.tile(ii, 1 + self.negative_samples)
        contexts = np.zeros(sample_size + negative_sample_size, dtype=np.int32)
        labels = np.zeros(sample_size + negative_sample_size, dtype=np.int32)

        # Positive contexts and labels
        contexts[:sample_size] = pos_jj
        labels[:sample_size] = 1

        # Negative contexts are uniform over the vocabulary; their labels stay 0
        contexts[sample_size:] = np.random.randint(self.vocabulary_size, size=negative_sample_size)

        if sample_size:
            # Fancy indexing a sparse matrix yields a (1, n) result, so flatten
            # it to get one remaining count per emitted pair
            remaining = self.lil_mat[ii, pos_jj].toarray().ravel() - 1

            # Cells with nothing left leave the pool. They are removed by their
            # index VALUE (taken from `draw`), not by their position in the batch.
            exhausted = draw[remaining <= 0]
            still_available = ~np.isin(self.nonzero_indices_indices, exhausted)
            self.nonzero_indices_indices = self.nonzero_indices_indices[still_available]

            # All other cells just lose one count (LIL has no vectorised form)
            for k in np.flatnonzero(remaining > 0):
                self.lil_mat[ii[k], pos_jj[k]] -= 1

        # Prepare data for training
        return [words, contexts], labels

    def __len__(self):
        """Number of batches per epoch."""
        return self.batches

    def __getitem__(self, idx):
        """Return ``(inputs, labels)`` for Keras.

        The tuple is returned as-is: packing the ragged pair into ``np.array``
        raises on NumPy >= 1.24.
        """
        return self.get_batch(idx)

In [None]:
# Batches are drawn at random inside the dataset itself, so Keras' own
# shuffling of batch order is switched off
data = WordContextMatrixDataset(
    lil_mat,
    positive_samples=positive_samples,
    negative_samples=negative_samples,
)
c = TestCallback(testword='she')
sgns.fit_generator(data, epochs=epochs, callbacks=[c], shuffle=False)

In [44]:
# Print the 20 tokens most similar to 'she' according to the trained vectors
c.most_similar(positive=['she'], topn=20)

['she'] [] [('her', 0.8064073324203491), ('for', 0.7562803030014038), ('was', 0.7496068477630615), ('it', 0.7416568994522095), ('i', 0.736336350440979), ('he', 0.730705738067627), ('that', 0.7291724681854248), ('but', 0.721697986125946), ('with', 0.7175629138946533), ('had', 0.7128034234046936), ('as', 0.7115832567214966), ('which', 0.7109073400497437), ('and', 0.7054644227027893), ('not', 0.7006860971450806), ('me', 0.6999820470809937), ('his', 0.673018217086792), ('in', 0.6714975237846375), ('by', 0.6634480953216553), ('my', 0.6605647802352905), ('a', 0.6480237245559692)]


In [46]:
# Combined query: 'albertine' and 'charlus' as positive terms, 'i' as a negative
# term; prints the default top 10 matches
c.most_similar(positive=['albertine', 'charlus'], negative=['i'])

['albertine', 'charlus'] ['i'] [('m', 0.4291450083255768), ('de', 0.35334938764572144), ('andree', 0.26091015338897705), ('guermantes', 0.22548697888851166), ('charluss', 0.2221934050321579), ('morel', 0.21699000895023346), ('bore', 0.21379664540290833), ('albertines', 0.21305590867996216), ('villeparisis', 0.20184263586997986), ('pleasure', 0.20126578211784363)]
