In [181]:
import numpy as np
import gensim

from keras.models import Model
from keras.layers import Input, Embedding, Reshape, Dot, Dense, Activation
from keras.preprocessing.sequence import skipgrams, make_sampling_table
from keras.preprocessing.text import Tokenizer
from keras.utils.data_utils import get_file

In [47]:
# Download the text once; keras' get_file returns the local path of the copy.
path = get_file('alice.txt', origin='http://www.gutenberg.org/files/11/11-0.txt')

# Decode explicitly instead of relying on the platform default encoding: the
# text contains non-ASCII characters (e.g. curly quotes), which a non-UTF-8
# default would mangle or reject. 'utf-8-sig' behaves like 'utf-8' but also
# drops a leading byte-order mark if the file has one, so it cannot end up
# glued to the first token.
with open(path, encoding='utf-8-sig') as f:
    text = f.readlines()

# Keep only lines with at least two spaces (i.e. three or more words); shorter
# lines give skip-gram windows almost nothing to work with.
corpus = [sentence for sentence in text if sentence.count(' ') >= 2]

# Build the word -> integer-id vocabulary from the retained lines.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

In [130]:
# Width of each learned word vector.
EMBED_DIM = 128
# NOTE(review): the training loop visible in this notebook hard-codes 10 passes
# and does not read EPOCHS -- confirm which value is intended.
EPOCHS = 5
# One embedding row per known word plus one extra slot; presumably because the
# Tokenizer's word ids start at 1 and id 0 is reserved -- confirm against the
# Keras Tokenizer documentation.
VOCAB_SIZE = 1 + len(tokenizer.word_index)

In [271]:
# Two scalar integer inputs: the id of the target word and the id of a
# candidate context word.
input_target = Input(dtype='int32', shape=(1,))
input_context = Input(dtype='int32', shape=(1,))

# A single embedding table is shared by both inputs, so target and context
# words are looked up in the same vector space.
embedding = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_DIM)
target = embedding(input_target)
context = embedding(input_context)

# Dot product of the two looked-up vectors along the embedding axis, flattened
# to one value per sample, then passed through a one-unit sigmoid layer to
# score the (target, context) pair.
dot = Dot(axes=2)([target, context])
dot = Reshape(target_shape=(1,))(dot)
output = Dense(units=1, activation='sigmoid')(dot)

# Binary classification: is this a real skip-gram pair or a negative sample?
model = Model(inputs=[input_target, input_context], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_39 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_40 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_24 (Embedding)        (None, 1, 128)       433664      input_39[0][0]                   
                                                                 input_40[0][0]                   
__________________________________________________________________________________________________
dot_27 (Dot)                    (None, 1, 1)         0           embedding_24[0][0]               
          

In [272]:
# Tokenize once up front: the integer sequences are the same on every pass, so
# rebuilding them inside the epoch loop is wasted work.
sequences = tokenizer.texts_to_sequences(corpus)

# NOTE(review): the number of passes is hard-coded to 10; the EPOCHS constant
# defined in the configuration cell is not used here -- confirm which is meant.
for _ in range(10):
    # Sum of the per-line batch losses for this pass.
    loss = 0.
    for doc in sequences:
        # Generate (target, context) id pairs for this line together with their
        # 0/1 labels; negative_samples=5. requests negative examples in
        # addition to the true in-window pairs.
        data, labels = skipgrams(sequence=doc, vocabulary_size=VOCAB_SIZE, window_size=5, negative_samples=5.)
        if not data:
            # No pairs could be produced for this line; nothing to train on.
            continue
        # Transpose the list of pairs into two parallel arrays, one per model
        # input: [target_ids, context_ids].
        x = [np.array(column) for column in zip(*data)]
        y = np.array(labels, dtype=np.int32)
        loss += model.train_on_batch(x, y)

    print(loss)

1188.6274270415306
926.0647531077266
893.3706332147121
872.4651527181268
853.4858864992857
832.332134090364
806.8422005996108
780.1508079171181
751.321742631495
721.541634503752


In [261]:
# First weight array of the model; with the architecture defined in this
# notebook that is the shared embedding matrix (one row per word id) --
# confirm if layers are added ahead of the embedding.
vectors = model.get_weights()[0]

# Write the embeddings in word2vec text format: a "<count> <dim>" header line
# followed by one "<word> <v1> <v2> ..." line per word.
# The encoding is explicit because the vocabulary contains non-ASCII words
# (curly quotes) and gensim reads this format back as UTF-8 by default; the
# platform default encoding could otherwise fail or produce an unreadable file.
with open('vectors.txt', 'w', encoding='utf-8') as f:
    # Derive the header from what is actually written below so that it can
    # never disagree with the file contents.
    f.write(f"{len(tokenizer.word_index)} {vectors.shape[1]}\n")
    for word, i in tokenizer.word_index.items():
        f.write(f"{word} {' '.join(map(str, vectors[i]))}\n")

In [262]:
# Read the exported text-format vectors back in as gensim KeyedVectors so its
# similarity queries can be used on the trained embeddings.
w2v = gensim.models.KeyedVectors.load_word2vec_format(
    './vectors.txt',
    binary=False,
)

In [270]:
# Sanity check of the learned embeddings: list the words ranked most similar to
# 'girl'. The bare expression is left as the cell's last line so the notebook
# displays the result.
w2v.most_similar(positive=['girl'])

  if np.issubdtype(vec.dtype, np.int):


[('serpent', 0.4946148693561554),
 ('she’ll', 0.48323914408683777),
 ('ignorant', 0.4303949475288391),
 ('you’re', 0.4270668625831604),
 ('whether', 0.4068315327167511),
 ('‘a', 0.4067261815071106),
 ('asking', 0.3956163227558136),
 ('doubtfully', 0.37931880354881287),
 ('kick', 0.3604133725166321),
 ('nervous', 0.31047528982162476)]