In [11]:
from __future__ import print_function

__author__ = 'maxim'

import numpy as np
import gensim
import string

import tensorflow as tf
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.models import Sequential

In [12]:
# Fetch the corpus through Keras' download helper; `path` is the local file
# that the sentence-preparation cell opens.
url = 'https://raw.githubusercontent.com/maxim5/stanford-tensorflow-tutorials/master/data/arxiv_abstracts.txt'
print('\nFetching the text...')
path = tf.keras.utils.get_file(fname='arxiv_abstracts.txt', origin=url)


Fetching the text...
Downloading data from https://raw.githubusercontent.com/maxim5/stanford-tensorflow-tutorials/master/data/arxiv_abstracts.txt


In [23]:
print('\nPreparing the sentences...')
max_sentence_len = 40
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(path, encoding='utf-8') as file_:
    docs = file_.readlines()

# Strip punctuation with a translation table that is built once and reused
# for every document.
# https://stackoverflow.com/a/40916306
punctuation_remover = str.maketrans('', '', string.punctuation)

# One token list per input line: lower-cased, punctuation-free and truncated
# to at most max_sentence_len words.
sentences = [doc.lower().translate(punctuation_remover).split()[:max_sentence_len] for doc in docs]
# Drop lines that produced no tokens (blank or punctuation-only): the LSTM
# data-preparation cell reads sentence[-1] and would fail on an empty list.
sentences = [sentence for sentence in sentences if sentence]

print('Num sentences:', len(sentences))


Preparing the sentences...
Num sentences: 7200


In [24]:
print('\nTraining word2vec...')
# NOTE: `size` and `iter` are the gensim 3.x keyword names; gensim 4 renamed
# them to `vector_size` and `epochs`.
word_model = gensim.models.Word2Vec(sentences, size=100, min_count=1, window=5, iter=100)
# Read the embedding matrix through `wv.vectors`; the `syn0` alias is
# deprecated and removed in gensim 4.
pretrained_weights = word_model.wv.vectors
# The spelling `emdedding_size` is kept because later cells refer to it.
vocab_size, emdedding_size = pretrained_weights.shape
print('Result embedding shape:', pretrained_weights.shape)
print('Checking similar words:')
for word in ['model', 'network', 'train', 'learn']:
    # Query the KeyedVectors object directly; calling most_similar on the
    # model itself is deprecated. topn=8 replaces slicing the default result.
    most_similar = ', '.join('%s (%.2f)' % (similar, dist) for similar, dist in word_model.wv.most_similar(word, topn=8))
    print('  %s -> %s' % (word, most_similar))


Training word2vec...
Result embedding shape: (1166, 100)
Checking similar words:
  model -> comprise (0.33), several (0.31), context (0.31), via (0.30), approach (0.29), training (0.29), lp (0.29), trains (0.29)
  network -> networks (0.34), given (0.33), constrained (0.29), lies (0.28), trained (0.25), represent (0.24), be (0.24), from (0.23)
  train -> based (0.39), eigendecompositions (0.34), extend (0.28), then (0.27), average (0.27), derive (0.27), adversarial (0.26), represent (0.26)
  learn -> realize (0.38), units (0.34), lower (0.34), tend (0.34), automatically (0.33), best (0.32), relevant (0.31), enormous (0.31)


  pretrained_weights = word_model.wv.syn0
  most_similar = ', '.join('%s (%.2f)' % (similar, dist) for similar, dist in word_model.most_similar(word)[:8])


In [26]:
def word2idx(word):
    """Return the integer index the trained word2vec model assigns to `word`."""
    vocab_entry = word_model.wv.vocab[word]
    return vocab_entry.index
def idx2word(idx):
    """Return the word stored at position `idx` of the word2vec model's index-to-word table."""
    index_to_word = word_model.wv.index2word
    return index_to_word[idx]

In [27]:
print('\nPreparing the data for LSTM...')
train_x = np.zeros([len(sentences), max_sentence_len], dtype=np.int32)
train_y = np.zeros([len(sentences)], dtype=np.int32)
for i, sentence in enumerate(sentences):
  for t, word in enumerate(sentence[:-1]):
    train_x[i, t] = word2idx(word)
  train_y[i] = word2idx(sentence[-1])
print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)


Preparing the data for LSTM...
train_x shape: (7200, 40)
train_y shape: (7200,)


In [28]:
print('\nTraining LSTM...')
# Layer stack: embedding initialised from the word2vec matrix -> LSTM ->
# dense projection to vocabulary size -> softmax.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[pretrained_weights]),
    LSTM(units=emdedding_size),
    Dense(units=vocab_size),
    Activation('softmax'),
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')


Training LSTM...


In [29]:
def sample(preds, temperature=1.0):
  """Pick an index from the probability vector `preds`.

  With `temperature <= 0` the most probable index is returned directly.
  Otherwise the probabilities are re-weighted as exp(log(p) / temperature),
  renormalised, and a single index is drawn from the resulting distribution.
  """
  if temperature <= 0:
    return np.argmax(preds)
  scaled_log_probs = np.log(np.asarray(preds).astype('float64')) / temperature
  weights = np.exp(scaled_log_probs)
  distribution = weights / np.sum(weights)
  # One multinomial trial yields a one-hot row; argmax recovers the drawn index.
  one_hot_draw = np.random.multinomial(1, distribution, 1)
  return np.argmax(one_hot_draw)

In [30]:
def generate_next(text, num_generated=10):
  """Extend `text` by `num_generated` words sampled from the LSTM.

  Args:
    text: seed phrase; it is lower-cased, split on whitespace and each word
      is mapped through word2idx.
    num_generated: number of words to append to the seed.

  Returns:
    The seed words followed by the generated words, joined by single spaces.

  Raises:
    ValueError: if `text` contains no words.
  """
  word_idxs = [word2idx(word) for word in text.lower().split()]
  if not word_idxs:
    raise ValueError('text must contain at least one word')
  for _ in range(num_generated):
    # Shape (1, sequence_length): a single sample holding the whole sequence
    # generated so far. A flat 1-D array would be treated by Keras as a batch
    # of independent one-word sequences, so only the last word would influence
    # the prediction.
    prediction = model.predict(x=np.array([word_idxs]), verbose=0)
    idx = sample(prediction[0], temperature=0.7)
    word_idxs.append(idx)
  return ' '.join(idx2word(idx) for idx in word_idxs)

In [31]:
def on_epoch_end(epoch, _):
  """Print a generated continuation for a few fixed seed phrases.

  Takes the epoch number and a second argument that is ignored.
  """
  print('\nGenerating text after epoch: %d' % epoch)
  seed_phrases = ('deep convolutional', 'simple and effective', 'a nonconvex', 'a')
  for seed in seed_phrases:
    print('%s... -> %s' % (seed, generate_next(seed)))

In [32]:
# Train for 20 epochs; the callback prints sample generations after each one.
text_preview_callback = LambdaCallback(on_epoch_end=on_epoch_end)
model.fit(train_x, train_y, batch_size=128, epochs=20, callbacks=[text_preview_callback])

Epoch 1/20
Generating text after epoch: 0
deep convolutional... -> deep convolutional dnn timeconsuming path here variable distributions exponentially ranging of isometry
simple and effective... -> simple and effective function fx theory structured important tuning transformation reaching up clustering
a nonconvex... -> a nonconvex angular hierarchical noise enormous specifically schemes naturally concept promise understand
a... -> a still overhead analysis revisit mdrnns recognized caused reconsidering respect presents
Epoch 2/20
Generating text after epoch: 1
deep convolutional... -> deep convolutional stationary temporaldifference small take requires forcing nearly automatically structureactivityproperty obstacles
simple and effective... -> simple and effective vast hypotheses postsynaptic richer combines defined convnets performing artificial gasfgadf
a nonconvex... -> a nonconvex masks approximate lies other represent masking normalized better subject some
a... -> a circumvent mac

<tensorflow.python.keras.callbacks.History at 0x7f78c51519a0>