In [52]:
def preprocess(filename):
    # split into sentences and remove punctuation, non words.
    import nltk
    with open(filename) as f:
        return [[word.lower() for word in nltk.word_tokenize(line) if word.isalpha()]
                for line in f.readlines()]

scarlet_sentences = preprocess('scarlet.txt')

In [170]:
print(scarlet_sentences[0])

['in', 'the', 'year', 'i', 'took', 'my', 'degree', 'of', 'doctor', 'of', 'medicine', 'of', 'the', 'university', 'of', 'london', 'and', 'proceeded', 'to', 'netley', 'to', 'go', 'through', 'the', 'course', 'prescribed', 'for', 'surgeons', 'in', 'the', 'army']


In [88]:
def gen_embeddings(text_sentences, method='word2vec'):
    from gensim.models import Word2Vec
    embed_model = Word2Vec(
        text_sentences,
        size=100, # vector len 
        min_count=1, # min number times word has to appear
        workers=4, # num cores
        window=5, # means num words around it that affect prediction
        iter=100 # num model iterations
    )
    pretrained_weights = embed_model.wv.vectors
    vocab_size, embedding_size = pretrained_weights.shape
    return embed_model, pretrained_weights, vocab_size, embedding_size

def word2idx(word, embed_model):
  return embed_model.wv.vocab[word].index

def idx2word(idx, embed_model):
  return embed_model.wv.index2word[idx]

In [155]:
import numpy as np
def samples_generator(sentences, embed_model, batch_size=1):
    import numpy as np
    if batch_size == 1:
        for sentence in sentences:
            indices = [word2idx(word, embed_model) for word in sentence]
            yield (np.array(indices[:-1]),#.reshape(1, len(indices), 1),
                   np.array(indices[-1]))#.reshape(1, 1, 1))
    else:
        for x in range(0, len(sentences), batch_size):
            cur_sentences = sentences[x:min(len(sentences), x + batch_size)]
            max_len = max([len(x) for x in cur_sentences])
            batch_out = np.zeros([len(cur_sentences), max_len - 1])
            batch_y = np.zeros([len(cur_sentences)])
            for r, s in enumerate(cur_sentences):
                for c, word in enumerate(s[:-1]):
                    batch_out[r, c] = word2idx(word, embed_model)
                batch_y[r] = word2idx(s[-1], embed_model)
            yield (batch_out,#.reshape(batch_size, max_len, 1),
                   batch_y)#.reshape(1, max_len, 1))

def samples_arr(sentences, embed_model):
    import numpy as np
    data = [[word2idx(word, embed_model) for word in sentence]
            for sentence in sentences]
    x_train = data[:][:-1]#np.array(data[:][:-1])
    y_train = [i[-1] for i in data]#np.array(data[:][-1])
    return x_train, y_train

def samples_test(sentences, embed_model):
    import numpy as np
    data = np.array([word2idx(w, embed_model) for w in sentences[0]])
    x_train = data[:-1].reshape(1, len(sentences[0]) - 1)
    y_train = data[-1].reshape(1, 1)
    return x_train, y_train

In [157]:
print(samples_test(scarlet_sentences, embed_model))

(array([[ 1,  0, 13,  2, 14,  3, 15,  5, 16,  5, 17,  5,  0, 18,  5, 19,
         4, 20,  6, 21,  6, 22,  9,  0, 23, 24, 25, 26,  1,  0]]), array([[27]]))


In [145]:
def get_model(embed_model, pretrained_weights, vocab_size, embedding_size):
    from keras.layers.recurrent import LSTM
    from keras.layers.embeddings import Embedding
    from keras.layers import Dense, Activation
    from keras.models import Sequential
    
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, weights=[pretrained_weights]))
    model.add(LSTM(units=embedding_size))
    model.add(Dense(units=vocab_size))
    model.add(Activation('softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    return model

embed_model, pretrained_weights, vocab_size, embedding_size = gen_embeddings(scarlet_sentences)
nn_model = get_model(embed_model, pretrained_weights, vocab_size, embedding_size)

In [174]:
def predict_model(_, __):
    test = [word2idx('i', embed_model)]
    preds = nn_model.predict(np.array(test).reshape(1, 1))
    print(np.sum(preds))
    print(idx2word(np.argmax(preds), embed_model))

def train_model(model, train_x=None, train_y=None, gen=False, gen_fn=None):
    from keras.callbacks import LambdaCallback
    if gen:
        model.fit_generator(gen_fn,
          steps_per_epoch=5, # number of times generator called
          epochs=20,
          callbacks=[LambdaCallback(on_epoch_end=predict_model)])
    else:    
        model.fit(train_x, train_y,
          epochs=20,
          callbacks=[LambdaCallback(on_epoch_end=predict_model)])

x_train, y_train = samples_test(scarlet_sentences, embed_model)
train_model(nn_model, train_x=x_train, train_y=y_train)
#train_model(nn_model, gen=True, gen_fn=lambda: samples_generator(scarlet_sentences, embed_model))

Epoch 1/20
1.0
army
Epoch 2/20
0.99999994
army
Epoch 3/20
1.0
army
Epoch 4/20
1.0
army
Epoch 5/20
0.9999999
army
Epoch 6/20
0.99999994
army
Epoch 7/20
1.0
army
Epoch 8/20
1.0
army
Epoch 9/20
0.99999994
army
Epoch 10/20
1.0
army
Epoch 11/20
1.0
army
Epoch 12/20
1.0
army
Epoch 13/20
1.0
army
Epoch 14/20
1.0
army
Epoch 15/20
1.0000001
army
Epoch 16/20
1.0
army
Epoch 17/20
1.0000002
army
Epoch 18/20
0.99999994
army
Epoch 19/20
1.0
army
Epoch 20/20
1.0
army


In [147]:

print('\nFetching the text...')
url = 'https://raw.githubusercontent.com/maxim5/stanford-tensorflow-tutorials/master/data/arxiv_abstracts.txt'
path = get_file('arxiv_abstracts.txt', origin=url)

print('\nPreparing the sentences...')
max_sentence_len = 40
with open(path) as file_:
  docs = file_.readlines()
sentences = [[word for word in doc.lower().translate(None, string.punctuation).split()[:max_sentence_len]] for doc in docs]
print('Num sentences:', len(sentences))

print('\nTraining word2vec...')
word_model = gensim.models.Word2Vec(sentences, size=100, min_count=1, window=5, iter=100)
pretrained_weights = word_model.wv.syn0
vocab_size, emdedding_size = pretrained_weights.shape
print('Result embedding shape:', pretrained_weights.shape)
print('Checking similar words:')
for word in ['model', 'network', 'train', 'learn']:
  most_similar = ', '.join('%s (%.2f)' % (similar, dist) for similar, dist in word_model.most_similar(word)[:8])
  print('  %s -> %s' % (word, most_similar))

def word2idx(word):
  return word_model.wv.vocab[word].index
def idx2word(idx):
  return word_model.wv.index2word[idx]

print('\nPreparing the data for LSTM...')
train_x = np.zeros([len(sentences), max_sentence_len], dtype=np.int32)
train_y = np.zeros([len(sentences)], dtype=np.int32)
for i, sentence in enumerate(sentences):
  for t, word in enumerate(sentence[:-1]):
    train_x[i, t] = word2idx(word)
  train_y[i] = word2idx(sentence[-1])
print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)

print('\nTraining LSTM...')
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[pretrained_weights]))
model.add(LSTM(units=emdedding_size))
model.add(Dense(units=vocab_size))
model.add(Activation('softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

def sample(preds, temperature=1.0):
  if temperature <= 0:
    return np.argmax(preds)
  preds = np.asarray(preds).astype('float64')
  preds = np.log(preds) / temperature
  exp_preds = np.exp(preds)
  preds = exp_preds / np.sum(exp_preds)
  probas = np.random.multinomial(1, preds, 1)
  return np.argmax(probas)

def generate_next(text, num_generated=10):
  word_idxs = [word2idx(word) for word in text.lower().split()]
  for i in range(num_generated):
    prediction = model.predict(x=np.array(word_idxs))
    idx = sample(prediction[-1], temperature=0.7)
    word_idxs.append(idx)
  return ' '.join(idx2word(idx) for idx in word_idxs)

def on_epoch_end(epoch, _):
  print('\nGenerating text after epoch: %d' % epoch)
  texts = [
    'deep convolutional',
    'simple and effective',
    'a nonconvex',
    'a',
  ]
  for text in texts:
    sample = generate_next(text)
    print('%s... -> %s' % (text, sample))

model.fit(train_x, train_y,
          batch_size=128,
          epochs=20,
          callbacks=[LambdaCallback(on_epoch_end=on_epoch_end)])


Fetching the text...


NameError: name 'get_file' is not defined