### RNN to predict the next word in a sentence

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Libraries
import gensim
from nltk.corpus import gutenberg
from string import punctuation
import numpy as np
from keras.callbacks import LambdaCallback
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Activation, Flatten
from keras.models import Sequential

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


#### Create embeddings

In [3]:
ball_sents = gutenberg.sents('chesterton-ball.txt')
simple_sents = [[word.lower() for word in sent if word not in punctuation] for sent in ball_sents]
len(simple_sents)
w2v = gensim.models.Word2Vec(ball_sents, size = 100, min_count= 1, window = 6, iter = 100)
len(w2v.wv.vocab)

4779

8944

In [4]:
simple_sents[:10]

[['the', 'ball', 'and', 'the', 'cross', 'by', 'g', 'k', 'chesterton', '1909'],
 ['i'],
 ['a', 'discussion', 'somewhat', 'in', 'the', 'air'],
 ['the',
  'flying',
  'ship',
  'of',
  'professor',
  'lucifer',
  'sang',
  'through',
  'the',
  'skies',
  'like',
  'a',
  'silver',
  'arrow',
  'the',
  'bleak',
  'white',
  'steel',
  'of',
  'it',
  'gleaming',
  'in',
  'the',
  'bleak',
  'blue',
  'emptiness',
  'of',
  'the',
  'evening'],
 ['that',
  'it',
  'was',
  'far',
  'above',
  'the',
  'earth',
  'was',
  'no',
  'expression',
  'for',
  'it',
  'to',
  'the',
  'two',
  'men',
  'in',
  'it',
  'it',
  'seemed',
  'to',
  'be',
  'far',
  'above',
  'the',
  'stars'],
 ['the',
  'professor',
  'had',
  'himself',
  'invented',
  'the',
  'flying',
  'machine',
  'and',
  'had',
  'also',
  'invented',
  'nearly',
  'everything',
  'in',
  'it'],
 ['every',
  'sort',
  'of',
  'tool',
  'or',
  'apparatus',
  'had',
  'in',
  'consequence',
  'to',
  'the',
  'full',
  't

In [11]:
pretrained_weights = w2v.wv.vectors
vocab_size, embedding_size = pretrained_weights.shape
print('Embedding shape: ', pretrained_weights.shape)
print('Similar words:')
for word in ['skies','professor','fantastic','science','evolution']:
    most_similar = ','.join('%s (%.2f)' % (similar, dist) 
                            for similar, dist in w2v.wv.most_similar(word)[:8])
    print(' %s -> %s' % (word,most_similar))
    
print('Similarity between for and against: %s' %w2v.wv.similarity('for','against'))

Embedding shape:  (8944, 100)
Similar words:
 skies -> sang (0.73),site (0.71),conventions (0.70),fracas (0.69),Ball (0.69),towers (0.68),crypts (0.66),arrow (0.65)
 professor -> invented (0.78),problems (0.53),surrendered (0.52),justified (0.51),detected (0.51),jerk (0.50),theories (0.48),Professor (0.48)
 fantastic -> crouching (0.61),hunters (0.57),Against (0.56),unnecessary (0.56),signals (0.55),peculiarly (0.52),sunset (0.52),outlined (0.52)
 science -> persecutor (0.59),result (0.58),faith (0.55),theological (0.55),physical (0.54),modern (0.53),idiots (0.53),symbol (0.51)
 evolution -> identities (0.69),degradedly (0.51),melt (0.48),Puritanism (0.48),diving (0.46),Puritanical (0.46),Highlands (0.46),dominance (0.46)
Similarity between for and against: 0.19238615


In [6]:
def word2idx(word):
    if (word in w2v.wv.vocab):
        return w2v.wv.vocab[word].index
    return 0
# word2idx throws key error when the cell below is run, fixed with above code

def idx2word(idx):
    return w2v.wv.index2word[idx]

#### Data Preparation

In [7]:
w_count = lambda sentence: len(sentence)
max_sent_len = len(max(simple_sents, key = w_count))
train_x = np.zeros([len(simple_sents), max_sent_len], dtype = np.int32)
train_y = np.zeros([len(simple_sents)], dtype = np.int32)
for i, sentence in enumerate(simple_sents):
    for t, word in enumerate(sentence[:-1]):
        if word in w2v.wv.vocab:
            train_x[i,t] = word2idx(word)
        train_y[i] = word2idx(sentence[-1])
# line 7 throws index error even when word2idx is fixed; narrowing it down with the if statement
# will solve it
print(train_x.shape)
print(train_y.shape)

(4779, 135)
(4779,)


#### RNN model

In [10]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size,weights=[pretrained_weights]))
model.add(LSTM(units=embedding_size))
model.add(Dense(units=vocab_size))
model.add(Activation('softmax'))
model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# now to generate samples
def sample(preds, temperature=1.0):
    if temperature <= 0:
        return np.argmax(preds)
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds)/temperature
    exp_preds = np.exp(preds)
    preds = exp_preds/np.sum(exp_preds)
    probas = np.random.multinomial(1,preds,1)
    return np.argmax(probas)

def generate_next(text, num_generated=10):
    word_idxs = [word2idx(word) for word in text.lower().split()]
    for i in range(num_generated):
        prediction = model.predict(x=np.array(word_idxs))
        idx = sample(prediction[-1], temperature=0.5)
        word_idxs.append(idx)
    return ' '.join(idx2word(idx) for idx in word_idxs)

def on_epoch_end(epoch, _):
    print('Text after epoch: %d' %epoch)
    texts = ['eternally while', 'science and evolution', 'deadly revolver', 'revolver']
    for text in texts:
        sample = generate_next(text)
        print('%s... -> %s' % (text, sample))
        
model.fit(train_x,train_y, batch_size = 150, epochs = 10, 
          callbacks = [LambdaCallback(on_epoch_end=on_epoch_end)])

Epoch 1/10
Text after epoch: 0
eternally while... -> eternally while noisy gunpowder pitilessly it childish THE pure rowdies plan Devil
science and evolution... -> science and evolution bllcr10a others objected mental parallelogram thirteen hundredth improving dismal fresh
deadly revolver... -> deadly revolver Seconds things defying ignore blow cutting full career may half
revolver... -> revolver fifty stopped pass overheard detonation hart champagne offensive physiognomy wreaths
Epoch 2/10
Text after epoch: 1
eternally while... -> eternally while Also childlike shelly islands wrestling soberest chair blew cloudlets legislation
science and evolution... -> science and evolution track systems ocean employment achieved indolent gloves redouble proud schoolmaster
deadly revolver... -> deadly revolver START demanded lent unity assume led trust those evident irritation
revolver... -> revolver dottiness sided ungrateful vulgar report peaked seems committal balustrade Central
Epoch 3/10
Text a

<keras.callbacks.callbacks.History at 0x20ae7133cc8>

#### Analysis:
We're able to predict words that come after a given text, though not very well. The addition of a hidden state layer may help boost our predictions (output). We're not getting a good accuracy and the rate of change in the loss function after a certain epoch has run starts to visibly decrease. We may be encountering the vanishing gradient problem here. 