In [1]:
# Reference: https://github.com/vlraik/word-level-rnn-keras/blob/master/lstm_text_generation.py

import random
import sys
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Directory holding the prepared train/val/test splits.
dir_split = "../1.DataPreparationResults/obama"


def _read_split(name):
    """Return the full text of `{dir_split}/{name}.txt`, closing the file afterwards."""
    with open(f"{dir_split}/{name}.txt", 'r') as handle:
        return handle.read()


file_train = _read_split("train")
file_val = _read_split("val")
file_test = _read_split("test")

# Pad <speech_sep> with spaces so it always splits off as its own token, then build
# the set of unique training words without the separator itself.
# `discard` (rather than `remove`) tolerates a corpus that contains no separator.
word_train = set(file_train.replace("<speech_sep>", " <speech_sep> ").split())
word_train.discard("<speech_sep>")

print("total number of unique words: ",len(word_train))

# Index the vocabulary in sorted order. Iteration order of a set of strings can
# differ between interpreter runs (string hash randomisation), which would silently
# change the word <-> index mapping and invalidate any weights saved by a previous run.
_vocab_sorted = sorted(word_train)
word_indices = {word: index for index, word in enumerate(_vocab_sorted)}
indices_word = {index: word for index, word in enumerate(_vocab_sorted)}

total number of unique words:  6508


In [3]:
# Sliding-window settings: each sample is a context of x_len consecutive words,
# and consecutive windows start x_step words apart.
x_len, x_step = 30, 1

In [4]:
def vectorization(file, seq_len=None, step=None, vocab_index=None):
    """One-hot encode a corpus into (context window, next word) pairs.

    The text is split into speeches on "<speech_sep>"; windows never cross a
    speech boundary. Within each speech a window of `seq_len` words is slid
    forward `step` words at a time, and the word right after the window is
    the prediction target.

    Args:
        file: Full corpus text, speeches separated by "<speech_sep>".
        seq_len: Words per context window. Defaults to the global `x_len`.
        step: Stride between window starts. Defaults to the global `x_step`.
        vocab_index: Mapping word -> column index. Defaults to the global
            `word_indices`.

    Returns:
        A tuple `(x, y)` of boolean arrays:
        `x` has shape (n_samples, seq_len, vocab_size) and `y` has shape
        (n_samples, vocab_size). Words missing from `vocab_index` are encoded
        as an all-False row (in `x`) or an all-False target (in `y`).

    Note:
        The arrays are dense, so memory grows as
        n_samples * seq_len * vocab_size bytes for `x`.
    """
    if seq_len is None:
        seq_len = x_len
    if step is None:
        step = x_step
    if vocab_index is None:
        vocab_index = word_indices
    vocab_size = len(vocab_index)

    contexts = []
    next_words = []
    for speech in file.split("<speech_sep>"):
        words = speech.split()
        # An empty speech (leading, trailing or doubled <speech_sep>) yields an
        # empty range here and is simply skipped; later speeches are still
        # processed instead of being dropped.
        for start in range(0, len(words) - seq_len, step):
            contexts.append(words[start:start + seq_len])
            next_words.append(words[start + seq_len])

    # Builtin `bool` is used because the `np.bool` alias was removed from NumPy.
    x = np.zeros((len(contexts), seq_len, vocab_size), dtype=bool)
    y = np.zeros((len(contexts), vocab_size), dtype=bool)
    for i, (context, target) in enumerate(zip(contexts, next_words)):
        for t, word in enumerate(context):
            column = vocab_index.get(word)
            if column is not None:
                x[i, t, column] = True
        column = vocab_index.get(target)
        if column is not None:
            y[i, column] = True

    return x, y

In [5]:
# Build the one-hot tensors for the training and validation splits.
# NOTE: these dense arrays are huge and can trigger memory-allocation errors; see
# https://stackoverflow.com/questions/57507832/unable-to-allocate-array-with-shape-and-data-type

train_X, train_Y = vectorization(file_train)
print(train_X.shape, train_Y.shape, sep="\n")

val_X, val_Y = vectorization(file_val)
print(val_X.shape, val_Y.shape, sep="\n")

(80001, 30, 6508)
(80001, 6508)
(83061, 30, 6508)
(83061, 6508)


In [6]:
# Two stacked 512-unit LSTMs, each followed by dropout, feeding a softmax over the
# training vocabulary. The first LSTM returns the full sequence so that the second
# one receives one vector per time step.
model = keras.Sequential([
    LSTM(512, input_shape=(x_len, len(word_train)), return_sequences=True),
    Dropout(0.2),
    LSTM(512, return_sequences=False),
    Dropout(0.2),
    Dense(len(word_train), activation='softmax'),
])

model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [None]:
# Resume from previously saved weights when they exist.
# The weights are saved below as 'GoTweights' with no .h5 suffix, for which tf.keras
# writes a TensorFlow checkpoint (GoTweights.index + GoTweights.data-*) rather than a
# single file called 'GoTweights'. Checking only the bare name would therefore never
# find the checkpoint, so the index file is checked as well.
if os.path.isfile('GoTweights') or os.path.isfile('GoTweights.index'):
    model.load_weights('GoTweights')

def sample(a, temperature=1.0):
    """Randomly pick an index from probability vector `a` after temperature scaling.

    The distribution is re-weighted as softmax(log(a) / temperature): values
    below 1 sharpen it towards the most likely index, values above 1 flatten it.

    Reference: https://github.com/llSourcell/How-to-Generate-Music-Demo/issues/4

    Args:
        a: 1-D array-like of non-negative probabilities (need not sum to 1
            exactly; the result is renormalised).
        temperature: Positive scaling factor.

    Returns:
        The sampled index as an int.

    Raises:
        ValueError: If `temperature` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Work in float64: float32 model outputs can fail np.random.choice's
    # "probabilities sum to 1" check and underflow easily at low temperature.
    probs = np.asarray(a, dtype=np.float64)
    with np.errstate(divide='ignore'):
        # log(0) = -inf is fine: it maps back to probability 0 below.
        scaled = np.log(probs) / temperature
    # Shift by the maximum so the largest term is exp(0) = 1; this prevents every
    # term from underflowing to 0 (which would make the normalisation 0/0 = NaN).
    scaled = scaled - np.max(scaled)
    weights = np.exp(scaled)
    dist = weights / np.sum(weights)
    return int(np.random.choice(len(dist), p=dist))

# Train the model; after each iteration, print text generated from a random test seed.

# The test set does not change between iterations, so split it once. Only speeches
# with more than x_len words can supply an x_len-word seed; filtering here also drops
# the empty "speech" left by a trailing <speech_sep> and avoids random.randint being
# called with an empty range on a short speech.
seed_candidates = [
    words
    for words in (speech.split() for speech in file_test.split("<speech_sep>"))
    if len(words) > x_len
]
if not seed_candidates:
    raise ValueError("no speech in the test file has more than x_len words")

for iteration in range(1, 5):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(train_X, train_Y, batch_size=1280, epochs=2, validation_data=(val_X,val_Y))
    model.save_weights('GoTweights',overwrite=True)

    # Pick a random test speech (as a list of words) to seed the generation
    list_words = random.choice(seed_candidates)
    # Select a starting point for the context
    # randint(a,b) selects from all integers between a and b (inclusive)
    start_index = random.randint(0, len(list_words) - x_len - 1)

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity)
        # Slicing copies, so mutating `sentence` below leaves `list_words` intact
        # and every diversity value starts from the same seed.
        sentence = list_words[start_index: start_index + x_len]
        generated = ' '.join(sentence)
        print('----- Generating with seed: "' , sentence , '"')
        print()
        sys.stdout.write(generated)
        print()

        for i in range(50):
            # One-hot encode the current context; unknown words stay all-zero.
            x = np.zeros((1, x_len, len(word_train)))
            for t, word in enumerate(sentence):
                if word in word_train:
                    x[0, t, word_indices[word]] = 1.

            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = indices_word[next_index]
            # Keep words space-separated in the accumulated text.
            generated += ' ' + next_word
            # Slide the context window forward by one word.
            del sentence[0]
            sentence.append(next_word)
            sys.stdout.write(' ')
            sys.stdout.write(next_word)
            sys.stdout.flush()
        print()


--------------------------------------------------
Iteration 1
Train on 80001 samples, validate on 83061 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/2
Epoch 2/2

----- diversity: 0.2
----- Generating with seed: " ['rocket', 'attacks', 'from', 'Gaza', ',', 'and', 'we', 'have', 'stood', 'up', 'for', 'Israel', 's', 'right', 'to', 'defend', 'itself', '.', 'And', 'that', 'is', 'why', 'Israel', 'has', 'a', 'right', 'to', 'expect', 'Hamas', 'to'] "

rocket attacks from Gaza , and we have stood up for Israel s right to defend itself . And that is why Israel has a right to expect Hamas to
 . . , that . . , , that . . . , , to , that the , . , . . . . . to . . , the . the , . , . the the , . . , . . . , . , .

----- diversity: 0.5
----- Generating with seed: " ['rocket', 'attacks', 'from', 'Gaza', ',', 'and', 'we', 'have', 'stood', 'up', 'for', 'Israel', 's', 'right', 'to', 'defend', 'itself', '.', 'And', 'that', 'is', 'why', 'Is