# The tutorial is here https://adventuresinmachinelearning.com/keras-lstm-tutorial/

In [1]:
import os
import tensorflow as tf
from tensorflow.python.client import device_lib

os.environ["CUDA_VISIBLE_DEVICES"]="1"
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)



import collections
import numpy as np
from keras.utils import to_categorical

def build_vocab(filename):
    data = read_words(filename)

    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    words, _ = list(zip(*count_pairs))
    word_to_id = dict(zip(words, range(len(words))))

    return word_to_id

def read_words(filename):
    with tf.gfile.GFile(filename, "r") as f:
        return f.read().replace("\n", "<eos>").split()
    
def file_to_word_ids(filename, word_to_id):
    data = read_words(filename)
    return [word_to_id[word] for word in data if word in word_to_id]

def load_data(data_path):
    # get the data paths
    train_path = os.path.join(data_path, "ptb.train.txt")
    valid_path = os.path.join(data_path, "ptb.valid.txt")
    test_path = os.path.join(data_path, "ptb.test.txt")

    # build the complete vocabulary, then convert text data to list of integers
    word_to_id = build_vocab(train_path)
    train_data = file_to_word_ids(train_path, word_to_id)
    valid_data = file_to_word_ids(valid_path, word_to_id)
    test_data = file_to_word_ids(test_path, word_to_id)
    vocabulary = len(word_to_id)
    reversed_dictionary = dict(zip(word_to_id.values(), word_to_id.keys()))

    return train_data, valid_data, test_data, vocabulary, reversed_dictionary


"""
This class generates batches of set size at set skip_step INSTEAD OF creating an ENOROMOUS list or array of text
fragments that would hog the entire memory. You don't have to create the entire tensor (array, list) up front, but
can use a Generator object using 'yield' (see https://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do)
that create the needed things on the fly.
"""
class KerasBatchGenerator(object):


    def __init__(self, data, num_steps, batch_size, vocabulary, skip_step=5):
        self.data = data
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.vocabulary = vocabulary
        # this will track the progress of the batches sequentially through the
        # data set - once the data reaches the end of the data set it will reset
        # back to zero
        self.current_idx = 0
        # skip_step is the number of words which will be skipped before the next
        # batch is skimmed from the data set
        self.skip_step = skip_step
    
    def generate(self):
        x = np.zeros((self.batch_size, self.num_steps))
        y = np.zeros((self.batch_size, self.num_steps, self.vocabulary))
        while True:
            for i in range(self.batch_size):
                if self.current_idx + self.num_steps >= len(self.data):
                    # reset the index back to the start of the data set
                    self.current_idx = 0
                x[i, :] = self.data[self.current_idx:self.current_idx + self.num_steps]
                temp_y = self.data[self.current_idx + 1:self.current_idx + self.num_steps + 1]
                # convert all of temp_y into a one hot representation
                y[i, :, :] = to_categorical(temp_y, num_classes=self.vocabulary)
                self.current_idx += self.skip_step
            yield x, y

In [2]:
data_path = '/home/amplifier/home/NEW_DL/LSTM'
train_data, valid_data, test_data, vocabulary, reversed_dictionary = load_data(data_path)

In [3]:
print([reversed_dictionary[x] for x in train_data[0:10]])

['aer', 'banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano', 'guterman', 'hydro-quebec']


In [4]:
num_steps = 30
batch_size=20
vocabulary=10000
skip_step=30

train_data_generator = KerasBatchGenerator(train_data,
                          num_steps=num_steps,
                          batch_size=batch_size,
                          vocabulary=vocabulary,
                          skip_step=skip_step)

valid_data_generator = KerasBatchGenerator(valid_data,
                          num_steps=num_steps,
                          batch_size=batch_size,
                          vocabulary=vocabulary,
                          skip_step=skip_step)

In [5]:
gg = train_data_generator.generate()


In [8]:
"""
We feed in batches of size (batchsize, len_input_sequence)
"""
# get one batch:
a = next(gg)

# the batch is a list of two elements: 
# (0) is input tensor of size (batchsize, len_input_sequence).
# (1) is the output tensor of size (batchsize, len_input_sequence, dictionary_size). This is needed for computing loss (output of last layer,
# vs. one-hot-encoded targets).

print('input shape:', a[0].shape)
print('ouput shape:', a[1].shape)

# The words in the sequences are coded by integers. These integers are transformed by the embedding layer into 500-dimensional vector representations of words.
# let's inspect one (x,y) pair from a batch:

# get x:
print([reversed_dictionary[a[0][0,i].astype('int32').tolist()] for i in range(30)])
# get y:
print([reversed_dictionary[np.argmax(a[1][0,i,:]).astype('int32').tolist()] for i in range(30)])

# you see that the y is a lagged version of the x.

input shape: (20, 30)
ouput shape: (20, 30, 10000)
['million', '<eos>', 'the', 'thrift', 'holding', 'company', 'said', 'it', 'expects', 'to', 'obtain', 'regulatory', 'approval', 'and', 'complete', 'the', 'transaction', 'by', 'year-end', '<eos>', '<unk>', 'international', 'inc.', 'said', 'its', '<unk>', '&', '<unk>', 'unit', 'completed']
['<eos>', 'the', 'thrift', 'holding', 'company', 'said', 'it', 'expects', 'to', 'obtain', 'regulatory', 'approval', 'and', 'complete', 'the', 'transaction', 'by', 'year-end', '<eos>', '<unk>', 'international', 'inc.', 'said', 'its', '<unk>', '&', '<unk>', 'unit', 'completed', 'the']


In [9]:
[reversed_dictionary[i] for i in range(20)]

['the',
 '<unk>',
 '<eos>',
 'N',
 'of',
 'to',
 'a',
 'in',
 'and',
 "'s",
 'that',
 'for',
 '$',
 'is',
 'it',
 'said',
 'on',
 'by',
 'at',
 'as']

In [10]:
## inspect the data:
# NEXT is a cool statement that tells the generator to generate the next item. So generator is cool thing that
# lets you get the items (i.e. slices of huge tensors) without hogging all the memory!!!


# num_steps = 4
# batch_size=5
# vocabulary=10000
# skip_step=1

# generator = KerasBatchGenerator(test_data,
#                           num_steps=num_steps,
#                           batch_size=batch_size,
#                           vocabulary=vocabulary,
#                           skip_step=skip_step)
# inspect_how_many_batches = 3
# for i in range(inspect_how_many_batches):
#     batch = next(generator.generate())
#     print('\nBatch: {}\n{}\nlabels:'. format(i, batch[0].astype('int')))
#     for j in range(batch_size):
#         print(np.nonzero(batch[1][j])[1])

In [11]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Embedding, Dropout, TimeDistributed
from keras.layers import LSTM
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, CSVLogger

hidden_size = 500

model = Sequential()
model.add(Embedding(vocabulary, hidden_size, input_length=num_steps))
model.add(LSTM(hidden_size, return_sequences=True))
model.add(LSTM(hidden_size, return_sequences=True))
model.add(Dropout(0.1))
model.add(TimeDistributed(Dense(vocabulary)))
model.add(Activation('softmax'))
model.summary()

# model = load_model(data_path + "model-30.hdf5")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 30, 500)           5000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 30, 500)           2002000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 30, 500)           2002000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 30, 500)           0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 30, 10000)         5010000   
_________________________________________________________________
activation_1 (Activation)    (None, 30, 10000)         0         
Total params: 14,014,000
Trainable params: 14,014,000
Non-trainable params: 0
________________________________________________________________

In [12]:
len(train_data)//(batch_size*num_steps)

1549

In [11]:
num_epochs = 40
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
checkpointer = ModelCheckpoint(filepath=data_path + 'model-{epoch:02d}.hdf5', verbose=1)
csv_logger = CSVLogger('log.csv', append=True, separator=',')

model.fit_generator(train_data_generator.generate(), len(train_data)//(batch_size*num_steps), num_epochs,
                        validation_data=valid_data_generator.generate(),
                        validation_steps=len(valid_data)//(batch_size*num_steps), callbacks=[checkpointer, csv_logger])

Epoch 1/40

KeyboardInterrupt: 

In [16]:
# model = load_model(data_path + "model-15.hdf5")
dummy_iters = 40
example_training_generator = KerasBatchGenerator(train_data,
                                                 num_steps=num_steps,
                                                 batch_size=1,
                                                 vocabulary=vocabulary,
                                                 skip_step=skip_step)


print("Training data:")
for i in range(dummy_iters):
    dummy = next(example_training_generator.generate())
num_predict = 10
true_print_out = "Actual words: "
pred_print_out = "Predicted words: "
for i in range(num_predict):
    data = next(example_training_generator.generate())
    prediction = model.predict(data[0])
    predict_word = np.argmax(prediction[:, num_steps-1, :])
    true_print_out += reversed_dictionary[train_data[num_steps + dummy_iters + i]] + " "
    pred_print_out += reversed_dictionary[predict_word] + " "
print(true_print_out)
print(pred_print_out)

Training data:
Actual words: director of this british industrial conglomerate <eos> a form of 
Predicted words: the <eos> bonds obligations size to said mazda <unk> and 


In [51]:
pred = model.predict(np.array(train_data[500:530]).reshape(1,-1))
print(' '.join([reversed_dictionary[i] for i in from_categorical(pred)]))

to of <unk> <eos> <eos> the the of <eos> the <unk> <eos> <unk> said <eos> the <unk> <unk> n't of the <unk> <unk> <unk> <eos> the n't have a <unk>


In [44]:
print(data[0].shape)
pred = model.predict(data[0])
pred.shape

(1, 30)


(1, 30, 10000)

In [45]:
def from_categorical(arr):
    out = np.zeros([arr.shape[1]])
    for i in range(arr.shape[1]):
        out[i] = np.argmax(arr[0, i, :])
    return out.astype('int').tolist()

In [52]:
pred = model.predict(data[0])
print('PREDICTED SEQUENCE:')
print(' '.join([reversed_dictionary[i] for i in from_categorical(pred)]))

PREDICTED SEQUENCE:
operations and <eos> the <unk> said N old <unk> the <unk> of the the inc. the of the of former of of <unk> <unk> bank union <eos> the spokesman of


In [47]:
print('TRUE SEQUENCE:')
print(' '.join([reversed_dictionary[i] for i in data[0].flatten().tolist()]))

TRUE SEQUENCE:
dutch publishing group <eos> rudolph <unk> N years old and former chairman of consolidated gold fields plc was named a nonexecutive director of this british industrial conglomerate <eos> a form


In [16]:
print('LABELS:')
print(' '.join([reversed_dictionary[i] for i in from_categorical(data[1])]))

LABELS:
publishing group <eos> rudolph <unk> N years old and former chairman of consolidated gold fields plc was named a nonexecutive director of this british industrial conglomerate <eos> a form of
