# This notebook is based on the Keras LSTM tutorial at https://adventuresinmachinelearning.com/keras-lstm-tutorial/

In [44]:
import os
import tensorflow as tf
from tensorflow.python.client import device_lib

# Restrict CUDA to the device with index 1, then open a TensorFlow session
# whose GPU options have `allow_growth` switched on.
os.environ.update(CUDA_VISIBLE_DEVICES="1")

config = tf.ConfigProto()
config.gpu_options.allow_growth = True

sess = tf.Session(config=config)



import collections
import numpy as np
from keras.utils import to_categorical

In [96]:
# Directory holding the ptb.*.txt corpus files; model checkpoints are written here too.
# Set the LSTM_DATA_PATH environment variable to point the notebook at another
# location without editing the code. File names are appended with plain string
# concatenation, so os.path.join(..., '') is used to guarantee a trailing separator.
data_path = os.path.join(os.environ.get('LSTM_DATA_PATH', '/home/amplifier/home/NEW_DL/LSTM/'), '')

def read_words(filename):
    """Return the full text of `filename` as one string with every newline removed.

    The file is opened in text mode through `tf.gfile.GFile`.
    """
    with tf.gfile.GFile(filename, "r") as handle:
        contents = handle.read()
    return contents.replace("\n", "")

def preproc(data):
    """Return `data` with corpus markers, digits and punctuation removed.

    The substrings '<unk> ', "'s ", "'ve " and 'N ' are deleted first, in that
    order. The 'N ' deletion runs twice, so an 'N ' that only appears after the
    first pass is removed as well. Afterwards every digit and each of the
    characters / - . * \\ & ! @ # $ ' is dropped.
    """
    cleaned = data
    for fragment in ('<unk> ', '\'s ', '\'ve ', 'N ', 'N '):
        cleaned = cleaned.replace(fragment, '')
    # Deletion table: every listed character maps to None.
    unwanted = str.maketrans('', '', '1234567890/-.*\\&!@#$\'')
    return cleaned.translate(unwanted)

def build_vocab(data):
    """Map every distinct character of `data` to an integer index.

    Characters are sorted, so indices run from 0 to n-1 in character order and
    the mapping is deterministic for a given text.

    Parameters
    ----------
    data : str
        Text whose characters form the vocabulary.

    Returns
    -------
    dict
        {character: index}; empty when `data` is empty.
    """
    # Use the `data` argument itself rather than a notebook-level variable, so
    # the function also works on a fresh kernel.
    return {char: index for index, char in enumerate(sorted(set(data)))}

def get_str(string, length, batch_size, vocab=None):
    """
    Endlessly yield random (X, Y) batches for next-character prediction.

    This generator builds batches on the fly INSTEAD OF creating an ENORMOUS list or array of text
    fragments that would hog the entire memory. You don't have to create the entire tensor (array, list) up front, but
    can use a generator object using 'yield' (see https://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do)
    that creates the needed things on the fly.

    Parameters
    ----------
    string : str
        Text to sample windows from; must be longer than `length`.
    length : int
        Number of characters in each window.
    batch_size : int
        Number of windows per batch.
    vocab : dict, optional
        {character: index} mapping. Defaults to the notebook-level vocabulary `d`.

    Yields
    ------
    X : uint8 array of shape (batch_size, length)
        Character indices of each window.
    Y : uint8 array of shape (batch_size, length, len(vocab))
        One-hot encoding of the same window shifted one character to the right.

    Raises
    ------
    ValueError
        If `string` is not longer than `length`, or the vocabulary has more
        than 256 entries (indices would not fit into uint8).
    KeyError
        If a sampled character is missing from the vocabulary.
    """
    if vocab is None:
        # Resolved when the generator is first advanced, so `d` may be defined after this function.
        vocab = d
    if len(string) <= length:
        raise ValueError("string must be longer than length (%d <= %d)" % (len(string), length))
    n_symbols = len(vocab)
    if n_symbols > 256:
        raise ValueError("vocabulary of %d symbols does not fit into uint8 indices" % n_symbols)

    positions = np.arange(length)
    while True:
        # Allocate fresh arrays for every batch: a consumer that queues batches
        # must not see an earlier batch being overwritten in place.
        X = np.zeros((batch_size, length), dtype='uint8')
        Y = np.zeros((batch_size, length, n_symbols), dtype='uint8')
        for i in range(batch_size):
            # start <= len(string) - length - 1, so the shifted target window
            # string[start+1:end+1] always lies fully inside the text.
            start = np.random.choice(len(string) - length)
            end = start + length
            X[i, :] = [vocab[ch] for ch in string[start:end]]
            targets = [vocab[ch] for ch in string[start + 1:end + 1]]
            Y[i, positions, targets] = 1  # one-hot: one 1 per time step
        yield X, Y
        
# Load each split and clean it. Each split is preprocessed from its own raw
# text, so the cell does not rely on a leftover `data` variable in the kernel.
valid_data = preproc(read_words(data_path + 'ptb.valid.txt'))
train_data = preproc(read_words(data_path + 'ptb.train.txt'))

# Character -> index vocabulary, built from the training text only.
d = build_vocab(train_data)

In [97]:
# Batch geometry: each batch holds `batch_size` windows of `num_steps` characters.
num_steps = 10
batch_size = 500

# Batch generators over the training and validation text.
train_data_generator = get_str(train_data, length=num_steps, batch_size=batch_size)
validation_data_generator = get_str(valid_data, length=num_steps, batch_size=batch_size)

In [144]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Embedding, Dropout, TimeDistributed
from keras.layers import LSTM
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, CSVLogger

hidden_size = 100

# Layer stack: embedding of the character indices, two LSTM layers that return
# the full sequence, dropout, then a per-time-step dense layer over the
# vocabulary followed by softmax.
model = Sequential([
    Embedding(len(d), hidden_size, input_length=num_steps),
    LSTM(hidden_size, return_sequences=True),
    LSTM(hidden_size, return_sequences=True),
    Dropout(0.1),
    TimeDistributed(Dense(len(d))),
    Activation('softmax'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 10, 100)           2700      
_________________________________________________________________
lstm_13 (LSTM)               (None, 10, 100)           80400     
_________________________________________________________________
lstm_14 (LSTM)               (None, 10, 100)           80400     
_________________________________________________________________
dropout_7 (Dropout)          (None, 10, 100)           0         
_________________________________________________________________
time_distributed_7 (TimeDist (None, 10, 27)            2727      
_________________________________________________________________
activation_7 (Activation)    (None, 10, 27)            0         
Total params: 166,227
Trainable params: 166,227
Non-trainable params: 0
_________________________________________________________________


In [145]:
num_epochs = 400
# Callbacks: write a checkpoint file per epoch under data_path and append the
# per-epoch metrics to log.csv. They only take effect when handed to
# fit_generator through `callbacks=` below.
checkpointer = ModelCheckpoint(filepath=data_path + 'model-{epoch:02d}.hdf5', verbose=1)
csv_logger = CSVLogger('log.csv', append=True, separator=',')

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
# The generators are endless, so the number of batches per epoch is given
# explicitly: roughly one pass over the text in windows of num_steps characters.
model.fit_generator(train_data_generator,
                    steps_per_epoch=len(train_data)//(batch_size*num_steps),
                    epochs=num_epochs,
                    validation_data=validation_data_generator,
                    validation_steps=len(valid_data)//(batch_size*num_steps),
                    callbacks=[checkpointer, csv_logger]
                   )

Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400
Epoch 74/400
Epoch 75/400
Epoch 76/400
Epoch 77/400
Epoch 78

KeyboardInterrupt: 

In [146]:
# Index -> character lookup, inverted from the stored indices so it stays
# correct regardless of the order in which `d` iterates.
rev_d = {index: char for char, index in d.items()}

# The model takes exactly num_steps characters; for each position it predicts the next character.
seed_text = 'at is the '
assert len(seed_text) == num_steps, "seed_text must contain exactly num_steps characters"
tst = np.array([d[j] for j in seed_text]).reshape(1, num_steps)
pred = np.argmax(model.predict(tst).reshape(num_steps, len(d)), axis=1)
print(pred)
print([rev_d[i] for i in pred])

[14  0 20 20  0 20  8  5  0 19]
['n', ' ', 't', 't', ' ', 't', 'h', 'e', ' ', 's']


In [None]:
# NOTE(review): bare expression with no assignment — looks like a leftover scratch cell; consider deleting it.
2+2

In [132]:
# Display the index -> character mapping.
rev_d

{0: ' ',
 1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z'}