In [17]:
from __future__ import print_function
import os

import numpy as np
import zipfile
import tarfile
from six.moves.urllib.request import urlretrieve
import shutil 
import random

import string
import tensorflow as tf

# Dirs - must be absolute paths!
LOG_DIR = '/tmp/tf/ptb_char_lstm_mann_lrua_shared_memory/h16b1s1lru3_no_trunk/'
# Local dir where PTB files will be stored.
PTB_DIR = '/home/tkornuta/data/ptb/'

# Filenames.
TRAIN = "ptb.train.txt"
VALID = "ptb.valid.txt"
TEST = "ptb.test.txt"

# Size of the hidden state
HIDDEN_SIZE = 4

# Size of batch
BATCH_SIZE = 1

# Length of sequence (=  number of units of controller (recurrent layer))
SEQ_LENGTH = 1

#### MANN-related parameters.
# Size of the local memory of each cell.
MEMORY_SLOTS = 10

# Number of smallest elements - used in LRUA scheme.
N_SMALLEST = 1

# "Update weight decay".
GAMMA = 0.95

# Eps for normalization in visualization
EPS = 1e-10

In [18]:
def maybe_download_ptb(path, 
                       filename='simple-examples.tgz', 
                       url='http://www.fit.vutbr.cz/~imikolov/rnnlm/', 
                       expected_bytes =34869662):
  # Eventually create the PTB dir.
  if not tf.gfile.Exists(path):
    tf.gfile.MakeDirs(path)
  """Download a file if not present, and make sure it's the right size."""
  _filename = path+filename
  if not os.path.exists(_filename):
    print('Downloading %s...' % filename)
    _filename, _ = urlretrieve(url+filename, _filename)
  statinfo = os.stat(_filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified', (_filename), '(', statinfo.st_size, ')')
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + _filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download_ptb(PTB_DIR)

Found and verified /home/tkornuta/data/ptb/simple-examples.tgz ( 34869662 )


### Check/maybe download PTB.

### Extract dataset-related files from the PTB archive.

In [19]:
def extract_ptb(path, filename='simple-examples.tgz', files=["ptb.train.txt", "ptb.valid.txt", "ptb.test.txt", 
                                       "ptb.char.train.txt", "ptb.char.valid.txt", "ptb.char.test.txt"]):
    """Extracts files from PTB archive."""
    # Extract
    tar = tarfile.open(path+filename)
    tar.extractall(path)
    tar.close()
    # Copy files
    for file in files:
        shutil.copyfile(PTB_DIR+"simple-examples/data/"+file, PTB_DIR+file)
    # Delete directory
    shutil.rmtree(PTB_DIR+"simple-examples/")        

extract_ptb(PTB_DIR)

### Load train, valid and test texts.

In [20]:
def read_data(filename, path):
    with open(path+filename, 'r') as myfile:
        data=myfile.read()# .replace('\n', '')
        return data

train_text = read_data(TRAIN, PTB_DIR)
train_size=len(train_text)
print(train_size, train_text[:100])

valid_text = read_data(VALID, PTB_DIR)
valid_size=len(valid_text)
print(valid_size, valid_text[:64])

test_text = read_data(TEST, PTB_DIR)
test_size=len(test_text)
print(test_size, test_text[:64])

5101618  aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memote
399782  consumers may want to move their telephones a little closer to 
449945  no it was n't black monday 
 but while the new york stock excha


### Utility functions to map characters to vocabulary IDs and back.

In [21]:
vocabulary_size = 59 # [A-Z] + [a-z] + ' ' +few 'in between; + punctuation
first_letter = ord(string.ascii_uppercase[0]) # ascii_uppercase before lowercase! 
print("vocabulary size = ", vocabulary_size)
print(first_letter)

def char2id(char):
  """ Converts char to id (int) with one-hot encoding handling of unexpected characters"""
  if char in string.ascii_letters:# or char in string.punctuation or char in string.digits:
    return ord(char) - first_letter + 1
  elif char == ' ':
    return 0
  else:
    # print('Unexpected character: %s' % char)
    return 0
  
def id2char(dictid):
  """ Converts single id (int) to character"""
  if dictid > 0:
    return chr(dictid + first_letter - 1)
  else:
    return ' '

def characters(probabilities):
  """Turn a 1-hot encoding or a probability distribution over the possible
  characters back into its (most likely) character representation."""
  return [id2char(c) for c in np.argmax(probabilities, 1)]

def batches2string(batches):
  """Convert a sequence of batches back into their (most likely) string
  representation."""
  s = [''] * batches[0].shape[0]
  for b in batches:
    s = [''.join(x) for x in zip(s, characters(b))]
  return s

#print(len(string.punctuation))
#for i in string.ascii_letters:
#    print (i, char2id(i))


print(char2id('a'), char2id('A'), char2id('z'), char2id('Z'), char2id(' '), char2id('ï'))
print(id2char(char2id('a')), id2char(char2id('A')))
#print(id2char(65), id2char(33), id2char(90), id2char(58), id2char(0))
#bankno
sample = np.zeros(shape=(1, vocabulary_size), dtype=np.float)
sample[0, char2id(' ')] = 1.0
print(sample)

vocabulary size =  59
65
33 1 58 26 0 0
a A
[[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.]]


### Helper class for batch generation

In [22]:
class BatchGenerator(object):
  def __init__(self, text, batch_size, seq_length, vocab_size):
    """
    Initializes the batch generator object. Stores the variables and first "letter batch".
    text is text to be processed
    batch_size is size of batch (number of samples)
    seq_length represents the length of sequence
    vocab_size is number of words in vocabulary (assumes one-hot encoding)
    """
    # Store input parameters.
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._seq_length = seq_length
    self._vocab_size = vocab_size
    # Divide text into segments depending on number of batches, each segment determines a cursor position for a batch.
    segment = self._text_size // batch_size
    # Set initial cursor position.
    self._cursor = [ offset * segment for offset in range(batch_size)]
    # Store first "letter batch".
    self._last_letter_batch = self._next_letter_batch()
  
  def _next_letter_batch(self):
    """
    Returns a batch containing of encoded single letters depending on the current batch 
    cursor positions in the data.
    Returned "letter batch" is of size batch_size x vocab_size
    """
    letter_batch = np.zeros(shape=(self._batch_size, self._vocab_size), dtype=np.float)
    # Iterate through "samples"
    for b in range(self._batch_size):
      # Set 1 in position pointed out by one-hot char encoding.
      letter_batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return letter_batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    # First add last letter from previous batch (the "additional one").
    batches = [self._last_letter_batch]
    for step in range(self._seq_length):
      batches.append(self._next_letter_batch())
    # Store last "letter batch" for next batch.
    self._last_letter_batch = batches[-1]
    return batches


In [23]:
# Trick - override first 10 chars
#list1 = list(train_text)
#for i in range(2):
#    list1[i] = 'z'
#train_text = ''.join(list1)
#print("Train set =", train_text[0:100])

# Create objects for training, validation and testing batch generation.
train_batches = BatchGenerator(train_text, BATCH_SIZE, SEQ_LENGTH, vocabulary_size)

# Get first training batch.
batch = train_batches.next()
print(len(batch))
print(batch[0].shape)
#print("Batch = ", batch)
#print(batches2string(batch))
#print("batch len = num of enrollings",len(batch))
#for i in range(num_unrollings):
#    print("i = ", i, "letter=", batches2string(batch)[0][i][0], "bits = ", batch[i][0])


# For validation  - process the whole text as one big batch.
VALID_BATCH_SIZE = int(np.floor(valid_size/SEQ_LENGTH))
valid_batches = BatchGenerator(valid_text, VALID_BATCH_SIZE, SEQ_LENGTH, vocabulary_size)
valid_batch = valid_batches.next()
#print (VALID_BATCH_SIZE)
#print(len(valid_batch))
#print(valid_batch[0].shape)

# For texting  - process the whole text as one big batch.
TEST_BATCH_SIZE = int(np.floor(test_size/SEQ_LENGTH))
test_batches = BatchGenerator(test_text, TEST_BATCH_SIZE, SEQ_LENGTH, vocabulary_size)
# Get single batch! 
#test_batch = test_batches.next()


2
(1, 59)


In [34]:
# Definition of the cell computation.
def controller_cell(input_, # input x
                    memory_input_, # read vector from the memory returned by previous cell
                    prev_output_, # output of the previous cell
                    prev_cell_state_, # previous cell state
                    prev_read_weights_batch_, # read weights from previous time state (t-1) 
                    prev_update_weights_batch_, # update weights from previous time state (t-1)
                    name_):
    """Create a controller with local memory cell"""
    """First dimensions of each of the computational nodes below is "derrived" from BATCH_SIZE"""
    with tf.name_scope(name_):

        with tf.name_scope("LSTM"):
            # LSTM cell equations according to Christopher Olah blog.
            # colah.github.io/posts/2015-08-Understanding-LSTMs/
            # Concatenate intyp x with h_prev ("prev output") TODO: and memory.
            i_h_m = tf.concat([input_, prev_output_, memory_input_], 1)

            # Calculate forget, input and output gate activations.
            forget_gate = tf.sigmoid(tf.matmul(i_h_m, Wf) + bf, name="Forget_gate")
            input_gate = tf.sigmoid(tf.matmul(i_h_m, Wi) + bi, name="Input_gate")
            output_gate = tf.sigmoid(tf.matmul(i_h_m, Wo) + bo, name="Output_gate")

            # Update of the cell state C~.
            cell_update = tf.tanh(tf.matmul(i_h_m, Wc) + bc, name="Cell_update")
            # New cell state C.
            cell_state = tf.add(forget_gate * prev_cell_state_, input_gate * cell_update, name = "Cell_state")
            # Calculate h - "output".
            cell_output = output_gate * tf.tanh(cell_state)
            
        with tf.name_scope("Keys"):
            # Calculate keys - read and add.
            k_t = tf.tanh(tf.matmul(cell_output, W_key) + b_key) # (batch_size, nb_reads, memory_size[1])
            #a_t = tf.tanh(tf.matmul(cell_output, W_add) + b_add) # (batch_size, nb_reads, memory_size[1])
            alpha = tf.sigmoid(tf.matmul(cell_output, W_alpha) + b_alpha) # (batch_size, nb_reads, 1)

            # Add histograms to TensorBoard.
            k_t_hist = tf.summary.histogram("k_t", k_t)
            #a_t_hist = tf.summary.histogram("a_t", a_t)
            
        # Read from the memory.
        with tf.name_scope("Read_head"):
            
            # Normalize batch.
            norm_batch = tf.nn.l2_normalize(k_t,1)
            # Normalize memory.
            norm_memory = tf.nn.l2_normalize(memory,0)
            # Calculate cosine similarity.
            similarity_batch = tf.matmul(norm_batch, norm_memory)
            # Calculate read weights based on similarity.
            read_weights_batch = tf.nn.softmax(similarity_batch)

            # Add to list returned as "previous read weights".
            read_weights_seq_batch.append(read_weights_batch)
            
            # Add histograms to TensorBoard.
            norm_batch_hist = tf.summary.histogram("norm_batch", norm_batch)
            norm_batch_hist = tf.summary.histogram("norm_batch", norm_batch)
            norm_memory_hist = tf.summary.histogram("norm_memory", norm_memory)
            similarity_batch_hist = tf.summary.histogram("cosine_similarity_batch", similarity_batch)
            read_weights_batch_hist = tf.summary.histogram("read_weights_batch", read_weights_batch)

            # Create hot-cold visualization of read head (red=positive/blue=negative)
            zeros = tf.zeros_like(read_weights_batch) # batch_size (VARYING) x MEMORY_SLOTS
            # Get negative values only.
            neg = tf.less(read_weights_batch, zeros)
            blue = tf.multiply(tf.cast(neg, tf.float32), read_weights_batch)
            min_blue = tf.reduce_min(read_weights_batch, axis=1) + EPS
            norm_blue = 255.0 * blue/min_blue
            # Get positive values only.
            pos = tf.greater(read_weights_batch, zeros)
            red = tf.multiply(tf.cast(pos, tf.float32), read_weights_batch)
            max_red = tf.reduce_max(read_weights_batch, axis=1) + EPS
            norm_red = 255.0 * red/max_red
            # Stack them into three channel image with hot-cold values.
            rgb_read_weights_batch = tf.stack([norm_red, zeros, norm_blue], axis=2)
            rgb_read_weights_batch_reshaped = tf.reshape(rgb_read_weights_batch, [1, -1, MEMORY_SLOTS, 3])

            # Visualize read weights as image.
            rgb_read_weights_batch_img = tf.summary.image("read_weights_batch_reshaped", rgb_read_weights_batch_reshaped)
            
        
        with tf.name_scope("Memory_output"):
            # Calcualte read vector.
            memory_output_batch = tf.tensordot(read_weights_batch, tf.transpose(memory), axes=1, name="Memory_output_batch_r")   
            # Add histograms to TensorBoard.
            memory_output_batch_hist = tf.summary.histogram("memory_output_batch", memory_output_batch)

        with tf.name_scope("Write_head"):
            # "Truncation scheme to update the least-used positions".
            # First, find (size-n) top elements (in each "batch sample"/head separatelly).
            top = tf.nn.top_k(-prev_update_weights_batch_, N_SMALLEST)
            # To get boolean True/False values, you can first get the k-th value and then use tf.greater_equal:
            kth = tf.reduce_min(top.values, axis=1, keep_dims=True)
            top2 = tf.greater_equal(-prev_update_weights_batch_, kth)
            # And finally - cast it to n smallest elements.
            prev_smallest_lru_weights = tf.cast(top2, tf.float32)

            #write_weights_seq_batch.append(prev_smallest_lru_weights)
            write_weights_batch = tf.add(tf.sigmoid(alpha) * prev_read_weights_batch_,
                                   (1.0 - tf.sigmoid(alpha)) * prev_smallest_lru_weights,
                                   name="Write_weights_ww")

            # Add histograms to TensorBoard.
            smallest_lru_weight_batch_hist = tf.summary.histogram("smallest_lru_weight_batch", prev_smallest_lru_weights)
            write_weights_batch_hist = tf.summary.histogram("write_weights_batch", write_weights_batch)

        with tf.name_scope("Update_head"):
            # This relies on prev. weights and will be used in fact in the NEXT step.
            update_weights_batch = tf.add(GAMMA * prev_update_weights_batch_,
                                            read_weights_batch + write_weights_batch,
                                            name="Update_weights_uw")
            
            # Add to list returned as "previous update weights".
            update_weights_seq_batch.append(update_weights_batch)
            # Add histograms to TensorBoard.
            update_weights_batch_hist = tf.summary.histogram("update_weights_batch", update_weights_batch)

            
    with tf.name_scope("Memory_update"):
        # Perform single update for each sequence/batch.
        memory_update_batch = tf.tensordot(tf.transpose(k_t), write_weights_batch, axes=1)
        # Add dependendency control - first prediction?
        #with tf.control_dependencies([prediction_batch]):
        # Update the memory
        memory_update_op = memory.assign(memory + memory_update_batch)
        
        # Create hot-cold visualization of memory (red=positive/blue=negative)
        zeros = tf.zeros([HIDDEN_SIZE, MEMORY_SLOTS])
        # Get negative values only.
        neg = tf.less(memory, zeros)
        blue = tf.multiply(tf.cast(neg, tf.float32), memory)
        min_blue = tf.reduce_min(memory, axis=0) + EPS
        norm_blue = 255.0 * blue/min_blue
        # Get positive values only.
        pos = tf.greater(memory, zeros)
        red = tf.multiply(tf.cast(pos, tf.float32), memory)
        max_red = tf.reduce_max(memory, axis=0) + EPS
        norm_red = 255.0 * red/max_red
        # Stack them into three channel image with hot-cold values.
        rgb_memory = tf.stack([norm_red, zeros, norm_blue], axis=2)
        rgb_memory_reshaped = tf.reshape(rgb_memory, [1, HIDDEN_SIZE, MEMORY_SLOTS, 3])
        
        # Memory "truncation".
        #memory_trunk_op = memory.assign(tf.tanh(memory))
        #memory_trunk_vis = tf.reshape(memory, [1,HIDDEN_SIZE, MEMORY_SLOTS,1])

        # Add histograms to TensorBoard.
        memory_update_batch_hist = tf.summary.histogram("memory_update_batch", memory_update_batch)
        memory_hist = tf.summary.histogram("memory_before_truncation", memory_update_op)
        #memory_trunk_hist = tf.summary.histogram("memory_after_truncation", memory_trunk_op)
        # Visualize memory as image.
        memory_updated_img = tf.summary.image("memory_before_truncation", rgb_memory_reshaped)
        #memory_trunk_img = tf.summary.image("memory_after_truncation", memory_trunk_vis)

        
    return memory_output_batch, cell_output, cell_state

print("Cell definition OK")

Cell definition OK


###  Definition of tensor graph

In [35]:
# Reset graph - just in case.
tf.reset_default_graph()

# Memory.
memory = tf.Variable(tf.truncated_normal(shape=[HIDDEN_SIZE, MEMORY_SLOTS]), trainable=False, name="Memory_M")
# Latest vs LRU ratio.
#alpha = tf.Variable(tf.truncated_normal(shape=[1]), name="Alpha")

# 0. Previous variables.
with tf.name_scope("Previous_variables"):
    # Create "read vectors" (in fact batch).
    read_vectors_seq_batch = list()    
    for i_seq in range(SEQ_LENGTH):
        read_vectors_seq_batch.append(tf.placeholder(tf.float32, shape=[None, HIDDEN_SIZE], name="Read_vector_r"))    

    # Placeholders for previous weights.
    prev_read_weights_seq_batch = list()    
    prev_update_weights_seq_batch = list()    
    for i_seq in range(SEQ_LENGTH):
        prev_read_weights_seq_batch.append(tf.placeholder(tf.float32, shape=[None, MEMORY_SLOTS], name="Prev_rw"))
        prev_update_weights_seq_batch.append(tf.placeholder(tf.float32, shape=[None, MEMORY_SLOTS], name="Prev_uw"))

# 1. Placeholders for inputs.
with tf.name_scope("Input_data"):
    # Define input data buffers.
    data_buffers = list()
    for _ in range(SEQ_LENGTH + 1):
        # Collect placeholders for inputs/labels: Batch x Vocab size.
        data_buffers.append(tf.placeholder(tf.float32, shape=[None, vocabulary_size], name="data_buffers"))
    print ("data_buffers shape =", data_buffers[0].shape)

    # Sequence of batches.
    input_seq_batch = data_buffers[:SEQ_LENGTH]
    print ("Seq length  =", len(input_seq_batch))
    print ("Batch shape =", input_seq_batch[0].shape)

    # Labels are pointing to the same placeholders!
    # Labels are inputs shifted by one time step.
    labels_seq_batch = data_buffers[1:]  
    # Concatenate targets into 2D tensor.
    target_batch = tf.concat(labels_seq_batch, 0)

    # Add histograms to TensorBoard.
    input_seq_batch_hist = tf.summary.histogram("input_seq_batch", input_seq_batch)

# 2. Unrolled controller ops.
with tf.name_scope("Controller"):
    # Define parameters:
    # Input gate: input, previous output, and bias.
    Wf = tf.Variable(tf.truncated_normal([vocabulary_size+2*HIDDEN_SIZE, HIDDEN_SIZE], -0.1, 0.1), name="Wf")
    bf = tf.Variable(tf.zeros([1, HIDDEN_SIZE]), name="bf")

    # Forget gate: input, previous output, and bias.
    Wi = tf.Variable(tf.truncated_normal([vocabulary_size+2*HIDDEN_SIZE,HIDDEN_SIZE], -0.1, 0.1), name="Wi")
    bi = tf.Variable(tf.zeros([1, HIDDEN_SIZE]), name="bi")

    # Memory cell: input, state and bias.                             
    Wc = tf.Variable(tf.truncated_normal([vocabulary_size+2*HIDDEN_SIZE, HIDDEN_SIZE], -0.1, 0.1), name="Wc")
    bc = tf.Variable(tf.zeros([1, HIDDEN_SIZE]), name="bc")

    # Output gate: input, previous output, and bias.
    Wo = tf.Variable(tf.truncated_normal([vocabulary_size+2*HIDDEN_SIZE, HIDDEN_SIZE], -0.1, 0.1), name="Wo")
    bo = tf.Variable(tf.zeros([1, HIDDEN_SIZE]), name="bo")

    # Read key.
    W_key = tf.Variable(tf.truncated_normal([HIDDEN_SIZE, HIDDEN_SIZE], -0.1, 0.1), name="W_key")
    b_key = tf.Variable(tf.zeros([1, HIDDEN_SIZE]), name="b_key")
    
    # Add key.
    #W_add = tf.Variable(tf.truncated_normal([HIDDEN_SIZE, HIDDEN_SIZE], -0.1, 0.1), name="W_add")
    #b_add = tf.Variable(tf.zeros([1, HIDDEN_SIZE]), name="b_add")
    
    # Alpha - used in Latest vs LRU ratio.
    W_alpha = tf.Variable(tf.truncated_normal([HIDDEN_SIZE, 1], -0.1, 0.1), name="W_alpha")
    b_alpha = tf.Variable(tf.zeros([1, 1]), name="b_alpha")
    
    # Placeholders for "zero" (the oldest) state and output: Batch x Hidden size.
    init_controller_output = tf.placeholder(tf.float32, shape=[None, HIDDEN_SIZE], name="init_controller_output")
    init_controller_state = tf.placeholder(tf.float32, shape=[None, HIDDEN_SIZE], name="init_controller_state")
    # Placeholder for "zero" memory read: Batch X Hidden (TODO: memory?) size.
    init_memory_output = tf.placeholder(tf.float32, shape=[None, HIDDEN_SIZE], name="init_memory_read")

    # Unrolled LSTM.
    # Build outpus of size SEQ_LENGTH.
    controller_outputs_batch_seq = list()
    memory_outputs_batch_seq = list()
    # Two lists that will be "returned" and later passed as previous states. 
    read_weights_seq_batch = list()  
    update_weights_seq_batch = list()  
    
    # "Link" oldest statte and output to placeholders.
    controller_output = init_controller_output
    controller_state = init_controller_state
    memory_output = init_memory_output
    # For every buffer in input sequence batch buffers...
    for i in range(SEQ_LENGTH):
        # ... add cell...     
        memory_output, controller_output, controller_state = controller_cell(
            input_seq_batch[i], 
            memory_output, 
            controller_output, 
            controller_state, 
            prev_read_weights_seq_batch[i],
            prev_update_weights_seq_batch[i],
            "cell_"+str(i))
        # .. add controller buffer to outputs...
        controller_outputs_batch_seq.append(controller_output)
        memory_outputs_batch_seq.append(memory_output)
        # .. and set memory input of (i+1) cell to i-th read vector buffer.
        #memory_input = read_vectors_seq_batch[i]
        
    # Add histograms to TensorBoard.
    controller_outputs_batch_seq_hist = tf.summary.histogram("controller_outputs_batch_seq", controller_outputs_batch_seq)
    memory_outputs_batch_seq_hist = tf.summary.histogram("memory_outputs_batch_seq", memory_outputs_batch_seq)
    memory_hist = tf.summary.histogram("memory", memory)
    read_weights_seq_batch_hist = tf.summary.histogram("read_weights_seq_batch", read_weights_seq_batch)
    update_weights_seq_batch_hist = tf.summary.histogram("update_weights_seq_batch", update_weights_seq_batch)

# 3. Output ops.
with tf.name_scope("Output"):
    # Concatenate controller hidden state with the read vector.
    coutput_rvector_seq_batch = list()    
    for i_seq in range(SEQ_LENGTH):
        coutput_rvector_seq_batch.append(tf.concat([controller_outputs_batch_seq[i_seq], 
                                                 memory_outputs_batch_seq[i_seq]], 1, name="Concat_coutput_rvector"))    
    # Add histograms to TensorBoard.
    coutput_rvector_seq_batch_hist = tf.summary.histogram("coutput_rvector_seq_batch", coutput_rvector_seq_batch)

    output_batch = tf.concat(controller_outputs_batch_seq, 0) 
    #output_batch = tf.concat(controller_outputs_batch_seq, 0)
 
    # Output layer weights and biases.
    w = tf.Variable(tf.truncated_normal([HIDDEN_SIZE, vocabulary_size], -0.1, 0.1), name="w")
    b = tf.Variable(tf.zeros([vocabulary_size]), name="b")

    # Logits.
    logits_batch = tf.nn.xw_plus_b(output_batch, w, b, name = "Final_FC")
    # Add fully connected softmax layer on top - predictions.
    prediction_batch = tf.nn.softmax(logits_batch)
    
# 4. Loss ops.
with tf.name_scope("Loss"):
    # Loss function(s) - one for every output generated by every LSTM cell.
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=target_batch, logits=logits_batch))
    # Add loss summary.
    loss_summary = tf.summary.scalar("loss", loss)

# 5. Training ops.  
with tf.name_scope("Optimization"):
    # Learning rate decay.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(0.1, global_step, 5000, 0.9, staircase=True)
    # Optimizer.
    optimizer = tf.train.AdamOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    # Gradient clipping.
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)

# Merge all summaries.
merged_summaries = tf.summary.merge_all()

print("Graph definition OK")

data_buffers shape = (?, 59)
Seq length  = 1
Batch shape = (?, 59)
Graph definition OK


In [36]:
def create_feed_dict(set_type_):
    """Creates a dictionaries for different sets: maps data onto Tensor placeholders."""
    feed_dict = dict()
    
    #if set_type_=="train":
    # Get next batch and create a feed dict.
    next_batch = train_batches.next()
    # Feed batch to input buffers.
    for i in range(SEQ_LENGTH + 1):
        feed_dict[data_buffers[i]] = next_batch[i]

    # Set previous weights of read and write heades.
    for i in range(SEQ_LENGTH):
        feed_dict[prev_read_weights_seq_batch[i]] = prev_rw_seq_batch[i]
        feed_dict[prev_update_weights_seq_batch[i]] = prev_uw_seq_batch[i]

    # Reset "init" state and output of controller.
    feed_dict[init_controller_output] = np.zeros([BATCH_SIZE, HIDDEN_SIZE])
    feed_dict[init_controller_state] = np.zeros([BATCH_SIZE, HIDDEN_SIZE])
    feed_dict[init_memory_output] = np.zeros([BATCH_SIZE, HIDDEN_SIZE])
            
    #elif set_type_=="valid":
    #    for i in range(SEQ_LENGTH + 1):
    #        feed_dict[data_buffers[i]] = valid_batch[i]

    # TODO: HOW TO VALIDATE !! when update/write depends on the previous batch??
    
    #else: # test
    #    for i in range(SEQ_LENGTH + 1):
    #        feed_dict[data_buffers[i]] = test_batch[i]
        
       
    return feed_dict

print("Feed_dict definition OK")

Feed_dict definition OK


### Session execution

In [37]:
# Eventually clear the log dir.
if tf.gfile.Exists(LOG_DIR):
  tf.gfile.DeleteRecursively(LOG_DIR)
# Create (new) log dir.
tf.gfile.MakeDirs(LOG_DIR)

print("Log dir CLEARED")

Log dir CLEARED


In [38]:
# How often the test loss on validation batch will be computed. 
summary_frequency = 100

# Create session to execute graph.
sess=tf.InteractiveSession()

# Create summary writers, point them to LOG_DIR.
train_writer = tf.summary.FileWriter(LOG_DIR + '/train', sess.graph)
valid_writer = tf.summary.FileWriter(LOG_DIR + '/valid')
test_writer = tf.summary.FileWriter(LOG_DIR + '/test')

# Initialize global variables.
tf.global_variables_initializer().run()
print('Variables initialized')


# Create initial previous read and update - full of zeros. 
prev_rw_seq_batch = list()
prev_uw_seq_batch = list()
for i in range(SEQ_LENGTH):
    prev_rw_seq_batch.append(np.zeros([BATCH_SIZE, MEMORY_SLOTS]))
    prev_uw_seq_batch.append(np.zeros([BATCH_SIZE, MEMORY_SLOTS]))

num_steps = 10000 #train_size // (BATCH_SIZE*SEQ_LENGTH) #70001
print("Number of iterations per epoch =", num_steps)
for step in range(num_steps):
    input_seq_batch_, memory_, prev_rw_seq_batch, prev_uw_seq_batch, summaries, _, loss_, lr_ = sess.run([
        input_seq_batch, memory, read_weights_seq_batch, update_weights_seq_batch, merged_summaries, optimizer, loss, learning_rate],
        feed_dict=create_feed_dict("train"))#batch_seq))
    
    # Add summary.
    train_writer.add_summary(summaries, step*BATCH_SIZE*SEQ_LENGTH)
    train_writer.flush()

    # Every (100) steps collect statistics.
    if step % summary_frequency == 0:
        print("memory=\n", memory_)
        # Print loss from last batch.
        print('Training set BPC at step %d: %0.5f learning rate: %f' % (step, loss_, lr_))
    
        # Validation set BPC.
        #v_summaries, v_loss = sess.run([merged_summaries, loss], feed_dict=create_feed_dict("valid"))
        #print("Validation set BPC: %.5f" % v_loss)
        #valid_writer.add_summary(v_summaries, step*BATCH_SIZE*SEQ_LENGTH)
        #valid_writer.flush()
    # End of statistics collection

# Test set BPC.
#print("Calculating BPC on test dataset")
#t_summary, t_loss = sess.run([merged_summaries, loss], feed_dict=create_feed_dict("test"))
#print("Final test set BPC: %.5f" % t_loss)
#test_writer.add_summary(t_summary, step*BATCH_SIZE*SEQ_LENGTH)
#test_writer.flush()
    
# Close writers and session.
train_writer.close()
valid_writer.close()
test_writer.close()
sess.close() 

Variables initialized
Number of iterations per epoch = 10000
memory=
 [[ 0.75184965  0.30351904  1.44875455  0.83386803  1.2392602  -0.10503245
  -0.15712482  0.19002728 -1.05824733  0.54126722]
 [ 1.16734219 -0.77776682 -1.74520576  0.72832596 -0.7012766  -1.19848382
   0.89097685  0.42031693 -0.91780639  0.1528822 ]
 [ 1.77687776  0.94992328  1.81453133  1.87101102 -0.65485996 -0.18847853
   0.38106561  1.18798268 -0.29824588  0.42026457]
 [ 0.10946539 -0.65063179  0.31195322 -0.87512648  1.65326619 -0.06295472
   1.21986663  0.93531328 -1.49960518  0.94153011]]
Training set BPC at step 0: 4.06645 learning rate: 0.100000
memory=
 [[ 0.63037449  0.17439751  1.40519369  0.71968079  0.98911417 -0.19926478
  -0.25553876 -0.05703909 -1.23643577  0.44028074]
 [ 1.24350083 -0.73337311 -1.54600823  0.80307835 -0.65089172 -1.07376468
   1.01595426  0.38472387 -0.85216331  0.28832588]
 [ 1.94348824  1.06258857  2.08798575  2.0263617  -0.48268571  0.03486843
   0.59045255  1.27941573 -0.1495202

memory=
 [[-0.32641214 -1.53912425 -0.06424771 -1.06877422 -0.6979928  -1.70987117
  -1.26720262 -1.18366063 -2.38479614 -0.85958385]
 [ 1.37348831 -1.61320722 -2.20604587 -0.00836594 -1.40623546 -1.69240248
   1.23175824 -0.03237147 -0.95428675  0.48707476]
 [ 2.53240705  0.96131837  2.35976887  1.94325387 -0.473488   -0.10904928
   1.4309727   1.68454134  0.14838496  1.10309446]
 [-1.0280751  -2.86793375 -1.89602578 -3.10135317 -0.64733613 -1.71546495
  -0.13460508 -1.05817842 -2.75957751 -0.07892621]]
Training set BPC at step 1500: 2.07581 learning rate: 0.100000
memory=
 [[ -5.02233684e-01  -1.63196588e+00  -5.24018444e-02  -1.22612119e+00
   -7.70031393e-01  -1.80708146e+00  -1.26461470e+00  -1.14242208e+00
   -2.53924012e+00  -9.10175800e-01]
 [  1.29848754e+00  -1.57505250e+00  -2.13258076e+00   2.15823613e-02
   -1.37503564e+00  -1.67405784e+00   1.38242054e+00   7.44318441e-02
   -9.33728218e-01   5.29690206e-01]
 [  2.47906041e+00   1.00910854e+00   2.46193790e+00   2.0088465

memory=
 [[-1.61808634 -2.34740973 -0.49198005 -1.10105383 -1.18109179 -2.66911173
  -1.82457483 -1.02998507 -3.42801237 -1.33085358]
 [ 1.69674468 -0.94783944 -1.2388804   1.27499735 -0.42994294 -1.05751216
   2.07619977  1.15531957 -0.43675056  1.28689373]
 [ 3.01775861  1.83890474  3.56521368  3.33117247  0.60573059  0.75599176
   2.35611534  3.04326439  0.89750564  2.17274785]
 [-1.68639672 -3.27416825 -2.05363202 -2.83483982 -0.67951643 -2.23144007
  -0.22559361 -0.70210582 -3.35644484 -0.33528578]]
Training set BPC at step 2900: 2.38987 learning rate: 0.100000
memory=
 [[-1.58877206 -2.34852552 -0.47741148 -1.30139196 -1.22787762 -2.74542999
  -1.88965178 -1.11652303 -3.49094772 -1.21097994]
 [ 1.78012836 -0.84788191 -1.18452394  1.22386169 -0.41445637 -1.0493654
   2.07603884  1.17871654 -0.42470837  1.42695451]
 [ 3.12387848  1.91631913  3.68268728  3.34021688  0.65900528  0.78810978
   2.40421247  3.07690668  0.93537897  2.34055424]
 [-1.66001463 -3.21207285 -2.09816599 -3.031

memory=
 [[ -1.63112497e+00  -2.54475760e+00  -2.27431223e-01  -1.89015377e+00
   -5.92362344e-01  -2.44709015e+00  -1.63757205e+00  -9.38894629e-01
   -2.96653676e+00  -1.18582368e+00]
 [  2.69469643e+00   5.66152744e-02  -2.42251158e-03   1.90641832e+00
    1.24308240e+00   3.86403233e-01   3.31090307e+00   2.26662374e+00
    1.06707764e+00   2.58016729e+00]
 [  4.25011444e+00   2.79542828e+00   5.08110523e+00   3.97457767e+00
    2.25240350e+00   1.98650026e+00   3.51860356e+00   4.16394377e+00
    2.45667076e+00   3.44577360e+00]
 [ -1.57092500e+00  -2.89480329e+00  -1.80171919e+00  -2.96566081e+00
    3.01070929e-01  -1.28896070e+00   4.70474988e-01  -2.06734002e-01
   -2.61391521e+00   2.73278207e-01]]
Training set BPC at step 4400: 2.75723 learning rate: 0.100000
memory=
 [[-1.72000539 -2.66571808 -0.32455865 -2.02026677 -0.77544743 -2.57126951
  -1.70779979 -1.06984401 -3.06221938 -1.37050021]
 [ 2.72486424  0.0721487   0.10732463  1.90330887  1.21785486  0.42230868
   3.400368

memory=
 [[-1.90644884 -2.20584297 -0.16585259 -1.70266581 -0.72092545 -2.81239486
  -1.34134507 -0.70717323 -3.17083335 -0.98057675]
 [ 4.22645283  1.99690425  1.64012575  3.70485568  2.76916718  1.89983273
   5.00511742  4.23652458  2.55685663  4.45243931]
 [ 5.27120638  4.08603764  6.16888332  5.12668753  3.24669647  2.84653139
   4.63087082  5.37632751  3.41306424  4.65630722]
 [-0.488821   -1.17341506 -0.46735609 -1.36222887  1.48767042 -0.07755944
   2.01172376  1.62404037 -1.4598105   1.93153381]]
Training set BPC at step 5800: 0.48418 learning rate: 0.090000
memory=
 [[-1.80482829 -2.22698259 -0.10919898 -1.6741432  -0.67657179 -2.7393086
  -1.33400273 -0.61061895 -3.15141702 -0.89735729]
 [ 4.41201687  2.07596231  1.79454136  3.8615315   2.91549397  2.06062031
   5.09564447  4.37475109  2.73653102  4.65528727]
 [ 5.36298418  4.10290051  6.23983335  5.17469358  3.31252098  2.93904614
   4.64529991  5.49625015  3.48513341  4.75643349]
 [-0.25468639 -1.07915199 -0.27779412 -1.157

memory=
 [[-0.82930756 -1.63312554  0.64615458 -1.42539728 -0.12108073 -2.42326283
  -0.34886622 -0.1622186  -2.78540134 -0.45733273]
 [ 7.49984503  4.80615902  4.51799059  6.04155731  5.60593796  4.4831419
   8.08967972  6.80575752  5.02350235  7.19777632]
 [ 7.19304562  5.49382973  7.64808846  6.30523682  4.65862226  4.19236755
   6.31022501  6.74461985  4.57042789  6.01133823]
 [ 2.87104964  1.85899067  2.63521028  1.04795659  4.56823587  2.58101249
   5.32397747  4.31989717  1.23101914  4.89541674]]
Training set BPC at step 7300: 2.46253 learning rate: 0.090000
memory=
 [[-0.73064899 -1.49785984  0.65072012 -1.31840491  0.09920975 -2.48180509
  -0.40224391  0.07966985 -2.74397421 -0.38496062]
 [ 7.76548433  5.1121459   4.67237711  6.36679459  6.05739927  4.60316944
   8.2453928   7.19456291  5.27214289  7.43516541]
 [ 7.34705257  5.65888405  7.7230978   6.49009705  4.90531445  4.25577974
   6.39395571  6.98866224  4.7058506   6.14626217]
 [ 3.15320134  2.19644308  2.78754592  1.394

memory=
 [[ -0.19695273  -1.31925404   1.76827443  -1.56902218   0.58621329
   -1.76562679   0.1365654    0.29833612  -2.42587185  -1.03654504]
 [  9.94681549   6.751441     6.99892902   7.8945303    8.07913113
    6.60175467  10.33503437   8.92747593   7.05576897   8.48073292]
 [  8.3850832    6.3662672    9.0163517    7.06058359   5.99990273
    5.28341436   7.35044813   7.86099863   5.52921534   6.43428898]
 [  5.55430794   3.98695683   5.40788412   2.97508979   7.16089439
    4.90641975   7.78321171   6.52419519   3.43940878   6.03893948]]
Training set BPC at step 8700: 1.21063 learning rate: 0.090000
memory=
 [[ -0.31517255  -1.33992159   1.69209814  -1.72289538   0.44168884
   -1.6601094    0.23630601   0.33053434  -2.48439431  -1.1378746 ]
 [ 10.00493145   6.76539278   7.02415848   7.86529636   8.04575062
    6.71139431  10.60994244   9.01429176   7.06646585   8.48649025]
 [  8.39905643   6.30516148   8.95051289   6.9801178    5.88749743
    5.27242851   7.48066807   7.84952831 