In [1]:
from __future__ import print_function
import os

import numpy as np
import zipfile
import tarfile
from six.moves.urllib.request import urlretrieve
import shutil 
import random

import string
import tensorflow as tf

# Dirs - must be absolute paths!
LOG_DIR = '/tmp/tf/ptb_char_lstm_mann_lrua_shared_memory/h16b1s1lru3_no_trunk/'
# Local dir where PTB files will be stored.
PTB_DIR = '/home/tkornuta/data/ptb/'

# Filenames.
TRAIN = "ptb.train.txt"
VALID = "ptb.valid.txt"
TEST = "ptb.test.txt"

# Size of the hidden state
HIDDEN_SIZE = 5

# Size of batch
BATCH_SIZE = 1

# Length of sequence (=  number of units of controller (recurrent layer))
SEQ_LENGTH = 1

#### MANN-related parameters.
# Size of the local memory of each cell.
MEMORY_SLOTS = 10

# Number of smallest elements - used in LRUA scheme.
N_SMALLEST = 1

# "Update weight decay".
GAMMA = 0.95

# Eps for normalization in visualization
EPS = 1e-10

In [2]:
def maybe_download_ptb(path, 
                       filename='simple-examples.tgz', 
                       url='http://www.fit.vutbr.cz/~imikolov/rnnlm/', 
                       expected_bytes =34869662):
  # Eventually create the PTB dir.
  if not tf.gfile.Exists(path):
    tf.gfile.MakeDirs(path)
  """Download a file if not present, and make sure it's the right size."""
  _filename = path+filename
  if not os.path.exists(_filename):
    print('Downloading %s...' % filename)
    _filename, _ = urlretrieve(url+filename, _filename)
  statinfo = os.stat(_filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified', (_filename), '(', statinfo.st_size, ')')
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + _filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download_ptb(PTB_DIR)

Found and verified /home/tkornuta/data/ptb/simple-examples.tgz ( 34869662 )


### Check/maybe download PTB.

### Extract dataset-related files from the PTB archive.

In [3]:
def extract_ptb(path, filename='simple-examples.tgz', files=["ptb.train.txt", "ptb.valid.txt", "ptb.test.txt", 
                                       "ptb.char.train.txt", "ptb.char.valid.txt", "ptb.char.test.txt"]):
    """Extracts files from PTB archive."""
    # Extract
    tar = tarfile.open(path+filename)
    tar.extractall(path)
    tar.close()
    # Copy files
    for file in files:
        shutil.copyfile(PTB_DIR+"simple-examples/data/"+file, PTB_DIR+file)
    # Delete directory
    shutil.rmtree(PTB_DIR+"simple-examples/")        

extract_ptb(PTB_DIR)

### Load train, valid and test texts.

In [4]:
def read_data(filename, path):
    with open(path+filename, 'r') as myfile:
        data=myfile.read()# .replace('\n', '')
        return data

train_text = read_data(TRAIN, PTB_DIR)
train_size=len(train_text)
print(train_size, train_text[:100])

valid_text = read_data(VALID, PTB_DIR)
valid_size=len(valid_text)
print(valid_size, valid_text[:64])

test_text = read_data(TEST, PTB_DIR)
test_size=len(test_text)
print(test_size, test_text[:64])

5101618  aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memote
399782  consumers may want to move their telephones a little closer to 
449945  no it was n't black monday 
 but while the new york stock excha


### Utility functions to map characters to vocabulary IDs and back.

In [5]:
vocabulary_size = 59 # [A-Z] + [a-z] + ' ' +few 'in between; + punctuation
first_letter = ord(string.ascii_uppercase[0]) # ascii_uppercase before lowercase! 
print("vocabulary size = ", vocabulary_size)
print(first_letter)

def char2id(char):
  """ Converts char to id (int) with one-hot encoding handling of unexpected characters"""
  if char in string.ascii_letters:# or char in string.punctuation or char in string.digits:
    return ord(char) - first_letter + 1
  elif char == ' ':
    return 0
  else:
    # print('Unexpected character: %s' % char)
    return 0
  
def id2char(dictid):
  """ Converts single id (int) to character"""
  if dictid > 0:
    return chr(dictid + first_letter - 1)
  else:
    return ' '

def characters(probabilities):
  """Turn a 1-hot encoding or a probability distribution over the possible
  characters back into its (most likely) character representation."""
  return [id2char(c) for c in np.argmax(probabilities, 1)]

def batches2string(batches):
  """Convert a sequence of batches back into their (most likely) string
  representation."""
  s = [''] * batches[0].shape[0]
  for b in batches:
    s = [''.join(x) for x in zip(s, characters(b))]
  return s

#print(len(string.punctuation))
#for i in string.ascii_letters:
#    print (i, char2id(i))


print(char2id('a'), char2id('A'), char2id('z'), char2id('Z'), char2id(' '), char2id('ï'))
print(id2char(char2id('a')), id2char(char2id('A')))
#print(id2char(65), id2char(33), id2char(90), id2char(58), id2char(0))
#bankno
sample = np.zeros(shape=(1, vocabulary_size), dtype=np.float)
sample[0, char2id(' ')] = 1.0
print(sample)

vocabulary size =  59
65
33 1 58 26 0 0
a A
[[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.]]


### Helper class for batch generation

In [6]:
class BatchGenerator(object):
  def __init__(self, text, batch_size, seq_length, vocab_size):
    """
    Initializes the batch generator object. Stores the variables and first "letter batch".
    text is text to be processed
    batch_size is size of batch (number of samples)
    seq_length represents the length of sequence
    vocab_size is number of words in vocabulary (assumes one-hot encoding)
    """
    # Store input parameters.
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._seq_length = seq_length
    self._vocab_size = vocab_size
    # Divide text into segments depending on number of batches, each segment determines a cursor position for a batch.
    segment = self._text_size // batch_size
    # Set initial cursor position.
    self._cursor = [ offset * segment for offset in range(batch_size)]
    # Store first "letter batch".
    self._last_letter_batch = self._next_letter_batch()
  
  def _next_letter_batch(self):
    """
    Returns a batch containing of encoded single letters depending on the current batch 
    cursor positions in the data.
    Returned "letter batch" is of size batch_size x vocab_size
    """
    letter_batch = np.zeros(shape=(self._batch_size, self._vocab_size), dtype=np.float)
    # Iterate through "samples"
    for b in range(self._batch_size):
      # Set 1 in position pointed out by one-hot char encoding.
      letter_batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return letter_batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    # First add last letter from previous batch (the "additional one").
    batches = [self._last_letter_batch]
    for step in range(self._seq_length):
      batches.append(self._next_letter_batch())
    # Store last "letter batch" for next batch.
    self._last_letter_batch = batches[-1]
    return batches


In [7]:
# Trick - override first 10 chars
#list1 = list(train_text)
#for i in range(2):
#    list1[i] = 'z'
#train_text = ''.join(list1)
#print("Train set =", train_text[0:100])

# Create objects for training, validation and testing batch generation.
train_batches = BatchGenerator(train_text, BATCH_SIZE, SEQ_LENGTH, vocabulary_size)

# Get first training batch.
batch = train_batches.next()
print(len(batch))
print(batch[0].shape)
#print("Batch = ", batch)
#print(batches2string(batch))
#print("batch len = num of enrollings",len(batch))
#for i in range(num_unrollings):
#    print("i = ", i, "letter=", batches2string(batch)[0][i][0], "bits = ", batch[i][0])


# For validation  - process the whole text as one big batch.
VALID_BATCH_SIZE = int(np.floor(valid_size/SEQ_LENGTH))
valid_batches = BatchGenerator(valid_text, VALID_BATCH_SIZE, SEQ_LENGTH, vocabulary_size)
valid_batch = valid_batches.next()
#print (VALID_BATCH_SIZE)
#print(len(valid_batch))
#print(valid_batch[0].shape)

# For texting  - process the whole text as one big batch.
TEST_BATCH_SIZE = int(np.floor(test_size/SEQ_LENGTH))
test_batches = BatchGenerator(test_text, TEST_BATCH_SIZE, SEQ_LENGTH, vocabulary_size)
# Get single batch! 
#test_batch = test_batches.next()


2
(1, 59)


In [8]:
# Definition of the cell computation.
def controller_cell(input_, # input x
                    memory_input_, # read vector from the memory returned by previous cell
                    prev_output_, # output of the previous cell
                    prev_cell_state_, # previous cell state
                    prev_read_weights_batch_, # read weights from previous time state (t-1) 
                    prev_update_weights_batch_, # update weights from previous time state (t-1)
                    name_):
    """Create a controller with local memory cell"""
    """First dimensions of each of the computational nodes below is "derrived" from BATCH_SIZE"""
    with tf.name_scope(name_):

        with tf.name_scope("LSTM"):
            # LSTM cell equations according to Christopher Olah blog.
            # colah.github.io/posts/2015-08-Understanding-LSTMs/
            # Concatenate intyp x with h_prev ("prev output") TODO: and memory.
            i_h_m = tf.concat([input_, prev_output_, memory_input_], 1)

            # Calculate forget, input and output gate activations.
            forget_gate = tf.sigmoid(tf.matmul(i_h_m, Wf) + bf, name="Forget_gate")
            input_gate = tf.sigmoid(tf.matmul(i_h_m, Wi) + bi, name="Input_gate")
            output_gate = tf.sigmoid(tf.matmul(i_h_m, Wo) + bo, name="Output_gate")

            # Update of the cell state C~.
            cell_update = tf.tanh(tf.matmul(i_h_m, Wc) + bc, name="Cell_update")
            # New cell state C.
            cell_state = tf.add(forget_gate * prev_cell_state_, input_gate * cell_update, name = "Cell_state")
            # Calculate h - "output".
            cell_output = output_gate * tf.tanh(cell_state)
            
        with tf.name_scope("Keys"):
            # Calculate keys - read and add.
            k_t = tf.tanh(tf.matmul(cell_output, W_key) + b_key) # (batch_size, nb_reads, memory_size[1])
            #a_t = tf.tanh(tf.matmul(cell_output, W_add) + b_add) # (batch_size, nb_reads, memory_size[1])
            alpha = tf.sigmoid(tf.matmul(cell_output, W_alpha) + b_alpha) # (batch_size, nb_reads, 1)

            # Add histograms to TensorBoard.
            k_t_hist = tf.summary.histogram("k_t", k_t)
            #a_t_hist = tf.summary.histogram("a_t", a_t)
            
        # Read from the memory.
        with tf.name_scope("Read_head"):
            
            # Normalize batch.
            norm_batch = tf.nn.l2_normalize(k_t,1)
            # Normalize memory.
            norm_memory = tf.nn.l2_normalize(memory,0)
            # Calculate cosine similarity.
            similarity_batch = tf.matmul(norm_batch, norm_memory)
            # Calculate read weights based on similarity.
            read_weights_batch = tf.nn.softmax(similarity_batch)

            # Add to list returned as "previous read weights".
            read_weights_seq_batch.append(read_weights_batch)
            
            # Add histograms to TensorBoard.
            norm_batch_hist = tf.summary.histogram("norm_batch", norm_batch)
            norm_batch_hist = tf.summary.histogram("norm_batch", norm_batch)
            norm_memory_hist = tf.summary.histogram("norm_memory", norm_memory)
            similarity_batch_hist = tf.summary.histogram("cosine_similarity_batch", similarity_batch)
            read_weights_batch_hist = tf.summary.histogram("read_weights_batch", read_weights_batch)
        
        
        with tf.name_scope("Memory_output"):
            # Calcualte read vector.
            memory_output_batch = tf.tensordot(read_weights_batch, tf.transpose(memory), axes=1, name="Memory_output_batch_r")   
            # Add histograms to TensorBoard.
            memory_output_batch_hist = tf.summary.histogram("memory_output_batch", memory_output_batch)

        with tf.name_scope("Write_head"):
            # "Truncation scheme to update the least-used positions".
            # First, find (size-n) top elements (in each "batch sample"/head separatelly).
            top = tf.nn.top_k(-prev_update_weights_batch_, N_SMALLEST)
            # To get boolean True/False values, you can first get the k-th value and then use tf.greater_equal:
            kth = tf.reduce_min(top.values, axis=1, keep_dims=True)
            top2 = tf.greater_equal(-prev_update_weights_batch_, kth)
            # And finally - cast it to n smallest elements.
            prev_smallest_lru_weights = tf.cast(top2, tf.float32)

            #write_weights_seq_batch.append(prev_smallest_lru_weights)
            write_weights_batch = tf.add(tf.sigmoid(alpha) * prev_read_weights_batch_,
                                   (1.0 - tf.sigmoid(alpha)) * prev_smallest_lru_weights,
                                   name="Write_weights_ww")

            # Add histograms to TensorBoard.
            smallest_lru_weight_batch_hist = tf.summary.histogram("smallest_lru_weight_batch", prev_smallest_lru_weights)
            write_weights_batch_hist = tf.summary.histogram("write_weights_batch", write_weights_batch)

        with tf.name_scope("Update_head"):
            # This relies on prev. weights and will be used in fact in the NEXT step.
            update_weights_batch = tf.add(GAMMA * prev_update_weights_batch_,
                                            read_weights_batch + write_weights_batch,
                                            name="Update_weights_uw")
            
            # Add to list returned as "previous update weights".
            update_weights_seq_batch.append(update_weights_batch)
            # Add histograms to TensorBoard.
            update_weights_batch_hist = tf.summary.histogram("update_weights_batch", update_weights_batch)

            
    with tf.name_scope("Memory_update"):
        # Perform single update for each sequence/batch.
        memory_update_batch = tf.tensordot(tf.transpose(k_t), write_weights_batch, axes=1)
        # Add dependendency control - first prediction?
        #with tf.control_dependencies([prediction_batch]):
        # Update the memory
        memory_update_op = memory.assign(memory + memory_update_batch)
        
        # Create hot-cold visualization of memory (red=positive/blue=negative)
        zeros = tf.zeros([HIDDEN_SIZE, MEMORY_SLOTS])
        # Get negative values only.
        neg = tf.less(memory, zeros)
        blue = tf.multiply(tf.cast(neg, tf.float32), memory)
        min_blue = tf.reduce_min(memory, axis=0) + EPS
        norm_blue = 255.0 * blue/min_blue
        # Get positive values only.
        pos = tf.greater(memory, zeros)
        red = tf.multiply(tf.cast(pos, tf.float32), memory)
        max_red = tf.reduce_max(memory, axis=0) + EPS
        norm_red = 255.0 * red/max_red
        # Stack them into three channel image with hot-cold values.
        rgb_memory = tf.stack([norm_red, zeros, norm_blue], axis=2)
        rgb_memory_reshaped = tf.reshape(rgb_memory, [1, HIDDEN_SIZE, MEMORY_SLOTS, 3])
        
        # Memory "truncation".
        #memory_trunk_op = memory.assign(tf.tanh(memory))
        #memory_trunk_vis = tf.reshape(memory, [1,HIDDEN_SIZE, MEMORY_SLOTS,1])

        # Add histograms to TensorBoard.
        memory_update_batch_hist = tf.summary.histogram("memory_update_batch", memory_update_batch)
        memory_hist = tf.summary.histogram("memory_before_truncation", memory_update_op)
        #memory_trunk_hist = tf.summary.histogram("memory_after_truncation", memory_trunk_op)
        # Visualize memory as image.
        memory_updated_img = tf.summary.image("memory_before_truncation", rgb_memory_reshaped)
        #memory_trunk_img = tf.summary.image("memory_after_truncation", memory_trunk_vis)

        
    return memory_output_batch, cell_output, cell_state

print("Cell definition OK")

Cell definition OK


###  Definition of tensor graph

In [9]:
# Reset graph - just in case.
tf.reset_default_graph()

# Memory.
memory = tf.Variable(tf.truncated_normal(shape=[HIDDEN_SIZE, MEMORY_SLOTS]), trainable=False, name="Memory_M")
# Latest vs LRU ratio.
#alpha = tf.Variable(tf.truncated_normal(shape=[1]), name="Alpha")

# 0. Previous variables.
with tf.name_scope("Previous_variables"):
    # Create "read vectors" (in fact batch).
    read_vectors_seq_batch = list()    
    for i_seq in range(SEQ_LENGTH):
        read_vectors_seq_batch.append(tf.placeholder(tf.float32, shape=[None, HIDDEN_SIZE], name="Read_vector_r"))    

    # Placeholders for previous weights.
    prev_read_weights_seq_batch = list()    
    prev_update_weights_seq_batch = list()    
    for i_seq in range(SEQ_LENGTH):
        prev_read_weights_seq_batch.append(tf.placeholder(tf.float32, shape=[None, MEMORY_SLOTS], name="Prev_rw"))
        prev_update_weights_seq_batch.append(tf.placeholder(tf.float32, shape=[None, MEMORY_SLOTS], name="Prev_uw"))

# 1. Placeholders for inputs.
with tf.name_scope("Input_data"):
    # Define input data buffers.
    data_buffers = list()
    for _ in range(SEQ_LENGTH + 1):
        # Collect placeholders for inputs/labels: Batch x Vocab size.
        data_buffers.append(tf.placeholder(tf.float32, shape=[None, vocabulary_size], name="data_buffers"))
    print ("data_buffers shape =", data_buffers[0].shape)

    # Sequence of batches.
    input_seq_batch = data_buffers[:SEQ_LENGTH]
    print ("Seq length  =", len(input_seq_batch))
    print ("Batch shape =", input_seq_batch[0].shape)

    # Labels are pointing to the same placeholders!
    # Labels are inputs shifted by one time step.
    labels_seq_batch = data_buffers[1:]  
    # Concatenate targets into 2D tensor.
    target_batch = tf.concat(labels_seq_batch, 0)

    # Add histograms to TensorBoard.
    input_seq_batch_hist = tf.summary.histogram("input_seq_batch", input_seq_batch)

# 2. Unrolled controller ops.
with tf.name_scope("Controller"):
    # Define parameters:
    # Input gate: input, previous output, and bias.
    Wf = tf.Variable(tf.truncated_normal([vocabulary_size+2*HIDDEN_SIZE, HIDDEN_SIZE], -0.1, 0.1), name="Wf")
    bf = tf.Variable(tf.zeros([1, HIDDEN_SIZE]), name="bf")

    # Forget gate: input, previous output, and bias.
    Wi = tf.Variable(tf.truncated_normal([vocabulary_size+2*HIDDEN_SIZE,HIDDEN_SIZE], -0.1, 0.1), name="Wi")
    bi = tf.Variable(tf.zeros([1, HIDDEN_SIZE]), name="bi")

    # Memory cell: input, state and bias.                             
    Wc = tf.Variable(tf.truncated_normal([vocabulary_size+2*HIDDEN_SIZE, HIDDEN_SIZE], -0.1, 0.1), name="Wc")
    bc = tf.Variable(tf.zeros([1, HIDDEN_SIZE]), name="bc")

    # Output gate: input, previous output, and bias.
    Wo = tf.Variable(tf.truncated_normal([vocabulary_size+2*HIDDEN_SIZE, HIDDEN_SIZE], -0.1, 0.1), name="Wo")
    bo = tf.Variable(tf.zeros([1, HIDDEN_SIZE]), name="bo")

    # Read key.
    W_key = tf.Variable(tf.truncated_normal([HIDDEN_SIZE, HIDDEN_SIZE], -0.1, 0.1), name="W_key")
    b_key = tf.Variable(tf.zeros([1, HIDDEN_SIZE]), name="b_key")
    
    # Add key.
    #W_add = tf.Variable(tf.truncated_normal([HIDDEN_SIZE, HIDDEN_SIZE], -0.1, 0.1), name="W_add")
    #b_add = tf.Variable(tf.zeros([1, HIDDEN_SIZE]), name="b_add")
    
    # Alpha - used in Latest vs LRU ratio.
    W_alpha = tf.Variable(tf.truncated_normal([HIDDEN_SIZE, 1], -0.1, 0.1), name="W_alpha")
    b_alpha = tf.Variable(tf.zeros([1, 1]), name="b_alpha")
    
    # Placeholders for "zero" (the oldest) state and output: Batch x Hidden size.
    init_controller_output = tf.placeholder(tf.float32, shape=[None, HIDDEN_SIZE], name="init_controller_output")
    init_controller_state = tf.placeholder(tf.float32, shape=[None, HIDDEN_SIZE], name="init_controller_state")
    # Placeholder for "zero" memory read: Batch X Hidden (TODO: memory?) size.
    init_memory_output = tf.placeholder(tf.float32, shape=[None, HIDDEN_SIZE], name="init_memory_read")

    # Unrolled LSTM.
    # Build outpus of size SEQ_LENGTH.
    controller_outputs_batch_seq = list()
    memory_outputs_batch_seq = list()
    # Two lists that will be "returned" and later passed as previous states. 
    read_weights_seq_batch = list()  
    update_weights_seq_batch = list()  
    
    # "Link" oldest statte and output to placeholders.
    controller_output = init_controller_output
    controller_state = init_controller_state
    memory_output = init_memory_output
    # For every buffer in input sequence batch buffers...
    for i in range(SEQ_LENGTH):
        # ... add cell...     
        memory_output, controller_output, controller_state = controller_cell(
            input_seq_batch[i], 
            memory_output, 
            controller_output, 
            controller_state, 
            prev_read_weights_seq_batch[i],
            prev_update_weights_seq_batch[i],
            "cell_"+str(i))
        # .. add controller buffer to outputs...
        controller_outputs_batch_seq.append(controller_output)
        memory_outputs_batch_seq.append(memory_output)
        # .. and set memory input of (i+1) cell to i-th read vector buffer.
        #memory_input = read_vectors_seq_batch[i]
        
    # Add histograms to TensorBoard.
    controller_outputs_batch_seq_hist = tf.summary.histogram("controller_outputs_batch_seq", controller_outputs_batch_seq)
    memory_outputs_batch_seq_hist = tf.summary.histogram("memory_outputs_batch_seq", memory_outputs_batch_seq)
    memory_hist = tf.summary.histogram("memory", memory)
    read_weights_seq_batch_hist = tf.summary.histogram("read_weights_seq_batch", read_weights_seq_batch)
    update_weights_seq_batch_hist = tf.summary.histogram("update_weights_seq_batch", update_weights_seq_batch)

# 3. Output ops.
with tf.name_scope("Output"):
    # Concatenate controller hidden state with the read vector.
    coutput_rvector_seq_batch = list()    
    for i_seq in range(SEQ_LENGTH):
        coutput_rvector_seq_batch.append(tf.concat([controller_outputs_batch_seq[i_seq], 
                                                 memory_outputs_batch_seq[i_seq]], 1, name="Concat_coutput_rvector"))    
    # Add histograms to TensorBoard.
    coutput_rvector_seq_batch_hist = tf.summary.histogram("coutput_rvector_seq_batch", coutput_rvector_seq_batch)

    output_batch = tf.concat(controller_outputs_batch_seq, 0) 
    #output_batch = tf.concat(controller_outputs_batch_seq, 0)
 
    # Output layer weights and biases.
    w = tf.Variable(tf.truncated_normal([HIDDEN_SIZE, vocabulary_size], -0.1, 0.1), name="w")
    b = tf.Variable(tf.zeros([vocabulary_size]), name="b")

    # Logits.
    logits_batch = tf.nn.xw_plus_b(output_batch, w, b, name = "Final_FC")
    # Add fully connected softmax layer on top - predictions.
    prediction_batch = tf.nn.softmax(logits_batch)
    
# 4. Loss ops.
with tf.name_scope("Loss"):
    # Loss function(s) - one for every output generated by every LSTM cell.
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=target_batch, logits=logits_batch))
    # Add loss summary.
    loss_summary = tf.summary.scalar("loss", loss)

# 5. Training ops.  
with tf.name_scope("Optimization"):
    # Learning rate decay.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(0.1, global_step, 5000, 0.9, staircase=True)
    # Optimizer.
    optimizer = tf.train.AdamOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    # Gradient clipping.
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)

# Merge all summaries.
merged_summaries = tf.summary.merge_all()

print("Graph definition OK")

data_buffers shape = (?, 59)
Seq length  = 1
Batch shape = (?, 59)
Graph definition OK


In [10]:
def create_feed_dict(set_type_):
    """Creates a dictionaries for different sets: maps data onto Tensor placeholders."""
    feed_dict = dict()
    
    #if set_type_=="train":
    # Get next batch and create a feed dict.
    next_batch = train_batches.next()
    # Feed batch to input buffers.
    for i in range(SEQ_LENGTH + 1):
        feed_dict[data_buffers[i]] = next_batch[i]

    # Set previous weights of read and write heades.
    for i in range(SEQ_LENGTH):
        feed_dict[prev_read_weights_seq_batch[i]] = prev_rw_seq_batch[i]
        feed_dict[prev_update_weights_seq_batch[i]] = prev_uw_seq_batch[i]

    # Reset "init" state and output of controller.
    feed_dict[init_controller_output] = np.zeros([BATCH_SIZE, HIDDEN_SIZE])
    feed_dict[init_controller_state] = np.zeros([BATCH_SIZE, HIDDEN_SIZE])
    feed_dict[init_memory_output] = np.zeros([BATCH_SIZE, HIDDEN_SIZE])
            
    #elif set_type_=="valid":
    #    for i in range(SEQ_LENGTH + 1):
    #        feed_dict[data_buffers[i]] = valid_batch[i]

    # TODO: HOW TO VALIDATE !! when update/write depends on the previous batch??
    
    #else: # test
    #    for i in range(SEQ_LENGTH + 1):
    #        feed_dict[data_buffers[i]] = test_batch[i]
        
       
    return feed_dict

print("Feed_dict definition OK")

Feed_dict definition OK


### Session execution

In [11]:
# Eventually clear the log dir.
if tf.gfile.Exists(LOG_DIR):
  tf.gfile.DeleteRecursively(LOG_DIR)
# Create (new) log dir.
tf.gfile.MakeDirs(LOG_DIR)

print("Log dir CLEARED")

Log dir CLEARED


In [14]:
# How often the test loss on validation batch will be computed. 
summary_frequency = 100

# Create session to execute graph.
sess=tf.InteractiveSession()

# Create summary writers, point them to LOG_DIR.
train_writer = tf.summary.FileWriter(LOG_DIR + '/train', sess.graph)
valid_writer = tf.summary.FileWriter(LOG_DIR + '/valid')
test_writer = tf.summary.FileWriter(LOG_DIR + '/test')

# Initialize global variables.
tf.global_variables_initializer().run()
print('Variables initialized')


# Create initial previous read and update - full of zeros. 
prev_rw_seq_batch = list()
prev_uw_seq_batch = list()
for i in range(SEQ_LENGTH):
    prev_rw_seq_batch.append(np.zeros([BATCH_SIZE, MEMORY_SLOTS]))
    prev_uw_seq_batch.append(np.zeros([BATCH_SIZE, MEMORY_SLOTS]))

num_steps = 10000 #train_size // (BATCH_SIZE*SEQ_LENGTH) #70001
print("Number of iterations per epoch =", num_steps)
for step in range(num_steps):
    input_seq_batch_, memory_, prev_rw_seq_batch, prev_uw_seq_batch, summaries, _, loss_, lr_ = sess.run([
        input_seq_batch, memory, read_weights_seq_batch, update_weights_seq_batch, merged_summaries, optimizer, loss, learning_rate],
        feed_dict=create_feed_dict("train"))#batch_seq))
    
    # Add summary.
    train_writer.add_summary(summaries, step*BATCH_SIZE*SEQ_LENGTH)
    train_writer.flush()

    # Every (100) steps collect statistics.
    if step % summary_frequency == 0:
        print("memory=\n", memory_)
        # Print loss from last batch.
        print('Training set BPC at step %d: %0.5f learning rate: %f' % (step, loss_, lr_))
    
        # Validation set BPC.
        #v_summaries, v_loss = sess.run([merged_summaries, loss], feed_dict=create_feed_dict("valid"))
        #print("Validation set BPC: %.5f" % v_loss)
        #valid_writer.add_summary(v_summaries, step*BATCH_SIZE*SEQ_LENGTH)
        #valid_writer.flush()
    # End of statistics collection

# Test set BPC.
#print("Calculating BPC on test dataset")
#t_summary, t_loss = sess.run([merged_summaries, loss], feed_dict=create_feed_dict("test"))
#print("Final test set BPC: %.5f" % t_loss)
#test_writer.add_summary(t_summary, step*BATCH_SIZE*SEQ_LENGTH)
#test_writer.flush()
    
# Close writers and session.
train_writer.close()
valid_writer.close()
test_writer.close()
sess.close() 

Variables initialized
Number of iterations per epoch = 10000
memory=
 [[-1.62183619  0.42470118  1.28924441  0.60822302 -0.61682415  1.00247824
   1.41308975  1.18546331 -1.21948457  0.37774783]
 [-1.64181793  0.58796543 -1.28425908 -0.30923542 -0.5706014  -0.61637932
  -1.17260826  0.05734515 -1.00058949 -0.44328439]
 [ 1.12578321 -0.75492275  0.74851525 -0.60906279  0.21810517  1.17030799
   1.81300998 -0.15758897  0.44733503  1.12421823]
 [ 0.39198929 -0.11256467 -1.69561434 -0.94239986  0.7355327   1.72307825
   0.78167647  0.50731486 -1.34242415  0.21491817]
 [ 0.04932863  0.47347218  1.62203944  0.48852581  1.74889898  0.12459899
   0.18837456  1.44944644 -1.38566017  0.16286469]]
Training set BPC at step 0: 4.07823 learning rate: 0.100000
memory=
 [[-1.35506594  0.7182405   1.54528081  0.96097815 -0.33629978  1.31876838
   1.65656352  1.41641772 -0.79469842  0.58067894]
 [-1.32378888  0.92816538 -0.98989129  0.13525626 -0.22159512 -0.25117764
  -0.91565925  0.31586424 -0.4839370

memory=
 [[ 1.09771335  3.04433751  3.8560462   3.22221375  2.40452361  3.71601915
   3.67833829  3.58117366  1.90522218  2.66240573]
 [ 1.10164869  3.11363459  0.9977805   2.1768744   2.2973597   2.02153397
   1.10673273  2.31036711  2.24496865  1.63594675]
 [ 3.22156882  1.17223573  2.60433745  1.38089824  2.4707408   3.22376728
   3.58202958  1.6675272   2.83428359  2.77984142]
 [ 2.36404228  1.48790038 -0.36651006  0.70672613  2.53232431  3.50488329
   2.39891362  2.00334835  0.83122575  1.54088271]
 [ 2.29707384  2.72104192  3.71139336  2.5600059   4.25015163  2.3493607
   2.0031395   3.30502009  1.3724035   2.03800702]]
Training set BPC at step 1200: 2.67206 learning rate: 0.100000
memory=
 [[ 1.2141124   3.25680804  4.0947485   3.44735789  2.69498539  3.82311821
   3.92611933  3.8811183   2.24932551  2.81708026]
 [ 1.23357499  3.31921411  1.33971274  2.43048811  2.66562057  2.09563041
   1.38388753  2.74996853  2.64566398  1.79705405]
 [ 3.31968117  1.35064733  2.82487059  1.571

memory=
 [[ 4.06503963  6.99123001  7.565063    6.78344345  5.86716986  7.03643942
   8.06806564  7.30474472  5.6587348   6.66304541]
 [ 3.60838842  6.84423256  4.45565939  5.38105679  5.53064299  5.19225216
   5.22652864  6.02880812  5.63035631  5.20755291]
 [ 5.37332344  4.17894506  5.3120923   3.95059299  5.01469278  5.67857838
   6.81127262  4.40537262  5.60895967  5.60283375]
 [ 3.82164073  3.82731986  1.71903193  2.48457241  4.45655775  5.563025
   4.92728233  4.42919874  2.77335763  3.49830151]
 [ 4.51210928  5.82216549  6.81509399  5.48796701  7.10969734  5.11364126
   5.66269016  6.62783194  4.33440781  5.43327379]]
Training set BPC at step 2400: 2.89620 learning rate: 0.100000
memory=
 [[ 4.17000389  7.19363165  7.78436756  7.01193666  6.15983915  7.28636503
   8.35802174  7.47700071  5.81656885  6.85704136]
 [ 3.58885336  7.00581455  4.59868574  5.61039972  5.72515631  5.35278749
   5.44957066  6.15830135  5.73863268  5.34725618]
 [ 5.41392517  4.29689693  5.45523024  4.1162

memory=
 [[  7.26798391   9.77956772  10.96530247   9.91740227   9.59500408
   10.38308716  11.15550232   9.83610821   8.19562721  10.00970554]
 [  6.13023853   8.82290554   7.15252256   7.8274622    8.51410198
    7.72100592   7.62263203   7.83309555   7.42203283   7.91787863]
 [  7.39675617   5.90648222   7.4889493    5.91104364   7.34181929
    7.80556297   8.80628014   6.06184483   7.18839407   7.77433634]
 [  5.15833855   4.68597651   3.01557541   3.77133679   6.00528097
    6.80600309   6.24935865   5.31759596   3.60367465   5.05012083]
 [  7.29812384   8.09623241   9.77022648   8.22181034  10.47718811
    7.89869022   8.20538616   8.50402546   6.37282658   8.31025028]]
Training set BPC at step 3600: 0.68163 learning rate: 0.100000
memory=
 [[  7.50505114  10.17921066  11.33293343  10.15253448  10.06380367
   10.80856991  11.39659786  10.16469383   8.48966599  10.32464695]
 [  6.23485422   9.15652847   7.47066212   7.98714828   8.85101795
    8.02707863   7.75609398   8.07607079 

memory=
 [[ 10.02234936  12.56306267  13.62746239  13.36639023  12.72086906
   12.96455574  13.79468822  12.55919456  10.59315491  12.44140053]
 [  8.10723305  10.7226181    9.04156685  10.37512398  10.67312908
    9.50917339   9.35860538   9.522645     8.92144489   9.4658041 ]
 [  9.14197922   7.59300041   9.12554836   8.06659889   9.23726177
    9.42643833  10.35037041   7.67349434   8.64880753   9.23318005]
 [  6.03220081   5.52574158   3.7917459    4.9460721    6.81364441
    7.56977129   6.87896824   5.90557766   4.07144928   5.55869961]
 [  9.48028851  10.32489014  11.95797539  11.08397293  13.04496002
    9.89501476  10.38117504  10.59207916   8.15847874  10.19233704]]
Training set BPC at step 4800: 1.39553 learning rate: 0.100000
memory=
 [[ 10.36020279  12.91185951  14.07751179  13.87800407  13.03811169
   13.42077732  14.20510769  12.94034481  10.82588196  12.85246181]
 [  8.35806561  10.94859791   9.40243244  10.87587929  10.92146969
    9.91424847   9.64872265   9.80268574 

memory=
 [[ 13.43429661  16.63531685  16.79667854  17.04382706  16.57407379
   16.00850487  16.65687943  15.82613754  14.28365707  15.9374485 ]
 [ 10.16023445  13.65401077  11.16518879  13.22885418  13.37457371
   11.16219902  10.93703556  11.47299194  11.31969547  11.76667595]
 [ 11.07327271  10.05827332  10.98454762  10.40530014  11.51326466
   11.1073494   11.95673656   9.50816822  10.81783867  11.29931927]
 [  6.38821888   6.50654268   4.44223213   6.26041222   7.60672665
    7.71250963   7.04549885   6.33283091   4.71797276   6.25404787]
 [ 12.28939152  13.8702774   14.64535904  14.23839855  16.43720055
   12.28985977  12.60051727  13.27252769  11.21221066  13.08101177]]
Training set BPC at step 6000: 3.33460 learning rate: 0.090000
memory=
 [[ 13.66508293  16.79536247  17.1636219   17.11199951  16.75824547
   16.21742439  16.95121193  15.96338654  14.35857201  16.17629242]
 [ 10.33539677  13.76104832  11.4366169   13.18825817  13.50144291
   11.33118725  11.28138447  11.5479269  

memory=
 [[ 16.12380028  19.1020813   19.77911377  19.75449181  19.2549324
   18.85455704  19.19846153  18.75199509  17.03657341  18.73190308]
 [ 12.10137177  15.27678299  13.2986002   15.21354389  15.4420166
   13.33500671  12.79788685  13.59050465  13.512043    13.82618332]
 [ 12.85977459  11.73500443  12.94261456  12.1600666   13.33222866
   13.01893234  13.63807392  11.44608212  12.70606709  13.11136436]
 [  7.16664219   7.100667     5.24259663   7.18993044   8.56725216
    8.7065506    7.82397985   7.15398407   5.77563381   7.13872147]
 [ 14.28773403  15.4599638   16.90650177  16.34461975  18.49421501
   14.48390865  14.52462387  15.48493195  13.34963512  15.29634571]]
Training set BPC at step 7100: 1.07547 learning rate: 0.090000
memory=
 [[ 16.16317749  19.26581955  19.86623573  19.88246727  19.36651039
   19.05417633  19.36450195  18.85756111  17.17725372  18.9296875 ]
 [ 12.03099632  15.46335125  13.31039906  15.34756851  15.54849529
   13.5063982   12.92863083  13.71126652  1

memory=
 [[ 17.76004028  21.337883    21.83579063  21.92007446  21.04336548
   20.6552906   21.45885658  20.98411942  19.01706696  20.63298225]
 [ 12.57814217  16.728899    14.17374134  16.69368362  16.44054222
   14.09382629  14.19851303  14.78962612  14.33572578  14.5413456 ]
 [ 13.74223423  13.1401701   14.18477058  13.56896687  14.45994186
   14.04910374  15.0405407   12.82604313  13.74411774  14.16063499]
 [  6.85835314   7.66890049   5.17513895   7.88003206   8.91070271
    8.7063036    8.34101963   7.39369726   5.74084473   7.02222681]
 [ 15.16257954  17.09135818  18.05630875  17.89224434  19.59581947
   15.56139565  16.13433456  16.90564537  14.71663189  16.42279625]]
Training set BPC at step 8200: 1.81756 learning rate: 0.090000
memory=
 [[ 17.87589073  21.46368027  21.98634911  22.02259445  21.32466888
   20.75629997  21.73462296  21.20092201  19.22961807  20.74131966]
 [ 12.62778282  16.83875084  14.29209995  16.72482109  16.62910843
   14.15124607  14.47942066  14.97465515 

memory=
 [[ 19.98944092  23.90219879  24.93404388  24.90758514  23.80462074
   23.05251312  23.98224449  23.15200233  21.33725357  23.06303406]
 [ 14.11722469  18.68014908  16.75949669  18.86453056  18.34955788
   16.06866264  16.10206413  16.0410347   15.97167587  15.99679661]
 [ 15.24664116  14.94696045  16.47942162  15.58908463  16.31477928
   15.8000946   16.88092232  14.22905827  15.30720329  15.76546288]
 [  7.65221262   8.73490047   6.65351105   8.93478775   9.69991112
    9.91291809   9.38517666   7.78411722   6.55573225   7.48165989]
 [ 16.6615448   18.8841114   20.30265427  20.05385971  21.53364372
   17.2613678   17.77129555  18.24555206  16.34903526  17.95440292]]
Training set BPC at step 9400: 2.51365 learning rate: 0.090000
memory=
 [[ 20.35612869  24.05298042  25.26974678  25.17163849  23.95405197
   23.16744804  24.04198647  23.27064323  21.4329071   23.30231476]
 [ 14.4578476   18.82097054  17.0442791   19.14455032  18.43238831
   16.10301018  16.0748024   16.13743591 