In [1]:
from __future__ import print_function
import os

import numpy as np
import zipfile
import tarfile
from six.moves.urllib.request import urlretrieve
import shutil 
import random

import string
import tensorflow as tf

# Dirs - must be absolute paths!
LOG_DIR = '/tmp/tf/ptb_char_gru_em_address/B2S2_H32_N20A20C8/'
# Local dir where PTB files will be stored.
PTB_DIR = '/home/tkornuta/data/ptb/'

# Filenames.
TRAIN = "ptb.train.txt"
VALID = "ptb.valid.txt"
TEST = "ptb.test.txt"

# Length of sequence (=  number of units of controller (recurrent layer))
SEQ_LENGTH = 2

# Size of batch
BATCH_SIZE = 2

# Length of a vector - a single hidden state
HIDDEN_SIZE = 32

# Size of the input vector - 59 = [A-Z] + [a-z] + ' '
INPUT_SIZE = 59

#### MANN-related parameters.
# Size of the local memory of each cell.
NUMBER_OF_SLOTS = 20

# Length of a single vector - a single memory slot.
ADDRESS_SIZE = NUMBER_OF_SLOTS
CONTENT_SIZE = 8
SLOT_SIZE = ADDRESS_SIZE + CONTENT_SIZE

# Number of smallest elements - used in LRUA scheme.
#N_SMALLEST = 1

# "Update weight decay".
#GAMMA = 1e-4 

# Eps for normalization in visualization
EPS = 1e-10

### Check/maybe download PTB.

In [2]:
def maybe_download_ptb(path, 
                       filename='simple-examples.tgz', 
                       url='http://www.fit.vutbr.cz/~imikolov/rnnlm/', 
                       expected_bytes =34869662):
  # Eventually create the PTB dir.
  if not tf.gfile.Exists(path):
    tf.gfile.MakeDirs(path)
  """Download a file if not present, and make sure it's the right size."""
  _filename = path+filename
  if not os.path.exists(_filename):
    print('Downloading %s...' % filename)
    _filename, _ = urlretrieve(url+filename, _filename)
  statinfo = os.stat(_filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified', (_filename), '(', statinfo.st_size, ')')
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + _filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download_ptb(PTB_DIR)

Found and verified /home/tkornuta/data/ptb/simple-examples.tgz ( 34869662 )


### Extract dataset-related files from the PTB archive.

In [3]:
def extract_ptb(path, filename='simple-examples.tgz', files=["ptb.train.txt", "ptb.valid.txt", "ptb.test.txt", 
                                       "ptb.char.train.txt", "ptb.char.valid.txt", "ptb.char.test.txt"]):
    """Extracts files from PTB archive."""
    # Extract
    tar = tarfile.open(path+filename)
    tar.extractall(path)
    tar.close()
    # Copy files
    for file in files:
        shutil.copyfile(PTB_DIR+"simple-examples/data/"+file, PTB_DIR+file)
    # Delete directory
    shutil.rmtree(PTB_DIR+"simple-examples/")        

extract_ptb(PTB_DIR)

### Load train, valid and test texts.

In [4]:
def read_data(filename, path):
    with open(path+filename, 'r') as myfile:
        data=myfile.read()# .replace('\n', '')
        return data

train_text = read_data(TRAIN, PTB_DIR)
train_size=len(train_text)
print(train_size, train_text[:100])

valid_text = read_data(VALID, PTB_DIR)
valid_size=len(valid_text)
print(valid_size, valid_text[:64])

test_text = read_data(TEST, PTB_DIR)
test_size=len(test_text)
print(test_size, test_text[:64])

5101618  aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memote
399782  consumers may want to move their telephones a little closer to 
449945  no it was n't black monday 
 but while the new york stock excha


### Utility functions to map characters to vocabulary IDs and back.

In [5]:
first_letter = ord(string.ascii_uppercase[0]) # ascii_uppercase before lowercase! 
print("vocabulary size = ", INPUT_SIZE)
print(first_letter)

def char2id(char):
  """ Converts char to id (int) with one-hot encoding handling of unexpected characters"""
  if char in string.ascii_letters:# or char in string.punctuation or char in string.digits:
    return ord(char) - first_letter + 1
  elif char == ' ':
    return 0
  else:
    # print('Unexpected character: %s' % char)
    return 0
  
def id2char(dictid):
  """ Converts single id (int) to character"""
  if dictid > 0:
    return chr(dictid + first_letter - 1)
  else:
    return ' '

def characters(probabilities):
  """Turn a 1-hot encoding or a probability distribution over the possible
  characters back into its (most likely) character representation."""
  return [id2char(c) for c in np.argmax(probabilities, 1)]

def batches2string(batches):
  """Convert a sequence of batches back into their (most likely) string
  representation."""
  s = [''] * batches[0].shape[0]
  for b in batches:
    s = [''.join(x) for x in zip(s, characters(b))]
  return s

#print(len(string.punctuation))
#for i in string.ascii_letters:
#    print (i, char2id(i))


print(char2id('a'), char2id('A'), char2id('z'), char2id('Z'), char2id(' '), char2id('ï'))
print(id2char(char2id('a')), id2char(char2id('A')))
#print(id2char(65), id2char(33), id2char(90), id2char(58), id2char(0))
#bankno
sample = np.zeros(shape=(1, INPUT_SIZE), dtype=np.float)
sample[0, char2id(' ')] = 1.0
print(sample)

vocabulary size =  59
65
33 1 58 26 0 0
a A
[[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.]]


### Helper class for batch generation

In [6]:
class BatchGenerator(object):
  def __init__(self, text, batch_size, seq_length, vocab_size):
    """
    Initializes the batch generator object. Stores the variables and first "letter batch".
    text is text to be processed
    batch_size is size of batch (number of samples)
    seq_length represents the length of sequence
    vocab_size is number of words in vocabulary (assumes one-hot encoding)
    """
    # Store input parameters.
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._seq_length = seq_length
    self._vocab_size = vocab_size
    # Divide text into segments depending on number of batches, each segment determines a cursor position for a batch.
    segment = self._text_size // batch_size
    # Set initial cursor position.
    self._cursor = [ offset * segment for offset in range(batch_size)]
    # Store first "letter batch".
    self._last_letter_batch = self._next_letter_batch()
  
  def _next_letter_batch(self):
    """
    Returns a batch containing of encoded single letters depending on the current batch 
    cursor positions in the data.
    Returned "letter batch" is of size batch_size x vocab_size
    """
    letter_batch = np.zeros(shape=(self._batch_size, self._vocab_size), dtype=np.float)
    # Iterate through "samples"
    for b in range(self._batch_size):
      # Set 1 in position pointed out by one-hot char encoding.
      letter_batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return letter_batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    # First add last letter from previous batch (the "additional one").
    batches = [self._last_letter_batch]
    for step in range(self._seq_length):
      batches.append(self._next_letter_batch())
    # Store last "letter batch" for next batch.
    self._last_letter_batch = batches[-1]
    return batches


In [7]:
# Trick - override first 10 chars
#list1 = list(train_text)
#for i in range(2):
#    list1[i] = 'z'
#train_text = ''.join(list1)
#print("Train set =", train_text[0:100])

# Create objects for training, validation and testing batch generation.
train_batches = BatchGenerator(train_text, BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)

# Get first training batch.
#batch = train_batches.next()
#print(len(batch))
#print(batch[0].shape)
#print("Batch = ", batch)
#print(batches2string(batch))
#print("batch len = num of enrollings",len(batch))
#for i in range(num_unrollings):
#    print("i = ", i, "letter=", batches2string(batch)[0][i][0], "bits = ", batch[i][0])


# For validation  - process the whole text as one big batch.
VALID_BATCH_SIZE = int(np.floor(valid_size/SEQ_LENGTH))
valid_batches = BatchGenerator(valid_text, VALID_BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
# Get a single batch! 
valid_batch = valid_batches.next()

# For texting  - process the whole text as one big batch.
TEST_BATCH_SIZE = int(np.floor(test_size/SEQ_LENGTH))
test_batches = BatchGenerator(test_text, TEST_BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
# Get a single batch! 
test_batch = test_batches.next()


### Helper functions - used during graph definition

In [8]:
# Function adding visualization to a given "matrix" (keys, memory etc.) with additional normalization
def visualize_hot_cold_normalized(matrix_,
                    axis_of_reduction_,
                    name_):
    with tf.name_scope(name_+"_vis_hc_norm"):
        # Create hot-cold visualization (red=positive/blue=negative)
        zeros = tf.zeros_like(matrix_) 
        
        # Get negative values only.
        neg = tf.less(matrix_, zeros)
        blue = tf.multiply(tf.cast(neg, tf.float32), matrix_)
        min_blue = tf.reduce_min(matrix_, axis=axis_of_reduction_, keep_dims=True) + EPS
        norm_blue = 255.0 * blue/min_blue
        
        # Get positive values only.
        pos = tf.greater(matrix_, zeros)
        red = tf.multiply(tf.cast(pos, tf.float32), matrix_)
        max_red = tf.reduce_max(matrix_, axis=axis_of_reduction_, keep_dims=True) + EPS
        norm_red = 255.0 * red/max_red
        
        # Stack them into three channel image with hot-cold values.
        rgb = tf.stack([norm_red, zeros, norm_blue], axis=2)

        #print("name_=",name_)
        #print("np.int32(zeros.shape[1]=", np.int32(zeros.shape[1]))
        # TODO: find fix and get rid of the batch_size(!)
        rgb_reshaped = tf.reshape(rgb, [1, -1,  np.int32(zeros.shape[1]), 3])

        # Visualize read weights as image.
        rgb_reshaped_summary = tf.summary.image(name_+"_visv", rgb_reshaped)

In [9]:
# Function adding visualization to a given "matrix" (keys, memory etc.)
def visualize_hot_cold(matrix_,
                    name_):
    with tf.name_scope(name_+"_vis_hc"):
        # Create hot-cold visualization (red=positive/blue=negative)
        zeros = tf.zeros_like(matrix_) 
        
        # Get negative values only.
        neg = tf.less(matrix_, zeros)
        blue = 255.0 * tf.multiply(tf.cast(neg, tf.float32), matrix_)
        
        # Get positive values only.
        pos = tf.greater(matrix_, zeros)
        red = 255.0 * tf.multiply(tf.cast(pos, tf.float32), matrix_)
        
        # Stack them into three channel image with hot-cold values.
        rgb = tf.stack([red, zeros, blue], axis=2)

        #print("name_=",name_)
        #print("np.int32(zeros.shape[1]=", np.int32(zeros.shape[1]))
        # TODO: find fix and get rid of the batch_size(!)
        rgb_reshaped = tf.reshape(rgb, [1, -1,  np.int32(zeros.shape[1]), 3])

        # Visualize read weights as image.
        rgb_reshaped_summary = tf.summary.image(name_+"_vis", rgb_reshaped)


In [10]:
# Function "fixing" tensordot lack of dimensions.
def tensordot_fix(matrix_a_, matrix_b_, axes_, output_shape_):
    tensor_without_dims = tf.tensordot(matrix_a_, matrix_b_, axes=axes_)
    # TF bugfix - tensordot not returning proper shapes when only partial shapes are known. :]
    # https://github.com/tensorflow/tensorflow/issues/6682
    tensor = tf.reshape(tensor_without_dims, output_shape_)
    return tensor 
    

In [33]:
# Definition of graph of the MANN controller cell.
def controller_cell(input_batch_, # input x [BATCH_SIZE x INPUT_SIZE]
                    prev_memory_output_batch_, # read vector from the memory returned by previous cell [BATCH_SIZE x SLOT_SIZE]
                    prev_read_weights_, # read weights from previous time state (t-1) [1xNUMBER_OF_SLOTS]
                    name_):
    """Creates a controller with shared memory"""
 
    #with tf.name_scope(name_):

    # Create shared (reuseable) space for all cells.   
    with tf.variable_scope("controller_variables", reuse=True):
        
        # Memory.
        memory = tf.get_variable("memory", [SLOT_SIZE, NUMBER_OF_SLOTS], trainable=False)
        
        # Get content part of the memory.
        content = tf.slice(memory, [ADDRESS_SIZE, 0], [CONTENT_SIZE, NUMBER_OF_SLOTS], name="memory_content") 

        # Hidden state calculation gate: input, previous output, and bias.
        Wih = tf.get_variable("Wih", [INPUT_SIZE, HIDDEN_SIZE])
        Wmh = tf.get_variable("Wmh", [SLOT_SIZE, HIDDEN_SIZE])
        bih_mh = tf.get_variable("bih_mh", [1, HIDDEN_SIZE])

        # Read head.
        Whk = tf.get_variable("Whk", [HIDDEN_SIZE, SLOT_SIZE])
        Whbeta = tf.get_variable("Whbeta", [HIDDEN_SIZE, 1])

        # Interpolation gate.
        Wiig = tf.get_variable("Wiig", [INPUT_SIZE, 1])
        Wmig = tf.get_variable("Wmig", [SLOT_SIZE, 1])
        biig_mig = tf.get_variable("biig_mig", [1, 1])

         # Content vector.
        Whv = tf.get_variable("Whv", [HIDDEN_SIZE, CONTENT_SIZE])
        bhv = tf.get_variable("bhv", [1, CONTENT_SIZE])

        # Erase vector.
        Whe = tf.get_variable("Whe", [HIDDEN_SIZE, CONTENT_SIZE])
        bhe = tf.get_variable("bhe", [1, CONTENT_SIZE])

        content_hist = tf.summary.histogram("0content_CxN", content)
        # Add hot-cold visualizations.
        visualize_hot_cold_normalized(matrix_=content, axis_of_reduction_=None, name_="0content_CxN")          


    with tf.name_scope("Hidden_state"):

        # Concatenate input with previous memory output.
        #x_prev_m = tf.concat([input_, prev_memory_output_], 1)

        # Calculate the hidden state [BATCH_SIZE x HIDDEN_SIZE].
        hidden_batch = tf.tanh(tf.matmul(input_batch_, Wih) + 
                               tf.matmul(prev_memory_output_batch_, Wmh) + 
                               bih_mh, name="Hidden")

        with tf.control_dependencies([hidden_batch]):
            # Add histograms to TensorBoard.
            input_batch_hist = tf.summary.histogram("input_batch_batch_BxI", input_batch_)
            prev_memory_output_batch_hist = tf.summary.histogram("prev_memory_output_batch_BxS", prev_memory_output_batch_)
            hidden_batch_hist = tf.summary.histogram("hidden_batch_BxH", hidden_batch)
            # Add hot-cold visualizations.
            visualize_hot_cold_normalized(matrix_=input_batch_, axis_of_reduction_=None, name_="input_batch_batch_BxI")          
            visualize_hot_cold_normalized(matrix_=prev_memory_output_batch_, axis_of_reduction_=None, name_="prev_memory_output_batch_BxS")          
            visualize_hot_cold_normalized(matrix_=hidden_batch, axis_of_reduction_=None, name_="hidden_batch_BxH")          


            
    # Read from the memory.
    with tf.name_scope("read_head"):

        # 1. Calculate read key(s) [BATCH_SIZE x SLOT_SIZE].
        key_batch = tf.tanh(tf.matmul(hidden_batch, Whk))

        # Calculate beta - used for sharpening/smoothing similarity - scalar [BATCHx1].
        beta_batch = 1 + tf.log1p(tf.exp(tf.matmul(hidden_batch, Whbeta)))
        print("beta_batch=", beta_batch)

        # 2. Content addressing.
        # Normalize batch.
        norm_key_batch = tf.nn.l2_normalize(key_batch,1)
        # Normalize memory.
        norm_memory = tf.nn.l2_normalize(memory, 0)
        # Calculate cosine similarity [BATCH_SIZE x NUMBER_OF_SLOTS].
        similarity_batch = tf.matmul(norm_key_batch, norm_memory, name="similarity_batch")

        # Not-normalized similarity with min value adjustment.
        #similarity_batch = tf.matmul(key_batch, memory) # [BATCH_SIZE x NUMBER_OF_SLOTS]
        #sim_min_batch = tf.reduce_min(similarity_batch, axis=1)
        #similarity_batch_adj = similarity_batch - sim_min_batch # neg values will become 0.

        # Element-wise multiplication [BATCH_SIZE x NUMBER_OF_SLOTS]
        sharpened_similarity_batch = beta_batch * similarity_batch
        print("sharpened_similarity_batch=", sharpened_similarity_batch)

        # Calculate read weights based on similarity along the "slot dimension" [BATCH_SIZE x NUMBER_OF_SLOTS].
        content_read_weights_batch = tf.nn.softmax(sharpened_similarity_batch, dim=1)

        # 3. Focusing by location - use of "interpolation gate".
        # Calculate gate - [BATCH_SIZE x1].
        interpolation_gate_batch = tf.sigmoid(tf.matmul(input_batch_, Wiig) + 
                                        tf.matmul(prev_memory_output_batch_, Wmig) + 
                                        biig_mig, name="interpolation_gate_batch")
        print("interpolation_gate_batch=", interpolation_gate_batch)

        # Calculate read weights [BATCH_SIZE x NUMBER_OF_SLOTS].
        read_weights_batch = tf.add(
            tf.matmul((1 - interpolation_gate_batch), prev_read_weights_),
            interpolation_gate_batch * content_read_weights_batch, name="read_weights_batch")

        # Add to list returned as "previous read weights" - reduced to [NUMBER_OF_SLOTS] for the whole batch!
        read_weights_seq.append(tf.reduce_mean(read_weights_batch, axis=0, keep_dims=True))

        # Calculate the read vector.
        memory_output_batch_without_dims = tf.tensordot(read_weights_batch, tf.transpose(memory), axes=1, name="Memory_output_batch_r")   

        # Add histograms to TensorBoard.
        key_batch_hist = tf.summary.histogram("1key_batch_BxS", key_batch)
        norm_key_batch_hist = tf.summary.histogram("1norm_key_batch_BxS", norm_key_batch)
        norm_memory_hist = tf.summary.histogram("1norm_memory_S_N", norm_memory)
        similarity_batch_hist = tf.summary.histogram("2similarity_batch_BxN", similarity_batch)
        beta_batch_hist = tf.summary.histogram("3beta_batch_Bx1", beta_batch)
        sharpened_similarity_batch_hist = tf.summary.histogram("4sharpened_similarity_batch_BxN", sharpened_similarity_batch)
        content_read_weights_batch_hist = tf.summary.histogram("5content_read_weights_batch_BxN", content_read_weights_batch)
        interpolation_gate_batch_hist = tf.summary.histogram("6interpolation_gate_batch_Bx1", interpolation_gate_batch)
        read_weights_batch_hist = tf.summary.histogram("7read_weights_batch_BxN", read_weights_batch)

        # Add hot-cold visualizations.
        visualize_hot_cold_normalized(matrix_=memory, axis_of_reduction_=None, name_="0memory_SxN_at_start")          
        visualize_hot_cold_normalized(matrix_=key_batch, axis_of_reduction_=1, name_="1key_batch_BxS")          
        visualize_hot_cold_normalized(matrix_=similarity_batch, axis_of_reduction_=1, name_="2similarity_batch_BxN")          
        #visualize_hot_cold_normalized(matrix_=beta_batch, axis_of_reduction_=1, name_="3beta_batch")          
        visualize_hot_cold_normalized(matrix_=sharpened_similarity_batch, axis_of_reduction_=1, name_="4sharpened_similarity_batch_BxN")          
        visualize_hot_cold(matrix_=content_read_weights_batch, name_="5content_read_weights_batch_BxN")          
        #visualize_hot_cold(matrix_=interpolation_gate_batch, name_="6interpolation_gate_batch")          
        visualize_hot_cold(matrix_=read_weights_batch, name_="7read_weights_batch_BxN")          

    with tf.name_scope("memory_update"):
        # Perform single update for each sequence/batch.

        # 1. Calculate content vector v  [BATCH_SIZE x CONTENT_SIZE]. (range -inf : + int) ??
        add_content_vector_batch = tf.tanh(tf.matmul(hidden_batch, Whv))

        # 2. Calculate erase vector e  [BATCH_SIZE x NUMBER_OF_SLOTS]. (range 0 : 1)
        erase_content_vector_batch = tf.sigmoid(tf.matmul(hidden_batch, Whe))

        # 3. Preservation gate p [CONTENT_SIZE x NUMBER_OF_SLOTS]
        preservation_gate = 1 - tf.matmul(tf.transpose(erase_content_vector_batch), read_weights_batch)
        # How much memory will "preserve" [SLOT_SIZE x NUMBER_OF_SLOTS] 
        memory_preserved = memory * tf.concat(
            [tf.ones([ADDRESS_SIZE, NUMBER_OF_SLOTS], tf.float32), preservation_gate],
            axis=0)
        
        # 4. Calculate update of the content [CONTENT_SIZE x NUMBER_OF_SLOTS]
        content_update = tf.matmul(tf.transpose(add_content_vector_batch), read_weights_batch)
        # 4. Calculate update of memory [SLOT_SIZE x NUMBER_OF_SLOTS]
        update = tf.concat(
            [tf.zeros([ADDRESS_SIZE, NUMBER_OF_SLOTS], tf.float32), content_update],
            axis=0)
          
        # 5. Calculate "updated" memory [SLOT_SIZE x MEMORY_SIZE]
        #memory_updated = memory            
        #memory_updated = memory_preserved + update            
        memory_updated = tf.minimum(tf.maximum(memory_preserved + update, -1), 1)
        #memory_updated = tf.tanh(memory_preserved + update)

        # Perform memory update !just make sure we alread read from the memory! 
        with tf.control_dependencies([memory_output_batch_without_dims]):
            # Memory "truncation".
            memory_update_op = tf.assign(memory, memory_updated)
            #memory_update_op = memory.assign(tf.tanh(memory_updated))
            #memory_update_op = memory.assign(tf.minimum(tf.maximum(memory_updated, -1), 1))
            #memory_update_op = tf.assign(memory, tf.zeros([SLOT_SIZE, NUMBER_OF_SLOTS]))

        with tf.control_dependencies([memory_updated, memory_update_op]):
            # Add histograms to TensorBoard.
            add_content_vector_batch_hist = tf.summary.histogram("1.add_content_vector_batch_BxS", add_content_vector_batch)
            erase_content_vector_batch_hist = tf.summary.histogram("2.erase_content_vector_batch_BxS", erase_content_vector_batch)
            preservation_gate_hist = tf.summary.histogram("3.1.preservation_gate_SxN", preservation_gate)
            memory_preserved_hist = tf.summary.histogram("3.3.memory_preserved_SxN", memory_preserved)
            update_hist = tf.summary.histogram("4.1.update_SxN", update)
            memory_hist = tf.summary.histogram("5.memorySxN_at_end", memory)
            #memory_trunk_hist = tf.summary.histogram("memory_after_truncation", memory_trunk_op)

            # Add hot-cold visualizations.
            visualize_hot_cold_normalized(matrix_=add_content_vector_batch, axis_of_reduction_=None, name_="1.add_content_vector_batch_BxS")          
            visualize_hot_cold_normalized(matrix_=erase_content_vector_batch, axis_of_reduction_=None, name_="2.erase_content_vector_batch_BxS")          
            visualize_hot_cold_normalized(matrix_=preservation_gate, axis_of_reduction_=None, name_="3.1.preservation_gate_SxN")          
            visualize_hot_cold_normalized(matrix_=memory_preserved, axis_of_reduction_=None, name_="3.3.memory_preserved_SxN")          
            visualize_hot_cold_normalized(matrix_=update, axis_of_reduction_=None, name_="4.update_SxN")          
            visualize_hot_cold_normalized(matrix_=memory, axis_of_reduction_=None, name_="5.memory_SxN_at_end")          
            #visualize_hot_cold_normalized(matrix_=memory, axis_of_reduction_=None, name_="memory_after_truncation")          

    with tf.name_scope("memory_output"):
        # TF bugfix - tensordot not returning proper shapes when only partial shapes are known. :]
        # https://github.com/tensorflow/tensorflow/issues/6682

        # Before returning output batch !make sure that we updated the memory!
        with tf.control_dependencies([memory_update_op]):
            memory_output_batch = tf.reshape(memory_output_batch_without_dims, [-1, SLOT_SIZE])

            # Add histograms to TensorBoard.
            memory_output_batch_hist = tf.summary.histogram("memory_output_batch_BxS", memory_output_batch)
            # Add hot-cold visualizations.
            visualize_hot_cold_normalized(matrix_=memory_output_batch, axis_of_reduction_=None, name_="memory_output_batch_BxS")          

    return memory_output_batch

print("Cell definition OK")

Cell definition OK


###  Definition of tensor graph

In [36]:
# Reset graph - just in case.
tf.reset_default_gr aph()

# Create shared (reuseable) space for all cells.   
with tf.variable_scope("controller_variables"):
    # Memory.
    memory = tf.get_variable("memory", [SLOT_SIZE, NUMBER_OF_SLOTS], trainable=False)

    visualize_hot_cold_normalized(matrix_=memory, axis_of_reduction_=None, name_="memory_SxN")          
    memory_hist = tf.summary.histogram("memory", memory)

    # Hidden state calculation gate: input, previous output, and bias.
    Wih = tf.get_variable("Wih", [INPUT_SIZE, HIDDEN_SIZE])
    Wmh = tf.get_variable("Wmh", [SLOT_SIZE, HIDDEN_SIZE])
    bih_mh = tf.get_variable("bih_mh", [1, HIDDEN_SIZE])

    # Read head.
    Whk = tf.get_variable("Whk", [HIDDEN_SIZE, SLOT_SIZE])
    Whbeta = tf.get_variable("Whbeta", [HIDDEN_SIZE, 1])

    # Interpolation gate.
    Wiig = tf.get_variable("Wiig", [INPUT_SIZE, 1])
    Wmig = tf.get_variable("Wmig", [SLOT_SIZE, 1])
    biig_mig = tf.get_variable("biig_mig", [1, 1])

     # Content vector.
    Whv = tf.get_variable("Whv", [HIDDEN_SIZE, CONTENT_SIZE])
    bhv = tf.get_variable("bhv", [1, CONTENT_SIZE])

    # Erase vector.
    Whe = tf.get_variable("Whe", [HIDDEN_SIZE, CONTENT_SIZE])
    bhe = tf.get_variable("bhe", [1, CONTENT_SIZE])


# Sets sparse memory addresses.
#memory_content = tf.slice(memory, [ADDRESS_SIZE, 0], [CONTENT_SIZE, NUMBER_OF_SLOTS])
memory_content = tf.truncated_normal(shape=[CONTENT_SIZE, NUMBER_OF_SLOTS])
sparse_address = tf.eye(ADDRESS_SIZE) # must be = NUMBER_OF_SLOTS!
init_sparse_memory_addressing = tf.assign(memory, tf.concat([sparse_address, memory_content], axis = 0))

#memory = tf.get_variable("memory", tf.truncated_normal(shape=[HIDDEN_SIZE, NUMBER_OF_SLOTS]), trainable=False)


# 0. Previous variables.
with tf.name_scope("Previous_variables"):

    # Placeholders for previous weights.
    # Each cell (element in sequence) has its own prev read/update vector - ONE FOR THE WHOLE BATCH [1xMEMORY_SIZE]
    prev_read_weights_seq = list()    
    for i_seq in range(SEQ_LENGTH):
        prev_read_weights_seq.append(tf.placeholder(tf.float32, shape=[1, NUMBER_OF_SLOTS], name="Prev_rw"))

# 1. Placeholders for inputs.
with tf.name_scope("Input_data"):
    # Define input data buffers.
    data_buffers = list()
    for _ in range(SEQ_LENGTH + 1):
        # Collect placeholders for inputs/labels: Batch x Vocab size.
        data_buffers.append(tf.placeholder(tf.float32, shape=[None, INPUT_SIZE], name="data_buffers"))

    # Sequence of batches.
    input_seq_batch = data_buffers[:SEQ_LENGTH]

    # Labels are pointing to the same placeholders!
    # Labels are inputs shifted by one time step.
    labels_seq_batch = data_buffers[1:]  
    # Concatenate targets into 2D tensor.
    target_batch = tf.concat(labels_seq_batch, 0)

    # Add histograms to TensorBoard.
    input_seq_batch_hist = tf.summary.histogram("input_seq_batch", input_seq_batch)

# 2. Unrolled controller ops.
#with tf.name_scope("Controller"):
# Placeholder for "initial" memory output [BATCH_SIZE X SLOT_SIZE].
init_memory_output = tf.placeholder(tf.float32, shape=[None, SLOT_SIZE], name="init_memory_read")

# Build list of outpus for sequence (at the end of size SEQ_LENGTH).
memory_outputs_batch_seq = list()
# Two lists that will be "returned" and later passed as previous states. 
read_weights_seq = list()  

# Build "unrolled controller" - "link" oldest memory output to initial placeholder.
memory_output = init_memory_output
# For every buffer in input sequence batch buffers...
for i in range(SEQ_LENGTH):
    # ... add cell...     
    memory_output = controller_cell(
        input_seq_batch[i], 
        memory_output, 
        prev_read_weights_seq[i],
        "cell_"+str(i))
    # .. and add controller buffer to outputs.
    memory_outputs_batch_seq.append(memory_output)

# Add histograms to TensorBoard.
memory_outputs_batch_seq_hist = tf.summary.histogram("memory_outputs_batch_seq", memory_outputs_batch_seq)
read_weights_seq_hist = tf.summary.histogram("read_weights_seq", read_weights_seq)

# 3. Output ops.
with tf.name_scope("Output"):
    # Concatenate sequence along time axis.
    output_batch = tf.concat(memory_outputs_batch_seq, 0)
 
    # Output layer weights and biases.
    Wol = tf.get_variable("Wol", [SLOT_SIZE, INPUT_SIZE])
    bol = tf.get_variable("bol", [INPUT_SIZE])

    # Logits.
    logits_batch = tf.nn.xw_plus_b(output_batch, Wol, bol, name = "Logits")
    # Add fully connected softmax layer on top - for predictions.
    prediction_batch = tf.nn.softmax(logits_batch)

 
# 4. Loss ops.
with tf.name_scope("Loss"):
    # Loss function(s) - one for every output generated by every LSTM cell.
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=target_batch, logits=logits_batch))
    # Add loss summary.
    loss_summary = tf.summary.scalar("loss", loss)

# 5. Training ops.  
with tf.name_scope("Optimization"):
    # Learning rate decay.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(0.1, global_step, 5000, 0.9, staircase=True)
    # Optimizer.
    optimizer = tf.train.AdamOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(loss)
    gradients, v = zip(*grads_and_vars)
    # Gradient clipping.
    gradients, _ = tf.clip_by_global_norm(gradients, 0.1)
    optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)

    for i, (grad, var) in enumerate(grads_and_vars):
        if grad is not None:
            gradients[i] = (tf.clip_by_value(grad, -10, 10), var)
            tf.summary.histogram(var.name, var)
            tf.summary.histogram(var.name + '/grad', grad)
    
    
# Merge all summaries.
merged_summaries = tf.summary.merge_all()

print("Graph definition OK")

beta_batch= Tensor("read_head/add:0", shape=(?, 1), dtype=float32)
sharpened_similarity_batch= Tensor("read_head/mul:0", shape=(?, 20), dtype=float32)
interpolation_gate_batch= Tensor("read_head/interpolation_gate_batch:0", shape=(?, 1), dtype=float32)
beta_batch= Tensor("read_head_1/add:0", shape=(?, 1), dtype=float32)
sharpened_similarity_batch= Tensor("read_head_1/mul:0", shape=(?, 20), dtype=float32)
interpolation_gate_batch= Tensor("read_head_1/interpolation_gate_batch:0", shape=(?, 1), dtype=float32)
INFO:tensorflow:Summary name controller_variables/Wih:0 is illegal; using controller_variables/Wih_0 instead.
INFO:tensorflow:Summary name controller_variables/Wih:0/grad is illegal; using controller_variables/Wih_0/grad instead.
INFO:tensorflow:Summary name controller_variables/Wmh:0 is illegal; using controller_variables/Wmh_0 instead.
INFO:tensorflow:Summary name controller_variables/Wmh:0/grad is illegal; using controller_variables/Wmh_0/grad instead.
INFO:tensorflow:Summary name

In [37]:
def create_feed_dict(set_type_):
    """Creates feed dictionaries and set initial values for placeholders for different sets"""
    feed_dict = dict()
    
    if set_type_=="train":
        # Get next batch and create a feed dict.
        next_batch = train_batches.next()
        # Feed batch to input buffers.
        for i in range(SEQ_LENGTH + 1):
            feed_dict[data_buffers[i]] = next_batch[i]

        # Set previous weights of read and write heades.
        for i in range(SEQ_LENGTH):
            feed_dict[prev_read_weights_seq[i]] = prev_rw_seq[i]

        # Reset "initial" memory output.
        feed_dict[init_memory_output] = np.zeros([BATCH_SIZE, SLOT_SIZE])
            
    elif set_type_=="valid":
        for i in range(SEQ_LENGTH + 1):
            feed_dict[data_buffers[i]] = valid_batch[i]

        # Set previous weights of read and write heades.
        for i in range(SEQ_LENGTH):
            feed_dict[prev_read_weights_seq[i]] = prev_rw_seq[i]

        # Reset "initial" memory output.
        feed_dict[init_memory_output] = np.zeros([valid_size, SLOT_SIZE])

    else: # test
        for i in range(SEQ_LENGTH + 1):
            feed_dict[data_buffers[i]] = test_batch[i]
        
        # Set previous weights of read and write heades.
        for i in range(SEQ_LENGTH):
            feed_dict[prev_read_weights_seq[i]] = prev_rw_seq[i]

        # Reset "initial" memory output.
        feed_dict[init_memory_output] = np.zeros([test_size, SLOT_SIZE])
       
    return feed_dict

print("Feed_dict definition OK")

Feed_dict definition OK


### Session execution

In [38]:
# Eventually clear the log dir.
if tf.gfile.Exists(LOG_DIR):
  tf.gfile.DeleteRecursively(LOG_DIR)
# Create (new) log dir.
tf.gfile.MakeDirs(LOG_DIR)

print("Log dir CLEARED")

Log dir CLEARED


In [39]:

# Create session to execute graph.
sess=tf.InteractiveSession()

# Create summary writers, point them to LOG_DIR.
train_writer = tf.summary.FileWriter(LOG_DIR + '/train', sess.graph)
valid_writer = tf.summary.FileWriter(LOG_DIR + '/valid')
test_writer = tf.summary.FileWriter(LOG_DIR + '/test')

# Initialize global variables.
tf.global_variables_initializer().run()
print('Variables initialized')

# Create initial previous read and update - full of zeros. 
prev_rw_seq = list()
for i in range(SEQ_LENGTH):
    prev_rw_seq.append(np.zeros([1, NUMBER_OF_SLOTS]))

# Initialize sparse memory addressing
#sess.run(init_sparse_memory_addressing)
    
# Determine how long to perform the training and how often the test loss on validation batch will be computed. 
summary_frequency = 1
validation_frequency = 1000
num_steps = 10 #// (BATCH_SIZE*SEQ_LENGTH)
#num_steps = train_size // (BATCH_SIZE*SEQ_LENGTH)
print("Number of iterations per epoch =", num_steps)
for step in range(num_steps):
    input_seq_batch_, memory_, prev_rw_seq, summaries, _, loss_, lr_ = sess.run([
        input_seq_batch, memory, read_weights_seq, merged_summaries, optimizer, loss, learning_rate],
        feed_dict=create_feed_dict("train"))
    # Every (100) steps collect statistics.
    if step % summary_frequency == 0:
        # Add summary.
        train_writer.add_summary(summaries, step*BATCH_SIZE*SEQ_LENGTH)
        train_writer.flush()
        print('Training set BPC at step %d: %0.5f learning rate: %f' % (step, loss_, lr_))

        #print("memory=\n", memory_)
        # Print loss from last batch.
    
    #if step % validation_frequency == 0:
    #    # Validation set BPC.
    #    print('=' * 80)
    #    print("Calculating BPC on validation set")
    #    v_summaries, v_loss = sess.run([merged_summaries, loss], feed_dict=create_feed_dict("valid"))
    #    print("Validation set BPC: %.5f" % v_loss)
    #    valid_writer.add_summary(v_summaries, step*BATCH_SIZE*SEQ_LENGTH)
    #    valid_writer.flush()
    # End of statistics collection

#for gv in grads_and_vars:
#    #print(str(sess.run(gv[0])) + " - " + gv[1].name)        
#    print(" - " + gv[1].name)        

# Test set BPC.
#print('=' * 80)
#print("Calculating BPC on test set")
#t_summary, t_loss = sess.run([merged_summaries, loss], feed_dict=create_feed_dict("test"))
#print("Final test set BPC: %.5f" % t_loss)
#test_writer.add_summary(t_summary, step*BATCH_SIZE*SEQ_LENGTH)
#test_writer.flush()
    
# Close writers and session.
train_writer.close()
valid_writer.close()
test_writer.close()
sess.close() 

Variables initialized
Number of iterations per epoch = 100
Training set BPC at step 0: 4.08130 learning rate: 0.100000
Training set BPC at step 10: 4.54316 learning rate: 0.100000
Training set BPC at step 20: 3.91468 learning rate: 0.100000
Training set BPC at step 30: 3.96386 learning rate: 0.100000
Training set BPC at step 40: 3.12513 learning rate: 0.100000
Training set BPC at step 50: 2.63880 learning rate: 0.100000
Training set BPC at step 60: 3.40091 learning rate: 0.100000
Training set BPC at step 70: 4.54256 learning rate: 0.100000
Training set BPC at step 80: 2.73602 learning rate: 0.100000
Training set BPC at step 90: 3.06107 learning rate: 0.100000
