In [1]:
from __future__ import print_function
import os

import numpy as np
import zipfile
import tarfile
from six.moves.urllib.request import urlretrieve
import shutil 
import random

import string
import tensorflow as tf

# Dirs - must be absolute paths!
LOG_DIR = '/tmp/tf/ptb_char_lstm_mann_lrua_shared_memory_unified_prev/h32b10s1mem10lr2/'
# Local dir where PTB files will be stored.
PTB_DIR = '/home/tkornuta/data/ptb/'

# Filenames.
TRAIN = "ptb.train.txt"
VALID = "ptb.valid.txt"
TEST = "ptb.test.txt"

# Size of the hidden state
HIDDEN_SIZE = 32

# Size of batch
BATCH_SIZE = 10

# Length of sequence (=  number of units of controller (recurrent layer))
SEQ_LENGTH = 1

#### MANN-related parameters.
# Size of the local memory of each cell.
MEMORY_SLOTS = 10

# Number of smallest elements - used in LRUA scheme.
N_SMALLEST = 1

# "Update weight decay".
GAMMA = 1e-4 

# Eps for normalization in visualization
EPS = 1e-10

In [2]:
def maybe_download_ptb(path, 
                       filename='simple-examples.tgz', 
                       url='http://www.fit.vutbr.cz/~imikolov/rnnlm/', 
                       expected_bytes =34869662):
  # Eventually create the PTB dir.
  if not tf.gfile.Exists(path):
    tf.gfile.MakeDirs(path)
  """Download a file if not present, and make sure it's the right size."""
  _filename = path+filename
  if not os.path.exists(_filename):
    print('Downloading %s...' % filename)
    _filename, _ = urlretrieve(url+filename, _filename)
  statinfo = os.stat(_filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified', (_filename), '(', statinfo.st_size, ')')
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + _filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download_ptb(PTB_DIR)

Found and verified /home/tkornuta/data/ptb/simple-examples.tgz ( 34869662 )


### Check/maybe download PTB.

### Extract dataset-related files from the PTB archive.

In [3]:
def extract_ptb(path, filename='simple-examples.tgz', files=["ptb.train.txt", "ptb.valid.txt", "ptb.test.txt", 
                                       "ptb.char.train.txt", "ptb.char.valid.txt", "ptb.char.test.txt"]):
    """Extracts files from PTB archive."""
    # Extract
    tar = tarfile.open(path+filename)
    tar.extractall(path)
    tar.close()
    # Copy files
    for file in files:
        shutil.copyfile(PTB_DIR+"simple-examples/data/"+file, PTB_DIR+file)
    # Delete directory
    shutil.rmtree(PTB_DIR+"simple-examples/")        

extract_ptb(PTB_DIR)

### Load train, valid and test texts.

In [4]:
def read_data(filename, path):
    with open(path+filename, 'r') as myfile:
        data=myfile.read()# .replace('\n', '')
        return data

train_text = read_data(TRAIN, PTB_DIR)
train_size=len(train_text)
print(train_size, train_text[:100])

valid_text = read_data(VALID, PTB_DIR)
valid_size=len(valid_text)
print(valid_size, valid_text[:64])

test_text = read_data(TEST, PTB_DIR)
test_size=len(test_text)
print(test_size, test_text[:64])

5101618  aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memote
399782  consumers may want to move their telephones a little closer to 
449945  no it was n't black monday 
 but while the new york stock excha


### Utility functions to map characters to vocabulary IDs and back.

In [5]:
vocabulary_size = 59 # [A-Z] + [a-z] + ' ' +few 'in between; + punctuation
first_letter = ord(string.ascii_uppercase[0]) # ascii_uppercase before lowercase! 
print("vocabulary size = ", vocabulary_size)
print(first_letter)

def char2id(char):
  """ Converts char to id (int) with one-hot encoding handling of unexpected characters"""
  if char in string.ascii_letters:# or char in string.punctuation or char in string.digits:
    return ord(char) - first_letter + 1
  elif char == ' ':
    return 0
  else:
    # print('Unexpected character: %s' % char)
    return 0
  
def id2char(dictid):
  """ Converts single id (int) to character"""
  if dictid > 0:
    return chr(dictid + first_letter - 1)
  else:
    return ' '

def characters(probabilities):
  """Turn a 1-hot encoding or a probability distribution over the possible
  characters back into its (most likely) character representation."""
  return [id2char(c) for c in np.argmax(probabilities, 1)]

def batches2string(batches):
  """Convert a sequence of batches back into their (most likely) string
  representation."""
  s = [''] * batches[0].shape[0]
  for b in batches:
    s = [''.join(x) for x in zip(s, characters(b))]
  return s

#print(len(string.punctuation))
#for i in string.ascii_letters:
#    print (i, char2id(i))


print(char2id('a'), char2id('A'), char2id('z'), char2id('Z'), char2id(' '), char2id('ï'))
print(id2char(char2id('a')), id2char(char2id('A')))
#print(id2char(65), id2char(33), id2char(90), id2char(58), id2char(0))
#bankno
sample = np.zeros(shape=(1, vocabulary_size), dtype=np.float)
sample[0, char2id(' ')] = 1.0
print(sample)

vocabulary size =  59
65
33 1 58 26 0 0
a A
[[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.]]


### Helper class for batch generation

In [6]:
class BatchGenerator(object):
  def __init__(self, text, batch_size, seq_length, vocab_size):
    """
    Initializes the batch generator object. Stores the variables and first "letter batch".
    text is text to be processed
    batch_size is size of batch (number of samples)
    seq_length represents the length of sequence
    vocab_size is number of words in vocabulary (assumes one-hot encoding)
    """
    # Store input parameters.
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._seq_length = seq_length
    self._vocab_size = vocab_size
    # Divide text into segments depending on number of batches, each segment determines a cursor position for a batch.
    segment = self._text_size // batch_size
    # Set initial cursor position.
    self._cursor = [ offset * segment for offset in range(batch_size)]
    # Store first "letter batch".
    self._last_letter_batch = self._next_letter_batch()
  
  def _next_letter_batch(self):
    """
    Returns a batch containing of encoded single letters depending on the current batch 
    cursor positions in the data.
    Returned "letter batch" is of size batch_size x vocab_size
    """
    letter_batch = np.zeros(shape=(self._batch_size, self._vocab_size), dtype=np.float)
    # Iterate through "samples"
    for b in range(self._batch_size):
      # Set 1 in position pointed out by one-hot char encoding.
      letter_batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return letter_batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    # First add last letter from previous batch (the "additional one").
    batches = [self._last_letter_batch]
    for step in range(self._seq_length):
      batches.append(self._next_letter_batch())
    # Store last "letter batch" for next batch.
    self._last_letter_batch = batches[-1]
    return batches


In [7]:
# Trick - override first 10 chars
#list1 = list(train_text)
#for i in range(2):
#    list1[i] = 'z'
#train_text = ''.join(list1)
#print("Train set =", train_text[0:100])

# Create objects for training, validation and testing batch generation.
train_batches = BatchGenerator(train_text, BATCH_SIZE, SEQ_LENGTH, vocabulary_size)

# Get first training batch.
#batch = train_batches.next()
#print(len(batch))
#print(batch[0].shape)
#print("Batch = ", batch)
#print(batches2string(batch))
#print("batch len = num of enrollings",len(batch))
#for i in range(num_unrollings):
#    print("i = ", i, "letter=", batches2string(batch)[0][i][0], "bits = ", batch[i][0])


# For validation  - process the whole text as one big batch.
VALID_BATCH_SIZE = int(np.floor(valid_size/SEQ_LENGTH))
valid_batches = BatchGenerator(valid_text, VALID_BATCH_SIZE, SEQ_LENGTH, vocabulary_size)
# Get single batch! 
valid_batch = valid_batches.next()

# For texting  - process the whole text as one big batch.
TEST_BATCH_SIZE = int(np.floor(test_size/SEQ_LENGTH))
test_batches = BatchGenerator(test_text, TEST_BATCH_SIZE, SEQ_LENGTH, vocabulary_size)
# Get single batch! 
test_batch = test_batches.next()


### Helper functions - used during graph definition

In [8]:
# Function adding visualization to a given "matrix" (keys, memory etc.) with additional normalization
def visualize_hot_cold_normalized(matrix_,
                    axis_of_reduction_,
                    name_):
        # Create hot-cold visualization (red=positive/blue=negative)
        zeros = tf.zeros_like(matrix_) 
        
        # Get negative values only.
        neg = tf.less(matrix_, zeros)
        blue = tf.multiply(tf.cast(neg, tf.float32), matrix_)
        min_blue = tf.reduce_min(matrix_, axis=axis_of_reduction_, keep_dims=True) + EPS
        norm_blue = 255.0 * blue/min_blue
        
        # Get positive values only.
        pos = tf.greater(matrix_, zeros)
        red = tf.multiply(tf.cast(pos, tf.float32), matrix_)
        max_red = tf.reduce_max(matrix_, axis=axis_of_reduction_, keep_dims=True) + EPS
        norm_red = 255.0 * red/max_red
        
        # Stack them into three channel image with hot-cold values.
        rgb = tf.stack([norm_red, zeros, norm_blue], axis=2)

        #print("name_=",name_)
        #print("np.int32(zeros.shape[1]=", np.int32(zeros.shape[1]))
        # TODO: find fix and get rid of the batch_size(!)
        rgb_reshaped = tf.reshape(rgb, [1, -1,  np.int32(zeros.shape[1]), 3])

        # Visualize read weights as image.
        rgb_reshaped_summary = tf.summary.image(name_+"_normalized", rgb_reshaped)

In [9]:
# Function adding visualization to a given "matrix" (keys, memory etc.)
def visualize_hot_cold(matrix_,
                    name_):
        # Create hot-cold visualization (red=positive/blue=negative)
        zeros = tf.zeros_like(matrix_) 
        
        # Get negative values only.
        neg = tf.less(matrix_, zeros)
        blue = 255.0 * tf.multiply(tf.cast(neg, tf.float32), matrix_)
        
        # Get positive values only.
        pos = tf.greater(matrix_, zeros)
        red = 255.0 * tf.multiply(tf.cast(pos, tf.float32), matrix_)
        
        # Stack them into three channel image with hot-cold values.
        rgb = tf.stack([red, zeros, blue], axis=2)

        #print("name_=",name_)
        #print("np.int32(zeros.shape[1]=", np.int32(zeros.shape[1]))
        # TODO: find fix and get rid of the batch_size(!)
        rgb_reshaped = tf.reshape(rgb, [1, -1,  np.int32(zeros.shape[1]), 3])

        # Visualize read weights as image.
        rgb_reshaped_summary = tf.summary.image(name_, rgb_reshaped)


In [10]:
# Definition of the controller cell graph.
def controller_cell(input_, # input x
                    memory_input_, # read vector from the memory returned by previous cell
                    prev_output_, # output of the previous cell
                    prev_cell_state_, # previous cell state
                    prev_read_weights_, # read weights from previous time state (t-1) [1xMEMORY_SLOTS]
                    prev_update_weights_, # update weights from previous time state (t-1) [1xMEMORY_SLOTS]
                    name_):
    """Create a controller with local memory cell"""
    """First dimensions of each of the computational nodes below is "derrived" from BATCH_SIZE"""
    with tf.name_scope(name_):

        with tf.name_scope("LSTM"):
            # LSTM cell equations according to Christopher Olah blog.
            # colah.github.io/posts/2015-08-Understanding-LSTMs/
            # Concatenate intyp x with h_prev ("prev output") TODO: and memory.
            i_h_m = tf.concat([input_, prev_output_, memory_input_], 1)

            # Calculate forget, input and output gate activations.
            forget_gate = tf.sigmoid(tf.matmul(i_h_m, cWf) + cbf, name="Forget_gate")
            input_gate = tf.sigmoid(tf.matmul(i_h_m, cWi) + cbi, name="Input_gate")
            output_gate = tf.sigmoid(tf.matmul(i_h_m, cWo) + cbo, name="Output_gate")

            # Update of the cell state C~.
            cell_update = tf.tanh(tf.matmul(i_h_m, cWc) + cbc, name="Cell_update")
            # New cell state C.
            cell_state = tf.add(forget_gate * prev_cell_state_, input_gate * cell_update, name = "Cell_state")
            # Calculate h - "output".
            cell_output = output_gate * tf.tanh(cell_state)
            
        with tf.name_scope("Keys"):
            # Calculate keys - read and add.
            #k_t = tf.tanh(tf.matmul(cell_output, W_key) + b_key) # (batch_size, nb_reads, memory_size[1])
            #a_t = tf.tanh(tf.matmul(cell_output, W_add) + b_add) # (batch_size, nb_reads, memory_size[1])
            #alpha = tf.sigmoid(tf.matmul(cell_output, W_alpha) + b_alpha) # (batch_size, nb_reads, 1)
            # Latest vs LRU ratio.
            alpha = tf.Variable(tf.truncated_normal(shape=[]), name="Alpha")

            # Add histograms to TensorBoard.
            #k_t_hist = tf.summary.histogram("k_t", k_t)
            #a_t_hist = tf.summary.histogram("a_t", a_t)
            alpha_scalar = tf.summary.scalar("alpha", alpha)
            
        # Read from the memory.
        with tf.name_scope("Read_head"):
            
            # Normalize batch.
            norm_batch = tf.nn.l2_normalize(cell_output,1)
            # Normalize memory.
            norm_memory = tf.nn.l2_normalize(memory,0)
            # Calculate cosine similarity.
            #similarity_batch = tf.matmul(norm_batch, norm_memory) 

            # Not-normalized similarity with min value adjustment.
            similarity_batch = tf.matmul(cell_output, memory) 
            #sim_min_batch = tf.reduce_min(similarity_batch, axis=1)
            #similarity_batch_adj = similarity_batch - sim_min_batch # neg values will become 0.
            
            # Calculate read weights based on similarity.
            read_weights_batch = tf.nn.softmax(10 *similarity_batch)

            # Add to list returned as "previous read weights" - reduced to [1xMEMORY_SLOTS] for the whole batch!
            read_weights_seq.append(tf.reduce_mean(read_weights_batch, axis=0, keep_dims=True))
            
            # Add histograms to TensorBoard.
            norm_batch_hist = tf.summary.histogram("norm_batch", norm_batch)
            norm_batch_hist = tf.summary.histogram("norm_batch", norm_batch)
            norm_memory_hist = tf.summary.histogram("norm_memory", norm_memory)
            similarity_batch_hist = tf.summary.histogram("cosine_similarity_batch", similarity_batch)
            read_weights_batch_hist = tf.summary.histogram("read_weights_batch", read_weights_batch)
            
            # Add hot-cold visualizations.
            visualize_hot_cold_normalized(matrix_=cell_output, axis_of_reduction_=None, name_="cell_output")          
            #hot_cold_visualization(matrix_=k_t, axis_of_reduction_=1, name_="k_t")          
            visualize_hot_cold_normalized(matrix_=similarity_batch, axis_of_reduction_=1, name_="similarity_batch")          
            #visualize_hot_cold_normalized(matrix_=similarity_batch_adj, axis_of_reduction_=1, name_="similarity_batch_adj")          
            visualize_hot_cold(matrix_=read_weights_batch, name_="read_weights_batch")          
        
        with tf.name_scope("Memory_output"):
            # Calcualte read vector.
            memory_output_batch_without_dims = tf.tensordot(read_weights_batch, tf.transpose(memory), axes=1, name="Memory_output_batch_r")   
            # TF bugfix - tensordot not returning proper shapes when only partial shapes are known. :]
            # https://github.com/tensorflow/tensorflow/issues/6682
            memory_output_batch = tf.reshape(memory_output_batch_without_dims, [-1, HIDDEN_SIZE])
            # Add histograms to TensorBoard.
            memory_output_batch_hist = tf.summary.histogram("memory_output_batch", memory_output_batch)
            # Add hot-cold visualizations.
            visualize_hot_cold_normalized(matrix_=memory_output_batch, axis_of_reduction_=None, name_="memory_output_batch")          

        with tf.name_scope("Write_head"):
            # "Truncation scheme to update the least-used positions".
            # First, find (size-n) top elements (one for all samples in a batch, but separatelly for each cell).
            top = tf.nn.top_k(-prev_update_weights_, N_SMALLEST)
            # To get boolean True/False values, you can first get the k-th value and then use tf.greater_equal:
            kth = tf.reduce_min(top.values, axis=1, keep_dims=True)
            top2 = tf.greater_equal(-prev_update_weights_, kth)
            # And finally - cast it to n smallest elements.
            prev_smallest_lru_weights = tf.cast(top2, tf.float32)

            #write_weights_seq_batch.append(prev_smallest_lru_weights)
            write_weights = tf.add(tf.sigmoid(alpha) * prev_read_weights_,
                                   (1.0 - tf.sigmoid(alpha)) * prev_smallest_lru_weights,
                                   name="Write_weights_ww")

            # Add histograms to TensorBoard.
            prev_smallest_lru_weights_hist = tf.summary.histogram("prev_smallest_lru_weights", prev_smallest_lru_weights)
            write_weights_hist = tf.summary.histogram("write_weights", write_weights)

            # Add hot-cold visualizations.
            visualize_hot_cold_normalized(matrix_=write_weights, axis_of_reduction_=None, name_="write_weights")          

        with tf.name_scope("Update_head"):
            # This relies on prev. weights and will be used in fact in the NEXT step.
            rw_sum_reduced = tf.reduce_mean(read_weights_batch, axis=0, keep_dims=True) + write_weights
            update_weights = tf.add(GAMMA * prev_update_weights_,
                                            rw_sum_reduced,
                                            name="Update_weights_uw")
            
            # Add to list returned as "previous update weights".
            update_weights_seq.append(update_weights)
            # Add histograms to TensorBoard.
            update_weights_hist = tf.summary.histogram("update_weights", update_weights)
            # Add hot-cold visualizations.
            visualize_hot_cold_normalized(matrix_=update_weights, axis_of_reduction_=None, name_="update_weights")          

            
    with tf.name_scope("Memory_update"):
        # Perform single update for each sequence/batch.
        #memory_update_without_dims = tf.tensordot(cell_output_sum, write_weights, axes=1)
        # TF bugfix - tensordot not returning proper shapes when only partial shapes are known. :]
        # https://github.com/tensorflow/tensorflow/issues/6682
        #memory_update = tf.reshape(memory_update_without_dims, [HIDDEN_SIZE, MEMORY_SLOTS]) 
        
        cell_output_sum_transposed = tf.transpose(tf.reduce_sum(cell_output, axis=0, keep_dims=True))
        memory_update = cell_output_sum_transposed * write_weights
        memory_update_seq.append(memory_update)
        
        # Add dependendency control - first prediction?
        #with tf.control_dependencies([prediction_batch]):
        # Update the memory.
        memory_updated = memory + memory_update
        
        # Memory "truncation".
        #memory_trunk_op = memory.assign(memory_updated)
        #memory_trunk_op = memory.assign(tf.tanh(memory_updated))
        memory_trunk_op = memory.assign(tf.minimum(tf.maximum(memory_updated, -1), 1))

        # Add histograms to TensorBoard.
        memory_update_hist = tf.summary.histogram("memory_update", memory_update)
        memory_hist = tf.summary.histogram("memory_before_truncation", memory_updated)
        memory_trunk_hist = tf.summary.histogram("memory_after_truncation", memory_trunk_op)

        # Add hot-cold visualizations.
        visualize_hot_cold_normalized(matrix_=memory_update, axis_of_reduction_=None, name_="memory_update")          
        visualize_hot_cold_normalized(matrix_=memory_updated, axis_of_reduction_=None, name_="memory_before_truncation")          
        visualize_hot_cold_normalized(matrix_=memory, axis_of_reduction_=None, name_="memory_after_truncation")          
        
    return memory_output_batch, cell_output, cell_state

print("Cell definition OK")

Cell definition OK


###  Definition of tensor graph

In [11]:
# Reset graph - just in case.
tf.reset_default_graph()

# Memory.
memory = tf.Variable(tf.truncated_normal(shape=[HIDDEN_SIZE, MEMORY_SLOTS]), trainable=False, name="Memory_M")
# Add histograms to TensorBoard.
memory_hist = tf.summary.histogram("memory_main", memory)

# 0. Previous variables.
with tf.name_scope("Previous_variables"):

    # Placeholders for previous weights.
    # Each cell (element in sequence) has its own prev read/update vector - ONE FOR THE WHOLE BATCH [1xMEMORY_SIZE]
    prev_read_weights_seq = list()    
    prev_update_weights_seq = list()    
    for i_seq in range(SEQ_LENGTH):
        prev_read_weights_seq.append(tf.placeholder(tf.float32, shape=[1, MEMORY_SLOTS], name="Prev_rw"))
        prev_update_weights_seq.append(tf.placeholder(tf.float32, shape=[1, MEMORY_SLOTS], name="Prev_uw"))

# 1. Placeholders for inputs.
with tf.name_scope("Input_data"):
    # Define input data buffers.
    data_buffers = list()
    for _ in range(SEQ_LENGTH + 1):
        # Collect placeholders for inputs/labels: Batch x Vocab size.
        data_buffers.append(tf.placeholder(tf.float32, shape=[None, vocabulary_size], name="data_buffers"))

    # Sequence of batches.
    input_seq_batch = data_buffers[:SEQ_LENGTH]

    # Labels are pointing to the same placeholders!
    # Labels are inputs shifted by one time step.
    labels_seq_batch = data_buffers[1:]  
    # Concatenate targets into 2D tensor.
    target_batch = tf.concat(labels_seq_batch, 0)

    # Add histograms to TensorBoard.
    input_seq_batch_hist = tf.summary.histogram("input_seq_batch", input_seq_batch)

# 2. Unrolled controller ops.
with tf.name_scope("Controller"):
    # Define controller parameters:
    # Input gate: input, previous output, and bias.
    cWf = tf.Variable(tf.truncated_normal([vocabulary_size+2*HIDDEN_SIZE, HIDDEN_SIZE], -0.1, 0.1), name="cWf")
    cbf = tf.Variable(tf.zeros([1, HIDDEN_SIZE]), name="bf")

    # Forget gate: input, previous output, and bias.
    cWi = tf.Variable(tf.truncated_normal([vocabulary_size+2*HIDDEN_SIZE,HIDDEN_SIZE], -0.1, 0.1), name="cWi")
    cbi = tf.Variable(tf.zeros([1, HIDDEN_SIZE]), name="bi")

    # Output gate: input, previous output, and bias.
    cWo = tf.Variable(tf.truncated_normal([vocabulary_size+2*HIDDEN_SIZE, HIDDEN_SIZE], -0.1, 0.1), name="cWo")
    cbo = tf.Variable(tf.zeros([1, HIDDEN_SIZE]), name="bo")

    # Memory cell: input, state and bias.                             
    cWc = tf.Variable(tf.truncated_normal([vocabulary_size+2*HIDDEN_SIZE, HIDDEN_SIZE], -0.1, 0.1), name="cWc")
    cbc = tf.Variable(tf.zeros([1, HIDDEN_SIZE]), name="bc")

    # Read key.
    #W_key = tf.Variable(tf.truncated_normal([HIDDEN_SIZE, HIDDEN_SIZE], -0.1, 0.1), name="W_key")
    #b_key = tf.Variable(tf.zeros([1, HIDDEN_SIZE]), name="b_key")
    
    # Add key.
    #W_add = tf.Variable(tf.truncated_normal([HIDDEN_SIZE, HIDDEN_SIZE], -0.1, 0.1), name="W_add")
    #b_add = tf.Variable(tf.zeros([1, HIDDEN_SIZE]), name="b_add")
    
    # Alpha - used in Latest vs LRU ratio.
    #W_alpha = tf.Variable(tf.truncated_normal([HIDDEN_SIZE, 1], -0.1, 0.1), name="W_alpha")
    #b_alpha = tf.Variable(tf.zeros([1, 1]), name="b_alpha")
    
    # Placeholders for "zero" (the oldest) state and output: Batch x Hidden size.
    init_controller_output = tf.placeholder(tf.float32, shape=[None, HIDDEN_SIZE], name="init_controller_output")
    init_controller_state = tf.placeholder(tf.float32, shape=[None, HIDDEN_SIZE], name="init_controller_state")
    # Placeholder for "zero" memory read: Batch X Hidden (TODO: memory?) size.
    init_memory_output = tf.placeholder(tf.float32, shape=[None, HIDDEN_SIZE], name="init_memory_read")

    # Unrolled LSTM.
    # Build outpus of size SEQ_LENGTH.
    controller_outputs_batch_seq = list()
    memory_outputs_batch_seq = list()
    # Two lists that will be "returned" and later passed as previous states. 
    read_weights_seq = list()  
    update_weights_seq = list()  
    memory_update_seq = list()
    
    # "Link" oldest statte and output to placeholders.
    controller_output = init_controller_output
    controller_state = init_controller_state
    memory_output = init_memory_output
    # For every buffer in input sequence batch buffers...
    for i in range(SEQ_LENGTH):
        # ... add cell...     
        memory_output, controller_output, controller_state = controller_cell(
            input_seq_batch[i], 
            memory_output, 
            controller_output, 
            controller_state, 
            prev_read_weights_seq[i],
            prev_update_weights_seq[i],
            "cell_"+str(i))
        # .. and add controller buffer to outputs.
        controller_outputs_batch_seq.append(controller_output)
        memory_outputs_batch_seq.append(memory_output)
         
    # Add histograms to TensorBoard.
    controller_outputs_batch_seq_hist = tf.summary.histogram("controller_outputs_batch_seq", controller_outputs_batch_seq)
    memory_outputs_batch_seq_hist = tf.summary.histogram("memory_outputs_batch_seq", memory_outputs_batch_seq)
    memory_hist = tf.summary.histogram("memory", memory)
    read_weights_seq_hist = tf.summary.histogram("read_weights_seq", read_weights_seq)
    update_weights_seq_hist = tf.summary.histogram("update_weights_seq", update_weights_seq)

# 3. Output ops.
with tf.name_scope("Output"):
    # Concatenate controller hidden state with the read vector.
    concat_batch_seq = list()    
    for i_seq in range(SEQ_LENGTH):
        concat_output_batch = tf.concat([controller_outputs_batch_seq[i_seq],
                                        memory_outputs_batch_seq[i_seq]],
                                        axis=1, 
                                        name="Concat_mann_output")
        concat_batch_seq.append(concat_output_batch)   
        # Add hot-cold visualizations.
        visualize_hot_cold_normalized(matrix_=concat_output_batch, axis_of_reduction_=None, name_="concatenated_mann_output_batch")          

    # Add histograms to TensorBoard.
    concat_batch_seq_hist = tf.summary.histogram("concat_batch_seq", concat_batch_seq)
    
    #output_batch = tf.concat(memory_outputs_batch_seq, 0) 
    output_batch = tf.concat(concat_batch_seq, 0)
 
    # Output layer weights and biases.
    Wh = tf.Variable(tf.truncated_normal([2*HIDDEN_SIZE, vocabulary_size], -0.1, 0.1), name="Wh")
    bh = tf.Variable(tf.zeros([vocabulary_size]), name="Wh")

    # Logits.
    logits_batch = tf.nn.xw_plus_b(output_batch, Wh, bh, name = "Logits")
    # Add fully connected softmax layer on top - predictions.
    prediction_batch = tf.nn.softmax(logits_batch)

 
# 4. Loss ops.
with tf.name_scope("Loss"):
    # Loss function(s) - one for every output generated by every LSTM cell.
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=target_batch, logits=logits_batch))
    # Add loss summary.
    loss_summary = tf.summary.scalar("loss", loss)

# 5. Training ops.  
with tf.name_scope("Optimization"):
    # Learning rate decay.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(0.1, global_step, 5000, 0.9, staircase=True)
    # Optimizer.
    optimizer = tf.train.AdamOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(loss)
    gradients, v = zip(*grads_and_vars)
    # Gradient clipping.
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)

# Merge all summaries.
merged_summaries = tf.summary.merge_all()

print("Graph definition OK")

Graph definition OK


In [12]:
def create_feed_dict(set_type_):
    """Creates a dictionaries for different sets: maps data onto Tensor placeholders."""
    feed_dict = dict()
    
    if set_type_=="train":
        # Get next batch and create a feed dict.
        next_batch = train_batches.next()
        # Feed batch to input buffers.
        for i in range(SEQ_LENGTH + 1):
            feed_dict[data_buffers[i]] = next_batch[i]

        # Set previous weights of read and write heades.
        for i in range(SEQ_LENGTH):
            feed_dict[prev_read_weights_seq[i]] = prev_rw_seq[i]
            feed_dict[prev_update_weights_seq[i]] = prev_uw_seq[i]

        # Reset "init" state and output of controller.
        feed_dict[init_controller_output] = np.zeros([BATCH_SIZE, HIDDEN_SIZE])
        feed_dict[init_controller_state] = np.zeros([BATCH_SIZE, HIDDEN_SIZE])
        feed_dict[init_memory_output] = np.zeros([BATCH_SIZE, HIDDEN_SIZE])
            
    elif set_type_=="valid":
        for i in range(SEQ_LENGTH + 1):
            feed_dict[data_buffers[i]] = valid_batch[i]

        # Set previous weights of read and write heades.
        for i in range(SEQ_LENGTH):
            feed_dict[prev_read_weights_seq[i]] = prev_rw_seq[i]
            feed_dict[prev_update_weights_seq[i]] = prev_uw_seq[i]

        # Reset "init" state and output of controller.
        feed_dict[init_controller_output] = np.zeros([valid_size, HIDDEN_SIZE])
        feed_dict[init_controller_state] = np.zeros([valid_size, HIDDEN_SIZE])
        feed_dict[init_memory_output] = np.zeros([valid_size, HIDDEN_SIZE])

    else: # test
        for i in range(SEQ_LENGTH + 1):
            feed_dict[data_buffers[i]] = test_batch[i]
        
        # Set previous weights of read and write heades.
        for i in range(SEQ_LENGTH):
            feed_dict[prev_read_weights_seq[i]] = prev_rw_seq[i]
            feed_dict[prev_update_weights_seq[i]] = prev_uw_seq[i]

        # Reset "init" state and output of controller.
        feed_dict[init_controller_output] = np.zeros([test_size, HIDDEN_SIZE])
        feed_dict[init_controller_state] = np.zeros([test_size, HIDDEN_SIZE])
        feed_dict[init_memory_output] = np.zeros([test_size, HIDDEN_SIZE])
       
    return feed_dict

print("Feed_dict definition OK")

Feed_dict definition OK


### Session execution

In [13]:
# Eventually clear the log dir.
if tf.gfile.Exists(LOG_DIR):
  tf.gfile.DeleteRecursively(LOG_DIR)
# Create (new) log dir.
tf.gfile.MakeDirs(LOG_DIR)

print("Log dir CLEARED")

Log dir CLEARED


In [14]:

# Create session to execute graph.
sess=tf.InteractiveSession()

# Create summary writers, point them to LOG_DIR.
train_writer = tf.summary.FileWriter(LOG_DIR + '/train', sess.graph)
valid_writer = tf.summary.FileWriter(LOG_DIR + '/valid')
test_writer = tf.summary.FileWriter(LOG_DIR + '/test')

# Initialize global variables.
tf.global_variables_initializer().run()
print('Variables initialized')


# Create initial previous read and update - full of zeros. 
prev_rw_seq = list()
prev_uw_seq = list()
for i in range(SEQ_LENGTH):
    prev_rw_seq.append(np.zeros([1, MEMORY_SLOTS]))
    prev_uw_seq.append(np.zeros([1, MEMORY_SLOTS]))

# Determine how long to perform the training and how often the test loss on validation batch will be computed. 
summary_frequency = 100
validation_frequency = 1000
#num_steps = 20 #// (BATCH_SIZE*SEQ_LENGTH)
num_steps = train_size // (BATCH_SIZE*SEQ_LENGTH)
print("Number of iterations per epoch =", num_steps)
for step in range(num_steps):
    input_seq_batch_, memory_, memory_update_seq_, prev_rw_seq, prev_uw_seq, summaries, _, loss_, lr_ = sess.run([
        input_seq_batch, memory, memory_update_seq, read_weights_seq, update_weights_seq, merged_summaries, optimizer, loss, learning_rate],
        feed_dict=create_feed_dict("train"))
    # Every (100) steps collect statistics.
    if step % summary_frequency == 0:
        # Add summary.
        train_writer.add_summary(summaries, step*BATCH_SIZE*SEQ_LENGTH)
        train_writer.flush()
        print('Training set BPC at step %d: %0.5f learning rate: %f' % (step, loss_, lr_))

        #print("memory=\n", memory_)
        #print("memory_update_seq=\n", memory_update_seq_)
        # Print loss from last batch.
    
    #if step % validation_frequency == 0:
    #    # Validation set BPC.
    #    print('=' * 80)
    #    print("Calculating BPC on validation set")
    #    v_summaries, v_loss = sess.run([merged_summaries, loss], feed_dict=create_feed_dict("valid"))
    #    print("Validation set BPC: %.5f" % v_loss)
    #    valid_writer.add_summary(v_summaries, step*BATCH_SIZE*SEQ_LENGTH)
    #    valid_writer.flush()
    # End of statistics collection

for gv in grads_and_vars:
    #print(str(sess.run(gv[0])) + " - " + gv[1].name)        
    print(" - " + gv[1].name)        

# Test set BPC.
#print('=' * 80)
#print("Calculating BPC on test set")
#t_summary, t_loss = sess.run([merged_summaries, loss], feed_dict=create_feed_dict("test"))
#print("Final test set BPC: %.5f" % t_loss)
#test_writer.add_summary(t_summary, step*BATCH_SIZE*SEQ_LENGTH)
#test_writer.flush()
    
# Close writers and session.
train_writer.close()
valid_writer.close()
test_writer.close()
sess.close() 

Variables initialized
Number of iterations per epoch = 510161
Training set BPC at step 0: 4.18776 learning rate: 0.100000
Training set BPC at step 100: 2.59062 learning rate: 0.100000
Training set BPC at step 200: 4.54680 learning rate: 0.100000
Training set BPC at step 300: 3.02341 learning rate: 0.100000
Training set BPC at step 400: 5.56842 learning rate: 0.100000
Training set BPC at step 500: 3.25376 learning rate: 0.100000
Training set BPC at step 600: 5.32287 learning rate: 0.100000
Training set BPC at step 700: 5.54398 learning rate: 0.100000
Training set BPC at step 800: 3.32321 learning rate: 0.100000
Training set BPC at step 900: 4.75889 learning rate: 0.100000
Training set BPC at step 1000: 2.33149 learning rate: 0.100000
Training set BPC at step 1100: 1.82037 learning rate: 0.100000
Training set BPC at step 1200: 3.33494 learning rate: 0.100000
Training set BPC at step 1300: 1.35400 learning rate: 0.100000
Training set BPC at step 1400: 3.93246 learning rate: 0.100000
Train

Training set BPC at step 13000: 3.36748 learning rate: 0.081000
Training set BPC at step 13100: 6.11184 learning rate: 0.081000
Training set BPC at step 13200: 3.62969 learning rate: 0.081000
Training set BPC at step 13300: 2.91092 learning rate: 0.081000
Training set BPC at step 13400: 4.19733 learning rate: 0.081000
Training set BPC at step 13500: 4.48831 learning rate: 0.081000
Training set BPC at step 13600: 2.59278 learning rate: 0.081000
Training set BPC at step 13700: 5.50830 learning rate: 0.081000
Training set BPC at step 13800: 2.68724 learning rate: 0.081000
Training set BPC at step 13900: 3.03212 learning rate: 0.081000
Training set BPC at step 14000: 2.13325 learning rate: 0.081000
Training set BPC at step 14100: 4.29360 learning rate: 0.081000
Training set BPC at step 14200: 2.43481 learning rate: 0.081000
Training set BPC at step 14300: 3.86337 learning rate: 0.081000
Training set BPC at step 14400: 2.48660 learning rate: 0.081000
Training set BPC at step 14500: 1.81104 

Training set BPC at step 25900: 2.28779 learning rate: 0.059049
Training set BPC at step 26000: 2.44139 learning rate: 0.059049
Training set BPC at step 26100: 3.60353 learning rate: 0.059049
Training set BPC at step 26200: 3.27338 learning rate: 0.059049
Training set BPC at step 26300: 2.31089 learning rate: 0.059049
Training set BPC at step 26400: 3.37359 learning rate: 0.059049
Training set BPC at step 26500: 2.78606 learning rate: 0.059049
Training set BPC at step 26600: 3.27165 learning rate: 0.059049
Training set BPC at step 26700: 3.56621 learning rate: 0.059049
Training set BPC at step 26800: 4.63505 learning rate: 0.059049
Training set BPC at step 26900: 3.07493 learning rate: 0.059049
Training set BPC at step 27000: 3.08880 learning rate: 0.059049
Training set BPC at step 27100: 2.53921 learning rate: 0.059049
Training set BPC at step 27200: 2.54696 learning rate: 0.059049
Training set BPC at step 27300: 3.67688 learning rate: 0.059049
Training set BPC at step 27400: 2.28149 

Training set BPC at step 38900: 4.94309 learning rate: 0.047830
Training set BPC at step 39000: 4.39126 learning rate: 0.047830
Training set BPC at step 39100: 3.31236 learning rate: 0.047830
Training set BPC at step 39200: 4.21805 learning rate: 0.047830
Training set BPC at step 39300: 3.16632 learning rate: 0.047830
Training set BPC at step 39400: 2.63365 learning rate: 0.047830
Training set BPC at step 39500: 3.20886 learning rate: 0.047830
Training set BPC at step 39600: 3.64808 learning rate: 0.047830
Training set BPC at step 39700: 4.07551 learning rate: 0.047830
Training set BPC at step 39800: 3.00507 learning rate: 0.047830
Training set BPC at step 39900: 2.69117 learning rate: 0.047830
Training set BPC at step 40000: 1.87040 learning rate: 0.043047
Training set BPC at step 40100: 3.19790 learning rate: 0.043047
Training set BPC at step 40200: 2.79354 learning rate: 0.043047
Training set BPC at step 40300: 2.72434 learning rate: 0.043047
Training set BPC at step 40400: 1.71024 

Training set BPC at step 51900: 2.34628 learning rate: 0.034868
Training set BPC at step 52000: 1.95522 learning rate: 0.034868
Training set BPC at step 52100: 3.38315 learning rate: 0.034868
Training set BPC at step 52200: 2.99452 learning rate: 0.034868
Training set BPC at step 52300: 2.60188 learning rate: 0.034868
Training set BPC at step 52400: 2.70547 learning rate: 0.034868
Training set BPC at step 52500: 2.73543 learning rate: 0.034868
Training set BPC at step 52600: 2.45705 learning rate: 0.034868
Training set BPC at step 52700: 2.07877 learning rate: 0.034868
Training set BPC at step 52800: 3.17305 learning rate: 0.034868
Training set BPC at step 52900: 2.42558 learning rate: 0.034868
Training set BPC at step 53000: 2.36168 learning rate: 0.034868
Training set BPC at step 53100: 2.24363 learning rate: 0.034868
Training set BPC at step 53200: 2.63435 learning rate: 0.034868
Training set BPC at step 53300: 2.24943 learning rate: 0.034868
Training set BPC at step 53400: 1.95712 

Training set BPC at step 64800: 2.46714 learning rate: 0.028243
Training set BPC at step 64900: 2.54016 learning rate: 0.028243
Training set BPC at step 65000: 3.27339 learning rate: 0.025419
Training set BPC at step 65100: 2.36551 learning rate: 0.025419
Training set BPC at step 65200: 2.76080 learning rate: 0.025419
Training set BPC at step 65300: 3.42646 learning rate: 0.025419
Training set BPC at step 65400: 2.29833 learning rate: 0.025419
Training set BPC at step 65500: 2.80353 learning rate: 0.025419
Training set BPC at step 65600: 2.98587 learning rate: 0.025419
Training set BPC at step 65700: 2.37229 learning rate: 0.025419
Training set BPC at step 65800: 1.82004 learning rate: 0.025419
Training set BPC at step 65900: 2.29103 learning rate: 0.025419
Training set BPC at step 66000: 1.87881 learning rate: 0.025419
Training set BPC at step 66100: 2.43048 learning rate: 0.025419
Training set BPC at step 66200: 3.49084 learning rate: 0.025419
Training set BPC at step 66300: 2.47095 

Training set BPC at step 77700: 2.68800 learning rate: 0.020589
Training set BPC at step 77800: 1.09731 learning rate: 0.020589
Training set BPC at step 77900: 2.47082 learning rate: 0.020589
Training set BPC at step 78000: 2.83054 learning rate: 0.020589
Training set BPC at step 78100: 2.20855 learning rate: 0.020589
Training set BPC at step 78200: 2.32936 learning rate: 0.020589
Training set BPC at step 78300: 3.08512 learning rate: 0.020589
Training set BPC at step 78400: 2.80368 learning rate: 0.020589
Training set BPC at step 78500: 1.89093 learning rate: 0.020589
Training set BPC at step 78600: 2.79943 learning rate: 0.020589
Training set BPC at step 78700: 2.33416 learning rate: 0.020589
Training set BPC at step 78800: 1.73797 learning rate: 0.020589
Training set BPC at step 78900: 2.45076 learning rate: 0.020589
Training set BPC at step 79000: 2.76721 learning rate: 0.020589
Training set BPC at step 79100: 2.24409 learning rate: 0.020589
Training set BPC at step 79200: 2.55726 

Training set BPC at step 90700: 2.71862 learning rate: 0.015009
Training set BPC at step 90800: 2.80588 learning rate: 0.015009
Training set BPC at step 90900: 2.98941 learning rate: 0.015009
Training set BPC at step 91000: 2.50106 learning rate: 0.015009
Training set BPC at step 91100: 2.53369 learning rate: 0.015009
Training set BPC at step 91200: 3.22356 learning rate: 0.015009
Training set BPC at step 91300: 2.57307 learning rate: 0.015009
Training set BPC at step 91400: 2.52296 learning rate: 0.015009
Training set BPC at step 91500: 2.81383 learning rate: 0.015009
Training set BPC at step 91600: 2.65154 learning rate: 0.015009
Training set BPC at step 91700: 2.32324 learning rate: 0.015009
Training set BPC at step 91800: 2.18284 learning rate: 0.015009
Training set BPC at step 91900: 2.18615 learning rate: 0.015009
Training set BPC at step 92000: 3.33132 learning rate: 0.015009
Training set BPC at step 92100: 2.35269 learning rate: 0.015009
Training set BPC at step 92200: 2.74248 

Training set BPC at step 103500: 2.68072 learning rate: 0.012158
Training set BPC at step 103600: 2.60335 learning rate: 0.012158
Training set BPC at step 103700: 3.05388 learning rate: 0.012158
Training set BPC at step 103800: 2.45404 learning rate: 0.012158
Training set BPC at step 103900: 2.83223 learning rate: 0.012158
Training set BPC at step 104000: 2.49410 learning rate: 0.012158
Training set BPC at step 104100: 2.77057 learning rate: 0.012158
Training set BPC at step 104200: 2.30361 learning rate: 0.012158
Training set BPC at step 104300: 2.53602 learning rate: 0.012158
Training set BPC at step 104400: 2.47981 learning rate: 0.012158
Training set BPC at step 104500: 2.27416 learning rate: 0.012158
Training set BPC at step 104600: 2.25725 learning rate: 0.012158
Training set BPC at step 104700: 1.97939 learning rate: 0.012158
Training set BPC at step 104800: 2.28278 learning rate: 0.012158
Training set BPC at step 104900: 2.95188 learning rate: 0.012158
Training set BPC at step 

Training set BPC at step 116300: 1.84256 learning rate: 0.008863
Training set BPC at step 116400: 2.16162 learning rate: 0.008863
Training set BPC at step 116500: 1.86413 learning rate: 0.008863
Training set BPC at step 116600: 3.20870 learning rate: 0.008863
Training set BPC at step 116700: 1.94904 learning rate: 0.008863
Training set BPC at step 116800: 2.23934 learning rate: 0.008863
Training set BPC at step 116900: 2.36880 learning rate: 0.008863
Training set BPC at step 117000: 2.12996 learning rate: 0.008863
Training set BPC at step 117100: 2.21016 learning rate: 0.008863
Training set BPC at step 117200: 3.19148 learning rate: 0.008863
Training set BPC at step 117300: 2.97315 learning rate: 0.008863
Training set BPC at step 117400: 1.85356 learning rate: 0.008863
Training set BPC at step 117500: 2.65729 learning rate: 0.008863
Training set BPC at step 117600: 2.54238 learning rate: 0.008863
Training set BPC at step 117700: 2.24335 learning rate: 0.008863
Training set BPC at step 

Training set BPC at step 129100: 2.76477 learning rate: 0.007179
Training set BPC at step 129200: 2.92224 learning rate: 0.007179
Training set BPC at step 129300: 2.29042 learning rate: 0.007179
Training set BPC at step 129400: 2.75098 learning rate: 0.007179
Training set BPC at step 129500: 2.59283 learning rate: 0.007179
Training set BPC at step 129600: 2.22806 learning rate: 0.007179
Training set BPC at step 129700: 1.80167 learning rate: 0.007179
Training set BPC at step 129800: 2.34907 learning rate: 0.007179
Training set BPC at step 129900: 2.08581 learning rate: 0.007179
Training set BPC at step 130000: 2.13832 learning rate: 0.006461
Training set BPC at step 130100: 2.34156 learning rate: 0.006461
Training set BPC at step 130200: 2.76593 learning rate: 0.006461
Training set BPC at step 130300: 2.86751 learning rate: 0.006461
Training set BPC at step 130400: 2.13704 learning rate: 0.006461
Training set BPC at step 130500: 2.05415 learning rate: 0.006461
Training set BPC at step 

Training set BPC at step 141800: 1.89551 learning rate: 0.005233
Training set BPC at step 141900: 2.30432 learning rate: 0.005233
Training set BPC at step 142000: 2.47604 learning rate: 0.005233
Training set BPC at step 142100: 2.38201 learning rate: 0.005233
Training set BPC at step 142200: 2.39120 learning rate: 0.005233
Training set BPC at step 142300: 3.04862 learning rate: 0.005233
Training set BPC at step 142400: 1.98135 learning rate: 0.005233
Training set BPC at step 142500: 2.85751 learning rate: 0.005233
Training set BPC at step 142600: 2.97559 learning rate: 0.005233
Training set BPC at step 142700: 2.37739 learning rate: 0.005233
Training set BPC at step 142800: 2.82392 learning rate: 0.005233
Training set BPC at step 142900: 2.02575 learning rate: 0.005233
Training set BPC at step 143000: 2.05185 learning rate: 0.005233
Training set BPC at step 143100: 2.31758 learning rate: 0.005233
Training set BPC at step 143200: 2.54738 learning rate: 0.005233
Training set BPC at step 

Training set BPC at step 154600: 2.79400 learning rate: 0.004239
Training set BPC at step 154700: 2.29836 learning rate: 0.004239
Training set BPC at step 154800: 2.34797 learning rate: 0.004239
Training set BPC at step 154900: 2.35880 learning rate: 0.004239
Training set BPC at step 155000: 2.11187 learning rate: 0.003815
Training set BPC at step 155100: 1.64017 learning rate: 0.003815
Training set BPC at step 155200: 2.26333 learning rate: 0.003815
Training set BPC at step 155300: 2.92883 learning rate: 0.003815
Training set BPC at step 155400: 2.11446 learning rate: 0.003815
Training set BPC at step 155500: 2.21438 learning rate: 0.003815
Training set BPC at step 155600: 2.41359 learning rate: 0.003815
Training set BPC at step 155700: 1.82015 learning rate: 0.003815
Training set BPC at step 155800: 2.50989 learning rate: 0.003815
Training set BPC at step 155900: 2.07680 learning rate: 0.003815
Training set BPC at step 156000: 2.16646 learning rate: 0.003815
Training set BPC at step 

Training set BPC at step 167400: 2.64227 learning rate: 0.003090
Training set BPC at step 167500: 2.07186 learning rate: 0.003090
Training set BPC at step 167600: 2.37756 learning rate: 0.003090
Training set BPC at step 167700: 2.36681 learning rate: 0.003090
Training set BPC at step 167800: 2.72759 learning rate: 0.003090
Training set BPC at step 167900: 2.42545 learning rate: 0.003090
Training set BPC at step 168000: 2.04359 learning rate: 0.003090
Training set BPC at step 168100: 2.87253 learning rate: 0.003090
Training set BPC at step 168200: 2.23251 learning rate: 0.003090
Training set BPC at step 168300: 1.87042 learning rate: 0.003090
Training set BPC at step 168400: 2.14498 learning rate: 0.003090
Training set BPC at step 168500: 2.39043 learning rate: 0.003090
Training set BPC at step 168600: 2.11356 learning rate: 0.003090
Training set BPC at step 168700: 2.63449 learning rate: 0.003090
Training set BPC at step 168800: 2.31419 learning rate: 0.003090
Training set BPC at step 

Training set BPC at step 180200: 2.58533 learning rate: 0.002253
Training set BPC at step 180300: 2.73074 learning rate: 0.002253
Training set BPC at step 180400: 2.38322 learning rate: 0.002253
Training set BPC at step 180500: 2.25755 learning rate: 0.002253
Training set BPC at step 180600: 2.57928 learning rate: 0.002253
Training set BPC at step 180700: 2.16039 learning rate: 0.002253
Training set BPC at step 180800: 1.92856 learning rate: 0.002253
Training set BPC at step 180900: 2.32086 learning rate: 0.002253
Training set BPC at step 181000: 2.75090 learning rate: 0.002253
Training set BPC at step 181100: 2.06194 learning rate: 0.002253
Training set BPC at step 181200: 3.22747 learning rate: 0.002253
Training set BPC at step 181300: 2.20417 learning rate: 0.002253
Training set BPC at step 181400: 2.15277 learning rate: 0.002253
Training set BPC at step 181500: 2.26838 learning rate: 0.002253
Training set BPC at step 181600: 2.41662 learning rate: 0.002253
Training set BPC at step 

Training set BPC at step 192900: 2.89547 learning rate: 0.001825
Training set BPC at step 193000: 2.48242 learning rate: 0.001825
Training set BPC at step 193100: 2.51107 learning rate: 0.001825
Training set BPC at step 193200: 2.16694 learning rate: 0.001825
Training set BPC at step 193300: 2.81731 learning rate: 0.001825
Training set BPC at step 193400: 2.49964 learning rate: 0.001825
Training set BPC at step 193500: 2.79827 learning rate: 0.001825
Training set BPC at step 193600: 2.12083 learning rate: 0.001825
Training set BPC at step 193700: 2.72427 learning rate: 0.001825
Training set BPC at step 193800: 2.80993 learning rate: 0.001825
Training set BPC at step 193900: 2.72095 learning rate: 0.001825
Training set BPC at step 194000: 1.80464 learning rate: 0.001825
Training set BPC at step 194100: 2.49835 learning rate: 0.001825
Training set BPC at step 194200: 2.21267 learning rate: 0.001825
Training set BPC at step 194300: 1.47793 learning rate: 0.001825
Training set BPC at step 

Training set BPC at step 205700: 2.09863 learning rate: 0.001330
Training set BPC at step 205800: 2.43739 learning rate: 0.001330
Training set BPC at step 205900: 2.07108 learning rate: 0.001330
Training set BPC at step 206000: 2.34593 learning rate: 0.001330
Training set BPC at step 206100: 2.30571 learning rate: 0.001330
Training set BPC at step 206200: 2.51324 learning rate: 0.001330
Training set BPC at step 206300: 1.80338 learning rate: 0.001330
Training set BPC at step 206400: 1.90399 learning rate: 0.001330
Training set BPC at step 206500: 2.31428 learning rate: 0.001330
Training set BPC at step 206600: 2.24308 learning rate: 0.001330
Training set BPC at step 206700: 2.47049 learning rate: 0.001330
Training set BPC at step 206800: 2.49354 learning rate: 0.001330
Training set BPC at step 206900: 2.70873 learning rate: 0.001330
Training set BPC at step 207000: 2.23206 learning rate: 0.001330
Training set BPC at step 207100: 2.80701 learning rate: 0.001330
Training set BPC at step 

Training set BPC at step 218400: 1.77147 learning rate: 0.001078
Training set BPC at step 218500: 1.93866 learning rate: 0.001078
Training set BPC at step 218600: 2.23754 learning rate: 0.001078
Training set BPC at step 218700: 2.13821 learning rate: 0.001078
Training set BPC at step 218800: 2.01649 learning rate: 0.001078
Training set BPC at step 218900: 2.22174 learning rate: 0.001078
Training set BPC at step 219000: 2.74160 learning rate: 0.001078
Training set BPC at step 219100: 2.73035 learning rate: 0.001078
Training set BPC at step 219200: 2.24695 learning rate: 0.001078
Training set BPC at step 219300: 1.77201 learning rate: 0.001078
Training set BPC at step 219400: 2.65462 learning rate: 0.001078
Training set BPC at step 219500: 2.90322 learning rate: 0.001078
Training set BPC at step 219600: 2.62209 learning rate: 0.001078
Training set BPC at step 219700: 2.42367 learning rate: 0.001078
Training set BPC at step 219800: 2.39394 learning rate: 0.001078
Training set BPC at step 

Training set BPC at step 231100: 2.59172 learning rate: 0.000786
Training set BPC at step 231200: 2.91140 learning rate: 0.000786
Training set BPC at step 231300: 2.17788 learning rate: 0.000786
Training set BPC at step 231400: 2.75065 learning rate: 0.000786
Training set BPC at step 231500: 2.44251 learning rate: 0.000786
Training set BPC at step 231600: 2.55206 learning rate: 0.000786
Training set BPC at step 231700: 2.53966 learning rate: 0.000786
Training set BPC at step 231800: 2.34137 learning rate: 0.000786
Training set BPC at step 231900: 2.51320 learning rate: 0.000786
Training set BPC at step 232000: 2.44954 learning rate: 0.000786
Training set BPC at step 232100: 2.23576 learning rate: 0.000786
Training set BPC at step 232200: 3.51660 learning rate: 0.000786
Training set BPC at step 232300: 2.76267 learning rate: 0.000786
Training set BPC at step 232400: 2.42329 learning rate: 0.000786
Training set BPC at step 232500: 2.74575 learning rate: 0.000786
Training set BPC at step 

Training set BPC at step 243800: 2.34090 learning rate: 0.000636
Training set BPC at step 243900: 1.88554 learning rate: 0.000636
Training set BPC at step 244000: 2.53087 learning rate: 0.000636
Training set BPC at step 244100: 2.41271 learning rate: 0.000636
Training set BPC at step 244200: 2.87806 learning rate: 0.000636
Training set BPC at step 244300: 2.30885 learning rate: 0.000636
Training set BPC at step 244400: 2.14602 learning rate: 0.000636
Training set BPC at step 244500: 2.31418 learning rate: 0.000636
Training set BPC at step 244600: 2.85754 learning rate: 0.000636
Training set BPC at step 244700: 2.10017 learning rate: 0.000636
Training set BPC at step 244800: 2.60801 learning rate: 0.000636
Training set BPC at step 244900: 2.62388 learning rate: 0.000636
Training set BPC at step 245000: 1.90484 learning rate: 0.000573
Training set BPC at step 245100: 2.05116 learning rate: 0.000573
Training set BPC at step 245200: 2.40847 learning rate: 0.000573
Training set BPC at step 

Training set BPC at step 256600: 2.21056 learning rate: 0.000464
Training set BPC at step 256700: 2.54916 learning rate: 0.000464
Training set BPC at step 256800: 2.08254 learning rate: 0.000464
Training set BPC at step 256900: 2.28009 learning rate: 0.000464
Training set BPC at step 257000: 2.46321 learning rate: 0.000464
Training set BPC at step 257100: 1.64105 learning rate: 0.000464
Training set BPC at step 257200: 2.93716 learning rate: 0.000464
Training set BPC at step 257300: 2.03921 learning rate: 0.000464
Training set BPC at step 257400: 3.37611 learning rate: 0.000464
Training set BPC at step 257500: 2.30747 learning rate: 0.000464
Training set BPC at step 257600: 2.11779 learning rate: 0.000464
Training set BPC at step 257700: 2.11497 learning rate: 0.000464
Training set BPC at step 257800: 2.45555 learning rate: 0.000464
Training set BPC at step 257900: 2.20513 learning rate: 0.000464
Training set BPC at step 258000: 2.14640 learning rate: 0.000464
Training set BPC at step 

Training set BPC at step 269300: 2.09720 learning rate: 0.000376
Training set BPC at step 269400: 2.28228 learning rate: 0.000376
Training set BPC at step 269500: 2.59371 learning rate: 0.000376
Training set BPC at step 269600: 2.31357 learning rate: 0.000376
Training set BPC at step 269700: 2.02093 learning rate: 0.000376
Training set BPC at step 269800: 1.73159 learning rate: 0.000376
Training set BPC at step 269900: 2.89871 learning rate: 0.000376
Training set BPC at step 270000: 2.81207 learning rate: 0.000338
Training set BPC at step 270100: 2.13401 learning rate: 0.000338
Training set BPC at step 270200: 2.17802 learning rate: 0.000338
Training set BPC at step 270300: 2.43234 learning rate: 0.000338
Training set BPC at step 270400: 2.34751 learning rate: 0.000338
Training set BPC at step 270500: 2.25904 learning rate: 0.000338
Training set BPC at step 270600: 2.17007 learning rate: 0.000338
Training set BPC at step 270700: 2.59403 learning rate: 0.000338
Training set BPC at step 

Training set BPC at step 282000: 1.92777 learning rate: 0.000274
Training set BPC at step 282100: 2.92019 learning rate: 0.000274
Training set BPC at step 282200: 2.40936 learning rate: 0.000274
Training set BPC at step 282300: 2.78789 learning rate: 0.000274
Training set BPC at step 282400: 1.94702 learning rate: 0.000274
Training set BPC at step 282500: 3.10435 learning rate: 0.000274
Training set BPC at step 282600: 1.42970 learning rate: 0.000274
Training set BPC at step 282700: 2.02114 learning rate: 0.000274
Training set BPC at step 282800: 1.92882 learning rate: 0.000274
Training set BPC at step 282900: 1.90654 learning rate: 0.000274
Training set BPC at step 283000: 2.55759 learning rate: 0.000274
Training set BPC at step 283100: 1.89270 learning rate: 0.000274
Training set BPC at step 283200: 2.23879 learning rate: 0.000274
Training set BPC at step 283300: 2.11728 learning rate: 0.000274
Training set BPC at step 283400: 1.89247 learning rate: 0.000274
Training set BPC at step 

Training set BPC at step 294700: 1.83407 learning rate: 0.000222
Training set BPC at step 294800: 1.95907 learning rate: 0.000222
Training set BPC at step 294900: 3.06865 learning rate: 0.000222
Training set BPC at step 295000: 2.32522 learning rate: 0.000200
Training set BPC at step 295100: 2.33210 learning rate: 0.000200
Training set BPC at step 295200: 2.38753 learning rate: 0.000200
Training set BPC at step 295300: 2.44317 learning rate: 0.000200
Training set BPC at step 295400: 2.49914 learning rate: 0.000200
Training set BPC at step 295500: 1.69961 learning rate: 0.000200
Training set BPC at step 295600: 1.78467 learning rate: 0.000200
Training set BPC at step 295700: 2.27833 learning rate: 0.000200
Training set BPC at step 295800: 2.60488 learning rate: 0.000200
Training set BPC at step 295900: 2.23419 learning rate: 0.000200
Training set BPC at step 296000: 2.42424 learning rate: 0.000200
Training set BPC at step 296100: 2.35726 learning rate: 0.000200
Training set BPC at step 

Training set BPC at step 307500: 2.23540 learning rate: 0.000162
Training set BPC at step 307600: 1.78846 learning rate: 0.000162
Training set BPC at step 307700: 2.47704 learning rate: 0.000162
Training set BPC at step 307800: 2.63485 learning rate: 0.000162
Training set BPC at step 307900: 2.48326 learning rate: 0.000162
Training set BPC at step 308000: 2.48762 learning rate: 0.000162
Training set BPC at step 308100: 2.66710 learning rate: 0.000162
Training set BPC at step 308200: 2.80782 learning rate: 0.000162
Training set BPC at step 308300: 2.44694 learning rate: 0.000162
Training set BPC at step 308400: 2.30601 learning rate: 0.000162
Training set BPC at step 308500: 1.97364 learning rate: 0.000162
Training set BPC at step 308600: 2.26910 learning rate: 0.000162
Training set BPC at step 308700: 2.04463 learning rate: 0.000162
Training set BPC at step 308800: 2.30040 learning rate: 0.000162
Training set BPC at step 308900: 2.55316 learning rate: 0.000162
Training set BPC at step 

Training set BPC at step 320300: 2.48775 learning rate: 0.000118
Training set BPC at step 320400: 2.29147 learning rate: 0.000118
Training set BPC at step 320500: 1.67171 learning rate: 0.000118
Training set BPC at step 320600: 1.46570 learning rate: 0.000118
Training set BPC at step 320700: 2.87241 learning rate: 0.000118
Training set BPC at step 320800: 1.94700 learning rate: 0.000118
Training set BPC at step 320900: 1.71757 learning rate: 0.000118
Training set BPC at step 321000: 2.42424 learning rate: 0.000118
Training set BPC at step 321100: 2.22700 learning rate: 0.000118
Training set BPC at step 321200: 1.93017 learning rate: 0.000118
Training set BPC at step 321300: 2.12286 learning rate: 0.000118
Training set BPC at step 321400: 2.36120 learning rate: 0.000118
Training set BPC at step 321500: 1.71320 learning rate: 0.000118
Training set BPC at step 321600: 2.22400 learning rate: 0.000118
Training set BPC at step 321700: 2.98059 learning rate: 0.000118
Training set BPC at step 

Training set BPC at step 333000: 2.29098 learning rate: 0.000096
Training set BPC at step 333100: 2.41479 learning rate: 0.000096
Training set BPC at step 333200: 2.51998 learning rate: 0.000096
Training set BPC at step 333300: 2.13686 learning rate: 0.000096
Training set BPC at step 333400: 2.13541 learning rate: 0.000096
Training set BPC at step 333500: 2.39657 learning rate: 0.000096
Training set BPC at step 333600: 1.82772 learning rate: 0.000096
Training set BPC at step 333700: 2.09980 learning rate: 0.000096
Training set BPC at step 333800: 2.15596 learning rate: 0.000096
Training set BPC at step 333900: 2.53805 learning rate: 0.000096
Training set BPC at step 334000: 2.78836 learning rate: 0.000096
Training set BPC at step 334100: 2.27441 learning rate: 0.000096
Training set BPC at step 334200: 2.56492 learning rate: 0.000096
Training set BPC at step 334300: 2.28938 learning rate: 0.000096
Training set BPC at step 334400: 2.04304 learning rate: 0.000096
Training set BPC at step 

Training set BPC at step 345700: 2.40564 learning rate: 0.000070
Training set BPC at step 345800: 3.04762 learning rate: 0.000070
Training set BPC at step 345900: 1.65699 learning rate: 0.000070
Training set BPC at step 346000: 2.78924 learning rate: 0.000070
Training set BPC at step 346100: 1.52901 learning rate: 0.000070
Training set BPC at step 346200: 2.45055 learning rate: 0.000070
Training set BPC at step 346300: 2.62223 learning rate: 0.000070
Training set BPC at step 346400: 2.07303 learning rate: 0.000070
Training set BPC at step 346500: 1.80701 learning rate: 0.000070
Training set BPC at step 346600: 2.26136 learning rate: 0.000070
Training set BPC at step 346700: 2.56106 learning rate: 0.000070
Training set BPC at step 346800: 2.12016 learning rate: 0.000070
Training set BPC at step 346900: 1.93056 learning rate: 0.000070
Training set BPC at step 347000: 2.50830 learning rate: 0.000070
Training set BPC at step 347100: 2.39651 learning rate: 0.000070
Training set BPC at step 

Training set BPC at step 358500: 1.77289 learning rate: 0.000056
Training set BPC at step 358600: 2.16651 learning rate: 0.000056
Training set BPC at step 358700: 2.03037 learning rate: 0.000056
Training set BPC at step 358800: 2.50629 learning rate: 0.000056
Training set BPC at step 358900: 1.95558 learning rate: 0.000056
Training set BPC at step 359000: 1.95583 learning rate: 0.000056
Training set BPC at step 359100: 1.93558 learning rate: 0.000056
Training set BPC at step 359200: 2.99942 learning rate: 0.000056
Training set BPC at step 359300: 2.84496 learning rate: 0.000056
Training set BPC at step 359400: 1.97182 learning rate: 0.000056
Training set BPC at step 359500: 2.74327 learning rate: 0.000056
Training set BPC at step 359600: 2.79228 learning rate: 0.000056
Training set BPC at step 359700: 2.46260 learning rate: 0.000056
Training set BPC at step 359800: 2.02864 learning rate: 0.000056
Training set BPC at step 359900: 2.47097 learning rate: 0.000056
Training set BPC at step 

Training set BPC at step 371200: 2.55167 learning rate: 0.000041
Training set BPC at step 371300: 2.36053 learning rate: 0.000041
Training set BPC at step 371400: 2.31934 learning rate: 0.000041
Training set BPC at step 371500: 3.02319 learning rate: 0.000041
Training set BPC at step 371600: 3.14172 learning rate: 0.000041
Training set BPC at step 371700: 2.06592 learning rate: 0.000041
Training set BPC at step 371800: 2.46218 learning rate: 0.000041
Training set BPC at step 371900: 2.07487 learning rate: 0.000041
Training set BPC at step 372000: 2.08549 learning rate: 0.000041
Training set BPC at step 372100: 2.46258 learning rate: 0.000041
Training set BPC at step 372200: 2.26833 learning rate: 0.000041
Training set BPC at step 372300: 2.91942 learning rate: 0.000041
Training set BPC at step 372400: 1.95902 learning rate: 0.000041
Training set BPC at step 372500: 2.29902 learning rate: 0.000041
Training set BPC at step 372600: 2.67655 learning rate: 0.000041
Training set BPC at step 

Training set BPC at step 384000: 2.61185 learning rate: 0.000033
Training set BPC at step 384100: 1.87608 learning rate: 0.000033
Training set BPC at step 384200: 2.24358 learning rate: 0.000033
Training set BPC at step 384300: 1.87772 learning rate: 0.000033
Training set BPC at step 384400: 2.35585 learning rate: 0.000033
Training set BPC at step 384500: 2.47418 learning rate: 0.000033
Training set BPC at step 384600: 2.54625 learning rate: 0.000033
Training set BPC at step 384700: 2.44866 learning rate: 0.000033
Training set BPC at step 384800: 2.18240 learning rate: 0.000033
Training set BPC at step 384900: 2.66672 learning rate: 0.000033
Training set BPC at step 385000: 2.53059 learning rate: 0.000030
Training set BPC at step 385100: 2.46612 learning rate: 0.000030
Training set BPC at step 385200: 1.25958 learning rate: 0.000030
Training set BPC at step 385300: 2.30610 learning rate: 0.000030
Training set BPC at step 385400: 1.46965 learning rate: 0.000030
Training set BPC at step 

Training set BPC at step 396800: 2.46786 learning rate: 0.000024
Training set BPC at step 396900: 2.30368 learning rate: 0.000024
Training set BPC at step 397000: 2.44161 learning rate: 0.000024
Training set BPC at step 397100: 1.94677 learning rate: 0.000024
Training set BPC at step 397200: 2.30499 learning rate: 0.000024
Training set BPC at step 397300: 1.50955 learning rate: 0.000024
Training set BPC at step 397400: 2.29366 learning rate: 0.000024
Training set BPC at step 397500: 2.33028 learning rate: 0.000024
Training set BPC at step 397600: 2.15607 learning rate: 0.000024
Training set BPC at step 397700: 2.18629 learning rate: 0.000024
Training set BPC at step 397800: 2.30479 learning rate: 0.000024
Training set BPC at step 397900: 2.01390 learning rate: 0.000024
Training set BPC at step 398000: 1.85300 learning rate: 0.000024
Training set BPC at step 398100: 2.26866 learning rate: 0.000024
Training set BPC at step 398200: 2.23194 learning rate: 0.000024
Training set BPC at step 

Training set BPC at step 409500: 1.69525 learning rate: 0.000020
Training set BPC at step 409600: 2.61420 learning rate: 0.000020
Training set BPC at step 409700: 2.43668 learning rate: 0.000020
Training set BPC at step 409800: 2.30947 learning rate: 0.000020
Training set BPC at step 409900: 2.29498 learning rate: 0.000020
Training set BPC at step 410000: 2.74305 learning rate: 0.000018
Training set BPC at step 410100: 3.03506 learning rate: 0.000018
Training set BPC at step 410200: 2.67235 learning rate: 0.000018
Training set BPC at step 410300: 2.29359 learning rate: 0.000018
Training set BPC at step 410400: 2.42508 learning rate: 0.000018
Training set BPC at step 410500: 2.65457 learning rate: 0.000018
Training set BPC at step 410600: 2.51655 learning rate: 0.000018
Training set BPC at step 410700: 2.26592 learning rate: 0.000018
Training set BPC at step 410800: 2.86756 learning rate: 0.000018
Training set BPC at step 410900: 2.70102 learning rate: 0.000018
Training set BPC at step 

Training set BPC at step 422200: 2.14953 learning rate: 0.000014
Training set BPC at step 422300: 1.64315 learning rate: 0.000014
Training set BPC at step 422400: 2.69012 learning rate: 0.000014
Training set BPC at step 422500: 2.76561 learning rate: 0.000014
Training set BPC at step 422600: 2.15699 learning rate: 0.000014
Training set BPC at step 422700: 2.21742 learning rate: 0.000014
Training set BPC at step 422800: 2.32604 learning rate: 0.000014
Training set BPC at step 422900: 2.35283 learning rate: 0.000014
Training set BPC at step 423000: 2.87086 learning rate: 0.000014
Training set BPC at step 423100: 2.64175 learning rate: 0.000014
Training set BPC at step 423200: 2.46749 learning rate: 0.000014
Training set BPC at step 423300: 2.48566 learning rate: 0.000014
Training set BPC at step 423400: 2.94122 learning rate: 0.000014
Training set BPC at step 423500: 2.84590 learning rate: 0.000014
Training set BPC at step 423600: 2.07978 learning rate: 0.000014
Training set BPC at step 

Training set BPC at step 435000: 2.32338 learning rate: 0.000010
Training set BPC at step 435100: 2.43461 learning rate: 0.000010
Training set BPC at step 435200: 2.39744 learning rate: 0.000010
Training set BPC at step 435300: 1.76095 learning rate: 0.000010
Training set BPC at step 435400: 2.57162 learning rate: 0.000010
Training set BPC at step 435500: 2.22992 learning rate: 0.000010
Training set BPC at step 435600: 2.00699 learning rate: 0.000010
Training set BPC at step 435700: 2.20099 learning rate: 0.000010
Training set BPC at step 435800: 2.46127 learning rate: 0.000010
Training set BPC at step 435900: 2.48317 learning rate: 0.000010
Training set BPC at step 436000: 2.27174 learning rate: 0.000010
Training set BPC at step 436100: 3.23223 learning rate: 0.000010
Training set BPC at step 436200: 1.95879 learning rate: 0.000010
Training set BPC at step 436300: 2.63371 learning rate: 0.000010
Training set BPC at step 436400: 2.17418 learning rate: 0.000010
Training set BPC at step 

Training set BPC at step 447700: 1.95646 learning rate: 0.000008
Training set BPC at step 447800: 2.05334 learning rate: 0.000008
Training set BPC at step 447900: 2.72737 learning rate: 0.000008
Training set BPC at step 448000: 2.99404 learning rate: 0.000008
Training set BPC at step 448100: 2.04257 learning rate: 0.000008
Training set BPC at step 448200: 1.67350 learning rate: 0.000008
Training set BPC at step 448300: 2.31296 learning rate: 0.000008
Training set BPC at step 448400: 2.34587 learning rate: 0.000008
Training set BPC at step 448500: 2.47261 learning rate: 0.000008
Training set BPC at step 448600: 2.37325 learning rate: 0.000008
Training set BPC at step 448700: 2.15711 learning rate: 0.000008
Training set BPC at step 448800: 2.16984 learning rate: 0.000008
Training set BPC at step 448900: 3.13967 learning rate: 0.000008
Training set BPC at step 449000: 2.08935 learning rate: 0.000008
Training set BPC at step 449100: 2.27990 learning rate: 0.000008
Training set BPC at step 

Training set BPC at step 460500: 2.26440 learning rate: 0.000006
Training set BPC at step 460600: 2.84828 learning rate: 0.000006
Training set BPC at step 460700: 1.97175 learning rate: 0.000006
Training set BPC at step 460800: 2.76211 learning rate: 0.000006
Training set BPC at step 460900: 2.22742 learning rate: 0.000006
Training set BPC at step 461000: 2.89242 learning rate: 0.000006
Training set BPC at step 461100: 2.26253 learning rate: 0.000006
Training set BPC at step 461200: 2.61356 learning rate: 0.000006
Training set BPC at step 461300: 2.67276 learning rate: 0.000006
Training set BPC at step 461400: 2.38659 learning rate: 0.000006
Training set BPC at step 461500: 1.89963 learning rate: 0.000006
Training set BPC at step 461600: 1.98551 learning rate: 0.000006
Training set BPC at step 461700: 2.37042 learning rate: 0.000006
Training set BPC at step 461800: 2.11315 learning rate: 0.000006
Training set BPC at step 461900: 1.99082 learning rate: 0.000006
Training set BPC at step 

Training set BPC at step 473200: 2.19748 learning rate: 0.000005
Training set BPC at step 473300: 2.20303 learning rate: 0.000005
Training set BPC at step 473400: 1.78913 learning rate: 0.000005
Training set BPC at step 473500: 2.52838 learning rate: 0.000005
Training set BPC at step 473600: 2.49873 learning rate: 0.000005
Training set BPC at step 473700: 2.27176 learning rate: 0.000005
Training set BPC at step 473800: 2.27259 learning rate: 0.000005
Training set BPC at step 473900: 2.44861 learning rate: 0.000005
Training set BPC at step 474000: 2.84698 learning rate: 0.000005
Training set BPC at step 474100: 1.67304 learning rate: 0.000005
Training set BPC at step 474200: 2.18426 learning rate: 0.000005
Training set BPC at step 474300: 2.39766 learning rate: 0.000005
Training set BPC at step 474400: 1.97230 learning rate: 0.000005
Training set BPC at step 474500: 1.58731 learning rate: 0.000005
Training set BPC at step 474600: 1.48447 learning rate: 0.000005
Training set BPC at step 

Training set BPC at step 485900: 2.19533 learning rate: 0.000004
Training set BPC at step 486000: 2.52801 learning rate: 0.000004
Training set BPC at step 486100: 2.43682 learning rate: 0.000004
Training set BPC at step 486200: 2.54320 learning rate: 0.000004
Training set BPC at step 486300: 2.20969 learning rate: 0.000004
Training set BPC at step 486400: 2.30048 learning rate: 0.000004
Training set BPC at step 486500: 2.49920 learning rate: 0.000004
Training set BPC at step 486600: 2.13323 learning rate: 0.000004
Training set BPC at step 486700: 2.33035 learning rate: 0.000004
Training set BPC at step 486800: 2.05151 learning rate: 0.000004
Training set BPC at step 486900: 2.05457 learning rate: 0.000004
Training set BPC at step 487000: 2.63490 learning rate: 0.000004
Training set BPC at step 487100: 2.14639 learning rate: 0.000004
Training set BPC at step 487200: 2.21538 learning rate: 0.000004
Training set BPC at step 487300: 1.92264 learning rate: 0.000004
Training set BPC at step 

Training set BPC at step 498700: 2.86417 learning rate: 0.000003
Training set BPC at step 498800: 1.94733 learning rate: 0.000003
Training set BPC at step 498900: 3.04187 learning rate: 0.000003
Training set BPC at step 499000: 2.53373 learning rate: 0.000003
Training set BPC at step 499100: 2.46435 learning rate: 0.000003
Training set BPC at step 499200: 2.24360 learning rate: 0.000003
Training set BPC at step 499300: 2.78009 learning rate: 0.000003
Training set BPC at step 499400: 2.55440 learning rate: 0.000003
Training set BPC at step 499500: 2.20856 learning rate: 0.000003
Training set BPC at step 499600: 2.34343 learning rate: 0.000003
Training set BPC at step 499700: 2.24345 learning rate: 0.000003
Training set BPC at step 499800: 2.49515 learning rate: 0.000003
Training set BPC at step 499900: 2.53427 learning rate: 0.000003
Training set BPC at step 500000: 2.22075 learning rate: 0.000003
Training set BPC at step 500100: 2.26059 learning rate: 0.000003
Training set BPC at step 