# RNN Play Ground

## Setup
First, let's make sure this notebook works well in both Python 2 and 3, import a few common modules, ensure Matplotlib plots figures inline, and prepare a function to save the figures.

In [2]:
import tensorflow as tf
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    """Reset the default TensorFlow graph and reseed TensorFlow and NumPy.

    Args:
        seed: value handed to both random number generators (default 42).
    """
    tf.reset_default_graph()
    # Same seed for both libraries, TensorFlow first and NumPy second.
    for seed_fn in (tf.set_random_seed, np.random.seed):
        seed_fn(seed)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Font sizes used for the axis labels and the tick labels of every figure
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures: save_fig() writes to
# <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>.png
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "rnn"

def save_fig(fig_id, tight_layout=True):
    """Save the current Matplotlib figure as a 300 dpi PNG.

    The file is written to <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>.png.

    Args:
        fig_id: file name, without the ".png" extension.
        tight_layout: when true, call plt.tight_layout() before saving.
    """
    image_dir = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
    # savefig() does not create missing directories, so make sure the target
    # exists (os.path.isdir check keeps this working on Python 2 as well).
    if not os.path.isdir(image_dir):
        os.makedirs(image_dir)
    path = os.path.join(image_dir, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

## Basic RNNs in Tensorflow

For a simple model, create an RNN composed of a layer of five recurrent neurons ![Simple RNN](rnn1.png) using the tanh activation function.  We will assume that the RNN runs over only two time steps, taking input vectors of size 3 at each time step.  The following code builds the RNN, unrolled through two time steps:

### Manual RNNs

In [2]:
reset_graph()

n_inputs=3   # size of the input vector fed at each time step
n_neurons=5  # number of recurrent neurons in the layer
# One placeholder per time step; the leading None leaves the batch size open
X0=tf.placeholder(tf.float32,[None,n_inputs])
X1=tf.placeholder(tf.float32,[None,n_inputs])

# Wx multiplies the inputs and Wy multiplies the previous step's outputs;
# Wx, Wy and the bias b are reused at both time steps below
Wx=tf.Variable(tf.random_normal(shape=[n_inputs,n_neurons],dtype=tf.float32))
Wy=tf.Variable(tf.random_normal(shape=[n_neurons,n_neurons],dtype=tf.float32))
b=tf.Variable(tf.zeros([1,n_neurons],dtype=tf.float32))

# t=0: there is no previous output, so only the input term is used
Y0=tf.tanh(tf.matmul(X0,Wx)+b)
# t=1: combine the previous output Y0 with the new input X1
Y1=tf.tanh(tf.matmul(Y0,Wy)+tf.matmul(X1,Wx)+b)

This network looks much like a two-layer feedforward neural network, with a few twists: first, the same weights and bias terms are shared by both layers, and second, we feed inputs at each layer, and we get outputs from each layer.  To run the model, we need to feed it the inputs at both time steps, like so:

In [3]:
import numpy as np
# Mini-batch: instance 0, instance 1, instance 2, instance 3
# Each inner list is one instance's 3-value input vector; one array per time step
X0_batch=np.array([[0,1,2],[3,4,5],[6,7,8],[9,0,1]]) # t=0
X1_batch=np.array([[9,8,7],[0,0,0],[6,5,4],[3,2,1]]) # t=1

This mini-batch contains four instances, each with an input sequence composed of exactly two inputs.  At the end, Y0_val and Y1_val contain the outputs of the networks at both time steps for all neurons and all instances in the mini-batch:

In [6]:
def run_me():
    """Initialise all variables, evaluate Y0 and Y1 on the mini-batch and print them.

    Reads the module-level tensors Y0 and Y1, the placeholders X0 and X1 and
    the arrays X0_batch and X1_batch as they are defined at call time.
    """
    init_op = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init_op)
        # Both time steps are fed and fetched in a single run() call.
        step_outputs = session.run(
            [Y0, Y1],
            feed_dict={X0: X0_batch, X1: X1_batch},
        )
    Y0_val, Y1_val = step_outputs
    print(Y0_val, Y1_val)

In [4]:
# Evaluate the manually unrolled RNN on the mini-batch and print the outputs of both time steps
run_me()

[[-0.0664006   0.96257669  0.68105793  0.70918542 -0.89821601]
 [ 0.9977755  -0.71978903 -0.99657607  0.96739239 -0.99989718]
 [ 0.99999774 -0.99898803 -0.99999893  0.99677622 -0.99999988]
 [ 1.         -1.         -1.         -0.99818915  0.99950868]] [[ 1.         -1.         -1.          0.40200251 -0.99999982]
 [-0.12210421  0.62805271  0.96718431 -0.99371219 -0.25839362]
 [ 0.99999827 -0.9999994  -0.9999975  -0.85943311 -0.99998808]
 [ 0.99928284 -0.99999815 -0.99990582  0.98579627 -0.92205757]]


That's it! Of course, if you want to be able to run an RNN over 100 time steps, the graph is going to be pretty big. Let's therefore see how TensorFlow's RNN operations handle this for us.

## Static Unrolling through time
The static_rnn() function creates an unrolled RNN network by chaining cells.  The following code creates the exact same model as the previous one

In [12]:
reset_graph()

# The placeholders are re-created because reset_graph() started a new default graph
X0=tf.placeholder(tf.float32,[None,n_inputs])
X1=tf.placeholder(tf.float32,[None,n_inputs])

basic_cell=tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
# static_rnn() is given the inputs as a Python list, one tensor per time step
output_seqs,states=tf.contrib.rnn.static_rnn(basic_cell,[X0,X1],dtype=tf.float32)
# output_seqs unpacks into one output tensor per time step
Y0,Y1=output_seqs
# run_me() looks up Y0, Y1, X0 and X1 at call time, so it evaluates this new graph
run_me()

[[ 0.30741334 -0.32884315 -0.65428472 -0.93850589  0.52089024]
 [ 0.99122757 -0.95425421 -0.75180793 -0.99952078  0.98202348]
 [ 0.99992681 -0.99783254 -0.82473528 -0.9999963   0.99947774]
 [ 0.99677098 -0.68750614  0.84199691  0.93039107  0.8120684 ]] [[ 0.99998885 -0.99976051 -0.06679298 -0.99998039  0.99982214]
 [-0.65249437 -0.51520866 -0.37968954 -0.59225935 -0.08968385]
 [ 0.99862403 -0.99715197 -0.03308626 -0.99915648  0.99329019]
 [ 0.99681675 -0.95981938  0.39660636 -0.83076048  0.79671967]]


In [13]:
from IPython.display import clear_output, Image, display, HTML

def strip_consts(graph_def, max_const_size=32):
    """Return a copy of graph_def with large constant payloads replaced by a stub.

    Args:
        graph_def: GraphDef protobuf whose nodes are copied.
        max_const_size: largest tensor_content length (in bytes) kept as is.

    Returns:
        A new tf.GraphDef; the nodes of the input are copied, not modified.
    """
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add()
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                # tensor_content is a protobuf bytes field, so it must be
                # assigned bytes; encoding explicitly works on Python 2 and 3.
                tensor.tensor_content = ("<stripped %d bytes>" % size).encode("ascii")
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Render a TensorFlow graph inline in the notebook.

    Args:
        graph_def: a GraphDef, or any object exposing as_graph_def()
            (for example a tf.Graph).
        max_const_size: forwarded to strip_consts().
    """
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    stripped = strip_consts(graph_def, max_const_size=max_const_size)

    # Page that loads the graph viewer and hands it the text form of the graph.
    page_template = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """
    pbtxt_literal = repr(str(stripped))
    # A random suffix gives each rendered graph its own element id.
    element_id = 'graph' + str(np.random.rand())
    page = page_template.format(data=pbtxt_literal, id=element_id)

    # The page is embedded through srcdoc, so its double quotes are escaped.
    frame_template = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """
    escaped_page = page.replace('"', '&quot;')
    display(HTML(frame_template.format(escaped_page)))

In [14]:
# Display the current default graph (the statically unrolled RNN built above)
show_graph(tf.get_default_graph())

First, we create the input placeholders, as before.  Then we create a BasicRNNCell, which you can think of as a factory that creates copies of the cell to build the unrolled RNN (one for each time step).  Then we call static_rnn(), giving it the cell factory and the input tensors, and telling it the data type of the inputs (this is used to create the initial state matrix, which by default is full of zeros).  The static_rnn() function calls the cell factory's __call__() function once per input, creating two copies of the cell (with shared weights and bias terms), and it chains them like we did earlier.  The static_rnn() function returns two objects.  The first is a Python list containing the output tensors for each time step.  The second is a tensor containing the final states of the network.  When you are using basic cells, the final state is simply equal to the last output.

If there were 50 time steps, it would not be very convenient to have 50 input placeholders and 50 output tensors.  Moreover, at execution time you would have to feed each of the 50 placeholders and manipulate the 50 outputs.  Let's simplify this.  The following code builds the same RNN again, but this time it takes a single placeholder of shape [None, n_steps, n_inputs] where the first dimension is the mini-batch size.  Then it extracts the list of input sequences for each time step.  X_seqs is a Python list of n_steps tensors of shape [None, n_inputs] where the first dimension is the mini-batch size.  To do this, we first swap the first two dimensions using the transpose() function, so that the time steps are now the first dimension.  Then we extract a Python list of tensors along the first dimension (i.e. one tensor per time step) using the unstack() function.  The next two lines are the same as before.  Finally, we merge all the output tensors into a single tensor using the stack() function, and we swap the first two dimensions to get a final outputs tensor of shape [None, n_steps, n_neurons] (again the first dimension is the mini-batch size).

In [15]:
n_steps=2  # number of time steps in every input sequence

reset_graph()

# A single placeholder now carries the whole mini-batch: [batch, n_steps, n_inputs]
X=tf.placeholder(tf.float32,[None,n_steps,n_inputs])
# Swap the first two axes (perm=[1,0,2]) so time comes first, then split along it
X_seqs=tf.unstack(tf.transpose(X,perm=[1,0,2]))

basic_cell=tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
output_seqs,states=tf.contrib.rnn.static_rnn(basic_cell,X_seqs,dtype=tf.float32)
# Stack the per-step outputs and swap the first two axes back (perm=[1,0,2])
outputs=tf.transpose(tf.stack(output_seqs),perm=[1,0,2])

In [19]:
def run_me2():
    """Initialise all variables and evaluate `outputs` for X_batch.

    Uses the module-level `outputs` tensor, `X` placeholder and `X_batch`
    array as they are defined at call time.

    Returns:
        The evaluated value of `outputs`.
    """
    init_op = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init_op)
        return session.run(outputs, feed_dict={X: X_batch})

Now we can run the network by feeding it a single tensor that contains the mini-batch sequences:

In [17]:
# 4 instances x 2 time steps x 3 inputs, matching the X placeholder
X_batch=np.array([
        # t=0         t=1
        [[0,1,2],[9,8,7]], # instance 0
        [[3,4,5],[0,0,0]], # instance 1
        [[6,7,8],[6,5,4]], # instance 2
        [[9,0,1],[3,2,1]], # instance 3
    ])

# Evaluate the RNN on the whole mini-batch in one call
print(run_me2())

[[[-0.45652324 -0.68064123  0.40938237  0.63104504 -0.45732826]
  [-0.94288003 -0.99988687  0.94055814  0.99999851 -0.9999997 ]]

 [[-0.80015349 -0.99218267  0.78177971  0.9971031  -0.99646091]
  [-0.63711601  0.11300932  0.5798437   0.43105593 -0.63716984]]

 [[-0.93605185 -0.99983788  0.93088669  0.99998152 -0.99998295]
  [-0.9165386  -0.99456042  0.89605415  0.99987197 -0.99997509]]

 [[ 0.99273688 -0.99819332 -0.55543643  0.9989031  -0.9953323 ]
  [-0.02746334 -0.73191994  0.7827872   0.95256817 -0.97817713]]]


In [20]:
# Swap the batch and time axes, then keep index 1: the outputs at t=1 for every instance
print(np.transpose(run_me2(), axes=[1, 0, 2])[1])

[[-0.94288003 -0.99988687  0.94055814  0.99999851 -0.9999997 ]
 [-0.63711601  0.11300932  0.5798437   0.43105593 -0.63716984]
 [-0.9165386  -0.99456042  0.89605415  0.99987197 -0.99997509]
 [-0.02746334 -0.73191994  0.7827872   0.95256817 -0.97817713]]


However, this approach still builds a graph containing one cell per time step.  If there were 50 time steps, the graph would look pretty ugly.  It is a bit like writing a program without ever using loops (e.g. Y0=f(0,X0); Y1=f(Y0,X1); Y2=f(Y1,X2); ...; Y50=f(Y49,X50)).  With such a large graph, you may even get out-of-memory (OOM) errors during backpropagation (especially with the limited memory of GPU cards), since it must store all tensor values during the forward pass so it can use them to compute gradients during the reverse pass.  The solution to this is to use the dynamic_rnn() function.

## Dynamic Unrolling through time
The dynamic_rnn() function uses a while_loop() operation to run over the cell the appropriate number of times, and you can set swap_memory=True if you want it to swap the GPU's memory to the CPU's memory during backpropagation to avoid OOM errors. Conveniently, it also accepts a single tensor for all inputs at every time step (shape [None, n_steps, n_inputs]) and it outputs a single tensor for all outputs at every time step (shape [None, n_steps, n_neurons]); there is no need to stack, unstack or transpose.  The following code creates the same RNN as earlier using the dynamic_rnn() function.  It's so much nicer using dynamic_rnn():

In [24]:
reset_graph()

X=tf.placeholder(tf.float32,[None, n_steps, n_inputs])
basic_cell=tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
# dynamic_rnn() takes the 3-D placeholder directly; no unstack/stack/transpose is done here
outputs,states=tf.nn.dynamic_rnn(basic_cell,X,dtype=tf.float32)

# run_me2() reads the rebound `outputs` and `X`; keep only the outputs at t=1
print(np.transpose(run_me2(), axes=[1, 0, 2])[1])

[[-0.99999809 -0.99989897 -0.99999422  0.99959809 -0.4085584 ]
 [ 0.21658944 -0.20340651  0.02591269  0.44528607 -0.40116802]
 [-0.99981058 -0.99268609 -0.99952865  0.99385983 -0.25559649]
 [-0.97511095 -0.34840977 -0.92128128  0.39821982  0.27953893]]


In [25]:
# Display the current default graph (the dynamic_rnn() version built above)
show_graph(tf.get_default_graph())

## Variable length Input sequences
So far we have used only fixed-size input sequences (all exactly two steps long). What if the input sequences have variable lengths (e.g. sentences)?  In this case you should set the sequence_length argument when calling the dynamic_rnn() (or static_rnn()) function; it must be a 1D tensor indicating the length of the input sequence for each instance. For example

In [None]:
# 1-D int32 placeholder: one sequence length per instance in the mini-batch
seq_length=tf.placeholder(tf.int32,[None])
# NOTE(review): reuses basic_cell and X from the previous cell without resetting the graph — confirm this is intended
outputs,states=tf.nn.dynamic_rnn(basic_cell,X,dtype=tf.float32,sequence_length=seq_length)

For example, suppose the second input sequence contains only one input instead of two.  It must be padded with a zero in order to fit in the input tensor X (because the input tensor's second dimension is the size of the longest sequence - i.e. 2).

In [None]:
X_batch=np.array([
        # t=0         t=1
        [[0,1,2],[9,8,7]], # instance 0
        [[3,4,5],[0,0,0]], # instance 1 (padded with a zero vector)
        [[6,7,8],[6,5,4]], # instance 2
        [[9,0,1],[3,2,1]], # instance 3
    ])
# One length per instance, in the same order as X_batch; instance 1 has a single real step
seq_length_batch=np.array([2,1,2,2])

Of course, you now need to feed values for both placeholders X and seq_length.

In [None]:
# Create the initializer here so this cell does not depend on an `init`
# left over from (or not yet defined by) another cell.
init = tf.global_variables_initializer()
with tf.Session() as sess:  # the Session class must be instantiated
    init.run()
    # Feed both placeholders: the padded inputs and the true length of each
    # sequence. feed_dict is a dict keyed by the placeholders themselves.
    output_val, states_val = sess.run(
        [outputs, states],
        feed_dict={X: X_batch, seq_length: seq_length_batch})
print(output_val)

Now the RNN outputs zero vectors for every time step past the input sequence length.

## Variable length output sequences
What if the output sequences have variable lengths as well? If you know in advance what length each sequence will have (for example if you know that it will be the same length as the input sequence), then you can set the sequence_length parameter as described above.  Unfortunately, in general this will not be possible: for example, the length of a translated sentence is generally different from the length of the input sentence.  In this case, the most common solution is to define a special output called an end-of-sequence token (EOS token).  Any output past the EOS should be ignored.

## Training a sequence Classifier
Let's train an RNN to classify MNIST images.  A convolutional neural network would be better suited for image classification, but this makes for a simple example.  We will treat each image as a sequence of 28 rows of 28 pixels each.  We will use cells of 150 recurrent neurons, plus a fully connected layer containing 10 neurons (one per class) connected to the output of the last time step, followed by a softmax layer as in the figure below:
![Sequence Classifier](rnn2.png)

The construction phase is quite straightforward; an unrolled RNN is used and the fully connected layer is connected to the states tensor, which contains only the final state of the RNN (i.e., the 28th output).  Also note that y is a place holder for the target classes.

In [3]:
reset_graph()

n_steps=28     # second dimension of X (time steps per instance)
n_inputs=28    # third dimension of X (inputs per time step)
n_neurons=150  # recurrent neurons in the cell
n_outputs=10   # width of the dense output layer

learning_rate=0.001

X=tf.placeholder(tf.float32,[None,n_steps,n_inputs])
# Integer targets, one per instance
y=tf.placeholder(tf.int32,[None])

basic_cell=tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(basic_cell,X,dtype=tf.float32)

# The dense layer is fed `states` (the second value returned by dynamic_rnn), not `outputs`
logits = tf.layers.dense(states, n_outputs)
# Sparse variant: y holds class indices rather than one-hot vectors
xentropy=tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,logits=logits)

# Average the per-instance cross-entropy over the mini-batch
loss= tf.reduce_mean(xentropy)

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
# An instance counts as correct when its target is the top-1 logit
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()

Now let's load the MNIST data and reshape the test data to [batch_size, n_steps, n_inputs] as is expected by the network.  We will take care of reshaping the training data in a moment

In [5]:
!h:setpath


c:\users\u1273400\Dropbox\rtmp\src\python\notebooks\research>set path=C:\Users\u1273400\AppData\Local\Continuum\Anaconda3\Library\bin;C:\Users\u1273400\AppData\Local\Continuum\Anaconda3\Library\bin;C:\SIMULIA\Commands;C:\Program Files\Microsoft MPI\Bin\;C:\Program Files (x86)\NVIDIA Corporation\PhysX\Common;C:\ProgramData\Oracle\Java\javapath;C:\Program Files (x86)\ActiveState Komodo Edit 10\;C:\windows\system32;C:\windows;C:\windows\System32\Wbem;C:\windows\System32\WindowsPowerShell\v1.0\;C:\PROGRAM FILES\OPENOFFICE.ORG 3\URE\BIN; C:\PROGRAM FILES (X86)\OPENOFFICE.ORG 3\URE\BIN;C:\Program Files\Microsoft SQL Server\120\Tools\Binn\;C:\program files\Statistics\JRE\bin;C:\Program Files (x86)\Windows Kits\8.1\Windows Performance Toolkit\;C:\Program Files\MATLAB\R2017a\runtime\win64;C:\Program Files\MATLAB\R2017a\bin;C:\Program Files\TortoiseSVN\bin;C:\Users\u1273400\AppData\Local\Microsoft\WindowsApps;;C:\intelFPGA_pro\16.1\modelsim_ase\win32aloem;C:\intelFPGA_pro\17.0\modelsim_ae\win32

In [7]:
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` has been loaded.
import urllib.request

# Send plain-HTTP requests through the proxy.
proxy_support = urllib.request.ProxyHandler({'http': 'http://wwwproxy.hud.ac.uk:3128/'})
opener = urllib.request.build_opener(proxy_support)
# Install the opener globally so that later urllib.request.urlopen() calls
# go through the proxy as well.
urllib.request.install_opener(opener)
#opener.open('http://www.example.com/login.html')

In [8]:
from tensorflow.examples.tutorials.mnist import input_data

# Load MNIST, using /tmp/data/ as the data directory
mnist=input_data.read_data_sets('/tmp/data/')
# The pixel data lives in mnist.test.images (the data set object itself has
# no reshape()); reshape it to [batch, n_steps, n_inputs] for the network.
X_test=mnist.test.images.reshape((-1,n_steps,n_inputs))
y_test=mnist.test.labels

URLError: <urlopen error [WinError 10061] No connection could be made because the target machine actively refused it>

Now we are ready to train the RNN.  The execution phase is exactly the same  as for the MNIST classifier for feedforward networks, except that we reshape each training batch before feeding it to the network.

In [None]:
n_epochs = 100
batch_size = 150
# Number of whole mini-batches in one pass over the training set
n_batches = mnist.train.num_examples // batch_size

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for iteration in range(n_batches):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            # Turn each batch into [batch, n_steps, n_inputs] before feeding it
            X_batch = X_batch.reshape((-1, n_steps, n_inputs))
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        # Train accuracy is measured on the last mini-batch of the epoch only
        acc_train = sess.run(accuracy, feed_dict={X: X_batch, y: y_batch})
        acc_test = sess.run(accuracy, feed_dict={X: X_test, y: y_test})
        print(epoch, "Train accuracy:", acc_train, "Test accuracy:", acc_test)

98%+ accuracy is pretty impressive. Tuning the hyperparameters should lead to even better results; other tweaks include a better weight initialisation, longer training, regularisation, etc.

## Deep RNNs
It is quite common to stack multiple layers of cells, as shown in the figure below.  This gives you a deep RNN.
![Deep RNN](rnn3.PNG)
To implement a deep RNN in Tensorflow, you can create several cells and stack them into a MultiRNNCell.  In the following code we stack three identical cells (but one could also use various kinds of cells with different number of neurons).

In [None]:
n_neurons=100  # neurons per layer
n_layers=3     # number of stacked cells

# One separate BasicRNNCell instance per layer, each using a ReLU activation
layers=[tf.contrib.rnn.BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu) for layer in range(n_layers)]
multi_layer_cell=tf.contrib.rnn.MultiRNNCell(layers)
outputs,states=tf.nn.dynamic_rnn(multi_layer_cell,X,dtype=tf.float32)

That's all there is to it! The states variable is a tuple containing one tensor per layer, each representing the final state of that layer's cell (with shape [batch_size, n_neurons]).  If you set state_is_tuple=False when creating the MultiRNNCell, then states becomes a single tensor containing the states from every layer, concatenated along the column axis (i.e., its shape is [batch_size, n_layers * n_neurons]).  Note that before TensorFlow 0.11.0, this behaviour was the default.

### Distributing across GPUs
We can efficiently distribute deep RNNs across multiple GPUs by pinning each layer to a different GPU (see fig. 12.16).  However, if you try to create each cell in a different device() block, it will not work:

In [None]:
# Counter-example: creating the cells inside device blocks does not pin them to those devices
with tf.device("/gpu:0"): # BAD! this is ignored.
    layer1=tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
    
with tf.device("/gpu:1"): # BAD! ignored again
    layer2=tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)

This fails because a BasicRNNCell is a cell factory, not a cell per se (as mentioned earlier); no cells get created when you create the factory, and thus no variables do either.  The device block is simply ignored.  The cells actually get created later.  When you call dynamic_rnn(), it calls the MultiRNNCell, which calls each individual BasicRNNCell, which create the actual cells (including their variables).  Unfortunately, none of these classes provide any way to control the devices on which the variables get created.  If you try to put the dynamic_rnn() call within a device block, the whole RNN gets pinned to a single device.  So are you stuck? Fortunately not!  The trick is to create your own cell wrapper:

In [1]:
import tensorflow as tf
class DeviceCellWrapper(tf.contrib.rnn.RNNCell):
    """RNN cell wrapper that calls the wrapped cell inside a tf.device() block.

    Args:
        device: device string passed to tf.device(), e.g. "/gpu:0".
        cell: the cell whose call is placed on that device.
    """

    def __init__(self,device,cell):
        self._cell=cell
        self._device=device

    @property
    def state_size(self):
        """State size reported by the wrapped cell."""
        return self._cell.state_size

    @property
    def output_size(self):
        """Output size reported by the wrapped cell."""
        return self._cell.output_size

    def __call__(self,inputs,state,scope=None):
        """Call the wrapped cell with the same arguments, under the configured device."""
        # Must read the attribute set in __init__ (`_device`); a misspelled
        # name here raises AttributeError on the first call.
        with tf.device(self._device):
            return self._cell(inputs,state,scope)

This wrapper simply proxies every method call to another cell, except it wraps the __call__() function within a device block. Now you can distribute each layer on a different GPU:

In [None]:
# One wrapped BasicRNNCell per device string, so each layer's call runs in its own tf.device() block
devices=["/gpu:0","/gpu:1","/gpu:2"]
cells=[DeviceCellWrapper(dev,tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)) for dev in devices]
multi_layer_cell=tf.contrib.rnn.MultiRNNCell(cells)
outputs, states=tf.nn.dynamic_rnn(multi_layer_cell,X,dtype=tf.float32)

Do not set state_is_tuple=False, or the MultiRNNCell will concatenate all the cell states into a single tensor, on a single GPU.

### Applying dropout
If you build a very deep RNN, it may end up overfitting the training set.  To prevent that, a common technique is to apply dropout.  You can simply add a dropout layer before or after the RNN as usual, but if you also want to apply dropout between the RNN layers, you need to use a DropoutWrapper.  The following code applies dropout to the inputs of each layer in the RNN, dropping each input with 50% probability.

In [None]:
keep_prob=0.5  # probability of keeping each input, i.e. 50% dropout
cells=[tf.contrib.rnn.BasicRNNCell(num_units=n_neurons) for layer in range(n_layers)]
# Wrap every cell so that dropout is applied to its inputs
cells_drop=[tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=keep_prob) for cell in cells]
# MultiRNNCell lives in tf.contrib.rnn; `tf.contrib.runMultiRNNCell` does not exist
multi_layer_cell=tf.contrib.rnn.MultiRNNCell(cells_drop)
rnn_outputs, states=tf.nn.dynamic_rnn(multi_layer_cell,X,dtype=tf.float32)

Note that it is also possible to apply dropout to the outputs by setting output_keep_prob.

The main problem with this code is that it will apply dropout not only during training but also during testing, which is not what you want.  To prevent this, either use your own dropout wrapper class or have two different graphs one for training and one for testing.  The second option looks like this

## Machine Translation
The `basic_rnn_seq2seq()` function creates a simple Encoder/Decoder model: it first runs an RNN to encode `encoder_inputs` into a state vector, then runs a decoder initialized with the last encoder state on `decoder_inputs`. Encoder and decoder use the same RNN cell type but they don't share parameters.

In [None]:
import tensorflow as tf
reset_graph()

n_steps = 50
n_neurons = 200
n_layers = 3
# Vocabulary sizes and embedding width, passed positionally to embedding_rnn_seq2seq() below
num_encoder_symbols = 20000
num_decoder_symbols = 20000
embedding_size = 150
learning_rate = 0.01

X = tf.placeholder(tf.int32, [None, n_steps]) # English sentences
Y = tf.placeholder(tf.int32, [None, n_steps]) # French translations
# One weight per target position (n_steps - 1 of them); multiplied into the per-token loss in the next cell
W = tf.placeholder(tf.float32, [None, n_steps - 1, 1])
# Shift by one step: the decoder is fed Y[:, :-1] and is trained to predict Y[:, 1:]
Y_input = Y[:, :-1]
Y_target = Y[:, 1:]

# Transpose so the time axis comes first, then split into one tensor per step
encoder_inputs = tf.unstack(tf.transpose(X)) # list of 1D tensors
decoder_inputs = tf.unstack(tf.transpose(Y_input)) # list of 1D tensors

# A separate BasicLSTMCell instance for each of the n_layers layers
lstm_cells = [tf.contrib.rnn.BasicLSTMCell(num_units=n_neurons)
              for layer in range(n_layers)]
cell = tf.contrib.rnn.MultiRNNCell(lstm_cells)

output_seqs, states = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(
    encoder_inputs,
    decoder_inputs,
    cell,
    num_encoder_symbols,
    num_decoder_symbols,
    embedding_size)

# NOTE(review): tf.unstack() on a Python list looks like it was meant to be tf.stack() — confirm the resulting shape of logits
logits = tf.transpose(tf.unstack(output_seqs), perm=[1, 0, 2])

In [None]:
# Flatten the leading axes so that each row of logits_flat scores one target token
logits_flat = tf.reshape(logits, [-1, num_decoder_symbols])
Y_target_flat = tf.reshape(Y_target, [-1])
W_flat = tf.reshape(W, [-1])
# Per-token cross-entropy, scaled by the fed weights W
xentropy = W_flat * tf.nn.sparse_softmax_cross_entropy_with_logits(labels=Y_target_flat, logits=logits_flat)
# NOTE(review): reduce_mean averages over every token, not over the sum of the weights — confirm this is intended
loss = tf.reduce_mean(xentropy)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()