# Arabic text recognition using MDLSTM + CTC

## Imports

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import time
import logging
import sys
import math
import random

import tensorflow as tf
from PIL import Image
import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import tensorflow as tf
from tensorflow.contrib.rnn import RNNCell, LSTMStateTuple
from tensorflow.contrib.rnn.python.ops.core_rnn_cell import _linear
from tensorflow.python.ops.rnn import dynamic_rnn
import tensorflow.contrib.slim as slim

## Parameters

In [None]:
# Logging configuration: timestamped DEBUG-level records written to stdout.
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    level=logging.DEBUG,
                    stream=sys.stdout)

# Checkpoint path handed to the TensorFlow saver.
MODEL_PATH = "./models/model.ckpt"

# Directory the TensorBoard summary writer logs to.
SUMMARY_PATH = "./logs/"


# Constants.
# SPACE_TOKEN / SPACE_INDEX agree with the 'sp': 63 entry of symbol_dict.
SPACE_TOKEN = 'sp'
SPACE_INDEX = 63
FIRST_INDEX = 0 

# Number of features.
# NOTE(review): same value as W (the resized image width) -- presumably meant to track it; confirm.
NUM_FEATURES = 2200

# Size of the network's output layer.
# NOTE(review): symbol_dict (defined later, in the Data Loading section) uses ids 1..64;
# 66 presumably also counts the unused id 0 and the CTC blank -- confirm.
NUM_CLASSES = 66

# Hyper-parameters.
NUM_EPOCHS = 1
NUM_HIDDEN = 10
NUM_LAYERS = 2
BATCH_SIZE = 128

# Optimizer parameters.
INITIAL_LEARNING_RATE = 1e-2
MOMENTUM = 0.9

# Height and width of the window passed as `sh` to multi_dimensional_rnn_while_loop.
WIND_H = 5
WIND_W = 5

# Value passed as parallel_iterations to tf.while_loop inside the MDLSTM layer.
PARL_EXC = 1000

# Height, width and channel count of the input placeholder; load_data resizes
# every line image to 2200x150 and reshapes it to (150, 2200, 1).
H  = 150
W  =2200
CH =   1

## Data Loading

In [None]:
import os
from PIL import Image
import numpy as np
# Maps each transliteration token found in the ground-truth lines to an integer
# class id (1..64). 'sp' is the space token (id 63, matching SPACE_INDEX).
# NOTE(review): id 0 is not assigned to any token -- presumably reserved; confirm.
symbol_dict = {'hh': 1,'am':2,'ae': 3,'ah': 4,'aa': 5,'ba': 6,
               'ta': 7,'teE': 8,'th': 9,'ja': 10,'ha': 11,'kh': 12,
               'da': 13,'dh': 14,'ra': 15,'za': 16,'se': 17,'sh': 18,
               'sa': 19,'de': 20,'to': 21,'zha': 22,'ay': 23,'gh': 24,
               'fa': 25,'ka': 26,'ke': 27,'la': 28,'ma': 29,'na': 30,
               'he': 31,'wa': 32,'wl': 33,'ya': 34,'ee': 35,'al': 36,
               'n0': 37,'n1': 38,'n2': 39,'n3': 40,'n4': 41,'n5': 42,
               'n6': 43,'n7': 44,'n8': 45,'n9': 46,'atr': 47,'col': 48,
               'dbq': 49,'com': 50,'qts': 51,'exc': 52,'dot': 53,'bro': 54,
               'brc': 55,'fsl': 56,'bsl': 57,'equ': 58,'hyp': 59,'usc': 60,
               'src': 61,'per': 62, 'sp': 63, 'te': 64
              }
def to_symbols(seq):
    """
        Translates a sequence of transliteration tokens into their integer ids.

        Every element of seq is looked up in symbol_dict; a token that is not a
        key of symbol_dict raises KeyError. Returns a new list of ids in the
        same order as seq.
    """
    return [symbol_dict[token] for token in seq]
def load_data(image_folder_location, txt_file_location, height=150, width=2200):
    """
        Loads data from the KHATT dataset, images are assumed to be text line images,
        and the ground truth table is assumed to be stored in a .txt file.

        Input:
        image_folder_location ~ directory holding one image file per text line.
        txt_file_location ~ ground-truth text file, one line per image.
        height, width ~ size every kept image is resized to (defaults 150 x 2200).

        Output:
        list_data ~ list of arrays of shape (height, width, 1), one per kept image.
        list_labels ~ list of symbol-id lists (see to_symbols), aligned with list_data.

        Images whose width/height ratio is below 5 are skipped, together with
        their labels.

        NOTE(review): the label of the i-th directory entry is taken from the
        i-th line of the ground-truth file, so the file order must match the
        os.listdir() order -- confirm against the dataset layout.
    """
    # Read the whole table up front; the context manager closes the handle
    # even if reading fails.
    with open(txt_file_location, "r") as txt_file:
        lines = txt_file.readlines()

    list_dir = os.listdir(image_folder_location)
    list_data = []
    list_labels = []
    for i, file_name in enumerate(list_dir):
        # The context manager releases the underlying file handle of each image.
        with Image.open(os.path.join(image_folder_location, file_name)) as im:
            # PIL reports size as (width, height).
            orig_width, orig_height = im.size
            if orig_width / orig_height < 5:
                continue

            # Image.LANCZOS is the same filter the removed Image.ANTIALIAS alias pointed to.
            imarray = np.array(im.resize((width, height), Image.LANCZOS))

        list_data.append(imarray.reshape(height, width, 1))
        # Strip the file name and the tab+space separator, leaving only the symbols.
        line_data = lines[i].replace(file_name, "").replace("\t ", "")
        list_labels.append(to_symbols(line_data.split()))
    return list_data, list_labels
# Load the KHATT training and test splits (line images plus symbol-id labels).
x_train, y_train = load_data("C:/Users/Yazeed/Desktop/KHATT/Training","C:/Users/Yazeed/Desktop/KHATT/Training-Groundtruth.txt")
x_test, y_test = load_data("C:/Users/Yazeed/Desktop/KHATT/Test","C:/Users/Yazeed/Desktop/KHATT/Test-Groundtruth.txt")
print("Training set:", len(x_train), len(y_train))
print("Testing set:", len(x_test), len(y_test))

# Preview one random line image from each split.
# random.randrange excludes the upper bound; random.randint(0, len(x)) could
# return len(x) and index one past the end of the list.
# Each preview gets its own figure so the second image does not overwrite the first.
plt.figure()
plt.imshow(x_train[random.randrange(len(x_train))].reshape(150,2200))
plt.figure()
plt.imshow(x_test[random.randrange(len(x_test))].reshape(150,2200))

 ## Edit Distance Function

In [None]:
def edit_distance(targ, hyp):
    """
        Aligns a hypothesis sequence against a target (reference) sequence.

        Returns a 5-tuple (distance, ins, dels, subs, corr):
        distance ~ minimum number of edits, taken from the last cell of the cost table
                   (a NumPy float).
        ins ~ target symbols that have to be inserted into the hypothesis.
        dels ~ hypothesis symbols that have to be deleted.
        subs ~ substituted symbols.
        corr ~ symbols that already match.
    """
    rows, cols = len(targ) + 1, len(hyp) + 1

    # cost[a, b]: edits needed to turn the first b hypothesis symbols into
    # the first a target symbols.
    cost = np.zeros((rows, cols))
    cost[:, 0] = np.arange(rows)
    cost[0, :] = np.arange(cols)

    for a in range(1, rows):
        for b in range(1, cols):
            if targ[a - 1] == hyp[b - 1]:
                cost[a, b] = cost[a - 1, b - 1]
                continue
            cost[a, b] = min(cost[a - 1, b], cost[a, b - 1], cost[a - 1, b - 1]) + 1

    ins = dels = subs = corr = 0

    # Walk back from the bottom-right corner, classifying every step.
    a, b = rows - 1, cols - 1
    while a > 0 and b > 0:
        step_a, step_b = 1, 1
        if targ[a - 1] == hyp[b - 1]:
            corr += 1
        else:
            cheaper = cost[a, b] - 1
            if cost[a - 1, b] == cheaper:
                # move up only: a target symbol has no hypothesis counterpart
                ins += 1
                step_b = 0
            elif cost[a, b - 1] == cheaper:
                # move left only: a hypothesis symbol has no target counterpart
                dels += 1
                step_a = 0
            elif cost[a - 1, b - 1] == cheaper:
                subs += 1
        a -= step_a
        b -= step_b

    # Whatever is left on either side once one sequence is exhausted.
    return cost[-1, -1], ins + a, dels + b, subs, corr
# Quick sanity check: unpacks (distance, ins, dels, subs, corr) for a pair of
# words that differ in one letter; the distance is a NumPy float, hence int().
i,j,k,l,m = edit_distance("human","humen")
print(int(i),j,k,l,m)

## Connectionist Temporal Classification (CTC)

In [None]:
def ctc_loss(params, seq, blank=0, is_prob=True):
    """
        CTC loss function.

        Description:
        runs the scaled forward-backward recursions over the blank-augmented
        label sequence and returns the negative log-likelihood of seq together
        with its gradient with respect to the unnormalized (pre-softmax) inputs.

        Input:
        params ~ nxm matrix of n label scores over m frames.
        seq ~ sequence of character id's (row indices into params), non-empty.
        blank ~ row index of the blank label.
        is_prob ~ are the params normalized? (already passed through softmax)

        Output:
        objective ~ negative log-likelihood of seq.
        gradient ~ array shaped like params.
        error ~ True when the computation broke down numerically (for example
                when there are too few frames to emit seq, or a needed
                probability is exactly zero); the gradient is then all zeros.

        Raises:
        ValueError when params is not a non-empty 2D matrix, seq is empty, or
        seq holds an id that is not a row of params.
    """
    params = np.asarray(params, dtype=float)
    seq = np.asarray(seq).astype(int).reshape(-1)
    if params.ndim != 2 or params.shape[1] == 0:
        raise ValueError("params must be a non-empty (labels x frames) matrix")
    if seq.size == 0:
        raise ValueError("seq must contain at least one label")
    numchars, T = params.shape      # num of labels, length of time (utterance)
    if seq.min() < 0 or seq.max() >= numchars:
        raise ValueError("seq contains a label id outside the rows of params")

    L = 2 * seq.shape[0] + 1        # length of label sequence with blanks

    if not is_prob:
        # numerically stable softmax over the label axis
        params = params - np.max(params, axis=0)
        params = np.exp(params)
        params = params / np.sum(params, axis=0)

    alphas = np.zeros((L, T))
    betas = np.zeros((L, T))
    forward = -np.inf

    try:
        # Turn 0/0 and log(0) into exceptions so a numerical breakdown is
        # reported through the error flag instead of leaking NaNs.
        with np.errstate(divide='raise', invalid='raise'):
            # initialize the alphas and forward pass
            alphas[0, 0] = params[blank, 0]
            alphas[1, 0] = params[seq[0], 0]
            c = np.sum(alphas[:, 0])
            alphas[:, 0] = alphas[:, 0] / c
            forward = np.log(c)

            for t in range(1, T):
                # only states that can still reach the end / can already be reached
                start = max(0, L - 2 * (T - t))
                end = min(2 * t + 2, L)

                for s in range(start, end):
                    l = (s - 1) // 2    # integer index into seq for odd s
                    # blank
                    if s % 2 == 0:
                        if s == 0:
                            alphas[s, t] = alphas[s, t-1] * params[blank, t]
                        else:
                            alphas[s, t] = (alphas[s, t-1] + alphas[s-1, t-1]) * params[blank, t]
                    # same label twice: the skip over the blank is not allowed
                    elif s == 1 or seq[l] == seq[l-1]:
                        alphas[s, t] = (alphas[s, t-1] + alphas[s-1, t-1]) * params[seq[l], t]
                    else:
                        alphas[s, t] = (alphas[s, t-1] + alphas[s-1, t-1] + alphas[s-2, t-1]) * params[seq[l], t]
                # normalize at current time (prevent underflow)
                c = np.sum(alphas[start:end, t])
                alphas[start:end, t] = alphas[start:end, t] / c
                forward += np.log(c)

            # initialize the betas and backwards pass
            betas[-1, -1] = params[blank, -1]
            betas[-2, -1] = params[seq[-1], -1]
            c = np.sum(betas[:, -1])
            betas[:, -1] = betas[:, -1] / c

            for t in range(T - 2, -1, -1):
                start = max(0, L - 2 * (T - t))
                end = min(2 * t + 2, L)

                for s in range(end - 1, -1, -1):
                    l = (s - 1) // 2
                    # blank
                    if s % 2 == 0:
                        if s == L - 1:
                            betas[s, t] = betas[s, t+1] * params[blank, t]
                        else:
                            betas[s, t] = (betas[s, t+1] + betas[s+1, t+1]) * params[blank, t]
                    # same label twice
                    elif s == L - 2 or seq[l] == seq[l+1]:
                        betas[s, t] = (betas[s, t+1] + betas[s+1, t+1]) * params[seq[l], t]
                    else:
                        betas[s, t] = (betas[s, t+1] + betas[s+1, t+1] + betas[s+2, t+1]) * params[seq[l], t]
                # normalize at current time (prevent underflow)
                c = np.sum(betas[start:end, t])
                betas[start:end, t] = betas[start:end, t] / c

            # compute gradient with respect to unnormalized input parameters
            grad = np.zeros(params.shape)
            ab = alphas * betas

            for s in range(L):
                # even states are blanks, odd states are the labels of seq
                row = blank if s % 2 == 0 else seq[(s - 1) // 2]
                grad[row, :] += ab[s, :]
                ab[s, :] = ab[s, :] / params[row, :]
            absum = np.sum(ab, axis=0)

            grad = params - grad / (params * absum)

            # NaNs that were already present in the input propagate silently
            if not (np.isfinite(forward) and np.all(np.isfinite(grad))):
                raise FloatingPointError("non-finite CTC result")
    except FloatingPointError:
        print("Error")
        return -forward, np.zeros_like(params), True
    return -forward, grad, False

## Multidimensional Long Short-Term Memory (MDLSTM)

In [None]:
def ln(tensor, scope=None, epsilon=1e-5):
    """
        Layer-normalizes a 2D tensor along its second axis.

        The tensor is centred and scaled with the mean and variance taken over
        axis 1, then multiplied by a learned 'scale' (initialised to 1) and
        offset by a learned 'shift' (initialised to 0). Both variables live in
        the variable scope '<scope>layer_norm'; a non-string scope is treated
        as an empty prefix. epsilon is added to the variance before the square root.
    """
    assert (len(tensor.get_shape()) == 2)
    mean, variance = tf.nn.moments(tensor, [1], keep_dims=True)

    prefix = scope if isinstance(scope, str) else ''
    width = tensor.get_shape()[1]
    with tf.variable_scope(prefix + 'layer_norm'):
        scale = tf.get_variable('scale',
                                shape=[width],
                                initializer=tf.constant_initializer(1))
        shift = tf.get_variable('shift',
                                shape=[width],
                                initializer=tf.constant_initializer(0))

    normalized = (tensor - mean) / tf.sqrt(variance + epsilon)
    return normalized * scale + shift


class MultiDimensionalLSTMCell(RNNCell):
    """
        LSTM cell with two predecessor states and layer-normalized gates.

        Adapted from TF's BasicLSTMCell to use Layer Normalization.
        Note that state_is_tuple is always True.

        NOTE(review): state_size and output_size are plain methods here; the
        RNNCell base class presumably expects properties -- confirm before
        handing this cell to TensorFlow helpers that read those attributes.
    """

    def __init__(self, num_units, forget_bias=0.0, activation=tf.nn.tanh):
        # num_units: width of the cell and hidden vectors
        # forget_bias: constant added to both forget gates before the sigmoid
        # activation: non-linearity applied to the candidate and the new cell state
        self._activation = activation
        self._forget_bias = forget_bias
        self._num_units = num_units

    def state_size(self):
        units = self._num_units
        return LSTMStateTuple(units, units)

    def output_size(self):
        return self._num_units

    def __call__(self, inputs, state, scope=None):
        """
            Runs one step of the cell.
            inputs (batch,n)
            state: 4-tuple (cell state 1, cell state 2, hidden state 1, hidden state 2)
                   holding the states of the two predecessor cells
            returns (new hidden state, LSTMStateTuple(new cell state, new hidden state))
        """
        with tf.variable_scope(scope or type(self).__name__,reuse = tf.AUTO_REUSE):
            cell_first, cell_second, hidden_first, hidden_second = state

            # no bias in the projection: layer normalization adds one through its shift
            projected = _linear([inputs, hidden_first, hidden_second], 5 * self._num_units, False)

            # five equal slices: input gate, candidate, two forget gates, output gate;
            # each slice is layer-normalized under its own scope prefix
            slices = tf.split(value=projected, num_or_size_splits=5, axis=1)
            prefixes = ('i/', 'j/', 'f1/', 'f2/', 'o/')
            in_gate, candidate, forget_first, forget_second, out_gate = [
                ln(piece, scope=prefix) for piece, prefix in zip(slices, prefixes)
            ]

            # each predecessor cell state is gated by its own forget gate
            kept_first = cell_first * tf.nn.sigmoid(forget_first + self._forget_bias)
            kept_second = cell_second * tf.nn.sigmoid(forget_second + self._forget_bias)
            written = tf.nn.sigmoid(in_gate) * self._activation(candidate)
            new_c = kept_first + kept_second + written

            # the new cell state is layer-normalized before the output gate is applied
            new_h = self._activation(ln(new_c, scope='new_h/')) * tf.nn.sigmoid(out_gate)

            return new_h, LSTMStateTuple(new_c, new_h)


def multi_dimensional_rnn_while_loop(rnn_size, input_data, sh, dims=None, scope_n="layer1"):
    """
        Implements naive multi dimension recurrent neural networks

        rnn_size: the hidden units
        input_data: the data to process of shape [batch,h,w,channels]
        sh: [height,width] of the windows
        dims: dimensions to reverse the input data,eg.
            dims=[False,True,True,False] => true means reverse dimension
            (the batch and channel entries must be False)
        scope_n : the scope

        returns [batch,h/sh[0],w/sh[1],rnn_size] the output of the lstm,
        and the stacked inner states of every step
    """

    with tf.variable_scope("MultiDimensionalLSTMCell-" + scope_n):

        # Create multidimensional cell with selected size
        cell = MultiDimensionalLSTMCell(rnn_size)

        # Get the static shape of the input (batch_size, x, y, channels);
        # the static batch size is not needed, the runtime one is used instead
        _, X_dim, Y_dim, channels = input_data.shape.as_list()
        # Window size
        X_win, Y_win = sh
        # Get the runtime batch size
        batch_size_runtime = tf.shape(input_data)[0]

        # If the input cannot be exactly sampled by the window, we patch it with zeros
        if X_dim % X_win != 0:
            # Get offset size
            offset = tf.zeros([batch_size_runtime, X_win - (X_dim % X_win), Y_dim, channels])
            # Concatenate X dimension
            input_data = tf.concat(axis=1, values=[input_data, offset])
            # Update shape value
            X_dim = input_data.shape[1].value

        # The same but for Y axis
        if Y_dim % Y_win != 0:
            # Get offset size
            offset = tf.zeros([batch_size_runtime, X_dim, Y_win - (Y_dim % Y_win), channels])
            # Concatenate Y dimension
            input_data = tf.concat(axis=2, values=[input_data, offset])
            # Update shape value
            Y_dim = input_data.shape[2].value

        # Get the steps to perform in X and Y axis
        h, w = X_dim // X_win, Y_dim // Y_win

        # Get the number of features (total number of input values per step)
        features = Y_win * X_win * channels

        # Reshape input data to a tensor containing the step indexes and features inputs
        # The batch size is inferred from the tensor size
        x = tf.reshape(input_data, [batch_size_runtime, h, w, features])

        # tf.reverse takes the indices of the axes to flip, so the boolean mask
        # documented above is translated into axis indices once
        reverse_axes = []
        if dims is not None:
            assert dims[0] is False and dims[3] is False
            reverse_axes = [axis for axis, flip in enumerate(dims) if flip]

        # Reverse the selected dimensions
        if reverse_axes:
            x = tf.reverse(x, reverse_axes)

        # Reorder inputs to (h, w, batch_size, features)
        x = tf.transpose(x, [1, 2, 0, 3])
        # Reshape to a one dimensional tensor of (h*w*batch_size , features)
        x = tf.reshape(x, [-1, features])
        # Split tensor into h*w tensors of size (batch_size , features)
        x = tf.split(axis=0, num_or_size_splits=h * w, value=x)

        # Create an input tensor array (literally an array of tensors) to use inside the loop
        inputs_ta = tf.TensorArray(dtype=tf.float32, size=h * w, name='input_ta')
        # Unstack the input X in the tensor array
        inputs_ta = inputs_ta.unstack(x)
        # Create an input tensor array for the states
        states_ta = tf.TensorArray(dtype=tf.float32, size=h * w + 1, name='state_ta', clear_after_read=False)
        # And an other for the output
        outputs_ta = tf.TensorArray(dtype=tf.float32, size=h * w, name='output_ta')

        # initial cell hidden states
        # Write to the last position of the array, the LSTMStateTuple filled with zeros
        states_ta = states_ta.write(h * w, LSTMStateTuple(tf.zeros([batch_size_runtime, rnn_size], tf.float32),
                                                          tf.zeros([batch_size_runtime, rnn_size], tf.float32)))

        # Controls the initial index (named so it does not shadow the time module)
        first_step = tf.constant(0)
        zero = tf.constant(0)

        # Body of the while loop operation that applies the MD LSTM
        def body(time_, outputs_ta_, states_ta_):

            # Steps 0 .. w-1 form the first row and have no neighbour above, so they
            # read the zero state stored at index h*w. Step w is the first step of
            # the second row and its upper neighbour is step 0, hence the strict
            # comparison. Every other step reads the state one row (w steps) back.
            state_up = tf.cond(tf.less(time_, tf.constant(w)),
                               lambda: states_ta_.read(h * w),
                               lambda: states_ta_.read(time_ - tf.constant(w)))

            # The first step of every row has no left neighbour and reads the zero
            # state; any other step reads the immediately preceding state
            state_last = tf.cond(tf.less(zero, tf.mod(time_, tf.constant(w))),
                                 lambda: states_ta_.read(time_ - tf.constant(1)),
                                 lambda: states_ta_.read(h * w))

            # We build the input state in both dimensions:
            # (cell up, cell left, hidden up, hidden left)
            current_state = state_up[0], state_last[0], state_up[1], state_last[1]
            # Now we calculate the output state and the cell output
            out, state = cell(inputs_ta.read(time_), current_state)
            # We write the output to the output tensor array
            outputs_ta_ = outputs_ta_.write(time_, out)
            # And save the output state to the state tensor array
            states_ta_ = states_ta_.write(time_, state)

            # Return outputs and incremented time step
            return time_ + 1, outputs_ta_, states_ta_

        # Loop output condition. The index, given by the time, should be less than the
        # total number of steps defined within the image
        def condition(time_, outputs_ta_, states_ta_):
            return tf.less(time_, tf.constant(h * w))

        # Run the looped operation
        _, outputs_ta, states_ta = tf.while_loop(condition, body, [first_step, outputs_ta, states_ta],
                                                 parallel_iterations=PARL_EXC)

        # Extract the output tensors from the processed tensor array
        outputs = outputs_ta.stack()
        states = states_ta.stack()
        # Reshape outputs to match the shape of the input
        y = tf.reshape(outputs, [h, w, batch_size_runtime, rnn_size])
        # Reorder the dimensions to match the input
        y = tf.transpose(y, [2, 0, 1, 3])
        # Undo the reversal if one was applied
        if reverse_axes:
            y = tf.reverse(y, reverse_axes)
        # Return the output and the inner states
        return y, states


## Utility Methods

In [None]:
# Index of the batch that the next call to next_batch will return.
current = 0
def next_batch(batch_size, inputs=None, labels=None):
    """
    Return the next consecutive mini-batch and advance the module-level cursor.
    Args:
        batch_size: maximum number of examples in the batch.
        inputs: sequence of examples; defaults to the module-level x_train.
        labels: sequence of labels aligned with inputs; defaults to y_train.
    Returns:
        tuple (inputs slice, labels slice). The last batch of a pass may be
        shorter than batch_size; after it the cursor wraps back to the start.
    """
    # `current` is rebound below, so it has to be declared global; without the
    # declaration the first read raises UnboundLocalError.
    global current
    if inputs is None:
        inputs = x_train
    if labels is None:
        labels = y_train

    total = len(inputs)
    start = current * batch_size
    end = min(start + batch_size, total)
    # Advance while examples remain after this batch, otherwise start over.
    current = current + 1 if end < total else 0
    return inputs[start:end], labels[start:end]
def get_sequence_lengths(inputs):
    """
    Measure every sequence of a batch.
    Args:
        inputs: iterable of sized objects (for example a list of lists).
    Returns:
        1-D int64 array holding len() of each element, in order.
    """
    return np.array([len(sequence) for sequence in inputs], dtype=np.int64)

def texts_encoder(texts, first_index=(ord('a') - 1), space_index=0, space_token='<space>'):
    """
    Encode texts to numbers.
    Args:
        texts: list of texts.
        first_index: int.
            Value subtracted from every character code
            (by default one less than ord('a'), so 'a' becomes 1).
        space_index: int.
            Index of 'space'.
        space_token: string.
            'space' representation.
    Returns:
        array of encoded texts: a regular 2-D array when every text has the
        same length, otherwise a 1-D object array holding one integer array
        per text.
    """
    result = []
    for text in texts:
        chars = make_char_array(text, space_token)
        result.append(np.asarray([space_index if ch == space_token else ord(ch) - first_index
                                  for ch in chars]))

    if len({len(item) for item in result}) <= 1:
        return np.array(result)

    # Texts of different lengths cannot form a rectangular array; recent NumPy
    # versions refuse to build the ragged array implicitly, so the object array
    # is filled explicitly.
    ragged = np.empty(len(result), dtype=object)
    for position, item in enumerate(result):
        ragged[position] = item
    return ragged


def sequence_decoder(sequence, first_index=(ord('a') - 1)):
    """
    Turn an encoded sequence back into text.
    Args:
        sequence: list of int.
            Encoded sequence
        first_index: int.
            Offset added to every value before it is converted to a character.
    Returns:
        decoded_text: string.
            The character following 'z' (the blank label) is dropped and the
            character preceding 'a' (the space label) becomes ' '.
    """
    shifted = np.asarray(sequence) + first_index
    decoded_text = ''.join(map(chr, shifted))

    blank_char = chr(ord('z') + 1)
    space_char = chr(ord('a') - 1)
    return decoded_text.replace(blank_char, '').replace(space_char, ' ')
def sparse_tuples_from_sequences(sequences, dtype=np.int32):
    """
    Create a sparse representations of inputs.
    Args:
        sequences: a list of lists of type dtype where each element is a sequence
        dtype: dtype of the returned values array.
    Returns:
        A tuple with (indices, values, shape):
            indices: int64 array of shape (total elements, 2) with one
                (sequence number, position) row per element.
            values: flat array of all elements, in the same order.
            shape: int64 array [number of sequences, longest sequence length].
        An empty batch, or a batch of empty sequences, yields indices of shape
        (0, 2) and a longest length of 0.
    """
    indexes = []
    values = []
    longest = 0

    for n, sequence in enumerate(sequences):
        length = len(sequence)
        indexes.extend((n, position) for position in range(length))
        values.extend(sequence)
        longest = max(longest, length)

    # reshape keeps the (N, 2) layout even when there are no elements at all
    indexes = np.asarray(indexes, dtype=np.int64).reshape(-1, 2)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray([len(sequences), longest], dtype=np.int64)

    return indexes, values, shape


def make_char_array(text, space_token='<space>'):
    """
    Split a text into an array of symbols.
    Args:
        text: string.
            Given text.
        space_token: string.
            Replacement emitted for every ' ' element of the text.
    Returns:
        string array.
            One entry per character, with spaces replaced by space_token.
    """
    pieces = []
    for symbol in text:
        if symbol == ' ':
            pieces.append(space_token)
        else:
            pieces.append(list(symbol))
    return np.hstack(pieces)

## Training

In [None]:
# Training data.
train_inputs = np.array(x_train)
# Label sequences differ in length, so they are stored in an object array;
# building a ragged array implicitly is rejected by recent NumPy versions.
train_labels = np.array(y_train, dtype=object)

# Validation data (the test split doubles as the validation split).
validation_inputs = np.array(x_test)
validation_labels = sparse_tuples_from_sequences(y_test)

# Testing data. The image array is only read, so it is shared with the
# validation array instead of being copied a second time.
test_inputs = validation_inputs
test_labels = sparse_tuples_from_sequences(y_test)
test_texts = y_test

# NOTE(review): len() of an image array is its first dimension, so every
# "sequence length" below equals the image height -- confirm that this is the
# number of frames the CTC ops are meant to consume.
train_sequence_lengths = get_sequence_lengths(train_inputs)
validation_sequence_lengths = get_sequence_lengths(validation_inputs)
test_sequence_lengths = get_sequence_lengths(test_inputs)
# Training cannot proceed without examples; fail with a clear message instead
# of a division by zero after the (empty) epoch.
if len(train_inputs) == 0:
    logging.error("There are no training examples.")
    raise ValueError("There are no training examples.")

with tf.device('/gpu:0'):
    config = tf.ConfigProto()

    graph = tf.Graph()
    with graph.as_default():
        logging.debug("Starting new TensorFlow graph.")
        inputs_placeholder = tf.placeholder(tf.float32, [None, H, W, CH])

        # SparseTensor placeholder required by ctc_loss op.
        labels_placeholder = tf.sparse_placeholder(tf.int32)

        # 1d array of size [batch_size].
        sequence_length_placeholder = tf.placeholder(tf.int32, [None])

        shape = tf.shape(inputs_placeholder)
        batch_size, max_time_steps = shape[0], shape[1]

        # MDLSTM layer over WIND_H x WIND_W windows of the line image.
        rnn_out, _ = multi_dimensional_rnn_while_loop(rnn_size=NUM_HIDDEN, input_data=inputs_placeholder, sh=[WIND_H, WIND_W])

        # Affine projection of every MDLSTM output onto the label classes.
        model_out = slim.fully_connected(inputs=rnn_out,
                                num_outputs=NUM_CLASSES,
                                activation_fn=None)
        # Flatten the two spatial axes into one sequence axis.
        logits = tf.reshape(model_out,[batch_size, -1, NUM_CLASSES])

        # Time is major.
        logits = tf.transpose(logits, (1, 0, 2))
        with tf.name_scope('loss'):
            loss = tf.nn.ctc_loss(labels_placeholder, logits, sequence_length_placeholder)
            cost = tf.reduce_mean(loss)
            tf.summary.scalar("loss", cost)
        optimizer = tf.train.MomentumOptimizer(INITIAL_LEARNING_RATE, MOMENTUM).minimize(cost)

        # CTC decoder.
        decoded, neg_sum_logits = tf.nn.ctc_greedy_decoder(logits, sequence_length_placeholder)

        label_error_rate = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                                           labels_placeholder))

    with tf.Session(config=config, graph=graph) as session:
        logging.debug("Starting TensorFlow session.")

        # Saver op to save and restore all the variables.
        saver = tf.train.Saver()

        # Merge all the summaries and write them out.
        merged_summary = tf.summary.merge_all()

        # Initializing summary writer for TensorBoard.
        summary_writer = tf.summary.FileWriter(SUMMARY_PATH, tf.get_default_graph())

        # Initialize the weights and biases.
        tf.global_variables_initializer().run()

        train_num = train_inputs.shape[0]

        num_batches_per_epoch = int(math.ceil(train_num / BATCH_SIZE))
        print("Starting training")
        for current_epoch in range(NUM_EPOCHS):
            train_cost = 0
            train_label_error_rate = 0
            start_time = time.time()
            print("Epoch:"+str(current_epoch+1)+"/"+str(NUM_EPOCHS))
            for step in range(num_batches_per_epoch):
                # Format batches.
                print("Batch: "+str(step+1)+"/"+str(num_batches_per_epoch), end='\r')
                # The last batch of an epoch may hold fewer than BATCH_SIZE examples.
                batch_start = step * BATCH_SIZE
                indexes = list(range(batch_start, min(batch_start + BATCH_SIZE, train_num)))
                batch_num = len(indexes)

                batch_train_inputs = train_inputs[indexes]
                batch_train_sequence_lengths = train_sequence_lengths[indexes]
                batch_train_targets = sparse_tuples_from_sequences(train_labels[indexes])

                feed = {inputs_placeholder: batch_train_inputs,
                        labels_placeholder: batch_train_targets,
                        sequence_length_placeholder: batch_train_sequence_lengths}

                # One run fetches the cost, the update, the summary and the error
                # rate, so the network is evaluated only once per batch.
                batch_cost, _, summary, batch_label_error_rate = session.run(
                    [cost, optimizer, merged_summary, label_error_rate], feed)
                # Both fetched metrics are batch means; weight them by the real
                # batch size so the epoch average stays exact for a short last batch.
                train_cost += batch_cost * batch_num
                train_label_error_rate += batch_label_error_rate * batch_num

                # Write logs at every iteration.
                summary_writer.add_summary(summary, current_epoch * num_batches_per_epoch + step)
            print("Validating the epoch",end="\r")
            train_cost /= train_num
            train_label_error_rate /= train_num
            validation_feed = {inputs_placeholder: validation_inputs,
                               labels_placeholder: validation_labels,
                               sequence_length_placeholder: validation_sequence_lengths}

            # cost and label_error_rate are already means over the fed examples,
            # so they must not be divided by the number of examples again.
            validation_cost, validation_label_error_rate = session.run([cost, label_error_rate],
                                                                       feed_dict=validation_feed)
            print("Finished validation.","\n","Accuracy: ",str(1-validation_label_error_rate)," Cost:",validation_cost )

            # Output intermediate step information.
            logging.info("Epoch %d/%d (time: %.3f s)",
                         current_epoch + 1,
                         NUM_EPOCHS,
                         time.time() - start_time)
            logging.info("Train cost: %.3f, train label error rate: %.3f",
                         train_cost,
                         train_label_error_rate)
            logging.info("Validation cost: %.3f, validation label error rate: %.3f",
                         validation_cost,
                         validation_label_error_rate)

        # Flush pending summaries and release the event file.
        summary_writer.close()

        # The variables only live inside this session; persist them so the
        # trained weights are not lost when the session closes.
        model_dir = os.path.dirname(MODEL_PATH)
        if model_dir and not os.path.isdir(model_dir):
            os.makedirs(model_dir)
        save_path = saver.save(session, MODEL_PATH)
        logging.info("Model saved in file: %s", save_path)
        print("Finished Training!")
        


## Testing & Prediction

In [None]:
with tf.device('/gpu:0'):
    config = tf.ConfigProto()

    graph = tf.Graph()
    with graph.as_default():
        logging.debug("Starting new TensorFlow graph.")
        inputs_placeholder = tf.placeholder(tf.float32, [None, H, W, CH])

        # SparseTensor placeholder required by ctc_loss op.
        labels_placeholder = tf.sparse_placeholder(tf.int32)

        # 1d array of size [batch_size].
        sequence_length_placeholder = tf.placeholder(tf.int32, [None])

        shape = tf.shape(inputs_placeholder)
        batch_size, max_time_steps = shape[0], shape[1]

        # MDLSTM layer over WIND_H x WIND_W windows of the line image.
        rnn_out, _ = multi_dimensional_rnn_while_loop(rnn_size=NUM_HIDDEN, input_data=inputs_placeholder, sh=[WIND_H, WIND_W])

        # Affine projection of every MDLSTM output onto the label classes.
        model_out = slim.fully_connected(inputs=rnn_out,
                                num_outputs=NUM_CLASSES,
                                activation_fn=None)
        print(model_out.shape)
        # Flatten the two spatial axes into one sequence axis.
        logits = tf.reshape(model_out,[batch_size, -1, NUM_CLASSES])

        # Time is major.
        logits = tf.transpose(logits, (1, 0, 2))
        with tf.name_scope('loss'):
            loss = tf.nn.ctc_loss(labels_placeholder, logits, sequence_length_placeholder)
            cost = tf.reduce_mean(loss)
            tf.summary.scalar("loss", cost)
        # The optimizer is kept so the graph holds the same variables as the
        # training graph and checkpoints stay interchangeable.
        optimizer = tf.train.MomentumOptimizer(INITIAL_LEARNING_RATE, MOMENTUM).minimize(cost)

        # CTC decoder.
        decoded, neg_sum_logits = tf.nn.ctc_greedy_decoder(logits, sequence_length_placeholder)

        label_error_rate = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                                           labels_placeholder))

        # Dense view of the decoded labels, padded with -1; created with the
        # rest of the graph instead of after the session has already run.
        dense_decoded_op = tf.sparse_tensor_to_dense(decoded[0], default_value=-1)

    with tf.Session(config=config, graph=graph) as session:
        logging.debug("Starting TensorFlow session.")

        # Saver op to save and restore all the variables.
        saver = tf.train.Saver()

        # Merge all the summaries and write them out.
        merged_summary = tf.summary.merge_all()

        # Initializing summary writer for TensorBoard.
        summary_writer = tf.summary.FileWriter(SUMMARY_PATH, tf.get_default_graph())

        # Initialize the weights and biases, then load trained weights when a
        # checkpoint is available; otherwise the decoder would only ever see
        # randomly initialised variables.
        tf.global_variables_initializer().run()
        if tf.train.checkpoint_exists(MODEL_PATH):
            saver.restore(session, MODEL_PATH)
            logging.info("Model restored from file: %s", MODEL_PATH)
        else:
            logging.warning("No checkpoint found at %s; decoding with freshly initialised weights.",
                            MODEL_PATH)

        print("Testing...")
        test_feed = {inputs_placeholder: test_inputs,
                     sequence_length_placeholder: test_sequence_lengths}
        # Decoding.
        dense_decoded = session.run(dense_decoded_op, feed_dict=test_feed)
        # Derived from the data instead of a hard-coded example count.
        test_num = len(test_texts)

        for i, sequence in enumerate(dense_decoded):
            # drop the -1 padding added by the dense conversion
            sequence = [s for s in sequence if s != -1]
            decoded_text = sequence_decoder(sequence)

            logging.info("Sequence %d/%d", i + 1, test_num)
            logging.info("Original:\n%s", test_texts[i])
            logging.info("Decoded:\n%s", decoded_text)

        # Release the event file.
        summary_writer.close()

        print("Saving model...")
        # Save model weights to disk.
        model_dir = os.path.dirname(MODEL_PATH)
        if model_dir and not os.path.isdir(model_dir):
            os.makedirs(model_dir)
        save_path = saver.save(session, MODEL_PATH)
        logging.info("Model saved in file: %s", save_path)
        print("END!!")

## Results Visualization

In [None]:
# Plots figs_to_plot randomly chosen test images, one subplot per image, each
# titled with its label and the model's prediction.
# NOTE(review): transform, nd, ctx, net and decode are neither defined nor
# imported anywhere in the visible source, and actual_label[0].replace(...)
# treats the label as a string although y_test holds lists of symbol ids --
# this cell presumably comes from a different notebook and cannot run as is.
figs_to_plot = 10
fig, axs = plt.subplots(figs_to_plot, figsize=(8, 1.3*figs_to_plot))

for i in range(figs_to_plot):
    # pick a random test example
    n = int(random.random()*len(x_test))
    image, actual_label = x_test[n],y_test[n]
    image, _ = transform(image, actual_label)

    # wrap the image into a batch of one and run the model on it
    image = nd.array(image)
    image = image.as_in_context(ctx)
    image = image.expand_dims(axis=0)
    output = net(image)
    predictions = output.softmax().topk(axis=2).asnumpy()
    # undo the HTML-style escapes in the decoded text
    decoded_prediction_text = decode(predictions)[0].replace("&quot", '\"').replace("&amp", "&").replace('";', '\"')
    axs[i].imshow(image.asnumpy().squeeze(), cmap='Greys_r')
    axs[i].set_title("[Label]: {}\n[Pred]:  {}".format(actual_label[0].replace("&quot", '\"').replace("&amp", "&").replace('";', '\"'), decoded_prediction_text),
                    fontdict={"horizontalalignment":"left", "family":"monospace"}, x=0)
    # hide all ticks and tick labels
    axs[i].tick_params(axis='both',       
                       which='both',      
                       bottom=False,      
                       top=False,         
                       left=False,
                       right=False,
                       labelleft=False,
                       labelbottom=False)