# General functions along with localization - network

In [14]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.applications import VGG16
import tensorflow.keras.backend as K

In [15]:
IMGSIZE = (224, 224 ,3)

In [16]:
OUT_IMG_SIZE = (150 ,150 ,3)

In [17]:
IMG_PIXEL_RATIO = int(IMGSIZE[0]/OUT_IMG_SIZE[0])

In [18]:
Input_Layer = layers.Input(shape = IMGSIZE)

In [19]:
#BATCH_SIZE = 2

In [61]:
### Localization network architecture ###
def get_Localization_net(trainable = False):
    """ This is the localization network . It uses VGG16 network , prtrained on Imagenet dataset and it generates  
    parameters for transformations
    Input
    -----
    - Input Layer (added by default)
    Returns
    -------
    - 3D-Tensor of dimensions (B , 2 ,3)
    """
    output_bias = tf.keras.initializers.Constant([1, 0, 0, 0, 1, 0])
    #vgg = VGG16(include_top =False ,weights ='imagenet' ,input_shape = IMGSIZE )
    #vgg.trainable = trainable
    model = tf.keras.Sequential([
       layers.Conv2D(8, kernel_size=7, input_shape=(IMGSIZE[0], IMGSIZE[0], 3), 
                      activation="relu", kernel_initializer="he_normal"),
        layers.MaxPool2D(strides=2),
        layers.Conv2D(10, kernel_size=5, activation="relu", kernel_initializer="he_normal"),
        layers.MaxPool2D(strides=2),
       layers.Dense(32 , activation = keras.layers.LeakyReLU(0.3),kernel_initializer="he_normal"), 
       layers.Dense(6 , kernel_initializer="zeros", bias_initializer=output_bias) 
    ])
    #a = layers.Flatten()(vgg.output)
    #a = layers.Dense(128 , activation = keras.layers.LeakyReLU(0.3),kernel_initializer="he_normal")(a)
    #a = layers.Dense(64 , activation = keras.layers.LeakyReLU(0.3),kernel_initializer="he_normal")(a)
    #a = layers.Dense(32 , activation = keras.layers.LeakyReLU(0.3),kernel_initializer="he_normal")(a)
    #a = layers.Dense(6 , kernel_initializer="zeros", bias_initializer=output_bias)(a)
    #a = layers.Reshape((2, 3), input_shape=(6,))(a)
    return model

In [62]:
get_Localization_net(trainable = False).summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_4 (Conv2D)            (None, 218, 218, 8)       1184      
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 109, 109, 8)       0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 105, 105, 10)      2010      
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 52, 52, 10)        0         
_________________________________________________________________
dense_28 (Dense)             (None, 52, 52, 32)        352       
_________________________________________________________________
dense_29 (Dense)             (None, 52, 52, 6)         198       
Total params: 3,744
Trainable params: 3,744
Non-trainable params: 0
____________________________________________________

In [55]:
def get_localization_network():
    localization = tf.keras.Sequential([
        layers.Conv2D(8, kernel_size=7, input_shape=(IMGSIZE[0], IMGSIZE[0], 3), 
                      activation="relu", kernel_initializer="he_normal"),
        layers.MaxPool2D(strides=2),
        layers.Conv2D(10, kernel_size=5, activation="relu", kernel_initializer="he_normal"),
        layers.MaxPool2D(strides=2)
    ])
    return localization
def get_affine_params():
    output_bias = tf.keras.initializers.Constant([1, 0, 0, 0, 1, 0])
    fc_loc = tf.keras.Sequential([
        layers.Dense(32, activation="relu", kernel_initializer="he_normal"),
        layers.Dense(3 * 2, kernel_initializer="zeros", bias_initializer=output_bias)
    ])

    return fc_loc

# Spatial Transformation utilities

In [22]:
##pixel value generator##
def get_pixel_value(img, x, y):
    """
    Utility function to get pixel value for coordinate
    vectors x and y from a  4D tensor image.
    Input
    -----
    - img: tensor of shape (B, H, W, C)
    - x: flattened tensor of shape (B*H*W,)
    - y: flattened tensor of shape (B*H*W,)
    Returns
    -------
    - output: tensor of shape (B, H, W, C)
    """
    shape = tf.shape(x)
    batch_size = shape[0]
    height = shape[1]
    width = shape[2]

    batch_idx = tf.range(0, batch_size)
    batch_idx = tf.reshape(batch_idx, (batch_size, 1, 1))
    b = tf.tile(batch_idx, (1, height, width))

    indices = tf.stack([b, y, x], 3)

    return tf.gather_nd(img, indices)

In [23]:
## transformation grid generator##
def affine_grid_generator(height, width, theta):
    """
    This function returns a sampling grid, which when
    used with the bilinear sampler on the input feature
    map, will create an output feature map that is an
    affine transformation [1] of the input feature map.
    Input
    -----
    - height: desired height of grid/output. Used
      to downsample or upsample.
    - width: desired width of grid/output. Used
      to downsample or upsample.
    - theta: affine transform matrices of shape (num_batch, 2, 3).
      For each image in the batch, we have 6 theta parameters of
      the form (2x3) that define the affine transformation T.
    Returns
    -------
    - normalized grid (-1, 1) of shape (num_batch, 2, H, W).
      The 2nd dimension has 2 components: (x, y) which are the
      sampling points of the original image for each point in the
      target image.
    Note
    ----
    [1]: the affine transformation allows cropping, translation,
         and isotropic scaling.
    """
    num_batch = tf.shape(theta)[0]

    # create normalized 2D grid
    x = tf.linspace(-1.0, 1.0, width)
    y = tf.linspace(-1.0, 1.0, height)
    x_t, y_t = tf.meshgrid(x, y)

    # flatten
    x_t_flat = tf.reshape(x_t, [-1])
    y_t_flat = tf.reshape(y_t, [-1])

    # reshape to [x_t, y_t , 1] - (homogeneous form)
    ones = tf.ones_like(x_t_flat)
    sampling_grid = tf.stack([x_t_flat, y_t_flat, ones])

    # repeat grid num_batch times
    sampling_grid = tf.expand_dims(sampling_grid, axis=0)
    sampling_grid = tf.tile(sampling_grid, tf.stack([num_batch, 1, 1]))

    # cast to float32 (required for matmul)
    theta = tf.cast(theta, 'float32')
    sampling_grid = tf.cast(sampling_grid, 'float32')

    # transform the sampling grid - batch multiply
    batch_grids = tf.matmul(theta, sampling_grid)
    # batch grid has shape (num_batch, 2, H*W)

    # reshape to (num_batch,2, H, W)
    batch_grids = tf.reshape(batch_grids, [num_batch, 2, height, width])

    return batch_grids

In [24]:
def bilinear_sampler(img, x, y):
    """
    Performs bilinear sampling of the input images according to the
    normalized coordinates provided by the sampling grid. Note that
    the sampling is done identically for each channel of the input.
    To test if the function works properly, output image should be
    identical to input image when theta is initialized to identity
    transform.
    Input
    -----
    - img: batch of images in (B, H, W, C) layout.
    - grid: x, y which is the output of affine_grid_generator.
    Returns
    -------
    - out: interpolated images according to grids. Same size as grid.
    """
    H = tf.shape(img)[1]
    W = tf.shape(img)[2]
    max_y = tf.cast(H - 1, 'int32')
    max_x = tf.cast(W - 1, 'int32')
    zero = tf.zeros([], dtype='int32')

    # rescale x and y to [0, W-1/H-1]
    x = tf.cast(x, 'float32')
    y = tf.cast(y, 'float32')
    x = 0.5 * ((x + 1.0) * tf.cast(max_x-1, 'float32'))
    y = 0.5 * ((y + 1.0) * tf.cast(max_y-1, 'float32'))

    # grab 4 nearest corner points for each (x_i, y_i)
    x0 = tf.cast(tf.floor(x), 'int32')
    x1 = x0 + 1
    y0 = tf.cast(tf.floor(y), 'int32')
    y1 = y0 + 1

    # clip to range [0, H-1/W-1] to not violate img boundaries
    x0 = tf.clip_by_value(x0, zero, max_x)
    x1 = tf.clip_by_value(x1, zero, max_x)
    y0 = tf.clip_by_value(y0, zero, max_y)
    y1 = tf.clip_by_value(y1, zero, max_y)

    # get pixel value at corner coords
    Ia = get_pixel_value(img, x0, y0)
    Ib = get_pixel_value(img, x0, y1)
    Ic = get_pixel_value(img, x1, y0)
    Id = get_pixel_value(img, x1, y1)

    # recast as float for delta calculation
    x0 = tf.cast(x0, 'float32')
    x1 = tf.cast(x1, 'float32')
    y0 = tf.cast(y0, 'float32')
    y1 = tf.cast(y1, 'float32')

    # calculate deltas
    wa = (x1-x) * (y1-y)
    wb = (x1-x) * (y-y0)
    wc = (x-x0) * (y1-y)
    wd = (x-x0) * (y-y0)

    # add dimension for addition
    wa = tf.expand_dims(wa, axis=3)
    wb = tf.expand_dims(wb, axis=3)
    wc = tf.expand_dims(wc, axis=3)
    wd = tf.expand_dims(wd, axis=3)

    # compute output
    out = tf.add_n([wa*Ia, wb*Ib, wc*Ic, wd*Id])

    return out

# Spatial Transformation Network forward function

In [45]:
def stn(x):
    localization = get_localization_network()
    fc_loc = get_affine_params()
    
    xs = localization(x)
    xs = tf.reshape(xs, (-1, 10 * 3 * 3 ))
    theta = fc_loc(xs)
    theta = tf.reshape(theta, (-1, 2, 3))
    
    grid = affine_grid_generator(OUT_IMG_SIZE[0], OUT_IMG_SIZE[1], theta)
    x_s = grid[:, 0, :, :]
    y_s = grid[:, 1, :, :]
    x = bilinear_sampler(x, x_s, y_s)

    return x

In [51]:
def get_training_model(use_stn=True):
    inputs = layers.Input(shape = (IMGSIZE[0], IMGSIZE[0], 3))
    if use_stn:
        outputs = stn(inputs)
    else:
        outputs = inputs
    return tf.keras.Model(inputs, outputs)    

In [56]:
stn_model = get_training_model()
stn_model.summary()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
sequential_2 (Sequential)       (None, 52, 52, 10)   3194        input_8[0][0]                    
__________________________________________________________________________________________________
tf.reshape_7 (TFOpLambda)       (None, 90)           0           sequential_2[0][0]               
__________________________________________________________________________________________________
sequential_3 (Sequential)       (None, 6)            3110        tf.reshape_7[0][0]               
____________________________________________________________________________________________

In [64]:
def lambda_function(lst):
    Input_Layer = lst[0]
    trans_matrix = lst[1]
    trans_matrix = tf.reshape(trans_matrix, (-1, 2, 3))
    grid = affine_grid_generator(OUT_IMG_SIZE[0],OUT_IMG_SIZE[1], trans_matrix)
    x_s = grid[:, 0, :, :]
    y_s = grid[:, 1, :, :]
    x = bilinear_sampler(Input_layer, x_s, y_s)
    return x

In [63]:
def get_STN(Input_layer = Input_Layer):
    """
    This the model generator function.
    Input
    -----
    - None
    Returns
    -------
    - tensorflow.keras.model (Trainable)
    """
    trans_matrix = get_Localization_net(trainable = False)
    x = layers.Lambda(lambda_function)([Input_layer , trans_matrix ])
    #grid = affine_grid_generator(OUT_IMG_SIZE[0],OUT_IMG_SIZE[1], trans_matrix)
    #x_s = grid[:, 0, :, :]
    #y_s = grid[:, 1, :, :]
    #x = bilinear_sampler(Input_layer, x_s, y_s)
    return x

In [65]:
stn = keras.Model(inputs = Input_Layer ,outputs = get_STN(Input_layer = Input_Layer))
stn.summary()

TypeError: Failed to convert object of type <class 'tensorflow.python.keras.engine.sequential.Sequential'> to Tensor. Contents: <tensorflow.python.keras.engine.sequential.Sequential object at 0x00000221E1D07E20>. Consider casting elements to a supported type.

In [25]:
trans_matrix = get_Localization_net(trainable = False ,Input_Layer = Input_Layer)
trans_matrix.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0   

In [21]:
## Testing ##
model = get_Localization_net(trainable = False)
#x = tf.random.normal((1,IMGSIZE[0] ,IMGSIZE[1],IMGSIZE[2]))
#print(x.shape)
#y = model(x)
#print(y.shape)
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0   