<a href="https://colab.research.google.com/github/wesley34/comp3414_course_material/blob/master/ch11_python_Tensorflow_2.0/MaskedRCNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Work in progress

In [None]:
## HYPERPARAMETER

In [None]:
import os
import random
import datetime
import re
import math
import logging
import numpy as np
from collections import OrderedDict
import skimage.color
import skimage.io
import skimage.transform
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras import layers as KL
from tensorflow.keras import models as KM



In [None]:
#TODO
# Scratch demo: reorder the rows of each batch item by that item's own index
# vector, one tf.gather call per (item, indices) pair.
deltas = np.array([
    [[1,2,3,4],[2,2,3,4]],
    [[5,6,7,8],[2,6,7,8]],
])
ix = np.array([[1,0],[1,0]])
for data,i in zip(deltas,ix):
  reordered = tf.gather(data,i)
  print(reordered.numpy())

[[2 2 3 4]
 [1 2 3 4]]
[[2 6 7 8]
 [5 6 7 8]]


In [None]:
# SETUP IMAGE
# NOTE(review): IMAGE_MIN_DIM / IMAGE_MAX_DIM / IMAGE_RESIZE_MODE / IMAGE_MIN_SCALE
# look like the min_dim / max_dim / mode / min_scale arguments of resize_image,
# but nothing in the visible code passes them yet -- confirm.
IMAGE_MIN_DIM = 800
IMAGE_MAX_DIM = 1024
# Side length that MaskRCNN.build checks for divisibility by 64.
IMAGE_DIM = IMAGE_MAX_DIM
IMAGE_RESIZE_MODE = "square"
IMAGE_MIN_SCALE = 0

# SETUP BACKBONE NETWORK
# Architecture name handed to resnet_graph ("resnet50" or "resnet101").
BACKBONE = "resnet101"
# Divisors used by compute_backbone_shapes to get one feature-map size per entry.
BACKBONE_STRIDES = [4,8,16,32,64]

# SETUP RPN
# Passed to build_rpn_model as anchor_stride (stride of the shared 3x3 RPN conv).
RPN_ANCHOR_STRIDE = 1
# Anchor sizes; presumably one per pyramid level -- not read in the visible code.
RPN_ANCHOR_SCALES = (32,64,128,256,512)
# Aspect ratios of the RPN anchors.
RPN_ANCHOR_RATIO = [0.5,1,2]
# Number of anchors to be chosen per image (not read in the visible code).
RPN_TRAIN_ANCHORS_PER_IMAGE = 256

# SETUP ROI
# How many ROIs are kept per image, and which fraction of them should be
# positive (per the original notes; neither is read in the visible code).
TRAIN_ROI_PER_IMAGE = 200
ROI_POSITIVE_RATIO = 0.33

# SET UP NMS
# Candidate values for the proposal_count given to ProposalLayer.
POST_NMS_ROIS_TRAINING = 2000
POST_NMS_ROIS_INFERENCE = 1000
# Threshold handed to ProposalLayer. "MNS" is a typo for "NMS"; the name is
# kept because other cells reference it.
RPN_MNS_THRESHOLD = 0.3
# Filter count of the FPN convolutions and input depth of the RPN model.
# "FREATURE" is a typo for "FEATURE"; kept because other cells reference it.
FPN_FREATURE = 256
DETECTION_MAX_INSTANCE = 100 

MAX_GT_INSTANCE = 100

DETECTION_MIN_CONFIDENCE = 0.7

# NOTE(review): "MNMS" is presumably a typo for "NMS"; not read in the visible code.
DETECTION_MNMS_THRESHOLD = 0.3

# ROI pooling setup
POOL_SIZE = 7
MASK_POOL_SIZE = 14
MASK_SHAPE = [28,28]

# Per-coordinate scale factors; ProposalLayer multiplies the RPN box deltas by
# RPN_BBOX_STD_DEV. BBOX_STD_DEV is not read in the visible code.
RPN_BBOX_STD_DEV = np.array([0.1,0.1,0.2,0.2])
BBOX_STD_DEV = np.array([0.1,0.1,0.2,0.2])

USE_MINI_MASK = True
MINI_MASK_SHAPE = (56,56) 

## Section 1 : Resnet

In [None]:
## construction of resnet

In [None]:
def compute_backbone_shapes(image_shape):
  """Return the [height, width] of the feature map for every backbone stride.

  Each entry is ceil(image_shape[0] / stride), ceil(image_shape[1] / stride)
  for the strides listed in BACKBONE_STRIDES, collected into a numpy array
  with one row per stride.
  """
  height,width = image_shape[0],image_shape[1]
  shapes = []
  for stride in BACKBONE_STRIDES:
    rows = int(math.ceil(height/stride))
    cols = int(math.ceil(width/stride))
    shapes.append([rows,cols])
  return np.array(shapes)

In [None]:
# resnet identity_block
def identity_block(input_tensor,kernel_size,filters,stage,block,use_bias=True,train_bn=True):
  """Residual block whose shortcut is the unmodified input.

  The main path is 1x1 conv -> BN -> ReLU -> kxk conv -> BN -> ReLU -> 1x1 conv -> BN,
  and its result is added to input_tensor before the final ReLU.

  input_tensor : tensor fed to the block; it is added to the main path, so the
                 third filter count has to match its channel count
  kernel_size  : int, side length of the middle convolution kernel
  filters      : sequence of three ints, filter counts of the three convolutions
  stage, block : int / str, only used to build the layer names
  use_bias     : bool, forwarded to every Conv2D
  train_bn     : bool, forwarded as `training` to every BatchNormalization
  Returns the output tensor of the block.
  """
  nb_filter1,nb_filter2,nb_filter3 = filters
  conv_name_base = "res" + str(stage) + block + "_branch" #conv layer name
  bn_name_base = "bn" + str(stage) + block + "_branch" #bn layer name

  x = KL.Conv2D(nb_filter1,(1,1),name=conv_name_base + '2a',use_bias=use_bias)(input_tensor)
  x = KL.BatchNormalization(name=bn_name_base + '2a')(x,training=train_bn)
  # The activation layer has to be applied to x; assigning the bare layer
  # object would replace the tensor with a Layer instance.
  x = KL.Activation('relu')(x)

  x = KL.Conv2D(nb_filter2,(kernel_size,kernel_size),padding="same",name=conv_name_base+"2b",use_bias=use_bias)(x)
  x = KL.BatchNormalization(name=bn_name_base + '2b')(x,training=train_bn)
  x = KL.Activation('relu')(x)

  # Continue from x (not input_tensor), otherwise the first two convolutions
  # are disconnected from the output.
  x = KL.Conv2D(nb_filter3,(1,1),name=conv_name_base + '2c',use_bias=use_bias)(x)
  x = KL.BatchNormalization(name=bn_name_base + '2c')(x,training=train_bn)

  x = KL.Add()([x,input_tensor])
  x = KL.Activation('relu',name="res"+str(stage)+block+"_out")(x)

  return x

In [None]:
## resnet conv_block
def conv_block(input_tensor,kernel_size,filters,stage,block,strides=(2,2),use_bias=True,train_bn=True):
  """Residual block whose shortcut is a strided 1x1 convolution.

  The main path is the same three-convolution stack as identity_block, with
  `strides` applied by its first convolution. The shortcut is
  Conv2D(filters[2], 1x1, strides) -> BN, so both branches end with the same
  spatial size and channel count before they are added.

  input_tensor : tensor fed to the block
  kernel_size  : int, side length of the middle convolution kernel
  filters      : sequence of three ints, filter counts of the three convolutions
  stage, block : int / str, only used to build the layer names
  strides      : strides of the first main-path conv and of the shortcut conv
  use_bias     : bool, forwarded to every Conv2D
  train_bn     : bool, forwarded as `training` to every BatchNormalization
  Returns the output tensor of the block.
  """
  nb_filter1,nb_filter2,nb_filter3 = filters
  conv_name_base = "res" + str(stage) + block + "_branch" #conv layer name
  bn_name_base = "bn" + str(stage) + block + "_branch" #bn layer name

  # main path; the first convolution applies the stride
  x = KL.Conv2D(nb_filter1,(1,1),strides=strides ,name=conv_name_base + '2a',use_bias=use_bias)(input_tensor)
  x = KL.BatchNormalization(name=bn_name_base + '2a')(x,training=train_bn)
  # Apply the activation to x instead of assigning the bare layer object.
  x = KL.Activation('relu')(x)

  x = KL.Conv2D(nb_filter2,(kernel_size,kernel_size),padding="same",name=conv_name_base+"2b",use_bias=use_bias)(x)
  x = KL.BatchNormalization(name=bn_name_base + '2b')(x,training=train_bn)
  x = KL.Activation('relu')(x)

  # Continue from x; reading input_tensor here would skip the strided
  # convolutions and leave the two branches with different spatial sizes.
  x = KL.Conv2D(nb_filter3,(1,1),name=conv_name_base + '2c',use_bias=use_bias)(x)
  x = KL.BatchNormalization(name=bn_name_base + '2c')(x,training=train_bn)

  # shortcut path
  shortcut = KL.Conv2D(nb_filter3,(1,1),strides=strides,name=conv_name_base + "1",use_bias=use_bias)(input_tensor)
  shortcut = KL.BatchNormalization(name=bn_name_base + '1')(shortcut,training=train_bn)

  x = KL.Add()([x,shortcut])
  x = KL.Activation('relu',name="res"+str(stage)+block+"_out")(x)

  return x

In [None]:
## support resnet 50 / 101 , statement support for stage 5 feature map output
def resnet_graph(input_image,architecture,stage5=False,train_bn=True):
  """Build the ResNet backbone and return its per-stage feature maps.

  @param
  FORMAT: param_name : type || meaning
  -----------------------------------------------------------------
  input_image : tensor || image that is fed to the network
  architecture : string || "resnet50" or "resnet101"
  stage5 : bool || build stage 5 as well; when False, C5 is None
  train_bn : bool || forwarded as `training` to the batch-normalization layers
  -----------------------------------------------------------------
  @output
  list of feature maps, one per stage
  [C1,C2,C3,C4,C5]
  -----------------------------------------------------------------
  """
  # check architecture
  assert architecture in {"resnet50","resnet101"}

  # first feature layer
  x = KL.ZeroPadding2D((3,3))(input_image)
  x = KL.Conv2D(64,(7,7),strides=(2,2),name="conv1",use_bias=True)(x)
  x = KL.BatchNormalization(name="bn_conv1")(x,training=train_bn)
  x = KL.Activation("relu")(x)
  C1 = x = KL.MaxPooling2D((3,3),strides=(2,2),padding="same")(x)

  # second feature layer
  # conv1 and the max-pool already divide the resolution by 4, so stage 2 must
  # not downsample again; otherwise C2 would not match BACKBONE_STRIDES[0] == 4.
  x = conv_block(x,3,[64,64,256],stage=2,block="a",strides=(1,1),train_bn=train_bn)
  x = identity_block(x,3,[64,64,256],stage=2,block="b",train_bn=train_bn)
  C2 = x = identity_block(x,3,[64,64,256],stage=2,block="c",train_bn=train_bn)

  # third feature layer
  # Block letters must be unique within a stage because they become layer names.
  x = conv_block(x,3,[128,128,512],stage=3,block="a",train_bn=train_bn)
  x = identity_block(x,3,[128,128,512],stage=3,block="b",train_bn=train_bn)
  x = identity_block(x,3,[128,128,512],stage=3,block="c",train_bn=train_bn)
  C3 = x = identity_block(x,3,[128,128,512],stage=3,block="d",train_bn=train_bn)

  # fourth feature layer
  x = conv_block(x,3,[256,256,1024],stage=4,block="a",train_bn=train_bn)
  # number of identity blocks that follow the conv block in stage 4
  block_count = {"resnet50":5,"resnet101":22}[architecture]
  for i in range(block_count):
    # chr(98) == "b": name the blocks b, c, d, ... so every layer name is unique
    x = identity_block(x,3,[256,256,1024],stage=4,block=chr(98+i),train_bn=train_bn)
  C4 = x

  # fifth feature layer
  if stage5:
    x = conv_block(x,3,[512,512,2048],stage=5,block="a",train_bn=train_bn)
    x = identity_block(x,3,[512,512,2048],stage=5,block="b",train_bn=train_bn)
    C5 = x = identity_block(x,3,[512,512,2048],stage=5,block="c",train_bn=train_bn)
  else:
    C5 = None
  return [C1,C2,C3,C4,C5]

## Helper Function

In [None]:
#Image mean RGB
MEAN_PIXEL = np.array([123.7,116.8,103.9])

def mold_image(images):
  """Convert `images` to float32 and subtract the per-channel MEAN_PIXEL."""
  as_float = images.astype(np.float32)
  return as_float - MEAN_PIXEL
def unmodel_image(normalized_images):
  """Add MEAN_PIXEL back to `normalized_images` and cast the result to uint8."""
  restored = normalized_images + MEAN_PIXEL
  return restored.astype(np.uint8)
def resize_image(image,min_dim=None,max_dim=None,min_scale=None,mode="square"):
  """Scale and pad (or crop) an image according to `mode`.

  image     : numpy array whose first two axes are height and width; the
              padding modes pad three axes, so they expect an (h, w, c) array
  min_dim   : when truthy, the image is scaled up so that its shorter side is
              at least min_dim (it is never scaled down by this rule)
  max_dim   : in "square" mode, the longer side is limited to max_dim and the
              output is padded to max_dim x max_dim
  min_scale : when truthy, lower bound for the scale factor
  mode      : "none"   - return the image untouched
              "square" - scale, then zero-pad to max_dim x max_dim
              "pad64"  - scale, then zero-pad height and width up to the next
                         multiple of 64
              "crop"   - scale, then take a random min_dim x min_dim crop

  Returns (image, window, scale, padding, crop):
    image   : the processed image, cast back to the input dtype
    window  : (y1, x1, y2, x2) of the image content inside the output
    scale   : scale factor that was applied
    padding : [(top, bottom), (left, right), (0, 0)]
    crop    : (y, x, min_dim, min_dim) in "crop" mode, otherwise None

  Raises Exception for an unknown mode and AssertionError in "pad64" mode when
  min_dim is not a multiple of 64.
  """
  image_dtype = image.dtype
  h,w = image.shape[:2]
  window = (0,0,h,w)
  scale = 1
  padding = [(0,0),(0,0),(0,0)]
  crop = None
  if mode == "none":
    return image,window,scale,padding,crop
  if min_dim:
    # scale up only, never down
    scale = max(1,min_dim/min(h,w))
  if min_scale and scale < min_scale:
    scale = min_scale

  # if exceed max dim
  if max_dim and mode == "square":
    image_max = max(h,w)
    if round(image_max * scale) > max_dim:
      scale = max_dim / image_max

  if scale != 1:
    image = skimage.transform.resize(
        image,(round(h*scale),round(w*scale)),
        order=1,mode="constant",preserve_range=True
    )

  if mode == "square":
    # get new height and width
    h,w = image.shape[:2]
    top_pad = (max_dim-h)//2
    # Both axes are padded up to max_dim. Using min_dim here made the bottom
    # padding negative whenever min_dim < max_dim.
    bottom_pad = max_dim-h-top_pad
    left_pad = (max_dim-w)//2
    right_pad = max_dim-w-left_pad
    padding = [(top_pad,bottom_pad),(left_pad,right_pad),(0,0)]
    image = np.pad(image,padding,mode="constant",constant_values=0)
    window = (top_pad,left_pad,h+top_pad,w+left_pad)
  elif mode == "pad64":
    h,w = image.shape[:2]
    assert min_dim % 64 == 0, "Minimum dimension must be multiple of 64"
    if h%64>0:
      max_h = h-(h%64)+64
      top_pad = (max_h-h)//2
      bottom_pad = max_h - top_pad - h
    else:
      top_pad = bottom_pad = 0
    if w%64>0:
      max_w = w-(w%64)+64
      left_pad = (max_w-w)//2
      # the remainder after the left padding (was computed from right_pad itself)
      right_pad = max_w-w-left_pad
    else:
      left_pad = right_pad = 0
    padding = [(top_pad,bottom_pad),(left_pad,right_pad),(0,0)]
    image = np.pad(image,padding,mode="constant",constant_values=0)
    window = (top_pad,left_pad,top_pad+h,left_pad+w)
  elif mode == "crop":
    # random crop
    h,w = image.shape[:2]
    y = random.randint(0,(h-min_dim))
    x = random.randint(0,(w-min_dim))
    crop = (y,x,min_dim,min_dim)
    image = image[y:y+min_dim,x:x+min_dim]
    window = (0,0,min_dim,min_dim)
  else:
    raise Exception("Mode {} not support".format(mode))
  return image.astype(image_dtype),window,scale,padding,crop


In [None]:
def compose_image_meta(image_id,original_image_shape,image_shape,window,scale,active_class_ids):
  """Pack the attributes of one image into a single flat numpy array.

  The fields are concatenated in this order:
    image_id (1), original_image_shape (3), image_shape (3), window (4),
    scale (1), active_class_ids (one entry per class).
  The shapes and the window are converted with list(), so any sequence works.
  """
  meta = np.array(
      [image_id]+                  # 1
      list(original_image_shape)+  # 3
      list(image_shape)+           # 3
      list(window)+                # 4
      [scale]+                     # 1
      list(active_class_ids)       # num_class size
  )
  return meta

def log(text,array=None):
  """Print `text`, followed by a summary of `array` when one is given.

  With an array, the text is left-justified to 25 characters and followed by
  the array's shape, minimum, maximum and dtype. For an empty array the
  min/max columns are left blank, because "" cannot be formatted as a float.
  """
  if array is not None:
    text = text.ljust(25)
    if array.size:
      stats = "min: {:10.5f} max {:10.5f}".format(array.min(),array.max())
    else:
      stats = "min: {:10} max {:10}".format("","")
    text += "shape: {:20} {} {}".format(str(array.shape),stats,array.dtype)
  print(text)

def run_graph(MaskRCNNobj,images,outputs,batch_size,image_metas=None):
  """Prepare the evaluation of selected tensors of the model (work in progress).

  MaskRCNNobj : object exposing `keras_model` and `mold_inputs(images)`
  images      : images to feed; molded through MaskRCNNobj.mold_inputs unless
                image_metas is given, in which case they are used as they are
  outputs     : (name, tensor) pairs; converted to an OrderedDict, every
                tensor must not be None
  batch_size  : currently unused
  image_metas : optional precomputed image metadata

  TODO: the backend function `kf` is built and the inputs are prepared, but the
  function is not run yet and nothing is returned.
  """
  model = MaskRCNNobj.keras_model
  outputs = OrderedDict(outputs)
  for o in outputs.values():
    assert o is not None

  kf = K.function(model.inputs,list(outputs.values()))

  if image_metas is None:
    molded_images,image_metas,_ = MaskRCNNobj.mold_inputs(images)
  else:
    # the caller supplied metadata, so the images are taken as already molded
    molded_images = images

## Section2 : RPN

In [None]:
"""
function rpn_graph
@ param
feature_map : output layer from resnet-FPN
anchors_per_location: int , how much anchors you need for 1 location box
anchor_stride, int , the stride for anchor to conv the photo
@ output
logits,class,bbox 
"""
def rpn_graph(feature_map,anchors_per_location,anchor_stride):
  """Build the RPN head on top of one feature map.

  feature_map          : input tensor of the head
  anchors_per_location : int, number of anchors predicted per spatial position
  anchor_stride        : stride of the shared 3x3 convolution

  Returns [rpn_class_logits, rpn_probs, rpn_bbox]:
    rpn_class_logits : [batch, anchors, 2] raw two-class scores
    rpn_probs        : [batch, anchors, 2] softmax of the logits
    rpn_bbox         : [batch, anchors, 4] box refinement values
  """
  shared = KL.Conv2D(512,(3,3),padding="same",activation="relu",strides=anchor_stride,name="rpn_conv_shared")(feature_map)

  # step 1 : anchor scores, two channels (one per class) for every anchor
  x = KL.Conv2D(2*anchors_per_location,(1,1),padding="valid",activation="linear",name="rpn_class_raw")(shared)

  # Flatten the spatial and anchor axes to [batch, anchors, 2]. The batch size
  # is read with tf.shape because the static shape is None for a dynamic batch.
  rpn_class_logits = KL.Lambda(lambda t: tf.reshape(t,[tf.shape(t)[0],-1,2]))(x)

  # softmax over the two classes of every anchor
  rpn_probs = KL.Activation("softmax",name="rpn_class_xxx")(rpn_class_logits)

  # step 2 : box refinement, four values for every anchor
  x = KL.Conv2D(4*anchors_per_location,(1,1),padding="valid",activation="linear",name="rpn_bbox_pred")(shared)

  rpn_bbox = KL.Lambda(lambda t: tf.reshape(t,[tf.shape(t)[0],-1,4]))(x)

  return [rpn_class_logits,rpn_probs,rpn_bbox]
"""
function build_rpn_model
@ param
depth : int , channel-in, say rgb means channel is 3, greytone means 1
"""
def build_rpn_model(anchor_stride,anchors_per_location,depth):
  """Wrap rpn_graph in a Keras model named "rpn_model".

  anchor_stride        : forwarded to rpn_graph
  anchors_per_location : forwarded to rpn_graph
  depth                : int, channel count of the input feature map

  The model input has shape [None, None, depth]; the model outputs are the
  three tensors returned by rpn_graph.
  """
  feature_map_in = KL.Input(shape=[None,None,depth],name="input_rpn_feature_map")
  rpn_outputs = rpn_graph(feature_map_in,anchors_per_location,anchor_stride)
  return KM.Model(inputs=feature_map_in,outputs=rpn_outputs,name="rpn_model")


def apply_box_deltas_graph(boxes,deltas):
  """Apply refinement deltas to boxes.

  boxes  : [N, (y1, x1, y2, x2)]
  deltas : [N, (dy, dx, log(dh), log(dw))]

  The centre of each box is shifted by dy / dx times the box height / width and
  the size is multiplied by exp(log(dh)) / exp(log(dw)).
  Returns a [N, (y1, x1, y2, x2)] tensor.
  """
  height = boxes[:,2] - boxes[:,0]
  width = boxes[:,3] - boxes[:,1]
  center_y = boxes[:,0] + 0.5 * height
  center_x = boxes[:,1] + 0.5 * width

  # shift the centre and rescale the size
  center_y += deltas[:,0] * height
  center_x += deltas[:,1] * width
  height *= tf.exp(deltas[:,2])
  width *= tf.exp(deltas[:,3])

  # back to corner coordinates
  y1 = center_y - 0.5*height
  y2 = center_y + 0.5*height
  x1 = center_x - 0.5*width
  x2 = center_x + 0.5*width

  result = tf.stack([y1,x1,y2,x2],axis=1,name="apply_box_delta_out")
  return result

def clip_boxes_graph(boxes,window):
  """Clip boxes so that they lie inside a window.

  boxes  : [N, (y1, x1, y2, x2)]
  window : (wy1, wx1, wy2, wx2)
  Returns a [N, (y1, x1, y2, x2)] tensor with y limited to [wy1, wy2] and x
  limited to [wx1, wx2].
  """
  wy1,wx1,wy2,wx2 = tf.split(window,4)
  y1,x1,y2,x2 = tf.split(boxes,4,axis=1)
  # clamp every coordinate to the window range of its own axis
  y1 = tf.maximum(tf.minimum(y1,wy2),wy1)
  y2 = tf.maximum(tf.minimum(y2,wy2),wy1)
  x1 = tf.maximum(tf.minimum(x1,wx2),wx1)
  x2 = tf.maximum(tf.minimum(x2,wx2),wx1)

  # keep the input column order (y1, x1, y2, x2)
  clipped = tf.concat([y1,x1,y2,x2],axis=1,name="clipped_box")
  clipped.set_shape((clipped.shape[0],4))
  return clipped

In [None]:
class ProposalLayer(tf.keras.layers.Layer):
  """Turn RPN outputs into a fixed number of region proposals per image.

  proposal_count : int, number of proposals returned per image
  nms_threshold  : float, IoU threshold for non-max suppression
  batch_size     : int, number of images per batch (each image is processed
                   separately)
  """

  def __init__(self,proposal_count,nms_threshold,batch_size,**kwargs):
    super(ProposalLayer,self).__init__(**kwargs)
    self.proposal_count = proposal_count
    self.nms_threshold = nms_threshold
    self.batch_size = batch_size

  def call(self,inputs):
    """Select, refine, clip and filter the anchors.

    inputs[0] : rpn_probs [batch, num_anchors, 2]
    inputs[1] : rpn_bbox  [batch, num_anchors, (dy, dx, log(dh), log(dw))]
    inputs[2] : anchors   [batch, num_anchors, (y1, x1, y2, x2)]
                NOTE(review): the clipping window below is (0, 0, 1, 1), which
                assumes normalized anchor coordinates -- confirm.

    Returns proposals [batch, proposal_count, (y1, x1, y2, x2)], zero-padded
    when fewer than proposal_count boxes survive non-max suppression.
    """
    scores = inputs[0][:,:,1] # [batch, num_anchors], second class channel
    deltas = inputs[1]
    # scale the deltas with the module-level RPN_BBOX_STD_DEV
    deltas = deltas * tf.cast(np.reshape(RPN_BBOX_STD_DEV,[1,1,4]),deltas.dtype)
    anchors = inputs[2]

    pre_nms_limit = tf.minimum(6000,tf.shape(anchors)[1]) # get 6000 or less
    ix = tf.nn.top_k(scores,pre_nms_limit,sorted=True,name="top_anchors").indices # base on score , get best 6000
    # batch_dims=1 gathers every image with its own index vector
    scores = tf.gather(scores,ix,batch_dims=1)
    deltas = tf.gather(deltas,ix,batch_dims=1)
    pre_nms_anchors = tf.gather(anchors,ix,batch_dims=1,name="pre_nms_anchors")

    window = np.array([0,0,1,1],dtype=np.float32)
    proposals = []
    for b in range(self.batch_size):
      boxes = apply_box_deltas_graph(pre_nms_anchors[b],deltas[b])
      boxes = clip_boxes_graph(boxes,window)
      keep = tf.image.non_max_suppression(
          boxes,scores[b],self.proposal_count,self.nms_threshold,
          name="rpn_non_max_suppression")
      kept = tf.gather(boxes,keep)
      # pad with zero boxes so every image yields exactly proposal_count rows
      pad = tf.maximum(self.proposal_count - tf.shape(kept)[0],0)
      proposals.append(tf.pad(kept,[(0,pad),(0,0)]))
    return tf.stack(proposals,axis=0)

   

## Build all

In [None]:
class MaskRCNN():
  """Assembles the network graph from the backbone, FPN, RPN and proposal layer.

  mode       : only "inference" is accepted
  model_dir  : stored on the instance
  num_class  : int, number of classes (sizes the image-meta input)
  batch_size : int, forwarded to ProposalLayer
  """
  def __init__(self,mode,model_dir,num_class,batch_size):
    assert mode == "inference"

    self.mode = mode
    self.num_class = num_class
    self.batch_size = batch_size
    self.model_dir = model_dir
    # NOTE(review): set_log_dir is not defined in the visible part of this class.
    self.set_log_dir()
    self.keras_model = self.build(mode=mode)

  # image_preprocessing
  def mold_inputs(self,images):
    """Resize, normalize and describe a list of images.

    Every image goes through resize_image (with the module-level IMAGE_*
    settings) and mold_image, and gets a metadata row from compose_image_meta
    with image id 0 and all-zero active class ids.

    Returns (molded_images, image_metas, windows) as stacked numpy arrays.
    Stacking requires all molded images to share one shape.
    """
    molded_images = []
    image_metas = []
    windows = []
    for image in images:
      # window is the position of the image content inside the molded image,
      # scale is the resize factor that was applied
      molded_image,window,scale,padding,crop = resize_image(
          image,
          min_dim=IMAGE_MIN_DIM,
          max_dim=IMAGE_MAX_DIM,
          min_scale=IMAGE_MIN_SCALE,
          mode=IMAGE_RESIZE_MODE)
      molded_image = mold_image(molded_image)
      image_meta = compose_image_meta(
          0,image.shape,molded_image.shape,window,scale,
          np.zeros([self.num_class],dtype=np.int32))
      molded_images.append(molded_image)
      image_metas.append(image_meta)
      windows.append(window)
    return np.stack(molded_images),np.stack(image_metas),np.stack(windows)

  def build(self,mode):
    """Build the graph up to the region proposals.

    NOTE: the visible code ends after the proposal layer; no keras Model is
    assembled or returned in this section.
    """
    h,w = IMAGE_DIM , IMAGE_DIM
    if h % 2**6 != 0 or w % 2**6 != 0:
      raise Exception("Image size must be divisible by 64")

    input_image = KL.Input(shape=[None,None,3],name="input_image")

    # length of the vector produced by compose_image_meta:
    # id (1) + original shape (3) + shape (3) + window (4) + scale (1) + classes
    img_meta_size = 1 + 3 + 3 + 4 + 1 + self.num_class
    input_image_meta = KL.Input(shape=[img_meta_size],name="input_image_meta")

    if mode == "inference":
      input_anchors = KL.Input(shape=[None,4],name="input_anchors")

    # SECTION 1 : build resnet-FPN

    # step 1 build resnet
    _,C2,C3,C4,C5 = resnet_graph(input_image,BACKBONE,stage5=True,train_bn=False)

    # step 2 build FPN
    # top-down pathway: upsample the coarser level and add the 1x1-convolved
    # backbone feature map of the same resolution
    P5 = KL.Conv2D(FPN_FREATURE,(1,1),name="fpn_c5p5")(C5)
    P4 = KL.Add(name="fpn_p4add")([KL.UpSampling2D(size=(2,2),name="fpn_p5upsampled")(P5),
                                   KL.Conv2D(FPN_FREATURE,(1,1),name="fpn_c4p4")(C4)
                                   ])
    P3 = KL.Add(name="fpn_p3add")([KL.UpSampling2D(size=(2,2),name="fpn_p4upsampled")(P4),
                                   KL.Conv2D(FPN_FREATURE,(1,1),name="fpn_c3p3")(C3)
                                   ])
    P2 = KL.Add(name="fpn_p2add")([KL.UpSampling2D(size=(2,2),name="fpn_p3upsampled")(P3),
                                   KL.Conv2D(FPN_FREATURE,(1,1),name="fpn_c2p2")(C2)
                                   ])

    # conv to all P

    P2 = KL.Conv2D(FPN_FREATURE,(3,3),padding="same",name="fpn_p2")(P2)
    P3 = KL.Conv2D(FPN_FREATURE,(3,3),padding="same",name="fpn_p3")(P3)
    P4 = KL.Conv2D(FPN_FREATURE,(3,3),padding="same",name="fpn_p4")(P4)
    P5 = KL.Conv2D(FPN_FREATURE,(3,3),padding="same",name="fpn_p5")(P5)
    # P6 is P5 subsampled with stride 2
    P6 = KL.MaxPooling2D(pool_size=(1,1),strides=2,name="fpn_p6")(P5)

    rpn_feature_maps = [P2,P3,P4,P5,P6]
    mrcnn_feature_maps = [P2,P3,P4,P5]
    #--------------------------------------------------------------------------------------------------------------------------
    # END OF RESNET-FPN

    # SECTION2 : build forward rpn
    # define rpn Regional proposal network; one anchor per aspect ratio at every location
    rpn = build_rpn_model(RPN_ANCHOR_STRIDE,len(RPN_ANCHOR_RATIO),FPN_FREATURE)

    layer_outputs = []
    for p in rpn_feature_maps:
      #use all features in rpn_feature maps to feed the rpn
      layer_outputs.append(rpn([p]))

    output_names = ["rpn_class_logits","rpn_class","rpn_bbox"]
    # regroup [[logits1,class1,bbox1],[logits2,class2,bbox2],...] into
    # [[logits1,logits2,...],[class1,class2,...],[bbox1,bbox2,...]]
    outputs = list(zip(*layer_outputs))
    # concatenate every group along the anchor axis; the order follows output_names
    outputs = [KL.Concatenate(axis=1,name=n)(list(o)) for o,n in zip(outputs,output_names)]
    rpn_class_logits, rpn_class, rpn_bbox = outputs

    proposal_count = POST_NMS_ROIS_INFERENCE if mode == "inference" else POST_NMS_ROIS_TRAINING

    if mode == "inference":
      anchors = input_anchors

    rpn_rois = ProposalLayer(proposal_count=proposal_count,nms_threshold=RPN_MNS_THRESHOLD,batch_size=self.batch_size,name="ROI")([rpn_class,rpn_bbox,anchors])