In [0]:
!pip install tensorflow==2.0.0-beta1

In [0]:
!pip install pascal-voc-tools

Collecting pascal-voc-tools
[?25l  Downloading https://files.pythonhosted.org/packages/70/39/f3a438c87c3950102119fe247e2980367e474549b10d97b3c8e1de7ea0fc/pascal_voc_tools-0.1.22-py3-none-any.whl (105kB)
[K     |████████████████████████████████| 112kB 3.5MB/s 
Installing collected packages: pascal-voc-tools
Successfully installed pascal-voc-tools-0.1.22


Get COCO Dataset

In [0]:
!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar

In [0]:
!tar -xvf VOCtrainval_11-May-2012.tar

In [0]:
import pascal_voc_tools

In [0]:
xml_path = 'VOCdevkit/VOC2012/Annotations/2007_000027.xml'
parser = pascal_voc_tools.

TypeError: ignored

In [0]:
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

In [0]:
annType = ['segm','bbox','keypoints']
annType = annType[1]      #specify type here
prefix = 'person_keypoints' if annType=='keypoints' else 'instances'

In [0]:
dataDir='../'
dataType='val2014'
annFile = '%sannotations/%s_%s.json'%(dataDir,prefix,dataType)
cocoGt=COCO(annFile)

In [0]:
#initialize COCO detections api
resFile='%s/results/%s_%s_fake%s100_results.json'
resFile = resFile%(dataDir, prefix, dataType, annType)
cocoDt=cocoGt.loadRes(resFile)

In [0]:
imgIds=sorted(cocoGt.getImgIds())
imgIds=imgIds[0:100]
imgId = imgIds[np.random.randint(100)]

In [0]:
# running evaluation
cocoEval = COCOeval(cocoGt,cocoDt,annType)
cocoEval.params.imgIds  = imgIds
cocoEval.evaluate()
cocoEval.accumulate()
cocoEval.summarize()

Random

In [0]:
from keras.applications.vgg16 import VGG16
import keras as k

model = VGG16(weights='imagenet',
              input_shape=(224,224,3), 
              include_top=True)

print(model.summary())

# Faster RCNN

In [0]:
import numpy as np
import keras as k

class AnchorProposals:
  def __init__(self,
               W,
               H,
               input_shape,
               ratios,
               resolutions,
               mini_batch_size):
    """
    variables:
      W = output convolutional width
      H = output convolutional width
      input_shape
      model = a keras applications model
      input_shape = shape of the input image
      ratios = the ratios to use for the anchors
      resolutions = the resolutions to use for the anchors
      mini_batch_size = the mini_batch_size to be returned by generator
    """
    self.W = W
    self.H = H
    self.input_shape = model.input_shape[1:3]
    self.ratios = ratios
    self.resolutions = resolutions
    if mini_batch_size % 2 != 0:
      raise ValueError("mini_batch_size needs to be an even number (% 2 == 0)")
    self.mini_batch_size = mini_batch_size
    self.anchors = self.calc_anchor_pos()
    self.anchor_boxes = np.array([self.calc_anchor_box(self.anchors[it,:])
                                  for it in range(self.anchors.shape[0])])
  
  def calc_anchor_pos(self):
    """pre-generates the list of all possible anchors
    This functions runs only once.
    returns:
      list of all anchors [W,H,position in ratio array, resolution]
    """
    # calculate the anchor positions on the original image from the size
    # of the last convolutional feature map
    Ws = np.arange(0, self.input_shape[0], int(self.input_shape[0]/self.W))
    Hs = np.arange(0, self.input_shape[1], int(self.input_shape[1]/self.H))
    
    # broadcast each of the anchor variable arrays to combined size
    # of [len Ws, len Hs, len ratios, len resolutions]
    a = np.meshgrid(Ws,Hs,np.arange(len(self.ratios)),self.resolutions)
    
    # stack the anchor variable arrays to a combined array
    # of size [4, len Ws, len Hs, len ratios, len resolutions]
    a = np.stack(a)
    
    # reshape the combined array to a 2D array with [4,N]
    a = a.reshape((a.shape[0],np.prod(a.shape[1:])))
    
    # transpose to desired [N,4] and return
    a = a.T
    return a
  
  def calc_anchor_box(self, anchor, minmax=False):
    """from the anchors calculate the proper box coordinates
    
    """
    width = anchor[3] * self.ratios[anchor[2]][0]
    height = anchor[3] * self.ratios[anchor[2]][1]
    if minmax:
      x = anchor[0] - int(width/2)
      y = anchor[1] - int(height/2)
    else:
      x = anchor[0]
      y = anchor[1]
      
    return x, y, width, height
  
  # TODO maybe move this to the C++ function from pycocotools
  @staticmethod
  def IoU(box1, box2):
    """intersect over union calculator between bounding box 1 & 2
    variable:
      a = [x_center, y_center, width, height]
      b = [x_center, y_center, width, height]
    """
    # determine the (x, y)-coordinates of the intersection rectangle
    x1 = max(box1[0] - int(box1[2]/2),
             box2[0] - int(box2[2]/2)) 
    y1 = max(box1[1] - int(box1[3]/2),
             box2[1] - int(box2[3]/2)) 
    x2 = min(box1[0] + int(box1[2]/2),
             box2[0] + int(box2[2]/2)) 
    y2 = min(box1[1] + int(box1[3]/2),
             box2[1] + int(box2[3]/2))

    # compute the area of intersection rectangle
    interArea = max(0, x2 - x1) * max(0, y2 - y1)

    # compute the area of both the prediction and ground-truth
    # rectangles
    box1Area = box1[2] * box1[3]
    box2Area = box2[2] * box2[3]

    # compute the intersection over union by taking the intersection
    # area and dividing it by the sum of prediction + ground-truth
    # areas - the interesection area
    iou = interArea / float(box1Area + box2Area - interArea)

    # return the intersection over union value
    return iou
    
  # TODO: maybe allow parallel or vectorized for this
  # TODO: maybe shuffle async after yield
  # TODO: we probably need converters between the different box styles
  # TODO: rewrite this with the followig order per image:
  #       1.) calc iou for all anchors in vectorized form
  #       2.) do non-maximum supression on anchors per gt
  #       3.) sample a even mini_batch by rounds per gt
  # TODO: anchor proposals during training vs during inference ???
  #       how does it work?
  def generator(self, gts, parallel=False, lazy=True):
    """generates mini_batches with if possible half negative half positive
    samples, if not enough possitive, pads with negative samples.
    negative == IoU < 0.3 | positive == IoU > 0.7
    
    variables:
      gts: list of the ground truth boxes [[x_min, y_min, width, height], ...]
      parallel: creates mini_batch in parallel, only use if not
                one generator per thread
      lazy: only searches for positive and negative sample until
            mini_batch_size is reached
    """
    # the max number of neg & pos elements in a perfect mini_batch
    b_max = int(self.mini_batch_size / 2)
    
    while True:
      # shuffle list of anchors to make sure we recieve different ones every time
      np.random.shuffle(self.anchors)

      # count the number of neg & pos samples
      neg_cnt = 0
      pos_cnt = 0
      
      # preallocate a mini_batch list
      a_boxes = np.zeros((self.mini_batch_size, 4), dtype=np.float16)
      gts_boxes = np.zeros((self.mini_batch_size, 4), dtype=np.float16)
      gts_cls = np.zeros((self.mini_batch_size,), dtype=np.float16)
      # iterate through all anchor_boxes
      for it in range(self.anchor_boxes.shape[0]):
        
        # extract the anchor box
        a_box = self.anchor_boxes[it, :]
        
        # for every ground trough calculate the iou and
        # find the max iou for the anchor
        ious = [self.IoU(a_box, gt) for gt in gts]
        iou_max = max(ious)
        gt_max = gts[ious.index(iou_max)]

        # positive samples are iou > 0.7, negative samples iou < 0.3
        # continue to add samples as long as there are not enough pos or neg
        # prefill the array if not completely full with the more abundant one
        #TODO: if performance is to low add the anchor with highest IoU for a gt as positive
        if (iou_max > 0.7 and 
            (pos_cnt < b_max or pos_cnt + neg_cnt < self.mini_batch_size)):
          
          # add positive samples from the left
          a_boxes[pos_cnt,:] = a_box
          gts_boxes[pos_cnt,:] = gt_max
          gts_cls[pos_cnt] = 1.0
          
          # increase positives counter
          pos_cnt += 1
        
        elif (iou_max < 0.3 and 
              (neg_cnt < b_max or pos_cnt + neg_cnt < self.mini_batch_size)):
          
          #increase negatives counter
          neg_cnt += 1

          # add negative samples from the right
          a_boxes[-1 * pos_cnt,:] = a_box
          gts_boxes[-1 * pos_cnt,:] = [0,0,0,0]
          gts_cls[-1 * pos_cnt] = 0.0
          
        elif pos_cnt >= b_max and neg_cnt >= b_max:
          break
      
      yield a_boxes, gts

In [0]:
from keras.engine.base_layer import Layer

class ROIPooling(Layer):
  def __init__(self,
               mini_batch_size,
               W = 7,
               H = 7,
               strides=None,
               padding='valid',
               data_format=None,
               **kwargs):
    super(ROIPooling, self).__init__(name=name, **kwargs)
  
    self.mini_batch_size = mini_batch_size
    self.W = W
    self.H = H
    self.strides = strides
    self.padding = padding
    self.data_format = data_format
    self.built = True
  
  def build(input_shape):
    self.built = True
  
  def call(self, inputs)
    assert isinstance(inputs, list)
    
    x = inputs[0]
    rois = inputs[1]
    
    if rois.shape[0] / self.mini_batch_size < x.shape[0]:
      raise TypeError("batch size, num ROIs and mini-batch size missmatch, " +  
                      "there must be ROIs({}) /".format(rois.shape[0]) +
                      "mini_batch_size({}) = ".format(self.mini_batch_size) + 
                      "batches({}) ".format(rois.shape[0] / self.mini_batch_size) +
                      "in the inputs, but recieved {} as batch size".format(x.shape[0]))
    
    outputs = k.backend.placeholder(self.compute_output_shape(x.shape),
                                    dtype=np.float16,
                                    name="ROI_pool_output")
    
    #TODO check if this loop can be unroled and vectorized
    for roi_it in range(rois.shape[0])
    
      roi = rois[roi_it]
    
      pool_w = roi.shape[2] / self.W
      pool_h = roi.shape[3] / self.H

      if self.data_format == 'channels_last':
        pool_shape = (1, pool_w, pool_h, 1)
        strides = (1,) + self.strides + (1,)
      else:
        pool_shape = (1, 1, pool_w, pool_h)
        strides = (1, 1) + self.strides
      
      outputs[roi_it, :] = nn.max_pool(
          x[int(roi_it/self.mini_batch_size),
            roi[0]:roi[0]+roi[2],
            roi[1]:roi[1]+roi[3],
            :]
          ksize=pool_shape,
          strides=strides,
          padding=self.padding.upper())
      
      return outputs
    
  def compute_output_shape(self, input_shape):
    assert isinstance(input_shape, list)
    return input_shape[1][0] * self.mini_batch_size + [self.W, self.H] + input_shape[0][3]

In [0]:
class RPN(Layer):
    def __init__(self,
                 feature_vector_length = 256,
                 conv_filter_size = 3,
                 num_anchors = 128)

      self.feature_vector_length = feature_vector_length
      self.conv_filter_size = conv_filter_size
      self.num_anchors = num_anchors

    def build(input_shape):
      self.built = True
      
    def call(self, x):
      #TODO: maybe regularization
      x = k.layers.Conv2D(self.feature_vector_length,
                          (self.conv_filter_size, self.conv_filter_size),
                          activation='relu',
                          kernel_initializer='normal',
                          name="RPN_base_conv")(x)
      
      x_reg = k.layers.Conv2D(4 * self.num_anchors,
                              (1, 1),
                              activation='linear',
                              kernel_initializer='normal',
                              name="RPN_reg")(x)
      
      x_cls = k.layers.Conv2D(self.num_anchors,
                              (1, 1),
                              activation='sigmoid',
                              kernel_initializer='normal',
                              name="RPN_cls")(x)
      
      return x_reg, x_cls
    
    def compute_output_shape(self, _):
      return [4 * self.num_anchors, self.num_anchors]

In [0]:
def RPN2ROI(x_reg, x_cls):
    treshold = 0.5
    mask = Lambda(lambda x: backend.greater(x, treshold))(x_cls)
    rois = Lambda(lambda x: tf.boolean_mask(x, mask))(x_reg)
    return rois
    
#TODO We still need to go from ROI to last convolutional layer level

In [0]:
# TODO: gibt es bei Keras eine loss base class die verwendet werden kann?
def smooth_L1_loss(x)
    """ calculate smooth L1 loss according to Ross Girshick, 2015, Fast-RCNN
    per box

    variable:
      x = t_predicted - t_groundtruth; tensor with shape (num_predictions, 4)
    returns:
      the loss per box as a tensor with shape (num_predictions)

    smooth L1 loss:
            | 0.5 * x^2  if |x| < 1
    l(x) =  |
            | |x| - 0.5  else
    """
    # placeholder for loss elements
    l_elem = k.backend.placeholder(x.shape, dtype=np.float16)

    # index to access the |x| < 1 values
    ind = [!k.backend.less(k.backend.abs(x), 1)]

    # calculate loss parts if |x| < 1 first and sum them up over x,y,w,h
    l_elem[ind] = 0.5 * k.backend.pow(x[ind], 2)

    # add sum of loss parts for |x| > 1 
    l_elem[!ind] = k.backend.abs(x[!ind]) - 0.5

    # sum over x,y,w,h for loss per box
    l = k.backend.sum(l_elem, axis = 1)

    #return loss
    return l


class MultiTaskLoss:
  """calculate multitask loss for rpn
  
  keras does not allow

  variables:
    gts_reg: groundtruth box coordinates  


  """
  def __init__(self, pos_thresh = 0.5, lam = 1.0, **kwargs)
    self.pos_thresh = pos_thresh
    self.lam = lam
    
  #(gts_reg, a_reg, x_reg, gts_cls, x_cls, ):
  def __call__(y_true, y_pred):
    """
    calculate the Faster RCNN Multitask loss for object detection, 
    Classification Accuracy & Position Regression Accuracy
    
    variables:
      y_true = [gts_reg, gts_anchors, gts_cls]
      gts_reg = groundtruth regression coordinates [x, y, width, height]
      gts_anchors = per groundtruth regression coordinates the corresponding anchor [x, y, width, height]
      gts_cls = groundtruth class
      y_pred = [y_reg, y_anchors, y_cls]
      y_reg = predicted regression coordinates [x, y, width, height]
      y_anchors = per predicted regression coordinates the corresponding anchor [x, y, width, height]
      y_cls = predicted class

    t_coordinates:
      t_x = (x - a_x) / a_width
      t_y = (y - a_y) / a_height
      t_width = log(width / a_widht)
      t_height = log(height / a_height)
      
    returns:
      multitask loss according to Girschick - Fast-RCNN, 2015 
    """
    # transform the predicted box coordinates to the t coordinates
    y_reg, y_a, y_cls = y_pred
    t = k.backend.placeholder(gts_reg.shape, dtype=np.float16)
    t[:,0] = (y_reg[:,0] - y_a[:,0]) / y_a[:,2]
    t[:,1] = (y_reg[:,1] - y_a[:,1]) / y_a[:,3]
    t[:,2] = k.backend.log(y_reg[:,2] / y_a[:,2])
    t[:,3] = k.backend.log(y_reg[:,3] / y_a[:,3])
    
    #transform the groundtruth box coordinaties to the t coordinates
    gts_reg, gts_a, gts_cls = y_true
    ts = k.backend.placeholder(gts_reg.shape, dtype=np.float16)
    ts[:,0] = (gts_reg[:,0] - gts_a[:,0]) / gts_a[:,2]
    ts[:,1] = (gts_reg[:,1] - gts_a[:,1]) / gts_a[:,3]
    ts[:,2] = k.backend.log(gts_reg[:,2] / gts_a[:,2])
    ts[:,3] = k.backend.log(gts_reg[:,3] / gts_a[:,3])

    # calculate the regression part of the loss as smooth L1 loss
    #TODO this should be 0 if background else 1
    l_reg = k.layers.multiply(gts_cls, self.smooth_L1_loss(t - ts))

    # turn the class predictions from confidence into catigorical
    #TODO this is not categorical if more than two classes
    x_cls_cat = k.backend.round(y_cls - (pos_thresh - 0.5))

    # calculate the catigorical cross_entropy / logloss
    l_cls = k.losses.categorical_crossentropy(gts_cls, x_cls_cat)

    # calculate the combined loss
    l_rpn = l_cls / gts_cls.shape[0] + lam * l_reg / gts_reg.shape[0]

    return l_rpn


In [0]:
from keras.model import Model

#TODO create opportunity to load weights from xxx dataset
def FasterRCNN(base_model,
               num_classes,
               feature_vector_length = 256,
               num_anchors = 128,
              ):
    """
    
    variables:
      base_model: a keras applications image classification model
      num_classes: the number of classes to be detected
      feature_vector_length: the feature vector size before prediction
      num_anchors: max number of anchors to propose
    
    """
  
    #TODO extract from the model the right layer, i.e check if -1 is a max_pool than it should be the -2 layer
    x = base_model.output
    
    #rpn
    rpn = RPN(feature_vector_length,
              num_anchors)
    x_reg, x_cls = rpn(x)
    
    #create a rpn model, since we need to train on the rpn level as well
    rpn_model = models.Model(base_model.inputs, [x_cls, x_reg], name='RPN')
    
    #turn selected rpn into rois
    rois = RPN2ROI(x_reg, x_cls)
    
    #roipooling
    roipooling = ROIPooling(mini_batch_size)
    x = roipooling([x, rois])
    
    #detection block of the model
    x = l.layers.Flatten()(x)
    #TODO: the SVD stuff from the Fast RCNN paper
    #TODO: what is the proper size for this
    #TODO: this has to not only accept batch but as well num rois like (batch, num_rois, ...)
    x = k.layers.Dense(4096,
                       activation=None,
                       name = "Detection_bloc_dense_1")(x)
    x = k.layers.Dense(feature_vector_length,
                       activation=None,
                       name = "Detection_bloc_dense_2")(x)
    x_cls = k.layers.Dense(num_classes + 1,
                           activation="softmax",
                           name = "Detection_bloc_cls")
    x_reg = k.layers.Dense(4,
                           activation="linear",
                           name = "Detection_bloc_reg")
    
    # create the complete model
    model = models.Model(base_model.inputs, [x_cls, x_reg], name='Faster-RCNN')
    
    return model, rpn_model, base_model

SyntaxError: ignored

In [0]:
def RPN_data_generator(data, W, H, input_shape, ratios, resolutions, mini_batch_size):
  """training data generator for the RPN
  takes training data as images and boxes
  and extracts anchorProposals as groundtruth
  from those.
  
  variables:
    data: input data ??? as tfrecord i guess, what else could it be ???
    **kwargs: variables for anchorProposals
  """
  
  anchorProposals = AnchorProposals(W=W,
                                    H=H,
                                    input_shape,
                                    ratios,
                                    resolutions,
                                    mini_batch_size
                                   )
  
  while True:
    #extract one batch of images with bounding boxes
    
    
    #extract the minibatch of anchors for each image
    anchorProposals.generator(gts=)
    
    #yield a batch of minibatches 
    #(if there is more than one image, make the batch dimensions batch_size * mini_batch_size
    yield True

In [0]:
def ObjectDetection_data_generator(data):
  
  
  while True:
    #extract one batch of images with bounding boxes
    
    
    #yield a batch of images with bounding boxes
    yield True

In [0]:
def train_FasterRCNN(base_model,
                     model_kwargs,
                     rpn_training_kwargs,
                     detection_training_kwargs,
                     data,
                     train_from_detection = False,
                     detection_weights = None,
                     epochs = 100,
                     steps_rpn = 100,
                     steps_detection = 100,
                     ratios = [(1,1),(1,2),(2,1)],
                     resolutions = [128,256,512],
                     mini_batch_size = 256,
                     **kwargs
                    ):
  """
  Ren et al - Faster RCNN - 2017:
  In the first step, we train the RPN as described above. This network is
  initialized with an ImageNet- pre-trained model and fine-tuned end-to-end
  for the region proposal task. In the second step, we train a separate
  detection network by Fast R-CNN using the proposals generated by the step-1
  RPN. This detection network is also initialized by the ImageNet-pre-trained
  model. At this point the two networks do not share conv layers. In the
  third step, we use the detector network to initialize RPN training, but we
  fix the shared conv layers and only fine-tune the layers unique to RPN. Now
  the two networks share conv layers. Finally, keeping the shared conv layers
  fixed, we fine-tune the fc layers of the Fast R-CNN. As such, both networks
  share the same conv layers and form a unified network.
  
  variables:
    model: the Faster-RCNN model
    model_rpn: the RPN model
    base_model: the image classification model
    train_from_detection: whether to use object detection or image classification
        weights for finetuning
    detection_weights: The type of detection weights if train_from_detection is 
        True. Any of ["Pascal","COCO","OID"]
    metrics: keras callback for metrics to use
    ratios: ratios for anchorproposals
    resolutions: resolutions for anchorproposals
    mini_batch_size: mini_batch_size for anchorproposals
    **kwargs: keras.model.fit_generator variables after epochs
  """
  # Step 1: a image_net based rpn
  gen_RPN = RPN_data_generator(data=data,
                               W=int(base_model.layers[-2].output.shape[1]),
                               H=int(base_model.layers[-2].output.shape[2]),
                               input_shape=base_model.input.shape,
                               ratios=ratios,
                               resolutions=resolutions,
                               mini_batch_size=mini_batch_size)
  
  _, model_rpn_tmp =  FasterRCNN(base_model,
                             num_classes,
                             **model_kwargs)
  
  model_rpn_tmp.fit_generator(
              generator = gen_RPN,
              steps_per_epoch=steps_rpn,
              epochs=epochs,
              **rpn_training_kwargs
          )
  
  #--> we need to get the ROIS from this model an funnel them into the ROIS layer in model
  
  #TODO ensure this is a deep copy not a shared one
  # Step 2: a image_net based detection with model_rpn ROI
  gen = ObjectDetection_data_generator(data=data)
  
  model, model_rpn =  FasterRCNN(base_model,
                         num_classes,
                         **model_kwargs)
  
  model.fit_generator(
              generator = gen,
              steps_per_epoch=steps_detection,
              epochs=epochs,
              **detection_training_kwargs
          )
  
  #TODO are Step 3 & 4 sequential or iterative per epoch??
  # Step 3: keep model from step 2, finetune rpn layer
  gen_RPN = RPN_data_generator(data=data,
                               W=int(base_model.layers[-2].output.shape[1]),
                               H=int(base_model.layers[-2].output.shape[2]),
                               input_shape=base_model.input.shape,
                               ratios=ratios,
                               resolutions=resolutions,
                               mini_batch_size=mini_batch_size)
  
  #TODO freeze the base model layers
  model_rpn_tmp.fit_generator(
              generator = gen_RPN,
              steps_per_epoch=steps_rpn,
              epochs=epochs,
              **rpn_training_kwargs
          )
  
  # Step 4: keep model from step 3, finetune detection layer
  gen = ObjectDetection_data_generator(data=data)
  
  #TODO make sure base_model & rpn layers are frozen
  model.fit_generator(
              generator = gen,
              steps_per_epoch=steps_detection,
              epochs=epochs,
              **detection_training_kwargs
          )
  
  
  return model
  

# future

In [0]:
def main(_):

  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
  tf.tpu.experimental.initialize_tpu_system(resolver)
  strategy = tf.distribute.Strategy(resolver)

  data = SyntheticDataset(FLAGS.batch_size)
  
  with strategy.scope():
    model = model_cls(weights=None, input_shape=data.input_shape,
                      classes=data.num_classes)

    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
    model.compile(loss="categorical_crossentropy",
                  optimizer=optimizer,
                  metrics=["accuracy"])

    history = model.fit(
        data.train_dataset,
        epochs=FLAGS.epochs,
        steps_per_epoch=data.num_train_images // FLAGS.batch_size,
        validation_data=data.test_dataset,
        validation_steps=data.num_test_images // FLAGS.batch_size)

    return history.history