# Initialisation

In [None]:
!pip3 uninstall keras-nightly
!pip3 uninstall -y tensorflow
!pip3 install keras==2.1.6
!pip3 install tensorflow==1.15.0
!pip3 install h5py==2.10.0

In [None]:
# Fetch the Mask R-CNN implementation that provides the `mrcnn` package imported below.
!git clone https://github.com/matterport/Mask_RCNN.git

In [None]:
# Work from the repository root so the next cell finds setup.py and requirements.txt.
%cd Mask_RCNN

In [None]:
# Install the cloned package, then the dependencies listed by the repository.
!python setup.py install
!pip install -r requirements.txt

In [None]:
# Manga109 tooling used by the import cell below.
# NOTE(review): versions are not pinned here, unlike the Keras/TensorFlow installs above.
!pip install elementpath
!pip install manga109api

In [None]:
import elementpath
from xml.etree import ElementTree
import manga109api
from google.colab import files
from os import listdir
from numpy import zeros, asarray, expand_dims, mean
from numpy import asarray
from mrcnn.utils import Dataset, extract_bboxes, compute_ap
from mrcnn.config import Config
from mrcnn.visualize import display_instances
from mrcnn.model import MaskRCNN, load_image_gt, mold_image
import matplotlib.pyplot as pyplot
from matplotlib.patches import Rectangle, Arrow
import math

In [None]:
# Root of the Manga109 copy; later cells append "images/<book>/" and "annotations/<book>/" to it,
# so the trailing slash matters.
# NOTE(review): the path lives on Google Drive, but no cell in this notebook mounts the drive - confirm it is mounted beforehand.
root_dir = "/content/drive/MyDrive/NRP/Project/Manga109/"
# Parser whose `books` attribute is iterated by the dataset and association cells.
p = manga109api.Parser(root_dir=root_dir)

# Reformat Manga109 annotations

In [None]:
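# One-off preprocessing, kept for reference: splits each book's annotation XML into one file
# per page, named new_<book><page index>.xml (the naming that MangaDataset reads back).
# %cd /content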
# for book in p.books:
#   tree = ElementTree.parse(root_dir + "annotations/" + book + ".xml")
#   root = tree.getroot()

#   %mkdir $book
#   %cd /content/$book

#   for page in root.findall('.//page'):
#     new_xml = page
#     b_xml = ElementTree.tostring(new_xml)
#     with open("new_" + book + str(page.attrib["index"]) + ".xml", "wb") as f:
#       f.write(b_xml)
  
#   %cd /content

In [None]:
# for book in p.books:
#   !zip -r /content/$book /content/$book

In [None]:
# print('\n'.join(p.books))

# Prepare Dataset

In [None]:
class MangaDataset(Dataset):
  """Manga109 pages with face and text boxes, exposed through the Mask R-CNN Dataset API.

  Relies on two notebook globals: `p` (the manga109api parser, for the list of
  books) and `root_dir` (dataset root containing "images/" and "annotations/").
  Annotations are read from per-page XML files named new_<book><page>.xml whose
  root element carries the page `width` and `height` attributes.
  """

  # Annotation tags turned into instances; the tuple order fixes the class ids (1, 2).
  CLASS_TAGS = ("face", "text")
  # Pages numbered below this go to the training split, all others to test/val.
  TRAIN_PAGE_LIMIT = 50

  def load_dataset(self, is_train=True):
    """Register the classes and every usable page of the requested split.

    is_train: True selects pages numbered below TRAIN_PAGE_LIMIT in each book,
      False selects the remaining pages.

    Pages that lack either a face or a text annotation are skipped.
    """
    import os

    for class_id, tag in enumerate(self.CLASS_TAGS, start=1):
      self.add_class("dataset", class_id, tag)

    for book in p.books:
      images_dir = os.path.join(root_dir, "images", book)
      annotations_dir = os.path.join(root_dir, "annotations", book)

      # Sorted so that dataset indices are the same on every run (listdir order is arbitrary).
      for img in sorted(listdir(images_dir)):
        image_id, _ = os.path.splitext(img)
        if not image_id.isdigit():
          continue  # stray file that is not a numbered page image
        page_number = int(image_id)

        # Decide the split first so pages of the other split are never parsed.
        if is_train != (page_number < self.TRAIN_PAGE_LIMIT):
          continue

        ann_path = os.path.join(annotations_dir, "new_" + book + str(page_number) + ".xml")
        page = ElementTree.parse(ann_path).getroot()
        if page.find(".//face") is None or page.find(".//text") is None:
          continue  # the page must contain at least one face and one text box

        img_path = os.path.join(images_dir, img)
        self.add_image('dataset', image_id=image_id, path=img_path, annotation=ann_path, class_ids = [0,1,2])


  def extract_boxes(self, filename):
    """Read every face and text box from one per-page annotation file.

    Returns (boxes, width, height). Each box is [xmin, ymin, xmax, ymax, label]
    with the coordinates left as the attribute strings found in the XML and
    label equal to "face" or "text"; all faces come before all texts.
    width and height are the page size as ints.
    """
    page = ElementTree.parse(filename).getroot()

    boxes = []
    for tag in self.CLASS_TAGS:
      for node in page.findall(".//" + tag):
        att = node.attrib
        boxes.append([att['xmin'], att['ymin'], att['xmax'], att['ymax'], tag])

    width = int(page.attrib['width'])
    height = int(page.attrib['height'])

    return boxes, width, height


  def load_mask(self, image_id):
    """Build one rectangular mask per annotated box of the given image.

    Returns (masks, class_ids): masks is a uint8 array [height, width, n_boxes]
    holding 1 inside each box and 0 elsewhere; class_ids is an int32 array with
    the class index of every mask.
    """
    info = self.image_info[image_id]
    boxes, w, h = self.extract_boxes(info["annotation"])

    masks = zeros([h, w, len(boxes)], dtype='uint8')
    class_ids = []

    for i, (xmin, ymin, xmax, ymax, label) in enumerate(boxes):
      # Masks are binary for every class; the class itself travels in class_ids.
      masks[int(ymin):int(ymax), int(xmin):int(xmax), i] = 1
      class_ids.append(self.class_names.index(label))

    return masks, asarray(class_ids, dtype='int32')


  def image_reference(self, image_id):
    """Return the image file path registered for `image_id`."""
    info = self.image_info[image_id]
    return info["path"]

In [None]:
# train set
# Must be built: the preview cells and model.train() below all read `train_set`,
# so leaving it commented out raises NameError on a fresh kernel.
train_set = MangaDataset()
train_set.load_dataset(is_train=True)
train_set.prepare()
print('Train: %d' % len(train_set.image_ids))

# test/val set
test_set = MangaDataset()
test_set.load_dataset(is_train=False)
test_set.prepare()
print('Test: %d' % len(test_set.image_ids))

In [None]:
# load an image and mask
# Sanity check on one training sample; `image`, `mask` and `class_ids` are reused by the next cell.
image_id = 1
image = train_set.load_image(image_id)
print(image.shape)

mask, class_ids = train_set.load_mask(image_id)
print(mask.shape)

In [None]:
# display image with masks and bounding boxes
# Boxes are derived from the masks loaded in the previous cell, so run that cell first.
bbox = extract_bboxes(mask)
display_instances(image, bbox, mask, class_ids, train_set.class_names)

# Train Model

In [None]:
class MangaConfig(Config):
  """Training configuration for the Manga109 face/text detector."""
  NAME = "manga_cfg"
  # Background plus the two classes registered by MangaDataset (face, text).
  NUM_CLASSES = 1 + 2
  # NOTE(review): 131 is not derived anywhere in this notebook - confirm it still matches the training set size.
  STEPS_PER_EPOCH = 131

In [None]:
# Build the network in training mode; Mask_RCNN is given /content as its model directory.
config = MangaConfig()
model = MaskRCNN(mode='training', model_dir='/content', config=config)

# Alternative starting point (COCO weights), kept for reference:
# model.load_weights('/content/drive/MyDrive/NRP/Project/mask_rcnn_coco.h5',
#                    by_name=True,
#                    exclude=["mrcnn_class_logits", "mrcnn_bbox_fc",  "mrcnn_bbox", "mrcnn_mask"])

# Layers left out when loading the starting weights.
# NOTE(review): the starting weights are this project's own model_2.h5 - confirm
# that skipping these layers is intended when resuming from it.
excluded_layers = ["mrcnn_class_logits", "mrcnn_bbox_fc",  "mrcnn_bbox", "mrcnn_mask"]
starting_weights = '/content/drive/MyDrive/NRP/Project/FaceSpeech/model_2.h5'
model.load_weights(starting_weights, by_name=True, exclude=excluded_layers)

# Train every layer at a small learning rate, validating on the test split.
model.train(train_set, test_set, learning_rate=0.00001, epochs=40, layers="all")

# model naming: [classes]_[learning_rate]_[epochs]_[layers]

# LEARNING_RATE = 0.001

# We could follow this training with further epochs that fine-tune all of the weights in the model.
# This could be achieved by using a smaller learning rate and changing the ‘layer’ argument from ‘heads’ to ‘all’.


# Evaluate Model

In [None]:
class PredictionConfig(Config):
  """Inference configuration; NAME and NUM_CLASSES mirror MangaConfig."""
  NAME = "manga_cfg"
  # Background plus face and text.
  NUM_CLASSES = 1 + 2
  # One image per call: the detection cells below pass a single image to model.detect().
  GPU_COUNT = 1
  IMAGES_PER_GPU = 1

In [None]:
# Rebinds the global `model` to an inference-mode network; `cfg` is read by the evaluation and plotting cells.
cfg = PredictionConfig()
model = MaskRCNN(mode='inference', model_dir='/content', config=cfg)

In [None]:
# Load the trained weights into the inference model, matching layers by name.
model.load_weights('/content/drive/MyDrive/NRP/Project/FaceSpeech/model_2.h5', by_name=True)

In [None]:
def evaluate_model(dataset, model, cfg):
  """Return the mean of the per-image average precision over `dataset`.

  dataset: prepared dataset whose `image_ids` are evaluated.
  model: Mask R-CNN model in inference mode.
  cfg: configuration passed to load_image_gt for loading the ground truth.

  Returns NaN when the dataset contains no images.
  """
  APs = []
  for image_id in dataset.image_ids:
    image, image_meta, gt_class_id, gt_bbox, gt_mask = load_image_gt(dataset, cfg, image_id, use_mini_mask=False)

    # Pass the image as loaded: detect() molds its inputs itself, so calling
    # mold_image() here as well would subtract the mean pixel twice.
    r = model.detect([image], verbose=0)[0]

    AP, _, _, _ = compute_ap(gt_bbox, gt_class_id, gt_mask, r["rois"], r["class_ids"], r["scores"], r['masks'])
    APs.append(AP)

  if not APs:
    # Same value numpy's mean gives for an empty list, without the RuntimeWarning.
    return float("nan")

  mAP = mean(APs)
  return mAP

In [None]:
# evaluate model on training dataset
# train_mAP = evaluate_model(train_set, model, cfg)
# print("Train mAP: %.3f" % train_mAP)

# evaluate model on test dataset
# Runs detection on every test image, so this cell is slow on a large split.
test_mAP = evaluate_model(test_set, model, cfg)
print("Test mAP: %.3f" % test_mAP)

# Associate Face to Text

In [None]:
def arrow_face_text(dataset, image_id, face_to_nearest_text):
  """Show one page with its annotation masks, detected boxes and face-to-text arrows.

  dataset: prepared dataset providing load_image / load_mask.
  image_id: index of the image inside `dataset`.
  face_to_nearest_text: list of [face_center, text_center] pairs, each center
    being [x, y] in image pixels (the output of assoc_face_text).

  Uses the notebook-global inference `model` for the detections.
  """
  image = dataset.load_image(image_id)
  mask, _ = dataset.load_mask(image_id)

  # detect() molds its inputs itself; molding here too would subtract the mean pixel twice.
  yhat = model.detect([image], verbose=0)[0]

  # One axes per call. The old subplot(1, 2, image_id*2+1) only had a valid
  # index for image_id == 0 and raised for every other image.
  _, ax = pyplot.subplots()
  ax.imshow(image)
  ax.set_title('Face to Text')

  for j in range(mask.shape[2]):
    ax.imshow(mask[:, :, j], cmap='gray', alpha=0.3)

  for box in yhat['rois']:
    y1, x1, y2, x2 = box
    width, height = x2 - x1, y2 - y1

    rect = Rectangle((x1, y1), width, height, fill=False, color='red')
    ax.add_patch(rect)

  # Arrow's width is in data units (pixels here); scale it with the image so it stays visible.
  arrow_width = 0.02 * max(image.shape[:2])

  for face_center, text_center in face_to_nearest_text:
    face_x, face_y = face_center[0], face_center[1]
    text_x, text_y = text_center[0], text_center[1]

    # Signed offsets: with abs() the arrow always pointed right/down instead of at the text.
    arrow = Arrow(face_x, face_y, text_x - face_x, text_y - face_y, width=arrow_width)
    ax.add_patch(arrow)

  pyplot.show()

In [None]:
def assoc_face_text(dataset, xml_file, image_id):
  """Pair every annotated face on a page with its nearest text box.

  dataset: object exposing extract_boxes(xml_file) -> (boxes, width, height),
    where each box is [xmin, ymin, xmax, ymax, label].
  xml_file: path of the per-page annotation file.
  image_id: unused; kept so existing callers keep working.

  Returns a list of [face_center, text_center] pairs, one per face, where each
  center is [x, y] obtained by integer-averaging the box corners. Distance is
  Euclidean between centers; on a tie the text listed first wins. Returns an
  empty list when the page has no text boxes (or no faces).
  """
  boxes, _, _ = dataset.extract_boxes(xml_file)

  centers = {"face": [], "text": []}
  for xmin, ymin, xmax, ymax, label in boxes:
    if label in centers:
      center_x = (int(xmin) + int(xmax)) // 2
      center_y = (int(ymin) + int(ymax)) // 2
      centers[label].append([center_x, center_y])

  text_centers = centers["text"]
  if not text_centers:
    # Nothing to associate with; previously this raised IndexError on text_centers[0].
    return []

  def squared_distance(face, text):
    # Squared distance orders candidates exactly like the Euclidean distance,
    # without a square root.
    return (face[0] - text[0]) ** 2 + (face[1] - text[1]) ** 2

  face_to_nearest_text = []
  for face in centers["face"]:
    nearest_text = min(text_centers, key=lambda text: squared_distance(face, text))
    face_to_nearest_text.append([face, nearest_text]) #coordinates

  return face_to_nearest_text

In [None]:
dataset = test_set

# Walk the dataset's own image ids rather than the page numbers in the file names.
# load_image/load_mask expect dataset indices, and load_dataset already kept only
# pages that have both faces and texts, so no re-filtering of the XML is needed.
# Only the first page is drawn, as a preview.
for image_id in dataset.image_ids[:1]:
  xml_file = dataset.image_info[image_id]["annotation"]

  face_to_nearest_text = assoc_face_text(dataset, xml_file, image_id)
  # Pass the whole list of pairs (the old call referenced an undefined name, `group`).
  arrow_face_text(dataset, image_id, face_to_nearest_text)

# Detect in New Photos

In [None]:
def plot_actual_vs_predicted(dataset, model, cfg, n_images=5):
  """Plot annotated masks next to predicted boxes for the first images of `dataset`.

  dataset: prepared dataset providing load_image / load_mask.
  model: Mask R-CNN model in inference mode.
  cfg: unused; kept so existing callers keep working.
  n_images: number of rows to draw; capped at the size of the dataset.

  Each row shows the image with its annotation masks on the left ('Actual') and
  the same image with the detected boxes in red on the right ('Predicted').
  """
  # Never index past the end of the dataset.
  n_images = min(n_images, len(dataset.image_ids))

  for i in range(n_images):
    image = dataset.load_image(i)
    mask, _ = dataset.load_mask(i)

    # detect() molds its inputs itself; molding here too would subtract the mean pixel twice.
    yhat = model.detect([image], verbose=0)[0]

    pyplot.subplot(n_images, 2, i*2+1)
    pyplot.imshow(image)
    pyplot.title('Actual')

    for j in range(mask.shape[2]):
      pyplot.imshow(mask[:, :, j], cmap='gray', alpha=0.3)

    pyplot.subplot(n_images, 2, i*2+2)
    pyplot.imshow(image)
    pyplot.title('Predicted')
    ax = pyplot.gca()

    for box in yhat['rois']:
      y1, x1, y2, x2 = box
      width, height = x2 - x1, y2 - y1

      rect = Rectangle((x1, y1), width, height, fill=False, color='red')
      ax.add_patch(rect)

  pyplot.show()

In [None]:
# Build a fresh inference model (this rebinds the global `model`) and load the same trained weights as above.
model = MaskRCNN(mode='inference', model_dir='./', config=cfg)
model_path = '/content/drive/MyDrive/NRP/Project/FaceSpeech/model_2.h5'
model.load_weights(model_path, by_name=True)

# plot_actual_vs_predicted(train_set, model, cfg)
plot_actual_vs_predicted(test_set, model, cfg)

# Credits

[Kangaroo detection tutorial (Machine Learning Mastery)](https://machinelearningmastery.com/how-to-train-an-object-detection-model-with-keras/)

[Matterport Mask RCNN](https://github.com/matterport/Mask_RCNN)

[Manga109](http://www.manga109.org/en/index.html)