# Comparison of Deep Learning based Object Detection and OpenCV based approach

### This notebook compares the best fine-tuned receipt detector with a basic OpenCV-based approach to receipt detection
* Test data: 50 real receipt images that were never used before
* Receipts Detector: NanoDet (fine-tuned on 1500 mixed data)
* OpenCV Detection Pipeline: the step by step details are defined in processing/basicprocessing.ipynb
* The goal of this comparison is to select the best choice for receipts detection and thus to crop the images for OCR
* The evaluation metrics are the average IoU and the number of detected receipts (a receipt counts as detected when its IoU with the ground truth is ≥ 0.5).

In [21]:
import cv2
import numpy as np
import torch
from tqdm import tqdm
import os
import xml.etree.ElementTree as ET
import pandas as pd
import torch
from PIL import Image
import sys
sys.path.insert(0,'/Users/local_admin/Desktop/thesis/object_detection/nanodet')
from nanodet.util import cfg, load_config, Logger
from demo.demo import Predictor
import os

In [8]:
def opencv_resize(image, ratio):
    """
    Scale ``image`` uniformly by ``ratio`` and return the resized image.

    image: array whose ``shape[0]`` is the height and ``shape[1]`` the width
    ratio: scaling factor applied to both dimensions

    return: the image resized with cv2.INTER_AREA interpolation; the target
    width and height are the scaled dimensions truncated to integers
    """
    src_height, src_width = image.shape[0], image.shape[1]
    # The size tuple is given as (width, height).
    target_size = (int(src_width * ratio), int(src_height * ratio))
    return cv2.resize(image, target_size, interpolation=cv2.INTER_AREA)

In [4]:
def detect_receipt_with_opencv(image):
    """
    Locate a receipt in ``image`` with a basic OpenCV image-processing pipeline.

    image: BGR test image (it is converted with cv2.COLOR_BGR2GRAY)

    Steps: resize to a height of 500 px, grayscale, Gaussian blur, dilation,
    Canny edge detection, then take the first of the five largest contours
    whose polygonal approximation has exactly four vertices.

    return: None if none of those contours approximates to four points,
    otherwise (x_min, y_min, x_max, y_max) of the axis-aligned box enclosing
    that quadrilateral, as ints scaled back to the original image size
    """
    scale = 500 / image.shape[0]
    small = opencv_resize(image, scale)

    # Pre-processing before edge detection.
    grayscale = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
    smoothed = cv2.GaussianBlur(grayscale, (5, 5), 0)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 9))
    thickened = cv2.dilate(smoothed, kernel)
    edges = cv2.Canny(thickened, 100, 200, apertureSize=3)

    # Keep only the five contours with the largest area, biggest first.
    found, _hierarchy = cv2.findContours(edges, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    candidates = sorted(found, key=cv2.contourArea, reverse=True)[:5]

    # The first candidate that simplifies to a quadrilateral is the receipt.
    quad = None
    for candidate in candidates:
        perimeter = cv2.arcLength(candidate, True)
        polygon = cv2.approxPolyDP(candidate, 0.032 * perimeter, True)
        if len(polygon) == 4:
            quad = polygon
            break

    if quad is None:
        return None

    # Order the corners as top-left, top-right, bottom-right, bottom-left
    # using the sum (x + y) and the difference (y - x) of each point.
    corners = quad.reshape(4, 2)
    coord_sum = corners.sum(axis=1)
    coord_diff = np.diff(corners, axis=1)
    ordered = np.zeros((4, 2), dtype="float32")
    ordered[0] = corners[np.argmin(coord_sum)]
    ordered[1] = corners[np.argmin(coord_diff)]
    ordered[2] = corners[np.argmax(coord_sum)]
    ordered[3] = corners[np.argmax(coord_diff)]
    # Undo the resize so the coordinates refer to the original image.
    ordered = ordered / scale

    top_left, top_right, bottom_right, bottom_left = ordered
    left = min(top_left[0], bottom_left[0])
    top = min(top_left[1], top_right[1])
    right = max(top_right[0], bottom_right[0])
    bottom = max(bottom_left[1], bottom_right[1])
    return int(left), int(top), int(right), int(bottom)

In [5]:
def calculate_iou(groundtruth, predict_box):
    """
    Compute the Intersection over Union (IoU) of two axis-aligned boxes.

    groundtruth: annotated box of the receipt as (x_min, y_min, x_max, y_max)
    predict_box: box predicted by a detection method (NanoDet or the basic
        OpenCV approach), in the same (x_min, y_min, x_max, y_max) layout

    Widths and heights are computed as ``max - min + 1``, i.e. both corner
    coordinates are treated as inclusive.

    return: the IoU; 0.0 when the boxes do not overlap, and also 0.0 when the
    union area is not positive (degenerate boxes) instead of raising
    ZeroDivisionError or returning a meaningless value
    """
    # Corners of the intersection rectangle.
    inter_left = max(groundtruth[0], predict_box[0])
    inter_top = max(groundtruth[1], predict_box[1])
    inter_right = min(groundtruth[2], predict_box[2])
    inter_bottom = min(groundtruth[3], predict_box[3])

    # Clamp at 0 so disjoint boxes contribute no intersection area.
    interArea = max(0, inter_right - inter_left + 1) * max(0, inter_bottom - inter_top + 1)
    groundtruthArea = (groundtruth[2] - groundtruth[0] + 1) * (groundtruth[3] - groundtruth[1] + 1)
    predict_boxArea = (predict_box[2] - predict_box[0] + 1) * (predict_box[3] - predict_box[1] + 1)

    unionArea = groundtruthArea + predict_boxArea - interArea
    if unionArea <= 0:
        # Only reachable with degenerate or inverted boxes; there is no valid overlap.
        return 0.0

    return interArea / float(unionArea)


In [6]:
#Get ground truth bounding box from XML annotation
def get_groundtruth_box(image_path, annotation_dir='/Users/local_admin/Desktop/thesis/data/50test_detector/annotation'):
    """
    Read the ground-truth bounding box of an image from its XML annotation.

    image_path: path (or file name) of the image; only its base name without
        extension is used, so ``foo.png``, ``foo.jpg`` and ``foo.xml`` all map
        to ``<annotation_dir>/foo.xml``
    annotation_dir: folder containing the XML annotation files

    return: (xmin, ymin, xmax, ymax) as ints, taken from the first ``object``
    element that has a ``bndbox`` child, or None if there is no such element

    raises: whatever ``ET.parse`` raises when the annotation file is missing
    or is not valid XML
    """
    stem, _extension = os.path.splitext(os.path.basename(image_path))
    annotation_path = os.path.join(annotation_dir, stem + '.xml')

    root = ET.parse(annotation_path).getroot()
    for obj in root.iter('object'):
        xmlbox = obj.find('bndbox')
        if xmlbox is None:
            # Object without a box: try the next one instead of crashing.
            continue
        # int(float(...)) also accepts coordinates written as "12.0".
        xmin, ymin, xmax, ymax = (
            int(float(xmlbox.find(tag).text))
            for tag in ('xmin', 'ymin', 'xmax', 'ymax')
        )
        return xmin, ymin, xmax, ymax

    return None

In [27]:
def _best_scoring_box(pred_boxes):
    """
    Pick the highest-confidence bounding box from a prediction result.

    pred_boxes: nested mapping as iterated here: image id -> {label -> list of
        predictions}; each prediction holds the four box coordinates followed
        by the confidence score at index 4

    return: the four coordinates of the prediction with the highest confidence
    score, or None if there are no predictions at all
    """
    best_box = None
    best_score = float('-inf')
    for per_image in pred_boxes.values():
        for predictions in per_image.values():
            for prediction in predictions:
                score = prediction[4]
                if score > best_score:
                    best_score = score
                    best_box = prediction[:4]
    return best_box


def evaluation(image_folder, config_path, model_path, iou_threshold=0.5):
    """
    Evaluate the OpenCV pipeline and the fine-tuned NanoDet model on a test set.

    image_folder: folder containing the test images (only ``.png`` files are used)
    config_path: NanoDet configuration file passed to ``load_config``
    model_path: NanoDet checkpoint passed to ``Predictor``
    iou_threshold: a receipt counts as detected when IoU >= this value

    For every image the IoU between the ground truth and the box returned by
    each approach is computed. Images that cannot be read or that have no
    ground-truth box are skipped.

    NOTE(review): the average IoU of an approach is taken only over the images
    for which it returned a box; images where it returned nothing are not
    counted as IoU 0, which favours an approach that often finds no box.

    return: a DataFrame with one row per approach and the columns
    "Approach", "Average IoU" and "Number of Predicted Receipts"
    """
    # Sorted so the processing order does not depend on os.listdir.
    image_paths = sorted(
        os.path.join(image_folder, name)
        for name in os.listdir(image_folder)
        if name.endswith('.png')
    )

    # Load the NanoDet configuration and initialize the predictor
    load_config(cfg, config_path)
    logger = Logger(-1, use_tensorboard=False)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    predictor = Predictor(cfg, model_path, logger, device=device)

    # IoU scores and detection counts per approach
    opencv_ious = []
    model_ious = []
    opencv_detected = 0
    model_detected = 0

    for image_path in tqdm(image_paths):
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None.
            print(f"Skipping unreadable image: {image_path}")
            continue

        # The annotation file is looked up from the image's base name.
        gt_box = get_groundtruth_box(image_path)
        if gt_box is None:
            continue

        # Evaluate OpenCV approach
        opencv_box = detect_receipt_with_opencv(image)
        if opencv_box is not None:
            opencv_iou = calculate_iou(gt_box, opencv_box)
            opencv_ious.append(opencv_iou)
            if opencv_iou >= iou_threshold:
                opencv_detected += 1

        # Evaluate NanoDet model approach: when the model predicts multiple
        # bounding boxes, only the one with the highest confidence is used.
        meta, pred_boxes = predictor.inference(image_path)
        best_pred_box = _best_scoring_box(pred_boxes)

        if best_pred_box is not None:
            model_iou = calculate_iou(gt_box, best_pred_box)
            model_ious.append(model_iou)
            if model_iou >= iou_threshold:
                model_detected += 1

    # Calculate average IoU scores (0 when an approach produced no box at all)
    average_opencv_iou = np.mean(opencv_ious) if opencv_ious else 0
    average_model_iou = np.mean(model_ious) if model_ious else 0

    # Create DataFrame with the results
    results = {
        "Approach": ["OpenCV", "NanoDet"],
        "Average IoU": [average_opencv_iou, average_model_iou],
        "Number of Predicted Receipts": [opencv_detected, model_detected]
    }
    df = pd.DataFrame(results)

    return df

In [29]:
# Root of the thesis workspace. Set the THESIS_ROOT environment variable to run
# this notebook on another machine; the default keeps the original locations.
THESIS_ROOT = os.environ.get("THESIS_ROOT", "/Users/local_admin/Desktop/thesis")

# Folder with the test images.
image_folder = os.path.join(THESIS_ROOT, "data/50test_detector/img")
# NanoDet configuration file.
config_path_nanodet = os.path.join(THESIS_ROOT, "object_detection/nanodet/config/nanodet_custom_xml_dataset.yml")
# Checkpoint of the fine-tuned NanoDet model.
nanodet_path = os.path.join(
    THESIS_ROOT,
    "object_detection/trained_detectors/nanodet/trained_nano_det_1500_combined_to_real/model_best/nanodet_model_best.pth",
)

df_results = evaluation(image_folder, config_path_nanodet, nanodet_path)

model size is  1.0x
init weights...
=> loading pretrained model https://download.pytorch.org/models/shufflenetv2_x1-5666bf0f80.pth
Finish initialize NanoDet-Plus Head.


  0%|          | 0/50 [00:00<?, ?it/s]

forward time: 0.087s | decode time: 0.002s | 

  4%|▍         | 2/50 [00:00<00:12,  3.78it/s]

forward time: 0.029s | decode time: 0.002s | 

  6%|▌         | 3/50 [00:00<00:14,  3.15it/s]

forward time: 0.030s | decode time: 0.003s | 

  8%|▊         | 4/50 [00:01<00:16,  2.76it/s]

forward time: 0.032s | decode time: 0.002s | 

 10%|█         | 5/50 [00:01<00:16,  2.77it/s]

forward time: 0.034s | decode time: 0.002s | 

 12%|█▏        | 6/50 [00:02<00:16,  2.72it/s]

forward time: 0.030s | decode time: 0.002s | 

 14%|█▍        | 7/50 [00:02<00:16,  2.61it/s]

forward time: 0.033s | decode time: 0.002s | 

 16%|█▌        | 8/50 [00:02<00:15,  2.63it/s]

forward time: 0.028s | decode time: 0.003s | 

 18%|█▊        | 9/50 [00:03<00:15,  2.62it/s]

forward time: 0.031s | decode time: 0.002s | 

 20%|██        | 10/50 [00:03<00:15,  2.62it/s]

forward time: 0.026s | decode time: 0.002s | 

 22%|██▏       | 11/50 [00:05<00:26,  1.47it/s]

forward time: 0.029s | decode time: 0.002s | 

 24%|██▍       | 12/50 [00:05<00:22,  1.72it/s]

forward time: 0.030s | decode time: 0.002s | 

 26%|██▌       | 13/50 [00:06<00:30,  1.21it/s]

forward time: 0.027s | decode time: 0.002s | forward time: 0.032s | decode time: 0.002s | 

 30%|███       | 15/50 [00:07<00:19,  1.77it/s]

forward time: 0.027s | decode time: 0.002s | 

 32%|███▏      | 16/50 [00:08<00:25,  1.32it/s]

forward time: 0.028s | decode time: 0.002s | 

 34%|███▍      | 17/50 [00:09<00:21,  1.50it/s]

forward time: 0.028s | decode time: 0.002s | 

 36%|███▌      | 18/50 [00:10<00:26,  1.20it/s]

forward time: 0.029s | decode time: 0.002s | 

 38%|███▊      | 19/50 [00:10<00:21,  1.44it/s]

forward time: 0.030s | decode time: 0.002s | 

 40%|████      | 20/50 [00:11<00:18,  1.63it/s]

forward time: 0.027s | decode time: 0.002s | 

 42%|████▏     | 21/50 [00:11<00:15,  1.86it/s]

forward time: 0.032s | decode time: 0.002s | 

 44%|████▍     | 22/50 [00:11<00:14,  1.93it/s]

forward time: 0.030s | decode time: 0.002s | 

 46%|████▌     | 23/50 [00:12<00:13,  1.98it/s]

forward time: 0.032s | decode time: 0.002s | 

 48%|████▊     | 24/50 [00:12<00:13,  1.96it/s]

forward time: 0.031s | decode time: 0.002s | 

 50%|█████     | 25/50 [00:13<00:12,  2.00it/s]

forward time: 0.029s | decode time: 0.002s | 

 52%|█████▏    | 26/50 [00:13<00:11,  2.05it/s]

forward time: 0.030s | decode time: 0.002s | 

 54%|█████▍    | 27/50 [00:14<00:10,  2.12it/s]

forward time: 0.030s | decode time: 0.002s | 

 56%|█████▌    | 28/50 [00:15<00:16,  1.35it/s]

forward time: 0.028s | decode time: 0.002s | 

 58%|█████▊    | 29/50 [00:15<00:13,  1.58it/s]

forward time: 0.029s | decode time: 0.002s | 

 60%|██████    | 30/50 [00:16<00:11,  1.76it/s]

forward time: 0.028s | decode time: 0.002s | 

 62%|██████▏   | 31/50 [00:16<00:10,  1.88it/s]

forward time: 0.029s | decode time: 0.003s | 

 64%|██████▍   | 32/50 [00:17<00:09,  1.93it/s]

forward time: 0.030s | decode time: 0.002s | 

 66%|██████▌   | 33/50 [00:17<00:08,  2.00it/s]

forward time: 0.031s | decode time: 0.002s | 

 68%|██████▊   | 34/50 [00:18<00:07,  2.02it/s]

forward time: 0.033s | decode time: 0.003s | 

 70%|███████   | 35/50 [00:18<00:07,  2.08it/s]

forward time: 0.031s | decode time: 0.002s | forward time: 0.029s | decode time: 0.001s | 

 74%|███████▍  | 37/50 [00:19<00:04,  2.63it/s]

forward time: 0.029s | decode time: 0.002s | 

 76%|███████▌  | 38/50 [00:19<00:04,  2.55it/s]

forward time: 0.028s | decode time: 0.003s | 

 78%|███████▊  | 39/50 [00:20<00:04,  2.35it/s]

forward time: 0.030s | decode time: 0.002s | 

 80%|████████  | 40/50 [00:20<00:04,  2.29it/s]

forward time: 0.030s | decode time: 0.002s | forward time: 0.033s | decode time: 0.002s | 

 84%|████████▍ | 42/50 [00:21<00:02,  2.84it/s]

forward time: 0.033s | decode time: 0.002s | 

 86%|████████▌ | 43/50 [00:21<00:02,  2.56it/s]

forward time: 0.032s | decode time: 0.002s | 

 88%|████████▊ | 44/50 [00:22<00:02,  2.51it/s]

forward time: 0.037s | decode time: 0.002s | 

 90%|█████████ | 45/50 [00:22<00:01,  2.55it/s]

forward time: 0.031s | decode time: 0.002s | 

 94%|█████████▍| 47/50 [00:24<00:01,  1.77it/s]

forward time: 0.031s | decode time: 0.002s | forward time: 0.033s | decode time: 0.002s | 

 96%|█████████▌| 48/50 [00:24<00:01,  1.85it/s]

forward time: 0.028s | decode time: 0.002s | forward time: 0.032s | decode time: 0.002s | 

100%|██████████| 50/50 [00:25<00:00,  1.98it/s]

forward time: 0.030s | decode time: 0.002s | 




In [31]:
# Print the summary table as plain text, without the DataFrame index column.
print(df_results.to_string(index=False))

Approach  Average IoU  Number of Predicted Receipts
  OpenCV     0.483314                            21
 NanoDet     0.956527                            50
