# Setup

## Setup YOLOv8

In [None]:
# Dependencies

!pip install ultralytics

In [None]:
# Imports

from ultralytics import YOLO
from matplotlib.patches import Rectangle

In [None]:
# Models
# Load the YOLO detector from a local checkpoint file.
# NOTE(review): absolute path — the weights must already exist at this location.

yolo_model = YOLO('/CarDD/yolov8_weights.pt')

## Setup SAM

In [None]:
# Dependencies

!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q datasets
!pip install -q monai

In [None]:
# Imports

import monai
from statistics import mean
from transformers import SamProcessor, SamModel

import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
from torch.utils.data import Dataset as Dataset_torch
from torch.nn.functional import threshold, normalize

In [None]:
# Models
# The processor is taken from the "facebook/sam-vit-huge" checkpoint and the
# model weights from the "yjmsvma/car_sam" checkpoint.

sam_processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")
sam_model = SamModel.from_pretrained("yjmsvma/car_sam")

# Use the GPU when one is visible; fall back to the CPU instead of failing.
sam_model.to('cuda' if torch.cuda.is_available() else 'cpu')

## Setup GroundingDINO

In [None]:
# Dependencies

!nvidia-smi

import os
HOME = os.getcwd()
print(HOME)

%cd {HOME}
!git clone https://github.com/IDEA-Research/GroundingDINO.git
%cd {HOME}/GroundingDINO
!pip install -q -e .
!pip install -q roboflow

CONFIG_PATH = os.path.join(HOME, "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py")
print(CONFIG_PATH, "; exist:", os.path.isfile(CONFIG_PATH))

In [None]:
# Dependencies

%cd {HOME}
!mkdir {HOME}/weights
%cd {HOME}/weights

!wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth

WEIGHTS_NAME = "groundingdino_swint_ogc.pth"
WEIGHTS_PATH = os.path.join(HOME, "weights", WEIGHTS_NAME)
print(WEIGHTS_PATH, "; exist:", os.path.isfile(WEIGHTS_PATH))

In [None]:
# Imports

# Change into the cloned repository before importing from the groundingdino package.
# The working directory stays there for the cells that follow.
%cd {HOME}/GroundingDINO

from groundingdino.util.inference import load_model, load_image, predict, annotate

In [None]:
# Models

# Build the GroundingDINO model from the config file (CONFIG_PATH) and the
# downloaded checkpoint (WEIGHTS_PATH).
groundingdino_model = load_model(CONFIG_PATH, WEIGHTS_PATH)

## Setup OWL-ViT

In [None]:
# Dependencies

!pip install Pillow
!pip install opencv-python

In [None]:
 # Imports

import cv2
import skimage
import numpy as np
from matplotlib import rcParams
from transformers.image_utils import ImageFeatureExtractionMixin
from transformers import OwlViTProcessor, OwlViTForObjectDetection

In [None]:
# Models

# Prefer the GPU and fall back to the CPU when CUDA is unavailable
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

owlvit_model = OwlViTForObjectDetection.from_pretrained("google/owlvit-large-patch14")
owlvit_processor = OwlViTProcessor.from_pretrained("google/owlvit-large-patch14")

# Move the weights to the selected device, then switch to evaluation mode
owlvit_model = owlvit_model.to(device)
owlvit_model.eval()

## General

In [None]:
# Imports

import os
import json
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from PIL import Image, ImageOps, ImageDraw
import matplotlib.pyplot as plt
from collections import Counter
from datasets import Dataset, load_dataset, load_from_disk

import torch
from PIL import Image
import supervision as sv
import torchvision.transforms as transforms

from pycocotools.coco import COCO
from datasets import Dataset, load_dataset, load_from_disk

from cv2 import imread, imwrite

# Text prompt used for each damage label when querying the text-conditioned detectors
label_prompt_conversion = dict([
    ('dent', 'dented damage'),
    ('scratch', 'scratched damage'),
    ('crack', 'cracked damage'),
    ('glass shatter', 'shattered glass'),
    ('lamp broken', 'broken lamp'),
    ('tire flat', 'flat tire'),
])

## Functions

In [None]:
def pad_to_square(binary_mask):

    """
    Zero-pad a 2D mask so that both sides equal its longer side.

    The existing content is centred; when the amount of padding is odd the
    extra row/column goes to the bottom/right.

    Parameters:
    binary_mask (numpy.ndarray): A 2D mask array.

    Returns:
    numpy.ndarray: The square mask. An input that is already square is
    returned unchanged (the same object, not a copy).
    """

    rows, cols = binary_mask.shape
    side = max(rows, cols)

    # Nothing to do for an already square mask
    if rows == cols:
        return binary_mask

    # Split the missing rows/columns between the two sides
    top = (side - rows) // 2
    left = (side - cols) // 2
    bottom = side - rows - top
    right = side - cols - left

    return np.pad(binary_mask,
                  pad_width=((top, bottom), (left, right)),
                  mode='constant', constant_values=0)

def resize_mask(mask, new_size):

    """
    Resize a mask to new_size x new_size pixels with OpenCV.

    Parameters:
    mask (numpy.ndarray): The mask array to resize.
    new_size (int): Side length of the output, in pixels.

    Returns:
    numpy.ndarray: The resized mask, produced with cv2.INTER_AREA interpolation.
    """

    target_shape = (new_size, new_size)

    return cv2.resize(mask, target_shape, interpolation=cv2.INTER_AREA)

def calculate_iou_mask(mask1, mask2):

    """
    Calculate the Intersection over Union (IoU) of two masks.

    Any non-zero element counts as part of a mask.

    Parameters:
    mask1 (numpy.ndarray): First mask.
    mask2 (numpy.ndarray): Second mask, broadcastable against mask1.

    Returns:
    float: Intersection area divided by union area, or 0.0 when both masks
    are empty (the union is empty, so the ratio would otherwise be NaN).
    """

    intersection = np.logical_and(mask1, mask2)
    union = np.logical_or(mask1, mask2)

    union_area = np.sum(union)

    # Two empty masks: avoid the 0 / 0 division
    if union_area == 0:
        return 0.0

    return np.sum(intersection) / union_area

def map_values(array, values_list):

    """
    Build a 0/1 array marking which elements of array occur in values_list.

    Parameters:
    - array: A NumPy array.
    - values_list: The values that should be mapped to 1.

    Returns:
    - An integer NumPy array of the same shape: 1 where the element is in
      values_list, 0 everywhere else.
    """

    membership = np.isin(array, values_list)
    binary = membership.astype(int)

    return binary

def pad_image_to_square(image, desired_size = 1000):

    """
    Pad a PIL image with a zero-valued border up to desired_size x desired_size.

    Parameters:
    image (PIL.Image.Image): The image to pad.
    desired_size (int): Side length of the padded output, in pixels.

    Returns:
    PIL.Image.Image: The padded image with the original content centred; when
    the padding is odd the extra pixel goes to the right/bottom.
    """

    # Black for RGB; 0 for every other mode. Previously only 'RGB' and 'L' were
    # handled and any other mode failed because fill_color was never assigned.
    fill_color = (0, 0, 0) if image.mode == 'RGB' else 0

    # Split the missing width/height between the two sides.
    # NOTE(review): an image larger than desired_size yields a negative border —
    # confirm inputs never exceed desired_size.
    old_width, old_height = image.size
    delta_w = desired_size - old_width
    delta_h = desired_size - old_height
    padding = (delta_w // 2, delta_h // 2, delta_w - (delta_w // 2), delta_h - (delta_h // 2))

    # Pad and return
    return ImageOps.expand(image, padding, fill = fill_color)

def resize_image(image, new_size):

    """
    Resize a PIL image with Lanczos resampling.

    Parameters:
    image (PIL.Image.Image): The image to resize.
    new_size (tuple): Target size as (width, height) in pixels.

    Returns:
    PIL.Image.Image: The resized image.
    """

    # Image.ANTIALIAS was an alias of Image.LANCZOS and was removed in
    # Pillow 10; LANCZOS selects the same filter on old and new releases.
    return image.resize(new_size, Image.LANCZOS)

def visualize_yolo_predictions(image, predictions, confidence_threshold):

    """
    Draw YOLO detections on top of an image with matplotlib.

    Parameters:
    image (PIL.Image or numpy.ndarray): The image on which to draw the predictions.
    predictions (iterable): Model results; each item exposes boxes.data, whose
        rows are read as [x1, y1, x2, y2, confidence, class].
    confidence_threshold (float): Detections below this confidence are not drawn.
    """

    class_names = dict(enumerate(['dent',
                                  'scratch',
                                  'crack',
                                  'glass shatter',
                                  'lamp broken',
                                  'tire flat']))

    # matplotlib needs an array; convert anything that is not one already
    canvas = image if isinstance(image, np.ndarray) else np.array(image)

    plt.figure(figsize = (10, 10))
    plt.imshow(canvas)

    ax = plt.gca()

    for pred in predictions:

        for x1, y1, x2, y2, conf, class_id in pred.boxes.data.tolist():

            # The name is resolved for every box, including ones filtered out below
            class_name = class_names[class_id]

            if not conf >= confidence_threshold:
                continue

            outline = Rectangle((x1, y1), x2 - x1, y2 - y1,
                                linewidth = 2, edgecolor = 'r', facecolor = 'none')
            ax.add_patch(outline)

            plt.text(x1, y1,
                     f'Class: {class_name}, Conf: {conf:.2f}', color='white',
                     bbox = dict(facecolor = 'red', alpha = 0.5))

    plt.show()

def get_yolo_pred_box(predictions, confidence_threshold):

    """
    Group YOLO boxes by class name, keeping those that reach the threshold.

    Parameters:
    predictions (iterable): Model results; each item exposes boxes.data, whose
        rows are read as [x1, y1, x2, y2, confidence, class].
    confidence_threshold (float): Minimum confidence for a box to be kept.

    Returns:
    dict: class name -> list of [x1, y1, x2, y2]. A class gets an entry as soon
    as any box of that class is seen, so its list may be empty when every box
    fell below the threshold.
    """

    class_names = dict(enumerate(['dent',
                                  'scratch',
                                  'crack',
                                  'glass shatter',
                                  'lamp broken',
                                  'tire flat']))

    bbox_dict = {}
    for pred in predictions:

        for x1, y1, x2, y2, conf, class_id in pred.boxes.data.tolist():

            # Register the class even if this box ends up being filtered out
            class_boxes = bbox_dict.setdefault(class_names[class_id], [])

            if conf >= confidence_threshold:
                class_boxes.append([x1, y1, x2, y2])

    return bbox_dict

def transform_bounding_box(bbox, orig_width, orig_height, target_size = 640):

    """
    Map a bounding box from an image onto its padded-and-resized square version.

    The image is assumed to be centred on a square canvas whose side is the
    longer image side, and that canvas scaled to target_size x target_size.

    Parameters:
    bbox (tuple): A tuple (x_min, y_min, x_max, y_max) representing the original bounding box.
    orig_width (int): Original width of the image.
    orig_height (int): Original height of the image.
    target_size (int): The size of the transformed (square) image. Default is 640.

    Returns:
    list: [x_min, y_min, x_max, y_max] in the transformed image, truncated to ints.
    """

    longest_side = max(orig_width, orig_height)

    # Offset added on each axis by centring the image on the square canvas
    pad_x = (longest_side - orig_width) / 2
    pad_y = (longest_side - orig_height) / 2

    # Scale factor from the square canvas to target_size x target_size
    scale = target_size / longest_side

    x_min, y_min, x_max, y_max = bbox

    return [int((x_min + pad_x) * scale),
            int((y_min + pad_y) * scale),
            int((x_max + pad_x) * scale),
            int((y_max + pad_y) * scale)]

def show_mask(mask, ax, random_color = False):

    """
    Overlay a mask on a matplotlib axis as a semi-transparent colour layer.

    Parameters:
    mask (numpy.ndarray): Mask whose last two dimensions are (height, width).
    ax: Object with an imshow method (e.g. a matplotlib Axes) to draw on.
    random_color (bool): Use a random RGB colour instead of the fixed blue.
    """

    # RGBA colour with 0.6 opacity
    if random_color:
        color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
    else:
        color = np.array([30/255, 144/255, 255/255, 0.6])

    # Broadcast (h, w, 1) * (1, 1, 4) into an (h, w, 4) RGBA image
    h, w = mask.shape[-2:]
    overlay = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)

    ax.imshow(overlay)

def convert_to_binary_mask_and_back(mask_pil, threshold = 127):

    """
    Threshold a mask image into a binary array.

    Parameters:
    mask_pil: Mask to convert; anything np.array accepts (e.g. a PIL image).
    threshold (int): Values strictly greater than this become 1.

    Returns:
    numpy.ndarray: uint8 array with 1 where the value exceeds threshold, else 0.
    """

    pixel_values = np.array(mask_pil)

    return np.greater(pixel_values, threshold).astype(np.uint8)

def convert_to_pixels(box, image_width, image_height):

    """
    Turn a normalized centre-format box into pixel corner coordinates.

    Parameters:
    box (tuple): (x_center, y_center, width, height), each expressed as a
                 fraction of the image size.
    image_width (int): Width of the image.
    image_height (int): Height of the image.

    Returns:
    tuple: (x_min, y_min, x_max, y_max) in pixels, truncated to ints.
    """

    x_center, y_center, width, height = box

    half_w = width / 2
    half_h = height / 2

    left = int((x_center - half_w) * image_width)
    top = int((y_center - half_h) * image_height)
    right = int((x_center + half_w) * image_width)
    bottom = int((y_center + half_h) * image_height)

    return left, top, right, bottom

def bounding_box_to_mask(bbox, image_shape):
    """
    Rasterize a bounding box into a binary mask.

    Parameters:
    - bbox: A tuple (x_min, y_min, x_max, y_max); values are truncated to ints
      and the max edges are exclusive.
    - image_shape: The shape of the mask to create (height, width).

    Returns:
    - mask: A uint8 array that is 1 inside the box and 0 elsewhere.
    """
    x_min, y_min, x_max, y_max = bbox

    row_span = slice(int(y_min), int(y_max))
    col_span = slice(int(x_min), int(x_max))

    mask = np.zeros(image_shape, dtype=np.uint8)
    mask[row_span, col_span] = 1

    return mask

def elementwise_addition(arrays):

    """
    Add up a list of equally shaped NumPy arrays element by element.

    Parameters:
    - arrays: A non-empty list of NumPy arrays that all share one shape.

    Returns:
    - A NumPy array holding the element-wise sum of the inputs.

    Raises:
    - ValueError: If the list is empty or the shapes differ.
    """
    if not arrays:
        raise ValueError("The list of arrays is empty.")

    # Every array must match the shape of the first one
    reference_shape = arrays[0].shape
    for arr in arrays:
        if arr.shape != reference_shape:
            raise ValueError("All arrays must have the same shape.")

    return np.sum(arrays, axis=0)

def map_values_to_one(array, benchmark):

    """
    Binarize an array in place against a benchmark value.

    Values greater than or equal to benchmark become 1 and values below it
    become 0. Previously only the values below benchmark were zeroed and the
    remaining values were left untouched, so nothing was ever mapped to 1.

    Parameters:
    - array: A NumPy array; it is modified in place.
    - benchmark: The smallest value that is mapped to 1.

    Returns:
    - The same array object, now holding only 0s and 1s (NaN entries, which
      compare false on both tests, are left as they are).
    """

    # Evaluate both conditions before writing so the first assignment cannot
    # influence the second one.
    below = array < benchmark
    at_or_above = array >= benchmark

    array[below] = 0
    array[at_or_above] = 1

    return array

def mask_to_bounding_box(mask):
    """
    Compute the tight bounding box around the non-zero region of a mask.

    Parameters:
    - mask: A 2D mask (numpy array) where the object is represented by non-zero values.

    Returns:
    - bbox: A tuple (x_min, y_min, x_max, y_max) of row/column indices; the max
      values are the last non-zero column/row, i.e. inclusive.

    Raises:
    - IndexError: If the mask contains no non-zero value.
    """

    # Indices of the rows and columns that contain at least one non-zero value
    occupied_rows = np.nonzero(np.any(mask, axis=1))[0]
    occupied_cols = np.nonzero(np.any(mask, axis=0))[0]

    y_min, y_max = occupied_rows[0], occupied_rows[-1]
    x_min, x_max = occupied_cols[0], occupied_cols[-1]

    return x_min, y_min, x_max, y_max

def _box_iou(box_a, box_b):

    """IoU of two (x_min, y_min, x_max, y_max) boxes; 0.0 when their union is empty."""

    ax1, ay1, ax2, ay2 = box_a
    bx1, by1, bx2, by2 = box_b

    # Overlap along each axis, clamped at zero for disjoint boxes
    inter_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    inter_h = max(0, min(ay2, by2) - max(ay1, by1))
    intersection = inter_w * inter_h

    area_a = max(0, ax2 - ax1) * max(0, ay2 - ay1)
    area_b = max(0, bx2 - bx1) * max(0, by2 - by1)
    union = area_a + area_b - intersection

    return intersection / union if union > 0 else 0.0


def non_max_suppression(boxes, probabilities, threshold):

    """
    Apply non-maximum suppression to avoid detecting the same object multiple times.

    Boxes are visited from the highest to the lowest score; a box is dropped
    when its IoU with an already kept box is not below threshold.

    The previous implementation compared the current box with itself, so every
    other box was always suppressed. The IoU is now computed from the box
    coordinates directly, for each remaining candidate.

    Parameters:
    boxes (list): Boxes as (x_min, y_min, x_max, y_max).
    probabilities (list): One score per box, in the same order.
    threshold (float): Candidates with IoU >= threshold against a kept box are removed.

    Returns:
    list: The kept boxes, ordered by decreasing score.
    """

    # Indices of the boxes, best score first
    sorted_indices = np.argsort(probabilities)[::-1]
    keep = []

    while len(sorted_indices) > 0:

        # Take the box with the highest score
        current = sorted_indices[0]
        keep.append(current)

        remaining = sorted_indices[1:]
        if len(remaining) == 0:
            break

        # Compute IoU of the current box with each remaining candidate
        ious = np.array([_box_iou(boxes[current], boxes[candidate]) for candidate in remaining])

        # Keep boxes with IoU less than the threshold
        sorted_indices = remaining[ious < threshold]

    return [boxes[i] for i in keep]

def coco_poly_to_mask(polygon, image_shape):

    """
    Rasterize COCO polygons into a single binary mask.

    Parameters:
    - polygon: List of polygons, each a flat list [x1, y1, x2, y2, ..., xn, yn].
    - image_shape: Tuple of (height, width) for the mask.

    Returns:
    - A NumPy array of shape (height, width) that is 1 on and inside every
      polygon and 0 elsewhere.

    Raises:
    - ValueError: If a polygon has an odd number of coordinates.
    """

    height, width = image_shape[0], image_shape[1]

    # PIL sizes are (width, height)
    mask_img = Image.new('L', (width, height), 0)
    drawer = ImageDraw.Draw(mask_img)

    for poly in polygon:

        # Coordinates come in (x, y) pairs
        if len(poly) % 2:
            raise ValueError("Polygon length must be even.")

        drawer.polygon(poly, outline=1, fill=1)

    return np.array(mask_img)


In [None]:
# List the unseen test images and load one of them for a quick visual check
image_base_path = '/CarDD/CarDD_SOD/CarDD-TE/CarDD-TE-Image/'
images = os.listdir(image_base_path)

# Look the sample up in the listing (raises ValueError if the file is missing)
sample_index = images.index('003649.jpg')
image_name = images[sample_index]

image_source, image = load_image(f"{image_base_path}{image_name}")
image_source_pil = Image.fromarray(image_source)
image_source_pil

# Inference Pipeline

In [None]:
# Test split of the COCO-format dataset
image_base_path = '/CarDD/CarDD_COCO/test/'
images = os.listdir(image_base_path)

# COCO category id -> damage label
categories = dict(zip(range(1, 7),
                      ['dent', 'scratch', 'crack', 'glass shatter', 'lamp broken', 'tire flat']))

annFile = '/CarDD/CarDD_COCO/annotations/instances_test.json'

# Raw annotation records, used to look up the polygons of each image
with open(annFile, 'r') as f:
    test_annotations = json.load(f)['annotations']

# pycocotools index over the same annotation file
coco = COCO(annFile)

## Inference on single image


In [None]:
# Select image and run models
#
# Pipeline for one test image:
#   1. rasterize the ground-truth polygons into one 256x256 mask per label
#   2. detect damage boxes with YOLOv8, GroundingDINO and OWL-ViT
#   3. turn every box into a SAM mask
#   4. combine the three sources per damage type by voting and score with IoU
image_name = images[1]
ok = True
if ok:

    image_path = image_base_path + image_name
    print(image_path)

    image_source, image = load_image(image_path)
    image_source_pil = Image.fromarray(image_source)
    image_height, image_width = image_source.shape[:2]

    # NOTE(review): assumes the file stem equals the COCO image id — confirm against the annotation file.
    image_id = int(image_name.replace('.jpg', ''))
    image_annotations = [annotation for annotation in test_annotations if annotation['image_id'] == image_id]

    ##### Ground-truth masks #####
    groundedtruth_segmentations = {}
    for annotation in image_annotations:

        ground_truth_label = categories[annotation['category_id']]
        ground_truth_mask = coco_poly_to_mask(annotation['segmentation'], image_source.shape[:2])

        # Same square-pad + resize as the image so the masks line up with the SAM output
        padded_ground_truth_mask = pad_to_square(ground_truth_mask)
        resized_ground_truth_mask = resize_mask(padded_ground_truth_mask, 256)

        if ground_truth_label not in groundedtruth_segmentations:
            groundedtruth_segmentations[ground_truth_label] = np.array(resized_ground_truth_mask)

        else:
            groundedtruth_segmentations[ground_truth_label] += np.array(resized_ground_truth_mask)
    print('existing', list(groundedtruth_segmentations.keys()))

    # Union of all ground-truth masks. Overlapping annotations sum to values
    # above 1, so test for "> 0" rather than "== 1" to keep those pixels.
    total_ground_truth = np.zeros((256, 256))
    for label, mask in groundedtruth_segmentations.items():
        total_ground_truth += mask
    total_ground_truth = (total_ground_truth > 0).astype(int)

    ##### Get YOLOv8 bounding boxes #####

    # Pad image to make square
    # NOTE(review): pads to a fixed 1000 px while the ground-truth masks are padded
    # to the longer image side — the two agree only if that side is 1000 px; confirm.
    padded_image = pad_image_to_square(image_source_pil, desired_size = 1000)

    # Resize to YOLOv8 size
    yolo_resized_image = resize_image(padded_image, new_size = (640, 640))

    # YOLOv8 inference
    yolo_prediction = yolo_model([yolo_resized_image])

    # Get predicted bounding boxes with confidence threshold
    yolo_bboxes = get_yolo_pred_box(yolo_prediction, confidence_threshold = 0.5)

    ##### Get GroundingDINO bounding boxes #####
    # Only the labels reported by YOLO are used as text prompts.
    groundingdino_bboxes = {}
    for predicted_label in yolo_bboxes:

        TEXT_PROMPT = label_prompt_conversion[predicted_label]
        BOX_TRESHOLD = 0.35
        TEXT_TRESHOLD = 0.25

        boxes, _logits, _phrases = predict(model = groundingdino_model, image = image, caption = TEXT_PROMPT, box_threshold = BOX_TRESHOLD, text_threshold = TEXT_TRESHOLD)

        # Normalized (cx, cy, w, h) -> pixels -> coordinates of the 640x640 padded image
        groundingdino_bboxes[predicted_label] = []
        for box in boxes:

            pixel_box = convert_to_pixels(box.tolist(), image_width, image_height)
            resized_box = transform_bounding_box(pixel_box, image_width, image_height, target_size = 640)
            groundingdino_bboxes[predicted_label].append(resized_box)

    #### Get OWL_ViT bounding boxes #####
    owlvittext_bboxes = {}
    for predicted_label in yolo_bboxes:

        text_queries = [label_prompt_conversion[predicted_label]]

        # Process image and text inputs
        owlvit_inputs = owlvit_processor(text=text_queries, images = image_source_pil.convert("RGB"), return_tensors = "pt").to(device)

        # Get predictions
        with torch.no_grad():
            owlvit_outputs = owlvit_model(**owlvit_inputs)

        # Threshold to eliminate low probability predictions
        score_threshold = 0.1

        # Best score per predicted box
        logits = torch.max(owlvit_outputs["logits"][0], dim=-1)
        scores = torch.sigmoid(logits.values).cpu().detach().numpy()
        owlvit_boxes = owlvit_outputs["pred_boxes"][0].cpu().detach().numpy()

        # Keep the confident boxes, highest score first
        ranked = np.argsort(scores)[::-1]
        ranked = ranked[scores[ranked] > score_threshold]
        owlvit_boxes = [owlvit_boxes[i] for i in ranked]

        pixel_format_boxes = [convert_to_pixels(box, image_width, image_height) for box in owlvit_boxes]
        resized_bboxes = [transform_bounding_box(box, image_width, image_height, target_size = 640) for box in pixel_format_boxes]

        # NOTE(review): non_max_suppression used to be called here but its result was
        # never used; every box above the score threshold is kept, as before.
        owlvittext_bboxes[predicted_label] = resized_bboxes

    ##### Get masks #####

    # Resize to SAM size
    sam_resized_image = resize_image(padded_image, new_size = (256, 256))

    sam_model.eval()

    # One SAM pass per box; the masks of a class are summed within each box source
    sam_segmentations_by_source = {}
    for source_name, source_bboxes in [('yolo', yolo_bboxes),
                                       ('groundingdino', groundingdino_bboxes),
                                       ('owlvit', owlvittext_bboxes)]:

        class_segmentations = {}
        for class_name, bboxes in source_bboxes.items():

            for bbox in bboxes:

                # Boxes live in the 640x640 image; bring them to the 256x256 SAM input
                sam_resized_bbox = list(transform_bounding_box(bbox, 640, 640, target_size = 256))

                # SAM inference (inputs go to whichever device holds the model)
                sam_inputs = sam_processor(sam_resized_image, input_boxes = [[sam_resized_bbox]], return_tensors = "pt").to(sam_model.device)
                with torch.no_grad():
                    sam_outputs = sam_model(**sam_inputs, multimask_output = False)

                # Filter predicted map by 0.5
                sam_seg_prob = torch.sigmoid(sam_outputs.pred_masks.squeeze(1))
                sam_seg_prob = sam_seg_prob.cpu().numpy().squeeze()
                sam_seg = (sam_seg_prob > 0.5).astype(np.uint8)

                if class_name not in class_segmentations:
                    class_segmentations[class_name] = sam_seg

                else:
                    class_segmentations[class_name] += sam_seg

        sam_segmentations_by_source[source_name] = class_segmentations

    yolo_sam_segmentations = sam_segmentations_by_source['yolo']
    groundingdino_sam_segmentations = sam_segmentations_by_source['groundingdino']
    owlvit_sam_segmentations = sam_segmentations_by_source['owlvit']

    # Combine different damage types
    # Use the dataset labels themselves: the previous hard-coded list contained the
    # prompt 'cracked damage' instead of the label 'crack'.
    combined_damage_mask = {}
    for damage_type in categories.values():

        # Per-pixel vote count over all masks of this damage type
        total_mask = np.zeros((256, 256))
        for class_segmentations in (yolo_sam_segmentations, groundingdino_sam_segmentations, owlvit_sam_segmentations):

            if damage_type in class_segmentations:
                total_mask += class_segmentations[damage_type]

        max_votes = total_mask.max()
        if max_votes > 0:
            # Keep pixels with at least half of the maximum vote count, but never
            # fewer than one vote (a threshold of 0 would select the whole image).
            vote_threshold = max(1, int(max_votes / 2))
            normalized_mask = (total_mask >= vote_threshold).astype(int)
        else:
            normalized_mask = np.zeros((256, 256))

        combined_damage_mask[damage_type] = normalized_mask

    # Union of all predicted damage masks
    total_pred = np.zeros((256, 256))
    for label, mask in combined_damage_mask.items():
        total_pred += mask
    total_pred = (total_pred > 0).astype(int)

    total_iou = calculate_iou_mask(total_pred, total_ground_truth)
    print(total_iou)

    # Per-label IoU, only for labels that YOLO produced a mask for
    for label, mask in groundedtruth_segmentations.items():

        if label in yolo_sam_segmentations:
            iou = calculate_iou_mask(combined_damage_mask[label], mask)
            print(label, iou)

        else:
            iou = None

In [None]:
# Visualize damage type
#
# Draws one figure per mask source for the selected damage type, on top of the
# SAM-sized image. Uses the variables left behind by the single-image
# inference cell.

damage_type = 'dent'
ok = True
if ok:

    # (figure title, per-label masks) in drawing order
    mask_sources = [("Mask", groundedtruth_segmentations),
                    ("yolo segmentations", yolo_sam_segmentations),
                    ("groundingdino segmentations", groundingdino_sam_segmentations),
                    ("owlvit segmentations", owlvit_sam_segmentations),
                    ("combined segmentations", combined_damage_mask)]

    for title, segmentations in mask_sources:

        # Skip sources without a mask for this damage type explicitly, instead of
        # swallowing every exception and leaving an empty figure behind.
        if damage_type not in segmentations:
            print(f"No '{damage_type}' mask available for: {title}")
            continue

        fig, axes = plt.subplots()
        axes.imshow(np.array(sam_resized_image))
        show_mask(segmentations[damage_type], axes)
        axes.title.set_text(title)
        axes.axis("off")

## Test on full test dataset

In [None]:
# Run the full pipeline on every test image and collect one IoU per
# (image, ground-truth label):
#   1. rasterize the ground-truth polygons into one 256x256 mask per label
#   2. detect damage boxes with YOLOv8, GroundingDINO and OWL-ViT
#   3. turn every box into a SAM mask
#   4. combine the three sources per damage type by voting and score with IoU
testing_results = []
sam_model.eval()
for image_name in tqdm(images):

    image_path = image_base_path + image_name
    image_source, image = load_image(image_path)
    image_source_pil = Image.fromarray(image_source)
    image_height, image_width = image_source.shape[:2]

    # NOTE(review): assumes the file stem equals the COCO image id — confirm against the annotation file.
    image_id = int(image_name.replace('.jpg', ''))
    image_annotations = [annotation for annotation in test_annotations if annotation['image_id'] == image_id]

    ##### Ground-truth masks #####
    groundedtruth_segmentations = {}
    for annotation in image_annotations:

        ground_truth_label = categories[annotation['category_id']]
        ground_truth_mask = coco_poly_to_mask(annotation['segmentation'], image_source.shape[:2])

        # Same square-pad + resize as the image so the masks line up with the SAM output
        padded_ground_truth_mask = pad_to_square(ground_truth_mask)
        resized_ground_truth_mask = resize_mask(padded_ground_truth_mask, 256)

        if ground_truth_label not in groundedtruth_segmentations:
            groundedtruth_segmentations[ground_truth_label] = np.array(resized_ground_truth_mask)

        else:
            groundedtruth_segmentations[ground_truth_label] += np.array(resized_ground_truth_mask)

    ##### Get YOLOv8 bounding boxes #####

    # Pad image to make square
    # NOTE(review): pads to a fixed 1000 px while the ground-truth masks are padded
    # to the longer image side — the two agree only if that side is 1000 px; confirm.
    padded_image = pad_image_to_square(image_source_pil, desired_size = 1000)

    # Resize to YOLOv8 size
    yolo_resized_image = resize_image(padded_image, new_size = (640, 640))

    # YOLOv8 inference
    yolo_prediction = yolo_model([yolo_resized_image])

    # Get predicted bounding boxes with confidence threshold
    yolo_bboxes = get_yolo_pred_box(yolo_prediction, confidence_threshold = 0.5)

    ##### Get GroundingDINO bounding boxes #####
    # Only the labels reported by YOLO are used as text prompts.
    groundingdino_bboxes = {}
    for predicted_label in yolo_bboxes:

        TEXT_PROMPT = label_prompt_conversion[predicted_label]
        BOX_TRESHOLD = 0.35
        TEXT_TRESHOLD = 0.25

        boxes, _logits, _phrases = predict(model = groundingdino_model, image = image, caption = TEXT_PROMPT, box_threshold = BOX_TRESHOLD, text_threshold = TEXT_TRESHOLD)

        # Normalized (cx, cy, w, h) -> pixels -> coordinates of the 640x640 padded image
        groundingdino_bboxes[predicted_label] = []
        for box in boxes:

            pixel_box = convert_to_pixels(box.tolist(), image_width, image_height)
            resized_box = transform_bounding_box(pixel_box, image_width, image_height, target_size = 640)
            groundingdino_bboxes[predicted_label].append(resized_box)

    #### Get OWL_ViT bounding boxes #####
    owlvittext_bboxes = {}
    for predicted_label in yolo_bboxes:

        text_queries = [label_prompt_conversion[predicted_label]]

        # Process image and text inputs
        owlvit_inputs = owlvit_processor(text=text_queries, images = image_source_pil.convert("RGB"), return_tensors = "pt").to(device)

        # Get predictions
        with torch.no_grad():
            owlvit_outputs = owlvit_model(**owlvit_inputs)

        # Threshold to eliminate low probability predictions
        score_threshold = 0.1

        # Best score per predicted box
        logits = torch.max(owlvit_outputs["logits"][0], dim=-1)
        scores = torch.sigmoid(logits.values).cpu().detach().numpy()
        owlvit_boxes = owlvit_outputs["pred_boxes"][0].cpu().detach().numpy()

        # Keep the confident boxes, highest score first
        ranked = np.argsort(scores)[::-1]
        ranked = ranked[scores[ranked] > score_threshold]
        owlvit_boxes = [owlvit_boxes[i] for i in ranked]

        pixel_format_boxes = [convert_to_pixels(box, image_width, image_height) for box in owlvit_boxes]
        resized_bboxes = [transform_bounding_box(box, image_width, image_height, target_size = 640) for box in pixel_format_boxes]

        # NOTE(review): non_max_suppression used to be called here but its result was
        # never used; every box above the score threshold is kept, as before.
        owlvittext_bboxes[predicted_label] = resized_bboxes

    ##### Get masks #####

    # Resize to SAM size
    sam_resized_image = resize_image(padded_image, new_size = (256, 256))

    # One SAM pass per box; the masks of a class are summed within each box source
    sam_segmentations_by_source = {}
    for source_name, source_bboxes in [('yolo', yolo_bboxes),
                                       ('groundingdino', groundingdino_bboxes),
                                       ('owlvit', owlvittext_bboxes)]:

        class_segmentations = {}
        for class_name, bboxes in source_bboxes.items():

            for bbox in bboxes:

                # Boxes live in the 640x640 image; bring them to the 256x256 SAM input
                sam_resized_bbox = list(transform_bounding_box(bbox, 640, 640, target_size = 256))

                # SAM inference (inputs go to whichever device holds the model)
                sam_inputs = sam_processor(sam_resized_image, input_boxes = [[sam_resized_bbox]], return_tensors = "pt").to(sam_model.device)
                with torch.no_grad():
                    sam_outputs = sam_model(**sam_inputs, multimask_output = False)

                # Filter predicted map by 0.5
                sam_seg_prob = torch.sigmoid(sam_outputs.pred_masks.squeeze(1))
                sam_seg_prob = sam_seg_prob.cpu().numpy().squeeze()
                sam_seg = (sam_seg_prob > 0.5).astype(np.uint8)

                if class_name not in class_segmentations:
                    class_segmentations[class_name] = sam_seg

                else:
                    class_segmentations[class_name] += sam_seg

        sam_segmentations_by_source[source_name] = class_segmentations

    yolo_sam_segmentations = sam_segmentations_by_source['yolo']
    groundingdino_sam_segmentations = sam_segmentations_by_source['groundingdino']
    owlvit_sam_segmentations = sam_segmentations_by_source['owlvit']

    # Combine different damage types
    # Use the dataset labels themselves: the previous hard-coded list contained the
    # prompt 'cracked damage' instead of the label 'crack', which made the IoU
    # lookup below fail for images with a crack.
    combined_damage_mask = {}
    for damage_type in categories.values():

        # Per-pixel vote count over all masks of this damage type
        total_mask = np.zeros((256, 256))
        for class_segmentations in (yolo_sam_segmentations, groundingdino_sam_segmentations, owlvit_sam_segmentations):

            if damage_type in class_segmentations:
                total_mask += class_segmentations[damage_type]

        max_votes = total_mask.max()
        if max_votes > 0:
            # Keep pixels with at least half of the maximum vote count, but never
            # fewer than one vote (a threshold of 0 would select the whole image).
            vote_threshold = max(1, int(max_votes / 2))
            normalized_mask = (total_mask >= vote_threshold).astype(int)
        else:
            normalized_mask = np.zeros((256, 256))

        combined_damage_mask[damage_type] = normalized_mask

    # Per-label IoU; None when YOLO produced no mask for that label
    for label, mask in groundedtruth_segmentations.items():

        if label in yolo_sam_segmentations:
            iou = calculate_iou_mask(combined_damage_mask[label], mask)

        else:
            iou = None

        # The label is recorded so that several rows of one image can be told apart
        testing_result = {'image_name':image_name, 'label':label, 'iou':iou}
        testing_results.append(testing_result)
