In [1]:
import cv2
import torch
import torchvision.transforms as transforms
from torchvision.models import detection
import numpy as np
import pickle

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
CLASSES = [
     'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
                    'train', 'truck', 'boat', 'traffic light', 'fire', 'hydrant',
                    'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
                    'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
                    'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
                    'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
                    'kite', 'baseball bat', 'baseball glove', 'skateboard',
                    'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
                    'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
                    'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
                    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
                    'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
                    'keyboard', 'cell phone', 'microwave oven', 'toaster', 'sink',
                    'refrigerator', 'book', 'clock', 'vase', 'scissors',
                    'teddy bear', 'hair drier', 'toothbrush'
    ]

In [4]:
torch.hub.set_dir('/home/anwar/MedicalImaging/prisma/torch/hub')

In [5]:
def get_ssd_classes(image, results, classes_to_labels):
    identified_classes = []
    for image_idx in range(len(results)):
        
        bboxes, classes, confidences = results[image_idx]
        for idx in range(len(bboxes)):
            identified_classes.append(classes_to_labels[classes[idx]-1])
    return identified_classes

def get_yolo_classes(yolo_dict):
    return list(set(yolo_dict['name']))

In [6]:
#SSD MODEL
def detect_obj_ssd(image):
    # define the computation device
    device = torch.device('cpu')#torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # define the image transforms
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((300, 300)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    ssd_model = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_ssd')
    ssd_model.to(device)
    ssd_model.eval()
    utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_ssd_processing_utils')
    transformed_image = transform(image)
    # convert to torch tensor
    tensor = torch.tensor(transformed_image, dtype=torch.float32)
    # add a batch dimension
    tensor = tensor.unsqueeze(0).to(device)
    # get the detection results
    with torch.no_grad():
        detections = ssd_model(tensor)
    results_per_input = utils.decode_results(detections)
    best_results_per_input = [utils.pick_best(results, 0.7) for results in results_per_input]
    classes_to_labels = utils.get_coco_object_dictionary()
    image_result = get_ssd_classes(image, best_results_per_input, classes_to_labels)
    return image_result

In [7]:
def detect_obj_yolo(image):
    yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s')
    yolo_result = yolo_model(image, size = 300)
    yolo_dict = yolo_result.pandas().xyxy[0]
    classes = get_yolo_classes(yolo_dict)
    return classes

In [8]:
def detect_object(model_name, image):
    

    DEVICE = torch.device("cpu")
    # load the list of categories in the COCO dataset and then generate a
    # set of bounding box colors for each class
    # CLASSES = pickle.loads(open(args["labels"], "rb").read())
    COLORS = np.random.uniform(0, 255, size=(len(CLASSES), 3))


    # initialize a dictionary containing model name and its corresponding 
    # torchvision function call
    MODELS = {
        "frcnn-resnet": detection.fasterrcnn_resnet50_fpn,
        "frcnn-mobilenet": detection.fasterrcnn_mobilenet_v3_large_320_fpn,
        "retinanet": detection.retinanet_resnet50_fpn
    }
    # load the model and set it to evaluation mode
    model = MODELS[model_name](pretrained=True, progress=True).to(DEVICE)
    model.eval()


    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = image.transpose((2, 0, 1))
    # add the batch dimension, scale the raw pixel intensities to the
    # range [0, 1], and convert the image to a floating point tensor
    image = np.expand_dims(image, axis=0)
    image = image / 255.0
    image = torch.FloatTensor(image)
    # send the input to the device and pass the it through the network to
    # get the detections and predictions
    image = image.to(DEVICE)
    detections = model(image)[0]
    return detections

In [9]:
def get_labels(detections):
    labels_to_out = []
    for i in range(0, len(detections["boxes"])):
        confidence = detections["scores"][i]
        if confidence > 0.8:
            idx = int(detections["labels"][i])
            label = "{}".format(CLASSES[idx])
            labels_to_out.append(label)
    return labels_to_out

In [13]:
def get_image_objects(image_path):
    #IMAGE READ
    image = cv2.imread(image_path)
    # keep the original height and width for resizing of bounding boxes
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    model_choice = ['frcnn-resnet', "frcnn-mobilenet", "retinanet"]
    detects = []
    for x in model_choice:
        detections = detect_object(x, image)
        detects+=get_labels(detections)
    detects +=detect_obj_ssd(image)
    detects +=detect_obj_yolo(image)  
    detects = list(set(detects))
    return {image_path: detects}

In [14]:
get_image_objects('sample2.jpg')

Using cache found in /home/anwar/MedicalImaging/prisma/torch/hub/NVIDIA_DeepLearningExamples_torchhub
Using cache found in /home/anwar/MedicalImaging/prisma/torch/hub/NVIDIA_DeepLearningExamples_torchhub
Using cache found in /home/anwar/MedicalImaging/prisma/torch/hub/ultralytics_yolov5_master
fatal: not a git repository (or any of the parent directories): .git
YOLOv5 🚀 2022-3-14 torch 1.10.1+cu102 CUDA:0 (Quadro P4000, 8120MiB)

Fusing layers... 
Model Summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


{'sample2.jpg': ['backpack', 'giraffe', 'zebra', 'elephant']}