In [2]:
import torchvision.transforms as transforms
import cv2
import numpy as np
import torch
import torchvision

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
coco_names = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

np.random.seed(0)
COLORS = np.random.uniform(low=0, high=1, size=(len(coco_names), 3))

transform = transforms.Compose([
    transforms.ToTensor(),
#     transforms.Resize((224, 224)),
])

In [5]:
tuple(COLORS[0])

(0.5488135039273248, 0.7151893663724195, 0.6027633760716439)

In [6]:
def predict(image, model, detection_threshold):
    # transform the image to tensor
    image = transform(image).to(device).float()
    image = image.unsqueeze(0) # add a batch dimension
    with torch.no_grad():
        outputs = model(image) # get the predictions on the image
    # get all the scores
    scores = list(outputs[0]['scores'].detach().cpu().numpy())
    # index of those scores which are above a certain threshold
    thresholded_preds_inidices = [scores.index(i) for i in scores if i > detection_threshold]
    # get all the predicted bounding boxes
    bboxes = outputs[0]['boxes'].detach().cpu().numpy()
    # get boxes above the threshold score
    boxes = bboxes[np.array(scores) >= detection_threshold].astype(np.int32)
    # get all the predicited class names
    labels = outputs[0]['labels'].cpu().numpy()
    pred_classes = [coco_names[labels[i]] for i in thresholded_preds_inidices]
    return boxes, pred_classes

In [7]:
def draw_boxes(boxes, classes, image):
    for i, box in enumerate(boxes):
#         color = COLORS[coco_names.index(classes[i])]
        # color = (255, 255, 255)
        color = tuple(COLORS[i])
        # print(color)
        cv2.rectangle(
            image,
            (int(box[0]), int(box[1])),
            (int(box[2]), int(box[3])),
            color, 2
        )
        cv2.putText(image, classes[i], (int(box[0]), int(box[1]-5)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, color, 2, 
                    lineType=cv2.LINE_AA)
    return image

In [8]:
model = torchvision.models.detection.retinanet_resnet50_fpn(pretrained=True, 
                                                            min_size=800)

model.eval().to(device);



In [9]:
def get_result(image):
    
    image = np.array(image)[:, ::-1, :] / 255
    boxes, classes = predict(image, model, 0.55)
    result = draw_boxes(boxes, classes, image)

    return result

In [10]:
camera = cv2.VideoCapture(0)

while True:
    
    return_value, image = camera.read()
    print(image.shape)
    result = get_result(image)
    result = cv2.resize(result, (1200, 860))  

    cv2.imshow('detection', result)
      
    if cv2.waitKey(10) == 27 and return_value:
        break

camera.release()
cv2.destroyAllWindows()

(480, 640, 3)


KeyboardInterrupt: 