# Download the input file

In [1]:
import requests

# Google Drive direct-download link for the sample inputs archive.
url = "https://drive.google.com/uc?id=140fl292Ofs6tmEJzIFUPGo97AdIIO3vi&export=download"
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of silently saving an error page as input.zip.
r.raise_for_status()
with open('input.zip', 'wb') as f:
    f.write(r.content)

In [2]:
!unzip input.zip

Archive:  input.zip
   creating: input/
  inflating: input/image_2.jpg       
  inflating: input/image_1.jpg       
  inflating: input/video_1.mp4       
  inflating: input/video_2.mp4       


# Setup

In [3]:
!mkdir outputs

In [4]:
import matplotlib.pyplot as plt
import matplotlib
# Use a large default figure size so the annotated images are easy to inspect.
matplotlib.rcParams.update({'figure.figsize': (21, 18)})


In [5]:
%%writefile coco_names.py
# Class-name lookup table written to coco_names.py.
# detect_utils.predict indexes this list with the integer label ids returned
# by the detection model, so the ORDER of the entries matters.
# NOTE(review): the 'N/A' entries look like placeholders for label ids that
# have no category — confirm against the model's label map before editing.
COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

Writing coco_names.py


In [6]:
#Utility Functions for Drawing Bounding Boxes and Predicting Class Labels
%%writefile detect_utils.py
import torchvision.transforms as transforms
import cv2
import numpy as np

from coco_names import COCO_INSTANCE_CATEGORY_NAMES as coco_names

# one random colour (3 values in [0, 255)) per entry of coco_names,
# so every class index is drawn in its own colour
COLORS = np.random.uniform(low=0, high=255, size=(len(coco_names), 3))

# torchvision preprocessing pipeline: a single ToTensor step
transform = transforms.Compose([transforms.ToTensor()])
# Function to predict the classes
def predict(image, model, device, detection_threshold):
    """Run the detector on one image and keep only the confident detections.

    Args:
        image: Input for the module-level ``transform`` (ToTensor); the
            callers in this notebook pass a PIL image or an OpenCV frame.
        model: Detection model. It is called with a one-image batch and must
            return a sequence whose first item is a dict with the keys
            'boxes', 'labels' and 'scores'.
        device: Device the image tensor is moved to before inference.
        detection_threshold: Minimum score a detection needs to be kept.

    Returns:
        A ``(boxes, pred_classes, labels)`` tuple whose three items are
        aligned index by index:
        boxes -- numpy int32 array of the kept boxes,
        pred_classes -- list of class-name strings for the kept boxes,
        labels -- tensor of label ids for the kept boxes.

    Gradients are not disabled here; callers doing pure inference should wrap
    the call in ``torch.no_grad()``.
    """
    # transform the image to a tensor and add a batch dimension
    image = transform(image).to(device)
    image = image.unsqueeze(0)
    detections = model(image)[0]

    # One mask for everything: boxes, labels and class names are all filtered
    # by the same score threshold, so they stay aligned regardless of the
    # order in which the model returns its detections.
    keep = detections['scores'].detach() >= detection_threshold

    # label ids and class names of the kept detections
    labels = detections['labels'][keep]
    pred_classes = [coco_names[i] for i in labels.cpu().numpy()]

    # bounding boxes of the kept detections, as integer pixel coordinates
    boxes = detections['boxes'].detach()[keep].cpu().numpy().astype(np.int32)

    return boxes, pred_classes, labels

#Function to Draw the Bounding Box and Class Label on the Object
def draw_boxes(boxes, classes, labels, image):
    """Draw every box and its class name on a channel-swapped copy of `image`.

    Args:
        boxes: Iterable of boxes; the first four values of each box are used
            as the (x1, y1, x2, y2) corners.
        classes: Class-name strings, aligned with `boxes`.
        labels: Label ids aligned with `boxes`; each one selects a row of the
            module-level ``COLORS`` table.
        image: Image convertible with ``np.asarray`` (PIL image or array).

    Returns:
        A new array holding the annotated image. Its first and third channels
        are swapped relative to the input (``cv2.COLOR_BGR2RGB``), so an RGB
        input comes back in BGR order and vice versa.
    """
    image = cv2.cvtColor(np.asarray(image), cv2.COLOR_BGR2RGB)
    font, font_scale, thickness = cv2.FONT_HERSHEY_SIMPLEX, 0.8, 2

    # zip stops at the shortest sequence, so extra class/label entries beyond
    # the number of boxes are ignored rather than indexed.
    for box, class_name, label in zip(boxes, classes, labels):
        # Plain Python values: the label may arrive as a 0-d tensor and the
        # colour row is a numpy array.
        color = tuple(float(channel) for channel in COLORS[int(label)])
        x1, y1, x2, y2 = (int(value) for value in box[:4])

        cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness)

        # Default: label just above the box. If that would push the text off
        # the top of the image, draw it just inside the box instead.
        (_, text_height), _ = cv2.getTextSize(class_name, font, font_scale, thickness)
        text_y = y1 - 5
        if text_y - text_height < 0:
            text_y = y1 + text_height + 5

        cv2.putText(image, class_name, (x1, text_y),
                    font, font_scale, color, thickness,
                    lineType=cv2.LINE_AA)
    return image

Writing detect_utils.py


# Object detection in images

In [7]:
%%writefile detect_img.py
import torchvision
import torch
import argparse
import cv2
import detect_utils

from PIL import Image

# construct the argument parser
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', help='path to input image/video')
args = vars(parser.parse_args())

# define the computation device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# load the model
model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn(pretrained=True)
# load the model on to the computation device
model.eval().to(device)

# read the image and run the inference for detections
image = Image.open(args['input'])
boxes, classes, labels = detect_utils.predict(image, model, device, 0.7)
image = detect_utils.draw_boxes(boxes, classes, labels, image)
# cv2.imshow('Image', image)
save_name = f"{args['input'].split('/')[-1].split('.')[0]}"
cv2.imwrite(f"outputs/{save_name}.jpg", image)
# cv2.waitKey(0)

Writing detect_img.py


In [8]:
# Run the image detection script once per sample image; each run saves its
# annotated result under outputs/ using the input file's base name.
!python detect_img.py --input input/image_1.jpg
!python detect_img.py --input input/image_2.jpg

Downloading: "https://download.pytorch.org/models/fasterrcnn_mobilenet_v3_large_fpn-fb6a3cc7.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_mobilenet_v3_large_fpn-fb6a3cc7.pth
100% 74.2M/74.2M [00:00<00:00, 117MB/s] 


In [9]:
# Load the annotated images produced by detect_img.py
image_1 = plt.imread('outputs/image_1.jpg')
image_2 = plt.imread('outputs/image_2.jpg')

# Show each result in its own figure, with the axes hidden
for annotated in (image_1, image_2):
    plt.imshow(annotated)
    plt.axis('off')
    plt.show()

Output hidden; open in https://colab.research.google.com to view.

# Object detection in videos

In [10]:
%%writefile detect_vid.py
import torchvision
import cv2
import torch
import argparse
import time
import detect_utils

# construct the argument parser
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', help='path to input video')
args = vars(parser.parse_args())

# define the computation device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# load the model 
model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn(pretrained=True)
# load the model onto the computation device
model = model.eval().to(device)

cap = cv2.VideoCapture(args['input'])

if (cap.isOpened() == False):
    print('Error while trying to read video. Please check path again')

# get the frame width and height
frame_width = int(cap.get(3))
frame_height = int(cap.get(4))

save_name = f"{args['input'].split('/')[-1].split('.')[0]}"
# define codec and create VideoWriter object 
out = cv2.VideoWriter(f"outputs/{save_name}.mp4", 
                      cv2.VideoWriter_fourcc(*'mp4v'), 30, 
                      (frame_width, frame_height))

frame_count = 0 # to count total frames
total_fps = 0 # to get the final frames per second

# read until end of video
while(cap.isOpened()):
    # capture each frame of the video
    ret, frame = cap.read()
    if ret == True:
        # get the start time
        start_time = time.time()
        with torch.no_grad():
            # get predictions for the current frame
            boxes, classes, labels = detect_utils.predict(frame, model, device, 0.7)
        
        # draw boxes and show current frame on screen
        image = detect_utils.draw_boxes(boxes, classes, labels, frame)

        # get the end time
        end_time = time.time()
        # get the fps
        fps = 1 / (end_time - start_time)
        # add fps to total fps
        total_fps += fps
        # increment frame count
        frame_count += 1
        print(f"Frame counter: {frame_count}, FPS: {fps}")
        # write the FPS on the current frame
        cv2.putText(image, f"{fps:.3f} FPS", (15, 30), cv2.FONT_HERSHEY_SIMPLEX,
                    1, (0, 255, 0), 2)
        # press `q` to exit
        wait_time = max(1, int(fps/4))
        # convert from BGR to RGB color format
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # cv2.imshow('image', image)
        out.write(image)
        if cv2.waitKey(wait_time) & 0xFF == ord('q'):
            break

    else:
        break

# release VideoCapture()
cap.release()
# close all frames and video windows
cv2.destroyAllWindows()

# calculate and print the average FPS
avg_fps = total_fps / frame_count
print(f"Average FPS: {avg_fps:.3f}")


Writing detect_vid.py


In [11]:
# Run the video detection script on the first sample video; it prints one line per processed frame.
!python detect_vid.py --input input/video_1.mp4

Frame counter: 1, FPS: 16.843917914943177
Frame counter: 2, FPS: 18.40034744918775
Frame counter: 3, FPS: 18.50956302239168
Frame counter: 4, FPS: 18.614711390809596
Frame counter: 5, FPS: 23.806654482296715
Frame counter: 6, FPS: 24.809265239971136
Frame counter: 7, FPS: 25.132598704512635
Frame counter: 8, FPS: 25.375117973041647
Frame counter: 9, FPS: 25.374657430654285
Frame counter: 10, FPS: 27.811658300787077
Frame counter: 11, FPS: 27.683164918718774
Frame counter: 12, FPS: 27.124249028344533
Frame counter: 13, FPS: 26.816943192353186
Frame counter: 14, FPS: 27.106719316500037
Frame counter: 15, FPS: 27.292451847995835
Frame counter: 16, FPS: 26.08462897086992
Frame counter: 17, FPS: 27.414289169068674
Frame counter: 18, FPS: 27.572337628188272
Frame counter: 19, FPS: 27.961840254398304
Frame counter: 20, FPS: 28.269599913728026
Frame counter: 21, FPS: 28.03098288456269
Frame counter: 22, FPS: 27.67878047975715
Frame counter: 23, FPS: 28.129490902506255
Frame counter: 24, FPS: 2

In [12]:
# Run the video detection script on the second sample video; it prints one line per processed frame.
!python detect_vid.py --input input/video_2.mp4

Frame counter: 1, FPS: 15.242702639841841
Frame counter: 2, FPS: 16.268531555328003
Frame counter: 3, FPS: 16.09763810956654
Frame counter: 4, FPS: 18.92352184800018
Frame counter: 5, FPS: 19.656960749853543
Frame counter: 6, FPS: 19.375734506079308
Frame counter: 7, FPS: 20.038813428949542
Frame counter: 8, FPS: 21.474341711166975
Frame counter: 9, FPS: 23.036386504314205
Frame counter: 10, FPS: 22.955695544378344
Frame counter: 11, FPS: 22.79463489924132
Frame counter: 12, FPS: 23.231992910158414
Frame counter: 13, FPS: 22.3638961759123
Frame counter: 14, FPS: 23.136264597020194
Frame counter: 15, FPS: 23.025763489736875
Frame counter: 16, FPS: 22.475827108362715
Frame counter: 17, FPS: 23.548027420178872
Frame counter: 18, FPS: 23.499476146454885
Frame counter: 19, FPS: 22.55026398133313
Frame counter: 20, FPS: 22.32568438646078
Frame counter: 21, FPS: 23.10122162126436
Frame counter: 22, FPS: 23.00568244147524
Frame counter: 23, FPS: 22.84292677613485
Frame counter: 24, FPS: 22.905

In [13]:
 from google.colab import files

# Recursively archive the outputs/ directory; the next cell downloads the result as outputs.zip.
!zip -r outputs outputs

  adding: outputs/ (stored 0%)
  adding: outputs/video_2.mp4 (deflated 1%)
  adding: outputs/image_1.jpg (deflated 2%)
  adding: outputs/video_1.mp4 (deflated 1%)
  adding: outputs/image_2.jpg (deflated 1%)


In [14]:
# Hand outputs.zip to google.colab's files.download helper (imported in the previous cell).
files.download('outputs.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>