### Step1: Annotation

#### Test1 - Full image classification

In [1]:
# import torch
# from detectron2.engine import DefaultPredictor
# from detectron2.config import get_cfg
# from research.detectron2.detectron2.model_zoo import model_zoo
# from detectron2.utils.visualizer import Visualizer
# from detectron2.data import MetadataCatalog
# import cv2
# import matplotlib.pyplot as plt
# import os
# import json
# 
# # Configure the Detectron2 model
# cfg = get_cfg()
# cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
# cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # set threshold for this model
# cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
# cfg.MODEL.DEVICE = "cpu"  # Ensure the model uses the CPU
# 
# # Create a predictor
# predictor = DefaultPredictor(cfg)
# 
# # Provide the path to your input images
# image_dir = "./frames/"  # Change this to the directory containing your images
# output_json = "annotations.json"
# 
# # Initialize annotation data
# annotations = {
#     "images": [],
#     "annotations": [],
#     "categories": [
#         {"id": 1, "name": "book"},
#         {"id": 2, "name": "hand"}
#     ]
# }
# annotation_id = 1
# 
# # Iterate over images in the directory
# for image_file in os.listdir(image_dir):
#     image_path = os.path.join(image_dir, image_file)
# 
#     # Check if the file exists and is an image
#     if not os.path.exists(image_path) or not image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
#         continue
# 
#     # Read the image using OpenCV
#     im = cv2.imread(image_path)
# 
#     # Check if the image was successfully loaded
#     if im is None:
#         print(f"Could not read the image {image_path}. Skipping.")
#         continue
# 
#     # Perform inference
#     outputs = predictor(im)
# 
#     # Get bounding boxes and classes
#     instances = outputs["instances"].to("cpu")
#     boxes = instances.pred_boxes.tensor.numpy()
#     classes = instances.pred_classes.numpy()
# 
#     # Add image info to annotations
#     image_info = {
#         "file_name": image_file,
#         "height": im.shape[0],
#         "width": im.shape[1],
#         "id": len(annotations["images"]) + 1
#     }
#     annotations["images"].append(image_info)
# 
#     # Add annotations for this image
#     for i, box in enumerate(boxes):
#         category_id = classes[i] + 1  # Adjusting class id to start from 1 for COCO format
#         annotation = {
#             "id": annotation_id,
#             "image_id": image_info["id"],
#             "category_id": category_id,
#             "bbox": [float(box[0]), float(box[1]), float(box[2] - box[0]), float(box[3] - box[1])],
#             "area": float((box[2] - box[0]) * (box[3] - box[1])),
#             "iscrowd": 0
#         }
#         annotations["annotations"].append(annotation_id)
#         annotation_id += 1
# 
# # Save annotations to a JSON file
# with open(output_json, "w") as f:
#     json.dump(annotations, f)
# 
# print(f"Annotations saved to {output_json}")


#### Test2 - bbox based

In [5]:
import torch
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from research.detectron2.detectron2 import model_zoo
from detectron2.data import MetadataCatalog
import cv2
import os
import json

# Configure the Detectron2 model: a COCO-pretrained Mask R-CNN is used as an automatic
# labeller; only its predicted boxes and classes are read below.
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # set threshold for this model
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
cfg.MODEL.DEVICE = "cpu"  # Ensure the model uses the CPU

# Create a predictor
predictor = DefaultPredictor(cfg)

# COCO metadata: maps the predicted class indices back to class names
coco_metadata = MetadataCatalog.get(cfg.DATASETS.TRAIN[0])
coco_classes = coco_metadata.thing_classes

# Provide the path to your input images
image_dir = "./frames"  # Change this to the directory containing your images
output_json = "annotations.json"

# Single source of truth for the classes that are kept and the id written for each.
# The ids are 0-based on purpose: the training step passes "category_id" to Detectron2
# unchanged, and Detectron2 expects ids in the range [0, NUM_CLASSES).
category_ids = {"person": 0, "book": 1}

# Initialize annotation data; "categories" is derived from the mapping above so the ids
# declared here always agree with the ids stored on the individual annotations.
annotations = {
    "images": [],
    "annotations": [],
    "categories": [{"id": cat_id, "name": name} for name, cat_id in category_ids.items()]
}
annotation_id = 1

# Iterate over images in the directory. os.listdir() returns entries in arbitrary order,
# so sort them to keep image/annotation ids reproducible between runs.
for image_file in sorted(os.listdir(image_dir)):
    # Skip anything that is not an image file
    if not image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
        continue

    image_path = os.path.join(image_dir, image_file)

    # Read the image using OpenCV (returns None for unreadable/missing files)
    im = cv2.imread(image_path)
    if im is None:
        print(f"Could not read the image {image_path}. Skipping.")
        continue

    # Perform inference
    outputs = predictor(im)

    # Get bounding boxes and classes
    instances = outputs["instances"].to("cpu")
    boxes = instances.pred_boxes.tensor.numpy()
    classes = instances.pred_classes.numpy()

    # Add image info to annotations (ids are 1-based, in processing order)
    image_info = {
        "file_name": image_file,
        "height": im.shape[0],
        "width": im.shape[1],
        "id": len(annotations["images"]) + 1
    }
    annotations["images"].append(image_info)

    # Add annotations for this image
    for box, class_index in zip(boxes, classes):
        category_id = category_ids.get(coco_classes[class_index])
        if category_id is None:
            continue  # Skip other classes

        # Predicted boxes are (x0, y0, x1, y1); the file stores (x, y, width, height)
        x0, y0, x1, y1 = box
        box_width = x1 - x0
        box_height = y1 - y0

        annotations["annotations"].append({
            "id": annotation_id,
            "image_id": image_info["id"],
            "category_id": category_id,
            "bbox": [float(x0), float(y0), float(box_width), float(box_height)],
            "area": float(box_width * box_height),
            "iscrowd": 0
        })
        annotation_id += 1

# Save annotations to a JSON file
with open(output_json, "w") as f:
    json.dump(annotations, f)

print(f"Annotations saved to {output_json}")


[32m[07/17 22:39:19 d2.checkpoint.detection_checkpoint]: [0m[DetectionCheckpointer] Loading from https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl ...
Annotations saved to annotations.json


### Step2: Training the model

#### Dataset preparation and training

In [9]:
from detectron2.data import DatasetCatalog, MetadataCatalog

# Drop any previous registration of the custom splits from both catalogs,
# so the registration cell can be executed again in the same kernel.
for split in ("train", "val"):
    registered_name = f"custom_{split}"
    for catalog in (DatasetCatalog, MetadataCatalog):
        if registered_name in catalog.list():
            catalog.remove(registered_name)


In [10]:
import os
import json
from detectron2.engine import DefaultTrainer
from detectron2.config import get_cfg
from research.detectron2.detectron2 import model_zoo
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.structures import BoxMode
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader
import detectron2.utils.comm as comm
from detectron2.utils.logger import setup_logger

# Inputs for training: the annotation JSON and the directory holding the frames it names
image_root = "./frames"
annotations_file = "annotations.json"

# Initialise Detectron2's logger before any training code runs
setup_logger()

def get_custom_dicts():
    """Load the annotation JSON and convert it into a list of per-image records.

    Reads the module-level ``annotations_file`` and prefixes every image's
    ``file_name`` with the module-level ``image_root``.

    Returns:
        list[dict]: one record per entry of the file's ``"images"`` list, in file
        order, with keys ``file_name``, ``image_id``, ``height``, ``width`` and
        ``annotations``. Each annotation holds ``bbox`` (tagged as
        ``BoxMode.XYWH_ABS``), ``category_id`` and ``iscrowd``, copied from the
        file. Images without annotations get an empty list.
    """
    with open(annotations_file) as f:
        dataset_dicts = json.load(f)

    # Index the annotations by image once, instead of rescanning the whole
    # annotation list for every image. File order is preserved within each image.
    annos_by_image = {}
    for anno in dataset_dicts['annotations']:
        annos_by_image.setdefault(anno["image_id"], []).append({
            "bbox": anno["bbox"],
            "bbox_mode": BoxMode.XYWH_ABS,
            "category_id": anno["category_id"],
            "iscrowd": anno["iscrowd"]
        })

    dataset = []
    for image_info in dataset_dicts['images']:
        dataset.append({
            "file_name": os.path.join(image_root, image_info["file_name"]),
            "image_id": image_info["id"],
            "height": image_info["height"],
            "width": image_info["width"],
            # Copy so every record owns its list even if an image id is repeated
            "annotations": list(annos_by_image.get(image_info["id"], []))
        })

    return dataset

# Register the datasets
# NOTE(review): both splits are backed by the same loader, so "custom_val" contains
# exactly the images used for training - confirm this is intended before trusting
# the evaluation numbers as held-out metrics.
thing_class_names = ["person", "book"]
for split in ("train", "val"):
    split_name = "custom_" + split
    DatasetCatalog.register(split_name, get_custom_dicts)
    MetadataCatalog.get(split_name).set(thing_classes=list(thing_class_names))
custom_metadata = MetadataCatalog.get("custom_train")

# Base recipe and initial weights both come from the same model-zoo entry
base_config = "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file(base_config))

# Model
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(base_config)
cfg.MODEL.DEVICE = "cpu"  # Use CPU for training
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 2  # person and book
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128

# Data
cfg.DATASETS.TRAIN = ("custom_train",)
cfg.DATASETS.TEST = ("custom_val",)
cfg.DATALOADER.NUM_WORKERS = 2

# Solver: 1000 iterations, learning rate multiplied by GAMMA at each entry of STEPS
cfg.SOLVER.IMS_PER_BATCH = 2
cfg.SOLVER.BASE_LR = 0.00025
cfg.SOLVER.MAX_ITER = 1000
cfg.SOLVER.STEPS = (700, 900)
cfg.SOLVER.GAMMA = 0.1

# Evaluation
cfg.TEST.EVAL_PERIOD = 50  # Evaluate every 50 iterations

os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

# Implementing a custom trainer to include evaluation
class TrainerWithVal(DefaultTrainer):
    """DefaultTrainer that evaluates the first ``cfg.DATASETS.TEST`` entry with COCOEvaluator.

    Periodic evaluation is driven by the evaluation hook that ``DefaultTrainer``
    installs from ``cfg.TEST.EVAL_PERIOD``; that hook calls ``test`` and stores the
    result in ``_last_eval_results``. ``run_step`` is therefore intentionally not
    overridden: evaluating there as well would run every evaluation twice, and it
    would have to read the period from somewhere other than the trainer's own config.
    """

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        """Return a non-distributed COCOEvaluator for ``dataset_name``.

        Args:
            cfg: the training config.
            dataset_name: name of the registered dataset to evaluate.
            output_folder: where evaluation artefacts are written; defaults to
                ``<cfg.OUTPUT_DIR>/inference``.
        """
        if output_folder is None:
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
        return COCOEvaluator(dataset_name, cfg, False, output_folder)

    def __init__(self, cfg):
        super().__init__(cfg)
        # Most recent evaluation result; refreshed by the evaluation hook.
        self._last_eval_results = None

    @classmethod
    def test(cls, cfg, model, evaluators=None):
        """Evaluate ``model`` on the first dataset listed in ``cfg.DATASETS.TEST``.

        Declared as a classmethod, like the method it overrides, so it can be
        called both on the class and on an instance.

        Args:
            cfg: config naming the test dataset(s).
            model: the model to evaluate.
            evaluators: optional list of evaluators; only the first one is used.
                When omitted, one is built per test dataset via ``build_evaluator``.

        Returns:
            The result of ``inference_on_dataset``, or an empty dict when no test
            dataset is configured.
        """
        if not cfg.DATASETS.TEST:
            return {}
        if evaluators is None:
            evaluators = [cls.build_evaluator(cfg, name) for name in cfg.DATASETS.TEST]
        data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])
        return inference_on_dataset(model, data_loader, evaluators[0])

# Build the trainer from the config defined above
trainer = TrainerWithVal(cfg)
# resume=False: do not pick up a previous run's checkpoint; the log below reports
# "Starting training from iteration 0"
trainer.resume_or_load(resume=False)
# Run the training loop; on CPU this takes over an hour according to the logged ETA
trainer.train()


[32m[07/17 23:46:17 d2.engine.defaults]: [0mModel:
GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): ResNet(
      (stem): BasicStem(
        (conv1): Conv2d(
          3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
          (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05)
        )
      )
 

Skip loading parameter 'roi_heads.box_predictor.cls_score.weight' to the model due to incompatible shapes: (81, 1024) in the checkpoint but (3, 1024) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.cls_score.bias' to the model due to incompatible shapes: (81,) in the checkpoint but (3,) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.bbox_pred.weight' to the model due to incompatible shapes: (320, 1024) in the checkpoint but (8, 1024) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.bbox_pred.bias' to the model due to incompatible shapes: (320,) in the checkpoint but (8,) in the model! You might want to double check if this is expected.
Some model parameters or buffers are not found in the checkpoint:
[34mroi_heads.box_predictor.bbox_pred.{bias, weight}[0m
[34mroi_heads.box_predictor.cls_s

[32m[07/17 23:46:17 d2.engine.train_loop]: [0mStarting training from iteration 0
[32m[07/17 23:47:42 d2.utils.events]: [0m eta: 1:08:08  iter: 19  total_loss: 2.153  loss_cls: 1.016  loss_box_reg: 0.9059  loss_rpn_cls: 0.01852  loss_rpn_loc: 0.1921    time: 4.1128  last_time: 4.1161  data_time: 0.1002  last_data_time: 0.0011   lr: 2.9275e-07  
[32m[07/17 23:49:09 d2.utils.events]: [0m eta: 1:08:20  iter: 39  total_loss: 2.133  loss_cls: 1.013  loss_box_reg: 0.8915  loss_rpn_cls: 0.02005  loss_rpn_loc: 0.1859    time: 4.2371  last_time: 3.6182  data_time: 0.0010  last_data_time: 0.0009   lr: 3.3775e-07  
[32m[07/17 23:49:52 d2.evaluation.coco_evaluation]: [0mTrying to convert 'custom_val' to COCO format ...
[32m[07/17 23:49:52 d2.data.datasets.coco]: [0mConverting annotations of dataset 'custom_val' to COCO format ...)
[32m[07/17 23:49:52 d2.data.datasets.coco]: [0mConverting dataset dicts into COCO format
[32m[07/17 23:49:52 d2.data.datasets.coco]: [0mConversion finished,

#### Inference

In [14]:
import os
import cv2
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from research.detectron2.detectron2 import model_zoo
from detectron2.utils.visualizer import Visualizer, ColorMode
from detectron2.data import MetadataCatalog

# Load the configuration and set the trained weights
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"))
cfg.DATASETS.TEST = ("custom_val",)
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")  # Path to the trained model
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 2  # Ensure this matches the number of classes (person and book)
cfg.MODEL.DEVICE = "cpu"  # Use CPU for inference

# Setup the predictor
predictor = DefaultPredictor(cfg)

# Metadata for visualizer. The class names are normally attached when the datasets are
# registered; set them here if that has not happened in this kernel, so the drawn
# labels still show names.
custom_metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0])
if custom_metadata.get("thing_classes") is None:
    custom_metadata.set(thing_classes=["person", "book"])

# Provide the path to your input video
video_path = './vids/20240617_210932.mp4'
output_path = './output_video.mp4'

cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Fail loudly instead of writing an empty file and reporting success
    cap.release()
    raise IOError(f"Could not open the video {video_path}")

# Mirror the source's frame size and frame rate so the output plays at the original
# speed; fall back to 30 fps when the container does not report a usable rate.
frame_size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
source_fps = cap.get(cv2.CAP_PROP_FPS)
output_fps = source_fps if source_fps and source_fps > 0 else 30.0

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, output_fps, frame_size)

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        outputs = predictor(frame)
        # OpenCV frames are BGR and the Visualizer works in RGB, so the channel order
        # is flipped on the way in and flipped back before writing.
        v = Visualizer(frame[:, :, ::-1], metadata=custom_metadata, instance_mode=ColorMode.IMAGE)  # Use ColorMode.IMAGE for color output
        out_frame = v.draw_instance_predictions(outputs["instances"].to("cpu"))

        out.write(out_frame.get_image()[:, :, ::-1])
finally:
    # Always release the capture and finalise the output file, even if a frame fails
    cap.release()
    out.release()

print(f"Output video saved to {output_path}")


[32m[07/18 12:54:13 d2.checkpoint.detection_checkpoint]: [0m[DetectionCheckpointer] Loading from ./output/model_final.pth ...
Output video saved to ./output_video.mp4
