# General Approach and Requirements

The highest-scoring models with the smallest hardware requirements for training and inference were FPN2 [A11], MMN [A9] and MFaster-RCNN [A.14]. These all required a GTX 1080 Ti, which is available on a few group members' machines. I will be experimenting with the MFaster-RCNN detection architecture with an FPN integrated to handle the different scales of objects. 

# MFaster-RCNN with FPN2
The following packages are needed for the MFaster architecture with an FPN2.

In [None]:
# export CUDA_LAUNCH_BLOCKING=1
import torch
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import torchvision.transforms as T
import torch.utils.data
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.optim import lr_scheduler

import sys
import utils
import json
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import traceback
import numpy as np
import cv2
from matplotlib import pyplot as plt
import matplotlib.patches as patches
import os
from tqdm import tqdm
from PIL import Image

from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval



# Capture the Dataset for Preprocessing
Read all of the image files into a list, together with their corresponding annotations. The dataset also provides a getter for the length of the dataset (used for iteration later), along with a `__getitem__` function for retrieving an image and its annotations by index.

In [None]:
class VisDroneDataset(Dataset):
    """VisDrone-DET images paired with their per-image annotation files.

    Every image in ``root_dir`` is matched with a ``.txt`` file of the same
    stem in ``annotation_dir``. Each annotation line holds eight
    comma-separated integers:
    ``x, y, w, h, score, categoryID, truncation, occlusion``.
    Only categories 1-10 are kept and boxes are stored as
    ``[x1, y1, x2, y2]``.

    Args:
        root_dir: Directory containing the image files.
        annotation_dir: Directory containing the annotation text files.
        transforms: Optional callable invoked as ``transforms(img, target)``
            that returns the transformed ``(img, target)`` pair.
    """

    # Directory entries with other extensions (notes, checkpoint folders, ...)
    # are not treated as images.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp')

    def __init__(self, root_dir, annotation_dir, transforms=None):
        self.root_dir = root_dir
        self.annotation_dir = annotation_dir
        self.image_files = sorted(
            name for name in os.listdir(root_dir)
            if name.lower().endswith(self.IMAGE_EXTENSIONS)
        )
        self.transforms = transforms
        # Index 0 is the background class; indices 1-10 line up with the
        # category ids kept by _load_annotations.
        self.classes = ['background', 'pedestrian', 'people', 'car', 'van', 'bus', 'truck', 'tricycle', 'awning-tricycle', 'bicycle', 'motorcycle']
        self.annotations = self._load_annotations()

    def _load_annotations(self):
        """Parse every annotation file once and cache boxes/labels per image.

        Returns:
            Dict mapping image file name to
            ``{'boxes': [[x1, y1, x2, y2], ...], 'labels': [int, ...]}``.
            Images without an annotation file get empty lists.
        """
        annotations = {}
        for imgName in self.image_files:
            stem, _ = os.path.splitext(imgName)
            annotation_path = os.path.join(self.annotation_dir, stem + '.txt')
            boxes = []
            labels = []
            if os.path.exists(annotation_path):
                with open(annotation_path, 'r') as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            continue
                        try:
                            x, y, w, h, _score, categoryID, _truncation, _occlusion = map(int, line.split(',')[:8])
                        except ValueError as e:
                            print(f"Error parsing line in {annotation_path}: {line} - {e}")
                            continue

                        # Zero-area boxes are rejected by torchvision's
                        # detection models during training, so drop them here.
                        if w <= 0 or h <= 0:
                            continue

                        if 1 <= categoryID <= 10:
                            boxes.append([x, y, x + w, y + h])
                            labels.append(categoryID)
            annotations[imgName] = {'boxes': boxes, 'labels': labels}

        return annotations

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.image_files)

    def _build_target(self, idx):
        """Build the detection target dict for image ``idx`` without loading the image."""
        annotationData = self.annotations[self.image_files[idx]]
        box_list = annotationData['boxes']
        # Reshape explicitly so an image without objects yields a (0, 4)
        # tensor rather than a (0,) one.
        boxes = torch.as_tensor(box_list, dtype=torch.float32).reshape(len(box_list), 4)
        labels = torch.as_tensor(annotationData['labels'], dtype=torch.int64)
        return {
            "boxes": boxes,
            "labels": labels,
            "image_id": torch.tensor([idx]),
        }

    def __getitem__(self, idx):
        """Return ``(img, target)`` for the image at position ``idx``.

        ``target`` contains ``boxes`` (float32, N x 4), ``labels`` (int64, N)
        and ``image_id`` (one-element tensor holding ``idx``).
        """
        imgPath = os.path.join(self.root_dir, self.image_files[idx])
        img = Image.open(imgPath).convert("RGB")
        target = self._build_target(idx)

        if self.transforms:
            img, target = self.transforms(img, target)

        return img, target

# Data Augmentation
Resize and normalize the dataset with flips and scaling.

In [None]:
class _DetectionTransform:
    """Convert an image to a tensor and, in training, flip image and boxes together."""

    def __init__(self, train, flip_prob=0.5):
        self.train = train
        self.flip_prob = flip_prob

    def __call__(self, img, target=None):
        # Images that are already tensors are passed through untouched.
        if not isinstance(img, torch.Tensor):
            img = T.ToTensor()(img)

        if self.train and torch.rand(1).item() < self.flip_prob:
            img = img.flip(-1)
            if target is not None and "boxes" in target:
                boxes = target["boxes"]
                if boxes.numel() > 0:
                    # Mirror the x coordinates; x1/x2 swap roles so that
                    # x1 <= x2 still holds after the flip.
                    width = img.shape[-1]
                    flipped = boxes.clone()
                    flipped[..., 0] = width - boxes[..., 2]
                    flipped[..., 2] = width - boxes[..., 0]
                    target = dict(target, boxes=flipped)

        if target is None:
            return img
        return img, target


def get_transform(train):
    """Return the preprocessing callable for the training or evaluation split.

    The returned object can be called as ``transform(img, target)`` and then
    returns ``(img, target)``; called with the image only it returns just the
    image tensor. With ``train=True`` the image is flipped horizontally with
    probability 0.5 and the ``boxes`` entry of ``target`` is mirrored with it.

    Args:
        train: Whether to apply the random horizontal flip.
    """
    return _DetectionTransform(train)

    

# Load the dataset with dataloaders
Create custom dataset with the augmented data appended to the original dataset using the data loaders from before.

In [None]:
def splitData(batch):
    """Collate a batch of samples by transposing it.

    A batch such as ``[(img0, tgt0), (img1, tgt1)]`` becomes
    ``((img0, img1), (tgt0, tgt1))``, so variable-sized items are grouped
    per field instead of being stacked.
    """
    return (*zip(*batch),)




In [None]:
def train_one_epoch(model, optimizer, dataLoader, device, epoch, print_freq):
    """Train ``model`` for one pass over ``dataLoader``.

    During the first epoch (``epoch == 0``) a per-iteration warm-up schedule
    scales the learning rate linearly from 1/100 of its value up to the full
    value over at most 100 iterations.

    Args:
        model: Detection model that returns a dict of losses when called with
            ``(images, targets)`` in training mode.
        optimizer: Optimizer updating the model parameters.
        dataLoader: Iterable yielding ``(images, targets)`` batches.
        device: Device the images and target tensors are moved to.
        epoch: Zero-based epoch index, used for logging and warm-up.
        print_freq: Logging interval passed to ``MetricLogger.log_every``.

    Returns:
        The ``utils.MetricLogger`` that collected the logged values.

    Note:
        Calls ``sys.exit(1)`` when the summed loss is NaN or infinite.
    """
    model.train()
    metricLogger = utils.MetricLogger(delimiter=" ")
    metricLogger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = f'Epoch: [{epoch}]'

    # Named so it does not shadow the module-level ``lr_scheduler`` import.
    warmup_scheduler = None
    if epoch == 0:
        warmupFactor = 1. / 100
        warmupIter = min(len(dataLoader) - 1, 100)

        def lr_lambda(step):
            if step < warmupIter:
                alpha = float(step) / warmupIter
                return warmupFactor * (1 - alpha) + alpha
            return 1

        warmup_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    for images, targets in metricLogger.log_every(dataLoader, print_freq, header):
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        lossDict = model(images, targets)
        losses = sum(loss for loss in lossDict.values())

        # The reduced values are only used for logging and the sanity check;
        # the backward pass uses the un-reduced ``losses``.
        lossDictReduced = utils.reduce_dict(lossDict)
        losses_reduced = sum(loss for loss in lossDictReduced.values())

        # torch.isfinite needs a tensor, so test the tensor rather than the
        # Python float extracted from it.
        if not torch.isfinite(losses_reduced):
            print(f"Loss is {losses_reduced.item()}, stopping training")
            print(lossDictReduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if warmup_scheduler is not None:
            warmup_scheduler.step()

        metricLogger.update(loss=losses_reduced, **lossDictReduced)
        metricLogger.update(lr=optimizer.param_groups[0]["lr"])

    return metricLogger


        
        
        

In [None]:
def evaluate(model, data_loader, device):
    """Run ``model`` over ``data_loader`` and report COCO bounding-box metrics.

    The ground truth is built from ``data_loader.dataset`` with
    ``convertToCOCO``; this function expects that call to return a
    pycocotools ``COCO`` object. Each target must carry an ``image_id``
    tensor whose value matches the image id used in that ground truth.

    Args:
        model: Detection model returning one prediction dict per image in
            evaluation mode.
        data_loader: Iterable yielding ``(images, targets)`` batches.
        device: Device the images are moved to for inference.

    Returns:
        The ``CocoEvaluator`` holding the accumulated results.
    """
    model.eval()
    metricLogger = utils.MetricLogger(delimiter="  ")
    header = 'Test:'

    coco = convertToCOCO(data_loader.dataset)
    iou_types = ["bbox"]
    coco_evaluator = CocoEvaluator(coco, iou_types)
    cpu_device = torch.device('cpu')

    # No gradients are needed for inference.
    with torch.no_grad():
        for images, targets in metricLogger.log_every(data_loader, 100, header):
            images = [img.to(device) for img in images]

            # Synchronizing is only possible when CUDA is present.
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            outputs = model(images)

            outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
            res = {target["image_id"].item(): output for target, output in zip(targets, outputs)}
            coco_evaluator.update(res)

    coco_evaluator.synchronize_between_processes()

    coco_evaluator.accumulate()
    coco_evaluator.summarize()

    return coco_evaluator

class CocoEvaluator:
    """Collect detections batch by batch and score them with pycocotools.

    Usage: call ``update`` once per batch, then ``accumulate`` and
    ``summarize``. Detections from all batches are kept and evaluated
    together in ``accumulate``.

    Args:
        cocoGt: pycocotools ``COCO`` object holding the ground truth.
        iouTypes: List or tuple of IoU types; each must be ``"bbox"`` or
            ``"segm"``.

    Raises:
        TypeError: If ``iouTypes`` is not a list or tuple.
        ValueError: If an entry of ``iouTypes`` is not an allowed type.
    """

    def __init__(self, cocoGt, iouTypes):
        if not isinstance(iouTypes, (list, tuple)):
            raise TypeError(f"iou_types must be a list or tuple of strings, got {iouTypes}")
        allowed_iou_types = ("bbox", "segm")
        for iou_type in iouTypes:
            if iou_type not in allowed_iou_types:
                raise ValueError(f"iou_type: {iou_type} not in {allowed_iou_types}")
        self.cocoGt = cocoGt
        self.iouTypes = iouTypes
        self.imgIds = []
        # Populated by accumulate(): iou type -> COCOeval instance.
        self.cocoEval = {}
        # COCO result records gathered across all update() calls.
        self._detections = {iou_type: [] for iou_type in iouTypes}

    @property
    def coco_eval(self):
        """Snake-case alias of ``cocoEval``."""
        return self.cocoEval

    def update(self, predictions):
        """Record the detections of one batch.

        Args:
            predictions: Dict mapping image id to a prediction dict with
                ``boxes``, ``scores`` and ``labels``.
        """
        imgIds = [int(img_id) for img_id in np.unique(list(predictions.keys()))]
        self.imgIds.extend(imgIds)

        for iou_type in self.iouTypes:
            self._detections[iou_type].extend(
                _create_coco_results(self.cocoGt, predictions, iou_type)
            )

    def synchronize_between_processes(self):
        """No-op; present for single-process use."""
        pass

    def accumulate(self):
        """Evaluate all recorded detections and accumulate the per-image results."""
        for iou_type in self.iouTypes:
            detections = self._detections[iou_type]
            # loadRes cannot handle an empty list, so fall back to an empty
            # COCO object when nothing was detected.
            cocoDt = self.cocoGt.loadRes(detections) if detections else COCO()
            coco_eval = COCOeval(self.cocoGt, cocoDt, iou_type)
            if self.imgIds:
                # Score only the images that were actually seen.
                coco_eval.params.imgIds = sorted(set(self.imgIds))
            coco_eval.evaluate()
            coco_eval.accumulate()
            self.cocoEval[iou_type] = coco_eval

    def summarize(self):
        """Print the summary metrics for every evaluated IoU type."""
        for iouType, cocoEval in self.cocoEval.items():
            print(f"IoU metric: {iouType}")
            cocoEval.summarize()

    @property
    def results(self):
        """Dict mapping each IoU type to its list of summary statistics."""
        return {iouType: cocoEval.stats.tolist() for iouType, cocoEval in self.cocoEval.items()}
            
     
            
            

# Make Results in COCO format


In [None]:
def _create_coco_results(cocoGt, predictions, iouType): 
    results = []
    for image_id, prediction in predictions.items():
        if len(prediction) == 0:
            continue

        boxes = prediction["boxes"].tolist()
        scores = prediction["scores"].tolist()
        labels = prediction["labels"].tolist()

        coco_predictions = []
        for box, score, label in zip(boxes, scores, labels):
            coco_predictions.append(
                {
                    "image_id": image_id,
                    "bbox": [box[0], box[1], box[2] - box[0], box[3] - box[1]], # Convert to [x, y, w, h]
                    "score": score,
                    "category_id": int(label),
                }
            )
        results.extend(coco_predictions)
    return results

def convertToCOCO(dataset):
    """Build an in-memory pycocotools ``COCO`` ground-truth object.

    The dataset must expose ``classes`` (index 0 is skipped), ``image_files``
    and return ``(image, target)`` items whose target holds ``boxes`` in
    ``[x1, y1, x2, y2]`` form and ``labels``. Image ids are the dataset
    indices.

    Returns:
        A ``COCO`` object with its index already created.
    """
    # Category ids start at 1 because the first class entry is left out.
    categories = [
        {"id": position + 1, "name": class_name}
        for position, class_name in enumerate(dataset.classes[1:])
    ]
    image_records = []
    annotation_records = []

    for index in range(len(dataset)):
        file_name = dataset.image_files[index]
        _, target = dataset[index]
        # Width and height are placeholders (zero).
        image_records.append({"id": index, "file_name": file_name, "width": 0, "height": 0})

        for j, box_tensor in enumerate(target["boxes"]):
            corners = box_tensor.tolist()
            category = target["labels"][j].item()
            box_w = corners[2] - corners[0]
            box_h = corners[3] - corners[1]
            annotation_records.append({
                # Ids are consecutive, starting at 1, across all images.
                "id": len(annotation_records) + 1,
                "image_id": index,
                "category_id": category,
                "bbox": [corners[0], corners[1], box_w, box_h],
                "area": box_w * box_h,
                "iscrowd": 0
            })

    coco = COCO()
    coco.dataset = {
        "info": {"description": "VisDrone 2019 Dataset"},
        "licenses": [{"id": 1, "name": "Unknown", "url": ""}],
        "categories": categories,
        "images": image_records,
        "annotations": annotation_records
    }
    coco.createIndex()
    return coco



# Main Function

In [None]:
def main():
    """Fine-tune a COCO-pretrained Faster R-CNN (ResNet-50 FPN) on VisDrone-DET.

    Trains for 10 epochs, evaluates on the validation split after every epoch
    and writes one checkpoint file per epoch. Returns early with an error
    message when any of the expected data directories is missing.
    """
    # Define paths
    train_img_dir = "VisDrone2019-DET-train/images"
    train_anno_dir = "VisDrone2019-DET-train/annotations"
    val_img_dir = "VisDrone2019-DET-val/images"
    val_anno_dir = "VisDrone2019-DET-val/annotations"

    # Check if directories exist
    required_dirs = (train_img_dir, train_anno_dir, val_img_dir, val_anno_dir)
    if not all(os.path.exists(path) for path in required_dirs):
        print("Error: One or more image or annotation directories not found. Please adjust the paths.")
        return

    # Device configuration
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print(f"Using device: {device}")

    # Define transforms
    train_transforms = get_transform(train=True)
    val_transforms = get_transform(train=False)

    # Load datasets
    train_dataset = VisDroneDataset(train_img_dir, train_anno_dir, transforms=train_transforms)
    val_dataset = VisDroneDataset(val_img_dir, val_anno_dir, transforms=val_transforms)

    # Define data loaders. splitData groups images and targets per field,
    # which is needed because images and box counts differ in size.
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, collate_fn=splitData)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, num_workers=2, collate_fn=splitData)

    # Load pre-trained Faster R-CNN model and replace its box predictor so
    # the output size matches the dataset's classes.
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
    num_classes = len(train_dataset.classes)  # 11 classes (including background)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    model.to(device)

    # Optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.AdamW(params, lr=0.0001, weight_decay=0.0005)

    # Learning rate scheduler: multiply the learning rate by 0.1 every 3 epochs
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

    # Training loop
    num_epochs = 10
    for epoch in range(num_epochs):
        # Train for one epoch
        train_one_epoch(model, optimizer, train_loader, device, epoch, print_freq=100)
        scheduler.step()

        # Evaluate on the validation set; the first bbox statistic is the AP
        # reported in the summary.
        coco_evaluator = evaluate(model, val_loader, device)
        print(f"Epoch {epoch} Validation AP: {coco_evaluator.results['bbox'][0]:.3f}")

        # Save checkpoint (optional)
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'lr_scheduler_state_dict': scheduler.state_dict(),
        }, f'fasterrcnn_visdrone_epoch_{epoch}.pth')

    print("Training finished!")

if __name__ == "__main__":
    # Single-process stand-ins for the distributed-training helpers of the
    # ``utils`` module. They are installed on ``utils`` before main() runs.
    # NOTE(review): presumably ``utils.reduce_dict`` and the metric logger
    # consult these helpers - confirm against the utils module in use.
    import sys
    import torch.utils.data
    import utils

    # Nothing to set up when running in a single process.
    def init_distributed():
        pass

    # Nothing to tear down when running in a single process.
    def cleanup_distributed():
        pass

    # Report a world of exactly one process.
    def get_world_size():
        return 1

    # The only process is always the main one.
    def is_main_process():
        return True

    # The only process has rank 0.
    def get_rank():
        return 0

    # Replace the helpers on the utils module with the stubs defined above.
    utils.init_distributed = init_distributed
    utils.cleanup_distributed = cleanup_distributed
    utils.get_world_size = get_world_size
    utils.is_main_process = is_main_process
    utils.get_rank = get_rank

    main()

Bad labels at index 0: tensor([ 5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  3,
         5,  5,  5,  5,  5,  5,  5,  6,  5,  6,  5,  6,  6,  7,  5,  5,  5,  5,
        11,  4,  5,  5,  5,  8,  5,  9,  5,  5,  6,  5,  5,  5,  3,  3,  4,  4,
         3, 11,  3, 11,  3, 11, 11,  3,  3, 11,  9,  3,  6, 11, 11,  3,  3])
Batch size: 4
Image shape: [torch.Size([3, 1050, 1400]), torch.Size([3, 1078, 1916]), torch.Size([3, 1500, 2000]), torch.Size([3, 1078, 1916])]
Targets: ({'boxes': tensor([[1358.,  279., 1379.,  303.],
        [1319.,  268., 1351.,  292.],
        [1332.,  245., 1344.,  260.],
        [1315.,  247., 1326.,  257.],
        [1311.,  246., 1319.,  257.],
        [1300.,  239., 1306.,  245.],
        [1292.,  252., 1299.,  261.],
        [1154.,  410., 1167.,  424.],
        [1169.,  422., 1190.,  439.],
        [1181.,  423., 1201.,  445.],
        [1155.,  475., 1180.,  498.],
        [1135.,  469., 1156.,  488.],
        [1112.,  466., 1128.,  484.],
  

Traceback (most recent call last):
  File "/tmp/ipykernel_1615801/4183841071.py", line 28, in <module>
    loss_dict = model(imgs, targets)
                ^^^^^^^^^^^^^^^^^^^^
  File "/home/tow73/COMP9444/VisDroneInterpretaion/myenv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tow73/COMP9444/VisDroneInterpretaion/myenv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tow73/COMP9444/VisDroneInterpretaion/myenv/lib/python3.12/site-packages/torchvision/models/detection/generalized_rcnn.py", line 67, in forward
    torch._assert(
  File "/home/tow73/COMP9444/VisDroneInterpretaion/myenv/lib/python3.12/site-packages/torch/__init__.py", line 2132, in _assert
    assert condition, message
           ^^^^^^^^^
AssertionE

NameError: name 'losses' is not defined

# Running Inference on a Sample of Images
Here we take a portion of the images to posttrain the model that has already been pretrained on the COCO dataset. 

In [None]:
# Run the inference of the sample image
def generate_predicitons(model, dataset, device):
    """Run ``model`` on every item of ``dataset`` and return COCO-style detections.

    Args:
        model: Detection model; called with a list holding one image tensor
            and expected to return a list with one dict of ``boxes``
            (``[x1, y1, x2, y2]``), ``labels`` and ``scores``.
        dataset: Indexable dataset whose items start with ``(image, target)``;
            ``target["image_id"]`` must be a one-element tensor.
        device: Device the image is moved to before inference.

    Returns:
        List of dicts with ``image_id``, ``category_id``, ``bbox`` as
        ``[x, y, w, h]`` and ``score``.
    """
    model.eval()
    results = []

    with torch.no_grad():
        for idx in range(len(dataset)):
            # Only the first two fields are used, so items that carry extra
            # fields (for example a file name) are accepted as well.
            sample = dataset[idx]
            img, target = sample[0], sample[1]
            imgID = int(target["image_id"].item())

            prediction = model([img.to(device)])[0]

            boxes = prediction["boxes"].tolist()
            labels = prediction["labels"].tolist()
            scores = prediction["scores"].tolist()
            for (x1, y1, x2, y2), label, score in zip(boxes, labels, scores):
                results.append({
                    "image_id": imgID,
                    "category_id": int(label),
                    # COCO boxes are [top-left x, top-left y, width, height].
                    "bbox": [float(x1), float(y1), float(x2 - x1), float(y2 - y1)],
                    "score": float(score)
                })

    return results


# Evaluate with COCO Metrics
The metrics used to evaluate the results of the papers submitted in 2019 were different thresholds of Average Precision and Average Recall. First we have to convert the predictions made and the target annotations into COCO format instead of the format used in the VisDrone set.

### Convert to COCO

In [None]:
def convertToCOCO(dataset):
    """Convert the dataset's ground truth into a COCO-format dictionary.

    Args:
        dataset: Indexable dataset exposing ``image_files`` and returning
            ``(image, target)`` items, where ``target`` holds ``boxes`` in
            ``[x1, y1, x2, y2]`` form, ``labels`` and a one-element
            ``image_id`` tensor.

    Returns:
        Dict with ``images``, ``annotations`` (boxes as ``[x, y, w, h]``)
        and ``categories`` (ids 1-10, named by their id) that can be
        serialised with ``json.dump``.
    """
    coco = {
        "images": [],
        "annotations": [],
        "categories": [{"id": i + 1, "name": str(i + 1)} for i in range(10)]
    }

    for i in range(len(dataset)):
        _, target = dataset[i]
        imgID = int(target["image_id"].item())

        coco["images"].append({
            "id": imgID,
            "file_name": dataset.image_files[imgID]
        })

        # Walk the boxes of THIS image; each box is paired with its own label.
        boxes = target["boxes"].tolist()
        labels = target["labels"].tolist()
        for (x1, y1, x2, y2), label in zip(boxes, labels):
            coco["annotations"].append({
                # Annotation ids are consecutive, starting at 1.
                "id": len(coco["annotations"]) + 1,
                "image_id": imgID,
                "category_id": int(label),
                "bbox": [float(x1), float(y1), float(x2 - x1), float(y2 - y1)],
                "area": float((x2 - x1) * (y2 - y1)),
                "iscrowd": 0
            })

    return coco


### Evaluate Prediction Results

In [None]:
# File names shared by the writers and the COCO loaders below.
annotationJSON = "converted_visdrone_annotations.json"
predictJSON = "converted_model_predictions.json"

# Run the model over the validation set and save the detections.
predictions = generate_predicitons(model, valDataset, device)
print(f"Saving {len(predictions)} predictions")
with open(predictJSON, "w") as f:
    json.dump(predictions, f)

# Convert the ground truth to COCO form and save it.
converted = convertToCOCO(valDataset)
print(f"Saving {len(converted['annotations'])} annotations for {len(converted['images'])} images")
with open(annotationJSON, "w") as f:
    json.dump(converted, f)

# Score the saved detections against the saved ground truth.
cocoGT = COCO(annotationJSON)
if predictions:
    cocoDT = cocoGT.loadRes(predictJSON)

    cocoEval = COCOeval(cocoGT, cocoDT, iouType="bbox")
    cocoEval.evaluate()
    cocoEval.accumulate()
    cocoEval.summarize()
else:
    # loadRes cannot load an empty result list.
    print("No predictions were produced; skipping COCO evaluation")

# Test with test set
Assess the performance of the model on a test set, using mean average precision as the evaluation metric.

In [None]:
# Gather the raw detections and the matching ground truth for the test set.
model.eval()
all_preds, all_targets = [], []

with torch.no_grad():
    for images, targets in tqdm(test_dataloader, desc='Evaluating'):
        images = [image.to(device) for image in images]
        outputs = model(images)

        # Store each detection dict next to the ground truth at the same
        # batch position, both converted to NumPy arrays on the CPU.
        # This is a simplified collection step; further processing depends
        # on the evaluation metric that is chosen.
        for i, output in enumerate(outputs):
            all_preds.append(
                {key: output[key].cpu().numpy() for key in ('boxes', 'scores', 'labels')}
            )
            truth = targets[i]
            all_targets.append(
                {key: truth[key].cpu().numpy() for key in ('boxes', 'labels')}
            )

# Evaluation metrics (e.g. mAP) still have to be computed from all_preds and
# all_targets; pycocotools can be used when the data is in COCO format.

print("Evaluation finished!")