

```
# 1. Preparing the environment

```



In [None]:
# Install Dependences
!pip install torch torchvision
!pip install opencv-python-headless
!pip install pascal-voc-writer # Install Right Dataset

import os
import xml.etree.ElementTree as ET
from PIL import Image
import torch
import torchvision
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import functional as F
from sklearn.model_selection import train_test_split
import random
import matplotlib.pyplot as plt

Collecting pascal-voc-writer
  Downloading pascal_voc_writer-0.1.4-py2.py3-none-any.whl.metadata (1.3 kB)
Downloading pascal_voc_writer-0.1.4-py2.py3-none-any.whl (4.0 kB)
Installing collected packages: pascal-voc-writer
Successfully installed pascal-voc-writer-0.1.4


2. Downloading and processing the dataset



In [None]:
# Downloading  Dataset PASCAL VOC 2007
!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
!tar -xvf VOCtrainval_06-Nov-2007.tar

# Directory of Information
data_dir = "VOCdevkit/VOC2007/"
image_dir = os.path.join(data_dir, "JPEGImages")
annotation_dir = os.path.join(data_dir, "Annotations")

# Make Function for loading PASCAL VOC dataset
def load_voc_dataset(image_dir, annotation_dir):
    """Parse PASCAL VOC style XML annotations into parallel Python lists.

    Args:
        image_dir: Directory that is joined with each annotation's
            ``<filename>`` text to build the image path.
        annotation_dir: Directory scanned for ``*.xml`` annotation files.

    Returns:
        A tuple ``(image_paths, annotations, class_to_idx)`` where
        ``image_paths[i]`` and ``annotations[i]`` describe the same image,
        each annotation is ``{"boxes": [[xmin, ymin, xmax, ymax], ...],
        "labels": [int, ...]}``, and ``class_to_idx`` maps class name to a
        label index starting at 1 (0 is reserved for background).

    Files are processed in sorted name order and label indices are assigned
    in alphabetical class-name order, so the result no longer depends on the
    arbitrary order returned by ``os.listdir``.
    """
    records = []          # one (image_path, [(class_name, box), ...]) per file
    seen_classes = set()

    for annotation_file in sorted(os.listdir(annotation_dir)):
        if not annotation_file.endswith(".xml"):
            continue

        root = ET.parse(os.path.join(annotation_dir, annotation_file)).getroot()
        image_path = os.path.join(image_dir, root.find("filename").text)

        objects = []
        for obj in root.findall("object"):
            class_name = obj.find("name").text
            bndbox = obj.find("bndbox")
            # int(float(...)) also accepts coordinates written as "12.0".
            box = [
                int(float(bndbox.find(tag).text))
                for tag in ("xmin", "ymin", "xmax", "ymax")
            ]
            objects.append((class_name, box))
            seen_classes.add(class_name)

        records.append((image_path, objects))

    # Indices can only be fixed once every class name is known.
    class_to_idx = {
        name: idx for idx, name in enumerate(sorted(seen_classes), start=1)
    }

    image_paths = [image_path for image_path, _ in records]
    annotations = [
        {
            "boxes": [box for _, box in objects],
            "labels": [class_to_idx[name] for name, _ in objects],
        }
        for _, objects in records
    ]

    return image_paths, annotations, class_to_idx

# Parse every annotation file once. The returned lists are parallel (one entry
# per image) and class_to_idx maps each class name to its integer label.
trainval_image_paths, trainval_annotations, class_to_idx = load_voc_dataset(image_dir, annotation_dir)

# Hold out 20% of the images for validation; the fixed random_state keeps the
# split identical between runs.
train_images, val_images, train_annotations, val_annotations = train_test_split(
    trainval_image_paths, trainval_annotations, test_size=0.2, random_state=42
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
VOCdevkit/VOC2007/JPEGImages/001732.jpg
VOCdevkit/VOC2007/JPEGImages/001733.jpg
VOCdevkit/VOC2007/JPEGImages/001734.jpg
VOCdevkit/VOC2007/JPEGImages/001738.jpg
VOCdevkit/VOC2007/JPEGImages/001739.jpg
VOCdevkit/VOC2007/JPEGImages/001741.jpg
VOCdevkit/VOC2007/JPEGImages/001746.jpg
VOCdevkit/VOC2007/JPEGImages/001747.jpg
VOCdevkit/VOC2007/JPEGImages/001749.jpg
VOCdevkit/VOC2007/JPEGImages/001750.jpg
VOCdevkit/VOC2007/JPEGImages/001752.jpg
VOCdevkit/VOC2007/JPEGImages/001754.jpg
VOCdevkit/VOC2007/JPEGImages/001755.jpg
VOCdevkit/VOC2007/JPEGImages/001756.jpg
VOCdevkit/VOC2007/JPEGImages/001758.jpg
VOCdevkit/VOC2007/JPEGImages/001759.jpg
VOCdevkit/VOC2007/JPEGImages/001761.jpg
VOCdevkit/VOC2007/JPEGImages/001765.jpg
VOCdevkit/VOC2007/JPEGImages/001766.jpg
VOCdevkit/VOC2007/JPEGImages/001768.jpg
VOCdevkit/VOC2007/JPEGImages/001771.jpg
VOCdevkit/VOC2007/JPEGImages/001772.jpg
VOCdevkit/VOC2007/JPEGImages/001775.jpg
VOCdevkit/VOC20

In [None]:
class VOCDataset(Dataset):
    """Detection dataset built from parallel lists of image paths and annotations.

    Args:
        images: Sequence of image file paths.
        annotations: Sequence with one dict per image, each holding a "boxes"
            list of ``[xmin, ymin, xmax, ymax]`` entries and a parallel
            "labels" list of integer class indices.
        transform: Optional callable applied to the loaded PIL image only.
            The boxes are not transformed, so it must not change geometry.
    """

    def __init__(self, images, annotations, transform=None):
        self.images = images
        self.annotations = annotations
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def _build_target(self, idx):
        """Return the target dict of tensors for sample ``idx``."""
        annotation = self.annotations[idx]
        return {
            # reshape(-1, 4) keeps the (N, 4) layout even when the image has no
            # objects; torch.tensor([]) alone would have shape (0,).
            "boxes": torch.tensor(annotation["boxes"], dtype=torch.float32).reshape(-1, 4),
            "labels": torch.tensor(annotation["labels"], dtype=torch.int64),
        }

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB and return ``(image, target)``."""
        image = Image.open(self.images[idx]).convert("RGB")

        if self.transform:
            image = self.transform(image)

        return image, self._build_target(idx)

# Define transformations.
def get_transform(train=True):
    """Build the image transform pipeline.

    Args:
        train: When True (the default, matching the previous behaviour) random
            colour jitter is applied after tensor conversion. Pass False for
            validation so that evaluation inputs are not randomly perturbed.

    Returns:
        A ``torchvision.transforms.Compose`` that maps a PIL image to a tensor.
    """
    steps = [torchvision.transforms.ToTensor()]
    if train:
        steps.append(
            torchvision.transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5)
        )
    return torchvision.transforms.Compose(steps)


def _collate_detection_batch(batch):
    """Turn a list of (image, target) pairs into (images, targets) tuples.

    Detection images differ in size, so they are kept as tuples rather than
    stacked. A named function, unlike a lambda, can also be pickled if worker
    processes are enabled later.
    """
    return tuple(zip(*batch))


# Create datasets and loaders; augmentation is used for training only.
train_dataset = VOCDataset(train_images, train_annotations, transform=get_transform(train=True))
val_dataset = VOCDataset(val_images, val_annotations, transform=get_transform(train=False))

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=_collate_detection_batch)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=_collate_detection_batch)


In [None]:
# Load Faster R-CNN (ResNet-50 FPN backbone) with pretrained weights.
model = fasterrcnn_resnet50_fpn(pretrained=True)

# Size the new head from the full class mapping (+1 for background).
# Counting only the labels seen in the training split gives too few outputs
# whenever the random split happens to miss a class.
num_classes = len(class_to_idx) + 1

# Replace the box predictor so it outputs `num_classes` classes.
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(
    in_features, num_classes
)

# Use the GPU if one is available; move the model before building the optimizer.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)  # assignment avoids echoing the whole model repr
model.train()

# Define the optimizer over the parameters that are actually trainable.
# NOTE(review): no training loop is visible in this part of the notebook —
# confirm that the new head is trained before it is evaluated.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(trainable_params, lr=0.005, momentum=0.9, weight_decay=0.0005)


Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [00:01<00:00, 115MB/s]


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [None]:

# Second ensemble member: SSDlite320 with a MobileNetV3-Large backbone.
from torchvision.models.detection import ssdlite320_mobilenet_v3_large

# Build it with pretrained weights, place it on the same device (GPU/CPU) as
# the primary model, then switch it to evaluation mode.
model2 = ssdlite320_mobilenet_v3_large(pretrained=True)
model2.to(device)
model2.eval()


In [None]:

# Ensemble Method: Combine predictions from both models
def ensemble_boxes(pred1, pred2, iou_threshold=0.5):
    """Keep only detections that both models agree on and average their scores.

    For every image, a detection from one model is kept when its best IoU
    against the other model's boxes exceeds ``iou_threshold``. Its score is
    the mean of its own score and the score of that best-matching box.

    Args:
        pred1: Per-image predictions of the first model; each item is a dict
            with "boxes", "scores" and "labels" (tensors, NumPy arrays or
            lists are accepted).
        pred2: Per-image predictions of the second model, same layout.
        iou_threshold: Minimum IoU for two boxes to count as agreeing.

    Returns:
        A list with one dict per image. "boxes", "scores" and "labels" are
        tensors of equal length: kept detections of ``pred1`` followed by kept
        detections of ``pred2``.
    """
    from torchvision.ops import box_iou

    def _as_tensors(pred):
        # Normalise to tensors with fixed ranks so that NumPy input, empty
        # predictions and single detections are all handled uniformly.
        boxes = torch.as_tensor(pred['boxes'], dtype=torch.float32).reshape(-1, 4)
        scores = torch.as_tensor(pred['scores'], dtype=torch.float32).reshape(-1)
        labels = torch.as_tensor(pred['labels'], dtype=torch.int64).reshape(-1)
        return boxes, scores, labels

    combined_predictions = []
    for p1, p2 in zip(pred1, pred2):  # Process each image in batch
        boxes1, scores1, labels1 = _as_tensors(p1)
        boxes2, scores2, labels2 = _as_tensors(p2)

        if boxes1.shape[0] == 0 or boxes2.shape[0] == 0:
            # No agreement is possible when one model found nothing, and
            # max() over an empty dimension would raise.
            combined_predictions.append({
                "boxes": boxes1.new_zeros((0, 4)),
                "scores": scores1.new_zeros((0,)),
                "labels": labels1.new_zeros((0,)),
            })
            continue

        iou_matrix = box_iou(boxes1, boxes2)
        best_iou1, partner_of_1 = iou_matrix.max(dim=1)  # best pred2 box per pred1 box
        best_iou2, partner_of_2 = iou_matrix.max(dim=0)  # best pred1 box per pred2 box
        keep_idx1 = best_iou1 > iou_threshold
        keep_idx2 = best_iou2 > iou_threshold

        # Average each kept score with the score of its own matched partner, so
        # that scores stay aligned one-to-one with the concatenated boxes.
        fused_scores1 = (scores1[keep_idx1] + scores2[partner_of_1[keep_idx1]]) / 2
        fused_scores2 = (scores2[keep_idx2] + scores1[partner_of_2[keep_idx2]]) / 2

        combined_predictions.append({
            "boxes": torch.cat([boxes1[keep_idx1], boxes2[keep_idx2]]),
            "scores": torch.cat([fused_scores1, fused_scores2]),
            "labels": torch.cat([labels1[keep_idx1], labels2[keep_idx2]])
        })
    return combined_predictions

# Evaluate both models and combine predictions
# NOTE(review): `evaluate` is defined in a later cell of this notebook
# ("6. Evaluation of model"), so on a fresh kernel this cell raises NameError
# unless that cell has been run first.
model1_predictions = evaluate(model, val_loader)  # Original model's predictions
model2_predictions = evaluate(model2, val_loader)  # Second model's predictions

# Combine predictions from both models
ensemble_predictions = ensemble_boxes(model1_predictions, model2_predictions)

# Apply Non-Maximum Suppression (NMS) to refine predictions
def apply_nms(predictions, iou_threshold=0.5):
    """Filter each image's detections with non-maximum suppression.

    Args:
        predictions: List with one dict per image holding "boxes", "scores"
            and "labels" (tensors, NumPy arrays or lists are accepted).
        iou_threshold: IoU threshold passed to ``torchvision.ops.nms``.

    Returns:
        A list with one dict per image containing only the kept entries of
        "boxes", "scores" and "labels", as tensors.
    """
    from torchvision.ops import nms
    filtered_predictions = []
    for pred in predictions:
        # Normalise to tensors with fixed ranks: nms needs an (N, 4) float
        # tensor, which an empty list or a bare NumPy array does not provide.
        boxes = torch.as_tensor(pred['boxes'], dtype=torch.float32).reshape(-1, 4)
        scores = torch.as_tensor(pred['scores'], dtype=torch.float32).reshape(-1)
        labels = torch.as_tensor(pred['labels']).reshape(-1)

        keep_indices = nms(boxes, scores, iou_threshold)
        filtered_predictions.append({
            "boxes": boxes[keep_indices],
            "scores": scores[keep_indices],
            "labels": labels[keep_indices]
        })
    return filtered_predictions

# Refine the ensemble output: apply_nms filters each image's boxes with NMS,
# using its default IoU threshold of 0.5.
final_predictions = apply_nms(ensemble_predictions)


In [None]:
# 6. Evaluation of model
from torchvision.ops import nms

def evaluate(model, loader, iou_threshold=0.5):
    """Run a detection model over a loader and collect NMS-filtered predictions.

    Args:
        model: Detection model that takes a list of image tensors and returns
            one dict per image with "boxes", "scores" and "labels" tensors.
        loader: Iterable yielding ``(images, targets)`` batches; the targets
            are ignored.
        iou_threshold: IoU threshold for the NMS pass applied to each image's
            detections (default 0.5, as before).

    Returns:
        A list with one dict per image; "boxes" has shape (K, 4) and "scores"
        and "labels" have shape (K,), all as NumPy arrays.
    """
    from torchvision.ops import nms

    model.eval()

    # Feed inputs on the device the model lives on instead of relying on a
    # notebook-level `device` variable.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device("cpu")

    results = []

    with torch.no_grad():
        for images, _ in loader:
            images = [img.to(model_device) for img in images]
            outputs = model(images)

            for output in outputs:
                boxes = output["boxes"].detach().cpu()
                scores = output["scores"].detach().cpu()
                labels = output["labels"].detach().cpu()

                keep = nms(boxes, scores, iou_threshold)
                # Index the tensors, then convert. Indexing a NumPy array with
                # a one-element tensor drops a dimension, which broke the
                # (K, 4) layout when exactly one detection was kept.
                results.append({
                    "boxes": boxes[keep].numpy(),
                    "scores": scores[keep].numpy(),
                    "labels": labels[keep].numpy(),
                })

    return results

# Evaluate the primary model on the validation set.
validation_results = evaluate(model, val_loader)
# Print a summary only: dumping every prediction array floods the cell output.
print(f"Evaluation completed: predictions collected for {len(validation_results)} images.")


In [None]:
import os
import xml.etree.ElementTree as ET

# Directory that holds the annotation XML files
annotation_dir = "VOCdevkit/VOC2007/Annotations"

def get_classes_from_annotations(annotation_dir):
    """Return the sorted, de-duplicated class names found in annotation XML files.

    Every file in ``annotation_dir`` whose name ends in ".xml" is parsed, and
    the text of the ``<name>`` child of each top-level ``<object>`` element is
    collected.
    """
    xml_names = (entry for entry in os.listdir(annotation_dir) if entry.endswith(".xml"))

    # A set comprehension drops repeated class names as they are gathered.
    found = {
        obj.find("name").text
        for entry in xml_names
        for obj in ET.parse(os.path.join(annotation_dir, entry)).getroot().findall("object")
    }

    return sorted(found)

# Collect the unique class names from the annotations and display them
classes = get_classes_from_annotations(annotation_dir)
print(f"Classes in the dataset: {classes}")


In [None]:
import torch
from PIL import Image, ImageDraw, ImageFont
import matplotlib.pyplot as plt

# Test images to run inference on, and the full path of each one.
# NOTE(review): the directory is a hard-coded Google Drive location; presumably
# Drive must be mounted first — no mount call is visible in this notebook.
image_names = ["Dog.jpeg", "Cat.jpeg", "Motorbike.jpeg"]  # File names
image_paths = [os.path.join("/content/drive/MyDrive/MLFotos/", name) for name in image_names]  # One path per entry in image_names

# Function for preprocessing Images
def preprocess_images(image_paths, device):
    """Load images as RGB and convert them to tensors on ``device``.

    Returns:
        ``(images, original_images)``: the tensors to feed to the model and
        the corresponding PIL images kept for display, in the same order as
        ``image_paths``.
    """
    # Keep the PIL images so that predictions can be drawn on them later.
    original_images = [Image.open(path).convert("RGB") for path in image_paths]
    images = [F.to_tensor(pil_img).to(device) for pil_img in original_images]
    return images, original_images

# Function for drawing the predictions over the image
def draw_predictions(image, boxes, labels, scores, class_names):
    """Draw boxes with "<class> <score>" captions onto ``image``.

    Args:
        image: PIL image to draw on; it is modified in place.
        boxes: Iterable of ``(x1, y1, x2, y2)`` boxes.
        labels: Iterable of integer class ids, parallel to ``boxes``.
        scores: Iterable of confidence scores, parallel to ``boxes``.
        class_names: Sequence mapping class id to a display name.

    Returns:
        The same ``image`` object, with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()

    for box, label, score in zip(boxes, labels, scores):
        x1, y1, x2, y2 = (float(coord) for coord in box)
        label = int(label)
        # A model may emit ids that are not in class_names (for example a
        # detector trained on a different label set); show the raw id rather
        # than raising IndexError.
        if 0 <= label < len(class_names):
            label_name = class_names[label]
        else:
            label_name = f"class {label}"
        draw.rectangle([x1, y1, x2, y2], outline="red", width=3)
        draw.text((x1, y1), f"{label_name} {score:.2f}", fill="yellow", font=font)

    return image

# Function for predicting and showing results.
def predict_and_visualize(model, image_paths, class_names, device="cuda" if torch.cuda.is_available() else "cpu", score_threshold=0.5):
    """Run a detector on image files, then print and plot the detections.

    Args:
        model: Detection model that takes a list of image tensors and returns
            one dict per image with "boxes", "scores" and "labels".
        image_paths: Paths of the images to process.
        class_names: Sequence mapping class id to a display name.
        device: Device on which the model and images are placed.
        score_threshold: Detections with a score at or below this value are
            discarded (default 0.5, as before).
    """
    model.to(device)
    model.eval()

    # Preprocess images
    images, original_images = preprocess_images(image_paths, device)

    # Get predictions
    with torch.no_grad():
        outputs = model(images)

    # Draw the predictions and show the results.
    for path, img, output in zip(image_paths, original_images, outputs):
        boxes = output["boxes"].cpu().numpy()
        scores = output["scores"].cpu().numpy()
        labels = output["labels"].cpu().numpy()

        # Keep only sufficiently confident predictions
        keep = scores > score_threshold
        boxes = boxes[keep]
        scores = scores[keep]
        labels = labels[keep]

        img_with_predictions = draw_predictions(img, boxes, labels, scores, class_names)

        # Name the image from the argument itself, not from the notebook-level
        # `image_names` list, which only matches for one particular call.
        print(f"\nPrediction over the imagen '{os.path.basename(path)}':")
        for box, label, score in zip(boxes, labels, scores):
            label = int(label)
            # Fall back to the raw id when it is not covered by class_names.
            if 0 <= label < len(class_names):
                label_name = class_names[label]
            else:
                label_name = f"class {label}"
            print(f" - Class: {label_name}, Score: {score:.2f}, Box: {box}")

        # Show image
        plt.figure(figsize=(8, 8))
        plt.imshow(img_with_predictions)
        plt.axis("off")
        plt.show()

# Class names of the PASCAL VOC 2007 dataset (index 0 is the background).
class_names = [
    "__background__", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car",
    "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor"
]

# Category names for the COCO-pretrained detector loaded below, indexed by the
# label id it predicts ("N/A" marks ids that COCO leaves unused). Decoding its
# output with the 21 VOC names above mislabels detections (COCO id 18 "dog"
# would print as "sofa") and raises IndexError for ids of 21 and above.
COCO_CLASS_NAMES = [
    "__background__", "person", "bicycle", "car", "motorcycle", "airplane", "bus",
    "train", "truck", "boat", "traffic light", "fire hydrant", "N/A", "stop sign",
    "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
    "elephant", "bear", "zebra", "giraffe", "N/A", "backpack", "umbrella", "N/A", "N/A",
    "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
    "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
    "bottle", "N/A", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
    "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza",
    "donut", "cake", "chair", "couch", "potted plant", "bed", "N/A", "dining table",
    "N/A", "N/A", "toilet", "N/A", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
    "microwave", "oven", "toaster", "sink", "refrigerator", "N/A", "book",
    "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
]

# Load the COCO-pretrained Faster R-CNN for the demo.
# NOTE: this rebinds `model`, replacing the detector configured earlier in the
# notebook.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

# Predict and show the results, decoding labels with the matching name list.
predict_and_visualize(model, image_paths, COCO_CLASS_NAMES)


## Risk Management

### Identified Risks
1. **Data Issues**: Potential class imbalance in the dataset affecting performance.
   - **Mitigation**: Applied data augmentation to balance the dataset.
2. **Computational Limits**: Limited resources available for training.
   - **Mitigation**: Used pre-trained models and applied model quantization.

### Challenges and Mitigations
1. **Training Time**: Training multiple models increases time requirements.
   - **Mitigation**: Reduced dataset size for faster iterations.
2. **Accuracy vs. Speed Trade-off**: Quantization may slightly reduce accuracy.
   - **Mitigation**: Balanced by applying ensemble methods to retain accuracy.
