Inference on a single image:

In [None]:
import torch
import cv2
from torchvision.transforms import functional as F
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

def load_model(model_path, num_classes, device):
    """
    Build a Faster R-CNN (ResNet-50 FPN v2) detector and restore trained weights.

    The stock box-predictor head is replaced by one with ``num_classes + 1``
    outputs (the extra slot is presumably the background class), the state
    dict stored at ``model_path`` is loaded with tensors mapped to ``device``,
    and the network is returned on ``device`` in evaluation mode.
    """
    detector = fasterrcnn_resnet50_fpn_v2(weights=None)

    # Replace the classification head so its size matches the checkpoint.
    head_inputs = detector.roi_heads.box_predictor.cls_score.in_features
    detector.roi_heads.box_predictor = FastRCNNPredictor(head_inputs, num_classes + 1)

    state_dict = torch.load(model_path, map_location=device)
    detector.load_state_dict(state_dict)

    detector.to(device)
    detector.eval()
    return detector

def prepare_image(image_path):
    """
    Load an image from disk and prepare it for inference.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A tuple ``(image, image_tensor)``: the image converted to RGB, and the
        tensor built from it after mean/std normalization.

    Raises:
        FileNotFoundError: If OpenCV could not read ``image_path``.
    """
    image = cv2.imread(image_path)
    # cv2.imread reports failure by returning None instead of raising, which
    # would otherwise surface as an obscure error inside cvtColor.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Convert to tensor and normalize
    # NOTE(review): torchvision detection models also normalize their inputs
    # internally -- confirm the model was trained with this extra step.
    image_tensor = F.to_tensor(image)
    image_tensor = F.normalize(image_tensor,
                             mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])

    return image, image_tensor

def draw_predictions(image, boxes, labels, scores, idx_to_label, threshold=0.5, ignore_labels=frozenset({"DontCare", "Unknown"})):
    """
    Draw bounding boxes and captions on a copy of the image.

    Args:
        image: Image array to annotate; it is copied, never modified.
        boxes: Iterable of boxes given as (x_min, y_min, x_max, y_max).
        labels: Class index for each box, used as a key into ``idx_to_label``.
        scores: Confidence score for each box.
        idx_to_label: Mapping from class index to class name.
        threshold: Only detections scoring strictly above this are drawn.
        ignore_labels: Class names that are never drawn.

    Returns:
        The annotated copy of ``image``.
    """
    # Colors are written as RGB; the callers in this file pass RGB images.
    colors = {
        'Car': (255, 0, 0),      # Red
        'Pedestrian': (0, 255, 0),  # Green
        'Van': (0, 0, 255),      # Blue
        'Cyclist': (255, 255, 0),  # Yellow
        'Truck': (255, 0, 255),   # Magenta
        'Misc': (0, 255, 255),    # Cyan
        'Tram': (128, 0, 0),     # Dark Red
        'Person_Sitting': (0, 128, 0),  # Dark Green
        'DontCare': (128, 128, 128),  # Gray
        'Unknown': (64, 64, 64)   # Dark Gray
    }

    image_with_boxes = image.copy()

    for box, label, score in zip(boxes, labels, scores):
        # Written as "not >" so NaN scores are skipped, as in a plain "> threshold" test.
        if not score > threshold:
            continue

        class_name = idx_to_label[label]

        # Skip ignored labels
        if class_name in ignore_labels:
            continue

        # OpenCV drawing functions want plain Python ints, not numpy scalars.
        x_min, y_min, x_max, y_max = (int(coord) for coord in box)
        color = colors.get(class_name, (255, 255, 255))

        # Draw bounding box
        cv2.rectangle(image_with_boxes, (x_min, y_min), (x_max, y_max), color, 2)

        # Place the caption above the box, or just inside it when the box
        # touches the top edge and the text would fall outside the image.
        text_y = y_min - 10
        if text_y < 10:
            text_y = y_min + 15

        label_text = f'{class_name}: {score:.2f}'
        cv2.putText(image_with_boxes,
                    label_text,
                    (x_min, text_y),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.5,
                    color,
                    2)

    return image_with_boxes

def test_single_image(model_path, image_path, label_to_idx, confidence_threshold=0.5,
                      output_path='prediction_result.jpg'):
    """
    Run the detector on a single image and save the annotated result.

    Args:
        model_path: Path of the saved model state dict.
        image_path: Path of the image to run inference on.
        label_to_idx: Mapping from class name to class index.
        confidence_threshold: Detections at or below this score are not drawn.
        output_path: Where the annotated image is written.

    Returns:
        A tuple ``(boxes, labels, scores)`` of numpy arrays holding every raw
        prediction for the image (not filtered by the threshold).
    """
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Create idx to label mapping
    idx_to_label = {v: k for k, v in label_to_idx.items()}

    # Load model
    model = load_model(model_path, len(label_to_idx), device)

    # Prepare image
    original_image, image_tensor = prepare_image(image_path)

    # Perform inference
    with torch.no_grad():
        prediction = model([image_tensor.to(device)])

    # Get predictions
    boxes = prediction[0]['boxes'].cpu().numpy()
    labels = prediction[0]['labels'].cpu().numpy()
    scores = prediction[0]['scores'].cpu().numpy()

    # Draw predictions on image, ignoring specified labels
    result_image = draw_predictions(original_image,
                                    boxes,
                                    labels,
                                    scores,
                                    idx_to_label,
                                    confidence_threshold)

    # Convert back to BGR for OpenCV display/save
    result_image = cv2.cvtColor(result_image, cv2.COLOR_RGB2BGR)

    # cv2.imwrite returns False on failure instead of raising, so only report
    # success when the file was actually written.
    if cv2.imwrite(output_path, result_image):
        print(f"Result saved to: {output_path}")
    else:
        print(f"Error: Could not write result to {output_path}")

    return boxes, labels, scores

# Example usage: run the detector on one test image.
if __name__ == "__main__":
    # Class names listed in index order; indices are assigned starting at 1.
    label_to_idx = {
        class_name: class_idx
        for class_idx, class_name in enumerate(
            (
                "Car",
                "Pedestrian",
                "Van",
                "Cyclist",
                "Truck",
                "Misc",
                "Tram",
                "Person_Sitting",
                "DontCare",
                "Unknown",
            ),
            start=1,
        )
    }

    model_path = '/Users/ujjwalbhatta/Downloads/ComputerVision-RCNN/final_rcnn_model.pth'
    image_path = '/Users/ujjwalbhatta/Downloads/ComputerVision-RCNN/RCNN_Dataset/test/007301.png'

    boxes, labels, scores = test_single_image(model_path, image_path, label_to_idx)

Inference on a video:

In [None]:
import torch
import cv2
from torchvision.transforms import functional as F
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

def load_model(model_path, num_classes, device):
    """
    Build a Faster R-CNN (ResNet-50 FPN v2) detector and restore trained weights.

    The stock box-predictor head is replaced by one with ``num_classes + 1``
    outputs (the extra slot is presumably the background class), the state
    dict stored at ``model_path`` is loaded with tensors mapped to ``device``,
    and the network is returned on ``device`` in evaluation mode.
    """
    detector = fasterrcnn_resnet50_fpn_v2(weights=None)

    # Replace the classification head so its size matches the checkpoint.
    head_inputs = detector.roi_heads.box_predictor.cls_score.in_features
    detector.roi_heads.box_predictor = FastRCNNPredictor(head_inputs, num_classes + 1)

    state_dict = torch.load(model_path, map_location=device)
    detector.load_state_dict(state_dict)

    detector.to(device)
    detector.eval()
    return detector

def prepare_frame(frame):
    """
    Turn a BGR frame into model input.

    Returns a tuple of the frame converted to RGB and the tensor built from
    it after mean/std normalization.
    """
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Tensor conversion followed by per-channel mean/std normalization.
    normalized = F.normalize(
        F.to_tensor(rgb_frame),
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    )

    return rgb_frame, normalized

def process_video(model_path, video_path, label_to_idx, confidence_threshold=0.5, output_path='prediction_result.mp4'):
    """
    Run the detector on every frame of a video and save an annotated copy.

    Args:
        model_path: Path of the saved model state dict.
        video_path: Path of the input video.
        label_to_idx: Mapping from class name to class index.
        confidence_threshold: Detections at or below this score are not drawn.
        output_path: Where the annotated video is written (mp4v codec).

    Returns:
        None. Prints an error and returns early if the input video or the
        output writer cannot be opened.
    """
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Create idx to label mapping
    idx_to_label = {v: k for k, v in label_to_idx.items()}

    # Load model
    model = load_model(model_path, len(label_to_idx), device)

    # Open the video file
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        print(f"Error: Could not open video {video_path}")
        return

    out = None
    try:
        # Get video properties
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        print(f"Video properties - FPS: {fps}, Width: {width}, Height: {height}")

        # Create video writer
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out.isOpened():
            print(f"Error: Could not open video writer for {output_path}")
            return

        # Process each frame
        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                print("No more frames to read or unable to read the frame.")
                break

            print(f"Processing frame: {frame_count}")
            original_frame, frame_tensor = prepare_frame(frame)

            # Perform inference
            with torch.no_grad():
                prediction = model([frame_tensor.to(device)])

            # Get predictions
            boxes = prediction[0]['boxes'].cpu().numpy()
            labels = prediction[0]['labels'].cpu().numpy()
            scores = prediction[0]['scores'].cpu().numpy()

            print(f"Predictions - Boxes: {boxes.shape}, Labels: {labels.shape}, Scores: {scores.shape}")

            # Draw predictions on the (RGB) frame
            result_frame = draw_predictions(original_frame,
                                            boxes,
                                            labels,
                                            scores,
                                            idx_to_label,
                                            confidence_threshold)

            # prepare_frame returned an RGB frame, but VideoWriter expects BGR;
            # writing it unconverted swaps red and blue in the output video.
            out.write(cv2.cvtColor(result_frame, cv2.COLOR_RGB2BGR))
            frame_count += 1
    finally:
        # Release the resources even if inference or drawing raised
        cap.release()
        if out is not None:
            out.release()

    print(f"Result saved to: {output_path}")

def draw_predictions(image, boxes, labels, scores, idx_to_label, threshold=0.5, ignore_labels=frozenset({"DontCare", "Unknown"})):
    """
    Draw bounding boxes and captions on a copy of the image.

    Args:
        image: Image array to annotate; it is copied, never modified.
        boxes: Iterable of boxes given as (x_min, y_min, x_max, y_max).
        labels: Class index for each box, used as a key into ``idx_to_label``.
        scores: Confidence score for each box.
        idx_to_label: Mapping from class index to class name.
        threshold: Only detections scoring strictly above this are drawn.
        ignore_labels: Class names that are never drawn.

    Returns:
        The annotated copy of ``image``.
    """
    # Colors are written as RGB; the callers in this file pass RGB images.
    colors = {
        'Car': (255, 0, 0),      # Red
        'Pedestrian': (0, 255, 0),  # Green
        'Van': (0, 0, 255),      # Blue
        'Cyclist': (255, 255, 0),  # Yellow
        'Truck': (255, 0, 255),   # Magenta
        'Misc': (0, 255, 255),    # Cyan
        'Tram': (128, 0, 0),     # Dark Red
        'Person_Sitting': (0, 128, 0),  # Dark Green
        'DontCare': (128, 128, 128),  # Gray
        'Unknown': (64, 64, 64)   # Dark Gray
    }

    image_with_boxes = image.copy()

    for box, label, score in zip(boxes, labels, scores):
        # Written as "not >" so NaN scores are skipped, as in a plain "> threshold" test.
        if not score > threshold:
            continue

        class_name = idx_to_label[label]

        # Skip ignored labels
        if class_name in ignore_labels:
            continue

        # OpenCV drawing functions want plain Python ints, not numpy scalars.
        x_min, y_min, x_max, y_max = (int(coord) for coord in box)
        color = colors.get(class_name, (255, 255, 255))

        # Draw bounding box
        cv2.rectangle(image_with_boxes, (x_min, y_min), (x_max, y_max), color, 2)

        # Place the caption above the box, or just inside it when the box
        # touches the top edge and the text would fall outside the image.
        text_y = y_min - 10
        if text_y < 10:
            text_y = y_min + 15

        label_text = f'{class_name}: {score:.2f}'
        cv2.putText(image_with_boxes,
                    label_text,
                    (x_min, text_y),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.5,
                    color,
                    2)

    return image_with_boxes

def test_single_image(model_path, image_path, label_to_idx, confidence_threshold=0.5,
                      output_path='prediction_result.jpg'):
    """
    Run the detector on a single image and save the annotated result.

    Args:
        model_path: Path of the saved model state dict.
        image_path: Path of the image to run inference on.
        label_to_idx: Mapping from class name to class index.
        confidence_threshold: Detections at or below this score are not drawn.
        output_path: Where the annotated image is written.

    Returns:
        A tuple ``(boxes, labels, scores)`` of numpy arrays holding every raw
        prediction for the image (not filtered by the threshold).

    Raises:
        FileNotFoundError: If OpenCV could not read ``image_path``.
    """
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Create idx to label mapping
    idx_to_label = {v: k for k, v in label_to_idx.items()}

    # Load model
    model = load_model(model_path, len(label_to_idx), device)

    # cv2.imread returns None when the file is missing or unreadable; catch
    # that here rather than letting prepare_frame fail inside cvtColor.
    raw_image = cv2.imread(image_path)
    if raw_image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Prepare image
    original_image, image_tensor = prepare_frame(raw_image)

    # Perform inference
    with torch.no_grad():
        prediction = model([image_tensor.to(device)])

    # Get predictions
    boxes = prediction[0]['boxes'].cpu().numpy()
    labels = prediction[0]['labels'].cpu().numpy()
    scores = prediction[0]['scores'].cpu().numpy()

    # Draw predictions on image, ignoring specified labels
    result_image = draw_predictions(original_image,
                                    boxes,
                                    labels,
                                    scores,
                                    idx_to_label,
                                    confidence_threshold)

    # Convert back to BGR for OpenCV display/save
    result_image = cv2.cvtColor(result_image, cv2.COLOR_RGB2BGR)

    # cv2.imwrite returns False on failure instead of raising, so only report
    # success when the file was actually written.
    if cv2.imwrite(output_path, result_image):
        print(f"Result saved to: {output_path}")
    else:
        print(f"Error: Could not write result to {output_path}")

    return boxes, labels, scores

# Example usage: annotate a whole video.
if __name__ == "__main__":
    # Class names listed in index order; indices are assigned starting at 1.
    label_to_idx = {
        class_name: class_idx
        for class_idx, class_name in enumerate(
            (
                "Car",
                "Pedestrian",
                "Van",
                "Cyclist",
                "Truck",
                "Misc",
                "Tram",
                "Person_Sitting",
                "DontCare",
                "Unknown",
            ),
            start=1,
        )
    }

    model_path = '/Users/ujjwalbhatta/Downloads/ComputerVision-RCNN/final_rcnn_model.pth'
    video_path = '/Users/ujjwalbhatta/Downloads/ComputerVision-RCNN/VID20241108114856.mp4'

    # boxes, labels, scores = test_single_image(model_path, image_path, label_to_idx)
    process_video(
        model_path,
        video_path,
        label_to_idx,
        confidence_threshold=0.5,
        output_path='prediction_result2.mp4',
    )


In [None]:
import json

def single_txt_to_coco(txt_file, output_json, label_to_idx, image_width=1242, image_height=375):
    """
    Convert a single annotation TXT file to COCO format.

    Each non-blank line of ``txt_file`` must hold at least six comma-separated
    fields: ``image_name,x_min,y_min,x_max,y_max,class_name``.

    Args:
        txt_file: Path of the annotation file to read.
        output_json: Path of the COCO JSON file to write.
        label_to_idx: Mapping from class name to category id. Lines whose
            class name is not in this mapping still register their image but
            produce no annotation.
        image_width: Width recorded for every image entry.
        image_height: Height recorded for every image entry.

    Raises:
        ValueError: If a non-blank line has fewer than six fields or a
            coordinate is not a number.
    """
    coco_format = {
        "images": [],
        "annotations": [],
        # Define categories
        "categories": [
            {"id": idx, "name": label} for label, idx in label_to_idx.items()
        ],
    }

    image_id_map = {}
    # Annotation ids start at 1: pycocotools' COCOeval stores the matched
    # ground-truth id per detection and treats 0 as "no match", so a
    # ground-truth annotation with id 0 could never count as a true positive.
    annotation_id = 1

    with open(txt_file, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                continue

            parts = [field.strip() for field in line.split(',')]
            if len(parts) < 6:
                raise ValueError(
                    f"{txt_file}:{line_number}: expected at least 6 "
                    f"comma-separated fields, got {len(parts)}"
                )

            image_name = parts[0]
            x_min, y_min, x_max, y_max = map(float, parts[1:5])
            class_name = parts[5]

            if image_name not in image_id_map:
                # Image ids are assigned in order of first appearance, from 0
                image_id = len(image_id_map)
                image_id_map[image_name] = image_id
                coco_format["images"].append({
                    "id": image_id,
                    "file_name": image_name,
                    "width": image_width,
                    "height": image_height
                })

            if class_name in label_to_idx:
                box_width = x_max - x_min
                box_height = y_max - y_min
                coco_format["annotations"].append({
                    "id": annotation_id,
                    "image_id": image_id_map[image_name],
                    "category_id": label_to_idx[class_name],
                    # COCO boxes are [x, y, width, height]
                    "bbox": [x_min, y_min, box_width, box_height],
                    "area": box_width * box_height,
                    "iscrowd": 0
                })
                annotation_id += 1

    # Save COCO JSON
    with open(output_json, 'w') as f:
        json.dump(coco_format, f, indent=4)

    print(f"COCO annotations saved to {output_json}")


# Example usage: convert the test annotations to a COCO JSON file.
# Class names are listed in index order; indices are assigned starting at 1.
label_to_idx = {
    class_name: class_idx
    for class_idx, class_name in enumerate(
        (
            "Car",
            "Pedestrian",
            "Van",
            "Cyclist",
            "Truck",
            "Misc",
            "Tram",
            "Person_Sitting",
            "DontCare",
            "Unknown",
        ),
        start=1,
    )
}
txt_file = "/Users/ujjwalbhatta/Downloads/ComputerVision-RCNN/test_annotation.txt"
output_json = "/Users/ujjwalbhatta/Downloads/ComputerVision-RCNN/coco_annotations.json"
single_txt_to_coco(txt_file, output_json, label_to_idx)


In [None]:
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import torch
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import cv2
import numpy as np
from torchvision.transforms import functional as F

def evaluate_coco(model_path, annotation_path, image_dir=None, num_classes=10):
    """
    Evaluate a trained Faster R-CNN checkpoint with COCO bounding-box metrics.

    Args:
        model_path: Path of the saved model state dict.
        annotation_path: Path of the ground-truth annotations in COCO JSON format.
        image_dir: Optional directory joined in front of each image's
            ``file_name``. When None, ``file_name`` is used as the path as-is.
        num_classes: Number of object classes; the predictor head is built
            with ``num_classes + 1`` outputs.

    Returns:
        Dict of the first nine COCOeval summary statistics, keyed by
        'AP@0.5', 'AP@0.75', 'AP@0.5:0.95', 'AP_small', 'AP_medium',
        'AP_large', 'AR_max1', 'AR_max10' and 'AR_max100'.

    Raises:
        FileNotFoundError: If an image listed in the annotations cannot be read.
        ValueError: If the model produced no detections on any image.
    """
    import os

    # Initialize model architecture first
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = fasterrcnn_resnet50_fpn_v2(weights=None)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes + 1)

    # Now load the weights
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()

    # Load COCO annotations
    gt_coco = COCO(annotation_path)

    predictions = []
    for img_id in gt_coco.getImgIds():
        img_info = gt_coco.loadImgs(img_id)[0]
        img_path = img_info['file_name']
        if image_dir is not None:
            img_path = os.path.join(image_dir, img_path)

        image = cv2.imread(img_path)
        # cv2.imread returns None on failure; fail with the offending path
        # instead of an opaque error inside cvtColor.
        if image is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image_tensor = F.to_tensor(image)
        image_tensor = F.normalize(image_tensor,
                                 mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

        with torch.no_grad():
            prediction = model([image_tensor.to(device)])

        boxes = prediction[0]['boxes'].cpu().numpy()
        scores = prediction[0]['scores'].cpu().numpy()
        labels = prediction[0]['labels'].cpu().numpy()

        for box, score, label in zip(boxes, scores, labels):
            predictions.append({
                'image_id': img_id,
                'category_id': int(label),
                # Model boxes are (x1, y1, x2, y2); COCO wants [x, y, width, height]
                'bbox': [float(box[0]), float(box[1]),
                         float(box[2] - box[0]), float(box[3] - box[1])],
                'score': float(score)
            })

    # COCO.loadRes indexes the first result, so an empty list would fail
    # with an unhelpful IndexError.
    if not predictions:
        raise ValueError("The model produced no detections; nothing to evaluate.")

    pred_coco = gt_coco.loadRes(predictions)
    cocoEval = COCOeval(gt_coco, pred_coco, 'bbox')
    cocoEval.evaluate()
    cocoEval.accumulate()
    cocoEval.summarize()

    stats = cocoEval.stats
    return {
        'AP@0.5': stats[1],
        'AP@0.75': stats[2],
        'AP@0.5:0.95': stats[0],
        'AP_small': stats[3],
        'AP_medium': stats[4],
        'AP_large': stats[5],
        'AR_max1': stats[6],
        'AR_max10': stats[7],
        'AR_max100': stats[8]
    }

# Evaluate the checkpoint against the generated COCO annotations
metrics = evaluate_coco(
    model_path='best_model.pth',
    annotation_path='/Users/ujjwalbhatta/Downloads/ComputerVision-RCNN/coco_annotations.json',
)

# Report every metric with four decimal places
print("\nDetailed Evaluation Results:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

def create_coco_metrics_tables():
    """
    Plot the COCO metrics as bar charts and write them out as a LaTeX table.

    The metric values are hard-coded in this function. A 2x2 grid of bar
    charts is saved to 'coco_metrics.png' and the LaTeX table is written to
    'coco_metrics_table.tex', both relative to the working directory.
    """
    # Create main metrics table
    main_metrics = {
        'Metric': [
            'AP@[IoU=0.50:0.95, area=all]',
            'AP@[IoU=0.50, area=all]',
            'AP@[IoU=0.75, area=all]'
        ],
        'Value': [0.582, 0.806, 0.653]
    }

    # Create area-specific AP table
    ap_area = {
        'Area': ['Small', 'Medium', 'Large'],
        'AP': [0.551, 0.588, 0.600]
    }

    # Create recall table
    recall_metrics = {
        'Condition': [
            'AR@[IoU=0.50:0.95, maxDets=1]',
            'AR@[IoU=0.50:0.95, maxDets=10]',
            'AR@[IoU=0.50:0.95, maxDets=100]'
        ],
        'Value': [0.420, 0.648, 0.652]
    }

    # Create area-specific AR table
    ar_area = {
        'Area': ['Small', 'Medium', 'Large'],
        'AR': [0.611, 0.654, 0.683]
    }

    # The 'seaborn' style sheet was renamed 'seaborn-v0_8' in matplotlib 3.6
    # and the old name was later removed, so use whichever one is installed.
    for style_name in ('seaborn-v0_8', 'seaborn'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break

    plt.figure(figsize=(15, 10))

    # One entry per subplot: (data, x column, y column, title, rotate x labels)
    panels = [
        (main_metrics, 'Metric', 'Value', 'Main AP Metrics', True),
        (ap_area, 'Area', 'AP', 'Average Precision by Area', False),
        (recall_metrics, 'Condition', 'Value', 'Recall Metrics', True),
        (ar_area, 'Area', 'AR', 'Average Recall by Area', False),
    ]
    for position, (data, x_col, y_col, title, rotate_labels) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        sns.barplot(x=x_col, y=y_col, data=pd.DataFrame(data))
        if rotate_labels:
            # The long metric names overlap unless they are rotated
            plt.xticks(rotation=45, ha='right')
        plt.title(title)
        plt.ylim(0, 1)

    plt.tight_layout()
    plt.savefig('coco_metrics.png', dpi=300, bbox_inches='tight')

    # Create LaTeX table
    latex_table = """
\\begin{table}[h]
\\centering
\\begin{tabular}{lc}
\\hline
\\textbf{Metric} & \\textbf{Value} \\\\
\\hline
AP @ IoU=0.50:0.95 (all) & 0.582 \\\\
AP @ IoU=0.50 (all) & 0.806 \\\\
AP @ IoU=0.75 (all) & 0.653 \\\\
\\hline
AP (small objects) & 0.551 \\\\
AP (medium objects) & 0.588 \\\\
AP (large objects) & 0.600 \\\\
\\hline
AR @ maxDets=1 & 0.420 \\\\
AR @ maxDets=10 & 0.648 \\\\
AR @ maxDets=100 & 0.652 \\\\
\\hline
AR (small objects) & 0.611 \\\\
AR (medium objects) & 0.654 \\\\
AR (large objects) & 0.683 \\\\
\\hline
\\end{tabular}
\\caption{Detection performance using COCO metrics.}
\\label{tab:coco_metrics}
\\end{table}
"""

    with open('coco_metrics_table.tex', 'w') as f:
        f.write(latex_table)

# Write the metric plots (coco_metrics.png) and the LaTeX table (coco_metrics_table.tex)
create_coco_metrics_tables()