# Object Detection with YOLOv8 and Open Images Dataset v7

## 1. Environment Setup

In [None]:
# Install required packages
!pip install ultralytics pandas opencv-python torch matplotlib ipywidgets

In [1]:
# Import necessary libraries
import os
import random
import shutil
import json
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import torch
from ultralytics import YOLO
from IPython.display import display, Image
import ipywidgets as widgets
from ipywidgets import interact, fixed

## 2. Data Loading and Preprocessing

In [2]:
# Define the dataset paths
DATASET_PATH = r"C:\Users\Hacking\fiftyone\open-images-v7"
TRAIN_DATA_PATH = r"C:\Users\Hacking\fiftyone\open-images-v7\train\data"
ANNOTATIONS_PATH = r"C:\Users\Hacking\fiftyone\open-images-v7\train\labels\detections.csv"
CLASSES_PATH = r"C:\Users\Hacking\fiftyone\open-images-v7\train\metadata\classes.csv"
YOLO_DIR = r"C:\Users\Hacking\AI Project\yolo_dataset"

In [3]:
def load_dataset():
    # Load classes
    classes_df = pd.read_csv(CLASSES_PATH)
    class_dict = {row.LabelName: idx for idx, row in enumerate(classes_df.itertuples())}
    class_names = classes_df['DisplayName'].tolist()
    
    # Load annotations
    annotations_df = pd.read_csv(ANNOTATIONS_PATH)
    
    # Get image paths
    image_files = list(Path(TRAIN_DATA_PATH).glob("*.jpg"))
    image_ids = [img.stem for img in image_files]
    
    print(f"Loaded {len(class_names)} classes and {len(image_files)} images")
    return annotations_df, class_dict, class_names, image_files, image_ids

def convert_to_yolo_format(annotations_df, class_dict, image_files, image_ids, train_ratio=0.8):
    # Randomly split data
    random.seed(42)
    random.shuffle(image_ids)
    split_idx = int(len(image_ids) * train_ratio)
    train_ids = set(image_ids[:split_idx])
    val_ids = set(image_ids[split_idx:])

    for img_path in image_files:
        img_id = img_path.stem
        img_annotations = annotations_df[annotations_df['ImageID'] == img_id]

        if img_annotations.empty:
            continue

        subset = "train" if img_id in train_ids else "val"

        # Copy image
        #shutil.copy(img_path, os.path.join(YOLO_DIR, "images", subset, img_path.name))
        dst_path = os.path.join(YOLO_DIR, "images", subset, img_path.name)
        if not os.path.exists(dst_path):
            shutil.copy(img_path, dst_path)
        # Create annotation file
        label_path = os.path.join(YOLO_DIR, "labels", subset, f"{img_id}.txt")
        with open(label_path, "w") as f:
            for _, row in img_annotations.iterrows():
                if row['LabelName'] not in class_dict:
                    continue

                class_id = class_dict[row['LabelName']]
                x_min, x_max = float(row['XMin']), float(row['XMax'])
                y_min, y_max = float(row['YMin']), float(row['YMax'])

                # Normalized values — do NOT divide again
                x_center = (x_min + x_max) / 2
                y_center = (y_min + y_max) / 2
                width = x_max - x_min
                height = y_max - y_min

                # Optional: Skip tiny boxes (common with noise in Open Images)
                if width < 0.01 or height < 0.01:
                    continue

                f.write(f"{class_id} {x_center} {y_center} {width} {height}\n")
                print(f"Writing label file: {label_path}")


    print(f"Converted {len(train_ids)} training images and {len(val_ids)} validation images")
    return train_ids, val_ids


def create_yaml_config(class_names):
    """Create YAML configuration file for YOLOv8."""
    yaml_content = {
        'train': os.path.join(YOLO_DIR, 'images', 'train'),
        'val': os.path.join(YOLO_DIR, 'images', 'val'),
        'nc': len(class_names),
        'names': class_names
    }
    
    yaml_path = os.path.join(YOLO_DIR, 'dataset.yaml')
    with open(yaml_path, 'w') as f:
        yaml_str = f"""train: {yaml_content['train']}
        val: {yaml_content['val']}
        nc: {yaml_content['nc']}
        names: {yaml_content['names']}"""
        f.write(yaml_str)
    
    print(f"Created YAML configuration at {yaml_path}")
    return yaml_path

In [4]:
FORCE_PREPROCESS = False

if FORCE_PREPROCESS or not os.path.exists(os.path.join(YOLO_DIR, 'dataset.yaml')):
    annotations_df, class_dict, class_names, image_files, image_ids = load_dataset()
    train_ids, val_ids = convert_to_yolo_format(annotations_df, class_dict, image_files, image_ids)
    yaml_path = create_yaml_config(class_names)
else:
    yaml_path = os.path.join(YOLO_DIR, 'dataset.yaml')
    print(f"YOLO dataset already processed. Using existing config at {yaml_path}")


YOLO dataset already processed. Using existing config at C:\Users\Hacking\AI Project\yolo_dataset\dataset.yaml


## 3. YOLOv8 Model Training

In [5]:
def train_yolo_model(yaml_path):  # added workers as parameter

    # Updated model path
    model_path = r'C:\Users\Hacking\AI Project\runs\detect\train6\weights\last.pt'

    # Load weights from the last model checkpoint for fine-tuning
    model = YOLO(model_path)

    results = model.train(
        data=yaml_path,
        epochs=25,
        imgsz=640,
        batch=16,
        save=True,
        resume=True,  # Resume if a checkpoint exists in the default run folder
        device=0 if torch.cuda.is_available() else 'cpu',
    )

    print(f"Training completed. Results saved to {model.trainer.save_dir}")
    return model


In [None]:
model = train_yolo_model(yaml_path)


## 4. Evaluation and Visualization

In [6]:
model = YOLO(r"C:\Users\Hacking\AI Project\runs\detect\train6\weights\best.pt")
class_names = model.names

  ckpt = torch.load(file, map_location="cpu")


In [9]:
def evaluate_model(model, yaml_path):
    # Run validation
    val_results = model.val(data=yaml_path)
    
    # Print results
    print(f"Validation Results:")
    print(f"mAP50: {val_results.box.map50:.4f}")
    print(f"mAP50-95: {val_results.box.map:.4f}")
    
    return val_results

def visualize_predictions(model, num_images=3, conf=0.25):
    # Get random validation images
    val_dir = os.path.join(YOLO_DIR, "images", "val")
    val_images = list(Path(val_dir).glob("*.jpg"))
    
    if len(val_images) == 0:
        print("No validation images found!")
        return
        
    # Select random images
    random.shuffle(val_images)
    selected_images = val_images[:min(num_images, len(val_images))]
    
    # Display predictions
    plt.figure(figsize=(15, 5 * len(selected_images)))
    
    for i, img_path in enumerate(selected_images):
        # Run prediction
        results = model.predict(img_path, conf=conf)[0]
        
        # Get annotated image
        annotated_img = results.plot()
        annotated_img = cv2.cvtColor(annotated_img, cv2.COLOR_BGR2RGB)
        
        # Display image
        plt.subplot(len(selected_images), 1, i+1)
        plt.imshow(annotated_img)
        plt.title(f"Predictions on {img_path.name}")
        plt.axis('off')
        
        # Print detected objects
        boxes = results.boxes
        print(f"\nDetections in {img_path.name}:")
        for box in boxes:
            cls_id = int(box.cls.item())
            conf = box.conf.item()
            class_name = class_names[cls_id]
            print(f"  - {class_name}: {conf:.2f}")
    
    plt.tight_layout()
    plt.show()

In [10]:
# Evaluate the model
val_results = evaluate_model(model, yaml_path)

# Visualize predictions on sample images
visualize_predictions(model, num_images=3)

Ultralytics YOLOv8.1.16 🚀 Python-3.10.0 torch-2.5.1+cu121 CUDA:0 (NVIDIA GeForce RTX 4060 Laptop GPU, 8188MiB)


Model summary (fused): 168 layers, 11358171 parameters, 0 gradients, 29.7 GFLOPs


[34m[1mval: [0mScanning C:\Users\Hacking\AI Project\yolo_dataset\labels\val.cache... 24306 images, 7 backgrounds, 0 corrupt: 100%|██████████| 24306/24306 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 1520/1520 [08:46<00:00,  2.89it/s]


                   all      24306     163301      0.417      0.314      0.272      0.188
              Tortoise      24306         43      0.307     0.0522      0.138      0.113
                Magpie      24306          5      0.922        0.4      0.397      0.338
            Sea turtle      24306         45      0.512      0.156      0.186       0.15
              Football      24306        116      0.453      0.647      0.643      0.473
             Ambulance      24306         40      0.298        0.5      0.316      0.257
                Ladder      24306         35      0.251        0.4      0.318      0.244
            Toothbrush      24306         16          1          0     0.0109    0.00904
               Syringe      24306          1          1          0     0.0585     0.0527
                  Sink      24306         92      0.293      0.391      0.286      0.178
                   Toy      24306        612      0.251      0.205      0.119     0.0726
Organ (Musical Instru

<Figure size 1500x1500 with 3 Axes>

## 5. Interactive Inference

In [7]:
from collections import Counter

def generate_caption_from_boxes(boxes, class_names, conf_threshold=0.2):
    labels = [class_names[int(box.cls.item())] for box in boxes if box.conf.item() >= conf_threshold]
    counts = Counter(labels)
    if not counts:
        return "No significant objects detected."

    parts = [f"{v} {k.lower() + ('s' if v > 1 else '')}" for k, v in counts.items()]
    caption = "This image shows " + ', '.join(parts[:-1]) + (' and ' + parts[-1] if len(parts) > 1 else parts[0]) + "."
    return caption


In [8]:
class_names = model.names
def predict_image(image_path, model, conf_threshold=0.25):
    # Run prediction
    results = model.predict(image_path, conf=conf_threshold)[0]
    
    # Get annotated image
    annotated_img = results.plot()
    annotated_img = cv2.cvtColor(annotated_img, cv2.COLOR_BGR2RGB)
    
    # Display image
    plt.figure(figsize=(10, 8))
    plt.imshow(annotated_img)
    plt.title("Object Detection Results")
    plt.axis('off')
    plt.show()
    
    # Print detected objects
    boxes = results.boxes
    print("Detected Objects:")
    detections = []
    for box in boxes:
        cls_id = int(box.cls.item())
        conf = box.conf.item()
        class_name = class_names[cls_id]
        print(f"  - {class_name}: {conf:.2f}")
        detections.append(f"{class_name}: {conf:.2f}")
    
    # Generate and print caption
    caption = generate_caption_from_boxes(boxes, class_names, conf_threshold)
    print("\nGenerated Caption:")
    print(caption)

def inference_from_upload():
    # Create widgets
    upload = widgets.FileUpload(accept='.jpg,.jpeg,.png', multiple=False, description='Upload Image')
    conf_slider = widgets.FloatSlider(value=0.25, min=0.1, max=0.9, step=0.05, description='Confidence:')
    button = widgets.Button(description='Detect Objects')
    output = widgets.Output()
    
    # Display widgets
    display(upload, conf_slider, button, output)
    
    # Define callback
    def on_button_click(b):
        with output:
            output.clear_output()
            if not upload.value:
                print("Please upload an image first!")
                return
                
            # Save uploaded file correctly
            file_info = upload.value[0]
            file_name = file_info['name']
            file_data = file_info['content']
            temp_path = f"temp_{file_name}"
            with open(temp_path, 'wb') as f:
                f.write(file_data)
                
            # Run prediction
            predict_image(temp_path, model, conf_slider.value)
            
            # Clean up
            os.remove(temp_path)
    
    # Register callback
    button.on_click(on_button_click)

def start_webcam_detection(model, conf_threshold=0.25):
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("Error: Could not open webcam.")
        return
    print("Webcam detection started. Press 'q' to exit.")
    while True:
        ret, frame = cap.read()
        if not ret:
            print("Error: Could not read frame.")
            break
        results = model.predict(frame, conf=conf_threshold)[0]
        annotated_frame = results.plot()

        # OPTIONAL: Add caption to frame
        caption = generate_caption_from_boxes(results.boxes, class_names, conf_threshold)
        cv2.putText(annotated_frame, caption, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,255,0), 2)

        cv2.imshow('YOLOv8 Object Detection', annotated_frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()


In [9]:
# Create interactive file upload widget
print("Upload an image to detect objects:")
inference_from_upload()

Upload an image to detect objects:


FileUpload(value=(), accept='.jpg,.jpeg,.png', description='Upload Image')

FloatSlider(value=0.25, description='Confidence:', max=0.9, min=0.1, step=0.05)

Button(description='Detect Objects', style=ButtonStyle())

Output()

In [10]:
# Webcam detection code
# Then run:
start_webcam_detection(model, conf_threshold=0.25)

Webcam detection started. Press 'q' to exit.

0: 480x640 1 Human beard, 1 Shirt, 1 Man, 1 Human face, 76.1ms
Speed: 0.0ms preprocess, 76.1ms inference, 8.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 Shirt, 1 Human ear, 1 Man, 1 Human face, 9.7ms
Speed: 2.0ms preprocess, 9.7ms inference, 8.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 Shirt, 1 Man, 1 Human face, 12.7ms
Speed: 3.2ms preprocess, 12.7ms inference, 2.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 Shirt, 1 Man, 1 Human face, 8.8ms
Speed: 4.0ms preprocess, 8.8ms inference, 5.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 Shirt, 1 Man, 1 Human face, 16.8ms
Speed: 0.0ms preprocess, 16.8ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 Shirt, 1 Man, 1 Human face, 14.7ms
Speed: 3.1ms preprocess, 14.7ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 Shirt, 1 Man, 1 Human face, 13.8ms
Speed: