In [4]:
%matplotlib inline
from ultralytics import YOLO 
import os
import random
import shutil

weights_path = "../../models/" 
results_path = "../../results"
train_dir_path = "/opt/homebrew/datasets/VisDrone/VisDrone2019-DET-train"

## Finetuning of YOLOv8 on VisDrone

In [None]:
model = YOLO(weights_path + "yolov8n.pt")
results = model.train(data="VisDrone.yaml", epochs=19, imgsz=640, device = "mps", save=True, amp=False, batch=4)

In [None]:
model.save(weights_path + "yolov8n_finetuned_visdrone_19_epochs.pt")

## Quantitative evaluation of models on VisDrone val set 

In this part, we will evaluate the performance of the models on the VisDrone validation set, to assess the impact of the finetuning on the performance of YOLOv8n on the object recognition task on UAV images.

In [None]:
finetuned_visdrone_yolov8n = YOLO(weights_path + "yolov8n_finetuned_visdrone_19_epochs.pt")
regular_visdrone_yolov8n = YOLO(weights_path + "yolov8n.pt")

In [None]:
metrics_finetuned_yolov8n = finetuned_visdrone_yolov8n.val(data="VisDrone.yaml", imgsz=640, device="cpu", save_json=True, batch=8) 
metrics_regular_yolov8n = regular_visdrone_yolov8n.val(data="VisDrone.yaml", imgsz=640, device="cpu", save_json=True, batch=8)

## Visualizing the results of the models on VisDrone test set

This section will be dedicated to the visualization of the results of the models on the test set. The results will be visualized in the form of bounding boxes on the images. We will evaluate different models including : the yolov8n model pretrained on the COCO dataset, the yolov8n model pretrained on COCO and finetuned on the VisDrone dataset. We hope to see that the latter will perform better than the former.

In [None]:
import numpy as np
import os
from ultralytics import YOLO
import torch 

data_path_visdrone = "../../data/still_frames/VisDrone2019DETtest/"

finetuned_visdrone_yolov8n = YOLO(weights_path + "yolov8n_finetuned_visdrone_19_epochs.pt")
regular_visdrone_yolov8n = YOLO(weights_path + "yolov8n.pt")

np.random.seed(42)  # setting seed for reproducibility

test_images = os.listdir(data_path_visdrone + "/images")
test_images = [os.path.join(data_path_visdrone, "images", img) for img in test_images]
test_images = np.random.choice(test_images, 80, replace=False)

def run_inference(model, test_images, device="cpu", save_results=None):
    results = []

    if device == "mps" and not torch.backends.mps.is_available():
        print("MPS device is not available. Falling back to CPU.")
        device = "cpu"
    elif device == "cuda" and not torch.cuda.is_available():
        print("CUDA device is not available. Falling back to CPU.")
        device = "cpu"

    print(f"Running inference on device: {device}")


    for img_path in test_images:
        try:
            result = model(img_path, device=device, verbose=False, imgsz=640)
            if save_results is not None:
                if not os.path.exists(save_results):
                    os.makedirs(save_results)
                result[0].save(filename=os.path.join(save_results, f"{os.path.basename(img_path)}"))
            results.append(result)
        except Exception as e:
            print(f"Error processing image {img_path}: {str(e)}")

    print(f"Successfully processed {len(results)} images")
    return results


results_finetuned_yolov8n = run_inference(finetuned_visdrone_yolov8n, test_images, device="cpu", save_results=os.path.join(results_path, "finetuned_yolov8n"))
results_regular_yolov8n = run_inference(regular_visdrone_yolov8n, test_images, device="cpu", save_results=os.path.join(results_path, "regular_yolov8n"))

In [None]:
annotated_frames_finetuned_yolo8n = [r[0].plot() for r in results_finetuned_yolov8n]
annotated_frames_regular_yolo8n = [r[0].plot() for r in results_regular_yolov8n]

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_yolo_comparison(annotated_frames_finetuned, annotated_frames_regular, num_samples=10):
    fig, axs = plt.subplots(num_samples, 2, figsize=(20, num_samples * 5))
    fig.suptitle("Finetuned YOLOv8n vs Regular YOLOv8n on VisDrone dataset", fontsize=16)

    # Randomly selecting a subset of images to display
    available_indices = list(range(len(annotated_frames_finetuned)))
    selected_indices = np.random.choice(available_indices, num_samples, replace=False)

    for i, idx in enumerate(selected_indices):
        axs[i, 0].imshow(annotated_frames_regular[idx])
        axs[i, 0].set_title(f"Regular YOLOv8n - Image {idx}")
        axs[i, 0].axis('off')

        axs[i, 1].imshow(annotated_frames_finetuned[idx])
        axs[i, 1].set_title(f"Fine-tuned YOLOv8n - Image {idx}")
        axs[i, 1].axis('off')

    plt.tight_layout()
    plt.subplots_adjust(top=0.969)  
    plt.show()

plot_yolo_comparison(annotated_frames_finetuned_yolo8n, annotated_frames_regular_yolo8n)

## Efficiency metrics

In this section, we will evaluate the finetuned and regular yolov8n models on the visdrone test set. We will use the following metrics to evaluate how frugal the models are:
- Inference time per frame, in milliseconds
- Speed when running on CPU with onnx conversion 

In [None]:
finetuned_visdrone_yolov8n.export(format="onnx", simplify=True, dynamic=True, opset=19)

In [None]:
finetuned_visdrone_yolov8n = YOLO(weights_path + "/yolov8n_finetuned_visdrone_19_epochs.pt")
finetuned_visdrone_onnx_yolov8n = YOLO(weights_path + "/yolov8n_finetuned_visdrone_19_epochs.onnx")

results_finetuned_yolov8n = run_inference(finetuned_visdrone_yolov8n, test_images, device="cpu", save_results=os.path.join(results_path, "finetuned_yolov8n_onnx"))
results_finetuned_onnx_yolov8n = run_inference(finetuned_visdrone_onnx_yolov8n, test_images, device="cpu", save_results=os.path.join(results_path, "finetuned_onnx_yolov8n"))

In [None]:
import pandas as pd

speed_data_finetuned_yolov8n = [r[0].speed for r in results_finetuned_yolov8n]
speed_data_finetuned_yolov8n_onnx = [r[0].speed for r in results_finetuned_onnx_yolov8n]

mean_processing_time_finetuned_yolov8n = np.mean([r["preprocess"] for r in speed_data_finetuned_yolov8n])
mean_inference_time_finetuned_yolov8n = np.mean([r["inference"] for r in speed_data_finetuned_yolov8n])
mean_postprocessing_time_finetuned_yolov8n = np.mean([r["postprocess"] for r in speed_data_finetuned_yolov8n])

mean_processing_time_finetuned_yolov8n_onnx = np.mean([r["preprocess"] for r in speed_data_finetuned_yolov8n_onnx])
mean_inference_time_finetuned_yolov8n_onnx = np.mean([r["inference"] for r in speed_data_finetuned_yolov8n_onnx])
mean_postprocessing_time_finetuned_yolov8n_onnx = np.mean([r["postprocess"] for r in speed_data_finetuned_yolov8n_onnx])

speed_data_benchmark = pd.DataFrame({
    "Model": ["YOLOv8n", "YOLOv8n ONNX"], #"YOLOv8s", "YOLOv8s ONNX"],
    "Mean Preprocessing Time (ms)": [mean_processing_time_finetuned_yolov8n, mean_processing_time_finetuned_yolov8n_onnx], #mean_processing_time_finetuned_yolov8s, mean_processing_time_finetuned_yolov8s_onnx],
    "Mean Inference Time (ms)": [mean_inference_time_finetuned_yolov8n, mean_inference_time_finetuned_yolov8n_onnx], #mean_inference_time_finetuned_yolov8s, mean_inference_time_finetuned_yolov8s_onnx],
    "Mean Postprocessing Time (ms)": [mean_postprocessing_time_finetuned_yolov8n, mean_postprocessing_time_finetuned_yolov8n_onnx], # mean_postprocessing_time_finetuned_yolov8s, mean_postprocessing_time_finetuned_yolov8s_onnx]
})

In [None]:
print(speed_data_benchmark)

## Real time object detection on a UAV video

In [None]:
import cv2
import os
from tqdm import tqdm

# we create a video from frames since opencv videocapture method does not work with individual frames
def create_video_from_frames(frame_folder, output_video_path, fps=30):
    images = sorted([img for img in os.listdir(frame_folder) if img.endswith(".jpg")])
    
    if not images:
        raise ValueError(f"No JPG images found in {frame_folder}")

    first_frame = cv2.imread(os.path.join(frame_folder, images[0]))
    if first_frame is None:
        raise ValueError(f"Unable to read the first frame: {images[0]}")
    
    height, width, layers = first_frame.shape

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

    for image in tqdm(images, desc="Creating video"):
        frame = cv2.imread(os.path.join(frame_folder, image))
        if frame is not None:
            video.write(frame)
        else:
            print(f"Warning: Unable to read frame {image}")

    video.release()
    print(f"Video saved to {output_video_path}")


In [None]:
create_video_from_frames("../../data/videos/VisDrone2019-VID-test-dev/sequences/uav0000297_02761_v", "../../data/videos/VisDrone2019-VID-test-dev/sequences/uav0000297_02761_v/full_sequence_uav0000297_02761_v.mp4")

In [None]:
create_video_from_frames("../../data/videos/UAV-benchmark-S/S0102", "../../data/videos/UAV-benchmark-S/S0102/full_sequence_S0102.mp4")

In [None]:
create_video_from_frames("../../data/videos/VisDrone2019-VID-test-dev/sequences/uav0000201_00000_v", "../../data/videos/VisDrone2019-VID-test-dev/sequences/uav0000201_00000_v/full_sequence_uav0000201_00000_v.mp4")

In [None]:
create_video_from_frames("../../data/videos/VisDrone2019-VID-test-dev/sequences/uav0000306_00230_v", "../../data/videos/VisDrone2019-VID-test-dev/sequences/uav0000306_00230_v/full_sequence_uav0000306_00230_v.mp4")

In [5]:
import cv2
from ultralytics import YOLO
import numpy as np
import time


def real_time_demo(video_path, model, window_name="Comparison", model_name=None, save=None):
    cap = cv2.VideoCapture(video_path)
    
    running_mean_processing_time = 0
    
    # Getting video properties
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    video_fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    
    if save:
        output_path = f"{save}/{model_name}_{video_path.split('/')[-1]}"
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, video_fps, (width, height))
    
    frame_count = 0
    
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        frame_count += 1
        
        # Process frame
        process_start = time.time()
        results = model(frame, device='cpu', verbose=False, imgsz=640)
        annotated_frame = results[0].plot()
        process_end = time.time()
        
        process_time = process_end - process_start
        
        running_mean_processing_time = (running_mean_processing_time * (frame_count - 1) + process_time) / frame_count
        
        # Adding info to frame
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.7
        color = (128, 0, 128) 
        thickness = 2
        line_spacing = 30

        cv2.putText(annotated_frame, f"Model: {model_name}", (10, 30), font, font_scale, color, thickness)
        cv2.putText(annotated_frame, f"Running Mean Processing Time: {running_mean_processing_time*1000:.2f} ms", (10, 30 + line_spacing), font, font_scale, color, thickness)
        cv2.putText(annotated_frame, f"Video FPS: {video_fps:.2f}", (10, 30 + 2*line_spacing), font, font_scale, color, thickness)
        cv2.putText(annotated_frame, f"Frame: {frame_count}/{total_frames}", (10, 30 + 3*line_spacing), font, font_scale, color, thickness)
        
        cv2.imshow(window_name, annotated_frame)
        
        if save:
            out.write(annotated_frame)
        
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    
    cap.release()
    if save:
        out.release()
    cv2.destroyAllWindows()
    
    return running_mean_processing_time*1000


regular_yolov8n = YOLO(weights_path + "/yolov8n.pt")
finetuned_yolov8n = YOLO(weights_path + "/yolov8n_finetuned_visdrone_19_epochs.pt")
finetuned_onnx_yolov8n = YOLO(weights_path + "/yolov8n_finetuned_visdrone_19_epochs.onnx")

# finetuned_yolov8s = YOLO(weights_path + "/yolov8s_finetuned_visdrone_20_epochs.pt")
# finetuned_onnx_yolov8s = YOLO(weights_path + "/yolov8s_finetuned_visdrone_20_epochs.onnx")

video_paths = [
    "../../data/videos/VisDrone2019-VID-test-dev/sequences/uav0000297_02761_v/full_sequence_uav0000297_02761_v.mp4",
    "../../data/videos/UAV-benchmark-S/S0102/full_sequence_S0102.mp4",
    "../../data/videos/VisDrone2019-VID-test-dev/sequences/uav0000201_00000_v/full_sequence_uav0000201_00000_v.mp4",
    "../../data/videos/VisDrone2019-VID-test-dev/sequences/uav0000306_00230_v/full_sequence_uav0000306_00230_v.mp4"
]





In [8]:
# Running comparisons between the models
import pandas as pd

results_path_real_time = "../../results/real_time_demo"
if not os.path.exists(results_path_real_time):
    os.makedirs(results_path_real_time)
    
speed_df = pd.DataFrame(columns=["Model", "Processing Time (ms)"])
models = {"Regular YOLOv8n": regular_yolov8n, "Finetuned YOLOv8n": finetuned_yolov8n, "Finetuned YOLOv8n ONNX": finetuned_onnx_yolov8n} #"Finetuned YOLOv8s": finetuned_yolov8s, "Finetuned YOLOv8s ONNX": finetuned_onnx_yolov8s}
results_paths_videos = {"Regular YOLOv8n": os.path.join(results_path_real_time, "regular_yolov8n"), "Finetuned YOLOv8n": os.path.join(results_path_real_time, "finetuned_yolov8n"), "Finetuned YOLOv8n ONNX": os.path.join(results_path_real_time, "finetuned_yolov8n_onnx")} #"Finetuned YOLOv8s": os.path.join(results_path, "finetuned_yolov8s"), "Finetuned YOLOv8s ONNX": os.path.join(results_path, "finetuned_yolov8s_onnx")}
for model_name, model in models.items():
    if not os.path.exists(results_paths_videos[model_name]):
        os.makedirs(results_paths_videos[model_name])
    
for model_name, model in models.items():
    avg_processing_time = 0
    for video_path in video_paths:
        mean_processing_time  = real_time_demo(video_path, model, window_name=f"{model_name}", model_name=model_name, save=results_paths_videos[model_name])
        avg_processing_time += mean_processing_time
    avg_processing_time /= len(video_paths)
    if model_name == "Regular YOLOv8n":
        # continuing the loop without adding the processing time of the regular model, since the comparison we are interested in is with the finetuned model
        continue
    else: 
        speed_df = pd.concat([speed_df, pd.DataFrame({"Model": [model_name], "Processing Time (ms)": [avg_processing_time]})])
    
print(speed_df)

Loading ../../models/yolov8n_finetuned_visdrone_19_epochs.onnx for ONNX Runtime inference...


  speed_df = pd.concat([speed_df, pd.DataFrame({"Model": [model_name], "Processing Time (ms)": [avg_processing_time]})])


                    Model  Processing Time (ms)
0       Finetuned YOLOv8n             44.314221
0  Finetuned YOLOv8n ONNX             48.046318


In [None]:
# tracking
# using the tracking feature of yolov8 in ultralytics, that allows to persist tracks between frames

from ultralytics import YOLO
import cv2 

fine_tuned_yolov8n = YOLO(weights_path + "yolov8n_finetuned_visdrone_19_epochs.pt")

# Open the video file
video_path = "../../data/videos/VisDrone2019-VID-test-dev/sequences/uav0000201_00000_v/full_sequence_uav0000201_00000_v.mp4"
cap = cv2.VideoCapture(video_path)

# Loop through the video frames
while cap.isOpened():
    # Read a frame from the video
    success, frame = cap.read()

    if success:
        # Run YOLOv8 tracking on the frame, persisting tracks between frames
        results = fine_tuned_yolov8n.track(frame, persist=True, device="cpu", verbose=False, imgsz=640)

        # Visualize the results on the frame
        annotated_frame = results[0].plot()

        # Display the annotated frame
        cv2.imshow("YOLOv8 Tracking", annotated_frame)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
    else:
        # Break the loop if the end of the video is reached
        break

# Release the video capture object and close the display window
cap.release()
cv2.destroyAllWindows()