Incorporating YOLOv8 Pose ONNX model #13

Closed
wants to merge 6 commits into from
12 changes: 12 additions & 0 deletions Podfile
@@ -0,0 +1,12 @@
# Uncomment the next line to define a global platform for your project
platform :ios, '15.0'

target 'YOLO' do
# Comment the next line if you don't want to use dynamic frameworks
use_frameworks!

# Pods for onnx
pod 'onnxruntime-objc', '~> 1.17.0-dev+20231211010.8f2b5a6'
pod 'onnxruntime-extensions-c'

end
9 changes: 9 additions & 0 deletions YOLO-Bridging-Header.h
@@ -0,0 +1,9 @@
//
// YOLO-Bridging-Header.h
//
//
// Created by Pradeep Banavara on 22/03/24.
//

#import <onnxruntime.h>
#include <onnxruntime_extensions.h>
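
The bridging header makes the ONNX Runtime Objective-C API and the extensions' RegisterCustomOps entry point visible to Swift. For reference, a minimal sketch of how that entry point is consumed, mirroring the session setup added in YOLO/Utils.swift below; makeSession is a hypothetical helper, not part of this PR:

import onnxruntime_objc

// Hypothetical helper (not part of this PR); mirrors the setup in OnnxPoseUtils.init().
// RegisterCustomOps is visible to Swift only because YOLO-Bridging-Header.h is set as the
// Objective-C bridging header in Build Settings.
func makeSession(modelPath: String) throws -> ORTSession {
    let env = try ORTEnv(loggingLevel: ORTLoggingLevel.info)
    let options = try ORTSessionOptions()
    try options.registerCustomOps(functionPointer: RegisterCustomOps)  // enables the pre/post-processing custom ops
    return try ORTSession(env: env, modelPath: modelPath, sessionOptions: options)
}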
2 changes: 1 addition & 1 deletion YOLO/Info.plist
@@ -21,7 +21,7 @@
<key>CFBundleShortVersionString</key>
<string>$(MARKETING_VERSION)</string>
<key>CFBundleVersion</key>
<string>24</string>
<string>182</string>
<key>ITSAppUsesNonExemptEncryption</key>
<false/>
<key>LSRequiresIPhoneOS</key>
106 changes: 2 additions & 104 deletions YOLO/Main.storyboard

Large diffs are not rendered by default.

70 changes: 45 additions & 25 deletions YOLO/Utilities/BoundingBoxView.swift
@@ -11,6 +11,7 @@

import Foundation
import UIKit
import SwiftUI

/// Manages the visualization of bounding boxes and associated labels for object detection results.
class BoundingBoxView {
@@ -19,12 +20,17 @@ class BoundingBoxView {

/// The layer that displays the label and confidence score for the detected object.
let textLayer: CATextLayer

/// The layer that displays the pose
let lineLayer: CAShapeLayer

/// The parent layer
var parentLayer: CALayer?
/// Initializes a new BoundingBoxView with configured shape and text layers.
init() {
shapeLayer = CAShapeLayer()
shapeLayer.fillColor = UIColor.clear.cgColor // No fill to only show the bounding outline
shapeLayer.lineWidth = 4 // Set the stroke line width
shapeLayer.lineWidth = 2 // Set the stroke line width
shapeLayer.isHidden = true // Initially hidden; shown when a detection occurs

textLayer = CATextLayer()
@@ -33,47 +39,61 @@ class BoundingBoxView {
textLayer.fontSize = 14 // Set font size for the label text
textLayer.font = UIFont(name: "Avenir", size: textLayer.fontSize) // Use Avenir font for labels
textLayer.alignmentMode = .center // Center-align the text within the layer

lineLayer = CAShapeLayer()
lineLayer.fillColor = UIColor.clear.cgColor
lineLayer.lineWidth = 2
lineLayer.isHidden = true
}

/// Adds the bounding box and text layers to a specified parent layer.
/// - Parameter parent: The CALayer to which the bounding box and text layers will be added.
func addToLayer(_ parent: CALayer) {
parent.addSublayer(shapeLayer)
parent.addSublayer(textLayer)
parentLayer = parent
parentLayer!.addSublayer(shapeLayer)
parentLayer!.addSublayer(textLayer)
parentLayer!.addSublayer(lineLayer)
}

/// Updates the bounding box and label to be visible with specified properties.
/// - Parameters:
/// - frame: The CGRect frame defining the bounding box's size and position.
/// - label: The text label to display (e.g., object class and confidence).
/// - color: The color of the bounding box stroke and label background.
/// - alpha: The opacity level for the bounding box stroke and label background.
func show(frame: CGRect, label: String, color: UIColor, alpha: CGFloat) {
/// - keypoints: The pose keypoints
/// - widthRatio: Scale factor applied to the keypoint x coordinates
/// - heightRatio: Scale factor applied to the keypoint y coordinates
///
func showOnnx(frame: CGRect, keypoints: [Float32], widthRatio: Float, heightRatio: Float ) {
CATransaction.setDisableActions(true) // Disable implicit animations

let path = UIBezierPath(roundedRect: frame, cornerRadius: 6.0) // Rounded rectangle for the bounding box
shapeLayer.path = path.cgPath
shapeLayer.strokeColor = color.withAlphaComponent(alpha).cgColor // Apply color and alpha to the stroke
shapeLayer.isHidden = false // Make the shape layer visible

textLayer.string = label // Set the label text
textLayer.backgroundColor = color.withAlphaComponent(alpha).cgColor // Apply color and alpha to the background
textLayer.isHidden = false // Make the text layer visible
textLayer.foregroundColor = UIColor.white.withAlphaComponent(alpha).cgColor // Set text color

// Calculate the text size and position based on the label content
let attributes = [NSAttributedString.Key.font: textLayer.font as Any]
let textRect = label.boundingRect(with: CGSize(width: 400, height: 100),
options: .truncatesLastVisibleLine,
attributes: attributes, context: nil)
let textSize = CGSize(width: textRect.width + 12, height: textRect.height) // Add padding to the text size
let textOrigin = CGPoint(x: frame.origin.x - 2, y: frame.origin.y - textSize.height - 2) // Position above the bounding box
textLayer.frame = CGRect(origin: textOrigin, size: textSize) // Set the text layer frame
shapeLayer.strokeColor = Color.black.cgColor // Stroke the bounding box in black
shapeLayer.lineWidth = 4
shapeLayer.isHidden = false // Make the shape layer visible
parentLayer?.addSublayer(shapeLayer)

// This loop has drawbacks: the keypoint layers added here are never removed, so previous keypoints remain on screen.
// The keypoint scaling is also approximate at best. This is a placeholder implementation for now.
for i in stride(from: 0, through: keypoints.count-1, by: 3) {
let keyPointsLayer = CAShapeLayer()
let kp_x = keypoints[i] * widthRatio
let kp_y = keypoints[i+1] * heightRatio
let confidence = keypoints[i+2]
if (confidence < 0.5) { // Can potentially remove hardcoding and make the confidence configurable
continue
}
let rFrame = CGRect(x: Double(kp_x), y: Double(kp_y), width: 10, height: 10)
let pointPath = UIBezierPath(roundedRect: rFrame, cornerRadius: 6.0)
keyPointsLayer.path = pointPath.cgPath
keyPointsLayer.isHidden = true
parentLayer?.addSublayer(keyPointsLayer)
keyPointsLayer.isHidden = false
}
}

/// Hides the bounding box and text layers.
func hide() {
shapeLayer.isHidden = true
textLayer.isHidden = true
lineLayer.isHidden = true
}
}
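
For reference, a minimal self-contained sketch of the (x, y, confidence) triplet layout that showOnnx(frame:keypoints:widthRatio:heightRatio:) decodes inline; PosePoint and decodeKeypoints are hypothetical names used only for illustration and are not part of this PR:

import CoreGraphics

struct PosePoint {
    let location: CGPoint
    let confidence: Float
}

func decodeKeypoints(_ keypoints: [Float32],
                     widthRatio: Float,
                     heightRatio: Float,
                     confidenceThreshold: Float = 0.5) -> [PosePoint] {
    var points: [PosePoint] = []
    // Keypoints arrive as a flat array of (x, y, confidence) triplets, one per keypoint.
    for i in stride(from: 0, to: keypoints.count - 2, by: 3) {
        let confidence = keypoints[i + 2]
        guard confidence >= confidenceThreshold else { continue }  // drop low-confidence points
        let x = CGFloat(keypoints[i] * widthRatio)                  // scale x into the layer's coordinates
        let y = CGFloat(keypoints[i + 1] * heightRatio)             // scale y into the layer's coordinates
        points.append(PosePoint(location: CGPoint(x: x, y: y), confidence: confidence))
    }
    return points
}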
157 changes: 157 additions & 0 deletions YOLO/Utils.swift
@@ -0,0 +1,157 @@
//
// Utils.swift
// YOLO
//
// Created by Pradeep Banavara on 22/03/24.
// Copyright © 2024 Ultralytics. All rights reserved.
//

import Foundation
import SwiftUI
import onnxruntime_objc
import UIKit

class OnnxPoseUtils : NSObject {
/**
### This function accepts a UIImage and renders the detected pose points on that image.
* It is key to use the correct model for this purpose; see the [model generation tutorial](https://onnxruntime.ai/docs/tutorials/mobile/pose-detection.html).
* It is also key to register the custom ops function (RegisterCustomOps) via the bridging header.
*/
var ortSession: ORTSession?
override init() {
do {
guard let modelPath = Bundle.main.path(forResource: "yolov8n-pose-pre", ofType: "onnx") else {
fatalError("Model file not found")
}
let ortEnv = try ORTEnv(loggingLevel: ORTLoggingLevel.info)
let ortSessionOptions = try ORTSessionOptions()
try ortSessionOptions.registerCustomOps(functionPointer: RegisterCustomOps) // Requires the bridging header to be registered in Build Settings
ortSession = try ORTSession(
env: ortEnv, modelPath: modelPath, sessionOptions: ortSessionOptions)
} catch {
NSLog("Model initialization error \(error)")
fatalError(error.localizedDescription)
}

}
func plotPose(image: UIImage) -> UIImage{
do {
let inputData = image.pngData()!
let inputDataLength = inputData.count
let inputShape = [NSNumber(integerLiteral: inputDataLength)]
let inputTensor = try ORTValue(tensorData: NSMutableData(data: inputData), elementType:ORTTensorElementDataType.uInt8, shape:inputShape)
let inputNames = try ortSession!.inputNames() // The input names should match the model input names. Visualize the model in Netron
let outputNames = try ortSession!.outputNames() // Check the model output names in Netron
let outputs = try ortSession!.run(
withInputs: [inputNames[0]: inputTensor], outputNames: Set(outputNames), runOptions: nil)

guard let outputTensor = outputs[outputNames[0]] else {
fatalError("Failed to get model keypoint output from inference.")
}
return try convertOutputTensorToImage(opTensor: outputTensor, inputImageData: inputData)

} catch {
print(error)
fatalError("Error in running the ONNX model")
}
}

/**
Helper function to convert the output tensor into an image with the bounding box and keypoint data.
*/
private func convertOutputTensorToImage(opTensor: ORTValue, inputImageData: Data) throws -> UIImage{

let output = try opTensor.tensorData()
var arr2 = Array<Float32>(repeating: 0, count: output.count/MemoryLayout<Float32>.stride) // Do not change the datatype Float32
_ = arr2.withUnsafeMutableBytes { output.copyBytes(to: $0) }

if (arr2.count > 0) {
var keypoints:[Float32] = Array()

// 57 is hardcoded based on the model output: 4 box coordinates, a score and class value, and 17 keypoints x 3 values (x, y, confidence). Refer to the Netron visualization for the output shape
for i in stride(from: arr2.count-57, to: arr2.count, by: 1) {
keypoints.append(arr2[i])
}
let box = keypoints[0..<4] // The first 4 values are the bounding box coordinates (center x, center y, width, height).
// Refer to the run_inference method in yolov8_pose_e2e.py under https://onnxruntime.ai/docs/tutorials/mobile/pose-detection.html
let half_w = box[2] / 2
let half_h = box[3] / 2
let x = Double(box[0] - half_w)
let y = Double(box[1] - half_h)


let rect = CGRect(x: x, y: y, width: Double(half_w * 2), height: Double(half_h * 2))
NSLog("Rect is \(rect)")
let image:UIImage = UIImage(data: inputImageData) ?? UIImage()
let keypointsWithoutBoxes = Array(keypoints[6..<keypoints.count]) // Skip the first 6 values; the rest are 17 keypoints with 3 entries each (x, y, confidence)
return drawKeyPointsOnImage(image: image, rectangle: rect, keypoints: keypointsWithoutBoxes)
} else {
return UIImage(data: inputImageData)!
}
}

/**
Helper function that takes an input image, a bounding box CGRect, and the keypoint data, and returns a new image with the rect and keypoints drawn.
TODO: Avoid generating a new image on every call; instead, paint the data onto the existing image. iOS experts to chime in.

*/
private func drawKeyPointsOnImage(image: UIImage, rectangle:CGRect, keypoints: [Float32]) -> UIImage {
var image = image
let imageSize = image.size
let scale: CGFloat = 0
UIGraphicsBeginImageContextWithOptions(imageSize, false, scale)
image.draw(at: CGPoint.zero)
UIColor.red.setFill()
UIColor.red.setStroke()
UIRectFrame(rectangle)

guard let context = UIGraphicsGetCurrentContext() else { return UIImage() }
context.setLineWidth(2.0)
context.setStrokeColor(UIColor.blue.cgColor)
context.move(to: CGPoint(x: Double(keypoints[0]), y: Double(keypoints[1])))

for i in stride(from: 0, through: keypoints.count-1, by: 3) {
let kp_x = keypoints[i]
let kp_y = keypoints[i+1]
let confidence = keypoints[i+2]
if (confidence < 0.5) { // Can potentially remove hardcoding and make the confidence configurable
continue
}
let rect = CGRect(x: Double(kp_x), y: Double(kp_y), width: 10.0, height: 10.0)
UIRectFill(rect)

}
image = UIGraphicsGetImageFromCurrentImageContext()!
UIGraphicsEndImageContext()
return image
}

/// Placeholder method to draw lines for poses.
private func drawPoseLines(image: UIImage, keypoints: [Float32]) -> UIImage {
var image = image
let imageSize = image.size
let scale: CGFloat = 0
UIGraphicsBeginImageContextWithOptions(imageSize, false, scale)
image.draw(at: CGPoint.zero)
guard let context = UIGraphicsGetCurrentContext() else { return UIImage() }
context.setLineWidth(2.0)
context.setStrokeColor(UIColor.blue.cgColor)


for i in stride(from: 3, through: keypoints.count-1, by: 3) {
context.move(to: CGPoint(x: Double(keypoints[i-3]), y: Double(keypoints[i-2])))
let kp_x = keypoints[i]
let kp_y = keypoints[i+1]
let confidence = keypoints[i+2]
if (confidence < 0.5) { // Can potentially remove hardcoding and make the confidence configurable
continue
}
context.addLine(to: CGPoint(x: Double(kp_x), y: Double(kp_y)))
context.strokePath()

}
image = UIGraphicsGetImageFromCurrentImageContext()!
UIGraphicsEndImageContext()
return image
}
}
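
For context, a hypothetical usage sketch (not part of this PR) showing how OnnxPoseUtils.plotPose(image:) could be exercised from a simple view controller; PosePreviewViewController and the "sample" asset name are assumptions:

import UIKit

final class PosePreviewViewController: UIViewController {
    // Loads yolov8n-pose-pre.onnx from the app bundle in its initializer.
    private let poseUtils = OnnxPoseUtils()
    private let imageView = UIImageView()

    override func viewDidLoad() {
        super.viewDidLoad()
        imageView.frame = view.bounds
        imageView.contentMode = .scaleAspectFit
        view.addSubview(imageView)

        // Assumption: "sample" is a test image added to the asset catalog.
        if let sample = UIImage(named: "sample") {
            imageView.image = poseUtils.plotPose(image: sample)
        }
    }
}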
77 changes: 77 additions & 0 deletions YOLO/VNOnnxHandler.swift
@@ -0,0 +1,77 @@
//
// VNOnnxHandler.swift
// YOLO
//
// Created by Pradeep Banavara on 25/03/24.
// Copyright © 2024 Ultralytics. All rights reserved.
//

import Foundation
import Vision
import CoreGraphics
import VideoToolbox
import UIKit

class VNOnnxHandler {
var sampleBuffer: CVImageBuffer?
var ortSession: ORTSession
init(cvImageBuffer: CVImageBuffer, session: ORTSession) {
sampleBuffer = cvImageBuffer
ortSession = session

}

private func convertImageBufferToData(sampleBuffer: CVPixelBuffer) -> NSData {
let imageBuffer = sampleBuffer
CVPixelBufferLockBaseAddress(imageBuffer, CVPixelBufferLockFlags(rawValue: 0))
let bytesPerRow = CVPixelBufferGetBytesPerRow(imageBuffer)
let height = CVPixelBufferGetHeight(imageBuffer)
let src_buff = CVPixelBufferGetBaseAddress(imageBuffer)
let data = NSData(bytes: src_buff, length: bytesPerRow * height)
CVPixelBufferUnlockBaseAddress(imageBuffer, CVPixelBufferLockFlags(rawValue: 0))
return data
}

/// Runs inference on the input image and returns the output tensor, which can be used to draw bounding boxes and keypoints.
func perform() throws -> ORTValue {
var result: ORTValue?
let uiImage = UIImage(cgImage: CGImage.create(from: sampleBuffer!)!)
let inputData = uiImage.pngData()
let inputDataLength = inputData?.count
let inputShape = [NSNumber(integerLiteral: inputDataLength!)]
let inputTensor = try ORTValue(tensorData: NSMutableData(data: inputData!), elementType:ORTTensorElementDataType.uInt8, shape:inputShape)
let inputNames = try ortSession.inputNames() // The input names should match the model input names. Visualize the model in Netron
let outputNames = try ortSession.outputNames() // Check the model output names in Netron
let outputs = try ortSession.run(
withInputs: [inputNames[0]: inputTensor], outputNames: Set(outputNames), runOptions: nil)
guard let outputTensor = outputs[outputNames[0]] else {
fatalError("Failed to get model keypoint output from inference.")
}
result = outputTensor
return result!
}

/// Handler method that renders the result onto an image layer instead of superimposing the bounding box on the videoPreviewLayer.
/// The superimposing approach is served by the perform() method above.
func performImage(poseUtil: OnnxPoseUtils) throws -> UIImage{
var result: UIImage?
let uiImage = UIImage(cgImage: CGImage.create(from: sampleBuffer!)!)
result = poseUtil.plotPose(image: uiImage)
return result!
}
}

extension CGImage {
static func create(from cvPixelBuffer: CVPixelBuffer?) -> CGImage? {
guard let pixelBuffer = cvPixelBuffer else {
return nil
}
var image: CGImage?
VTCreateCGImageFromCVPixelBuffer(
pixelBuffer,
options: nil,
imageOut: &image)
return image
}
}
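
And a hypothetical sketch (not part of this PR) of how VNOnnxHandler might be driven from an AVCaptureVideoDataOutput delegate; CameraViewController, its poseUtils property, and overlayImageView are assumptions:

import AVFoundation
import UIKit

final class CameraViewController: UIViewController, AVCaptureVideoDataOutputSampleBufferDelegate {
    // Assumed properties: a shared pose utility and an image view layered over the camera preview.
    let poseUtils = OnnxPoseUtils()
    let overlayImageView = UIImageView()

    func captureOutput(_ output: AVCaptureOutput,
                       didOutput sampleBuffer: CMSampleBuffer,
                       from connection: AVCaptureConnection) {
        guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer),
              let session = poseUtils.ortSession else { return }
        let handler = VNOnnxHandler(cvImageBuffer: pixelBuffer, session: session)
        if let annotated = try? handler.performImage(poseUtil: poseUtils) {
            DispatchQueue.main.async { self.overlayImageView.image = annotated }
        }
    }
}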
