Incorporating YOLOv8 Pose ONNX model #13

Closed
wants to merge 6 commits into from
12 changes: 12 additions & 0 deletions Podfile
@@ -0,0 +1,12 @@
# Uncomment the next line to define a global platform for your project
platform :ios, '15.0'

target 'YOLO' do
# Comment the next line if you don't want to use dynamic frameworks
use_frameworks!

# Pods for onnx
pod 'onnxruntime-objc', '~> 1.17.0-dev+20231211010.8f2b5a6'
pod 'onnxruntime-extensions-c'

end
9 changes: 9 additions & 0 deletions YOLO-Bridging-Header.h
@@ -0,0 +1,9 @@
//
// YOLO-Bridging-Header.h
//
//
// Created by Pradeep Banavara on 22/03/24.
//

#import <onnxruntime.h>
#include <onnxruntime_extensions.h>
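
The bridging header makes the ONNX Runtime Objective-C API and the extensions' RegisterCustomOps entry point visible to Swift. For reference, a minimal sketch of how that entry point is consumed, mirroring the session setup added in YOLO/Utils.swift below; makeSession is a hypothetical helper, not part of this PR:

import onnxruntime_objc

// Hypothetical helper (not part of this PR); mirrors the setup in OnnxPoseUtils.init().
// RegisterCustomOps is visible to Swift only because YOLO-Bridging-Header.h is set as the
// Objective-C bridging header in Build Settings.
func makeSession(modelPath: String) throws -> ORTSession {
    let env = try ORTEnv(loggingLevel: ORTLoggingLevel.info)
    let options = try ORTSessionOptions()
    try options.registerCustomOps(functionPointer: RegisterCustomOps)  // enables the pre/post-processing custom ops
    return try ORTSession(env: env, modelPath: modelPath, sessionOptions: options)
}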
2 changes: 1 addition & 1 deletion YOLO/Info.plist
@@ -21,7 +21,7 @@
<key>CFBundleShortVersionString</key>
<string>$(MARKETING_VERSION)</string>
<key>CFBundleVersion</key>
<string>24</string>
<string>182</string>
<key>ITSAppUsesNonExemptEncryption</key>
<false/>
<key>LSRequiresIPhoneOS</key>
106 changes: 2 additions & 104 deletions YOLO/Main.storyboard

Large diffs are not rendered by default.

70 changes: 45 additions & 25 deletions YOLO/Utilities/BoundingBoxView.swift
@@ -11,6 +11,7 @@

import Foundation
import UIKit
import SwiftUI

/// Manages the visualization of bounding boxes and associated labels for object detection results.
class BoundingBoxView {
@@ -19,12 +20,17 @@ class BoundingBoxView {

/// The layer that displays the label and confidence score for the detected object.
let textLayer: CATextLayer

/// The layer that displays the pose
let lineLayer: CAShapeLayer

/// The parent layer
var parentLayer: CALayer?
/// Initializes a new BoundingBoxView with configured shape and text layers.
init() {
shapeLayer = CAShapeLayer()
shapeLayer.fillColor = UIColor.clear.cgColor // No fill to only show the bounding outline
shapeLayer.lineWidth = 4 // Set the stroke line width
shapeLayer.lineWidth = 2 // Set the stroke line width
shapeLayer.isHidden = true // Initially hidden; shown when a detection occurs

textLayer = CATextLayer()
@@ -33,47 +39,61 @@ class BoundingBoxView {
textLayer.fontSize = 14 // Set font size for the label text
textLayer.font = UIFont(name: "Avenir", size: textLayer.fontSize) // Use Avenir font for labels
textLayer.alignmentMode = .center // Center-align the text within the layer

lineLayer = CAShapeLayer()
lineLayer.fillColor = UIColor.clear.cgColor
lineLayer.lineWidth = 2
lineLayer.isHidden = true
}

/// Adds the bounding box and text layers to a specified parent layer.
/// - Parameter parent: The CALayer to which the bounding box and text layers will be added.
func addToLayer(_ parent: CALayer) {
parent.addSublayer(shapeLayer)
parent.addSublayer(textLayer)
parentLayer = parent
parentLayer!.addSublayer(shapeLayer)
parentLayer!.addSublayer(textLayer)
parentLayer!.addSublayer(lineLayer)
}

/// Updates the bounding box and label to be visible with specified properties.
/// - Parameters:
/// - frame: The CGRect frame defining the bounding box's size and position.
/// - label: The text label to display (e.g., object class and confidence).
/// - color: The color of the bounding box stroke and label background.
/// - alpha: The opacity level for the bounding box stroke and label background.
func show(frame: CGRect, label: String, color: UIColor, alpha: CGFloat) {
/// - keypoints: The pose keypoints
/// - widthRatio: Scale factor applied to the keypoint x coordinates
/// - heightRatio: Scale factor applied to the keypoint y coordinates
///
func showOnnx(frame: CGRect, keypoints: [Float32], widthRatio: Float, heightRatio: Float ) {
CATransaction.setDisableActions(true) // Disable implicit animations

let path = UIBezierPath(roundedRect: frame, cornerRadius: 6.0) // Rounded rectangle for the bounding box
shapeLayer.path = path.cgPath
shapeLayer.strokeColor = color.withAlphaComponent(alpha).cgColor // Apply color and alpha to the stroke
shapeLayer.isHidden = false // Make the shape layer visible

textLayer.string = label // Set the label text
textLayer.backgroundColor = color.withAlphaComponent(alpha).cgColor // Apply color and alpha to the background
textLayer.isHidden = false // Make the text layer visible
textLayer.foregroundColor = UIColor.white.withAlphaComponent(alpha).cgColor // Set text color

// Calculate the text size and position based on the label content
let attributes = [NSAttributedString.Key.font: textLayer.font as Any]
let textRect = label.boundingRect(with: CGSize(width: 400, height: 100),
options: .truncatesLastVisibleLine,
attributes: attributes, context: nil)
let textSize = CGSize(width: textRect.width + 12, height: textRect.height) // Add padding to the text size
let textOrigin = CGPoint(x: frame.origin.x - 2, y: frame.origin.y - textSize.height - 2) // Position above the bounding box
textLayer.frame = CGRect(origin: textOrigin, size: textSize) // Set the text layer frame
shapeLayer.strokeColor = Color.black.cgColor // Stroke the bounding box in black
shapeLayer.lineWidth = 4
shapeLayer.isHidden = false // Make the shape layer visible
parentLayer?.addSublayer(shapeLayer)

// This loop has drawbacks: the keypoint layers added here are never removed, so previous keypoints remain on screen.
// The keypoint scaling is also approximate at best. This is a placeholder implementation for now.
for i in stride(from: 0, through: keypoints.count-1, by: 3) {
let keyPointsLayer = CAShapeLayer()
let kp_x = keypoints[i] * widthRatio
let kp_y = keypoints[i+1] * heightRatio
let confidence = keypoints[i+2]
if (confidence < 0.5) { // Can potentially remove hardcoding and make the confidence configurable
continue
}
let rFrame = CGRect(x: Double(kp_x), y: Double(kp_y), width: 10, height: 10)
let pointPath = UIBezierPath(roundedRect: rFrame, cornerRadius: 6.0)
keyPointsLayer.path = pointPath.cgPath
keyPointsLayer.isHidden = true
parentLayer?.addSublayer(keyPointsLayer)
keyPointsLayer.isHidden = false
}
}

/// Hides the bounding box and text layers.
func hide() {
shapeLayer.isHidden = true
textLayer.isHidden = true
lineLayer.isHidden = true
}
}
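
For reference, a minimal self-contained sketch of the (x, y, confidence) triplet layout that showOnnx(frame:keypoints:widthRatio:heightRatio:) decodes inline; PosePoint and decodeKeypoints are hypothetical names used only for illustration and are not part of this PR:

import CoreGraphics

struct PosePoint {
    let location: CGPoint
    let confidence: Float
}

func decodeKeypoints(_ keypoints: [Float32],
                     widthRatio: Float,
                     heightRatio: Float,
                     confidenceThreshold: Float = 0.5) -> [PosePoint] {
    var points: [PosePoint] = []
    // Keypoints arrive as a flat array of (x, y, confidence) triplets, one per keypoint.
    for i in stride(from: 0, to: keypoints.count - 2, by: 3) {
        let confidence = keypoints[i + 2]
        guard confidence >= confidenceThreshold else { continue }  // drop low-confidence points
        let x = CGFloat(keypoints[i] * widthRatio)                  // scale x into the layer's coordinates
        let y = CGFloat(keypoints[i + 1] * heightRatio)             // scale y into the layer's coordinates
        points.append(PosePoint(location: CGPoint(x: x, y: y), confidence: confidence))
    }
    return points
}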
157 changes: 157 additions & 0 deletions YOLO/Utils.swift
@@ -0,0 +1,157 @@
//
// Utils.swift
// YOLO
//
// Created by Pradeep Banavara on 22/03/24.
// Copyright © 2024 Ultralytics. All rights reserved.
//

import Foundation
import SwiftUI
import onnxruntime_objc
import UIKit

class OnnxPoseUtils : NSObject {
/**
### This function accepts a UIImage and renders the detected pose points on that image.
* It is key to use the correct model for this purpose; see the [model generation tutorial](https://onnxruntime.ai/docs/tutorials/mobile/pose-detection.html).
* It is also key to register the custom ops function (RegisterCustomOps) via the bridging header.
*/
var ortSession: ORTSession?
override init() {
do {
guard let modelPath = Bundle.main.path(forResource: "yolov8n-pose-pre", ofType: "onnx") else {
fatalError("Model file not found")
}
let ortEnv = try ORTEnv(loggingLevel: ORTLoggingLevel.info)
let ortSessionOptions = try ORTSessionOptions()
try ortSessionOptions.registerCustomOps(functionPointer: RegisterCustomOps) // Requires the bridging header to be registered in Build Settings
ortSession = try ORTSession(
env: ortEnv, modelPath: modelPath, sessionOptions: ortSessionOptions)
} catch {
NSLog("Model initialization error \(error)")
fatalError(error.localizedDescription)
}

}
func plotPose(image: UIImage) -> UIImage{
do {
let inputData = image.pngData()!
let inputDataLength = inputData.count
let inputShape = [NSNumber(integerLiteral: inputDataLength)]
let inputTensor = try ORTValue(tensorData: NSMutableData(data: inputData), elementType:ORTTensorElementDataType.uInt8, shape:inputShape)
let inputNames = try ortSession!.inputNames() // The input names should match the model input names. Visualize the model in Netron
let outputNames = try ortSession!.outputNames() // Check the model output names in Netron
let outputs = try ortSession!.run(
withInputs: [inputNames[0]: inputTensor], outputNames: Set(outputNames), runOptions: nil)

guard let outputTensor = outputs[outputNames[0]] else {
fatalError("Failed to get model keypoint output from inference.")
}
return try convertOutputTensorToImage(opTensor: outputTensor, inputImageData: inputData)

} catch {
print(error)
fatalError("Error in running the ONNX model")
}
}

/**
Helper function to convert the output tensor into an image with the bounding box and keypoint data.
*/
private func convertOutputTensorToImage(opTensor: ORTValue, inputImageData: Data) throws -> UIImage{

let output = try opTensor.tensorData()
var arr2 = Array<Float32>(repeating: 0, count: output.count/MemoryLayout<Float32>.stride) // Do not change the datatype Float32
_ = arr2.withUnsafeMutableBytes { output.copyBytes(to: $0) }

if (arr2.count > 0) {
var keypoints:[Float32] = Array()

// 57 is hardcoded based on the model output: 4 box coordinates, a score and class value, and 17 keypoints x 3 values (x, y, confidence). Refer to the Netron visualization for the output shape
for i in stride(from: arr2.count-57, to: arr2.count, by: 1) {
keypoints.append(arr2[i])
}
let box = keypoints[0..<4] // The first 4 values are the bounding box coordinates (center x, center y, width, height).
// Refer to the run_inference method in yolov8_pose_e2e.py under https://onnxruntime.ai/docs/tutorials/mobile/pose-detection.html
let half_w = box[2] / 2
let half_h = box[3] / 2
let x = Double(box[0] - half_w)
let y = Double(box[1] - half_h)


let rect = CGRect(x: x, y: y, width: Double(half_w * 2), height: Double(half_h * 2))
NSLog("Rect is \(rect)")
let image:UIImage = UIImage(data: inputImageData) ?? UIImage()
let keypointsWithoutBoxes = Array(keypoints[6..<keypoints.count]) // Skip the first 6 values; the rest are 17 keypoints with 3 entries each (x, y, confidence)
return drawKeyPointsOnImage(image: image, rectangle: rect, keypoints: keypointsWithoutBoxes)
} else {
return UIImage(data: inputImageData)!
}
}

/**
Helper function that takes an input image, a bounding box CGRect, and the keypoint data, and returns a new image with the rect and keypoints drawn.
TODO: Avoid generating a new image on every call; instead, paint the data onto the existing image. iOS experts to chime in.

*/
private func drawKeyPointsOnImage(image: UIImage, rectangle:CGRect, keypoints: [Float32]) -> UIImage {
var image = image
let imageSize = image.size
let scale: CGFloat = 0
UIGraphicsBeginImageContextWithOptions(imageSize, false, scale)
image.draw(at: CGPoint.zero)
UIColor.red.setFill()
UIColor.red.setStroke()
UIRectFrame(rectangle)

guard let context = UIGraphicsGetCurrentContext() else { return UIImage() }
context.setLineWidth(2.0)
context.setStrokeColor(UIColor.blue.cgColor)
context.move(to: CGPoint(x: Double(keypoints[0]), y: Double(keypoints[1])))

for i in stride(from: 0, through: keypoints.count-1, by: 3) {
let kp_x = keypoints[i]
let kp_y = keypoints[i+1]
let confidence = keypoints[i+2]
if (confidence < 0.5) { // Can potentially remove hardcoding and make the confidence configurable
continue
}
let rect = CGRect(x: Double(kp_x), y: Double(kp_y), width: 10.0, height: 10.0)
UIRectFill(rect)

}
image = UIGraphicsGetImageFromCurrentImageContext()!
UIGraphicsEndImageContext()
return image
}

/// Placeholder method to draw lines for poses.
private func drawPoseLines(image: UIImage, keypoints: [Float32]) -> UIImage {
var image = image
let imageSize = image.size
let scale: CGFloat = 0
UIGraphicsBeginImageContextWithOptions(imageSize, false, scale)
image.draw(at: CGPoint.zero)
guard let context = UIGraphicsGetCurrentContext() else { return UIImage() }
context.setLineWidth(2.0)
context.setStrokeColor(UIColor.blue.cgColor)


for i in stride(from: 3, through: keypoints.count-1, by: 3) {
context.move(to: CGPoint(x: Double(keypoints[i-3]), y: Double(keypoints[i-2])))
let kp_x = keypoints[i]
let kp_y = keypoints[i+1]
let confidence = keypoints[i+2]
if (confidence < 0.5) { // Can potentially remove hardcoding and make the confidence configurable
continue
}
context.addLine(to: CGPoint(x: Double(kp_x), y: Double(kp_y)))
context.strokePath()

}
image = UIGraphicsGetImageFromCurrentImageContext()!
UIGraphicsEndImageContext()
return image
}
}
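
For context, a hypothetical usage sketch (not part of this PR) showing how OnnxPoseUtils.plotPose(image:) could be exercised from a simple view controller; PosePreviewViewController and the "sample" asset name are assumptions:

import UIKit

final class PosePreviewViewController: UIViewController {
    // Loads yolov8n-pose-pre.onnx from the app bundle in its initializer.
    private let poseUtils = OnnxPoseUtils()
    private let imageView = UIImageView()

    override func viewDidLoad() {
        super.viewDidLoad()
        imageView.frame = view.bounds
        imageView.contentMode = .scaleAspectFit
        view.addSubview(imageView)

        // Assumption: "sample" is a test image added to the asset catalog.
        if let sample = UIImage(named: "sample") {
            imageView.image = poseUtils.plotPose(image: sample)
        }
    }
}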
77 changes: 77 additions & 0 deletions YOLO/VNOnnxHandler.swift
@@ -0,0 +1,77 @@
//
// VNOnnxHandler.swift
// YOLO
//
// Created by Pradeep Banavara on 25/03/24.
// Copyright © 2024 Ultralytics. All rights reserved.
//

import Foundation
import Vision
import CoreGraphics
import VideoToolbox
import UIKit

class VNOnnxHandler {
var sampleBuffer: CVImageBuffer?
var ortSession: ORTSession
init(cvImageBuffer: CVImageBuffer, session: ORTSession) {
sampleBuffer = cvImageBuffer
ortSession = session

}

private func convertImageBufferToData(sampleBuffer: CVPixelBuffer) -> NSData {
let imageBuffer = sampleBuffer
CVPixelBufferLockBaseAddress(imageBuffer, CVPixelBufferLockFlags(rawValue: 0))
let bytesPerRow = CVPixelBufferGetBytesPerRow(imageBuffer)
let height = CVPixelBufferGetHeight(imageBuffer)
let src_buff = CVPixelBufferGetBaseAddress(imageBuffer)
let data = NSData(bytes: src_buff, length: bytesPerRow * height)
CVPixelBufferUnlockBaseAddress(imageBuffer, CVPixelBufferLockFlags(rawValue: 0))
return data
}

/// Runs inference on the input image and returns the output tensor, which can be used to draw bounding boxes and keypoints.
func perform() throws -> ORTValue {
var result: ORTValue?
let uiImage = UIImage(cgImage: CGImage.create(from: sampleBuffer!)!)
let inputData = uiImage.pngData()
let inputDataLength = inputData?.count
let inputShape = [NSNumber(integerLiteral: inputDataLength!)]
let inputTensor = try ORTValue(tensorData: NSMutableData(data: inputData!), elementType:ORTTensorElementDataType.uInt8, shape:inputShape)
let inputNames = try ortSession.inputNames() // The input names should match the model input names. Visualize the model in Netron
let outputNames = try ortSession.outputNames() // Check the model output names in Netron
let outputs = try ortSession.run(
withInputs: [inputNames[0]: inputTensor], outputNames: Set(outputNames), runOptions: nil)
guard let outputTensor = outputs[outputNames[0]] else {
fatalError("Failed to get model keypoint output from inference.")
}
result = outputTensor
return result!
}

/// Handler method that renders the result onto an image layer instead of superimposing the bounding box on the videoPreviewLayer.
/// The superimposing approach is served by the perform() method above.
func performImage(poseUtil: OnnxPoseUtils) throws -> UIImage{
var result: UIImage?
let uiImage = UIImage(cgImage: CGImage.create(from: sampleBuffer!)!)
result = poseUtil.plotPose(image: uiImage)
return result!
}
}

extension CGImage {
static func create(from cvPixelBuffer: CVPixelBuffer?) -> CGImage? {
guard let pixelBuffer = cvPixelBuffer else {
return nil
}
var image: CGImage?
VTCreateCGImageFromCVPixelBuffer(
pixelBuffer,
options: nil,
imageOut: &image)
return image
}
}
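
And a hypothetical sketch (not part of this PR) of how VNOnnxHandler might be driven from an AVCaptureVideoDataOutput delegate; CameraViewController, its poseUtils property, and overlayImageView are assumptions:

import AVFoundation
import UIKit

final class CameraViewController: UIViewController, AVCaptureVideoDataOutputSampleBufferDelegate {
    // Assumed properties: a shared pose utility and an image view layered over the camera preview.
    let poseUtils = OnnxPoseUtils()
    let overlayImageView = UIImageView()

    func captureOutput(_ output: AVCaptureOutput,
                       didOutput sampleBuffer: CMSampleBuffer,
                       from connection: AVCaptureConnection) {
        guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer),
              let session = poseUtils.ortSession else { return }
        let handler = VNOnnxHandler(cvImageBuffer: pixelBuffer, session: session)
        if let annotated = try? handler.performImage(poseUtil: poseUtils) {
            DispatchQueue.main.async { self.overlayImageView.image = annotated }
        }
    }
}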
