
Commit

Merge pull request #14 from tucan9389/new-model/face-parsing
[PR] Support face-parsing semantic segmentation model
tucan9389 committed Mar 17, 2021
2 parents 54db767 + 483f93f commit c8a41ba
Showing 9 changed files with 306 additions and 26 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -68,3 +68,5 @@ fastlane/screenshots/**/*.png
fastlane/test_output

.DS_Store

*.mlmodel
52 changes: 41 additions & 11 deletions README.md
@@ -6,9 +6,9 @@

This project demonstrates object segmentation on iOS with Core ML.<br>If you are interested in iOS + machine learning, visit [here](https://github.com/motlabs/iOS-Proejcts-with-ML-Models) to see various demos.<br>

| DEMO | Screenshot 1 | Screenshot 2 | Screenshot 3 |
| ------------------------------------------------------------ | --------------------------------------------- | --------------------------------------------- | --------------------------------------------- |
| <img src="https://user-images.githubusercontent.com/37643248/99242802-167ad280-2843-11eb-959a-5fe3b169d8f0.gif" width=240px> | <img src="resource/IMG_3633.PNG" width=240px> | <img src="resource/IMG_3632.PNG" width=240px> | <img src="resource/IMG_3635.PNG" width=240px> |
| DeepLabV3-DEMO-1 | FaceParsing-DEMO | DeepLabV3-DEMO-2 | DeepLabV3-DEMO-3 |
| ------------------------------------------------------------ | ------------------------------------------------------------ | --------------------------------------------- | --------------------------------------------- |
| <img src="https://user-images.githubusercontent.com/37643248/99242802-167ad280-2843-11eb-959a-5fe3b169d8f0.gif" width=240px> | <img src="https://user-images.githubusercontent.com/37643248/110972921-e8943d80-839f-11eb-9559-2a32d3b56de0.gif" width=240px> | <img src="resource/IMG_3633.PNG" width=240px> | <img src="resource/IMG_3635.PNG" width=240px> |

## How it works

@@ -22,21 +22,22 @@ This project is Object Segmentation on iOS with Core ML.<br>If you are intereste
- iOS 12.0+
- Swift 5

## Model
## Models

### Download

Download the models from [Apple's model page](https://developer.apple.com/machine-learning/models/).

### Metadata

| | input node | output node | size |
| :--------------: | :---------------------------------: | :-----------------------------------------: | :----: |
| DeepLabV3 | `[1, 513, 513, 3]`<br>name: `image` | `[513, 513]`<br>name: `semanticPredictions` | 8.6 MB |
| DeepLabV3FP16 | `[1, 513, 513, 3]`<br>name: `image` | `[513, 513]`<br>name: `semanticPredictions` | 4.3 MB |
| DeepLabV3Int8LUT | `[1, 513, 513, 3]`<br>name: `image` | `[513, 513]`<br>name: `semanticPredictions` | 2.3 MB |
| Name | Input | Output | Size | iOS version+ | Download |
| :--------------- | :-----------------------: | :----------------------------: | :-----: | :----------: | :----------------------------------------------------------: |
| DeepLabV3 | `Image (Color 513 × 513)` | `MultiArray (Int32 513 × 513)` | 8.6 MB | iOS 12.0+ | [link](https://developer.apple.com/machine-learning/models/) |
| DeepLabV3FP16 | `Image (Color 513 × 513)` | `MultiArray (Int32 513 × 513)` | 4.3 MB | iOS 12.0+ | [link](https://developer.apple.com/machine-learning/models/) |
| DeepLabV3Int8LUT | `Image (Color 513 × 513)` | `MultiArray (Int32 513 × 513)` | 2.3 MB | iOS 12.0+ | [link](https://developer.apple.com/machine-learning/models/) |
| FaceParsing | `Image (Color 512 × 512)` | `MultiArray (Int32 512 × 512)` | 52.7 MB | iOS 14.0+ | [link](https://github.com/tucan9389/SemanticSegmentation-CoreML/releases/download/support-face-parsing/FaceParsing.mlmodel) |
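
For orientation, here is a minimal sketch of running one of these models through Vision — assuming the `DeepLabV3` class that Xcode generates from the bundled `.mlmodel`, as the view controllers in this repo do:

```swift
import CoreML
import Vision

// A minimal sketch, not the app's full pipeline: load the generated model,
// wrap it for Vision, and read the per-pixel class indices from the output.
func runSegmentation(on cgImage: CGImage) throws {
    let coreMLModel = try DeepLabV3(configuration: MLModelConfiguration()).model
    let visionModel = try VNCoreMLModel(for: coreMLModel)
    let request = VNCoreMLRequest(model: visionModel) { request, _ in
        guard let observations = request.results as? [VNCoreMLFeatureValueObservation],
              let segmentationMap = observations.first?.featureValue.multiArrayValue else { return }
        print(segmentationMap.shape) // [513, 513] Int32 class indices, one per pixel
    }
    request.imageCropAndScaleOption = .scaleFill
    try VNImageRequestHandler(cgImage: cgImage, options: [:]).perform([request])
}
```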

### Inference Time
### Inference Time − DeepLabV3

| Device | Inference Time | Total Time (GPU) | Total Time (CPU) |
| ----------------- | :------------: | :--------------: | :--------------: |
@@ -60,9 +61,38 @@ Download model from [apple's model page](https://developer.apple.com/machine-lea

⏲: need to measure

### Inference Time − FaceParsing

| Device | Inference Time | Total Time (GPU) | Total Time (CPU) |
| ------------- | :------------: | :--------------: | :--------------: |
| iPhone 12 Pro | ⏲ | ⏲ | ⏲ |
| iPhone 11 Pro | 37 ms | 37 ms | ⏲ |
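
The inference times above are wall-clock measurements on device. A minimal sketch of the kind of timing helper involved — an assumption for illustration; this repo ships its own `Measure.swift`:

```swift
import QuartzCore

// Times a block and prints elapsed milliseconds. A hedged stand-in for
// the repo's Measure.swift, shown only to make the tables above concrete.
func measure<T>(_ label: String, _ block: () throws -> T) rethrows -> T {
    let start = CACurrentMediaTime()
    defer { print("\(label): \((CACurrentMediaTime() - start) * 1000) ms") }
    return try block()
}

// Usage: measure("inference") { try handler.perform([request]) }
```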

### Labels − DeepLabV3

```
# total 21
["background", "aeroplane", "bicycle", "bird", "boat",
"bottle", "bus", "car", "cat", "chair",
"cow", "diningtable", "dog", "horse", "motorbike",
"person", "pottedplant", "sheep", "sofa", "train",
"tv"]
```

### Labels − FaceParsing

```
# total 19
["background", "skin", "l_brow", "r_brow", "l_eye",
"r_eye", "eye_g", "l_ear", "r_ear", "ear_r",
"nose", "mouth", "u_lip", "l_lip", "neck",
"neck_l", "cloth", "hair", "hat"]
```
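
Both models emit one class index per pixel; the lists above map those indices to names. A small sketch of that lookup — the helper name and layout assumption are mine, not this repo's API:

```swift
import CoreML

let faceParsingLabels = ["background", "skin", "l_brow", "r_brow", "l_eye",
                         "r_eye", "eye_g", "l_ear", "r_ear", "ear_r",
                         "nose", "mouth", "u_lip", "l_lip", "neck",
                         "neck_l", "cloth", "hair", "hat"]

// Reads the class label at (row, col), assuming the usual row-major
// [height, width] layout of the Int32 segmentation map.
func label(at row: Int, _ col: Int, in map: MLMultiArray) -> String {
    let width = map.shape[1].intValue
    return faceParsingLabels[map[row * width + col].intValue]
}
```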

## See also

- [motlabs/iOS-Proejcts-with-ML-Models](https://github.com/motlabs/iOS-Proejcts-with-ML-Models)<br>
: A challenge of using machine learning models created with TensorFlow on iOS
- [deeplab on TensorFlow](https://github.com/tensorflow/models/tree/master/research/deeplab)<br>
- [DeepLab on TensorFlow](https://github.com/tensorflow/models/tree/master/research/deeplab)<br>
: The repository providing the DeepLabV3 model
- [FaceParsing](https://github.com/zllrunning/face-parsing.PyTorch)<br>: The repository providing the FaceParsing PyTorch model
18 changes: 16 additions & 2 deletions SemanticSegmentation-CoreML.xcodeproj/project.pbxproj
@@ -20,6 +20,7 @@
71BBE06222E3400E00E74F11 /* VideoCapture.swift in Sources */ = {isa = PBXBuildFile; fileRef = 71BBE06022E3400E00E74F11 /* VideoCapture.swift */; };
71BBE06322E3400E00E74F11 /* Measure.swift in Sources */ = {isa = PBXBuildFile; fileRef = 71BBE06122E3400E00E74F11 /* Measure.swift */; };
71BBE06722E3446300E74F11 /* SegmentationResultMLMultiArray.swift in Sources */ = {isa = PBXBuildFile; fileRef = 71BBE06622E3446300E74F11 /* SegmentationResultMLMultiArray.swift */; };
C4052DC025EFE8960040F98D /* MaskTextureGenerater.swift in Sources */ = {isa = PBXBuildFile; fileRef = C4052DBF25EFE8960040F98D /* MaskTextureGenerater.swift */; };
C4BB0D92256195AE00354C08 /* MetalRenderingDevice.swift in Sources */ = {isa = PBXBuildFile; fileRef = C4BB0D91256195AE00354C08 /* MetalRenderingDevice.swift */; };
C4BB0D96256195F800354C08 /* Maths.swift in Sources */ = {isa = PBXBuildFile; fileRef = C4BB0D95256195F800354C08 /* Maths.swift */; };
C4BB0D99256196A300354C08 /* CameraTextureGenerater.swift in Sources */ = {isa = PBXBuildFile; fileRef = C4BB0D98256196A300354C08 /* CameraTextureGenerater.swift */; };
@@ -29,6 +30,8 @@
C4BB0DA625619AA400354C08 /* MetalVideoView.swift in Sources */ = {isa = PBXBuildFile; fileRef = C4BB0DA525619AA400354C08 /* MetalVideoView.swift */; };
C4BB0DA925619C0400354C08 /* LiveMetalCameraViewController.swift in Sources */ = {isa = PBXBuildFile; fileRef = C4BB0DA825619C0400354C08 /* LiveMetalCameraViewController.swift */; };
C4BB0DB52561A47900354C08 /* Shaders.metal in Sources */ = {isa = PBXBuildFile; fileRef = C4BB0DB42561A47900354C08 /* Shaders.metal */; };
C4DDEF4925FB779D000CF6A5 /* MultitargetSegmentationTextureGenerater.swift in Sources */ = {isa = PBXBuildFile; fileRef = C4DDEF4825FB779D000CF6A5 /* MultitargetSegmentationTextureGenerater.swift */; };
C4DDEF7525FBCBC8000CF6A5 /* FaceParsing.mlmodel in Sources */ = {isa = PBXBuildFile; fileRef = C4DDEF7425FBCBC8000CF6A5 /* FaceParsing.mlmodel */; };
/* End PBXBuildFile section */

/* Begin PBXFileReference section */
@@ -47,6 +50,7 @@
71BBE06022E3400E00E74F11 /* VideoCapture.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = VideoCapture.swift; sourceTree = "<group>"; };
71BBE06122E3400E00E74F11 /* Measure.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Measure.swift; sourceTree = "<group>"; };
71BBE06622E3446300E74F11 /* SegmentationResultMLMultiArray.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = SegmentationResultMLMultiArray.swift; sourceTree = "<group>"; };
C4052DBF25EFE8960040F98D /* MaskTextureGenerater.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MaskTextureGenerater.swift; sourceTree = "<group>"; };
C4BB0D91256195AE00354C08 /* MetalRenderingDevice.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MetalRenderingDevice.swift; sourceTree = "<group>"; };
C4BB0D95256195F800354C08 /* Maths.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Maths.swift; sourceTree = "<group>"; };
C4BB0D98256196A300354C08 /* CameraTextureGenerater.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CameraTextureGenerater.swift; sourceTree = "<group>"; };
@@ -56,6 +60,8 @@
C4BB0DA525619AA400354C08 /* MetalVideoView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MetalVideoView.swift; sourceTree = "<group>"; };
C4BB0DA825619C0400354C08 /* LiveMetalCameraViewController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LiveMetalCameraViewController.swift; sourceTree = "<group>"; };
C4BB0DB42561A47900354C08 /* Shaders.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Shaders.metal; sourceTree = "<group>"; };
C4DDEF4825FB779D000CF6A5 /* MultitargetSegmentationTextureGenerater.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MultitargetSegmentationTextureGenerater.swift; sourceTree = "<group>"; };
C4DDEF7425FBCBC8000CF6A5 /* FaceParsing.mlmodel */ = {isa = PBXFileReference; lastKnownFileType = file.mlmodel; path = FaceParsing.mlmodel; sourceTree = "<group>"; };
/* End PBXFileReference section */

/* Begin PBXFrameworksBuildPhase section */
@@ -112,6 +118,7 @@
71BBE05522E33BEB00E74F11 /* DeepLabV3.mlmodel */,
71BBE05722E33BEF00E74F11 /* DeepLabV3FP16.mlmodel */,
71BBE05922E33BF300E74F11 /* DeepLabV3Int8LUT.mlmodel */,
C4DDEF7425FBCBC8000CF6A5 /* FaceParsing.mlmodel */,
);
path = mlmodel;
sourceTree = "<group>";
@@ -123,7 +130,9 @@
C4BB0DA525619AA400354C08 /* MetalVideoView.swift */,
C4BB0D98256196A300354C08 /* CameraTextureGenerater.swift */,
C4BB0D9F2561983C00354C08 /* SegmentationTextureGenerater.swift */,
C4DDEF4825FB779D000CF6A5 /* MultitargetSegmentationTextureGenerater.swift */,
C4BB0DA2256199B200354C08 /* OverlayingTexturesGenerater.swift */,
C4052DBF25EFE8960040F98D /* MaskTextureGenerater.swift */,
C4BB0D9B256196ED00354C08 /* Texture.swift */,
C4BB0DB32561A46B00354C08 /* Shaders */,
C4BB0D94256195E800354C08 /* Utils */,
@@ -225,11 +234,14 @@
71BBE06322E3400E00E74F11 /* Measure.swift in Sources */,
C4BB0DA625619AA400354C08 /* MetalVideoView.swift in Sources */,
71BBE05822E33BEF00E74F11 /* DeepLabV3FP16.mlmodel in Sources */,
C4DDEF4925FB779D000CF6A5 /* MultitargetSegmentationTextureGenerater.swift in Sources */,
71BBE06722E3446300E74F11 /* SegmentationResultMLMultiArray.swift in Sources */,
C4BB0DA925619C0400354C08 /* LiveMetalCameraViewController.swift in Sources */,
C4BB0D9C256196ED00354C08 /* Texture.swift in Sources */,
71BBE05622E33BEB00E74F11 /* DeepLabV3.mlmodel in Sources */,
C4052DC025EFE8960040F98D /* MaskTextureGenerater.swift in Sources */,
71BBE05C22E33C6C00E74F11 /* StillImageViewController.swift in Sources */,
C4DDEF7525FBCBC8000CF6A5 /* FaceParsing.mlmodel in Sources */,
C4BB0D96256195F800354C08 /* Maths.swift in Sources */,
C4BB0D92256195AE00354C08 /* MetalRenderingDevice.swift in Sources */,
71BBE04622E33B2500E74F11 /* LiveImageViewController.swift in Sources */,
@@ -384,14 +396,16 @@
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
CODE_SIGN_STYLE = Automatic;
DEVELOPMENT_TEAM = 5WXJ4Z4H69;
GCC_OPTIMIZATION_LEVEL = s;
INFOPLIST_FILE = "SemanticSegmentation-CoreML/Info.plist";
IPHONEOS_DEPLOYMENT_TARGET = 12.0;
IPHONEOS_DEPLOYMENT_TARGET = 12.1;
LD_RUNPATH_SEARCH_PATHS = (
"$(inherited)",
"@executable_path/Frameworks",
);
PRODUCT_BUNDLE_IDENTIFIER = "com.tucan9389.SemanticSegmentation-CoreML";
PRODUCT_NAME = "$(TARGET_NAME)";
SWIFT_OPTIMIZATION_LEVEL = "-O";
SWIFT_VERSION = 5.0;
TARGETED_DEVICE_FAMILY = "1,2";
};
@@ -404,7 +418,7 @@
CODE_SIGN_STYLE = Automatic;
DEVELOPMENT_TEAM = 5WXJ4Z4H69;
INFOPLIST_FILE = "SemanticSegmentation-CoreML/Info.plist";
IPHONEOS_DEPLOYMENT_TARGET = 12.0;
IPHONEOS_DEPLOYMENT_TARGET = 12.1;
LD_RUNPATH_SEARCH_PATHS = (
"$(inherited)",
"@executable_path/Frameworks",
5 changes: 4 additions & 1 deletion SemanticSegmentation-CoreML/LiveImageViewController.swift
@@ -24,7 +24,10 @@ class LiveImageViewController: UIViewController {

// MARK: - Core ML model
// DeepLabV3(iOS12+), DeepLabV3FP16(iOS12+), DeepLabV3Int8LUT(iOS12+)
let segmentationModel = DeepLabV3Int8LUT()
// FaceParsing(iOS14+)
lazy var segmentationModel = {
return try! DeepLabV3()
}()

// 11 Pro
// DeepLabV3 : 37 465 1
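Since FaceParsing needs iOS 14+ while the DeepLab variants run on iOS 12+, one hedged way to switch models at runtime — assuming the Xcode-generated `FaceParsing` class from the bundled `FaceParsing.mlmodel` — looks like this:

```swift
import CoreML

class SegmentationController {
    // A sketch, not the code this PR ships: fall back to DeepLabV3 when the
    // FaceParsing model's minimum OS version is unavailable.
    lazy var segmentationModel: MLModel = {
        if #available(iOS 14.0, *) {
            return try! FaceParsing(configuration: MLModelConfiguration()).model
        }
        return try! DeepLabV3(configuration: MLModelConfiguration()).model
    }()
}
```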
17 changes: 10 additions & 7 deletions SemanticSegmentation-CoreML/LiveMetalCameraViewController.swift
@@ -20,7 +20,7 @@ class LiveMetalCameraViewController: UIViewController {
@IBOutlet weak var fpsLabel: UILabel!

var cameraTextureGenerater = CameraTextureGenerater()
var segmentationTextureGenerater = SegmentationTextureGenerater()
var multitargetSegmentationTextureGenerater = MultitargetSegmentationTextureGenerater()
var overlayingTexturesGenerater = OverlayingTexturesGenerater()

var cameraTexture: Texture?
@@ -30,8 +30,14 @@
var videoCapture: VideoCapture!

// MARK: - Core ML model
// DeepLabV3(iOS12+), DeepLabV3FP16(iOS12+), DeepLabV3Int8LUT(iOS12+)
let segmentationModel = DeepLabV3()
/// DeepLabV3(iOS12+), DeepLabV3FP16(iOS12+), DeepLabV3Int8LUT(iOS12+)
/// - labels: ["background", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tv"]
/// - number of labels: 21
/// FaceParsing(iOS14+)
/// - labels: ["background", "skin", "l_brow", "r_brow", "l_eye", "r_eye", "eye_g", "l_ear", "r_ear", "ear_r", "nose", "mouth", "u_lip", "l_lip", "neck", "neck_l", "cloth", "hair", "hat"]
/// - number of labels: 19
lazy var segmentationModel = { return try! DeepLabV3() }()
let numberOfLabels = 21 // <#If you change the segmentationModel, you must also change numberOfLabels#>

// MARK: - Vision Properties
var request: VNCoreMLRequest?
@@ -138,16 +144,13 @@

if let observations = request.results as? [VNCoreMLFeatureValueObservation],
let segmentationmap = observations.first?.featureValue.multiArrayValue {

guard let row = segmentationmap.shape[0] as? Int,
let col = segmentationmap.shape[1] as? Int else {
return
}

let targetClass = 15 // index of human category

guard let cameraTexture = cameraTexture,
let segmentationTexture = segmentationTextureGenerater.texture(segmentationmap, row, col, targetClass) else {
let segmentationTexture = multitargetSegmentationTextureGenerater.texture(segmentationmap, row, col, numberOfLabels) else {
return
}

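For context, the `request` property above is created once from the segmentation model before frames arrive; a minimal sketch of that setup, with names that are assumptions rather than this file's exact code:

```swift
import Vision

// Hypothetical one-time Vision setup feeding visionRequestDidComplete.
extension LiveMetalCameraViewController {
    func setUpModel() {
        guard let visionModel = try? VNCoreMLModel(for: segmentationModel.model) else {
            fatalError("Could not create VNCoreMLModel")
        }
        request = VNCoreMLRequest(model: visionModel, completionHandler: visionRequestDidComplete)
        request?.imageCropAndScaleOption = .scaleFill
    }
}
```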
@@ -20,8 +20,7 @@ class CameraTextureGenerater: NSObject {
CVMetalTextureCacheCreate(kCFAllocatorDefault, nil, sharedMetalRenderingDevice.device, nil, &videoTextureCache)
}

func texture(from sampleBuffer: CMSampleBuffer) -> Texture? {
guard let cameraFrame = CMSampleBufferGetImageBuffer(sampleBuffer) else { return nil }
func texture(from cameraFrame: CVPixelBuffer) -> Texture? {
guard let videoTextureCache = videoTextureCache else { return nil }

let bufferWidth = CVPixelBufferGetWidth(cameraFrame)
@@ -44,4 +43,9 @@
return nil
}
}

func texture(from sampleBuffer: CMSampleBuffer) -> Texture? {
guard let cameraFrame = CMSampleBufferGetImageBuffer(sampleBuffer) else { return nil }
return texture(from: cameraFrame)
}
}
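
The refactor keeps the original `CMSampleBuffer` entry point but routes it through a new `CVPixelBuffer` overload, so callers that already hold a pixel buffer can skip the unwrap. A hypothetical call site:

```swift
import AVFoundation

// Illustrative only: both overloads produce the same Texture.
func handle(_ sampleBuffer: CMSampleBuffer, with generater: CameraTextureGenerater) {
    let fromSample = generater.texture(from: sampleBuffer)
    if let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) {
        let fromPixels = generater.texture(from: pixelBuffer)
        _ = (fromSample, fromPixels) // e.g. hand one to the render pipeline
    }
}
```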
