Improve Vision Task APIs documentation. #62

Merged (1 commit) on Jul 21, 2020.
@@ -94,7 +94,24 @@ class BaseVisionTaskApi

// Performs image preprocessing on the input frame buffer over the region of
// interest so that it fits model requirements (e.g. upright 224x224 RGB) and
-// populate the corresponding input tensor.
// populate the corresponding input tensor. This is performed by (in this
// order):
// - cropping the frame buffer to the region of interest (which, in most
// cases, just covers the entire input image),
// - resizing it (with bilinear interpolation, aspect-ratio *not* preserved)
// to the dimensions of the model input tensor,
// - converting it to the colorspace of the input tensor (i.e. RGB, which is
// the only supported colorspace for now),
// - rotating it according to its `Orientation` so that inference is performed
// on an "upright" image.
//
// IMPORTANT: as a consequence of cropping occurring first, the provided
// region of interest is expressed in the unrotated frame of reference
// coordinates system, i.e. in `[0, frame_buffer.width) x [0,
// frame_buffer.height)`, which are the dimensions of the underlying
// `frame_buffer` data before any `Orientation` flag gets applied. Also, the
// region of interest is not clamped, so this method will return a non-ok
// status if the region is out of these bounds.
absl::Status Preprocess(const std::vector<TfLiteTensor*>& input_tensors,
const FrameBuffer& frame_buffer,
const BoundingBox& roi) override {
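To make the coordinate convention in the `Preprocess` comment concrete, here is a minimal, self-contained sketch that is not part of this diff. The `Roi` struct and both helper names are hypothetical; the real method takes a `BoundingBox` proto, assumed here to expose `origin_x`/`origin_y`/`width`/`height` fields, and the exact validity rules (e.g. whether zero-sized regions are rejected) are a guess based on the comment.

```c++
// Plain stand-in assumed to mirror the BoundingBox proto fields.
struct Roi {
  int origin_x;
  int origin_y;
  int width;
  int height;
};

// An ROI covering the whole *unrotated* frame buffer, i.e. the common
// "entire input image" case mentioned in the comment above.
Roi FullFrameRoi(int frame_width, int frame_height) {
  return Roi{0, 0, frame_width, frame_height};
}

// A plausible bounds check matching the comment: the ROI must lie within
// `[0, frame_width) x [0, frame_height)` of the unrotated buffer; Preprocess
// is documented to return a non-ok status otherwise (it does not clamp).
bool RoiIsInBounds(const Roi& roi, int frame_width, int frame_height) {
  return roi.origin_x >= 0 && roi.origin_y >= 0 && roi.width > 0 &&
         roi.height > 0 && roi.origin_x + roi.width <= frame_width &&
         roi.origin_y + roi.height <= frame_height;
}
```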
39 changes: 34 additions & 5 deletions tensorflow_lite_support/cc/task/vision/image_classifier.h
@@ -51,13 +51,24 @@ namespace vision {
// - only RGB inputs are supported (`channels` is required to be 3).
// - if type is kTfLiteFloat32, NormalizationOptions are required to be
// attached to the metadata for input normalization.
-// At least one output tensor with `N `classes and either 2 or 4 dimensions:
// At least one output tensor with:
// (kTfLiteUInt8/kTfLiteFloat32)
-// - `[1 x N]`
-// - `[1 x 1 x 1 x N]`
// - `N` classes and either 2 or 4 dimensions, i.e. `[1 x N]` or
// `[1 x 1 x 1 x N]`
// - optional (but recommended) label map(s) as AssociatedFile-s with type
// TENSOR_AXIS_LABELS, containing one label per line. The first such
// AssociatedFile (if any) is used to fill the `class_name` field of the
// results. The `display_name` field is filled from the AssociatedFile (if
// any) whose locale matches the `display_names_locale` field of the
// `ImageClassifierOptions` used at creation time ("en" by default, i.e.
// English). If none of these are available, only the `index` field of the
// results will be filled.
//
// An example of such model can be found at:
// https://tfhub.dev/bohemian-visual-recognition-alliance/lite-model/models/mushroom-identification_v1/1
//
// A CLI demo is available at `examples/vision/desktop/image_classifier_demo.cc`
// and provides example usage.
class ImageClassifier : public BaseVisionTaskApi<ClassificationResult> {
public:
using BaseVisionTaskApi::BaseVisionTaskApi;
@@ -71,11 +82,29 @@ class ImageClassifier : public BaseVisionTaskApi<ClassificationResult> {
absl::make_unique<tflite::ops::builtin::BuiltinOpResolver>());

// Performs actual classification on the provided FrameBuffer.
//
// The FrameBuffer can be of any size and any of the supported formats, i.e.
// RGBA, RGB, NV12, NV21, YV12, YV21. It is automatically pre-processed before
// inference in order to (and in this order):
// - resize it (with bilinear interpolation, aspect-ratio *not* preserved) to
// the dimensions of the model input tensor,
// - convert it to the colorspace of the input tensor (i.e. RGB, which is the
// only supported colorspace for now),
// - rotate it according to its `Orientation` so that inference is performed
// on an "upright" image.
StatusOr<ClassificationResult> Classify(const FrameBuffer& frame_buffer);

// Same as above, except that the classification is performed based on the
-// input region of interest. Note: the region of interest is not clamped, so
-// this method will fail if the region is out of bounds.
// input region of interest. Cropping according to this region of interest is
// prepended to the pre-processing operations.
//
// IMPORTANT: as a consequence of cropping occurring first, the provided
// region of interest is expressed in the unrotated frame of reference
// coordinates system, i.e. in `[0, frame_buffer.width) x [0,
// frame_buffer.height)`, which are the dimensions of the underlying
// `frame_buffer` data before any `Orientation` flag gets applied. Also, the
// region of interest is not clamped, so this method will return a non-ok
// status if the region is out of these bounds.
StatusOr<ClassificationResult> Classify(const FrameBuffer& frame_buffer,
const BoundingBox& roi);

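For readers of this header, the pieces documented above fit together roughly as follows. This is a usage sketch only, not code from the PR: the `tflite::task::vision` namespace, the `CreateFromOptions` factory name (only its default-resolver tail is visible in the hunk above), the `model_file_with_metadata` option, and the exact nesting of the `ClassificationResult` proto are assumptions; only `display_names_locale`, `index`, `class_name`, and `display_name` are named in the comments, and the model path is a placeholder.

```c++
#include <iostream>
#include <memory>
#include <utility>

#include "absl/status/status.h"
#include "tensorflow_lite_support/cc/task/vision/image_classifier.h"

using ::tflite::task::vision::ClassificationResult;
using ::tflite::task::vision::FrameBuffer;
using ::tflite::task::vision::ImageClassifier;
using ::tflite::task::vision::ImageClassifierOptions;

absl::Status ClassifyOnce(const FrameBuffer& frame_buffer) {
  ImageClassifierOptions options;
  // Hypothetical model path; its metadata should carry the label map(s)
  // described above for class_name / display_name to be populated.
  options.mutable_model_file_with_metadata()->set_file_name(
      "/path/to/model_with_metadata.tflite");
  options.set_display_names_locale("en");  // "en" is the documented default.

  auto classifier_or = ImageClassifier::CreateFromOptions(options);
  if (!classifier_or.ok()) return classifier_or.status();
  std::unique_ptr<ImageClassifier> classifier =
      std::move(classifier_or.value());

  // Whole-frame classification; the frame buffer is resized, converted to RGB
  // and rotated upright internally, as described in the Classify() comment.
  auto result_or = classifier->Classify(frame_buffer);
  if (!result_or.ok()) return result_or.status();
  const ClassificationResult result = result_or.value();

  // Result nesting (classifications -> classes) is assumed; index, score and
  // class_name follow the field names mentioned in the comments above.
  if (result.classifications_size() > 0 &&
      result.classifications(0).classes_size() > 0) {
    const auto& top = result.classifications(0).classes(0);
    std::cout << "index=" << top.index() << " score=" << top.score()
              << " class_name=" << top.class_name() << std::endl;
  }
  return absl::OkStatus();
}
```

The ROI overload works the same way, except that the `BoundingBox` passed alongside the frame buffer is expressed in the unrotated frame coordinates and is not clamped, as the comment above stresses.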
26 changes: 21 additions & 5 deletions tensorflow_lite_support/cc/task/vision/image_segmenter.h
@@ -53,14 +53,20 @@ namespace vision {
// `batch` is required to be 1, `mask_width` and `mask_height` are the
// dimensions of the segmentation masks produced by the model, and
// `num_classes` is the number of classes supported by the model.
-// - if label maps are attached to the metadata as TENSOR_AXIS_LABELS
-// associated files, they are used to populate the `class_name` and
-// (optional) `display_name` fields of the segmentation results
-// `colored_labels` field; otherwise these are left empty and only rgb
-// components are set.
// - optional (but recommended) label map(s) can be attached as
// AssociatedFile-s with type TENSOR_AXIS_LABELS, containing one label per
// line. The first such AssociatedFile (if any) is used to fill the
// `class_name` field of the results. The `display_name` field is filled
// from the AssociatedFile (if any) whose locale matches the
// `display_names_locale` field of the `ImageSegmenterOptions` used at
// creation time ("en" by default, i.e. English). If none of these are
// available, only the `index` field of the results will be filled.
//
// An example of such model can be found at:
// https://tfhub.dev/tensorflow/lite-model/deeplabv3/1/metadata/1
//
// A CLI demo is available at `examples/vision/desktop/image_segmenter_demo.cc`
// and provides example usage.
class ImageSegmenter : public BaseVisionTaskApi<SegmentationResult> {
public:
using BaseVisionTaskApi::BaseVisionTaskApi;
@@ -75,6 +81,16 @@ class ImageSegmenter : public BaseVisionTaskApi<SegmentationResult> {

// Performs actual segmentation on the provided FrameBuffer.
//
// The FrameBuffer can be of any size and any of the supported formats, i.e.
// RGBA, RGB, NV12, NV21, YV12, YV21. It is automatically pre-processed before
// inference in order to (and in this order):
// - resize it (with bilinear interpolation, aspect-ratio *not* preserved) to
// the dimensions of the model input tensor,
// - convert it to the colorspace of the input tensor (i.e. RGB, which is the
// only supported colorspace for now),
// - rotate it according to its `Orientation` so that inference is performed
// on an "upright" image.
//
// IMPORTANT: the returned segmentation masks are not directly suited for
// display, in particular:
// * they are relative to the unrotated input frame, i.e. *not* taking into
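The class comment above describes an output tensor of shape `[1 x mask_height x mask_width x num_classes]` (the exact shape line is partly cut off in this hunk, so treat the dimension order as an assumption). The sketch below, which is not from the PR, shows the kind of per-cell argmax a category mask boils down to; `ImageSegmenter`'s actual post-processing may differ in details.

```c++
#include <cstddef>
#include <vector>

// Per-cell argmax over a float tensor assumed to be laid out row-major as
// [1 x mask_height x mask_width x num_classes]. Like the masks returned by
// ImageSegmenter, the result is aligned with the *unrotated* input frame, so
// any `Orientation` handling for display must be applied separately.
std::vector<int> ArgmaxMask(const float* tensor_data, int mask_height,
                            int mask_width, int num_classes) {
  std::vector<int> mask(static_cast<std::size_t>(mask_height) * mask_width, 0);
  for (int y = 0; y < mask_height; ++y) {
    for (int x = 0; x < mask_width; ++x) {
      const float* scores =
          tensor_data +
          (static_cast<std::size_t>(y) * mask_width + x) * num_classes;
      int best_class = 0;
      for (int c = 1; c < num_classes; ++c) {
        if (scores[c] > scores[best_class]) best_class = c;
      }
      mask[static_cast<std::size_t>(y) * mask_width + x] = best_class;
    }
  }
  return mask;
}
```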
42 changes: 38 additions & 4 deletions tensorflow_lite_support/cc/task/vision/object_detector.h
@@ -54,10 +54,14 @@ namespace vision {
// (kTfLiteFloat32)
// - classes tensor of size `[num_results]`, each value representing the
// integer index of a class.
-// - if label maps are attached to the metadata as TENSOR_VALUE_LABELS
-// associated files, they are used to populate the `class_name` and
-// (optional) `display_name` fields of the detection results; otherwise
-// these are left empty and only the `index` field is set.
// - optional (but recommended) label map(s) can be attached as
// AssociatedFile-s with type TENSOR_VALUE_LABELS, containing one label per
// line. The first such AssociatedFile (if any) is used to fill the
// `class_name` field of the results. The `display_name` field is filled
// from the AssociatedFile (if any) whose locale matches the
// `display_names_locale` field of the `ObjectDetectorOptions` used at
// creation time ("en" by default, i.e. English). If none of these are
// available, only the `index` field of the results will be filled.
// (kTfLiteFloat32)
// - scores tensor of size `[num_results]`, each value representing the score
// of the detected object.
@@ -66,6 +70,9 @@ namespace vision {
//
// An example of such model can be found at:
// https://tfhub.dev/google/lite-model/object_detection/mobile_object_localizer_v1/1/metadata/1
//
// A CLI demo is available at `examples/vision/desktop/object_detector_demo.cc`
// and provides example usage.
class ObjectDetector : public BaseVisionTaskApi<DetectionResult> {
public:
using BaseVisionTaskApi::BaseVisionTaskApi;
@@ -79,6 +86,33 @@ class ObjectDetector : public BaseVisionTaskApi<DetectionResult> {
absl::make_unique<tflite::ops::builtin::BuiltinOpResolver>());

// Performs actual detection on the provided FrameBuffer.
//
// The FrameBuffer can be of any size and any of the supported formats, i.e.
// RGBA, RGB, NV12, NV21, YV12, YV21. It is automatically pre-processed
// before inference in order to (and in this order):
// - resize it (with bilinear interpolation, aspect-ratio *not* preserved) to
// the dimensions of the model input tensor,
// - convert it to the colorspace of the input tensor (i.e. RGB, which is the
// only supported colorspace for now),
// - rotate it according to its `Orientation` so that inference is performed
// on an "upright" image.
//
// IMPORTANT: the returned bounding boxes are expressed in the unrotated input
// frame of reference coordinates system, i.e. in `[0, frame_buffer.width) x
// [0, frame_buffer.height)`, which are the dimensions of the underlying
// `frame_buffer` data before any `Orientation` flag gets applied.
//
// In particular, this implies that the returned bounding boxes may not be
// directly suitable for display if the input image is displayed *with* the
// `Orientation` flag taken into account according to the EXIF specification
// (http://jpegclub.org/exif_orientation.html): it may first need to be
// rotated. This is typically true when consuming camera frames on Android or
// iOS.
//
// For example, if the input `frame_buffer` has its `Orientation` flag set to
// `kLeftBottom` (i.e. the image will be rotated 90° clockwise during
// preprocessing to make it "upright"), then the same 90° clockwise rotation
// needs to be applied to the bounding box for display.
StatusOr<DetectionResult> Detect(const FrameBuffer& frame_buffer);

protected:
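The `kLeftBottom` example in the `Detect()` comment translates into a small amount of arithmetic. The sketch below is not from the PR: `Box` is a plain stand-in assumed to mirror the `BoundingBox` proto fields, and the helper only covers the single 90° clockwise case spelled out above. The same reasoning applies to the `Detection.bounding_box` field documented in `detections.proto` further down.

```c++
// Plain stand-in assumed to mirror the BoundingBox proto fields.
struct Box {
  int origin_x;
  int origin_y;
  int width;
  int height;
};

// Maps a box expressed in the *unrotated* frame coordinates to display
// coordinates after the 90° clockwise rotation implied by `kLeftBottom`.
// `frame_height` is the height of the underlying frame_buffer data before
// rotation; after rotation the displayed image is frame_height x frame_width,
// and the box's bottom-left corner becomes its top-left corner.
Box RotateBoxForDisplay90Cw(const Box& box, int frame_height) {
  Box rotated;
  rotated.origin_x = frame_height - box.origin_y - box.height;
  rotated.origin_y = box.origin_x;
  rotated.width = box.height;
  rotated.height = box.width;
  return rotated;
}
```

Other `Orientation` values (180° rotation, flips, etc.) would each need their own mapping; this only illustrates the one case given in the comment.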
17 changes: 17 additions & 0 deletions tensorflow_lite_support/cc/task/vision/proto/detections.proto
@@ -23,6 +23,23 @@ import "tensorflow_lite_support/cc/task/vision/proto/class.proto";
// A single detected object.
message Detection {
// The bounding box.
//
// IMPORTANT: when using the Task APIs, the bounding box is expressed in the
// unrotated input frame of reference coordinates system, i.e. in `[0,
// frame_buffer.width) x [0, frame_buffer.height)`, which are the dimensions
// of the underlying `frame_buffer` data before any `Orientation` flag gets
// applied.
//
// In particular, this implies that the returned bounding boxes may not be
// directly suitable for display if the input image is displayed *with* the
// `Orientation` flag taken into account according to the EXIF specification
// (http://jpegclub.org/exif_orientation.html): it may first need to be
// rotated.
//
// For example, if the input `frame_buffer` has its `Orientation` flag set to
// `kLeftBottom` (i.e. the image will be rotated 90° clockwise during
// preprocessing to make it "upright"), then the same 90° clockwise rotation
// needs to be applied to the bounding box for display.
optional BoundingBox bounding_box = 2;
// The candidate classes, sorted by descending score.
repeated Class classes = 3;