tensorflow · YknZhu · Mar 23, 2018 · Mar 23, 2018
diff --git a/research/deeplab/README.md b/research/deeplab/README.md
@@ -28,7 +28,9 @@ features:
     convolution to trade-off precision and runtime.
 
 If you find the code useful for your research, please consider citing our latest
-work:
+works:
+
+*   DeepLabv3+:
 
 ```
 @article{deeplabv3plus2018,
@@ -39,11 +41,21 @@ work:
 }
 ```
 
+*   MobileNetv2:
+
+```
+@inproceedings{mobilenetv22018,
+  title={Inverted Residuals and Linear Bottlenecks: Mobile Networks for Classification, Detection and Segmentation},
+  author={Mark Sandler and Andrew Howard and Menglong Zhu and Andrey Zhmoginov and Liang-Chieh Chen},
+  booktitle={CVPR},
+  year={2018}
+}
+```
+
 In the current implementation, we support adopting the following network
 backbones:
 
-1.  MobileNetv2 [8]: A fast network structure designed for mobile devices. **We
-    will provide MobileNetv2 support in the next update. Please stay tuned.**
+1.  MobileNetv2 [8]: A fast network structure designed for mobile devices.
 
 2.  Xception [9, 10]: A powerful network structure intended for server-side
     deployment.
@@ -71,7 +83,7 @@ Some segmentation results on Flickr images:
 
 Demo:
 
-*   <a href='deeplab_demo.ipynb'>Jupyter notebook for off-the-shelf inference.</a><br>
+*   <a href='https://colab.sandbox.google.com/github/tensorflow/models/blob/master/research/deeplab/deeplab_demo.ipynb'>Colab notebook for off-the-shelf inference.</a><br>
 
 Running:
 

diff --git a/research/deeplab/common.py b/research/deeplab/common.py
@@ -39,11 +39,11 @@
                      'The kernel size for the convolutional kernel that '
                      'generates logits.')
 
-# We will support `mobilenet_v2' in the coming update. When using
-# 'xception_65', we set atrous_rates = [6, 12, 18] (output stride 16) and
-# decoder_output_stride = 4.
-flags.DEFINE_enum('model_variant', 'xception_65', ['xception_65'],
-                  'DeepLab model variants.')
+# When using 'mobilenet_v2', we set atrous_rates = decoder_output_stride = None.
+# When using 'xception_65', we set atrous_rates = [6, 12, 18] (output stride 16)
+# and decoder_output_stride = 4.
+flags.DEFINE_enum('model_variant', 'mobilenet_v2',
+                  ['xception_65', 'mobilenet_v2'], 'DeepLab model variant.')
 
 flags.DEFINE_multi_float('image_pyramid', None,
                          'Input scales for multi-scale feature extraction.')
@@ -60,7 +60,12 @@
 flags.DEFINE_multi_integer('multi_grid', None,
                            'Employ a hierarchy of atrous rates for ResNet.')
 
-# For `xception_65`, use decoder_output_stride = 4.
+flags.DEFINE_float('depth_multiplier', 1.0,
+                   'Multiplier for the depth (number of channels) for all '
+                   'convolution ops used in MobileNet.')
+
+# For `xception_65`, use decoder_output_stride = 4. For `mobilenet_v2`, use
+# decoder_output_stride = None.
 flags.DEFINE_integer('decoder_output_stride', None,
                      'The ratio of input to output spatial resolution when '
                      'employing decoder to refine segmentation results.')

diff --git a/research/deeplab/core/feature_extractor.py b/research/deeplab/core/feature_extractor.py
@@ -18,18 +18,62 @@
 import tensorflow as tf
 
 from deeplab.core import xception
+from nets.mobilenet import mobilenet as mobilenet_lib
+from nets.mobilenet import mobilenet_v2
 
 
 slim = tf.contrib.slim
 
+# Default end point for MobileNetv2.
+_MOBILENET_V2_FINAL_ENDPOINT = 'layer_18'
+
+
+def _mobilenet_v2(net,
+                  depth_multiplier,
+                  output_stride,
+                  reuse=None,
+                  scope=None,
+                  final_endpoint=None):
+  """Auxiliary function to add support for 'reuse' to mobilenet_v2.
+
+  Args:
+    net: Input tensor of shape [batch_size, height, width, channels].
+    depth_multiplier: Float multiplier for the depth (number of channels)
+      for all convolution ops. The value must be greater than zero. Typical
+      usage will be to set this value in (0, 1) to reduce the number of
+      parameters or computation cost of the model.
+    output_stride: An integer that specifies the requested ratio of input to
+      output spatial resolution. If not None, then we invoke atrous convolution
+      if necessary to prevent the network from reducing the spatial resolution
+      of the activation maps. Allowed values are 8 (accurate fully convolutional
+      mode), 16 (fast fully convolutional mode), 32 (classification mode).
+    reuse: Reuse model variables.
+    scope: Optional variable scope.
+    final_endpoint: The endpoint to construct the network up to.
+
+  Returns:
+    Features extracted by MobileNetv2.
+  """
+  with tf.variable_scope(
+      scope, 'MobilenetV2', [net], reuse=reuse) as scope:
+    return mobilenet_lib.mobilenet_base(
+        net,
+        conv_defs=mobilenet_v2.V2_DEF,
+        multiplier=depth_multiplier,
+        final_endpoint=final_endpoint or _MOBILENET_V2_FINAL_ENDPOINT,
+        output_stride=output_stride,
+        scope=scope)
+
 
 # A map from network name to network function.
 networks_map = {
+    'mobilenet_v2': _mobilenet_v2,
     'xception_65': xception.xception_65,
 }
 
 # A map from network name to network arg scope.
 arg_scopes_map = {
+    'mobilenet_v2': mobilenet_v2.training_scope,
     'xception_65': xception.xception_arg_scope,
 }
 
@@ -38,6 +82,10 @@
 
 # A dictionary from network name to a map of end point features.
 networks_to_feature_maps = {
+    'mobilenet_v2': {
+        # The provided checkpoint does not include a decoder module.
+        DECODER_END_POINTS: None,
+    },
     'xception_65': {
         DECODER_END_POINTS: [
             'entry_flow/block2/unit_1/xception_module/'
@@ -49,6 +97,7 @@
 # A map from feature extractor name to the network name scope used in the
 # ImageNet pretrained versions of these models.
 name_scope = {
+    'mobilenet_v2': 'MobilenetV2',
     'xception_65': 'xception_65',
 }
 
@@ -68,6 +117,7 @@ def _preprocess_zero_mean_unit_range(inputs):
 
 
 _PREPROCESS_FN = {
+    'mobilenet_v2': _preprocess_zero_mean_unit_range,
     'xception_65': _preprocess_zero_mean_unit_range,
 }
 
@@ -99,6 +149,8 @@ def mean_pixel(model_variant=None):
 def extract_features(images,
                      output_stride=8,
                      multi_grid=None,
+                     depth_multiplier=1.0,
+                     final_endpoint=None,
                      model_variant=None,
                      weight_decay=0.0001,
                      reuse=None,
@@ -114,6 +166,9 @@ def extract_features(images,
     images: A tensor of size [batch, height, width, channels].
     output_stride: The ratio of input to output spatial resolution.
     multi_grid: Employ a hierarchy of different atrous rates within network.
+    depth_multiplier: Float multiplier for the depth (number of channels)
+      for all convolution ops used in MobileNet.
+    final_endpoint: The MobileNet endpoint to construct the network up to.
     model_variant: Model variant for feature extraction.
     weight_decay: The weight decay for model variables.
     reuse: Reuse the model variables or not.
@@ -159,7 +214,17 @@ def extract_features(images,
             reuse=reuse,
             scope=name_scope[model_variant])
   elif 'mobilenet' in model_variant:
-    raise ValueError('MobileNetv2 support is coming soon.')
+    arg_scope = arg_scopes_map[model_variant](
+        is_training=(is_training and fine_tune_batch_norm),
+        weight_decay=weight_decay)
+    features, end_points = get_network(
+        model_variant, preprocess_images, arg_scope)(
+            inputs=images,
+            depth_multiplier=depth_multiplier,
+            output_stride=output_stride,
+            reuse=reuse,
+            scope=name_scope[model_variant],
+            final_endpoint=final_endpoint)
   else:
     raise ValueError('Unknown model variant %s.' % model_variant)
 

diff --git a/research/deeplab/datasets/convert_cityscapes.sh b/research/deeplab/datasets/convert_cityscapes.sh
@@ -14,19 +14,21 @@
 # limitations under the License.
 # ==============================================================================
 #
-# Script to preprocess the Cityscapes dataset. Note (1) the users should register
-# the Cityscapes dataset website: https://www.cityscapes-dataset.com/downloads/ to
-# download the dataset, and (2) the users should run the script provided by Cityscapes
-# `preparation/createTrainIdLabelImgs.py` to generate the training groundtruth.
+# Script to preprocess the Cityscapes dataset. Note (1) the users should
+# register on the Cityscapes dataset website at
+# https://www.cityscapes-dataset.com/downloads/ to download the dataset,
+# and (2) the users should download the utility scripts provided by
+# Cityscapes at https://github.com/mcordts/cityscapesScripts.
 #
 # Usage:
 #   bash ./preprocess_cityscapes.sh
 #
 # The folder structure is assumed to be:
-#  + data
+#  + datasets
 #    - build_cityscapes_data.py
+#    - convert_cityscapes.sh
 #    + cityscapes
-#      + cityscapesscripts
+#      + cityscapesscripts (downloaded scripts)
 #      + gtFine
 #      + leftImg8bit
 #
@@ -37,17 +39,18 @@ set -e
 CURRENT_DIR=$(pwd)
 WORK_DIR="."
 
-cd "${CURRENT_DIR}"
-
-# Root path for PASCAL VOC 2012 dataset.
+# Root path for Cityscapes dataset.
 CITYSCAPES_ROOT="${WORK_DIR}/cityscapes"
 
+# Create training labels.
+python "${CITYSCAPES_ROOT}/cityscapesscripts/preparation/createTrainIdLabelImgs.py"
+
 # Build TFRecords of the dataset.
 # First, create output directory for storing TFRecords.
 OUTPUT_DIR="${CITYSCAPES_ROOT}/tfrecord"
 mkdir -p "${OUTPUT_DIR}"
 
-BUILD_SCRIPT="${WORK_DIR}/build_cityscapes_data.py"
+BUILD_SCRIPT="${CURRENT_DIR}/build_cityscapes_data.py"
 
 echo "Converting Cityscapes dataset..."
 python "${BUILD_SCRIPT}" \

diff --git a/research/deeplab/datasets/download_and_convert_voc2012.sh b/research/deeplab/datasets/download_and_convert_voc2012.sh
@@ -20,15 +20,16 @@
 #   bash ./download_and_preprocess_voc2012.sh
 #
 # The folder structure is assumed to be:
-#  + data
+#  + datasets
 #     - build_data.py
 #     - build_voc2012_data.py
 #     - download_and_preprocess_voc2012.sh
 #     - remove_gt_colormap.py
-#     + VOCdevkit
-#       + VOC2012
-#         + JPEGImages
-#         + SegmentationClass
+#     + pascal_voc_seg
+#       + VOCdevkit
+#         + VOC2012
+#           + JPEGImages
+#           + SegmentationClass
 #
 
 # Exit immediately if a command exits with a non-zero status.