Do not compute metrics for slices with example count less than 'k' (where the value of 'k' is configurable and set to 1 by default).

PiperOrigin-RevId: 249854302
tf-model-analysis-team committed May 24, 2019
1 parent cf5b1fb commit 545106a
Showing 9 changed files with 268 additions and 33 deletions.
2 changes: 2 additions & 0 deletions RELEASE.md
@@ -34,6 +34,8 @@
* Added support for mean absolute error post export metric.
* Added support for mean squared error and root mean squared error post export
metric.
* Added support for not computing metrics for slices with fewer than a given
number of examples.

## Bug fixes and other changes
* Cast / convert labels for precision / recall at K so that they work even if
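For readers skimming the release note, here is a minimal usage sketch of the new option, modeled on the tests further down in this commit. The model and data paths are placeholders, and the top-level `tfma` aliases are assumed to match the package layout at this revision:

```python
import tensorflow_model_analysis as tfma

# Placeholder paths; substitute a real exported EvalSavedModel and a TFRecord
# file of serialized tf.Examples.
eval_shared_model = tfma.default_eval_shared_model(
    eval_saved_model_path='/path/to/eval_saved_model',
    example_weight_key='age')

eval_result = tfma.run_model_analysis(
    eval_shared_model,
    data_location='/path/to/examples.tfrecord',
    slice_spec=[tfma.slicer.SingleSliceSpec(columns=['language'])],
    # New in this change: slices with fewer than 2 examples get an 'error'
    # entry instead of aggregated metrics.
    k_anonymization_count=2)
```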
28 changes: 23 additions & 5 deletions tensorflow_model_analysis/api/model_eval_lib.py
@@ -274,7 +274,8 @@ def default_extractors( # pylint: disable=invalid-name
def default_evaluators( # pylint: disable=invalid-name
eval_shared_model: types.EvalSharedModel,
desired_batch_size: Optional[int] = None,
num_bootstrap_samples: Optional[int] = None) -> List[evaluator.Evaluator]:
num_bootstrap_samples: Optional[int] = None,
k_anonymization_count: int = 1) -> List[evaluator.Evaluator]:
"""Returns the default evaluators for use in ExtractAndEvaluate.
Args:
@@ -283,12 +284,17 @@ def default_evaluators( # pylint: disable=invalid-name
num_bootstrap_samples: Number of bootstrap samples to draw. If more than 1,
confidence intervals will be computed for metrics. Suggested value is at
least 20.
k_anonymization_count: If the number of examples in a specific slice is less
than k_anonymization_count, then an error will be returned for that slice
instead of the computed metrics. This helps preserve privacy by not
displaying aggregated data for slices with too few examples.
"""
return [
metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
eval_shared_model,
desired_batch_size,
num_bootstrap_samples=num_bootstrap_samples)
num_bootstrap_samples=num_bootstrap_samples,
k_anonymization_count=k_anonymization_count)
]


@@ -479,7 +485,8 @@ def ExtractEvaluateAndWriteResults( # pylint: disable=invalid-name
evaluators: Optional[List[evaluator.Evaluator]] = None,
writers: Optional[List[writer.Writer]] = None,
write_config: Optional[bool] = True,
num_bootstrap_samples: Optional[int] = 1) -> beam.pvalue.PDone:
num_bootstrap_samples: Optional[int] = 1,
k_anonymization_count: int = 1) -> beam.pvalue.PDone:
"""PTransform for performing extraction, evaluation, and writing results.
Users who want to construct their own Beam pipelines instead of using the
@@ -533,6 +540,10 @@ def ExtractEvaluateAndWriteResults( # pylint: disable=invalid-name
write_config: True to write the config along with the results.
num_bootstrap_samples: Optional, set to at least 20 in order to calculate
metrics with confidence intervals.
k_anonymization_count: If the number of examples in a specific slice is less
than k_anonymization_count, then an error will be returned for that slice
instead of the computed metrics. This helps preserve privacy by not
displaying aggregated data for slices with too few examples.
Raises:
ValueError: If matching Extractor not found for an Evaluator.
@@ -551,7 +562,8 @@ def ExtractEvaluateAndWriteResults( # pylint: disable=invalid-name
evaluators = default_evaluators(
eval_shared_model=eval_shared_model,
desired_batch_size=desired_batch_size,
num_bootstrap_samples=num_bootstrap_samples)
num_bootstrap_samples=num_bootstrap_samples,
k_anonymization_count=k_anonymization_count)

for v in evaluators:
evaluator.verify_evaluator(v, extractors)
@@ -602,6 +614,7 @@ def run_model_analysis(
write_config: Optional[bool] = True,
pipeline_options: Optional[Any] = None,
num_bootstrap_samples: Optional[int] = 1,
k_anonymization_count: int = 1,
) -> EvalResult:
"""Runs TensorFlow model analysis.
@@ -646,6 +659,10 @@
whether to run directly.
num_bootstrap_samples: Optional, set to at least 20 in order to calculate
metrics with confidence intervals.
k_anonymization_count: If the number of examples in a specific slice is less
than k_anonymization_count, then an error will be returned for that slice
instead of the computed metrics. This helps preserve privacy by not
displaying aggregated data for slices with too few examples.
Returns:
An EvalResult that can be used with the TFMA visualization functions.
@@ -682,7 +699,8 @@ def run_model_analysis(
evaluators=evaluators,
writers=writers,
write_config=write_config,
num_bootstrap_samples=num_bootstrap_samples))
num_bootstrap_samples=num_bootstrap_samples,
k_anonymization_count=k_anonymization_count))
# pylint: enable=no-value-for-parameter

eval_result = load_eval_result(output_path=output_path)
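The threshold itself is enforced inside metrics_and_plots_evaluator, which is not part of this excerpt. As a rough mental model only, the behavior resembles the hypothetical post-processing helper below; the names and data structures are illustrative (they match the error entry asserted in the tests further down), not TFMA's actual implementation:

```python
from typing import Any, Dict, Tuple

SliceKey = Tuple[Tuple[str, Any], ...]


def apply_k_anonymization(
    metrics_per_slice: Dict[SliceKey, Dict[str, Any]],
    example_count_per_slice: Dict[SliceKey, int],
    k_anonymization_count: int = 1) -> Dict[SliceKey, Dict[str, Any]]:
  """Hypothetical helper: suppress metrics for slices with < k examples."""
  result = {}
  for slice_key, metrics in metrics_per_slice.items():
    if example_count_per_slice.get(slice_key, 0) < k_anonymization_count:
      # Mirrors the error entry asserted in model_eval_lib_test.py below.
      result[slice_key] = {
          'error': {
              'debugMessage':
                  'Example count for this slice key is lower than the '
                  'minimum required value: %d. No data is aggregated for '
                  'this slice.' % k_anonymization_count
          }
      }
    else:
      result[slice_key] = metrics
  return result
```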
28 changes: 24 additions & 4 deletions tensorflow_model_analysis/api/model_eval_lib_test.py
@@ -123,18 +123,28 @@ def testRunModelAnalysis(self):
self._makeExample(age=3.0, language='english', label=1.0),
self._makeExample(age=3.0, language='chinese', label=0.0),
self._makeExample(age=4.0, language='english', label=1.0),
self._makeExample(age=5.0, language='chinese', label=1.0)
self._makeExample(age=5.0, language='chinese', label=1.0),
self._makeExample(age=5.0, language='hindi', label=1.0)
]
data_location = self._writeTFExamplesToTFRecords(examples)
slice_spec = [slicer.SingleSliceSpec(columns=['language'])]
eval_result = model_eval_lib.run_model_analysis(
model_eval_lib.default_eval_shared_model(
eval_saved_model_path=model_location, example_weight_key='age'),
data_location,
slice_spec=slice_spec)
slice_spec=slice_spec,
k_anonymization_count=2)
# We only check some of the metrics to ensure that the end-to-end
# pipeline works.
expected = {
(('language', b'hindi'),): {
u'error': {
'debugMessage':
u'Example count for this slice key is lower than the '
u'minimum required value: 2. No data is aggregated for '
u'this slice.'
},
},
(('language', b'chinese'),): {
'accuracy': {
'doubleValue': 0.5
@@ -177,7 +187,8 @@ def testRunModelAnalysisWithUncertainty(self):
self._makeExample(age=3.0, language='english', label=1.0),
self._makeExample(age=3.0, language='chinese', label=0.0),
self._makeExample(age=4.0, language='english', label=1.0),
self._makeExample(age=5.0, language='chinese', label=1.0)
self._makeExample(age=5.0, language='chinese', label=1.0),
self._makeExample(age=5.0, language='hindi', label=1.0)
]
data_location = self._writeTFExamplesToTFRecords(examples)
slice_spec = [slicer.SingleSliceSpec(columns=['language'])]
@@ -186,10 +197,19 @@
eval_saved_model_path=model_location, example_weight_key='age'),
data_location,
slice_spec=slice_spec,
num_bootstrap_samples=20)
num_bootstrap_samples=20,
k_anonymization_count=2)
# We only check some of the metrics to ensure that the end-to-end
# pipeline works.
expected = {
(('language', b'hindi'),): {
u'error': {
'debugMessage':
u'Example count for this slice key is lower than the '
u'minimum required value: 2. No data is aggregated for '
u'this slice.'
},
},
(('language', b'chinese'),): {
metric_keys.EXAMPLE_WEIGHT: {
'doubleValue': 8.0
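On the consuming side, callers reading an EvalResult should expect suppressed slices to carry an 'error' entry instead of metric values. A hedged sketch, assuming eval_result.slicing_metrics yields (slice_key, metrics) pairs as in this version of the API:

```python
# eval_result as returned by tfma.run_model_analysis in the earlier sketch.
for slice_key, metrics in eval_result.slicing_metrics:
  if 'error' in metrics:
    # Slice was suppressed by k_anonymization_count; no aggregated data.
    print('Skipping %s: %s' % (slice_key, metrics['error']['debugMessage']))
    continue
  print(slice_key, metrics.get('accuracy'))
```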
4 changes: 2 additions & 2 deletions tensorflow_model_analysis/api/tfma_unit.py
@@ -283,7 +283,7 @@ def check_metrics(got):

with beam.Pipeline() as pipeline:
# pylint: disable=no-value-for-parameter
metrics, _ = (
(metrics, _), _ = (
pipeline
| 'CreateExamples' >> beam.Create(serialized_examples)
| 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
@@ -371,7 +371,7 @@ def check_metrics(got):
eval_shared_model=eval_shared_model, slice_spec=slice_spec)

# pylint: disable=no-value-for-parameter
metrics, _ = (
(metrics, _), _ = (
examples_pcollection
| 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
| 'Extract' >> Extract(extractors=extractors)
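The tfma_unit.py change above appears to be a consequence of the evaluation step now emitting an additional output alongside the (metrics, plots) pair, so the test helpers must unpack one level deeper. A toy illustration of the unpacking pattern only, with plain strings standing in for the Beam PCollections:

```python
# The evaluation result is now a nested tuple; only the metrics output is
# kept here, everything else is discarded.
evaluation = (('metrics_pcoll', 'plots_pcoll'), 'extra_output')
(metrics, _), _ = evaluation
assert metrics == 'metrics_pcoll'
```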
