From 0270cac779f61a879f74d9495b9156f8f4127478 Mon Sep 17 00:00:00 2001 From: Qianli Scott Zhu Date: Fri, 11 May 2018 13:11:21 -0700 Subject: [PATCH] Add benchmark logger that does stream upload to bigquery. (#4210) * Move the benchmark_uploader to new location. * Update benchmark logger to streaming upload. * Fix lint and unit test error. * delint. * Update the benchmark uploader test. Skip the import of benchmark_uploader when bigquery is not installed. * Merge the 2 classes of benchmark uploader into 1. * Address review comments. * delint. * Execute bigquery upload in a separate thread. * Change to use python six.moves for importing. * Address review comments and delint. * Address review comment. Adding comment for potential performance impact for model on CPU. * Fix random failure on py3. * Fix the order of flag saver to avoid the randomness. The test is broken when the benchmark_logger_type is set first, and validated when the benchmark_log_dir is not set yet. --- official/benchmark/__init__.py | 0 .../logs => benchmark}/benchmark_uploader.py | 123 +++++++------- official/benchmark/benchmark_uploader_main.py | 62 +++++++ official/benchmark/benchmark_uploader_test.py | 107 ++++++++++++ official/resnet/resnet_run_loop.py | 5 +- official/utils/flags/_benchmark.py | 20 ++- official/utils/logs/hooks_helper.py | 5 +- official/utils/logs/hooks_helper_test.py | 3 +- official/utils/logs/logger.py | 157 +++++++++++++----- official/utils/logs/logger_test.py | 84 +++++++++- 10 files changed, 450 insertions(+), 116 deletions(-) create mode 100644 official/benchmark/__init__.py rename official/{utils/logs => benchmark}/benchmark_uploader.py (50%) create mode 100644 official/benchmark/benchmark_uploader_main.py create mode 100644 official/benchmark/benchmark_uploader_test.py diff --git a/official/benchmark/__init__.py b/official/benchmark/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/official/utils/logs/benchmark_uploader.py b/official/benchmark/benchmark_uploader.py similarity index 50% rename from official/utils/logs/benchmark_uploader.py rename to official/benchmark/benchmark_uploader.py index 1ba09edf770..c518fd07b2b 100644 --- a/official/utils/logs/benchmark_uploader.py +++ b/official/benchmark/benchmark_uploader.py @@ -25,30 +25,19 @@ from __future__ import print_function import json -import os -import sys -import uuid from google.cloud import bigquery -# pylint: disable=g-bad-import-order -from absl import app as absl_app -from absl import flags import tensorflow as tf -# pylint: enable=g-bad-import-order - -from official.utils.flags import core as flags_core -from official.utils.logs import logger class BigQueryUploader(object): - """Upload the benchmark and metric info to BigQuery.""" + """Upload the benchmark and metric info from JSON input to BigQuery. """ - def __init__(self, logging_dir, gcp_project=None, credentials=None): + def __init__(self, gcp_project=None, credentials=None): """Initialized BigQueryUploader with proper setting. Args: - logging_dir: string, logging directory that contains the benchmark log. gcp_project: string, the name of the GCP project that the log will be uploaded to. The default project name will be detected from local environment if no value is provided. @@ -58,11 +47,11 @@ def __init__(self, logging_dir, gcp_project=None, credentials=None): google.oauth2.service_account.Credentials to load credential from local file for the case that the test is run out side of GCP. 
""" - self._logging_dir = logging_dir self._bq_client = bigquery.Client( project=gcp_project, credentials=credentials) - def upload_benchmark_run(self, dataset_name, table_name, run_id): + def upload_benchmark_run_json( + self, dataset_name, table_name, run_id, run_json): """Upload benchmark run information to Bigquery. Args: @@ -72,19 +61,13 @@ def upload_benchmark_run(self, dataset_name, table_name, run_id): the data will be uploaded. run_id: string, a unique ID that will be attached to the data, usually this is a UUID4 format. + run_json: dict, the JSON data that contains the benchmark run info. """ - expected_file = os.path.join( - self._logging_dir, logger.BENCHMARK_RUN_LOG_FILE_NAME) - with tf.gfile.GFile(expected_file) as f: - benchmark_json = json.load(f) - benchmark_json["model_id"] = run_id - table_ref = self._bq_client.dataset(dataset_name).table(table_name) - errors = self._bq_client.insert_rows_json(table_ref, [benchmark_json]) - if errors: - tf.logging.error( - "Failed to upload benchmark info to bigquery: {}".format(errors)) - - def upload_metric(self, dataset_name, table_name, run_id): + run_json["model_id"] = run_id + self._upload_json(dataset_name, table_name, [run_json]) + + def upload_benchmark_metric_json( + self, dataset_name, table_name, run_id, metric_json_list): """Upload metric information to Bigquery. Args: @@ -95,39 +78,57 @@ def upload_metric(self, dataset_name, table_name, run_id): benchmark_run table. run_id: string, a unique ID that will be attached to the data, usually this is a UUID4 format. This should be the same as the benchmark run_id. + metric_json_list: list, a list of JSON object that record the metric info. + """ + for m in metric_json_list: + m["run_id"] = run_id + self._upload_json(dataset_name, table_name, metric_json_list) + + def upload_benchmark_run_file( + self, dataset_name, table_name, run_id, run_json_file): + """Upload benchmark run information to Bigquery from input json file. + + Args: + dataset_name: string, the name of bigquery dataset where the data will be + uploaded. + table_name: string, the name of bigquery table under the dataset where + the data will be uploaded. + run_id: string, a unique ID that will be attached to the data, usually + this is a UUID4 format. + run_json_file: string, the file path that contains the run JSON data. + """ + with tf.gfile.GFile(run_json_file) as f: + benchmark_json = json.load(f) + self.upload_benchmark_run_json( + dataset_name, table_name, run_id, benchmark_json) + + def upload_metric_file( + self, dataset_name, table_name, run_id, metric_json_file): + """Upload metric information to Bigquery from input json file. + + Args: + dataset_name: string, the name of bigquery dataset where the data will be + uploaded. + table_name: string, the name of bigquery table under the dataset where + the metric data will be uploaded. This is different from the + benchmark_run table. + run_id: string, a unique ID that will be attached to the data, usually + this is a UUID4 format. This should be the same as the benchmark run_id. + metric_json_file: string, the file path that contains the metric JSON + data. 
""" - expected_file = os.path.join( - self._logging_dir, logger.METRIC_LOG_FILE_NAME) - with tf.gfile.GFile(expected_file) as f: - lines = f.readlines() + with tf.gfile.GFile(metric_json_file) as f: metrics = [] - for line in filter(lambda l: l.strip(), lines): - metric = json.loads(line) - metric["run_id"] = run_id - metrics.append(metric) - table_ref = self._bq_client.dataset(dataset_name).table(table_name) - errors = self._bq_client.insert_rows_json(table_ref, metrics) - if errors: - tf.logging.error( - "Failed to upload benchmark info to bigquery: {}".format(errors)) - - -def main(_): - if not flags.FLAGS.benchmark_log_dir: - print("Usage: benchmark_uploader.py --benchmark_log_dir=/some/dir") - sys.exit(1) - - uploader = BigQueryUploader( - flags.FLAGS.benchmark_log_dir, - gcp_project=flags.FLAGS.gcp_project) - run_id = str(uuid.uuid4()) - uploader.upload_benchmark_run( - flags.FLAGS.bigquery_data_set, flags.FLAGS.bigquery_run_table, run_id) - uploader.upload_metric( - flags.FLAGS.bigquery_data_set, flags.FLAGS.bigquery_metric_table, run_id) - - -if __name__ == "__main__": - flags_core.define_benchmark() - flags.adopt_module_key_flags(flags_core) - absl_app.run(main=main) + for line in f: + metrics.append(json.loads(line.strip())) + self.upload_benchmark_metric_json( + dataset_name, table_name, run_id, metrics) + + def _upload_json(self, dataset_name, table_name, json_list): + # Find the unique table reference based on dataset and table name, so that + # the data can be inserted to it. + table_ref = self._bq_client.dataset(dataset_name).table(table_name) + errors = self._bq_client.insert_rows_json(table_ref, json_list) + if errors: + tf.logging.error( + "Failed to upload benchmark info to bigquery: {}".format(errors)) diff --git a/official/benchmark/benchmark_uploader_main.py b/official/benchmark/benchmark_uploader_main.py new file mode 100644 index 00000000000..af21821c143 --- /dev/null +++ b/official/benchmark/benchmark_uploader_main.py @@ -0,0 +1,62 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Binary to upload benchmark generated by BenchmarkLogger to remote repo. 
+ +This library require google cloud bigquery lib as dependency, which can be +installed with: + > pip install --upgrade google-cloud-bigquery +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import uuid + +from absl import app as absl_app +from absl import flags + +from official.benchmark import benchmark_uploader +from official.utils.flags import core as flags_core +from official.utils.logs import logger + +def main(_): + if not flags.FLAGS.benchmark_log_dir: + print("Usage: benchmark_uploader.py --benchmark_log_dir=/some/dir") + sys.exit(1) + + uploader = benchmark_uploader.BigQueryUploader( + gcp_project=flags.FLAGS.gcp_project) + run_id = str(uuid.uuid4()) + run_json_file = os.path.join( + flags.FLAGS.benchmark_log_dir, logger.BENCHMARK_RUN_LOG_FILE_NAME) + metric_json_file = os.path.join( + flags.FLAGS.benchmark_log_dir, logger.METRIC_LOG_FILE_NAME) + + uploader.upload_benchmark_run_file( + flags.FLAGS.bigquery_data_set, flags.FLAGS.bigquery_run_table, run_id, + run_json_file) + uploader.upload_metric_file( + flags.FLAGS.bigquery_data_set, flags.FLAGS.bigquery_metric_table, run_id, + metric_json_file) + + +if __name__ == "__main__": + flags_core.define_benchmark() + flags.adopt_module_key_flags(flags_core) + absl_app.run(main=main) diff --git a/official/benchmark/benchmark_uploader_test.py b/official/benchmark/benchmark_uploader_test.py new file mode 100644 index 00000000000..80919fab119 --- /dev/null +++ b/official/benchmark/benchmark_uploader_test.py @@ -0,0 +1,107 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for benchmark_uploader.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import json +import os +import tempfile +import unittest +from mock import MagicMock +from mock import patch + +import tensorflow as tf # pylint: disable=g-bad-import-order + +try: + from google.cloud import bigquery + from official.benchmark import benchmark_uploader +except ImportError: + bigquery = None + benchmark_uploader = None + + +@unittest.skipIf(bigquery is None, 'Bigquery dependency is not installed.') +class BigQueryUploaderTest(tf.test.TestCase): + + @patch.object(bigquery, 'Client') + def setUp(self, mock_bigquery): + self.mock_client = mock_bigquery.return_value + self.mock_dataset = MagicMock(name="dataset") + self.mock_table = MagicMock(name="table") + self.mock_client.dataset.return_value = self.mock_dataset + self.mock_dataset.table.return_value = self.mock_table + self.mock_client.insert_rows_json.return_value = [] + + self.benchmark_uploader = benchmark_uploader.BigQueryUploader() + self.benchmark_uploader._bq_client = self.mock_client + + self.log_dir = tempfile.mkdtemp(dir=self.get_temp_dir()) + with open(os.path.join(self.log_dir, 'metric.log'), 'a') as f: + json.dump({'name': 'accuracy', 'value': 1.0}, f) + f.write("\n") + json.dump({'name': 'loss', 'value': 0.5}, f) + f.write("\n") + with open(os.path.join(self.log_dir, 'run.log'), 'w') as f: + json.dump({'model_name': 'value'}, f) + + def tearDown(self): + tf.gfile.DeleteRecursively(self.get_temp_dir()) + + def test_upload_benchmark_run_json(self): + self.benchmark_uploader.upload_benchmark_run_json( + 'dataset', 'table', 'run_id', {'model_name': 'value'}) + + self.mock_client.insert_rows_json.assert_called_once_with( + self.mock_table, [{'model_name': 'value', 'model_id': 'run_id'}]) + + def test_upload_benchmark_metric_json(self): + metric_json_list = [ + {'name': 'accuracy', 'value': 1.0}, + {'name': 'loss', 'value': 0.5} + ] + expected_params = [ + {'run_id': 'run_id', 'name': 'accuracy', 'value': 1.0}, + {'run_id': 'run_id', 'name': 'loss', 'value': 0.5} + ] + self.benchmark_uploader.upload_benchmark_metric_json( + 'dataset', 'table', 'run_id', metric_json_list) + self.mock_client.insert_rows_json.assert_called_once_with( + self.mock_table, expected_params) + + def test_upload_benchmark_run_file(self): + self.benchmark_uploader.upload_benchmark_run_file( + 'dataset', 'table', 'run_id', os.path.join(self.log_dir, 'run.log')) + + self.mock_client.insert_rows_json.assert_called_once_with( + self.mock_table, [{'model_name': 'value', 'model_id': 'run_id'}]) + + def test_upload_metric_file(self): + self.benchmark_uploader.upload_metric_file( + 'dataset', 'table', 'run_id', + os.path.join(self.log_dir, 'metric.log')) + expected_params = [ + {'run_id': 'run_id', 'name': 'accuracy', 'value': 1.0}, + {'run_id': 'run_id', 'name': 'loss', 'value': 0.5} + ] + self.mock_client.insert_rows_json.assert_called_once_with( + self.mock_table, expected_params) + + +if __name__ == '__main__': + tf.test.main() diff --git a/official/resnet/resnet_run_loop.py b/official/resnet/resnet_run_loop.py index de6a2d83fc3..ea6dfaf7769 100644 --- a/official/resnet/resnet_run_loop.py +++ b/official/resnet/resnet_run_loop.py @@ -395,13 +395,12 @@ def resnet_main( 'synthetic_data': flags_obj.use_synthetic_data, 'train_epochs': flags_obj.train_epochs, } - benchmark_logger = 
logger.config_benchmark_logger(flags_obj.benchmark_log_dir) + benchmark_logger = logger.config_benchmark_logger(flags_obj) benchmark_logger.log_run_info('resnet', dataset_name, run_params) train_hooks = hooks_helper.get_train_hooks( flags_obj.hooks, - batch_size=flags_obj.batch_size, - benchmark_log_dir=flags_obj.benchmark_log_dir) + batch_size=flags_obj.batch_size) def input_fn_train(): return input_function( diff --git a/official/utils/flags/_benchmark.py b/official/utils/flags/_benchmark.py index 773d81d16a8..9431d9fb56f 100644 --- a/official/utils/flags/_benchmark.py +++ b/official/utils/flags/_benchmark.py @@ -36,6 +36,14 @@ def define_benchmark(benchmark_log_dir=True, bigquery_uploader=True): key_flags = [] + flags.DEFINE_enum( + name="benchmark_logger_type", default="BaseBenchmarkLogger", + enum_values=["BaseBenchmarkLogger", "BenchmarkFileLogger", + "BenchmarkBigQueryLogger"], + help=help_wrap("The type of benchmark logger to use. Defaults to using " + "BaseBenchmarkLogger which logs to STDOUT. Different " + "loggers will require other flags to be able to work.")) + if benchmark_log_dir: flags.DEFINE_string( name="benchmark_log_dir", short_name="bld", default=None, @@ -64,4 +72,14 @@ def define_benchmark(benchmark_log_dir=True, bigquery_uploader=True): help=help_wrap("The Bigquery table name where the benchmark metric " "information will be uploaded.")) - return key_flags + @flags.multi_flags_validator( + ["benchmark_logger_type", "benchmark_log_dir"], + message="--benchmark_logger_type=BenchmarkFileLogger will require " + "--benchmark_log_dir being set") + def _check_benchmark_log_dir(flags_dict): + benchmark_logger_type = flags_dict["benchmark_logger_type"] + if benchmark_logger_type == "BenchmarkFileLogger": + return flags_dict["benchmark_log_dir"] + return True + + return key_flags diff --git a/official/utils/logs/hooks_helper.py b/official/utils/logs/hooks_helper.py index 349b79f6d46..edb5e2e0bcd 100644 --- a/official/utils/logs/hooks_helper.py +++ b/official/utils/logs/hooks_helper.py @@ -124,14 +124,12 @@ def get_examples_per_second_hook(every_n_steps=100, warm_steps=warm_steps) -def get_logging_metric_hook(benchmark_log_dir=None, - tensors_to_log=None, +def get_logging_metric_hook(tensors_to_log=None, every_n_secs=600, **kwargs): # pylint: disable=unused-argument """Function to get LoggingMetricHook. Args: - benchmark_log_dir: `string`, directory path to save the metric log. tensors_to_log: List of tensor names or dictionary mapping labels to tensor names. If not set, log _TENSORS_TO_LOG by default. every_n_secs: `int`, the frequency for logging the metric. Default to every @@ -141,7 +139,6 @@ def get_logging_metric_hook(benchmark_log_dir=None, Returns a ProfilerHook that writes out timelines that can be loaded into profiling tools like chrome://tracing. 
""" - logger.config_benchmark_logger(benchmark_log_dir) if tensors_to_log is None: tensors_to_log = _TENSORS_TO_LOG return metric_hook.LoggingMetricHook( diff --git a/official/utils/logs/hooks_helper_test.py b/official/utils/logs/hooks_helper_test.py index 054661294d0..f8f59a8f96e 100644 --- a/official/utils/logs/hooks_helper_test.py +++ b/official/utils/logs/hooks_helper_test.py @@ -60,8 +60,7 @@ def test_get_train_hooks_examples_per_second_hook(self): def test_get_logging_metric_hook(self): test_hook_name = 'LoggingMetricHook' - self.validate_train_hook_name(test_hook_name, 'loggingmetrichook', - benchmark_log_dir='/tmp') + self.validate_train_hook_name(test_hook_name, 'loggingmetrichook') if __name__ == '__main__': tf.test.main() diff --git a/official/utils/logs/logger.py b/official/utils/logs/logger.py index 3ec278d7200..a7d78c2eec7 100644 --- a/official/utils/logs/logger.py +++ b/official/utils/logs/logger.py @@ -28,7 +28,10 @@ import numbers import os import threading +import uuid +from six.moves import _thread as thread +from absl import flags import tensorflow as tf from tensorflow.python.client import device_lib @@ -36,21 +39,39 @@ BENCHMARK_RUN_LOG_FILE_NAME = "benchmark_run.log" _DATE_TIME_FORMAT_PATTERN = "%Y-%m-%dT%H:%M:%S.%fZ" +FLAGS = flags.FLAGS # Don't use it directly. Use get_benchmark_logger to access a logger. _benchmark_logger = None _logger_lock = threading.Lock() -def config_benchmark_logger(logging_dir): +def config_benchmark_logger(flag_obj=None): """Config the global benchmark logger""" _logger_lock.acquire() try: global _benchmark_logger - if logging_dir: - _benchmark_logger = BenchmarkFileLogger(logging_dir) - else: + if not flag_obj: + flag_obj = FLAGS + + if (not hasattr(flag_obj, 'benchmark_logger_type') or + flag_obj.benchmark_logger_type == 'BaseBenchmarkLogger'): _benchmark_logger = BaseBenchmarkLogger() + elif flag_obj.benchmark_logger_type == 'BenchmarkFileLogger': + _benchmark_logger = BenchmarkFileLogger(flag_obj.benchmark_log_dir) + elif flag_obj.benchmark_logger_type == 'BenchmarkBigQueryLogger': + from official.benchmark import benchmark_uploader as bu # pylint: disable=g-import-not-at-top + bq_uploader = bu.BigQueryUploader(gcp_project=flag_obj.gcp_project) + _benchmark_logger = BenchmarkBigQueryLogger( + bigquery_uploader=bq_uploader, + bigquery_data_set=flag_obj.bigquery_data_set, + bigquery_run_table=flag_obj.bigquery_run_table, + bigquery_metric_table=flag_obj.bigquery_metric_table, + run_id=str(uuid.uuid4())) + else: + raise ValueError('Unrecognized benchmark_logger_type: %s', + flag_obj.benchmark_logger_type) + finally: _logger_lock.release() return _benchmark_logger @@ -58,8 +79,7 @@ def config_benchmark_logger(logging_dir): def get_benchmark_logger(): if not _benchmark_logger: - config_benchmark_logger(None) - + config_benchmark_logger() return _benchmark_logger @@ -99,15 +119,9 @@ def log_metric(self, name, value, unit=None, global_step=None, extras=None): global_step: int, the global_step when the metric is logged. extras: map of string:string, the extra information about the metric. """ - if not isinstance(value, numbers.Number): - tf.logging.warning( - "Metric value to log should be a number. 
Got %s", type(value)) - return - extras = _convert_to_json_dict(extras) - - tf.logging.info("Benchmark metric: " - "Name %s, value %d, unit %s, global_step %d, extras %s", - name, value, unit, global_step, extras) + metric = _process_metric_to_json(name, value, unit, global_step, extras) + if metric: + tf.logging.info("Benchmark metric: %s", metric) def log_run_info(self, model_name, dataset_name, run_params): tf.logging.info("Benchmark run: %s", @@ -137,28 +151,16 @@ def log_metric(self, name, value, unit=None, global_step=None, extras=None): global_step: int, the global_step when the metric is logged. extras: map of string:string, the extra information about the metric. """ - if not isinstance(value, numbers.Number): - tf.logging.warning( - "Metric value to log should be a number. Got %s", type(value)) - return - extras = _convert_to_json_dict(extras) - - with tf.gfile.GFile( - os.path.join(self._logging_dir, METRIC_LOG_FILE_NAME), "a") as f: - metric = { - "name": name, - "value": float(value), - "unit": unit, - "global_step": global_step, - "timestamp": datetime.datetime.utcnow().strftime( - _DATE_TIME_FORMAT_PATTERN), - "extras": extras} - try: - json.dump(metric, f) - f.write("\n") - except (TypeError, ValueError) as e: - tf.logging.warning("Failed to dump metric to log file: " - "name %s, value %s, error %s", name, value, e) + metric = _process_metric_to_json(name, value, unit, global_step, extras) + if metric: + with tf.gfile.GFile( + os.path.join(self._logging_dir, METRIC_LOG_FILE_NAME), "a") as f: + try: + json.dump(metric, f) + f.write("\n") + except (TypeError, ValueError) as e: + tf.logging.warning("Failed to dump metric to log file: " + "name %s, value %s, error %s", name, value, e) def log_run_info(self, model_name, dataset_name, run_params): """Collect most of the TF runtime information for the local env. @@ -183,6 +185,68 @@ def log_run_info(self, model_name, dataset_name, run_params): e) +class BenchmarkBigQueryLogger(BaseBenchmarkLogger): + """Class to log the benchmark information to BigQuery data store.""" + + def __init__(self, + bigquery_uploader, + bigquery_data_set, + bigquery_run_table, + bigquery_metric_table, + run_id): + super(BenchmarkBigQueryLogger, self).__init__() + self._bigquery_uploader = bigquery_uploader + self._bigquery_data_set = bigquery_data_set + self._bigquery_run_table = bigquery_run_table + self._bigquery_metric_table = bigquery_metric_table + self._run_id = run_id + + def log_metric(self, name, value, unit=None, global_step=None, extras=None): + """Log the benchmark metric information to bigquery. + + Args: + name: string, the name of the metric to log. + value: number, the value of the metric. The value will not be logged if it + is not a number type. + unit: string, the unit of the metric, E.g "image per second". + global_step: int, the global_step when the metric is logged. + extras: map of string:string, the extra information about the metric. + """ + metric = _process_metric_to_json(name, value, unit, global_step, extras) + if metric: + # Starting new thread for bigquery upload in case it might take long time + # and impact the benchmark and performance measurement. Starting a new + # thread might have potential performance impact for model that run on + # CPU. 
+ thread.start_new_thread( + self._bigquery_uploader.upload_benchmark_metric_json, + (self._bigquery_data_set, + self._bigquery_metric_table, + self._run_id, + [metric])) + + def log_run_info(self, model_name, dataset_name, run_params): + """Collect most of the TF runtime information for the local env. + + The schema of the run info follows official/benchmark/datastore/schema. + + Args: + model_name: string, the name of the model. + dataset_name: string, the name of dataset for training and evaluation. + run_params: dict, the dictionary of parameters for the run, it could + include hyperparameters or other params that are important for the run. + """ + run_info = _gather_run_info(model_name, dataset_name, run_params) + # Starting new thread for bigquery upload in case it might take long time + # and impact the benchmark and performance measurement. Starting a new + # thread might have potential performance impact for model that run on CPU. + thread.start_new_thread( + self._bigquery_uploader.upload_benchmark_run_json, + (self._bigquery_data_set, + self._bigquery_run_table, + self._run_id, + run_info)) + def _gather_run_info(model_name, dataset_name, run_params): """Collect the benchmark run information for the local environment.""" run_info = { @@ -200,6 +264,25 @@ def _gather_run_info(model_name, dataset_name, run_params): return run_info +def _process_metric_to_json( + name, value, unit=None, global_step=None, extras=None): + """Validate the metric data and generate JSON for insert.""" + if not isinstance(value, numbers.Number): + tf.logging.warning( + "Metric value to log should be a number. Got %s", type(value)) + return None + + extras = _convert_to_json_dict(extras) + return { + "name": name, + "value": float(value), + "unit": unit, + "global_step": global_step, + "timestamp": datetime.datetime.utcnow().strftime( + _DATE_TIME_FORMAT_PATTERN), + "extras": extras} + + def _collect_tensorflow_info(run_info): run_info["tensorflow_version"] = { "version": tf.VERSION, "git_hash": tf.GIT_VERSION} diff --git a/official/utils/logs/logger_test.py b/official/utils/logs/logger_test.py index 9af683a54d0..2d66cb336bb 100644 --- a/official/utils/logs/logger_test.py +++ b/official/utils/logs/logger_test.py @@ -22,28 +22,55 @@ import json import os import tempfile +import time import unittest +import mock +from absl.testing import flagsaver import tensorflow as tf # pylint: disable=g-bad-import-order +try: + from google.cloud import bigquery +except ImportError: + bigquery = None + +from official.utils.flags import core as flags_core from official.utils.logs import logger class BenchmarkLoggerTest(tf.test.TestCase): + @classmethod + def setUpClass(cls): # pylint: disable=invalid-name + super(BenchmarkLoggerTest, cls).setUpClass() + flags_core.define_benchmark() + def test_get_default_benchmark_logger(self): - self.assertIsInstance(logger.get_benchmark_logger(), - logger.BaseBenchmarkLogger) + with flagsaver.flagsaver(benchmark_logger_type='foo'): + self.assertIsInstance(logger.get_benchmark_logger(), + logger.BaseBenchmarkLogger) def test_config_base_benchmark_logger(self): - logger.config_benchmark_logger("") - self.assertIsInstance(logger.get_benchmark_logger(), - logger.BaseBenchmarkLogger) + with flagsaver.flagsaver(benchmark_logger_type='BaseBenchmarkLogger'): + logger.config_benchmark_logger() + self.assertIsInstance(logger.get_benchmark_logger(), + logger.BaseBenchmarkLogger) def test_config_benchmark_file_logger(self): - logger.config_benchmark_logger("/tmp/abc") - 
self.assertIsInstance(logger.get_benchmark_logger(), - logger.BenchmarkFileLogger) + # Set the benchmark_log_dir first since the benchmark_logger_type will need + # the value to be set when it does the validation. + with flagsaver.flagsaver(benchmark_log_dir='/tmp'): + with flagsaver.flagsaver(benchmark_logger_type='BenchmarkFileLogger'): + logger.config_benchmark_logger() + self.assertIsInstance(logger.get_benchmark_logger(), + logger.BenchmarkFileLogger) + + @unittest.skipIf(bigquery is None, 'Bigquery dependency is not installed.') + def test_config_benchmark_bigquery_logger(self): + with flagsaver.flagsaver(benchmark_logger_type='BenchmarkBigQueryLogger'): + logger.config_benchmark_logger() + self.assertIsInstance(logger.get_benchmark_logger(), + logger.BenchmarkBigQueryLogger) class BaseBenchmarkLoggerTest(tf.test.TestCase): @@ -233,5 +260,46 @@ def test_collect_memory_info(self): self.assertIsNotNone(run_info["machine_config"]["memory_total"]) self.assertIsNotNone(run_info["machine_config"]["memory_available"]) + +@unittest.skipIf(bigquery is None, 'Bigquery dependency is not installed.') +class BenchmarkBigQueryLoggerTest(tf.test.TestCase): + + def setUp(self): + super(BenchmarkBigQueryLoggerTest, self).setUp() + # Avoid pulling extra env vars from test environment which affects the test + # result, eg. Kokoro test has a TF_PKG env which affect the test case + # test_collect_tensorflow_environment_variables() + self.original_environ = dict(os.environ) + os.environ.clear() + + self.mock_bq_uploader = mock.MagicMock() + self.logger = logger.BenchmarkBigQueryLogger( + self.mock_bq_uploader, "dataset", "run_table", "metric_table", + "run_id") + + def tearDown(self): + super(BenchmarkBigQueryLoggerTest, self).tearDown() + tf.gfile.DeleteRecursively(self.get_temp_dir()) + os.environ.clear() + os.environ.update(self.original_environ) + + def test_log_metric(self): + self.logger.log_metric( + "accuracy", 0.999, global_step=1e4, extras={"name": "value"}) + expected_metric_json = [{ + "name": "accuracy", + "value": 0.999, + "unit": None, + "global_step": 1e4, + "timestamp": mock.ANY, + "extras": [{"name": "name", "value": "value"}] + }] + # log_metric will call upload_benchmark_metric_json in a separate thread. + # Give it some grace period for the new thread before assert. + time.sleep(1) + self.mock_bq_uploader.upload_benchmark_metric_json.assert_called_once_with( + "dataset", "metric_table", "run_id", expected_metric_json) + + if __name__ == "__main__": tf.test.main()
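
Usage note (not part of the patch): the refactored uploader above decouples file reading from the insert path, so callers can stream JSON dictionaries straight into BigQuery instead of writing log files first. Below is a minimal sketch of the new API; the GCP project, dataset, and table names are placeholders, the dictionary contents are illustrative, and google-cloud-bigquery plus valid credentials are assumed to be available.

    import uuid

    from official.benchmark import benchmark_uploader

    # Placeholder project/dataset/table names; substitute your own.
    uploader = benchmark_uploader.BigQueryUploader(gcp_project="my-gcp-project")
    run_id = str(uuid.uuid4())

    # Stream the run info and metrics directly from in-memory dicts;
    # no benchmark_run.log / metric.log files on disk are required.
    uploader.upload_benchmark_run_json(
        "benchmark_dataset", "benchmark_run", run_id,
        {"model_name": "resnet", "dataset": "imagenet"})
    uploader.upload_benchmark_metric_json(
        "benchmark_dataset", "benchmark_metric", run_id,
        [{"name": "accuracy", "value": 0.76, "unit": None, "global_step": 10000}])

The file-based entry points (upload_benchmark_run_file / upload_metric_file) remain available for the offline path driven by benchmark_uploader_main.py, which reads the JSON log files written by BenchmarkFileLogger from --benchmark_log_dir.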
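
Usage note (not part of the patch): model code now selects the logger through the new --benchmark_logger_type flag rather than passing a logging directory around. A rough sketch of how a training script would wire this up, assuming the benchmark flags have been defined and parsed; the flag values shown are examples only.

    from absl import flags

    from official.utils.flags import core as flags_core
    from official.utils.logs import logger

    flags_core.define_benchmark()
    # Example values. BenchmarkFileLogger additionally requires
    # --benchmark_log_dir, which the new multi-flag validator enforces.
    flags.FLAGS(["program",
                 "--benchmark_logger_type=BenchmarkFileLogger",
                 "--benchmark_log_dir=/tmp/benchmark"])

    benchmark_logger = logger.config_benchmark_logger(flags.FLAGS)
    benchmark_logger.log_run_info("resnet", "imagenet", {"batch_size": 32})
    benchmark_logger.log_metric("accuracy", 0.76, global_step=10000)

With --benchmark_logger_type=BenchmarkBigQueryLogger the same two calls instead push rows to BigQuery from a background thread (thread.start_new_thread), so logging does not block the training loop; as the patch comments note, spawning that thread can still have a small performance impact for models that run on CPU.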