26 changes: 23 additions & 3 deletions official/benchmark/datastore/schema/benchmark_run.json
@@ -15,7 +15,7 @@
"description": "The date when the test of the model is started",
"mode": "REQUIRED",
"name": "run_date",
"type": "DATETIME"
"type": "TIMESTAMP"
},
{
"description": "The tensorflow version information.",
@@ -58,7 +58,7 @@
"type": "RECORD"
},
{
"description": "Enviornment variables when the benchmark run is executed.",
"description": "Environment variables when the benchmark run is executed.",
"fields": [
{
"description": "The name of the variable.",
@@ -74,7 +74,27 @@
}
],
"mode": "REPEATED",
"name": "enviornment_variable",
"name": "environment_variable",
"type": "RECORD"
},
{
"description": "TF Environment variables when the benchmark run is executed.",
"fields": [
{
"description": "The name of the variable.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the variable.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "tensorflow_environment_variables",
"type": "RECORD"
},
{
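For orientation, a record matching the updated schema might look like the following Python dict; a minimal sketch with illustrative values only, showing just the fields visible in this diff (TensorFlow version, machine config, and other fields elided):

    # Illustrative record shaped like one row of the benchmark_run table.
    example_run = {
        "run_date": "2018-04-01T12:34:56.789012Z",   # now a TIMESTAMP field (was DATETIME)
        "environment_variable": [                    # REPEATED RECORD of name/value pairs
            {"name": "PATH", "value": "/usr/local/bin"},
        ],
        "tensorflow_environment_variables": [        # new REPEATED RECORD for TF_*-prefixed variables
            {"name": "TF_CPP_MIN_LOG_LEVEL", "value": "0"},
        ],
    }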
3 changes: 2 additions & 1 deletion official/requirements.txt
@@ -1,2 +1,3 @@
psutil>=5.4.3
py-cpuinfo>=3.3.0
py-cpuinfo>=3.3.0
google-cloud-bigquery>=0.31.0
9 changes: 7 additions & 2 deletions official/resnet/resnet_run_loop.py
@@ -348,6 +348,12 @@ def resnet_main(flags, model_function, input_function):
'version': flags.version,
})

if flags.benchmark_log_dir is not None:
benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
benchmark_logger.log_run_info("resnet")
else:
benchmark_logger = None

for _ in range(flags.train_epochs // flags.epochs_between_evals):
train_hooks = hooks_helper.get_train_hooks(
flags.hooks,
@@ -380,8 +386,7 @@ def input_fn_eval():
steps=flags.max_train_steps)
print(eval_results)

if flags.benchmark_log_dir is not None:
benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
if benchmark_logger:
benchmark_logger.log_estimator_evaluation_result(eval_results)


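The net effect of the change above is that the logger is constructed once before the training loop and run info is recorded a single time, while evaluation results are still logged every epoch. A condensed sketch of the pattern, assuming flags and the logger module as imported in this file, with training and evaluation details elided:

    benchmark_logger = None
    if flags.benchmark_log_dir is not None:
        # Build the logger once and record hardware/TF run info up front.
        benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
        benchmark_logger.log_run_info("resnet")

    for _ in range(flags.train_epochs // flags.epochs_between_evals):
        eval_results = {}  # stand-in for the estimator evaluation results computed here
        if benchmark_logger:
            benchmark_logger.log_estimator_evaluation_result(eval_results)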
28 changes: 27 additions & 1 deletion official/utils/arg_parsers/parsers.py
@@ -234,11 +234,37 @@ class BenchmarkParser(argparse.ArgumentParser):
benchmark_log_dir: Create a flag to specify location for benchmark logging.
"""

def __init__(self, add_help=False, benchmark_log_dir=True):
def __init__(self, add_help=False, benchmark_log_dir=True,
bigquery_uploader=True):
super(BenchmarkParser, self).__init__(add_help=add_help)
if benchmark_log_dir:
self.add_argument(
"--benchmark_log_dir", "-bld", default=None,
help="[default: %(default)s] The location of the benchmark logging.",
metavar="<BLD>"
)
if bigquery_uploader:
self.add_argument(
"--gcp_project", "-gp", default=None,
help="[default: %(default)s] The GCP project name where the benchmark"
" will be uploaded.",
metavar="<GP>"
)
self.add_argument(
"--bigquery_data_set", "-bds", default="test_benchmark",
help="[default: %(default)s] The Bigquery dataset name where the"
" benchmark will be uploaded.",
metavar="<BDS>"
)
self.add_argument(
"--bigquery_run_table", "-brt", default="benchmark_run",
help="[default: %(default)s] The Bigquery table name where the"
" benchmark run information will be uploaded.",
metavar="<BRT>"
)
self.add_argument(
"--bigquery_metric_table", "-bmt", default="benchmark_metric",
help="[default: %(default)s] The Bigquery table name where the"
" benchmark metric information will be uploaded.",
metavar="<BMT>"
)
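As a rough usage sketch (the project name and logging directory below are hypothetical, not defaults from the code), the new BigQuery flags are parsed like any other parser flag:

    from official.utils.arg_parsers import parsers

    parser = parsers.BenchmarkParser(benchmark_log_dir=True, bigquery_uploader=True)
    flags = parser.parse_args([
        "--benchmark_log_dir", "/tmp/benchmark",   # hypothetical logging directory
        "--gcp_project", "my-gcp-project",         # hypothetical GCP project
    ])
    print(flags.bigquery_run_table)   # prints "benchmark_run", the default table name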
5 changes: 3 additions & 2 deletions official/utils/arg_parsers/parsers_test.py
@@ -29,7 +29,7 @@ def __init__(self):
parsers.PerformanceParser(num_parallel_calls=True, inter_op=True,
intra_op=True, use_synthetic_data=True),
parsers.ImageModelParser(data_format=True),
parsers.BenchmarkParser(benchmark_log_dir=True)
parsers.BenchmarkParser(benchmark_log_dir=True, bigquery_uploader=True)
])


@@ -62,7 +62,8 @@ def test_default_setting(self):
def test_benchmark_setting(self):
defaults = dict(
hooks=["LoggingMetricHook"],
benchmark_log_dir="/tmp/12345"
benchmark_log_dir="/tmp/12345",
gcp_project="project_abc",
)

parser = TestParser()
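A rough sketch of how the new flag could be exercised directly (the assertions are illustrative and not part of this diff):

    parser = TestParser()
    flags = parser.parse_args(["--gcp_project", "project_abc"])
    assert flags.gcp_project == "project_abc"
    assert flags.bigquery_data_set == "test_benchmark"   # default from BenchmarkParser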
129 changes: 129 additions & 0 deletions official/utils/logging/benchmark_uploader.py
@@ -0,0 +1,129 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Library to upload benchmark generated by BenchmarkLogger to remote repo.

This library require google cloud bigquery lib as dependency, which can be
installed with:
> pip install --upgrade google-cloud-bigquery
[Review comment, Contributor]: Also add to requirements.txt
[Reply, Member Author]: Done.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json
import os
import sys
import uuid

from google.cloud import bigquery

import tensorflow as tf # pylint: disable=g-bad-import-order

from official.utils.arg_parsers import parsers
from official.utils.logging import logger


class BigQueryUploader(object):
"""Upload the benchmark and metric info to BigQuery."""

def __init__(self, logging_dir, gcp_project=None, credentials=None):
"""Initialized BigQueryUploader with proper setting.

Args:
logging_dir: string, logging directory that contains the benchmark log.
gcp_project: string, the name of the GCP project that the log will be
uploaded to. The default project name will be detected from local
environment if no value is provided.
credentials: google.auth.credentials. The credentials used to access the
BigQuery service. The default service account credentials will be
detected from the local environment if no value is provided. Use
google.oauth2.service_account.Credentials to load credentials from a
local file when the test is run outside of GCP.
"""
self._logging_dir = logging_dir
self._bq_client = bigquery.Client(
project=gcp_project, credentials=credentials)

def upload_benchmark_run(self, dataset_name, table_name, run_id):
"""Upload benchmark run information to Bigquery.

Args:
dataset_name: string, the name of bigquery dataset where the data will be
uploaded.
table_name: string, the name of bigquery table under the dataset where
the data will be uploaded.
run_id: string, a unique ID that will be attached to the data, usually
this is a UUID4 format.
"""
expected_file = os.path.join(
self._logging_dir, logger.BENCHMARK_RUN_LOG_FILE_NAME)
with tf.gfile.GFile(expected_file) as f:
benchmark_json = json.load(f)
benchmark_json["model_id"] = run_id
table_ref = self._bq_client.dataset(dataset_name).table(table_name)
errors = self._bq_client.insert_rows_json(table_ref, [benchmark_json])
if errors:
tf.logging.error(
"Failed to upload benchmark info to bigquery: {}".format(errors))

def upload_metric(self, dataset_name, table_name, run_id):
"""Upload metric information to Bigquery.

Args:
dataset_name: string, the name of bigquery dataset where the data will be
uploaded.
table_name: string, the name of bigquery table under the dataset where
the metric data will be uploaded. This is different from the
benchmark_run table.
run_id: string, a unique ID that will be attached to the data, usually
this is a UUID4 format. This should be the same as the benchmark run_id.
"""
expected_file = os.path.join(
self._logging_dir, logger.METRIC_LOG_FILE_NAME)
with tf.gfile.GFile(expected_file) as f:
lines = f.readlines()
metrics = []
for line in filter(lambda l: l.strip(), lines):
metric = json.loads(line)
metric["run_id"] = run_id
metrics.append(metric)
table_ref = self._bq_client.dataset(dataset_name).table(table_name)
errors = self._bq_client.insert_rows_json(table_ref, metrics)
if errors:
[Review comment, Contributor]: Should we have retry logic here? (Or does the BigTable API already support it?) Once we're containerized we should expect these files will disappear after the test is done, so it would be a shame to lose a run due to a DB connection hiccup.
[Reply, Member Author]: The upload API has been quite stable so far and only emits non-retryable errors at the moment. I'm not too worried for now; we can revisit if we see flakiness.

tf.logging.error(
"Failed to upload benchmark info to bigquery: {}".format(errors))


def main(argv):
parser = parsers.BenchmarkParser()
flags = parser.parse_args(args=argv[1:])
if not flags.benchmark_log_dir:
print("Usage: benchmark_uploader.py --benchmark_log_dir=/some/dir")
sys.exit(1)

uploader = BigQueryUploader(
flags.benchmark_log_dir,
gcp_project=flags.gcp_project)
run_id = str(uuid.uuid4())
uploader.upload_benchmark_run(
flags.bigquery_data_set, flags.bigquery_run_table, run_id)
uploader.upload_metric(
flags.bigquery_data_set, flags.bigquery_metric_table, run_id)


if __name__ == "__main__":
main(argv=sys.argv)
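As the BigQueryUploader docstring notes, runs outside GCP can pass explicit service-account credentials instead of relying on environment detection. A minimal sketch, where the key-file path, project name, and logging directory are hypothetical:

    import uuid

    from google.oauth2 import service_account

    from official.utils.logging import benchmark_uploader

    credentials = service_account.Credentials.from_service_account_file(
        "/path/to/service_account_key.json")   # hypothetical key file
    uploader = benchmark_uploader.BigQueryUploader(
        "/tmp/benchmark_log",                  # directory written by BenchmarkLogger
        gcp_project="my-gcp-project",
        credentials=credentials)
    run_id = str(uuid.uuid4())
    uploader.upload_benchmark_run("test_benchmark", "benchmark_run", run_id)
    uploader.upload_metric("test_benchmark", "benchmark_metric", run_id)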
31 changes: 19 additions & 12 deletions official/utils/logging/logger.py
@@ -31,8 +31,8 @@
import tensorflow as tf
from tensorflow.python.client import device_lib

_METRIC_LOG_FILE_NAME = "metric.log"
_BENCHMARK_RUN_LOG_FILE_NAME = "benchmark_run.log"
METRIC_LOG_FILE_NAME = "metric.log"
BENCHMARK_RUN_LOG_FILE_NAME = "benchmark_run.log"
_DATE_TIME_FORMAT_PATTERN = "%Y-%m-%dT%H:%M:%S.%fZ"


@@ -81,9 +81,12 @@ def log_metric(self, name, value, unit=None, global_step=None, extras=None):
tf.logging.warning(
"Metric value to log should be a number. Got %s", type(value))
return

if extras:
extras = [{"name": k, "value": v} for k, v in sorted(extras.items())]
else:
extras = []
with tf.gfile.GFile(
os.path.join(self._logging_dir, _METRIC_LOG_FILE_NAME), "a") as f:
os.path.join(self._logging_dir, METRIC_LOG_FILE_NAME), "a") as f:
metric = {
"name": name,
"value": float(value),
@@ -107,15 +110,18 @@ def log_run_info(self, model_name):
Args:
model_name: string, the name of the model.
"""
run_info = {"model_name": model_name}
run_info = {
"model_name": model_name,
"machine_config": {},
"run_date": datetime.datetime.now().strftime(_DATE_TIME_FORMAT_PATTERN)}
_collect_tensorflow_info(run_info)
_collect_tensorflow_environment_variables(run_info)
_collect_cpu_info(run_info)
_collect_gpu_info(run_info)
_collect_memory_info(run_info)

with tf.gfile.GFile(os.path.join(
self._logging_dir, _BENCHMARK_RUN_LOG_FILE_NAME), "w") as f:
self._logging_dir, BENCHMARK_RUN_LOG_FILE_NAME), "w") as f:
try:
json.dump(run_info, f)
f.write("\n")
@@ -130,8 +136,9 @@ def _collect_tensorflow_info(run_info):


def _collect_tensorflow_environment_variables(run_info):
run_info["tensorflow_environment_variables"] = {
k: v for k, v in os.environ.items() if k.startswith("TF_")}
run_info["tensorflow_environment_variables"] = [
{"name": k, "value": v}
for k, v in sorted(os.environ.items()) if k.startswith("TF_")]


# The following code is mirrored from tensorflow/tools/test/system_info_lib
@@ -150,7 +157,7 @@ def _collect_cpu_info(run_info):
cpu_info["cpu_info"] = info["brand"]
cpu_info["mhz_per_cpu"] = info["hz_advertised_raw"][0] / 1.0e6

run_info["cpu_info"] = cpu_info
run_info["machine_config"]["cpu_info"] = cpu_info


def _collect_gpu_info(run_info):
@@ -168,16 +175,16 @@ def _collect_gpu_info(run_info):
gpu_info["model"] = _parse_gpu_model(d.physical_device_desc)
# Assume all the GPU connected are same model
break
run_info["gpu_info"] = gpu_info
run_info["machine_config"]["gpu_info"] = gpu_info


def _collect_memory_info(run_info):
# Note: psutil is not installed in the TensorFlow OSS tree.
# It is installable via pip.
import psutil # pylint: disable=g-import-not-at-top
vmem = psutil.virtual_memory()
run_info["memory_total"] = vmem.total
run_info["memory_available"] = vmem.available
run_info["machine_config"]["memory_total"] = vmem.total
run_info["machine_config"]["memory_available"] = vmem.available


def _parse_gpu_model(physical_device_desc):
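With the changes above, the extras dict passed to log_metric is flattened into a sorted list of name/value records before being appended to metric.log, and the machine details written by log_run_info now nest under machine_config. A small illustrative sketch; the directory and values are made up:

    from official.utils.logging import logger

    benchmark_logger = logger.BenchmarkLogger("/tmp/benchmark_log")   # hypothetical log directory
    benchmark_logger.log_metric("accuracy", 0.762, global_step=1000,
                                extras={"batch_size": 32})
    # The appended JSON line in metric.log now carries something like
    #   "extras": [{"name": "batch_size", "value": 32}]
    # rather than a raw dict.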