Prometheus exporter for TF metrics.
Metrics are exported via an HTTP server on a new endpoint
(defaults to `/monitoring/prometheus/metrics`) that
Prometheus can scrape.

This change adds:
o `MonitoringConfig` proto to configure monitoring.
o `--monitoring_config_file` command line flag to pass config.

PiperOrigin-RevId: 212695523
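For illustration, a minimal sketch of launching the server with the new flag and a config file (the binary name, ports, and model flags here are assumptions, not part of this change):

    import subprocess

    # Hypothetical invocation; adjust binary path, ports, and model paths.
    subprocess.Popen([
        'tensorflow_model_server',
        '--rest_api_port=8501',
        '--model_name=half_plus_two',
        '--model_base_path=/models/half_plus_two',
        '--monitoring_config_file=/config/monitoring_config.txt',
    ])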
tensorflower-gardener committed Sep 12, 2018
1 parent d859d94 commit 021efbd
Showing 15 changed files with 539 additions and 5 deletions.
9 changes: 9 additions & 0 deletions tensorflow_serving/config/BUILD
@@ -88,6 +88,15 @@ serving_proto_library_py(
],
)

serving_proto_library(
name = "monitoring_config_proto",
srcs = ["monitoring_config.proto"],
cc_api_version = 2,
java_api_version = 2,
deps = [
],
)

serving_proto_library(
name = "ssl_config_proto",
srcs = ["ssl_config.proto"],
19 changes: 19 additions & 0 deletions tensorflow_serving/config/monitoring_config.proto
@@ -0,0 +1,19 @@
syntax = "proto3";

package tensorflow.serving;
option cc_enable_arenas = true;

// Configuration for Prometheus monitoring.
message PrometheusConfig {
// Whether to expose Prometheus metrics.
bool enable = 1;

// The endpoint to expose Prometheus metrics.
// If not specified, PrometheusExporter::kPrometheusPath value is used.
string path = 2;
}

// Configuration for monitoring.
message MonitoringConfig {
PrometheusConfig prometheus_config = 1;
}
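As a sketch of how this config parses from its ASCII form, assuming the generated Python bindings are available as tensorflow_serving.config.monitoring_config_pb2 (that module path is an assumption here):

    from google.protobuf import text_format
    from tensorflow_serving.config import monitoring_config_pb2

    # Parse the same ASCII proto text that --monitoring_config_file accepts.
    config = text_format.Parse(
        'prometheus_config { enable: true '
        'path: "/monitoring/prometheus/metrics" }',
        monitoring_config_pb2.MonitoringConfig())
    assert config.prometheus_config.enable
    assert config.prometheus_config.path == '/monitoring/prometheus/metrics'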
5 changes: 4 additions & 1 deletion tensorflow_serving/model_servers/BUILD
@@ -243,10 +243,11 @@ cc_library(
deps = [
":http_rest_api_handler",
":server_core",
"//tensorflow_serving/config:monitoring_config_proto",
"//tensorflow_serving/util:prometheus_exporter",
"//tensorflow_serving/util:threadpool_executor",
"//tensorflow_serving/util/net_http/server/public:http_server",
"//tensorflow_serving/util/net_http/server/public:http_server_api",
"@com_google_absl//absl/memory",
"@com_google_absl//absl/strings",
"@com_googlesource_code_re2//:re2",
"@org_tensorflow//tensorflow/core:lib",
@@ -318,6 +319,7 @@ cc_library(
"@com_google_absl//absl/memory",
"@org_tensorflow//tensorflow/core:protos_all_cc",
"//tensorflow_serving/config:model_server_config_proto",
"//tensorflow_serving/config:monitoring_config_proto",
"//tensorflow_serving/config:ssl_config_proto",
"//tensorflow_serving/core:availability_preserving_policy",
"//tensorflow_serving/servables/tensorflow:session_bundle_config_proto",
@@ -366,6 +368,7 @@ py_test(
"//tensorflow_serving/servables/tensorflow/testdata:half_plus_two/00000123/export.data-00000-of-00001",
"//tensorflow_serving/servables/tensorflow/testdata:half_plus_two/00000123/export.index",
"//tensorflow_serving/servables/tensorflow/testdata:half_plus_two/00000123/export.meta",
"//tensorflow_serving/servables/tensorflow/testdata:monitoring_config.txt",
"//tensorflow_serving/servables/tensorflow/testdata:saved_model_half_plus_three/00000123/assets/foo.txt",
"//tensorflow_serving/servables/tensorflow/testdata:saved_model_half_plus_three/00000123/saved_model.pb",
"//tensorflow_serving/servables/tensorflow/testdata:saved_model_half_plus_three/00000123/variables/variables.data-00000-of-00001",
49 changes: 48 additions & 1 deletion tensorflow_serving/model_servers/http_server.cc
@@ -14,8 +14,10 @@ limitations under the License.
==============================================================================*/

#include <cstdint>
#include <memory>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/string_view.h"
#include "re2/re2.h"
#include "tensorflow/core/platform/env.h"
@@ -26,6 +28,7 @@ limitations under the License.
#include "tensorflow_serving/util/net_http/server/public/httpserver.h"
#include "tensorflow_serving/util/net_http/server/public/response_code_enum.h"
#include "tensorflow_serving/util/net_http/server/public/server_request_interface.h"
#include "tensorflow_serving/util/prometheus_exporter.h"
#include "tensorflow_serving/util/threadpool_executor.h"

namespace tensorflow {
@@ -79,6 +82,35 @@ net_http::HTTPStatusCode ToHTTPStatusCode(const Status& status) {
}
}

void ProcessPrometheusRequest(PrometheusExporter* exporter,
const PrometheusConfig& prometheus_config,
net_http::ServerRequestInterface* req) {
std::vector<std::pair<string, string>> headers;
headers.push_back({"Content-Type", "text/plain"});
string output;
Status status;
// Check if the request URI matches the configured path.
if (req->uri_path() != prometheus_config.path()) {
output = absl::StrFormat("Unexpected path: %s. Should be %s",
req->uri_path(), prometheus_config.path());
status = Status(error::Code::INVALID_ARGUMENT, output);
} else {
status = exporter->GeneratePage(&output);
}
const net_http::HTTPStatusCode http_status = ToHTTPStatusCode(status);
// Note: we add headers and output even for non-successful statuses, in
// case the output contains details about the error (e.g. error messages).
for (const auto& kv : headers) {
req->OverwriteResponseHeader(kv.first, kv.second);
}
req->WriteResponseString(output);
if (http_status != net_http::HTTPStatusCode::OK) {
VLOG(1) << "Error Processing prometheus metrics request. Error: "
<< status.ToString();
}
req->ReplyWithStatus(http_status);
}

class RequestExecutor final : public net_http::EventExecutor {
public:
explicit RequestExecutor(int num_threads)
@@ -147,7 +179,8 @@ class RestApiRequestDispatcher {
} // namespace

std::unique_ptr<net_http::HTTPServerInterface> CreateAndStartHttpServer(
int port, int num_threads, int timeout_in_ms, ServerCore* core) {
int port, int num_threads, int timeout_in_ms,
const MonitoringConfig& monitoring_config, ServerCore* core) {
auto options = absl::make_unique<net_http::ServerOptions>();
options->AddPort(static_cast<uint32_t>(port));
options->SetExecutor(absl::make_unique<RequestExecutor>(num_threads));
@@ -157,6 +190,20 @@ std::unique_ptr<net_http::HTTPServerInterface> CreateAndStartHttpServer(
return nullptr;
}

// Register handler for prometheus metric endpoint.
if (monitoring_config.prometheus_config().enable()) {
std::shared_ptr<PrometheusExporter> exporter =
std::make_shared<PrometheusExporter>();
net_http::RequestHandlerOptions prometheus_request_options;
PrometheusConfig prometheus_config = monitoring_config.prometheus_config();
server->RegisterRequestHandler(
monitoring_config.prometheus_config().path(),
[exporter, prometheus_config](net_http::ServerRequestInterface* req) {
ProcessPrometheusRequest(exporter.get(), prometheus_config, req);
},
prometheus_request_options);
}

std::shared_ptr<RestApiRequestDispatcher> dispatcher =
std::make_shared<RestApiRequestDispatcher>(timeout_in_ms, core);
net_http::RequestHandlerOptions handler_options;
4 changes: 3 additions & 1 deletion tensorflow_serving/model_servers/http_server.h
@@ -17,6 +17,7 @@ limitations under the License.

#include <memory>

#include "tensorflow_serving/config/monitoring_config.pb.h"
#include "tensorflow_serving/util/net_http/server/public/httpserver_interface.h"

namespace tensorflow {
@@ -30,7 +31,8 @@ class ServerCore;
//
// The returned server is in a state of accepting new requests.
std::unique_ptr<net_http::HTTPServerInterface> CreateAndStartHttpServer(
int port, int num_threads, int timeout_in_ms, ServerCore* core);
int port, int num_threads, int timeout_in_ms,
const MonitoringConfig& monitoring_config, ServerCore* core);

} // namespace serving
} // namespace tensorflow
6 changes: 5 additions & 1 deletion tensorflow_serving/model_servers/main.cc
@@ -135,7 +135,11 @@ int main(int argc, char** argv) {
"Enables model warmup, which triggers lazy "
"initializations (such as TF optimizations) at load "
"time, to reduce first request latency."),
tensorflow::Flag("version", &display_version, "Display version")};
tensorflow::Flag("version", &display_version, "Display version"),
tensorflow::Flag(
"monitoring_config_file", &options.monitoring_config_file,
"If non-empty, read an ascii MonitoringConfig protobuf from "
"the supplied file name")};

const auto& usage = tensorflow::Flags::Usage(argv[0], flag_list);
if (!tensorflow::Flags::Parse(&argc, argv, flag_list)) {
9 changes: 8 additions & 1 deletion tensorflow_serving/model_servers/server.cc
@@ -36,6 +36,7 @@ limitations under the License.
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/protobuf/config.pb.h"
#include "tensorflow_serving/config/model_server_config.pb.h"
#include "tensorflow_serving/config/monitoring_config.pb.h"
#include "tensorflow_serving/config/ssl_config.pb.h"
#include "tensorflow_serving/core/availability_preserving_policy.h"
#include "tensorflow_serving/model_servers/grpc_status_util.h"
@@ -285,9 +286,15 @@ Status Server::BuildAndStart(const Options& server_options) {
if (server_options.http_port != server_options.grpc_port) {
const string server_address =
"localhost:" + std::to_string(server_options.http_port);
MonitoringConfig monitoring_config;
if (!server_options.monitoring_config_file.empty()) {
monitoring_config = ReadProtoFromFile<MonitoringConfig>(
server_options.monitoring_config_file);
}
http_server_ = CreateAndStartHttpServer(
server_options.http_port, server_options.http_num_threads,
server_options.http_timeout_in_ms, server_core_.get());
server_options.http_timeout_in_ms, monitoring_config,
server_core_.get());
if (http_server_ != nullptr) {
LOG(INFO) << "Exporting HTTP/REST API at:" << server_address << " ...";
} else {
1 change: 1 addition & 0 deletions tensorflow_serving/model_servers/server.h
@@ -65,6 +65,7 @@ class Server {
tensorflow::string ssl_config_file;
string model_config_file;
bool enable_model_warmup = true;
tensorflow::string monitoring_config_file;

Options();
};
30 changes: 30 additions & 0 deletions tensorflow_serving/model_servers/tensorflow_model_server_test.py
@@ -117,6 +117,7 @@ def GetArgsKey(*args, **kwargs):
def RunServer(model_name,
model_path,
model_config_file=None,
monitoring_config_file=None,
batching_parameters_file=None,
grpc_channel_arguments='',
wait_for_server_ready=True,
@@ -131,6 +132,7 @@ def RunServer(model_name,
model_name: Name of model.
model_path: Path to model.
model_config_file: Path to model config file.
monitoring_config_file: Path to the monitoring config file.
batching_parameters_file: Path to batching parameters.
grpc_channel_arguments: Custom gRPC args for server.
wait_for_server_ready: Wait for gRPC port to be ready.
@@ -165,6 +167,9 @@ def RunServer(model_name,
else:
raise ValueError('Both model_config_file and model_path cannot be empty!')

if monitoring_config_file:
command += ' --monitoring_config_file=' + monitoring_config_file

if batching_parameters_file:
command += ' --enable_batching'
command += ' --batching_parameters_file=' + batching_parameters_file
@@ -287,6 +292,10 @@ def _GetBatchingParametersFile(self):
"""Returns a path to a batching configuration file."""
return os.path.join(self.testdata_dir, 'batching_config.txt')

def _GetMonitoringConfigFile(self):
"""Returns a path to a monitoring configuration file."""
return os.path.join(self.testdata_dir, 'monitoring_config.txt')

def _VerifyModelSpec(self,
actual_model_spec,
exp_model_name,
@@ -642,6 +651,27 @@ def testGetStatusREST(self):
}]
})

def testPrometheusEndpoint(self):
"""Test ModelStatus implementation over REST API with columnar inputs."""
model_path = self._GetSavedModelBundlePath()
host, port = TensorflowModelServerTest.RunServer(
'default',
model_path,
monitoring_config_file=self._GetMonitoringConfigFile())[2].split(':')

# Prepare request
url = 'http://{}:{}/monitoring/prometheus/metrics'.format(host, port)

# Send request
resp_data = None
try:
resp_data = CallREST(url, None)
except Exception as e: # pylint: disable=broad-except
self.fail('Request failed with error: {}'.format(e))

# Verify that the response contains metric type information.
self.assertIn('# TYPE', resp_data)


if __name__ == '__main__':
tf.test.main()
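Outside the test harness, the same check can be reproduced with the standard library alone; a minimal sketch, assuming a server is already running on localhost:8501 with the monitoring config above:

    import urllib.request

    # Scrape the Prometheus endpoint and check for exposition-format
    # metadata lines such as '# TYPE <metric> <type>'.
    url = 'http://localhost:8501/monitoring/prometheus/metrics'
    body = urllib.request.urlopen(url, timeout=5).read().decode('utf-8')
    assert '# TYPE' in body, 'expected metric type metadata in response'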
1 change: 1 addition & 0 deletions tensorflow_serving/servables/tensorflow/testdata/BUILD
@@ -86,4 +86,5 @@ exports_files([
"good_model_config.txt",
"bad_model_config.txt",
"batching_config.txt",
"monitoring_config.txt",
])
4 changes: 4 additions & 0 deletions tensorflow_serving/servables/tensorflow/testdata/monitoring_config.txt
@@ -0,0 +1,4 @@
prometheus_config: {
enable: true,
path: "/monitoring/prometheus/metrics"
}
24 changes: 24 additions & 0 deletions tensorflow_serving/util/BUILD
@@ -62,6 +62,18 @@ cc_library(
],
)

cc_library(
name = "prometheus_exporter",
srcs = ["prometheus_exporter.cc"],
hdrs = ["prometheus_exporter.h"],
deps = [
"@com_google_absl//absl/strings",
"@com_google_absl//absl/strings:str_format",
"@com_googlesource_code_re2//:re2",
"@org_tensorflow//tensorflow/core:lib",
],
)

###############################################################################
# Internal targets
###############################################################################
@@ -87,6 +99,18 @@ cc_test(
],
)

cc_test(
name = "prometheus_exporter_test",
size = "small",
srcs = ["prometheus_exporter_test.cc"],
deps = [
":prometheus_exporter",
"//tensorflow_serving/core/test_util:test_main",
"@com_google_absl//absl/strings",
"@org_tensorflow//tensorflow/core:lib",
],
)

cc_test(
name = "event_bus_test",
size = "small",