From 62aa7c142b02975709b5d8bc320bd4e02002de34 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 22 Jul 2021 15:17:48 +0600 Subject: [PATCH] Add new server_status metric This commit introduces a new boolean metric - server_status - with server IP and server port labels. The metric represents the active or inactive state of a RADIUS server to which the eradius proxy client tries to send RADIUS requests. By default all possible endpoints are set to inactive. If the eradius client sends a request and receives a successful response, that primary or secondary RADIUS server is set to 'active' and all other servers from the given server pool are marked as inactive. --- rebar.config | 4 +++- src/eradius_client.erl | 18 +++++++++++++++++ src/eradius_counter.erl | 19 +++++++++++++++++- test/eradius_metrics_SUITE.erl | 35 +++++++++++++++++++++++++++------- 4 files changed, 67 insertions(+), 9 deletions(-) diff --git a/rebar.config b/rebar.config index 7ade81f4..24fee02c 100644 --- a/rebar.config +++ b/rebar.config @@ -19,7 +19,9 @@ deprecated_functions]}. {xref_ignores, [{prometheus_histogram, declare, 1}, - {prometheus_histogram, observe, 3}]}. + {prometheus_histogram, observe, 3}, + {prometheus_boolean, declare, 1}, + {prometheus_boolean, set, 3}]}. %% == Plugins == diff --git a/src/eradius_client.erl b/src/eradius_client.erl index 0c135874..87a81460 100644 --- a/src/eradius_client.erl +++ b/src/eradius_client.erl @@ -180,6 +180,7 @@ proceed_response(Request, {ok, Response, Secret, Authenticator}, _Peer = {_Serve ?LOG(error, "~s INF: Noreply for request ~p. 
Could not decode the request, reason: ~s", [printable_peer(ServerIP, Port), Request, Reason]), maybe_failover(Request, noreply, {ServerIP, Port}, Options); Decoded -> + update_server_status_metric(ServerIP, Port, true, Options), update_client_response(Decoded#radius_request.cmd, MetricsInfo, Request), {ok, Response, Authenticator} end; @@ -190,6 +191,7 @@ proceed_response(Request, Response, {_ServerName, {ServerIP, Port}}, TS1, Metric maybe_failover(Request, Response, {ServerIP, Port}, Options). maybe_failover(Request, Response, {ServerIP, Port}, Options) -> + update_server_status_metric(ServerIP, Port, false, Options), case proplists:get_value(failover, Options, []) of [] -> Response; @@ -443,6 +445,7 @@ store_upstream_servers(Server) -> %% private store_radius_server_from_pool(Addr, Port, Retries) when is_tuple(Addr) and is_integer(Port) and is_integer(Retries) -> + eradius_counter:set_boolean_metric(server_status, [Addr, Port], false), ets:insert(?MODULE, {{Addr, Port}, Retries, Retries}); store_radius_server_from_pool(Addr, _, _) -> ?LOG(error, "bad IP address specified in RADIUS servers pool configuration ~p", [Addr]), @@ -562,6 +565,21 @@ inc_responses_counter_accounting(MetricsInfo, #radius_request{attrs = Attrs}) -> inc_responses_counter_accounting(_, _) -> ok.
+%% Publish the state of upstream server IP:Port through the server_status boolean metric.
+%% A success marks every server listed in the failover option inactive, then IP:Port active.
+update_server_status_metric(IP, Port, false, _Options) ->
+    eradius_counter:set_boolean_metric(server_status, [IP, Port], false);
+update_server_status_metric(IP, Port, true, Options) ->
+    lists:foreach(fun ({ServerIP, ServerPort, _Secret}) -> %% fresh names: IP/Port are already bound
+                          eradius_counter:set_boolean_metric(server_status, [ServerIP, ServerPort], false);
+                      ({ServerIP, ServerPort, _Secret, _Opts}) ->
+                          eradius_counter:set_boolean_metric(server_status, [ServerIP, ServerPort], false);
+                      (_) ->
+                          ok
+                  end, proplists:get_value(failover, Options, [])),
+    %% Set last: the responding server stays active even if it is also listed in failover.
+    eradius_counter:set_boolean_metric(server_status, [IP, Port], true).
+ %% check if we can use persistent_term for config %% persistent term was added in OTP 21.2 but we can't %% check minor versions with macros so we're stuck waiting diff --git a/src/eradius_counter.erl b/src/eradius_counter.erl index e5b81733..7e95c18c 100644 --- a/src/eradius_counter.erl +++ b/src/eradius_counter.erl @@ -3,7 +3,8 @@ -module(eradius_counter). -export([init_counter/1, init_counter/2, inc_counter/2, dec_counter/2, reset_counter/1, reset_counter/2, - inc_request_counter/2, inc_reply_counter/2, observe/4, observe/5]). + inc_request_counter/2, inc_reply_counter/2, observe/4, observe/5, + set_boolean_metric/3]). -behaviour(gen_server). -export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, code_change/3]). @@ -82,6 +83,22 @@ aggregate({Servers, {ResetTS, Nass}}) -> NSum1 = [Value || {_Key, Value} <- orddict:to_list(NSums)], {Servers, {ResetTS, NSum1}}.
+%% @doc Set the prometheus boolean metric Name, for the given label values, to Value.
+%% Returns ok without doing anything when the prometheus module is not loaded.
+set_boolean_metric(Name, Labels, Value) ->
+    case code:is_loaded(prometheus) of
+        {file, _} ->
+            try
+                prometheus_boolean:set(Name, Labels, Value)
+            catch _:_ -> %% presumably Name is not declared yet: declare it (server labels) and retry
+                    prometheus_boolean:declare([{name, Name}, {labels, [server_ip, server_port]},
+                                                {help, "Status of an upstream RADIUS Server"}]),
+                    prometheus_boolean:set(Name, Labels, Value)
+            end;
+        _ ->
+            ok
+    end.
+ %% @doc Update the given histogram metric value %% NOTE: We use prometheus_histogram collector here instead of eradius_counter ets table because %% it is much easy to use histograms in this way. As we don't need to manage buckets and do diff --git a/test/eradius_metrics_SUITE.erl b/test/eradius_metrics_SUITE.erl index c2fad59a..63fb46c2 100644 --- a/test/eradius_metrics_SUITE.erl +++ b/test/eradius_metrics_SUITE.erl @@ -24,6 +24,7 @@ -include_lib("eradius/include/eradius_lib.hrl"). -include_lib("eradius/include/eradius_dict.hrl"). -include_lib("eradius/include/dictionary.hrl"). +-include("eradius_test.hrl").
-define(SECRET, <<"secret">>). -define(ATTRS_GOOD, [{?NAS_Identifier, "good"}, {?RStatus_Type, ?RStatus_Type_Start}]). @@ -39,10 +40,15 @@ init_per_suite(Config) -> application:load(eradius), EradiusConfig = [{radius_callback, ?MODULE}, {servers, [{good, {eradius_test_handler:localhost(ip), [1812]}}, %% for 'positive' responses, e.g. access accepts - {bad, {eradius_test_handler:localhost(ip), [1813]}}, %% for 'negative' responses, e.g. coa naks - {error, {eradius_test_handler:localhost(ip), [1814]}} %% here things go wrong, e.g. duplicate requests + {bad, {eradius_test_handler:localhost(ip), [1813]}}, %% for 'negative' responses, e.g. coa naks + {error, {eradius_test_handler:localhost(ip), [1814]}} %% here things go wrong, e.g. duplicate requests ]}, {session_nodes, [node()]}, + {servers_pool, + [{test_pool, [{eradius_test_handler:localhost(tuple), 1814, ?SECRET}, + {eradius_test_handler:localhost(tuple), 1813, ?SECRET}, + {eradius_test_handler:localhost(tuple), 1812, ?SECRET}]}] + }, {good, [ { {"good", [] }, [{"127.0.0.2", ?SECRET, [{nas_id, <<"good_nas">>}]}] } ]}, @@ -59,6 +65,9 @@ init_per_suite(Config) -> ], [application:set_env(eradius, Key, Value) || {Key, Value} <- EradiusConfig], application:set_env(prometheus, collectors, [eradius_prometheus_collector]), + % prometheus is not included directly to eradius but prometheus_eradius_collector + % should include it + application:ensure_all_started(prometheus), {ok, _} = application:ensure_all_started(eradius), spawn(fun() -> eradius:modules_ready([?MODULE]), @@ -68,9 +77,9 @@ init_per_suite(Config) -> end_per_suite(_Config) -> application:stop(eradius), + application:stop(prometheus), ok. 
- %% tests good_requests(_Config) -> Requests = [{request, access, access_accept}, @@ -107,23 +116,32 @@ check_single_request(good, EradiusRequestType, _RequestType, _ResponseType) -> ok = check_metric(EradiusRequestType, client_accounting_requests_total, [{server_name, good}, {acct_type, update}], 0), ok = check_metric(client_accept_responses_total, [{server_name, good}], 1), ok = check_metric(accept_responses_total, [{server_name, good}], 1), - ok = check_metric(access_requests_total, [{server_name, good}], 1); + ok = check_metric(access_requests_total, [{server_name, good}], 1), + ok = check_metric(server_status, true, [eradius_test_handler:localhost(tuple), 1812]); check_single_request(bad, EradiusRequestType, _RequestType, _ResponseType) -> ok = send_request(EradiusRequestType, eradius_test_handler:localhost(tuple), 1813, ?ATTRS_BAD, [{server_name, bad}, {client_name, test}]), ok = check_metric(client_access_requests_total, [{server_name, bad}], 1), ok = check_metric(client_reject_responses_total, [{server_name, bad}], 1), ok = check_metric(access_requests_total, [{server_name, bad}], 1), - ok = check_metric(reject_responses_total, [{server_name, bad}], 1); + ok = check_metric(reject_responses_total, [{server_name, bad}], 1), + ok = check_metric(server_status, true, [eradius_test_handler:localhost(tuple), 1813]); check_single_request(error, EradiusRequestType, _RequestType, _ResponseType) -> + eradius_client:reconfigure(), ok = send_request(EradiusRequestType, eradius_test_handler:localhost(tuple), 1814, ?ATTRS_ERROR, - [{server_name, error}, {client_name, test}, {timeout, 1000}]), + [{server_name, error}, {client_name, test}, {timeout, 1000}, + {failover, [{eradius_test_handler:localhost(tuple), 1812, ?SECRET}]}]), ok = check_metric(client_access_requests_total, [{server_name, error}], 1), ok = check_metric(client_retransmissions_total, [{server_name, error}], 1), ok = check_metric(access_requests_total, [{server_name, error}], 1), ok = 
check_metric(accept_responses_total, [{server_name, error}], 1), ok = check_metric(duplicated_requests_total, [{server_name, error}], 1), ok = check_metric(client_requests_total, [{server_name, error}], 1), - ok = check_metric(requests_total, [{server_name, error}], 2). + ok = check_metric(requests_total, [{server_name, error}], 2), + ok = check_metric(server_status, false, [eradius_test_handler:localhost(tuple), 1812]), + ok = check_metric(server_status, false, [eradius_test_handler:localhost(tuple), 1813]), + ok = check_metric(server_status, true, [eradius_test_handler:localhost(tuple), 1814]), + ok = check_metric(server_status, undefined, [eradius_test_handler:localhost(tuple), 1815]). + check_total_requests(good, N) -> ok = check_metric(requests_total, [{server_name, good}], N), @@ -154,6 +172,9 @@ check_metric(accreq, Id, Labels, Count) -> check_metric(_, _, _, _) -> ok. +check_metric(server_status, Value, Labels) -> + ?equal(Value, prometheus_boolean:value(server_status, Labels)), + ok; check_metric(Id, Labels, Count) -> case eradius_prometheus_collector:fetch_counter(Id, Labels) of [{Count, _}] ->