Skip to content

Commit

Permalink
Add new server_status metric
Browse files Browse the repository at this point in the history
This commit introduces a new boolean metric, server_status, with
server IP and server port labels.

The metric represents the active or inactive state of a RADIUS
server to which the eradius proxy client tries to send RADIUS requests.
By default, all possible endpoints are set to inactive. If the eradius
client sends a request and receives a successful response, that primary
or secondary RADIUS server is set to 'active' and all other
servers from the given server pool are marked as inactive.
  • Loading branch information
0xAX committed Jul 26, 2021
1 parent acafb5b commit 62aa7c1
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 9 deletions.
4 changes: 3 additions & 1 deletion rebar.config
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
deprecated_functions]}.

{xref_ignores, [{prometheus_histogram, declare, 1},
{prometheus_histogram, observe, 3}]}.
{prometheus_histogram, observe, 3},
{prometheus_boolean, declare, 1},
{prometheus_boolean, set, 3}]}.


%% == Plugins ==
Expand Down
18 changes: 18 additions & 0 deletions src/eradius_client.erl
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ proceed_response(Request, {ok, Response, Secret, Authenticator}, _Peer = {_Serve
?LOG(error, "~s INF: Noreply for request ~p. Could not decode the request, reason: ~s", [printable_peer(ServerIP, Port), Request, Reason]),
maybe_failover(Request, noreply, {ServerIP, Port}, Options);
Decoded ->
update_server_status_metric(ServerIP, Port, true, Options),
update_client_response(Decoded#radius_request.cmd, MetricsInfo, Request),
{ok, Response, Authenticator}
end;
Expand All @@ -190,6 +191,7 @@ proceed_response(Request, Response, {_ServerName, {ServerIP, Port}}, TS1, Metric
maybe_failover(Request, Response, {ServerIP, Port}, Options).

maybe_failover(Request, Response, {ServerIP, Port}, Options) ->
update_server_status_metric(ServerIP, Port, false, Options),
case proplists:get_value(failover, Options, []) of
[] ->
Response;
Expand Down Expand Up @@ -443,6 +445,7 @@ store_upstream_servers(Server) ->

%% private
store_radius_server_from_pool(Addr, Port, Retries) when is_tuple(Addr) and is_integer(Port) and is_integer(Retries) ->
eradius_counter:set_boolean_metric(server_status, [Addr, Port], false),
ets:insert(?MODULE, {{Addr, Port}, Retries, Retries});
store_radius_server_from_pool(Addr, _, _) ->
?LOG(error, "bad IP address specified in RADIUS servers pool configuration ~p", [Addr]),
Expand Down Expand Up @@ -562,6 +565,21 @@ inc_responses_counter_accounting(MetricsInfo, #radius_request{attrs = Attrs}) ->
inc_responses_counter_accounting(_, _) ->
ok.

%% Update the `server_status' boolean metric for the upstream RADIUS server
%% identified by IP and Port.
%%
%% When the third argument is `false', only the given server is marked as
%% inactive. When it is `true', every server listed under the `failover' key
%% of Options is first marked as inactive and then the given server is marked
%% as active, so at most one server of the set is reported as active.
update_server_status_metric(IP, Port, false, _Options) ->
    eradius_counter:set_boolean_metric(server_status, [IP, Port], false);
update_server_status_metric(IP, Port, true, Options) ->
    %% The failover entries are destructured in the fun heads with fresh
    %% variable names. Reusing the already bound IP and Port in a pattern
    %% inside the fun body would only match the server that has just answered
    %% and would leave every other failover server untouched.
    MarkInactive =
        fun ({FailoverIP, FailoverPort, _Secret}) ->
                eradius_counter:set_boolean_metric(server_status, [FailoverIP, FailoverPort], false);
            ({FailoverIP, FailoverPort, _Secret, _Opts}) ->
                eradius_counter:set_boolean_metric(server_status, [FailoverIP, FailoverPort], false);
            (_Other) ->
                %% entries of an unexpected shape are ignored
                ok
        end,
    lists:foreach(MarkInactive, proplists:get_value(failover, Options, [])),
    %% Set the answering server last, so it ends up active even when it is
    %% itself part of the failover list.
    eradius_counter:set_boolean_metric(server_status, [IP, Port], true).

%% check if we can use persistent_term for config
%% persistent term was added in OTP 21.2 but we can't
%% check minor versions with macros so we're stuck waiting
Expand Down
19 changes: 18 additions & 1 deletion src/eradius_counter.erl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@

-module(eradius_counter).
-export([init_counter/1, init_counter/2, inc_counter/2, dec_counter/2, reset_counter/1, reset_counter/2,
inc_request_counter/2, inc_reply_counter/2, observe/4, observe/5]).
inc_request_counter/2, inc_reply_counter/2, observe/4, observe/5,
set_boolean_metric/3]).

-behaviour(gen_server).
-export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, code_change/3]).
Expand Down Expand Up @@ -82,6 +83,22 @@ aggregate({Servers, {ResetTS, Nass}}) ->
NSum1 = [Value || {_Key, Value} <- orddict:to_list(NSums)],
{Servers, {ResetTS, NSum1}}.

%% @doc Set the prometheus boolean metric `Name' with the label values `Labels'
%% to `Value'.
%%
%% Nothing is done and `ok' is returned when the `prometheus' module is not
%% loaded. When the metric has not been declared yet, the `server_status'
%% boolean (labels `server_ip' and `server_port') is declared and the update is
%% retried once. Note that only `server_status' is ever declared here, so any
%% other `Name' must already be declared by the caller.
set_boolean_metric(Name, Labels, Value) ->
    case code:is_loaded(prometheus) of
        {file, _} ->
            try
                prometheus_boolean:set(Name, Labels, Value)
            catch
                %% Only a missing declaration can be repaired by declaring the
                %% metric. prometheus_boolean:set/3 is documented to raise
                %% {unknown_metric, Registry, Name} in that case. Any other
                %% failure (wrong label arity, invalid value, ...) is left to
                %% propagate instead of being masked by a pointless retry.
                error:{unknown_metric, _Registry, _Metric} ->
                    prometheus_boolean:declare([{name, server_status}, {labels, [server_ip, server_port]},
                                                {help, "Status of an upstream RADIUS Server"}]),
                    prometheus_boolean:set(Name, Labels, Value)
            end;
        _ ->
            ok
    end.

%% @doc Update the given histogram metric value
%% NOTE: We use prometheus_histogram collector here instead of eradius_counter ets table because
%% it is much easy to use histograms in this way. As we don't need to manage buckets and do
Expand Down
35 changes: 28 additions & 7 deletions test/eradius_metrics_SUITE.erl
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
-include_lib("eradius/include/eradius_lib.hrl").
-include_lib("eradius/include/eradius_dict.hrl").
-include_lib("eradius/include/dictionary.hrl").
-include("eradius_test.hrl").

-define(SECRET, <<"secret">>).
-define(ATTRS_GOOD, [{?NAS_Identifier, "good"}, {?RStatus_Type, ?RStatus_Type_Start}]).
Expand All @@ -39,10 +40,15 @@ init_per_suite(Config) ->
application:load(eradius),
EradiusConfig = [{radius_callback, ?MODULE},
{servers, [{good, {eradius_test_handler:localhost(ip), [1812]}}, %% for 'positive' responses, e.g. access accepts
{bad, {eradius_test_handler:localhost(ip), [1813]}}, %% for 'negative' responses, e.g. coa naks
{error, {eradius_test_handler:localhost(ip), [1814]}} %% here things go wrong, e.g. duplicate requests
{bad, {eradius_test_handler:localhost(ip), [1813]}}, %% for 'negative' responses, e.g. coa naks
{error, {eradius_test_handler:localhost(ip), [1814]}} %% here things go wrong, e.g. duplicate requests
]},
{session_nodes, [node()]},
{servers_pool,
[{test_pool, [{eradius_test_handler:localhost(tuple), 1814, ?SECRET},
{eradius_test_handler:localhost(tuple), 1813, ?SECRET},
{eradius_test_handler:localhost(tuple), 1812, ?SECRET}]}]
},
{good, [
{ {"good", [] }, [{"127.0.0.2", ?SECRET, [{nas_id, <<"good_nas">>}]}] }
]},
Expand All @@ -59,6 +65,9 @@ init_per_suite(Config) ->
],
[application:set_env(eradius, Key, Value) || {Key, Value} <- EradiusConfig],
application:set_env(prometheus, collectors, [eradius_prometheus_collector]),
% prometheus is not included directly to eradius but prometheus_eradius_collector
% should include it
application:ensure_all_started(prometheus),
{ok, _} = application:ensure_all_started(eradius),
spawn(fun() ->
eradius:modules_ready([?MODULE]),
Expand All @@ -68,9 +77,9 @@ init_per_suite(Config) ->

%% Suite teardown: stop the eradius and prometheus applications, in that
%% order. The results of application:stop/1 are deliberately ignored.
end_per_suite(_Config) ->
    lists:foreach(fun application:stop/1, [eradius, prometheus]),
    ok.


%% tests
good_requests(_Config) ->
Requests = [{request, access, access_accept},
Expand Down Expand Up @@ -107,23 +116,32 @@ check_single_request(good, EradiusRequestType, _RequestType, _ResponseType) ->
ok = check_metric(EradiusRequestType, client_accounting_requests_total, [{server_name, good}, {acct_type, update}], 0),
ok = check_metric(client_accept_responses_total, [{server_name, good}], 1),
ok = check_metric(accept_responses_total, [{server_name, good}], 1),
ok = check_metric(access_requests_total, [{server_name, good}], 1);
ok = check_metric(access_requests_total, [{server_name, good}], 1),
ok = check_metric(server_status, true, [eradius_test_handler:localhost(tuple), 1812]);
check_single_request(bad, EradiusRequestType, _RequestType, _ResponseType) ->
ok = send_request(EradiusRequestType, eradius_test_handler:localhost(tuple), 1813, ?ATTRS_BAD, [{server_name, bad}, {client_name, test}]),
ok = check_metric(client_access_requests_total, [{server_name, bad}], 1),
ok = check_metric(client_reject_responses_total, [{server_name, bad}], 1),
ok = check_metric(access_requests_total, [{server_name, bad}], 1),
ok = check_metric(reject_responses_total, [{server_name, bad}], 1);
ok = check_metric(reject_responses_total, [{server_name, bad}], 1),
ok = check_metric(server_status, true, [eradius_test_handler:localhost(tuple), 1813]);
check_single_request(error, EradiusRequestType, _RequestType, _ResponseType) ->
eradius_client:reconfigure(),
ok = send_request(EradiusRequestType, eradius_test_handler:localhost(tuple), 1814, ?ATTRS_ERROR,
[{server_name, error}, {client_name, test}, {timeout, 1000}]),
[{server_name, error}, {client_name, test}, {timeout, 1000},
{failover, [{eradius_test_handler:localhost(tuple), 1812, ?SECRET}]}]),
ok = check_metric(client_access_requests_total, [{server_name, error}], 1),
ok = check_metric(client_retransmissions_total, [{server_name, error}], 1),
ok = check_metric(access_requests_total, [{server_name, error}], 1),
ok = check_metric(accept_responses_total, [{server_name, error}], 1),
ok = check_metric(duplicated_requests_total, [{server_name, error}], 1),
ok = check_metric(client_requests_total, [{server_name, error}], 1),
ok = check_metric(requests_total, [{server_name, error}], 2).
ok = check_metric(requests_total, [{server_name, error}], 2),
ok = check_metric(server_status, false, [eradius_test_handler:localhost(tuple), 1812]),
ok = check_metric(server_status, false, [eradius_test_handler:localhost(tuple), 1813]),
ok = check_metric(server_status, true, [eradius_test_handler:localhost(tuple), 1814]),
ok = check_metric(server_status, undefined, [eradius_test_handler:localhost(tuple), 1815]).


check_total_requests(good, N) ->
ok = check_metric(requests_total, [{server_name, good}], N),
Expand Down Expand Up @@ -154,6 +172,9 @@ check_metric(accreq, Id, Labels, Count) ->
check_metric(_, _, _, _) ->
ok.

check_metric(server_status, Value, Labels) ->
?equal(Value, prometheus_boolean:value(server_status, Labels)),
ok;
check_metric(Id, Labels, Count) ->
case eradius_prometheus_collector:fetch_counter(Id, Labels) of
[{Count, _}] ->
Expand Down

0 comments on commit 62aa7c1

Please sign in to comment.