Skip to content

Commit

Permalink
Retry db connections on econnrefused. (#1455)
Browse files Browse the repository at this point in the history
Fix #465

This fixes a problem where a site crashes hard during a PostgreSQL database restart.
On an econnrefused error the db pool entry will try to reconnect a number of times.
The periodic queue pollers now gracefully handle timeout and econnrefused errors.

This also fixes a problem with merging config keys.
  • Loading branch information
mworrell committed Sep 28, 2016
1 parent aab627a commit a7152de
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 26 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ copy_file("config.in", FromPath, ToPath, Options) ->
% Merge config files
{ok, [Config]} = file:consult(FnConfig),
{ok, [ConfigIn]} = file:consult(ToPath),
MergedConfigs = lists:keymerge(1, lists:sort(Config), lists:sort(ConfigIn)),
MergedConfigs = lists:ukeymerge(1, lists:sort(Config), lists:sort(ConfigIn)),
io_lib:format("~p.", [normalize_options(MergedConfigs)]);
false ->
Outfile
Expand Down
60 changes: 40 additions & 20 deletions src/db/z_db_pgsql.erl
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@

-define(IDLE_TIMEOUT, 60000).

-define(CONNECT_RETRIES, 10).
-define(CONNECT_RETRY_SLEEP, 5000).
-define(CONNECT_RETRIES, 5).
-define(CONNECT_RETRY_SLEEP, 10000).

-record(state, {conn, conn_args}).

Expand All @@ -55,7 +55,7 @@
%% API
%%

start_link(Args) ->
start_link(Args) when is_list(Args) ->
gen_server:start_link(?MODULE, Args, []).

test_connection(Args) ->
Expand Down Expand Up @@ -93,6 +93,7 @@ get_raw_connection(#context{dbc=Worker}) when Worker =/= undefined ->

%% gen_server init callback. Stores the connection arguments in the state and
%% starts without an open database connection (conn is undefined).
init(Args) ->
%% Start disconnected
%% Trap exits so that exit signals from linked processes arrive as
%% {'EXIT', Pid, Reason} messages instead of terminating this process.
process_flag(trap_exit, true),
%% The third element asks gen_server to send a 'timeout' message when no
%% other message arrives within ?IDLE_TIMEOUT milliseconds.
{ok, #state{conn=undefined, conn_args=Args}, ?IDLE_TIMEOUT}.


Expand All @@ -107,6 +108,7 @@ handle_call(Cmd, _From, #state{conn=undefined, conn_args=Args}=State) ->
handle_call({squery, Sql}, _From, #state{conn=Conn}=State) ->
{reply, decode_reply(epgsql:squery(Conn, Sql)), State, ?IDLE_TIMEOUT};


handle_call({equery, Sql, Params}, _From, #state{conn=Conn}=State) ->
{reply, decode_reply(epgsql:equery(Conn, Sql, encode_values(Params))), State, ?IDLE_TIMEOUT};

Expand All @@ -123,6 +125,11 @@ handle_cast(_Msg, State) ->

%% gen_server handle_info callback.
%%
%% Handles the idle timeout and the 'EXIT' messages that are delivered because
%% init/1 sets trap_exit:
%%   timeout                       - nothing happened for ?IDLE_TIMEOUT ms, so
%%                                   the connection is closed via disconnect/1.
%%   {'EXIT', _, econnrefused}     - a refused connection attempt; ignored here
%%                                   because connect/2 does the retrying.
%%   {'EXIT', _, Reason}           - any other exit of a linked process stops
%%                                   this worker with reason 'normal'.
%%   anything else                 - ignored.
handle_info(timeout, State) ->
    {noreply, disconnect(State)};
handle_info({'EXIT', _Pid, econnrefused}, State) ->
    % Handled in the connect retry loop.
    % These messages can be processed after a retry succeeded, when a
    % connection is open. Returning without a timeout would cancel the idle
    % timer, so re-arm it just like the catch-all clause does.
    {noreply, State, ?IDLE_TIMEOUT};
handle_info({'EXIT', _Pid, _Reason}, State) ->
    {stop, normal, State};
handle_info(_Info, State) ->
    {noreply, State, ?IDLE_TIMEOUT}.

Expand All @@ -142,7 +149,7 @@ code_change(_OldVsn, State, _Extra) ->
%% Open a database connection for the given argument list.
%% Delegates to connect/2 with the retry counter starting at 0.
connect(Args) when is_list(Args) ->
connect(Args, 0).

connect(_Args, RetryCt) when RetryCt > ?CONNECT_RETRIES ->
connect(_Args, RetryCt) when RetryCt >= ?CONNECT_RETRIES ->
{error, econnrefused};
connect(Args, RetryCt) ->
Hostname = get_arg(dbhost, Args),
Expand All @@ -151,25 +158,38 @@ connect(Args, RetryCt) ->
Username = get_arg(dbuser, Args),
Password = get_arg(dbpassword, Args),
Schema = get_arg(dbschema, Args),
case epgsql:connect(Hostname, Username, Password,
[{database, Database}, {port, Port}]) of
{ok, Conn} ->
case epgsql:squery(Conn, "SET search_path TO " ++ Schema) of
{ok, [], []} ->
{ok, Conn};
Error ->
epgsql:close(Conn),
{error, Error}
end;
{error, econnrefused} ->
try
case epgsql:connect(Hostname, Username, Password,
[{database, Database}, {port, Port}]) of
{ok, Conn} ->
case epgsql:squery(Conn, "SET search_path TO " ++ Schema) of
{ok, [], []} ->
{ok, Conn};
Error ->
catch epgsql:close(Conn),
{error, Error}
end;
{error, econnrefused} ->
lager:warning("psql connection to ~p:~p refused, retrying in ~p sec (~p)",
[Hostname, Port, ?CONNECT_RETRY_SLEEP div 1000, self()]),
timer:sleep(?CONNECT_RETRY_SLEEP),
connect(Args, RetryCt+1);
{error, _} = E ->
lager:warning("psql connection to ~p:~p returned error ~p",
[Hostname, Port, E]),
E
end
catch
A:B ->
?DEBUG({A,B}),
timer:sleep(?CONNECT_RETRY_SLEEP),
connect(Args, RetryCt+1);
{error, _} = E ->
E
connect(Args, RetryCt+1)
end.

disconnect(State) ->
epgsql:close(State#state.conn),
disconnect(#state{conn=undefined} = State) ->
State;
disconnect(#state{conn=Conn} = State) ->
_ = epgsql:close(Conn),
State#state{conn=undefined}.

get_arg(K, Args) ->
Expand Down
2 changes: 1 addition & 1 deletion src/db/z_db_pool.erl
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ db_opts(SiteProps) ->
{dbuser, z_config:get(dbuser, "zotonic")},
{dbdatabase, z_config:get(dbdatabase, "zotonic")},
{dbschema, z_config:get(dbschema, "public")}],
lists:keymerge(1, lists:sort(Kvs), lists:sort(Defaults)).
lists:ukeymerge(1, lists:sort(Kvs), lists:sort(Defaults)).

%% Check out a worker from the poolboy pool referenced by the first element
%% of the context's db tuple. Returns whatever poolboy:checkout/1 returns.
get_connection(#context{db={Pool,_}}) ->
poolboy:checkout(Pool).
Expand Down
4 changes: 2 additions & 2 deletions src/support/z_edge_log_server.erl
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,8 @@ do_check(Site) ->
Context),
Context)
catch
throw:{error, econnrefused} ->
{ok, 0}
exit:{timeout, _} -> {ok, 0};
throw:{error, econnrefused} -> false
end.


Expand Down
4 changes: 2 additions & 2 deletions src/support/z_pivot_rsc.erl
Original file line number Diff line number Diff line change
Expand Up @@ -371,8 +371,8 @@ do_poll(Context) ->
DidTask = do_poll_task(Context),
do_poll_queue(Context) or DidTask
catch
throw:{error, econnrefused} ->
false
exit:{timeout, _} -> false;
throw:{error, econnrefused} -> false
end.

do_poll_task(Context) ->
Expand Down

0 comments on commit a7152de

Please sign in to comment.