Permalink
Browse files

Filling out the supervision tree; avoid orphans.

  • Loading branch information...
zxq9 committed Nov 12, 2014
1 parent dce21ad commit 897637bf65a141cd5fd901f720d75cca9a8544bb
Showing with 38 additions and 13 deletions.
  1. +12 −8 erlmud-0.1/chanman.erl
  2. +2 −2 erlmud-0.1/channel.erl
  3. +18 −0 erlmud-0.1/erlmud.erl
  4. +6 −3 erlmud-0.1/telcon.erl
View
@@ -19,6 +19,7 @@ starter(Spawn, Parent, Conf) ->
end.
init(Parent, Conf) ->
+ process_flag(trap_exit, true),
note("Notional initialization with ~tp.", [Conf]),
Channels = [],
loop(Parent, Channels).
@@ -31,11 +32,14 @@ loop(Parent, Channels) ->
From ! {Ref, ChanPid},
loop(Parent, NewChannels);
{From, Ref, list} ->
- List = [Name || {Name, _, _} <- Channels],
+ List = [Name || {Name, _} <- Channels],
From ! {Ref, List},
loop(Parent, Channels);
- Message = {'DOWN', _, process, _, _} ->
- NewChannels = handle_down(Channels, Message),
+ {'EXIT', Parent, Reason} ->
+ note("Parent~tp died with ~tp~nFollowing my leige!~n...Blarg!", [Parent, Reason]),
+ exit(parent_died);
+ Message = {'EXIT', _, _} ->
+ NewChannels = handle_exit(Channels, Message),
loop(Parent, NewChannels);
status ->
note("Channels ~p", [Channels]),
@@ -53,17 +57,17 @@ loop(Parent, Channels) ->
%% Magic
acquire(Channel, Channels) ->
case lists:keyfind(Channel, 1, Channels) of
- {_, Pid, _} ->
+ {_, Pid} ->
{Pid, Channels};
false ->
Conf = {Channel, [], []},
- {Pid, Ref} = channel:start_monitor(self(), Conf),
- NewChannels = [{Channel, Pid, Ref} | Channels],
+ Pid = channel:start_link(self(), Conf),
+ NewChannels = [{Channel, Pid} | Channels],
{Pid, NewChannels}
end.
-handle_down(Channels, Message = {_, Ref, _, _, _}) ->
- case lists:keyfind(Ref, 3, Channels) of
+handle_exit(Channels, Message = {_, Pid, _}) ->
+ case lists:keyfind(Pid, 2, Channels) of
false ->
note("Received ~p", [Message]),
Channels;
View
@@ -1,9 +1,9 @@
-module(channel).
--export([start/2, start_monitor/2, code_change/1]).
+-export([start/2, start_link/2, code_change/1]).
%% Startup
start(Parent, Conf) -> starter(fun spawn/1, Parent, Conf).
-start_monitor(Parent, Conf) -> starter(fun spawn_monitor/1, Parent, Conf).
+start_link(Parent, Conf) -> starter(fun spawn_link/1, Parent, Conf).
starter(Spawn, Parent, Conf) -> Spawn(fun() -> init(Parent, Conf) end).
View
@@ -31,6 +31,9 @@ loop(Running, Services) ->
{From, Ref, {info, services}} ->
From ! {Ref, Services},
loop(Running, Services);
+ Message = {'EXIT', _, _} ->
+ NewRunning = restart(Message, Running, Services),
+ loop(NewRunning, Services);
status ->
note("Active components: ~tp", [Running]),
loop(Running, Services);
@@ -51,6 +54,21 @@ shutdown(Running) ->
em_lib:broadcast(Pids, shutdown),
ok.
+%% Handle an 'EXIT' message received by the service loop.
+%%
+%% If the exiting Pid belongs to an entry in Running, log the exit, drop the
+%% dead entry, start the service again via its {M, F, A} entry in Services and
+%% return the updated Running list. If the Pid is not in Running, log the
+%% message and return Running unchanged. The result is always the list of
+%% {Pid, Name} tuples that the caller hands back to loop/2 as the new Running.
+restart(Message = {'EXIT', Pid, Reason}, Running, Services) ->
+    case lists:keyfind(Pid, 1, Running) of
+        false ->
+            %% lists:keyfind/3 signals a miss with 'false', never 'undefined'.
+            note("Received ~p", [Message]),
+            Running;
+        Dead = {_, Name} ->
+            note("Service ~p exited with ~p", [Name, Reason]),
+            Dropped = lists:delete(Dead, Running),
+            %% Assertive match: crash if Name has no entry in Services.
+            {M, F, A} = lists:keyfind(Name, 1, Services),
+            {ok, NewPid} = apply(M, F, [self(), A]),
+            [{NewPid, Name} | Dropped]
+    end.
+
%% Magic
live_pids(Running) ->
[Pid || {Pid, _} <- Running].
View
@@ -88,6 +88,9 @@ loop(State = {Talker, Handle, Minion = {MPid, MRef, _Actions}, Channels}) ->
{chat, Message} ->
unprompted(Message, State),
loop(State);
+ {notice, Message} ->
+ unprompted(Message, State),
+ loop(State);
{acquire_minion, Pid} ->
NewMinion = acquire_minion(Pid),
loop({Talker, Handle, NewMinion, Channels});
@@ -182,7 +185,7 @@ stringify(Bin) -> string:tokens(binary_to_list(Bin), "\r\n").
%% Controller actions
echo(String) -> String.
-bargle() -> "Arglebargle, glop-glyf!?!".
+bargle() -> "Arglebargle, glop-glyph!?!".
quit(Talker, Handle) ->
Message = "Goodbye, " ++ Handle ++ "!\r\n",
@@ -326,8 +329,8 @@ handle_down(State = {Talker, Handle, Minion, Channels},
note("Received ~p", [Message]),
State;
Chan = {Channel, _, _} ->
- Message = "CHAT: Channel #" ++ Channel ++ " closed.\r\n",
- Talker ! {send, Message},
+ Notice = "CHAT: Channel " ++ Channel ++ " closed.",
+ unprompted(Notice, State),
NewChannels = lists:delete(Chan, Channels),
{Talker, Handle, Minion, NewChannels}
end.

1 comment on commit 897637b

@zxq9

This comment has been minimized.

Show comment
Hide comment
@zxq9

zxq9 Nov 12, 2014

Owner

Originally chanman had been written to monitor, but not link or trap exits of channel processes. At first glance this looks acceptable; after all, the chanman doesn't have any need to restart channels, since they are supposed to die when they hit zero participants anyway, and upon death the participant count winds up being zero. But this assumes that the chanman itself will never die (of course not, since my code is perfect!). This is always a faulty assumption. It might not be very nice to have all your connected channels die suddenly, but it's rather trivial for users to recreate them by just re-joining whatever died. On the other hand, resource exhaustion and really confused directory situations (the output of \list can never match reality once chanman dies without taking its children with it) are a lot worse.

All channels crashing with the chanman might suck a little, but letting the server get to a corrupted state is unrecoverable without a restart -- and that would require taking the game and everything else down with it just because the chat service had a hiccup.

Here we have one of the most important examples of why supervision trees matter: they create a direct chain of command, to include avoiding orphan process situations.

Most of the "scaffolding" bits have now been written in raw Erlang and it is a good time to sit back and check out just how much repetitive code has been naturally popping up all over the place. The repetitions aren't resulting from some mandatory framework or environment boilerplate -- I'm deliberately making an effort to write really "low level" Erlang, so low that there are no system or framework imposed patterns -- they are resulting from the basic, natural fact that supervision trees provide one of the only known ways to naturally guarantee known-state consistency throughout the entire system.

Another very important thing to notice is how inconsistent my off-the-cuff implementation of several of these patterns has been. Sometimes a loop has a single State variable that wraps all the state of the service, sometimes bits are split out, sometimes it was one way to begin with and switched a few commits ago (especially once the argument list grew long enough to annoy me when typing). Some code_change/N functions have flipped back and forth along with this, and that required hand tweaking code that really could have been easier had every loop accepted a single wrapped State, or at least some standard structure that didn't change every time I added something to the main loop without messing with code_change.

The final thing to note is something I commented on a few commits ago, which is just how confusing tracing message passage can be when not using module interface functions. The send and receive locations are distant in the code, so checking for where things are sent from and where they are going to is a bit of a trick in the more complex cases (fortunately none of this has been particularly complex). One of the best things about using interface functions is the ability to glance at them for type information while working on other modules, use tools like Dialyzer (which we won't get into until we get into "pure Erlang" in v0.2), and easily grep or let Emacs or an IDE find calling sites for you. This is nearly impossible with pure ad hoc messaging. Ad hoc messaging is fine when writing a stub or two to test a concept, but anything beyond that starts getting very hard to keep track of, because the locations significant to the message protocol are both scattered about the code (seemingly at random) and can't be defined by any typing tools.

I think this code proves three things:

  • Erlang is really quick to hack things together that are more difficult to get right in other languages, even when handwriting the "robust" bits of code. I wrote a chat system this afternoon from scratch, all by hand, with no framework code. It sort of sucked more than it needed to since I deliberately avoided adhering to most coding standards, but it was still possible and quick. I wouldn't want to have to maintain this two months from now, though.
  • Code convention recommendations from folks like Joe Armstrong (who actually does a good bit of by-hand, non-OTP server writing -- but is usually rather specific about how he does it), and "standard" set stuff like OTP exist for an obvious reason. Just look at the mess I've created!
  • Deployment clearly requires a better solution than this. We won't touch on this issue for a while yet, but seriously, how in the hell would you automate deployment of a scattering of files like this?
Owner

zxq9 commented on 897637b Nov 12, 2014

Originally chanman had been written to monitor, but not link or trap exits of channel processes. At first glance this looks acceptable; after all, the chanman doesn't have any need to restart channels, since they are supposed to die when they hit zero participants anyway, and upon death the participant count winds up being zero. But this assumes that the chanman itself will never die (of course not, since my code is perfect!). This is always a faulty assumption. It might not be very nice to have all your connected channels die suddenly, but it's rather trivial for users to recreate them by just re-joining whatever died. On the other hand, resource exhaustion and really confused directory situations (the output of \list can never match reality once chanman dies without taking its children with it) are a lot worse.

All channels crashing with the chanman might suck a little, but letting the server get to a corrupted state is unrecoverable without a restart -- and that would require taking the game and everything else down with it just because the chat service had a hiccup.

Here we have one of the most important examples of why supervision trees matter: they create a direct chain of command, to include avoiding orphan process situations.

Most of the "scaffolding" bits have now been written in raw Erlang and it is a good time to sit back and check out just how much repetitive code has been naturally popping up all over the place. The repetitions aren't resulting from some mandatory framework or environment boilerplate -- I'm deliberately making an effort to write really "low level" Erlang, so low that there are no system or framework imposed patterns -- they are resulting from the basic, natural fact that supervision trees provide one of the only known ways to naturally guarantee known-state consistency throughout the entire system.

Another very important thing to notice is how inconsistent my off-the-cuff implementation of several of these patterns has been. Sometimes a loop has a single State variable that wraps all the state of the service, sometimes bits are split out, sometimes it was one way to begin with and switched a few commits ago (especially once the argument list grew long enough to annoy me when typing). Some code_change/N functions have flipped back and forth along with this, and that required hand tweaking code that really could have been easier had every loop accepted a single wrapped State, or at least some standard structure that didn't change every time I added something to the main loop without messing with code_change.

The final thing to note is something I commented on a few commits ago, which is just how confusing tracing message passage can be when not using module interface functions. The send and receive locations are distant in the code, so checking for where things are sent from and where they are going to is a bit of a trick in the more complex cases (fortunately none of this has been particularly complex). One of the best things about using interface functions is the ability to glance at them for type information while working on other modules, use tools like Dialyzer (which we won't get into until we get into "pure Erlang" in v0.2), and easily grep or let Emacs or an IDE find calling sites for you. This is nearly impossible with pure ad hoc messaging. Ad hoc messaging is fine when writing a stub or two to test a concept, but anything beyond that starts getting very hard to keep track of, because the locations significant to the message protocol are both scattered about the code (seemingly at random) and can't be defined by any typing tools.

I think this code proves three things:

  • Erlang is really quick to hack things together that are more difficult to get right in other languages, even when handwriting the "robust" bits of code. I wrote a chat system this afternoon from scratch, all by hand, with no framework code. It sort of sucked more than it needed to since I deliberately avoided adhering to most coding standards, but it was still possible and quick. I wouldn't want to have to maintain this two months from now, though.
  • Code convention recommendations from folks like Joe Armstrong (who actually does a good bit of by-hand, non-OTP server writing -- but is usually rather specific about how he does it), and "standard" set stuff like OTP exist for an obvious reason. Just look at the mess I've created!
  • Deployment clearly requires a better solution than this. We won't touch on this issue for a while yet, but seriously, how in the hell would you automate deployment of a scattering of files like this?
Please sign in to comment.