Skip to content

Commit

Permalink
move socket read error handling logic around a bit
Browse files Browse the repository at this point in the history
  • Loading branch information
RalfJung committed Aug 16, 2023
1 parent fdbb096 commit 9d9c49e
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 31 deletions.
6 changes: 3 additions & 3 deletions broker/src/tunneldigger_broker/eventloop.py
Expand Up @@ -54,10 +54,10 @@ def start(self):

pollable, file_object = mapping

if event & select.EPOLLIN:
if event & select.EPOLLIN or event & select.EPOLLERR or event & select.EPOLLHUP:
# If there's anything new, we read. If there was an error, the read will
# return that error so we can handle it.
pollable.read(file_object)
elif event & select.EPOLLERR or event & select.EPOLLHUP:
pollable.error()
except IOError:
# IOError get produced by signal even. in version 3.5 this is fixed an the poll retries
# TODO: in py3 it's InterruptedError
Expand Down
6 changes: 0 additions & 6 deletions broker/src/tunneldigger_broker/hooks.py
Expand Up @@ -50,9 +50,6 @@ def register(self, event_loop):
event_loop.register(self, self.process.stderr, select.EPOLLIN)
self.event_loop = event_loop

def error(self):
self.close()

def close(self):
"""
Closes the hook process.
Expand Down Expand Up @@ -163,9 +160,6 @@ def run_hook(self, name, *args):
except OSError as e:
logger.error("Error while executing script '%s': %s" % (script, e))

def error(self):
self.close()

def close(self):
os.close(self.sigchld_fd)

Expand Down
17 changes: 7 additions & 10 deletions broker/src/tunneldigger_broker/network.py
Expand Up @@ -55,8 +55,11 @@ def register(self, event_loop):
event_loop.register(self, self.socket, select.EPOLLIN)
self.event_loop = event_loop

def error(self):
self.close()
def error(self, direction, e):
"""
`direction` can be "reading" or "writing"
"""
logger.warning("{}: error {} socket: {}".format(self.name, direction, e))

def close(self):
"""
Expand Down Expand Up @@ -106,9 +109,6 @@ def read(timer_self, file_object):
if interval is None:
timer_self.close()

def error(self):
self.close()

def close(timer_self):
self.event_loop.unregister(timer)
self.timers.remove(timer_self)
Expand All @@ -130,6 +130,7 @@ def write(self, address, data):
try:
self.socket.sendto(data, address)
except socket.error:
self.error("writing", e)
return

def write_message(self, address, msg_type, msg_data=b''):
Expand Down Expand Up @@ -167,11 +168,7 @@ def read(self, file_object):
try:
data, address = self.socket.recvfrom(2048)
except socket.error as e:
if e.errno == errno.EMSGSIZE:
# silence these, they occur during PMTU probing
pass
else:
logger.warning("{}: error reading from socket: {}".format(self.name, e))
self.error("reading", e)
return

msg_type, msg_data = protocol.parse_message(data)
Expand Down
22 changes: 10 additions & 12 deletions broker/src/tunneldigger_broker/tunnel.py
Expand Up @@ -239,16 +239,13 @@ def keepalive(self):
logger.warning("%s: timed out", self.name)
self.close(reason=protocol.ERROR_REASON_TIMEOUT)

def error(self):
# Read from the socket, to "consume" the error (and show it in the log)
self.read(None)
# Here we have a problem. This can indicate permanent connection failure
# (https://github.com/wlanslovenija/tunneldigger/issues/143), or it can
# indicate that we sent a packet that was too big (e.g. the PMTU probe
# reply, see https://github.com/wlanslovenija/tunneldigger/issues/171).
# To distinguish the two, we count how many consecutive errors we see without a proper message in between.
# If that reaches a threshold, we consider this error permanent and close the connection.
# PMTU discovery sends 6 probes, so 10 should be safe as a threshold.
def error(self, direction, e):
if e.errno == errno.EMSGSIZE:
# ignore these, they occur during PMTU probing
return
super(Tunnel, self).error(direction, e)
# This connection is probably dead, but we've seen connections in the wild that occasionally raise
# an error. So only kill the connection if this happens again and again.
# We could just rely on the timeout, but when there's a lot of errors it seems better to
# kill the connection early rather than waiting for 2 whole minutes.
self.error_count += 1
Expand Down Expand Up @@ -347,8 +344,9 @@ def message(self, address, msg_type, msg_data, raw_length):

# Update keepalive indicator.
self.last_alive = time.time()
# Remember that we got a message -- reset error count for transient error tolerance.
self.error_count = 0
# We got a message, so *something* is working. Reduce error count for transient error tolerance.
if self.error_count > 0:
self.error_count -= 1

if msg_type == protocol.CONTROL_TYPE_ERROR:
# Error notification from the remote side.
Expand Down

0 comments on commit 9d9c49e

Please sign in to comment.