Skip to content

Commit

Permalink
Fixed issue with unknown failed state from task manager
Browse files Browse the repository at this point in the history
  • Loading branch information
frankcorneliusmartin committed Nov 3, 2022
1 parent a398921 commit cb1ef07
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 26 deletions.
48 changes: 29 additions & 19 deletions vantage6-node/vantage6/node/docker/docker_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from vantage6.node.docker.vpn_manager import VPNManager
from vantage6.node.util import logger_name
from vantage6.common.docker.network_manager import NetworkManager
from vantage6.node.docker.task_manager import DockerTaskManager
from vantage6.node.docker.task_manager import DockerTaskManager, TaskStatus
from vantage6.node.docker.exceptions import (
UnknownAlgorithmStartFail,
PermanentAlgorithmStartFail
Expand Down Expand Up @@ -288,28 +288,38 @@ def run(self, result_id: int, image: str, docker_input: bytes,
)
database = database if (database and len(database)) else 'default'

__run = lambda: task.run(
docker_input=docker_input, tmp_vol_name=tmp_vol_name, token=token,
algorithm_env=self.algorithm_env, database=database
)

try:
vpn_ports = __run()
except PermanentAlgorithmStartFail:
self.log.debug(f'Marking result {result_id} as failed')
self.failed_tasks.append(task)
return None
# attempt to kick off the task. If it fails due to unknown reasons we try
# again. If it fails permanently we add it to the failed tasks to be
# handled by the speaking worker of the node
tries = 1
while not (task.status == TaskStatus.STARTED) and tries >= 3 :
try:
vpn_ports = task.run(
docker_input=docker_input, tmp_vol_name=tmp_vol_name,
token=token, algorithm_env=self.algorithm_env,
database=database
)
task.started = True

except UnknownAlgorithmStartFail:
self.log.exception(f'Failed to start result {result_id} due to unknown reason')
task.failed = True
vpn_ports = __run()
return None
except UnknownAlgorithmStartFail:
self.log.exception(f'Failed to start result {result_id} due '
'to unknown reason. Trying')
time.sleep(1) # add some time before retrying the next attempt

# keep track of the active container
self.active_tasks.append(task)
except PermanentAlgorithmStartFail:
break

tries += 1

return vpn_ports

# keep track of the active container
if task.failed:
self.failed_tasks.append(task)
return None
else:
self.active_tasks.append(task)
return vpn_ports

def get_result(self) -> Result:
"""
Expand Down
26 changes: 19 additions & 7 deletions vantage6-node/vantage6/node/docker/task_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
import docker.errors

from enum import Enum
from typing import Dict, List, Union
from pathlib import Path

Expand All @@ -22,6 +23,17 @@
PermanentAlgorithmStartFail
)

class TaskStatus(Enum):
    """
    Enumeration of the states a task can be in.

    Two further states, ``COMPLETED`` (2) and ``CRASHED`` (91), are not
    enabled; they are kept as comments next to the active members.
    """
    INITIALIZED = 0  # the task constructor has been executed
    STARTED = 1  # the container started without exceptions
    # COMPLETED = 2  # the container exited with a zero exit code
    FAILED = 90  # the container failed to start
    # CRASHED = 91  # the container exited with a non-zero exit code

class DockerTaskManager(DockerBaseManager):
"""
Expand Down Expand Up @@ -74,9 +86,6 @@ def __init__(self, image: str, vpn_manager: VPNManager, node_name: str,
self.alpine_image = ALPINE_IMAGE if alpine_image is None \
else alpine_image

# toggle to be set if the task failed to start due to unknown reasons
self.failed = False

self.container = None
self.status_code = None

Expand All @@ -93,6 +102,9 @@ def __init__(self, image: str, vpn_manager: VPNManager, node_name: str,
self.tmp_folder = "/mnt/tmp"
self.data_folder = "/mnt/data"

# keep track of the task status
self.status: TaskStatus = TaskStatus.INITIALIZED

def is_finished(self) -> bool:
"""
Checks if algorithm container is finished
Expand Down Expand Up @@ -256,16 +268,16 @@ def _run_algorithm(self) -> List[Dict]:
name=container_name,
labels=self.labels
)

except docker.errors.ImageNotFound:
self.log.error(f'Could not download image: {self.image}')
self.log.error(f'Could not find image: {self.image}')
self.status = TaskStatus.FAILED
raise PermanentAlgorithmStartFail

except Exception as e:
self.log.exception('Could not start algorithm...')
if self.failed:
raise PermanentAlgorithmStartFail
raise UnknownAlgorithmStartFail(e)

self.status = TaskStatus.STARTED
return vpn_ports

def _make_task_folders(self) -> None:
Expand Down

0 comments on commit cb1ef07

Please sign in to comment.