Skip to content

Commit

Permalink
failovers: Add test for RPC reconnection nspcc-dev#557
Browse files Browse the repository at this point in the history
Added a new test to the failover network test suite that simulates RPC
connection failures. It forcibly and repeatedly disconnects storage nodes from
RPC nodes, then checks whether the storage nodes are able to reconnect.

Signed-off-by: Oleg Kulachenko <oleg@nspcc.ru>
  • Loading branch information
vvarg229 committed Jul 19, 2023
1 parent dd737a3 commit e9d5bd5
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 1 deletion.
65 changes: 64 additions & 1 deletion pytest_tests/testsuites/failovers/test_failover_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,17 @@

import allure
import pytest
import subprocess
from cluster import StorageNode
from failover_utils import wait_all_storage_nodes_returned, wait_object_replication
from failover_utils import wait_all_storage_nodes_returned, wait_object_replication, get_morph_chain_endpoints
from file_helper import generate_file, get_file_hash
from iptables_helper import IpTablesHelper
from python_keywords.container import create_container
from python_keywords.neofs_verbs import get_object, put_object_to_random_node
from wellknown_acl import PUBLIC_ACL
from python_keywords.node_management import storage_node_healthcheck, check_node_in_map
from neofs_testlib.hosting import Hosting
from common import STORAGE_NODE_SERVICE_NAME_REGEX

from steps.cluster_test_base import ClusterTestBase

Expand Down Expand Up @@ -110,3 +114,62 @@ def test_block_storage_node_traffic(
wallet, cid, oid, shell=self.shell, endpoint=new_nodes[0].get_rpc_endpoint()
)
assert get_file_hash(source_file_path) == get_file_hash(got_file_path)

@pytest.mark.sanity
@allure.title("RPC reconnection test")
def test_rpc_reconnection(self, hosting: Hosting):
    """
    When the RPC connection fails (and it can), a storage node must reconnect
    to some other RPC node and continue to operate.

    The test repeatedly kills the TCP connections from each storage node to
    every morph chain endpoint (inside the node's network namespace), then
    verifies that each storage node is healthy and still present in the netmap.
    """
    # Number of times the connection kill is repeated per endpoint.
    dport_repeat = 10
    morph_chain_endpoints = get_morph_chain_endpoints(hosting)

    for storage_node in hosting.find_service_configs(STORAGE_NODE_SERVICE_NAME_REGEX):
        host = hosting.get_host_by_service(storage_node.name)
        pid = host.get_service_pid(storage_node.name)

        for morph_chain_addr, morph_chain_port in morph_chain_endpoints:
            with allure.step(f'Disconnecting storage node {storage_node.name} '
                             f'from {morph_chain_addr} {dport_repeat} times'):
                for repeat in range(dport_repeat):
                    with allure.step(f'Disconnect number {repeat}'):
                        # It would be cleaner to enter the namespace in-process:
                        #     with Namespace(pid, 'net'):
                        #         subprocess.check_output(['ss', '-K', 'dst', addr, 'dport', port])
                        # but that requires running the tests as root, which is bad
                        # practice, and setfacl cannot be used due to GitHub
                        # ubuntu-latest runner limitations — hence sudo + nsenter.
                        # A list argv (shell=False) avoids shell-injection issues
                        # with the interpolated address/port.
                        command = ['sudo', 'nsenter', '-t', str(pid), '-n',
                                   'ss', '-K', 'dst', morph_chain_addr,
                                   'dport', morph_chain_port]
                        try:
                            output = subprocess.check_output(command)
                            logger.info(f'Output of the command {command}: {output}')
                        except subprocess.CalledProcessError as e:
                            logger.error(
                                f'Error occurred while running command: {command}. Error message: {str(e)}')
                            raise
                        finally:
                            # Delay between shutdown attempts, emulates a real disconnection
                            sleep(5)
            logger.info(
                f'Disconnected storage node {storage_node.name} from {morph_chain_addr} {dport_repeat} times')

    for node in self.cluster.storage_nodes:
        with allure.step(f'Checking if node {node} is alive'):
            try:
                health_check = storage_node_healthcheck(node)
                assert (
                    health_check.health_status == "READY"
                    and health_check.network_status == "ONLINE"
                )
            except Exception as err:
                logger.warning(f'Node {node} is not online:\n{err}')
                # Chain the original failure so the healthcheck error is not lost.
                raise AssertionError(
                    f'After the RPC connection failed, the storage node {node} DID NOT reconnect '
                    f'to any other node and FAILED to continue operating. '
                ) from err
        with allure.step(f'Checking if node {node} in map'):
            check_node_in_map(node, shell=self.shell)
        logger.info(f'Node {node} is alive and online')
19 changes: 19 additions & 0 deletions robot/resources/lib/python_keywords/failover_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@
from time import sleep

import allure
from typing import List, Tuple, Optional
from urllib.parse import urlparse
from cluster import Cluster, StorageNode
from neofs_testlib.shell import Shell
from neofs_testlib.hosting import Hosting
from python_keywords.node_management import storage_node_healthcheck
from storage_policy import get_nodes_with_object
from common import MORPH_CHAIN_SERVICE_NAME_REGEX, ENDPOINT_INTERNAL0

logger = logging.getLogger("NeoLogger")

Expand Down Expand Up @@ -52,3 +56,18 @@ def is_all_storage_nodes_returned(cluster: Cluster) -> bool:
if health_check.health_status != "READY" or health_check.network_status != "ONLINE":
return False
return True


@allure.step("Get morph chain endpoints")
def get_morph_chain_endpoints(hosting: Hosting) -> List[Tuple[str, str]]:
    """
    Return (host, port) pairs for every morph chain service in the hosting config.

    Args:
        hosting: Hosting instance to query for morph chain service configs.

    Returns:
        List of (hostname, port) tuples; both elements are strings.

    Raises:
        ValueError: if a config lacks the internal endpoint attribute, or the
            endpoint URL has no hostname or port.
    """
    morph_chain_configs = hosting.find_service_configs(MORPH_CHAIN_SERVICE_NAME_REGEX)
    endpoints = []
    for config in morph_chain_configs:
        if ENDPOINT_INTERNAL0 not in config.attributes:
            raise ValueError(f"{ENDPOINT_INTERNAL0} is not present in the attributes of the config: {config}")
        morph_chain_addr_full = config.attributes[ENDPOINT_INTERNAL0]
        parsed_url = urlparse(morph_chain_addr_full)
        # urlparse returns None for a missing hostname/port; fail loudly instead
        # of silently producing a ('None', 'None') endpoint.
        if parsed_url.hostname is None or parsed_url.port is None:
            raise ValueError(f"Cannot extract host/port from endpoint: {morph_chain_addr_full}")
        endpoints.append((parsed_url.hostname, str(parsed_url.port)))
    return endpoints
3 changes: 3 additions & 0 deletions robot/variables/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,12 @@

# Path to the hosting configuration file; overridable via the environment.
HOSTING_CONFIG_FILE = os.getenv("HOSTING_CONFIG_FILE", ".devenv.hosting.yaml")
# Regular expressions matching service names in the hosting config.
STORAGE_NODE_SERVICE_NAME_REGEX = r"s\d\d"
MORPH_CHAIN_SERVICE_NAME_REGEX = r"morph-chain\d\d"
HTTP_GATE_SERVICE_NAME_REGEX = r"http-gate\d\d"
S3_GATE_SERVICE_NAME_REGEX = r"s3-gate\d\d"

# Attribute key under which a service's internal endpoint is stored
# in the hosting config (read by get_morph_chain_endpoints).
ENDPOINT_INTERNAL0 = "endpoint_internal0"

# Generate wallet configs
# TODO: we should move all info about wallet configs to fixtures
WALLET_CONFIG = os.path.join(os.getcwd(), "wallet_config.yml")
Expand Down

0 comments on commit e9d5bd5

Please sign in to comment.