Add test for hanging xlog tarantool instance

Was added another way with checking patterns in log. Where test sequentially finds expected patterns and checks that there are no unexpected lines. For this approach was changed TarantoolLog.seek_wait() function. Now it is able to find patterns not from beginning (for sequence of patterns). If a pattern was found its last symbol position is saved as position for start point for next searching with start not from beginning. This approach helps for comparing hanging result. Pytest script test_hanging_xlog.py executes hang.test.lua and test-run log is comparing by expected patterns in log. Also, there is check about all subprocesses were killed. Otherwise, raise an exception with existed processes info. Follows up #276
tarantool · May 31, 2021 · 241b0d8 · 241b0d8
1 parent 5ff2d63
commit 241b0d8
Show file tree

Hide file tree

Showing 10 changed files with 172 additions and 3 deletions.
diff --git a/.luacheckrc b/.luacheckrc
@@ -27,4 +27,5 @@ include_files = {
 exclude_files = {
     "lib/tarantool-python",
     "test/test-tarantool/*.test.lua",
+    "test/negative_tests/*.test.lua",
 }
diff --git a/lib/tarantool_server.py b/lib/tarantool_server.py
@@ -439,6 +439,7 @@ class TarantoolLog(object):
     def __init__(self, path):
         self.path = path
         self.log_begin = 0
+        self.last_position = 0
 
     def positioning(self):
         if os.path.exists(self.path):
@@ -461,14 +462,18 @@ def seek_once(self, msg):
                 if pos != -1:
                     return pos
 
-    def seek_wait(self, msg, proc=None, name=None, deadline=None, timeout=10):
+    def seek_wait(self, msg, proc=None, name=None, deadline=None, timeout=10,
+                  start_from_beginning=True):
         while True:
             if os.path.exists(self.path):
                 break
             gevent.sleep(0.001)
 
         with open(self.path, 'r') as f:
-            f.seek(self.log_begin, os.SEEK_SET)
+            if start_from_beginning:
+                f.seek(self.log_begin, os.SEEK_SET)
+            else:
+                f.seek(self.last_position, os.SEEK_SET)
             cur_pos = self.log_begin
             if deadline is None:
                 deadline = time.time() + timeout
@@ -481,7 +486,9 @@ def seek_wait(self, msg, proc=None, name=None, deadline=None, timeout=10):
                     gevent.sleep(0.001)
                     f.seek(cur_pos, os.SEEK_SET)
                     continue
-                if re.findall(msg, log_str):
+                if re.search(msg, log_str):
+                    self.last_position = re.search(msg, log_str).end() + \
+                                         cur_pos
                     return True
                 cur_pos = f.tell()
         return False

diff --git a/test/negative_tests/box.lua b/test/negative_tests/box.lua
@@ -0,0 +1,12 @@
+#!/usr/bin/env tarantool
+local os = require('os')
+
+box.cfg{
+    listen              = os.getenv("LISTEN"),
+    memtx_memory        = 107374182,
+    pid_file            = "tarantool.pid",
+    force_recovery      = true,
+    wal_max_size        = 500
+}
+
+require('console').listen(os.getenv('ADMIN'))
diff --git a/test/negative_tests/conftest.py b/test/negative_tests/conftest.py
@@ -0,0 +1,42 @@
+import pytest
+import psutil
+from pathlib import Path
+from gevent import subprocess
+from lib import Options
+from lib.tarantool_server import TarantoolLog
+
+
+@pytest.yield_fixture(scope="session", autouse=True)
+def clean_all_subprocesses() -> None:
+    """Kill remained subprocess. Raise an exception of not killed procs."""
+    current_process = psutil.Process()
+    children = current_process.children(recursive=True)
+    yield
+    not_terminated_processes = []
+    for child in children:
+        if psutil.pid_exists(child.pid):
+            not_terminated_processes.append(child)
+            child.terminate()
+    if not_terminated_processes:
+        raise Exception(
+            "Next processes were not terminated: {}\n".format(
+                not_terminated_processes))
+
+
+@pytest.fixture
+def tarantool_log(log_file: Path) -> TarantoolLog:
+    tarantool_log = TarantoolLog(log_file)
+    return tarantool_log
+
+
+@pytest.yield_fixture
+def log_file() -> Path:
+    dir_path = Path(__file__).resolve().parent
+    with open(dir_path / 'outfile.txt', 'w') as file:
+        ret = subprocess.Popen(
+            ['../test-run.py',
+             'hang.test.lua'], cwd=dir_path, stdout=file)
+        ret.wait(timeout=Options().args.no_output_timeout + 10)
+    yield Path(dir_path / 'outfile.txt')
+    ret.kill()
+
diff --git a/test/negative_tests/hang.result b/test/negative_tests/hang.result
@@ -0,0 +1,25 @@
+-- test-run result file version 2
+--  box.schema.user.grant('guest', 'replication')replication
+local test_run = require('test_run').new()
+ | ---
+ | ...
+test_run:cmd('create server replica with rpl_master=default, script="xlog/replica.lua"')
+ | ---
+ | - error: '[string "return test_run:cmd(''create server replica wi..."]:1: attempt
+ |     to index global ''test_run'' (a nil value)'
+ | ...
+test_run:cmd('start server replica')
+ | ---
+ | - error: '[string "return test_run:cmd(''start server replica'') "]:1: attempt to
+ |     index global ''test_run'' (a nil value)'
+ | ...
+test_run:cmd('cleanup server replica')
+ | ---
+ | - error: '[string "return test_run:cmd(''cleanup server replica'') "]:1: attempt to
+ |     index global ''test_run'' (a nil value)'
+ | ...
+test_run:cmd('delete server replica')
+ | ---
+ | - error: '[string "return test_run:cmd(''delete server replica'') "]:1: attempt to
+ |     index global ''test_run'' (a nil value)'
+ | ...
diff --git a/test/negative_tests/hang.test.lua b/test/negative_tests/hang.test.lua
@@ -0,0 +1,6 @@
+--  box.schema.user.grant('guest', 'replication')replication
+test_run = require('test_run').new()
+test_run:cmd('create server replica with rpl_master=default, script="negative_tests/replica.lua"')
+test_run:cmd('start server replica')
+test_run:cmd('cleanup server replica')
+test_run:cmd('delete server replica')
diff --git a/test/negative_tests/replica.lua b/test/negative_tests/replica.lua
@@ -0,0 +1,13 @@
+#!/usr/bin/env tarantool
+
+-- Start the console first to allow test-run to attach even before
+-- box.cfg is finished.
+require('console').listen(os.getenv('ADMIN'))
+
+box.cfg({
+    listen              = os.getenv("LISTEN"),
+    replication         = os.getenv("MASTER"),
+    memtx_memory        = 107374182,
+--    pid_file            = "tarantool.pid",
+--    logger              = "tarantool.log",
+})
diff --git a/test/negative_tests/suite.ini b/test/negative_tests/suite.ini
@@ -0,0 +1,10 @@
+[default]
+core = tarantool
+description = tarantool write ahead log tests
+disabled = hang.test.lua
+script = xlog.lua
+use_unix_sockets = True
+use_unix_sockets_iproto = True
+long_run = snap_io_rate.test.lua
+is_parallel = True
+pretest_clean = True
diff --git a/test/negative_tests/test_hanging_xlog.py b/test/negative_tests/test_hanging_xlog.py
@@ -0,0 +1,40 @@
+from pathlib import Path
+
+from lib import Options
+from lib.tarantool_server import TarantoolLog
+
+
+class TestHangingXLog:
+
+    def test_hanging_xlog(self, tarantool_log: TarantoolLog) -> None:
+        """Check if tarantool is hanging it will be stopped by start_timeout.
+
+        Check next patterns in the test-run log:
+        There are (timeout // 10 - 1) lines with 'No output' warnings, which
+        are going sequentially.
+        There are no (timeout // 10)th line with same warning.
+        Next, find log line with failing on start server timeout.
+        Finally, check log contains '[ fail ]'.
+        """
+
+        timeout = Options().args.server_start_timeout
+        no_output_timeout = Options().args.no_output_timeout
+        p = Path(tarantool_log.path)
+        assert p.stat().st_size != 0
+        for time in range(10, timeout, 10):
+            assert tarantool_log.seek_wait(
+                r"No output during {} seconds. "
+                r"Will abort after {} seconds".format(time, no_output_timeout),
+                start_from_beginning=False)
+        assert not tarantool_log.seek_wait(
+                r"No output during {} seconds. "
+                r"Will abort after {} seconds".format(
+                    timeout, no_output_timeout),
+                start_from_beginning=False, timeout=1)
+        assert tarantool_log.seek_wait(
+            r"\[Instance 'replica'\] "
+            r"Start timeout {} was reached.".format(timeout),
+            start_from_beginning=False, timeout=1)
+        assert tarantool_log.seek_wait(
+            r'\[ fail \]', start_from_beginning=False, timeout=1)
+
diff --git a/test/negative_tests/xlog.lua b/test/negative_tests/xlog.lua
@@ -0,0 +1,13 @@
+#!/usr/bin/env tarantool
+local os = require('os')
+
+box.cfg{
+    listen              = os.getenv("LISTEN"),
+    memtx_memory        = 107374182,
+    pid_file            = "tarantool.pid",
+    force_recovery      = true,
+    wal_max_size        = 500,
+    snap_io_rate_limit  = 16
+}
+
+require('console').listen(os.getenv('ADMIN'))