Rerun all failed tests, not only those marked as fragile

test-run supports functionality to rerun failed tests in place, but these tests have to be on the so-called fragile list. To add a test to the fragile list we need to add a special configuration to the suite.ini file of a test suite. Configuration example: fragile = { "retries": 5, "tests": { "tarantoolctl.test.lua": { "issues": [ "gh-5059", "gh-5346" ] }, "debug.test.lua": { "issues": [ "gh-5346" ] }, ... } } Rerunning failed tests in place is quite convenient because it allows us to avoid rerunning all tests again and thus save time. But to make it work as expected we should keep the list of fragile tests always up-to-date. Flaky tests may be introduced every day and keeping the list of fragile tests always up-to-date becomes extremely difficult to do. So our solution is quite simple: just rerun all failed tests. By default, the number of retries for regular and fragile tests is 3. But for fragile tests this number can be overridden in the suite.ini file. Closes #328
tarantool · Mar 16, 2022 · 36cee89 · 36cee89
1 parent f246567
commit 36cee89
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 10 deletions.
diff --git a/lib/test_suite.py b/lib/test_suite.py
@@ -45,6 +45,9 @@ class TestSuite:
     server for this suite, the client program to execute individual
     tests and other suite properties. The server is started once per
     suite."""
+
+    RETRIES_COUNT = 3
+
     def get_multirun_conf(self, suite_path):
         conf_name = self.ini.get('config', None)
         if conf_name is None:
@@ -91,7 +94,7 @@ def __init__(self, suite_path, args):
         self.args = args
         self.tests = []
         self.ini = {}
-        self.fragile = {'retries': 0, 'tests': {}}
+        self.fragile = {'retries': self.RETRIES_COUNT, 'tests': {}}
         self.suite_path = suite_path
         self.ini["core"] = "tarantool"
 
@@ -128,7 +131,7 @@ def __init__(self, suite_path, args):
         if config.has_option("default", "fragile"):
             fragiles = config.get("default", "fragile")
             try:
-                self.fragile = json.loads(fragiles)
+                self.fragile.update(json.loads(fragiles))
                 if 'tests' not in self.fragile:
                     raise RuntimeError(
                         "Key 'tests' absent in 'fragile' json: {}"
@@ -288,7 +291,7 @@ def is_parallel(self):
         return self.ini['is_parallel']
 
     def fragile_retries(self):
-        return self.fragile.get('retries', 0)
+        return self.fragile['retries']
 
     def show_reproduce_content(self):
         return self.ini['show_reproduce_content']

diff --git a/lib/worker.py b/lib/worker.py
@@ -350,20 +350,25 @@ def run_loop(self, task_queue, result_queue):
                                   'defined in suite.ini but this functionality '
                                   'is dropped' % testname)
                 )
-            retries_left = self.suite.fragile_retries()
+            retries_left = self.suite.RETRIES_COUNT
+            if testname in self.suite.fragile['tests']:
+                retries_left = self.suite.fragile_retries()
             # let's run till short_status became 'pass'
             while short_status in (None, 'fail') and retries_left >= 0:
                 self.restart_server()
                 # print message only after some fails occurred
                 if short_status == 'fail':
-                    color_stdout(
-                        'Test "%s", conf: "%s"\n'
-                        '\tfrom "fragile" list failed, rerunning ...\n'
-                        % (task_id[0], task_id[1]), schema='error')
+                    if testname not in self.suite.fragile['tests']:
+                        color_stdout(
+                            'Test "%s", conf: "%s"\n\tfailed, rerunning ...\n'
+                            % (task_id[0], task_id[1]), schema='error')
+                    else:
+                        color_stdout(
+                            'Test "%s", conf: "%s"\n'
+                            '\tfrom "fragile" list failed, rerunning ...\n'
+                            % (task_id[0], task_id[1]), schema='error')
                 # run task and save the result to short_status
                 short_status, duration = self.run_task(task_id)
-                if testname not in self.suite.fragile['tests']:
-                    break
                 retries_left = retries_left - 1
 
             result_queue.put(self.wrap_result(task_id, short_status, duration))