Introduce flaky tests statistics
* Consider a test 'flaky' if it was restarted at least once and
  eventually passed (see the sketch below the bullets). Print the
  number of flaked tests and the corresponding info in the statistics.

* Collect logs of such tests into the 'artifacts' directory, along
  with logs of failed tests.
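
Roughly, the classification rule can be sketched as follows (an
illustrative sketch only; run_test() and max_retries are hypothetical
stand-ins for the worker's retry loop, not actual test-run.py code):

    def classify(run_test, max_retries):
        # Run the test once, then rerun it while it keeps failing.
        fails = 0
        status = run_test()          # returns 'pass' or 'fail'
        while status == 'fail' and fails < max_retries:
            fails += 1
            status = run_test()
        if status == 'pass' and fails > 0:
            return 'flaky'           # restarted at least once, then passed
        return status                # plain 'pass' or a final 'fail'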

Example:

    $ ./test-run.py --force replication-luatest/
    Started ./test-run.py --force replication-luatest/
    Running in parallel with 16 workers

    ... <other tests>

    [016] replication-luatest/gh_6033_box_promote_demote>                 [ pass ]
    [014] replication-luatest/gh_5418_qsync_with_anon_te>                 [ pass ]
    [012] replication-luatest/quorum_misc_test.lua                        [ pass ]
    [005] replication-luatest/linearizable_test.lua                       [ fail ]

    ... <traceback>

    [005] Test "replication-luatest/linearizable_test.lua", conf: "None"
    [005] 	failed, rerunning ...
    [005] replication-luatest/linearizable_test.lua                       [ pass ]

    ---------------------------------------------------------------------------------
    Top 10 longest tests (seconds):
    *  67.86 replication-luatest/quorum_orphan_test.lua
    *  26.00 replication-luatest/quorum_misc_test.lua
    *  15.59 replication-luatest/gh_5418_qsync_with_anon_test.lua
    *  13.39 replication-luatest/election_fencing_test.lua
    *   9.77 replication-luatest/gh_5295_split_brain_test.lua
    *   6.95 replication-luatest/no_quorum_test.lua
    *   6.29 replication-luatest/bootstrap_strategy_auto_test.lua
    *   6.26 replication-luatest/gh_5568_read_only_reason_test.lua
    *   5.31 replication-luatest/gh_7253_election_long_wal_write_test.lua
    *   5.13 replication-luatest/gh_7086_box_issue_promote_assert_test.lua
    ---------------------------------------------------------------------------------
    Statistics:
    * pass: 37 (flaky: 1)
    Flaked tasks:
    - [replication-luatest/linearizable_test.lua, null]
    # logfile:        /private/tmp/t/log/005_replication-luatest.log
    # reproduce file: /private/tmp/t/reproduce/005_replication-luatest.list.yaml

    $ ls /tmp/t/artifacts/
    005_replication-luatest/ log/                     reproduce/               statistics/
ylobankov committed Mar 21, 2023
1 parent cc3c38e commit 596fd3b
Showing 3 changed files with 69 additions and 32 deletions.
18 changes: 18 additions & 0 deletions lib/worker.py
@@ -160,6 +160,15 @@ def __init__(self, worker_id, worker_name, task_id,
         self.show_reproduce_content = show_reproduce_content
 
 
+class WorkerFlakedTask(BaseWorkerMessage):
+    """Passed into the result queue when a task flaked.
+    The task_id (any hashable object) field holds ID of the flaked task.
+    """
+    def __init__(self, worker_id, worker_name, task_id):
+        super(WorkerFlakedTask, self).__init__(worker_id, worker_name)
+        self.task_id = task_id
+
+
 class WorkerOutput(BaseWorkerMessage):
     """The output passed by worker processes via color_stdout/color_log
     functions. The output wrapped into objects of this class by setting queue
@@ -227,6 +236,9 @@ def wrap_result(self, task_id, short_status, duration):
                                 self.suite.test_is_long(task_id), duration,
                                 self.suite.show_reproduce_content())
 
+    def wrap_flaked_task(self, task_id):
+        return WorkerFlakedTask(self.id, self.name, task_id)
+
     def sigterm_handler(self, signum, frame):
         self.sigterm_received = True
 
@@ -341,6 +353,7 @@ def run_loop(self, task_queue, result_queue):
                 break
 
             short_status = None
+            fails_count = 0
             duration = 0.0
             result_queue.put(self.current_task(task_id))
             testname = os.path.basename(task_id[0])
@@ -358,6 +371,7 @@
                 self.restart_server()
                 # print message only after some fails occurred
                 if short_status == 'fail':
+                    fails_count += 1
                     if testname not in self.suite.fragile['tests']:
                         color_stdout(
                             'Test "%s", conf: "%s"\n\tfailed, rerunning ...\n'
@@ -372,6 +386,10 @@
                 retries_left = retries_left - 1
 
             result_queue.put(self.wrap_result(task_id, short_status, duration))
+
+            if fails_count > 0 and short_status == 'pass':
+                result_queue.put(self.wrap_flaked_task(task_id))
+
             if short_status == 'fail':
                 if Options().args.is_force:
                     self.restart_server()
76 changes: 47 additions & 29 deletions listeners.py
@@ -10,6 +10,7 @@
 from lib.sampler import sampler
 from lib.worker import WorkerCurrentTask
 from lib.worker import WorkerDone
+from lib.worker import WorkerFlakedTask
 from lib.worker import WorkerOutput
 from lib.worker import WorkerTaskResult
 from lib.worker import get_reproduce_file
@@ -39,26 +40,28 @@ def __init__(self, get_logfile):
         self._sampler = sampler
         self.duration_stats = dict()
         self.failed_tasks = []
+        self.flaked_tasks = []
         self.get_logfile = get_logfile
         self.long_tasks = set()
 
     def process_result(self, obj):
-        if not isinstance(obj, WorkerTaskResult):
-            return
+        if isinstance(obj, WorkerTaskResult):
+            if obj.is_long:
+                self.long_tasks.add(obj.task_id)
 
-        if obj.is_long:
-            self.long_tasks.add(obj.task_id)
+            if obj.short_status not in self.stats:
+                self.stats[obj.short_status] = 0
+            self.stats[obj.short_status] += 1
 
-        if obj.short_status not in self.stats:
-            self.stats[obj.short_status] = 0
-        self.stats[obj.short_status] += 1
+            if obj.short_status == 'fail':
+                self.failed_tasks.append((obj.task_id,
+                                          obj.worker_name,
+                                          obj.show_reproduce_content))
 
-        if obj.short_status == 'fail':
-            self.failed_tasks.append((obj.task_id,
-                                      obj.worker_name,
-                                      obj.show_reproduce_content))
+            self.duration_stats[obj.task_id] = obj.duration
 
-        self.duration_stats[obj.task_id] = obj.duration
+        if isinstance(obj, WorkerFlakedTask):
+            self.flaked_tasks.append((obj.task_id, obj.worker_name, False))
 
     def get_long_mark(self, task):
         return '(long)' if task in self.long_tasks else ''
@@ -149,6 +152,19 @@ def print_duration(self, stats_dir):
                                       self.duration_stats[task_id]))
         fd.close()
 
+    def print_tasks_info(self, tasks):
+        for task_id, worker_name, show_reproduce_content in tasks:
+            logfile = self.get_logfile(worker_name)
+            task_id_str = yaml.safe_dump(task_id, default_flow_style=True)
+            final_report('- %s' % task_id_str, schema='test_var')
+            color_stdout('# logfile: %s\n' % logfile)
+            reproduce_file_path = get_reproduce_file(worker_name)
+            color_stdout('# reproduce file: %s\n' % reproduce_file_path)
+            if show_reproduce_content:
+                color_stdout("---\n", schema='separator')
+                print_tail_n(reproduce_file_path)
+                color_stdout("...\n", schema='separator')
+
     def print_statistics(self):
         """Print statistics and results of testing."""
         # Prepare standalone subpath '<vardir>/statistics' for statistics files.
@@ -161,25 +177,25 @@ def print_statistics(self):
         if self.stats:
             final_report('Statistics:\n', schema='test_var')
             for short_status, cnt in self.stats.items():
-                final_report('* %s: %d\n' % (short_status, cnt), schema='test_var')
+                if short_status == 'pass' and self.flaked_tasks:
+                    final_report('* %s: %d (flaky: %d)\n' %
+                                 (short_status, cnt, len(self.flaked_tasks)),
+                                 schema='test_var')
+                else:
+                    final_report('* %s: %d\n' % (short_status, cnt),
+                                 schema='test_var')
 
+        if self.flaked_tasks:
+            final_report('Flaked tasks:\n', schema='test_var')
+            self.print_tasks_info(self.flaked_tasks)
+
         if not self.failed_tasks:
-            return False
+            return False, bool(self.flaked_tasks)
 
         final_report('Failed tasks:\n', schema='test_var')
-        for task_id, worker_name, show_reproduce_content in self.failed_tasks:
-            logfile = self.get_logfile(worker_name)
-            task_id_str = yaml.safe_dump(task_id, default_flow_style=True)
-            final_report('- %s' % task_id_str, schema='test_var')
-            color_stdout('# logfile: %s\n' % logfile)
-            reproduce_file_path = get_reproduce_file(worker_name)
-            color_stdout('# reproduce file: %s\n' % reproduce_file_path)
-            if show_reproduce_content:
-                color_stdout("---\n", schema='separator')
-                print_tail_n(reproduce_file_path)
-                color_stdout("...\n", schema='separator')
+        self.print_tasks_info(self.failed_tasks)
 
-        return True
+        return True, bool(self.flaked_tasks)
 
 
 class ArtifactsWatcher(BaseWatcher):
@@ -193,10 +209,12 @@ def __init__(self, get_logfile):
         self.get_logfile = get_logfile
 
     def process_result(self, obj):
-        if not isinstance(obj, WorkerTaskResult):
-            return
+        if isinstance(obj, WorkerTaskResult):
+            if obj.short_status == 'fail' and \
+                    obj.worker_name not in self.failed_workers:
+                self.failed_workers.append(obj.worker_name)
 
-        if obj.short_status == 'fail' and \
+        if isinstance(obj, WorkerFlakedTask) and \
                 obj.worker_name not in self.failed_workers:
             self.failed_workers.append(obj.worker_name)
 
7 changes: 4 additions & 3 deletions test-run.py
@@ -126,12 +126,13 @@ def main_loop_parallel():
         dispatcher.wait()
         dispatcher.wait_processes()
         color_stdout('-' * 81, "\n", schema='separator')
-        has_failed = dispatcher.statistics.print_statistics()
+        has_failed, has_flaked = dispatcher.statistics.print_statistics()
         has_undone = dispatcher.report_undone(
             verbose=bool(is_force or not has_failed))
-        if has_failed:
+        if any([has_failed, has_flaked]):
             dispatcher.artifacts.save_artifacts()
-            return EXIT_FAILED_TEST
+        if has_failed:
+            return EXIT_FAILED_TEST
         if has_undone:
             return EXIT_NOTDONE_TEST
     except KeyboardInterrupt:
