ci: aarch64: make ctime regression a warning

Ryo-not-rio · Ryo-not-rio · commit b1fce0753427 · 2025-06-11T14:55:33.000Z
diff --git a/.github/automation/performance/benchdnn_comparison.py b/.github/automation/performance/benchdnn_comparison.py
@@ -24,6 +24,10 @@
 import warnings
 import statistics
 
+def print_to_github_out(message):  # no-op unless $GITHUB_OUTPUT is set
+    if "GITHUB_OUTPUT" in os.environ:
+        with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as f:
+            print(message, file=f)  # appended as one line; UTF-8 because messages contain "→"
 
 def compare_two_benchdnn(file1, file2, tolerance=0.05):
     """
@@ -40,9 +44,9 @@ def compare_two_benchdnn(file1, file2, tolerance=0.05):
     r2 = [x.split(",") for x in r2 if x[0:8] == "--mode=P"]
 
     if (len(r1) == 0) or (len(r2) == 0):
-        warnings.warn("One or both of the test results have zero lines")
+        raise Exception("One or both of the test results have zero lines")
     if len(r1) != len(r2):
-        warnings.warn("The number of benchdnn runs do not match")
+        raise Exception("The number of benchdnn runs do not match")
 
     r1_exec = defaultdict(list)
     r1_ctime = defaultdict(list)
@@ -57,17 +61,17 @@ def compare_two_benchdnn(file1, file2, tolerance=0.05):
         r2_exec[key].append(float(exec_time))
         r2_ctime[key].append(float(ctime))
 
-    failed_tests = []
+    exec_failures, ctime_failures = [], []
     for prb in r1_exec:
         if prb not in r2_exec:
-            warnings.warn(f"{prb} exists in {file1} but not {file2}")
-            continue
+            raise Exception(f"{prb} exists in {file1} but not {file2}")
+
         exec1 = r1_exec[prb]
         exec2 = r2_exec[prb]
         ctime1 = r1_ctime[prb]
         ctime2 = r2_ctime[prb]
-        res = ttest_ind(exec2, exec1, alternative="greater")
-        ctime_test = ttest_ind(ctime2, ctime1, alternative="greater")
+        exec_ttest = ttest_ind(exec2, exec1, alternative="greater")
+        ctime_ttest = ttest_ind(ctime2, ctime1, alternative="greater")
         r1_med_exec = statistics.median(exec1)
         r2_med_exec = statistics.median(exec2)
         r1_med_ctime = statistics.median(ctime1)
@@ -81,42 +85,49 @@ def compare_two_benchdnn(file1, file2, tolerance=0.05):
             )
             continue
 
-        # A test fails if either execution time or creation time:
+        # A test fails (ctime uses the same criteria but only warns) if execution time:
         # - shows a statistically significant regression and
-        # - shows ≥ 10% slowdown in both median or min times
-        exec_regressed = res.pvalue <= 0.05 and (
+        # - shows ≥ 10% slowdown in either median or min times
+        exec_regressed = exec_ttest.pvalue <= 0.05 and (
             (r2_med_exec - r1_med_exec) / r1_med_exec >= 0.1
             or (min(exec2) - min(exec1)) / min(exec1) >= 0.1
         )
-        ctime_regressed = ctime_test.pvalue <= 0.05 and (
+        ctime_regressed = ctime_ttest.pvalue <= 0.05 and (
             (r2_med_ctime - r1_med_ctime) / r1_med_ctime >= 0.1
             or (min(ctime2) - min(ctime1)) / min(ctime1) >= 0.1
         )
 
-        if exec_regressed or ctime_regressed:
-            failed_tests.append(
+        if exec_regressed:
+            exec_failures.append(
                 f"{prb} exec: {r1_med_exec:.3g} → {r2_med_exec:.3g} "
-                f"(p={res.pvalue:.3g}), "
+                f"(p={exec_ttest.pvalue:.3g})"
+            )
+        if ctime_regressed:
+            ctime_failures.append(
                 f"ctime: {r1_med_ctime:.3g} → {r2_med_ctime:.3g}"
-                f"(p={ctime_test.pvalue:.3g})"
+                f"(p={ctime_ttest.pvalue:.3g})"
             )
 
-    if "GITHUB_OUTPUT" in os.environ:
-        with open(os.environ["GITHUB_OUTPUT"], "a") as f:
-            print(f"pass={not failed_tests}", file=f)
+    print_to_github_out(f"pass={not exec_failures}")
+
+    message = ""
+    if ctime_failures:
+        message += (
+            "\n----The following ctime regression tests failed:----\n"
+            + "\n".join(ctime_failures)
+            + "\n"
+        )
 
-    if not failed_tests:
+    if not exec_failures:
+        print_to_github_out(f'message={message.replace("\n", "%0A")}')
         print("Regression tests passed")
     else:
-        message = (
-            "\n----The following regression tests failed:----\n"
-            + "\n".join(failed_tests)
+        message += (
+            "\n----The following exec time regression tests failed:----\n"
+            + "\n".join(exec_failures)
             + "\n"
         )
-        if "GITHUB_OUTPUT" in os.environ:
-            out_message = message.replace("\n", "%0A")
-            with open(os.environ["GITHUB_OUTPUT"], "a") as f:
-                print(f"message={out_message}", file=f)
+        print_to_github_out(f'message={message.replace("\n", "%0A")}')
         print(message)
         raise Exception("Some regression tests failed")