Commit 90ffff0: Merge 4731623 into bfbf663
dianaclarke committed May 27, 2021
2 parents bfbf663 + 4731623
Showing 20 changed files with 823 additions and 262 deletions.
27 changes: 15 additions & 12 deletions conbench/api/_examples.py
@@ -66,6 +66,9 @@ def _api_benchmark_entity(
  "q3": "0.036942",
  "stdev": "0.049194",
  "timestamp": "2020-11-25T21:02:42.706806",
+ "z_score": "0.000000",
+ "z_regression": False,
+ "z_improvement": False,
  },
  "tags": {
  "id": case_id,
@@ -117,10 +120,10 @@ def _api_compare_entity(benchmark_ids, batch_ids, run_ids, batch, benchmark, tag
  "deviations": "2.000",
  "baseline_z_score": "0.000",
  "contender_z_score": "0.000",
- "baseline_regression_z": False,
- "baseline_improvement_z": False,
- "contender_regression_z": False,
- "contender_improvement_z": False,
+ "baseline_z_regression": False,
+ "baseline_z_improvement": False,
+ "contender_z_regression": False,
+ "contender_z_improvement": False,
  "contender": "0.036 s",
  "contender_id": benchmark_ids[1],
  "contender_batch_id": batch_ids[1],
@@ -155,10 +158,10 @@ def _api_compare_list(
  "deviations": "2.000",
  "baseline_z_score": "0.000",
  "contender_z_score": "0.000",
- "baseline_regression_z": False,
- "baseline_improvement_z": False,
- "contender_regression_z": False,
- "contender_improvement_z": False,
+ "baseline_z_regression": False,
+ "baseline_z_improvement": False,
+ "contender_z_regression": False,
+ "contender_z_improvement": False,
  "contender": "0.036 s",
  "contender_id": contender_ids[0],
  "contender_batch_id": batch_ids[1],
@@ -181,10 +184,10 @@ def _api_compare_list(
  "deviations": "2.000",
  "baseline_z_score": "0.000",
  "contender_z_score": "0.000",
- "baseline_regression_z": False,
- "baseline_improvement_z": False,
- "contender_regression_z": False,
- "contender_improvement_z": False,
+ "baseline_z_regression": False,
+ "baseline_z_improvement": False,
+ "contender_z_regression": False,
+ "contender_z_improvement": False,
  "contender": "0.036 s",
  "contender_id": contender_ids[1],
  "contender_batch_id": batch_ids[1],
13 changes: 13 additions & 0 deletions conbench/api/benchmarks.py
@@ -7,6 +7,7 @@
  from ..api._endpoint import ApiEndpoint
  from ..entities._entity import NotFound
  from ..entities.case import Case
+ from ..entities.distribution import set_z_scores
  from ..entities.summary import BenchmarkFacadeSchema, Summary, SummarySerializer


@@ -42,6 +43,7 @@ def get(self, benchmark_id):
  - Benchmarks
  """
  summary = self._get(benchmark_id)
+ set_z_scores([summary])
  return self.serializer.one.dump(summary)

  @flask_login.login_required
@@ -101,16 +103,26 @@ def get(self):
  filters=[Case.name == name],
  joins=[Case],
  )
+ # TODO: cannot currently compute z_score on an arbitrary
+ # list of summaries - assumes same machine/sha/repository.
+ for summary in summaries:
+ summary.z_score = 0
  elif batch_id:
  summaries = Summary.search(
  filters=[Summary.batch_id == batch_id],
  )
+ set_z_scores(summaries)
  elif run_id:
  summaries = Summary.search(
  filters=[Summary.run_id == run_id],
  )
+ set_z_scores(summaries)
  else:
  summaries = Summary.all(order_by=Summary.timestamp.desc(), limit=500)
+ # TODO: cannot currently compute z_score on an arbitrary
+ # list of summaries - assumes same machine/sha/repository.
+ for summary in summaries:
+ summary.z_score = 0
  return self.serializer.many.dump(summaries)

  @flask_login.login_required
@@ -131,6 +143,7 @@ def post(self):
  """
  data = self.validate_benchmark(self.schema.create)
  summary = Summary.create(data)
+ set_z_scores([summary])
  return self.response_201_created(self.serializer.one.dump(summary))
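For a sense of what this change means to API consumers: summaries filtered by batch_id or run_id now carry real z-scores, while name-filtered or unfiltered listings fall back to a z-score of 0. The sketch below is illustrative only; the local URL, ids, and parameter values are placeholders, not part of this commit.

import requests

# Hypothetical local conbench instance; the URL and ids below are placeholders.
base = "http://127.0.0.1:5000/api/benchmarks/"

# Summaries from one batch share a machine/commit context, so the endpoint
# calls set_z_scores() and each result carries a meaningful stats z_score.
by_batch = requests.get(base, params={"batch_id": "some-batch-id"}).json()

# An arbitrary name filter can mix machines and commits, so the endpoint
# sets z_score to 0 for every summary (see the TODO in the hunk above).
by_name = requests.get(base, params={"name": "file-write"}).json()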
2 changes: 1 addition & 1 deletion conbench/api/compare.py
@@ -2,8 +2,8 @@


  from ..api import rule
- from ..api._comparator import BenchmarkComparator, BenchmarkListComparator
  from ..api._endpoint import ApiEndpoint
+ from ..entities._comparator import BenchmarkComparator, BenchmarkListComparator
  from ..entities._entity import NotFound
  from ..entities.distribution import set_z_scores
  from ..entities.summary import Summary
4 changes: 0 additions & 4 deletions conbench/app/compare.py
@@ -124,10 +124,6 @@ def _compare(self, params):
  compare = f'{c["baseline_batch_id"]}...{c["contender_batch_id"]}'
  c["compare_batches_url"] = f.url_for(view, compare_ids=compare)

- c["change"] = float(c["change"][:-1])
- if c["less_is_better"] and c["change"] != 0:
- c["change"] = c["change"] * -1
-
  if c["regression"]:
  regressions += 1
  if c["improvement"]:
74 changes: 40 additions & 34 deletions conbench/api/_comparator.py → conbench/entities/_comparator.py
@@ -15,6 +15,22 @@ def change_fmt(value):
  return "{:.3%}".format(value)


+ def _less_is_better(unit):
+ if unit in ["B/s", "i/s"]:
+ return False
+ return True
+
+
+ def z_regression(z_score, deviations=None):
+ deviations = deviations if deviations else DEVIATIONS
+ return -z_score > deviations
+
+
+ def z_improvement(z_score, deviations=None):
+ deviations = deviations if deviations else DEVIATIONS
+ return z_score > deviations
+
+
  class BenchmarkResult:
  def __init__(
  self,
@@ -72,9 +88,7 @@ def unit(self):

  @property
  def less_is_better(self):
- if self.unit in ["B/s", "i/s"]:
- return False
- return True
+ return _less_is_better(self.unit)

  @property
  def change(self):
@@ -89,19 +103,19 @@ def change(self):
  if old == 0:
  return 0.0

- return (new - old) / abs(old)
+ result = (new - old) / abs(old)
+ if self.less_is_better and result != 0:
+ result = result * -1
+
+ return result

  @property
  def regression(self):
- change = self.change
- adjusted_change = change if self.less_is_better else -change
- return adjusted_change * 100 > self.threshold
+ return -self.change * 100 > self.threshold

  @property
  def improvement(self):
- change = self.change
- adjusted_change = -change if self.less_is_better else change
- return adjusted_change * 100 > self.threshold
+ return self.change * 100 > self.threshold

  @property
  def baseline_z_score(self):
@@ -116,28 +130,20 @@ def contender_z_score(self):
  return self.contender.z_score

  @property
- def baseline_regression_z(self):
- z_score = self.baseline_z_score
- adjusted_z_score = z_score if self.less_is_better else -z_score
- return adjusted_z_score > self.deviations
+ def baseline_z_regression(self):
+ return z_regression(self.baseline_z_score, self.deviations)

  @property
- def baseline_improvement_z(self):
- z_score = self.baseline_z_score
- adjusted_z_score = -z_score if self.less_is_better else z_score
- return adjusted_z_score > self.deviations
+ def baseline_z_improvement(self):
+ return z_improvement(self.baseline_z_score, self.deviations)

  @property
- def contender_regression_z(self):
- z_score = self.contender_z_score
- adjusted_z_score = z_score if self.less_is_better else -z_score
- return adjusted_z_score > self.deviations
+ def contender_z_regression(self):
+ return z_regression(self.contender_z_score, self.deviations)

  @property
- def contender_improvement_z(self):
- z_score = self.contender_z_score
- adjusted_z_score = -z_score if self.less_is_better else z_score
- return adjusted_z_score > self.deviations
+ def contender_z_improvement(self):
+ return z_improvement(self.contender_z_score, self.deviations)

  @property
  def tags(self):
@@ -161,10 +167,10 @@ def formatted(self):
  "deviations": fmt(self.deviations),
  "baseline_z_score": fmt(self.baseline_z_score),
  "contender_z_score": fmt(self.contender_z_score),
- "baseline_regression_z": self.baseline_regression_z,
- "baseline_improvement_z": self.baseline_improvement_z,
- "contender_regression_z": self.contender_regression_z,
- "contender_improvement_z": self.contender_improvement_z,
+ "baseline_z_regression": self.baseline_z_regression,
+ "baseline_z_improvement": self.baseline_z_improvement,
+ "contender_z_regression": self.contender_z_regression,
+ "contender_z_improvement": self.contender_z_improvement,
  "baseline": fmt_unit(baseline, self.unit),
  "contender": fmt_unit(contender, self.unit),
  "baseline_id": self.baseline.id if self.baseline else None,
@@ -191,10 +197,10 @@ def compare(self):
  "deviations": fmt(self.deviations),
  "baseline_z_score": fmt(self.baseline_z_score),
  "contender_z_score": fmt(self.contender_z_score),
- "baseline_regression_z": self.baseline_regression_z,
- "baseline_improvement_z": self.baseline_improvement_z,
- "contender_regression_z": self.contender_regression_z,
- "contender_improvement_z": self.contender_improvement_z,
+ "baseline_z_regression": self.baseline_z_regression,
+ "baseline_z_improvement": self.baseline_z_improvement,
+ "contender_z_regression": self.contender_z_regression,
+ "contender_z_improvement": self.contender_z_improvement,
  "baseline": fmt(baseline),
  "contender": fmt(contender),
  "baseline_id": self.baseline.id if self.baseline else None,
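To make the new sign conventions concrete, here is a small worked sketch of the helpers introduced above. The DEVIATIONS value of 5.0 is an assumed placeholder for illustration; the real constant is defined elsewhere in _comparator.py and is not shown in this diff.

# Standalone sketch of the helpers moved/added in conbench/entities/_comparator.py.
# DEVIATIONS = 5.0 is an assumed placeholder, not the project's actual constant.
DEVIATIONS = 5.0


def _less_is_better(unit):
    # Throughput units (bytes/sec, items/sec) are better when larger;
    # everything else (e.g. seconds) is better when smaller.
    if unit in ["B/s", "i/s"]:
        return False
    return True


def z_regression(z_score, deviations=None):
    deviations = deviations if deviations else DEVIATIONS
    return -z_score > deviations


def z_improvement(z_score, deviations=None):
    deviations = deviations if deviations else DEVIATIONS
    return z_score > deviations


# With z-scores already sign-normalized by set_z_scores (positive = better),
# a contender 6 standard deviations worse than the baseline distribution:
assert z_regression(-6.0) is True
assert z_improvement(-6.0) is False
# ...and one 6 standard deviations better:
assert z_improvement(6.0) is True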
3 changes: 3 additions & 0 deletions conbench/entities/distribution.py
@@ -11,6 +11,7 @@
  NotNull,
  Nullable,
  )
+ from ..entities._comparator import _less_is_better
  from ..entities.commit import Commit
  from ..entities.machine import Machine
  from ..entities.run import Run
@@ -184,3 +185,5 @@ def set_z_scores(summaries):
  d = lookup.get(f"{summary.case_id}-{summary.context_id}")
  if d and d.mean_sd:
  summary.z_score = (summary.mean - d.mean_mean) / d.mean_sd
+ if _less_is_better(summary.unit) and summary.z_score != 0:
+ summary.z_score = summary.z_score * -1
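To make the sign flip concrete, a small worked example of the arithmetic added above; the distribution numbers are hypothetical.

# Hypothetical values for a timing benchmark (unit "s", so less is better):
dist_mean, dist_sd, new_mean = 0.030, 0.002, 0.036

z_score = (new_mean - dist_mean) / dist_sd   # (0.036 - 0.030) / 0.002 = 3.0
# The run got slower, so the sign is flipped for less-is-better units.
# After the flip, "positive z-score = improvement" holds for every unit.
z_score = z_score * -1                       # -3.0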
4 changes: 4 additions & 0 deletions conbench/entities/summary.py
@@ -16,6 +16,7 @@
  NotNull,
  Nullable,
  )
+ from ..entities._comparator import z_improvement, z_regression
  from ..entities.case import Case
  from ..entities.context import Context
  from ..entities.commit import Commit, parse_commit
@@ -215,6 +216,9 @@ def _dump(self, summary):
  "q3": self.decimal_fmt.format(summary.q3),
  "iqr": self.decimal_fmt.format(summary.iqr),
  "timestamp": summary.timestamp.isoformat(),
+ "z_score": self.decimal_fmt.format(summary.z_score),
+ "z_regression": z_regression(summary.z_score),
+ "z_improvement": z_improvement(summary.z_score),
  },
  "links": {
  "list": f.url_for("api.benchmarks", _external=True),
8 changes: 5 additions & 3 deletions conbench/runner.py
@@ -149,6 +149,7 @@ def record(self, result, name, tags, context, github, options, output=None):
  result.get("time_unit", "s"),
  timestamp,
  run_id,
+ self.batch_id,
  run_name,
  )
  benchmark = {
@@ -224,7 +225,8 @@ def _get_timing_options(self, options):
  "iterations": options.get("iterations", 1),
  }

- def _stats(self, data, unit, times, time_unit, timestamp, run_id, run_name):
+ @staticmethod
+ def _stats(data, unit, times, time_unit, timestamp, run_id, batch_id, run_name):
  fmt = "{:.6f}"

  def _format(f, data, min_length=0):
@@ -236,7 +238,7 @@ def _format(f, data, min_length=0):
  q1, q3 = np.percentile(data, [25, 75])

  if not run_id:
- run_id = self.batch_id
+ run_id = batch_id

  result = {
  "data": [fmt.format(x) for x in data],
@@ -245,7 +247,7 @@ def _format(f, data, min_length=0):
  "time_unit": time_unit,
  "iterations": len(data),
  "timestamp": timestamp,
- "batch_id": self.batch_id,
+ "batch_id": batch_id,
  "run_id": run_id,
  "mean": _format(statistics.mean, data),
  "median": _format(statistics.median, data),
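With _stats now a @staticmethod that takes batch_id explicitly, the summary statistics it reports are straightforward to reproduce in isolation. A minimal, self-contained sketch follows; the timing values are made up, and only the fields visible in the hunks above are mirrored.

import statistics

import numpy as np

# Made-up timings in seconds, standing in for one benchmark's iterations.
data = [0.099094, 0.037129, 0.036381, 0.148896, 0.008104, 0.005496]

q1, q3 = np.percentile(data, [25, 75])
fmt = "{:.6f}"

result = {
    "data": [fmt.format(x) for x in data],
    "unit": "s",
    "time_unit": "s",
    "iterations": len(data),
    "mean": fmt.format(statistics.mean(data)),
    "median": fmt.format(statistics.median(data)),
    "q1": fmt.format(q1),
    "q3": fmt.format(q3),
}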
2 changes: 2 additions & 0 deletions conbench/templates/batch.html
@@ -60,6 +60,7 @@
  <th scope="col">Batch</th>
  <th scope="col">Benchmark</th>
  <th scope="col">Mean</th>
+ <th scope="col">Z-Score</th>
  </tr>
  </thead>
  <tbody>
@@ -73,6 +74,7 @@
  <div>{{ benchmark.display_name }}</div>
  </a></td>
  <td>{{ benchmark.display_mean }}</td>
+ <td>{{ benchmark.stats.z_score }}</td>
  </tr>
  {% endfor %}
  </tbody>
12 changes: 8 additions & 4 deletions conbench/templates/benchmark-entity.html
@@ -58,10 +58,6 @@
  </li>
  {% endif %}
  {% endfor %}
- <li class="list-group-item" style="overflow-y: auto;">
- <b>&nbsp;</b>
- <div align="right" style="display:inline-block; float: right;">&nbsp;</div>
- </li>
  <li class="list-group-item active">Tags</li>
  {% for k,v in benchmark.tags.items() %}
  <li class="list-group-item" style="overflow-y: auto;">
@@ -109,6 +105,14 @@
  <div align="right" style="display:inline-block; float: right;">{{ v }}</div>
  </li>
  {% endfor %}
+ <li class="list-group-item" style="overflow-y: auto;">
+ <b>&nbsp;</b>
+ <div align="right" style="display:inline-block; float: right;">&nbsp;</div>
+ </li>
+ <li class="list-group-item" style="overflow-y: auto;">
+ <b>&nbsp;</b>
+ <div align="right" style="display:inline-block; float: right;">&nbsp;</div>
+ </li>
  <li class="list-group-item active" >Context</li>
  {% for k,v in benchmark.context.items() %}
  <li class="list-group-item" style="overflow-y: auto;">
1 change: 1 addition & 0 deletions conbench/templates/benchmark-list.html
@@ -60,6 +60,7 @@
  var table = $('#benchmarks').dataTable( {
  "responsive": true,
  "order": [[0, 'desc']],
+ "columnDefs": [{ "orderable": false, "targets": [3] }]
  } );
  {% else %}
  var table = $('#benchmarks').dataTable( {
