Z-score API scaffold (#70)

conbench · May 20, 2021 · 3852549 · 3852549
1 parent a60fafa
commit 3852549
Show file tree

Hide file tree

Showing 6 changed files with 546 additions and 22 deletions.
diff --git a/conbench/api/_comparator.py b/conbench/api/_comparator.py
@@ -3,7 +3,8 @@
 from ..units import formatter_for_unit
 
 
-THRESHOLD = 5  # percent
+THRESHOLD = 5.0  # percent
+DEVIATIONS = 2.0  # standard deviations
 
 
 def fmt(value):
@@ -15,7 +16,18 @@ def change_fmt(value):
 
 
 class BenchmarkResult:
-    def __init__(self, id, batch_id, run_id, unit, value, batch, benchmark, tags):
+    def __init__(
+        self,
+        id,
+        batch_id,
+        run_id,
+        unit,
+        value,
+        batch,
+        benchmark,
+        tags,
+        z_score,
+    ):
         self.id = id
         self.batch_id = batch_id
         self.run_id = run_id
@@ -24,13 +36,15 @@ def __init__(self, id, batch_id, run_id, unit, value, batch, benchmark, tags):
         self.benchmark = benchmark
         self.value = decimal.Decimal(value)
         self.tags = tags
+        self.z_score = decimal.Decimal(z_score)
 
 
 class BenchmarkComparator:
-    def __init__(self, baseline, contender, threshold=None):
+    def __init__(self, baseline, contender, threshold=None, deviations=None):
         self.baseline = BenchmarkResult(**baseline) if baseline else None
         self.contender = BenchmarkResult(**contender) if contender else None
-        self.threshold = threshold if threshold is not None else THRESHOLD
+        self.threshold = float(threshold) if threshold is not None else THRESHOLD
+        self.deviations = float(deviations) if deviations is not None else DEVIATIONS
 
     @property
     def batch(self):
@@ -89,6 +103,42 @@ def improvement(self):
         adjusted_change = -change if self.less_is_better else change
         return adjusted_change * 100 > self.threshold
 
+    @property
+    def baseline_z_score(self):
+        if self.baseline is None:
+            return 0.0
+        return self.baseline.z_score
+
+    @property
+    def contender_z_score(self):
+        if self.contender is None:
+            return 0.0
+        return self.contender.z_score
+
+    @property
+    def baseline_regression_z(self):
+        z_score = self.baseline_z_score
+        adjusted_z_score = z_score if self.less_is_better else -z_score
+        return adjusted_z_score > self.deviations
+
+    @property
+    def baseline_improvement_z(self):
+        z_score = self.baseline_z_score
+        adjusted_z_score = -z_score if self.less_is_better else z_score
+        return adjusted_z_score > self.deviations
+
+    @property
+    def contender_regression_z(self):
+        z_score = self.contender_z_score
+        adjusted_z_score = z_score if self.less_is_better else -z_score
+        return adjusted_z_score > self.deviations
+
+    @property
+    def contender_improvement_z(self):
+        z_score = self.contender_z_score
+        adjusted_z_score = -z_score if self.less_is_better else z_score
+        return adjusted_z_score > self.deviations
+
     @property
     def tags(self):
         if self.baseline is not None:
@@ -98,17 +148,25 @@ def tags(self):
         return "unknown"
 
     def formatted(self):
-        fmt = formatter_for_unit(self.unit)
+        fmt_unit = formatter_for_unit(self.unit)
         baseline = self.baseline.value if self.baseline else None
         contender = self.contender.value if self.contender else None
         return {
             "batch": self.batch,
             "benchmark": self.benchmark,
             "change": change_fmt(self.change),
+            "threshold": fmt(self.threshold) + "%",
             "regression": self.regression,
             "improvement": self.improvement,
-            "baseline": fmt(baseline, self.unit),
-            "contender": fmt(contender, self.unit),
+            "deviations": fmt(self.deviations),
+            "baseline_z_score": fmt(self.baseline_z_score),
+            "contender_z_score": fmt(self.contender_z_score),
+            "baseline_regression_z": self.baseline_regression_z,
+            "baseline_improvement_z": self.baseline_improvement_z,
+            "contender_regression_z": self.contender_regression_z,
+            "contender_improvement_z": self.contender_improvement_z,
+            "baseline": fmt_unit(baseline, self.unit),
+            "contender": fmt_unit(contender, self.unit),
             "baseline_id": self.baseline.id if self.baseline else None,
             "contender_id": self.contender.id if self.contender else None,
             "baseline_batch_id": self.baseline.batch_id if self.baseline else None,
@@ -127,8 +185,16 @@ def compare(self):
             "batch": self.batch,
             "benchmark": self.benchmark,
             "change": fmt(self.change * 100),
+            "threshold": fmt(self.threshold),
             "regression": self.regression,
             "improvement": self.improvement,
+            "deviations": fmt(self.deviations),
+            "baseline_z_score": fmt(self.baseline_z_score),
+            "contender_z_score": fmt(self.contender_z_score),
+            "baseline_regression_z": self.baseline_regression_z,
+            "baseline_improvement_z": self.baseline_improvement_z,
+            "contender_regression_z": self.contender_regression_z,
+            "contender_improvement_z": self.contender_improvement_z,
             "baseline": fmt(baseline),
             "contender": fmt(contender),
             "baseline_id": self.baseline.id if self.baseline else None,
@@ -144,16 +210,27 @@ def compare(self):
 
 
 class BenchmarkListComparator:
-    def __init__(self, pairs, threshold=None):
+    def __init__(self, pairs, threshold=None, deviations=None):
         self.pairs = pairs
-        self.threshold = threshold if threshold is not None else THRESHOLD
+        self.threshold = float(threshold) if threshold is not None else THRESHOLD
+        self.deviations = float(deviations) if deviations is not None else DEVIATIONS
 
     def formatted(self):
         for pair in self.pairs.values():
             baseline, contender = pair.get("baseline"), pair.get("contender")
-            yield BenchmarkComparator(baseline, contender, self.threshold).formatted()
+            yield BenchmarkComparator(
+                baseline,
+                contender,
+                self.threshold,
+                self.deviations,
+            ).formatted()
 
     def compare(self):
         for pair in self.pairs.values():
             baseline, contender = pair.get("baseline"), pair.get("contender")
-            yield BenchmarkComparator(baseline, contender, self.threshold).compare()
+            yield BenchmarkComparator(
+                baseline,
+                contender,
+                self.threshold,
+                self.deviations,
+            ).compare()
diff --git a/conbench/api/_examples.py b/conbench/api/_examples.py
@@ -111,13 +111,21 @@ def _api_compare_entity(benchmark_ids, batch_ids, run_ids, batch, benchmark, tag
         "batch": batch,
         "benchmark": benchmark,
         "change": "0.000%",
+        "threshold": "5.000%",
+        "regression": False,
+        "improvement": False,
+        "deviations": "2.000",
+        "baseline_z_score": "0.000",
+        "contender_z_score": "0.000",
+        "baseline_regression_z": False,
+        "baseline_improvement_z": False,
+        "contender_regression_z": False,
+        "contender_improvement_z": False,
         "contender": "0.036 s",
         "contender_id": benchmark_ids[1],
         "contender_batch_id": batch_ids[1],
         "contender_run_id": run_ids[1],
         "less_is_better": True,
-        "regression": False,
-        "improvement": False,
         "unit": "s",
         "tags": tags,
     }
@@ -141,13 +149,21 @@ def _api_compare_list(
             "batch": batches[0],
             "benchmark": benchmarks[0],
             "change": "0.000%",
+            "threshold": "5.000%",
+            "regression": False,
+            "improvement": False,
+            "deviations": "2.000",
+            "baseline_z_score": "0.000",
+            "contender_z_score": "0.000",
+            "baseline_regression_z": False,
+            "baseline_improvement_z": False,
+            "contender_regression_z": False,
+            "contender_improvement_z": False,
             "contender": "0.036 s",
             "contender_id": contender_ids[0],
             "contender_batch_id": batch_ids[1],
             "contender_run_id": run_ids[1],
             "less_is_better": True,
-            "regression": False,
-            "improvement": False,
             "unit": "s",
             "tags": tags[0],
         },
@@ -159,13 +175,21 @@ def _api_compare_list(
             "batch": batches[1],
             "benchmark": benchmarks[1],
             "change": "0.000%",
+            "threshold": "5.000%",
+            "regression": False,
+            "improvement": False,
+            "deviations": "2.000",
+            "baseline_z_score": "0.000",
+            "contender_z_score": "0.000",
+            "baseline_regression_z": False,
+            "baseline_improvement_z": False,
+            "contender_regression_z": False,
+            "contender_improvement_z": False,
             "contender": "0.036 s",
             "contender_id": contender_ids[1],
             "contender_batch_id": batch_ids[1],
             "contender_run_id": run_ids[1],
             "less_is_better": True,
-            "regression": False,
-            "improvement": False,
             "unit": "s",
             "tags": tags[1],
         },

diff --git a/conbench/api/compare.py b/conbench/api/compare.py
@@ -19,6 +19,7 @@ def _compare_entity(summary):
         "benchmark": summary.display_name,
         "batch": summary.display_batch,
         "tags": summary.case.tags,
+        "z_score": 0.0,  # TODO
     }
 
 
@@ -52,14 +53,23 @@ def get(self, compare_ids):
             name: threshold
             schema:
               type: integer
+          - in: query
+            name: deviations
+            schema:
+              type: integer
         tags:
           - Compare
         """
         raw = f.request.args.get("raw", "false").lower() in ["true", "1"]
+
         threshold = f.request.args.get("threshold")
         if threshold is not None:
             threshold = int(threshold)
 
+        deviations = f.request.args.get("deviations")
+        if deviations is not None:
+            deviations = int(deviations)
+
         try:
             baseline_id, contender_id = compare_ids.split("...", 1)
         except ValueError:
@@ -76,9 +86,19 @@ def get(self, compare_ids):
         contender = _compare_entity(contender_summary)
 
         if raw:
-            return BenchmarkComparator(baseline, contender, threshold).compare()
+            return BenchmarkComparator(
+                baseline,
+                contender,
+                threshold,
+                deviations,
+            ).compare()
         else:
-            return BenchmarkComparator(baseline, contender, threshold).formatted()
+            return BenchmarkComparator(
+                baseline,
+                contender,
+                threshold,
+                deviations,
+            ).formatted()
 
 
 class CompareBatchesAPI(ApiEndpoint):
@@ -113,14 +133,23 @@ def get(self, compare_ids):
             name: threshold
             schema:
               type: integer
+          - in: query
+            name: deviations
+            schema:
+              type: integer
         tags:
           - Compare
         """
         raw = f.request.args.get("raw", "false").lower() in ["true", "1"]
+
         threshold = f.request.args.get("threshold")
         if threshold is not None:
             threshold = int(threshold)
 
+        deviations = f.request.args.get("deviations")
+        if deviations is not None:
+            deviations = int(deviations)
+
         try:
             baseline_id, contender_id = compare_ids.split("...", 1)
         except ValueError:
@@ -140,9 +169,17 @@ def get(self, compare_ids):
             self._add_pair(pairs, summary, "contender")
 
         if raw:
-            result = BenchmarkListComparator(pairs, threshold).compare()
+            result = BenchmarkListComparator(
+                pairs,
+                threshold,
+                deviations,
+            ).compare()
         else:
-            result = BenchmarkListComparator(pairs, threshold).formatted()
+            result = BenchmarkListComparator(
+                pairs,
+                threshold,
+                deviations,
+            ).formatted()
 
         return f.jsonify(list(result))
 

diff --git a/conbench/entities/distribution.py b/conbench/entities/distribution.py
@@ -146,3 +146,52 @@ def update_distribution(repository, sha, summary, limit):
             )
         )
         conn.commit()
+
+
+q = """SELECT
+run_id,
+summary.case_id,
+summary.machine_id,
+machine.name AS machine_name,
+summary.unit,
+summary.time_unit,
+summary.min,
+summary.max,
+summary.mean,
+summary.median,
+commit.timestamp AS commit_timestamp,
+commit.repository AS commit_repository,
+(summary.mean - distribution.mean_mean) / distribution.mean_sd AS mean_z,
+(summary.min - distribution.min_mean) / distribution.min_sd AS min_z,
+(summary.max - distribution.max_mean) / distribution.max_sd AS max_z,
+(summary.median - distribution.median_mean) / distribution.median_sd AS median_z,
+commit.timestamp AS commit_timestamp,
+commit.repository AS commit_repository,
+distribution.*
+FROM summary
+INNER JOIN run
+ON summary.run_id = run.id
+INNER JOIN commit
+ON commit.id = run.commit_id
+INNER JOIN machine
+ON summary.machine_id = machine.id
+LEFT JOIN distribution
+ON summary.case_id = distribution.case_id AND machine.name = distribution.machine_name
+WHERE run.name = 'commit: {{to_compare_sha}}'"""
+
+
+def get_z_score(repository, sha, case_id, context_id, machine_hash, mean):
+    result = list(
+        Session.query(Distribution.mean_mean, Distribution.mean_sd).filter(
+            Distribution.repository == repository,
+            Distribution.sha == sha,
+            Distribution.case_id == case_id,
+            Distribution.context_id == context_id,
+            Distribution.machine_hash == machine_hash,
+        )
+    )
+    if result:
+        distribution_mean = result[0]["mean_mean"]
+        distribution_sd = result[0]["mean_sd"]
+        return (mean - distribution_mean) / distribution_sd
+    return None