Skip to content

Commit

Permalink
Merge c691125 into 843ad94
Browse files Browse the repository at this point in the history
  • Loading branch information
dianaclarke committed Jul 30, 2021
2 parents 843ad94 + c691125 commit 3f01768
Show file tree
Hide file tree
Showing 15 changed files with 211 additions and 452 deletions.
1 change: 0 additions & 1 deletion conbench/api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from .commits import * # noqa
from .compare import * # noqa
from .contexts import * # noqa
from .distribution import * # noqa
from .history import * # noqa
from .index import * # noqa
from .machines import * # noqa
Expand Down
2 changes: 0 additions & 2 deletions conbench/api/_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ def _201_created(example, schema=None):
spec.components.response("CompareList", _200_ok(ex.COMPARE_LIST))
spec.components.response("ContextEntity", _200_ok(ex.CONTEXT_ENTITY))
spec.components.response("ContextList", _200_ok([ex.CONTEXT_ENTITY]))
spec.components.response("DistributionList", _200_ok([ex.DISTRIBUTION_ENTITY]))
spec.components.response("HistoryList", _200_ok([ex.HISTORY_ENTITY]))
spec.components.response("MachineEntity", _200_ok(ex.MACHINE_ENTITY))
spec.components.response("MachineList", _200_ok([ex.MACHINE_ENTITY]))
Expand All @@ -89,7 +88,6 @@ def _201_created(example, schema=None):
{"name": "Commits", "description": "Benchmarked commits"},
{"name": "Comparisons", "description": "Benchmark comparisons"},
{"name": "Contexts", "description": "Benchmark contexts"},
{"name": "Distribution", "description": "Benchmark distribution"},
{"name": "History", "description": "Benchmark history"},
{"name": "Machines", "description": "Benchmark machines"},
{"name": "Runs", "description": "Benchmark runs"},
Expand Down
31 changes: 2 additions & 29 deletions conbench/api/_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,29 +224,6 @@ def _api_context_entity(context_id, links=True):
return result


def _api_distribution_entity(
distribution_id,
case_id,
context_id,
commit_id,
):
result = {
"id": distribution_id,
"case_id": case_id,
"context_id": context_id,
"commit_id": commit_id,
"machine_hash": "diana-2-4-17179869184",
"unit": "s",
"mean_mean": "0.036369",
"mean_sd": "0.000000",
"repository": "https://github.com/apache/arrow",
"sha": "02addad336ba19a654f9c857ede546331be7b631",
"first_timestamp": "2021-02-25T01:02:51",
"last_timestamp": "2021-02-25T01:02:51",
}
return result


def _api_history_entity(benchmark_id, case_id, context_id):
return {
"benchmark_id": benchmark_id,
Expand All @@ -255,6 +232,8 @@ def _api_history_entity(benchmark_id, case_id, context_id):
"machine_hash": "diana-2-4-17179869184",
"unit": "s",
"mean": "0.036369",
"distribution_mean": "0.036369",
"distribution_stdev": "0.000000",
"repository": "https://github.com/apache/arrow",
"sha": "02addad336ba19a654f9c857ede546331be7b631",
"timestamp": "2021-02-25T01:02:51",
Expand Down Expand Up @@ -362,12 +341,6 @@ def _api_run_entity(run_id, commit_id, machine_id, now, baseline_id):
],
)
CONTEXT_ENTITY = _api_context_entity("some-context-uuid-1")
DISTRIBUTION_ENTITY = _api_distribution_entity(
"some-distribution-uuid-1",
"some-case-uuid-1",
"some-context-uuid-1",
"some-commit-uuid-1",
)
HISTORY_ENTITY = _api_history_entity(
"some-benchmark-uuid-1",
"some-case-uuid-1",
Expand Down
48 changes: 0 additions & 48 deletions conbench/api/distribution.py

This file was deleted.

29 changes: 7 additions & 22 deletions conbench/app/_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,9 @@
class TimeSeriesPlotMixin:
def _get_history_plot(self, benchmark):
history = self._get_history(benchmark)
distribution = self._get_distribution(benchmark)
return json.dumps(
bokeh.embed.json_item(
time_series_plot(history, distribution, benchmark["id"]),
time_series_plot(history, benchmark["id"]),
"plot-history",
)
)
Expand All @@ -24,13 +23,6 @@ def _get_history(self, benchmark):
return []
return response.json

def _get_distribution(self, benchmark):
response = self.api_get("api.distribution", benchmark_id=benchmark["id"])
if response.status_code != 200:
self.flash("Error getting distribution.")
return []
return response.json


def get_display_unit(unit):
if unit == "s":
Expand Down Expand Up @@ -101,17 +93,10 @@ def simple_bar_plot(benchmarks, height=400, width=400):
return p


def time_series_plot(history, distribution, benchmark_id, height=250, width=1000):
dist_by_sha = {d["sha"]: d for d in distribution}
for h in history:
dist = dist_by_sha.get(h["sha"])
if dist:
h["mean_mean"] = dist["mean_mean"]
h["mean_sd"] = dist["mean_sd"]

def time_series_plot(history, benchmark_id, height=250, width=1000):
unit = get_display_unit(history[0]["unit"])
current = [h for h in history if h["benchmark_id"] == benchmark_id]
with_dist = [h for h in history if h.get("mean_mean")]
with_dist = [h for h in history if h["distribution_mean"]]

times = [h["mean"] for h in history]
commits = [h["message"] for h in history]
Expand All @@ -121,15 +106,15 @@ def time_series_plot(history, distribution, benchmark_id, height=250, width=1000
commits_x = [c["message"] for c in current]
dates_x = [dateutil.parser.isoparse(c["timestamp"]) for c in current]

times_mean = [w["mean_mean"] for w in with_dist]
times_mean = [w["distribution_mean"] for w in with_dist]
commits_mean = [w["message"] for w in with_dist]
dates_mean = [dateutil.parser.isoparse(w["timestamp"]) for w in with_dist]

alert_min, alert_max = [], []
for w in with_dist:
alert = 5 * float(w["mean_sd"])
alert_min.append(float(w["mean_mean"]) - alert)
alert_max.append(float(w["mean_mean"]) + alert)
alert = 5 * float(w["distribution_stdev"])
alert_min.append(float(w["distribution_mean"]) - alert)
alert_max.append(float(w["distribution_mean"]) + alert)

source_data = dict(x=dates, y=times, commit=commits)
source = bokeh.models.ColumnDataSource(data=source_data)
Expand Down
128 changes: 35 additions & 93 deletions conbench/entities/distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from ..entities._entity import (
Base,
EntityMixin,
EntitySerializer,
generate_uuid,
NotNull,
Nullable,
Expand All @@ -21,11 +20,9 @@
class Distribution(Base, EntityMixin):
__tablename__ = "distribution"
id = NotNull(s.String(50), primary_key=True, default=generate_uuid)
sha = NotNull(s.String(50))
repository = NotNull(s.String(100))
case_id = NotNull(s.String(50), s.ForeignKey("case.id", ondelete="CASCADE"))
context_id = NotNull(s.String(50), s.ForeignKey("context.id", ondelete="CASCADE"))
commit_id = Nullable(s.String(50), s.ForeignKey("commit.id", ondelete="CASCADE"))
commit_id = NotNull(s.String(50), s.ForeignKey("commit.id", ondelete="CASCADE"))
machine_hash = NotNull(s.String(250))
unit = NotNull(s.Text)
mean_mean = Nullable(s.Numeric, check("mean_mean>=0"))
Expand All @@ -44,71 +41,12 @@ class Distribution(Base, EntityMixin):

s.Index(
"distribution_index",
Distribution.sha,
Distribution.case_id,
Distribution.context_id,
Distribution.commit_id,
Distribution.machine_hash,
unique=True,
)
s.Index("distribution_sha_index", Distribution.sha)
s.Index("distribution_repository_index", Distribution.repository)
s.Index("distribution_case_id_index", Distribution.case_id)
s.Index("distribution_context_id_index", Distribution.context_id)
s.Index("distribution_commit_id_index", Distribution.commit_id)
s.Index("distribution_machine_hash_index", Distribution.machine_hash)


class _Serializer(EntitySerializer):
decimal_fmt = "{:.6f}"

def _dump(self, distribution):
standard_deviation = distribution.mean_sd if distribution.mean_sd else 0
result = {
"id": distribution.id,
"sha": distribution.sha,
"repository": distribution.repository,
"case_id": distribution.case_id,
"context_id": distribution.context_id,
"commit_id": distribution.commit_id,
"machine_hash": distribution.machine_hash,
"unit": distribution.unit,
"mean_mean": self.decimal_fmt.format(distribution.mean_mean),
"mean_sd": self.decimal_fmt.format(standard_deviation),
"first_timestamp": distribution.first_timestamp.isoformat(),
"last_timestamp": distribution.last_timestamp.isoformat(),
}
return result


class DistributionSerializer:
one = _Serializer()
many = _Serializer(many=True)


def get_distribution_history(case_id, context_id, machine_hash):
    """Return all Distribution rows for one case/context/machine triple.

    Selects the full set of Distribution columns, filtered to the given
    case id, context id, and machine hash, ordered oldest-first by
    ``first_timestamp`` so callers get the history in chronological order.

    NOTE: indentation was lost in transit; reconstructed with standard
    formatting, tokens unchanged. Relies on module-level ``Session`` and
    ``Distribution`` — presumably SQLAlchemy session and model; confirm
    against the original module.
    """
    return (
        Session.query(
            Distribution.id,
            Distribution.repository,
            Distribution.sha,
            Distribution.case_id,
            Distribution.context_id,
            Distribution.commit_id,
            Distribution.machine_hash,
            Distribution.unit,
            Distribution.mean_mean,
            Distribution.mean_sd,
            Distribution.first_timestamp,
            Distribution.last_timestamp,
        )
        .filter(
            Distribution.case_id == case_id,
            Distribution.context_id == context_id,
            Distribution.machine_hash == machine_hash,
        )
        .order_by(Distribution.first_timestamp.asc())
        .all()
    )


def get_commit_index(repository):
Expand All @@ -131,19 +69,24 @@ def get_commits_up(repository, sha, limit):
return Session.query(index).filter(index.c.row_number >= n).limit(limit)


def get_distribution(
repository, sha, case_id, context_id, commit_id, machine_hash, limit
):
def get_distribution(summary, limit):
from ..entities.summary import Summary

commits_up = get_commits_up(repository, sha, limit).subquery().alias("commits_up")
commits_up = (
get_commits_up(
summary.run.commit.repository,
summary.run.commit.sha,
limit,
)
.subquery()
.alias("commits_up")
)

return (
Session.query(
func.text(repository).label("repository"),
func.text(sha).label("sha"),
func.text(case_id).label("case_id"),
func.text(context_id).label("context_id"),
func.text(commit_id).label("commit_id"),
func.text(summary.case_id).label("case_id"),
func.text(summary.context_id).label("context_id"),
func.text(summary.run.commit_id).label("commit_id"),
Machine.hash,
func.max(Summary.unit).label("unit"),
func.avg(Summary.mean).label("mean_mean"),
Expand Down Expand Up @@ -171,25 +114,17 @@ def get_distribution(
.join(commits_up, commits_up.c.id == Run.commit_id)
.filter(
Run.name.like("commit: %"),
Summary.case_id == case_id,
Summary.context_id == context_id,
Machine.hash == machine_hash,
Summary.case_id == summary.case_id,
Summary.context_id == summary.context_id,
Machine.hash == summary.run.machine.hash,
)
)


def update_distribution(repository, sha, summary, limit):
def update_distribution(summary, limit):
from ..db import engine

distribution = get_distribution(
repository,
sha,
summary.case_id,
summary.context_id,
summary.run.commit_id,
summary.run.machine.hash,
limit,
).first()
distribution = get_distribution(summary, limit).first()

if not distribution:
return
Expand All @@ -204,7 +139,7 @@ def update_distribution(repository, sha, summary, limit):
insert(Distribution.__table__)
.values(values)
.on_conflict_do_update(
index_elements=["sha", "case_id", "context_id", "machine_hash"],
index_elements=["case_id", "context_id", "commit_id", "machine_hash"],
set_=values,
)
)
Expand All @@ -215,15 +150,21 @@ def set_z_scores(summaries):
if not summaries:
return

for summary in summaries:
summary.z_score = 0

first = summaries[0]
repository = first.run.commit.repository
sha = first.run.commit.parent
machine_hash = first.run.machine.hash
parent_commit = Commit.first(
sha=first.run.commit.parent,
repository=first.run.commit.repository,
)

if not parent_commit:
return

where = [
Distribution.repository == repository,
Distribution.sha == sha,
Distribution.machine_hash == machine_hash,
Distribution.commit_id == parent_commit.id,
Distribution.machine_hash == first.run.machine.hash,
]
if len(summaries) == 1:
where.extend(
Expand All @@ -239,6 +180,7 @@ def set_z_scores(summaries):
Distribution.mean_mean,
Distribution.mean_sd,
]

distributions = Session.query(*cols).filter(*where).all()
lookup = {f"{d.case_id}-{d.context_id}": d for d in distributions}

Expand Down

0 comments on commit 3f01768

Please sign in to comment.