Skip to content

Commit

Permalink
Backfill distribution table
Browse files Browse the repository at this point in the history
  • Loading branch information
dianaclarke committed May 11, 2021
1 parent 863b05e commit 68040db
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 16 deletions.
1 change: 1 addition & 0 deletions conbench/entities/_entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def first(cls, **kwargs):
@classmethod
def delete_all(cls):
Session.query(cls).delete()
Session.commit()

@classmethod
def create(cls, data):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

from ...entities.distribution import Distribution
from ...entities.summary import Summary
from ...db import Session


this_dir = os.path.abspath(os.path.dirname(__file__))
Expand Down Expand Up @@ -117,7 +116,6 @@ def test_upgrade():

Distribution.delete_all()
assert Distribution.count() == 0
Session.commit()

# do migration
alembic_config = Config(config_path)
Expand Down
107 changes: 93 additions & 14 deletions migrations/versions/6da4b0d2ad27_backfill_distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@
import uuid

from alembic import op
from sqlalchemy import MetaData
from sqlalchemy import func, MetaData
from sqlalchemy.dialects.postgresql import insert

from conbench.entities.distribution import Distribution, get_distribution
from sqlalchemy.sql import select

# revision identifiers, used by Alembic.
revision = "6da4b0d2ad27"
Expand All @@ -20,6 +19,73 @@
depends_on = None


def get_commit_index(commit_table, repository):
ordered = (
select(commit_table.c.id, commit_table.c.sha, commit_table.c.timestamp)
.filter(commit_table.c.repository == repository)
.order_by(commit_table.c.timestamp.desc())
).cte("ordered_commits")
return select(ordered, func.row_number().over().label("row_number"))


def get_commits_up(commit_table, repository, sha, limit):
index = get_commit_index(commit_table, repository).subquery().alias("commit_index")
n = select(index.c.row_number).filter(index.c.sha == sha).scalar_subquery()
return index.select().filter(index.c.row_number >= n).limit(limit)


def get_distribution(
summary_table,
run_table,
commit_table,
repository,
sha,
case_id,
context_id,
machine_id,
limit,
):
commits_up = (
get_commits_up(commit_table, repository, sha, limit)
.subquery()
.alias("commits_up")
)
return (
select(
func.text(repository).label("repository"),
func.text(sha).label("sha"),
summary_table.c.case_id,
summary_table.c.context_id,
summary_table.c.machine_id,
func.max(summary_table.c.unit).label("unit"),
func.avg(summary_table.c.mean).label("mean_mean"),
func.stddev(summary_table.c.mean).label("mean_sd"),
func.avg(summary_table.c.min).label("min_mean"),
func.stddev(summary_table.c.min).label("min_sd"),
func.avg(summary_table.c.max).label("max_mean"),
func.stddev(summary_table.c.max).label("max_sd"),
func.avg(summary_table.c.median).label("median_mean"),
func.stddev(summary_table.c.median).label("median_sd"),
func.min(commits_up.c.timestamp).label("first_timestamp"),
func.max(commits_up.c.timestamp).label("last_timestamp"),
func.count(summary_table.c.mean).label("observations"),
)
.group_by(
summary_table.c.case_id,
summary_table.c.context_id,
summary_table.c.machine_id,
)
.join(run_table, run_table.c.id == summary_table.c.run_id)
.join(commits_up, commits_up.c.id == run_table.c.commit_id)
.filter(
run_table.c.name.like("commit: %"),
summary_table.c.case_id == case_id,
summary_table.c.context_id == context_id,
summary_table.c.machine_id == machine_id,
)
)


def upgrade():
connection = op.get_bind()
meta = MetaData()
Expand All @@ -42,25 +108,38 @@ def upgrade():

summaries = connection.execute(summary_table.select())
for summary in summaries:
run = runs_by_id[summary["run_id"]]
commit = commits_by_id[run["commit_id"]]
run = runs_by_id.get(summary["run_id"])
if not run:
continue

commit = commits_by_id.get(run["commit_id"])
if not commit:
continue

hash_ = f'{commit["sha"]}{summary["case_id"]}{summary["context_id"]}{summary["machine_id"]}'
if hash_ in seen:
continue

distribution = get_distribution(
commit["repository"],
commit["sha"],
summary["case_id"],
summary["context_id"],
summary["machine_id"],
1000,
).first()
distributions = list(
connection.execute(
get_distribution(
summary_table,
run_table,
commit_table,
commit["repository"],
commit["sha"],
summary["case_id"],
summary["context_id"],
summary["machine_id"],
1000,
)
)
)

if not distribution:
if not distributions:
continue

distribution = distributions[0]
values = dict(distribution)
values["id"] = uuid.uuid4().hex

Expand Down

0 comments on commit 68040db

Please sign in to comment.