
Commit

Do a simple linear search on the first parent of every commit to find all commits for the changes view.

- this works for many cases but not all
- in case it does not walk the right path to a parent, it will terminate after 10 revisions (see the sketch below)
- should be 'good enough' for many cases
- better solutions are still welcome
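
As an illustration of the idea (not part of this commit), a minimal sketch of
such a bounded first-parent walk could look like the following; fetch_commit()
is a hypothetical stand-in for the retrieve_revision() helper added in the diff
below and is assumed to return a dict with 'id' and 'parents' keys:

    REVISION_LIMIT = 10

    def walk_first_parents(start_id, target_id):
        # Follow the first parent of each commit until target_id shows up
        # among the parents, or REVISION_LIMIT commits have been visited.
        visited = []
        current = fetch_commit(start_id)  # hypothetical helper, see note above
        while len(visited) < REVISION_LIMIT:
            visited.append(current)
            parents = current['parents']
            if not parents:
                break  # reached a root commit
            if any(p['id'] == target_id for p in parents):
                break  # target found as a direct parent
            current = fetch_commit(parents[0]['id'])  # follow first parent only
        return visited

This mirrors the loop added to getlogs() below, which stops once startrev shows
up among the parents or once GITHUB_REVISION_LIMIT revisions have been fetched.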

Signed-off-by: Stefan Marr <git@stefan-marr.de>
smarr committed Jun 22, 2011
1 parent 8795f63 commit 9538462
Showing 1 changed file with 77 additions and 47 deletions.
124 changes: 77 additions & 47 deletions speedcenter/codespeed/github.py
@@ -18,11 +18,68 @@

GITHUB_URL_RE = re.compile(r'^(?P<proto>\w+)://github.com/(?P<username>[^/]+)/(?P<project>[^/]+)[.]git$')

# We currently use a simple linear search on a single parent to retrieve
# the history. This is often good enough, but might miss the actual starting
# point. Thus, we need to terminate the search after a reasonable number of
# revisions.
GITHUB_REVISION_LIMIT = 10

def updaterepo(project, update=True):
    return


def retrieve_revision(commit_id, username, project, revision=None):
    commit_url = 'http://github.com/api/v2/json/commits/show/%s/%s/%s' % (
        username, project, commit_id)
    print commit_url

    commit_json = cache.get(commit_url)

    if commit_json is None:
        try:
            commit_json = json.load(urllib.urlopen(commit_url))
        except IOError, e:
            logging.exception("Unable to load %s: %s",
                              commit_url, e, exc_info=True)
            raise e

        if 'error' in commit_json:
            # We'll still cache these for a brief period of time to avoid making too many requests:
            cache.set(commit_url, commit_json, 300)
        else:
            # We'll cache successes for a very long period of time since
            # SCM diffs shouldn't change:
            cache.set(commit_url, commit_json, 86400 * 30)

    if 'error' in commit_json:
        raise RuntimeError("Unable to load %s: %s" % (commit_url, commit_json['error']))

    commit = commit_json['commit']

    date = isodate.parse_datetime(commit['committed_date'])

    if revision:
        # Overwrite any existing data we might have for this revision since
        # we never want our records to be out of sync with the actual VCS:

        # We need to convert the timezone-aware date to a naive (i.e.
        # timezone-less) date in UTC to avoid killing MySQL:
        revision.date = date.astimezone(isodate.tzinfo.Utc()).replace(tzinfo=None)
        revision.author = commit['author']['name']
        revision.message = commit['message']
        revision.full_clean()
        revision.save()

    return {'date': date,
            'message': commit['message'],
            'body': "",  # TODO: pretty-print diffs
            'author': commit['author']['name'],
            'author_email': commit['author']['email'],
            'commitid': commit['id'],
            'short_commit_id': commit['id'][0:7],
            'links': {'Github': 'http://github.com%s' % commit['url']},
            'parents': commit['parents']}


def getlogs(endrev, startrev):
    if endrev != startrev:
        revisions = endrev.branch.revisions.filter(
@@ -40,53 +97,26 @@ def getlogs(endrev, startrev):
    project = m.group("project")

    logs = []
    last_rev_data = None
    revision_count = 0
    ancestor_found = False
    # TODO: get all revisions between endrev and startrev,
    #       not only those present in the Codespeed DB
    for revision in revisions:
        commit_url = 'http://github.com/api/v2/json/commits/show/%s/%s/%s' % (
            username, project, revision.commitid)

        commit_json = cache.get(commit_url)

        if commit_json is None:
            try:
                commit_json = json.load(urllib.urlopen(commit_url))
            except IOError, e:
                logging.exception("Unable to load %s: %s",
                                  commit_url, e, exc_info=True)
                raise e

            if 'error' in commit_json:
                # We'll still cache these for a brief period of time to avoid making too many requests:
                cache.set(commit_url, commit_json, 300)
            else:
                # We'll cache successes for a very long period of time since
                # SCM diffs shouldn't change:
                cache.set(commit_url, commit_json, 86400 * 30)

        if 'error' in commit_json:
            raise RuntimeError("Unable to load %s: %s" % (commit_url, commit_json['error']))

        commit = commit_json['commit']

        date = isodate.parse_datetime(commit['committed_date'])

        # Overwrite any existing data we might have for this revision since
        # we never want our records to be out of sync with the actual VCS:

        # We need to convert the timezone-aware date to a naive (i.e.
        # timezone-less) date in UTC to avoid killing MySQL:
        revision.date = date.astimezone(isodate.tzinfo.Utc()).replace(tzinfo=None)
        revision.author = commit['author']['name']
        revision.message = commit['message']
        revision.full_clean()
        revision.save()

        logs.append({'date': date, 'message': commit['message'],
                     'body': "",  # TODO: pretty-print diffs
                     'author': commit['author']['name'],
                     'author_email': commit['author']['email'],
                     'commitid': commit['id'],
                     'short_commit_id': commit['id'][0:7],
                     'links': {'Github': 'http://github.com%s' % commit['url']}})
    return logs
    for revision in revisions:
        last_rev_data = retrieve_revision(revision.commitid, username, project, revision)
        logs.append(last_rev_data)
        revision_count += 1
        ancestor_found = (startrev.commitid in [rev['id'] for rev in last_rev_data['parents']])

    # Simple approach to find the startrev: stop once it is found or after
    # GITHUB_REVISION_LIMIT revisions have been fetched
    while (revision_count < GITHUB_REVISION_LIMIT
           and not ancestor_found
           and len(last_rev_data['parents']) > 0):
        last_rev_data = retrieve_revision(last_rev_data['parents'][0]['id'], username, project)
        logs.append(last_rev_data)
        revision_count += 1
        ancestor_found = (startrev.commitid in [rev['id'] for rev in last_rev_data['parents']])

    return sorted(logs, key=lambda i: i['date'], reverse=True)
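
For reference, the fields read above suggest that the GitHub API v2 commit JSON
has roughly the following shape. This is an illustrative reconstruction inferred
from the code, not an authoritative API description; every value below is a
made-up placeholder:

    # Successful response: a 'commit' object with the fields used above.
    example_success = {
        'commit': {
            'id': 'full-40-character-sha',
            'parents': [{'id': 'parent-sha'}],
            'author': {'name': 'Author Name', 'email': 'author@example.org'},
            'message': 'commit message',
            'committed_date': '2011-06-22T12:00:00+02:00',
            'url': '/username/project/commit/full-40-character-sha',
        }
    }

    # Error response: cached for 300 seconds and turned into a RuntimeError.
    example_error = {'error': 'Not Found'}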
