
Commit

Do a simple linear search on the first parent of every commit to find all commits for the changes view.

- this works for many cases but not all
- in case it does not walk the right path to a parent, it will terminate after 10 revisions (see the sketch below)
- should be 'good enough' for many cases
- better solutions are still welcome
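
As an illustration of the idea (not part of this commit), a minimal sketch of
such a bounded first-parent walk could look like the following; fetch_commit()
is a hypothetical stand-in for the retrieve_revision() helper added in the diff
below and is assumed to return a dict with 'id' and 'parents' keys:

    REVISION_LIMIT = 10

    def walk_first_parents(start_id, target_id):
        # Follow the first parent of each commit until target_id shows up
        # among the parents, or REVISION_LIMIT commits have been visited.
        visited = []
        current = fetch_commit(start_id)  # hypothetical helper, see note above
        while len(visited) < REVISION_LIMIT:
            visited.append(current)
            parents = current['parents']
            if not parents:
                break  # reached a root commit
            if any(p['id'] == target_id for p in parents):
                break  # target found as a direct parent
            current = fetch_commit(parents[0]['id'])  # follow first parent only
        return visited

This mirrors the loop added to getlogs() below, which stops once startrev shows
up among the parents or once GITHUB_REVISION_LIMIT revisions have been fetched.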

Signed-off-by: Stefan Marr <git@stefan-marr.de>
smarr committed Jun 22, 2011
1 parent 8795f63 commit 9538462
Showing 1 changed file with 77 additions and 47 deletions.
124 changes: 77 additions & 47 deletions speedcenter/codespeed/github.py
@@ -18,11 +18,68 @@

GITHUB_URL_RE = re.compile(r'^(?P<proto>\w+)://github.com/(?P<username>[^/]+)/(?P<project>[^/]+)[.]git$')

# We currently use a simple linear search on a single parent to retrieve
# the history. This is often good enough, but might miss the actual starting
# point. Thus, we need to terminate the search after a reasonable number of
# revisions.
GITHUB_REVISION_LIMIT = 10

def updaterepo(project, update=True):
    return


def retrieve_revision(commit_id, username, project, revision=None):
    commit_url = 'http://github.com/api/v2/json/commits/show/%s/%s/%s' % (
        username, project, commit_id)
    print commit_url

    commit_json = cache.get(commit_url)

    if commit_json is None:
        try:
            commit_json = json.load(urllib.urlopen(commit_url))
        except IOError, e:
            logging.exception("Unable to load %s: %s",
                              commit_url, e, exc_info=True)
            raise e

        if 'error' in commit_json:
            # We'll still cache these for a brief period of time to avoid making too many requests:
            cache.set(commit_url, commit_json, 300)
        else:
            # We'll cache successes for a very long period of time since
            # SCM diffs shouldn't change:
            cache.set(commit_url, commit_json, 86400 * 30)

    if 'error' in commit_json:
        raise RuntimeError("Unable to load %s: %s" % (commit_url, commit_json['error']))

    commit = commit_json['commit']

    date = isodate.parse_datetime(commit['committed_date'])

    if revision:
        # Overwrite any existing data we might have for this revision since
        # we never want our records to be out of sync with the actual VCS:

        # We need to convert the timezone-aware date to a naive (i.e.
        # timezone-less) date in UTC to avoid killing MySQL:
        revision.date = date.astimezone(isodate.tzinfo.Utc()).replace(tzinfo=None)
        revision.author = commit['author']['name']
        revision.message = commit['message']
        revision.full_clean()
        revision.save()

    return {'date': date,
            'message': commit['message'],
            'body': "",  # TODO: pretty-print diffs
            'author': commit['author']['name'],
            'author_email': commit['author']['email'],
            'commitid': commit['id'],
            'short_commit_id': commit['id'][0:7],
            'links': {'Github': 'http://github.com%s' % commit['url']},
            'parents': commit['parents']}


def getlogs(endrev, startrev):
    if endrev != startrev:
        revisions = endrev.branch.revisions.filter(
@@ -40,53 +97,26 @@ def getlogs(endrev, startrev):
    project = m.group("project")

    logs = []
    last_rev_data = None
    revision_count = 0
    ancestor_found = False
    # TODO: get all revisions between endrev and startrev,
    #       not only those present in the Codespeed DB
    for revision in revisions:
        commit_url = 'http://github.com/api/v2/json/commits/show/%s/%s/%s' % (
            username, project, revision.commitid)

        commit_json = cache.get(commit_url)

        if commit_json is None:
            try:
                commit_json = json.load(urllib.urlopen(commit_url))
            except IOError, e:
                logging.exception("Unable to load %s: %s",
                                  commit_url, e, exc_info=True)
                raise e

            if 'error' in commit_json:
                # We'll still cache these for a brief period of time to avoid making too many requests:
                cache.set(commit_url, commit_json, 300)
            else:
                # We'll cache successes for a very long period of time since
                # SCM diffs shouldn't change:
                cache.set(commit_url, commit_json, 86400 * 30)

        if 'error' in commit_json:
            raise RuntimeError("Unable to load %s: %s" % (commit_url, commit_json['error']))

        commit = commit_json['commit']

        date = isodate.parse_datetime(commit['committed_date'])

        # Overwrite any existing data we might have for this revision since
        # we never want our records to be out of sync with the actual VCS:

        # We need to convert the timezone-aware date to a naive (i.e.
        # timezone-less) date in UTC to avoid killing MySQL:
        revision.date = date.astimezone(isodate.tzinfo.Utc()).replace(tzinfo=None)
        revision.author = commit['author']['name']
        revision.message = commit['message']
        revision.full_clean()
        revision.save()

        logs.append({'date': date, 'message': commit['message'],
                     'body': "",  # TODO: pretty-print diffs
                     'author': commit['author']['name'],
                     'author_email': commit['author']['email'],
                     'commitid': commit['id'],
                     'short_commit_id': commit['id'][0:7],
                     'links': {'Github': 'http://github.com%s' % commit['url']}})
    return logs
    for revision in revisions:
        last_rev_data = retrieve_revision(revision.commitid, username, project, revision)
        logs.append(last_rev_data)
        revision_count += 1
        ancestor_found = (startrev.commitid in [rev['id'] for rev in last_rev_data['parents']])

    # Simple approach to find the startrev: stop once it is found or after
    # GITHUB_REVISION_LIMIT revisions have been fetched
    while (revision_count < GITHUB_REVISION_LIMIT
           and not ancestor_found
           and len(last_rev_data['parents']) > 0):
        last_rev_data = retrieve_revision(last_rev_data['parents'][0]['id'], username, project)
        logs.append(last_rev_data)
        revision_count += 1
        ancestor_found = (startrev.commitid in [rev['id'] for rev in last_rev_data['parents']])

    return sorted(logs, key=lambda i: i['date'], reverse=True)
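
For reference, the fields read above suggest that the GitHub API v2 commit JSON
has roughly the following shape. This is an illustrative reconstruction inferred
from the code, not an authoritative API description; every value below is a
made-up placeholder:

    # Successful response: a 'commit' object with the fields used above.
    example_success = {
        'commit': {
            'id': 'full-40-character-sha',
            'parents': [{'id': 'parent-sha'}],
            'author': {'name': 'Author Name', 'email': 'author@example.org'},
            'message': 'commit message',
            'committed_date': '2011-06-22T12:00:00+02:00',
            'url': '/username/project/commit/full-40-character-sha',
        }
    }

    # Error response: cached for 300 seconds and turned into a RuntimeError.
    example_error = {'error': 'Not Found'}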
