From 307b728e15755fe7ca4d87e23617a4b5728a9688 Mon Sep 17 00:00:00 2001 From: Scott Shawcroft Date: Mon, 13 May 2013 23:34:31 -0700 Subject: [PATCH] Lots of changes: * change group quality to get package list from the db. * Update http_open_dir to support s3 buckets (chromium). * Tweak SF Api use to get newest packages first. * Limit package description in search. * verify with bing webmaster tools --- crawl/scripts/group_quality.py | 9 ++--- crawl/upstream/explore.py | 1 - crawl/upstream/sf.py | 2 +- crawl/utils/helper.py | 70 +++++++++++++++++++++------------- site/oswatershed/views.py | 2 + site/templates/base.html | 1 + 6 files changed, 52 insertions(+), 33 deletions(-) diff --git a/crawl/scripts/group_quality.py b/crawl/scripts/group_quality.py index 217a361..3eb81d7 100644 --- a/crawl/scripts/group_quality.py +++ b/crawl/scripts/group_quality.py @@ -4,17 +4,16 @@ sys.path.append(os.getcwd()) from utils import history +from utils.db import groups if len(sys.argv)<2: - print sys.argv[0],"" + print sys.argv[0],"" if len(sys.argv)>2: num_bugs = int(sys.argv[2]) else: num_bugs = None -f = open(sys.argv[1]) - total = 0 fake_upstream = 0 missing_distro = 0 @@ -23,7 +22,7 @@ BUGS = map(lambda x: [], [0] * 10) -for pkg in f: +for pkg in groups.get_group(sys.argv[1]): total += 1 pkg = pkg.strip() hist = history.PackageHistory(pkg) @@ -44,7 +43,7 @@ if num_bugs == None or num_bugs == bugs: print pkg, ups, missing - BUGS[bugs].append(pkg) + #BUGS[bugs].append(pkg) print print fake_upstream,"/",total,"with approx upstream" print missing_distro,"/",total,"missing from a distro" diff --git a/crawl/upstream/explore.py b/crawl/upstream/explore.py index 3da5068..c54e402 100644 --- a/crawl/upstream/explore.py +++ b/crawl/upstream/explore.py @@ -18,7 +18,6 @@ def contains(s, parts): return False def explore(url, depth, good, bad, fn_remove, badv, dead, last_crawl=None): - print url, last_crawl pkgs = [] info = helper.open_dir(url) diff --git a/crawl/upstream/sf.py 
b/crawl/upstream/sf.py index ae8768d..eeb13fa 100644 --- a/crawl/upstream/sf.py +++ b/crawl/upstream/sf.py @@ -27,7 +27,7 @@ def get_files(project_id, paths=["/"], last_crawl=None): for path in paths: fn = "files/sourceforge/%d-%s-%d.rss"%(time.time(),project_id,i) try: - ret = helper.open_url("http://sourceforge.net/api/file/index/project-id/%s/rss?path=%s"%(project_id,path),fn) + ret = helper.open_url("http://sourceforge.net/api/file/index/project-id/%s/mtime/desc/limit/%d/rss?path=%s"%(project_id,limit,path),fn) except httplib.BadStatusLine: print "ERROR bad status" return [] diff --git a/crawl/utils/helper.py b/crawl/utils/helper.py index ecbc6a7..8dedc5b 100644 --- a/crawl/utils/helper.py +++ b/crawl/utils/helper.py @@ -67,7 +67,6 @@ def http_open_url(url, filename, last_crawl=None): request.add_header('If-Modified-Since', last_crawl) opener = urllib2.build_opener(DefaultErrorHandler()) datastream = opener.open(request) - if datastream.status == 404: print datastream.status,#url, return None @@ -89,33 +88,52 @@ def open_url(url, filename, last_crawl=None): return http_open_url(url, filename, last_crawl) elif url.startswith("ftp://"): return ftp_open_url(url, filename, last_crawl) + else: + print "unknown protocol:", url def http_open_dir(url): - filename = "".join(("files/helper/", str(time.time()), "-", url.rsplit("/",1)[1])) - if open_url(url, filename)==None: - return None - - pattern = '()?(<(img|IMG) [^>]*(ALT|alt)="(?P[^"]*)"[^>]*>)?( |)?<(A|a)[^>]*>(?P[^<]*) *()?(?P.* [0-9][0-9]:[0-9][0-9]).*' - pattern = re.compile(pattern) - - f = open(filename) + patterns = ['()?(<(a|A)[^>]*>)?(<(img|IMG) [^>]*(ALT|alt)="(?P[^"]*)"[^>]*>)?()?( |)?<(A|a)[^>]*>(?P[^<]*) *()?(?P[^<>]* [0-9][0-9]:[0-9][0-9])', + '(?P[^<]*)(?P.* [0-9][0-9]:[0-9][0-9](:[0-9][0-9])?).*(?P.*)', + ' ]*>(?P[^<]*)(?P.* [0-9][0-9]:[0-9][0-9])', + '(?P[^<]+)', + '(?P[^<]*).*?(?P[0-9]{4}-[01][0-9]-[0-3][0-9]T[0-2][0-9]:[0-6][0-9]:[0-6][0-9])\.[0-9]*Z'] + patterns = [re.compile(x) for x in 
patterns] + original_url = url files = [] - for line in f: - match = pattern.match(line) - if match: - d = match.groupdict() - is_dir = d["dir"]=="[DIR]" or (d["dir"]==None and d["name"][-1]=="/") - release_time = None - try: - release_time = datetime.datetime.strptime(d["modified"],"%d-%b-%Y %H:%M") - except: - try: - release_time = datetime.datetime.strptime(d["modified"],"%Y-%m-%d %H:%M") - except: - print "unsupported date format:", d["modified"] - if release_time: - files.append((is_dir,d["name"],release_time)) - f.close() + while url: + filename = "".join(("files/helper/", str(time.time()), "-", url.rsplit("/",1)[1])) + if open_url(url, filename)==None: + return None + url = None + + f = open(filename) + s = f.read() + for p in patterns: + num_matches = 0 + for match in p.finditer(s): + d = match.groupdict() + if "marker" in d: + url = original_url + "?marker=" + d["marker"] + continue + is_dir = ("dir" in d and (d["dir"]=="[DIR]" or d["dir"] == "Directory" or (d["dir"]==None and d["name"][-1]=="/"))) or d["name"][-1]=="/" + # 2012-06-08T13:35:39.149Z + date_formats = ["%d-%b-%Y %H:%M", "%Y-%m-%d %H:%M", "%Y-%b-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S"] + release_time = None + for date_format in date_formats: + try: + release_time = datetime.datetime.strptime(d["modified"], date_format) + break + except: + pass + if release_time: + files.append((is_dir,d["name"],release_time)) + else: + print "unsupported date format:", d["modified"] + num_matches += 1 + if num_matches > 0: + #print "matched pattern:", p.pattern + break + f.close() return files def ftp_open_dir(url): @@ -154,7 +172,7 @@ def open_dir(url): except urllib2.URLError: print "bad http",url except Exception, e: - print e + print "exception", e return [] def find_match(s, res): diff --git a/site/oswatershed/views.py b/site/oswatershed/views.py index 63cbae6..e5b0c53 100644 --- a/site/oswatershed/views.py +++ b/site/oswatershed/views.py @@ -104,6 +104,8 @@ def search(request, search): for name,description in 
search.results: if len(name)>35: name = name[:32]+"..." + if description != None and len(description)>64: + description = description[:64] + "..." line = [name, description] results.append(line) return render_to_response('search.html', diff --git a/site/templates/base.html b/site/templates/base.html index 4bb5cbe..1b08ac6 100644 --- a/site/templates/base.html +++ b/site/templates/base.html @@ -14,6 +14,7 @@ } +