# ---- filtermodules/geocoords.py (patch metadata: new file mode 100644, index 0000000..591aed7) ----
#!/usr/bin/python
# -*- coding:utf-8 -*-
import time
from tlgflaws import *
from utils import *


##
class FGeotags(FlawFilter):
    """Filter that reports pages which have a row in the geo_tags table.

    Each result's filter title is the tag's coordinates as 'lat,lon'.
    """
    shortname= 'Geotags'
    label= _('Geotags')
    description= _('Get pages with Geotags')

    # our action class
    class Action(TlgAction):
        def execute(self, resultQueue):
            """Look up geotags for self.pageIDs and queue one result per joined row.

            Only pages in namespace 0 or 6 are considered. Nothing is queued
            when self.pageIDs is empty.
            """
            # An empty ID list would render "AND ()", which is not valid SQL.
            if not self.pageIDs:
                return

            cur= getCursors()[self.wiki]
            # Only the placeholder list is interpolated into the statement;
            # the IDs themselves are passed to the driver as parameters.
            format_strings= ' OR '.join(['page.page_id=%s'] * len(self.pageIDs))
            cur.execute("""SELECT page.page_id, page.page_namespace, page.page_title, page.page_restrictions, page.page_counter,
page.page_is_new, page.page_random, page.page_touched, page.page_latest, page.page_len,
geo_tags.gt_lat, geo_tags.gt_lon
FROM page
JOIN geo_tags ON geo_tags.gt_page_id=page.page_id
WHERE (page.page_namespace=0 OR page.page_namespace=6) AND (%s)""" % format_strings, self.pageIDs)
            res= cur.fetchall()

            for row in res:
                # Two placeholders, two values. Passing more values than
                # placeholders raises "TypeError: not all arguments converted".
                filtertitle= '%s,%s' % (row['gt_lat'], row['gt_lon'])
                resultQueue.put(TlgResult(self.wiki, row, self.parent, filtertitle))

    def getPreferredPagesPerAction(self):
        """Return the number of pages to hand to a single Action (50)."""
        return 50

    def createActions(self, language, pages, actionQueue):
        """Queue one Action covering the given pages."""
        actionQueue.put(self.Action(self, language, pages))

FlawFilters.register(FGeotags)


# ---- filtermodules/pagehits.py (patch metadata: new file mode 100644, index 0000000..87eb0d6) ----
#!/usr/bin/python
# -*- coding:utf-8 -*-
import time
import datetime
import requests
from tlgflaws import *
from utils import *

# snippet from http://stackoverflow.com/questions/1265665/python-check-if-a-string-represents-an-int-without-using-try-except ...
def isInt_str(v):
    """Return True if str(v), stripped, looks like an integer-valued number.

    Leading '+'/'-' characters and a trailing '.0'-style fraction are
    tolerated (e.g. '1.0' is accepted), so a True result does not guarantee
    that int(v) succeeds.
    """
    v = str(v).strip()
    return v=='0' or (v if v.find('..') > -1 else v.lstrip('-+').rstrip('0').rstrip('.')).isdigit()


def _previousMonth(today):
    """Return (year, month) of the calendar month preceding the date `today`."""
    # Stepping one day back from the 1st always lands in the previous month,
    # and handles the January -> December year rollover.
    lastDay= today.replace(day=1) - datetime.timedelta(days=1)
    return lastDay.year, lastDay.month


##
class FPagehits(FlawFilter):
    """Filter that annotates pages with their hit count for the previous month."""
    shortname= 'Pagehits'
    label= _('Page Hits')
    description= _('Sort articles by hit count. Uses data from stats.grok.se from previous month.')

    @staticmethod
    def makeGrokSession():
        """Create a requests session carrying an identifying User-Agent."""
        grokSession= requests.Session()
        # if they ever need to complain about our requests, they'll know where to look:
        grokSession.headers.update({ 'User-Agent': 'Article List Generator (http://tools.wmflabs.org/render/stools/alg)'})
        return grokSession

    @cache_region('disklongterm')
    def getHitcount(self, year, month, title):
        """Return the summed daily views of `title` for the given year and month.

        Best effort: returns the exception text if the request or parsing
        fails, and '?' if the server answers with a status other than 200.
        """
        # NOTE(review): the language segment of the URL is hard-coded to 'de',
        # and `title` is inserted without URL quoting -- confirm both are intended.
        try:
            # the requests library is totally thread-safe, except when it isn't.
            # so we need to work around by creating a separate session for each worker thread
            session= CachedThreadValue("grokSession", self.makeGrokSession)
            res= session.get('http://stats.grok.se/json/de/%s%02d/%s' % (year, int(month), title))
            if res.status_code==200:
                data= res.json()
                return sum(int(views) for views in data['daily_views'].values())
        except Exception as ex:
            # deliberate best effort: report the failure text instead of a count
            return str(ex)
        return '?'

    # our action class
    class Action(TlgAction):
        def execute(self, resultQueue):
            """Queue one result per non-redirect page, titled with its hit count.

            Only pages in namespace 0 or 6 are considered. Nothing is queued
            when self.pageIDs is empty.
            """
            # An empty ID list would render "AND ()", which is not valid SQL.
            if not self.pageIDs:
                return

            cur= getCursors()[self.wiki]
            # Only the placeholder list is interpolated into the statement;
            # the IDs themselves are passed to the driver as parameters.
            format_strings= ' OR '.join(['page_id=%s'] * len(self.pageIDs))
            cur.execute("""SELECT page_id, page_namespace, page_title, page_restrictions, page_counter, page_is_redirect,
page_is_new, page_random, page_touched, page_latest, page_len
FROM page WHERE (page_namespace=0 OR page_namespace=6) AND page_is_redirect=0 AND (%s)""" % format_strings, self.pageIDs)
            res= cur.fetchall()

            # The statistics are documented as "from previous month". Querying
            # the current month would fetch (and cache) an incomplete total.
            statyear, statmonth= _previousMonth(datetime.date.today())

            for row in res:
                count= self.parent.getHitcount(statyear, statmonth, row['page_title'])
                filtertitle= 'count: %s' % count
                # Higher counts get a smaller (more negative) key; anything that
                # is not a usable count gets the positive key 1.
                try:
                    sortkey= -int(count) if isInt_str(count) else 1
                except ValueError:
                    # isInt_str also accepts forms like '1.0' that int() rejects
                    sortkey= 1
                resultQueue.put(TlgResult(self.wiki, row, self.parent, filtertitle, sortkey= sortkey))

    def getPreferredPagesPerAction(self):
        """Return the number of pages to hand to a single Action (50)."""
        return 50

    def createActions(self, language, pages, actionQueue):
        """Queue one Action covering the given pages."""
        actionQueue.put(self.Action(self, language, pages))

FlawFilters.register(FPagehits)


# ---- geobbox.py (patch metadata: new file mode 100644, index 0000000..ec529c8) ----
# code to create a bounding box from a center geocoord and a 'radius' (half-side of square) in km.
# this is copypasta from SO question at http://stackoverflow.com/questions/3182260/python-geocode-filtering-by-distance
# answered by http://stackoverflow.com/users/84270/john-machin

from math import sin, cos, asin, sqrt, degrees, radians, pi

Earth_radius_km = 6371.0
RADIUS = Earth_radius_km


def haversine(angle_radians):
    """Return the haversine, sin(angle / 2) ** 2, of an angle in radians."""
    return sin(angle_radians / 2.0) ** 2


def inverse_haversine(h):
    """Return the angle in radians whose haversine is h.

    h is clamped to [0, 1] first: floating-point rounding can push the
    haversine of near-antipodal points marginally above 1, which would
    otherwise make asin() raise ValueError.
    """
    h = min(1.0, max(0.0, h))
    return 2 * asin(sqrt(h))  # radians


def distance_between_points(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points, in units of RADIUS.

    All arguments are in degrees.
    WARNING: loss of absolute precision when points are near-antipodal.
    """
    lat1 = radians(lat1)
    lat2 = radians(lat2)
    dlat = lat2 - lat1
    dlon = radians(lon2 - lon1)
    h = haversine(dlat) + cos(lat1) * cos(lat2) * haversine(dlon)
    return RADIUS * inverse_haversine(h)


def bounding_box(lat, lon, distance):
    """Return (dlat, dlon) half-widths in degrees of a box around (lat, lon).

    No point outside lat +/- dlat or outside lon +/- dlon is within
    "distance" of the (lat, lon) point. Input lat/lon are in degrees;
    distance must be in the same units as RADIUS.

    Derived from: http://janmatuschek.de/LatitudeLongitudeBoundingCoordinates

    If the North or South Pole lies inside (or on) the circle of interest,
    every meridian intersects it, so dlon is returned as 180.0.

    WARNING: lon +/- dlon may cross the +/-180 degree meridian; wrapping
    that range is left to the caller (see the quoted article).
    Note: the result is independent of the longitude of the central point,
    so the "lon" arg is not used.
    """
    dlat = distance / RADIUS
    lat_rad = radians(lat)
    if abs(lat_rad) + dlat >= pi / 2.0:
        # Pole inside the circle: sin(dlat) / cos(lat) would be >= 1
        # (outside asin's domain), and all longitudes are in range anyway.
        return degrees(dlat), 180.0
    # min() guards against the ratio rounding to just above 1 at the boundary.
    dlon = asin(min(1.0, sin(dlat) / cos(lat_rad)))
    return degrees(dlat), degrees(dlon)


if __name__ == "__main__":

    # Examples from Jan Matuschek's article.
    # Every print() call takes a single pre-formatted string so the output is
    # the same under Python 2 and Python 3.

    def show_bounding_box(lat, lon, dist):
        print("test bounding box %s %s %s" % (lat, lon, dist))
        dlat, dlon = bounding_box(lat, lon, dist)
        print("dlat, dlon degrees %s %s" % (dlat, dlon))
        print("lat min/max rads %s" % ([radians(d) for d in (lat - dlat, lat + dlat)],))
        print("lon min/max rads %s" % ([radians(d) for d in (lon - dlon, lon + dlon)],))

    print("liberty to eiffel")
    print(distance_between_points(40.6892, -74.0444, 48.8583, 2.2945))  # about 5837 km
    print("")
    print("calc min/max lat/lon")
    degs = [degrees(r) for r in (1.3963, -0.6981)]
    show_bounding_box(*degs, dist=1000)
    print("")
    degs = [degrees(r) for r in (1.3963, -0.6981, 1.4618, -1.6021)]
    print("%s distance %s" % (degs, distance_between_points(*degs)))  # 872 km