several fixes and improvements

yaph · May 18, 2012 · ae691ae · ae691ae
1 parent a23bd3a
commit ae691ae
Show file tree

Hide file tree

Showing 2 changed files with 40 additions and 16 deletions.
diff --git a/commit_locations.py b/commit_locations.py
@@ -1,31 +1,36 @@
 # -*- coding: utf-8 -*-
 # This is by far the ugliest Python script I ever wrote
 # without cities 14954 location strings remain unresolved
-# with unique city names 7754 location string remain unresovled
+# with unique city names 6345 location strings remain unresolved (number is not up to date)
+# with largest city name 4623 location strings remain unresolved
 
 import csv, json, re
 from geonamescache import GeonamesCache
 
 commits_by_countries = {}
+countries_by_locstr = {}
 gc = GeonamesCache()
 countries = gc.get_countries()
 countries_by_names = gc.get_countries_by_names()
 us_states = gc.get_us_states()
 us_states_by_names = gc.get_us_states_by_names()
 
 re_ignore = re.compile(r'[\.\(\)\d-]')
-re_ws = re.compile(r'\w{2,}')
+re_ws = re.compile(r'\s{2,}')
 
 
 def test_locs(locs):
     for loc in locs:
-        loc = loc.strip().lower().capitalize()
+        loc = loc.strip().lower()
+        loctitle = loc.title()
         locupper = loc.upper()
         if loc in countries_by_names:
             return loc
-        elif locupper in us_states:
+        elif loctitle in countries_by_names:
+            return loctitle
+        elif 2 == len(loc) and locupper in us_states:
             return 'United States'
-        elif locupper in us_states_by_names:
+        elif loc in us_states_by_names or loctitle in us_states_by_names:
             return 'United States'
         elif locupper in ['USA', 'US']:
             return 'United States'
@@ -36,21 +41,33 @@ def test_locs(locs):
         elif locupper in countries:
             return countries[locupper]['name']
         else:
-            cities = gc.get_cities_by_name(loc)
-            # only consider unique city names
-            if 1 == len(cities):
-                return countries[cities[0].values()[0]['countrycode']]['name']
+            for ll in [loc, loctitle]:
+                cities = gc.get_cities_by_name(ll)
+                # unique city names
+                lencities = len(cities)
+                if 1 == lencities:
+                    return countries[cities[0].values()[0]['countrycode']]['name']
+                # assume the largest city
+                elif lencities > 1:
+                    largestcity = sorted([(city['population'], city['countrycode']) for cdict in cities for gid, city in cdict.items()])[-1]
+                    return countries[largestcity[-1]]['name']
 
 
 def determine_country(locstr, langcnt):
     """Try to determine country from given location string."""
 
     locstr = re.sub(re_ws, ' ', re.sub(re_ignore, ' ', locstr))
-    # try different split chars, 1st comma, 2nd slash, 3rd hyphen, last space
-    for sc in [',', '/', '-', ' ']:
+
+    if locstr in countries_by_locstr:
+        return countries_by_locstr[locstr]
+
+    # try different split chars
+    for sc in [',', '/', '-', ' ', ':', '#', '->']:
         country = test_locs(locstr.split(sc))
         if country is not None:
+            countries_by_locstr[locstr] = country
             return country
+
     print('%s, %d' % (loc, langcnt))
 
 
@@ -59,17 +76,22 @@ def determine_country(locstr, langcnt):
 headers = reader.next()
 for record in reader:
     loc, langcnt, repository_language = record
+    if loc.startswith('http://'): continue
+    langcnt = int(langcnt)
     country = determine_country(loc, langcnt)
     if country is not None:
         if country not in commits_by_countries:
             commits_by_countries[country] = {'commits': 0}
-        commits_by_countries[country]['commits'] += int(langcnt)
+        commits_by_countries[country]['commits'] += langcnt
 fcsv.close()
 
 # calc commit ratio per capita
 for c in commits_by_countries:
+    if c not in countries_by_names:
+        print '### %s' % c
+        continue
     popcnt = float(countries_by_names[c]['population'])
-    if popcnt > 0
+    if popcnt > 0:
         by_capita = commits_by_countries[c]['commits'] / popcnt
         by_100k = round(by_capita * 100000, 2)
     else:

diff --git a/unresolved_by_count.py b/unresolved_by_count.py
@@ -1,12 +1,14 @@
 # -*- coding: utf-8 -*-
 import re, operator
 locations = {}
-for line in open('unresovled_locations.txt','r'):
-    loc = line.strip().lower().capitalize()
+for line in open('unresolved_locations.txt','r'):
+    loc, cnt = line.rsplit(',', 1)
+    cnt = int(cnt.strip())
+    loc = loc.strip().lower()
     if '' == loc: continue
     if loc not in locations:
         locations[loc] = 0
-    locations[loc] += 1
+    locations[loc] += cnt
 
 locsorted = sorted(locations.iteritems(), key=operator.itemgetter(1), reverse=True)
 for loc, cnt in locsorted: