Skip to content

Commit

Permalink
several fixes and improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
yaph committed May 18, 2012
1 parent a23bd3a commit ae691ae
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 16 deletions.
48 changes: 35 additions & 13 deletions commit_locations.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,36 @@
# -*- coding: utf-8 -*-
# This is by far the ugliest Python script I ever wrote
# without cities 14954 location strings remain unresolved
# with unique city names 7754 location strings remain unresolved
# with unique city names 6345 (number is not up to date) location strings remain unresolved
# with largest city name 4623 location strings remain unresolved

import csv, json, re
from geonamescache import GeonamesCache

commits_by_countries = {}
countries_by_locstr = {}
gc = GeonamesCache()
countries = gc.get_countries()
countries_by_names = gc.get_countries_by_names()
us_states = gc.get_us_states()
us_states_by_names = gc.get_us_states_by_names()

re_ignore = re.compile(r'[\.\(\)\d-]')
re_ws = re.compile(r'\w{2,}')
re_ws = re.compile(r'\s{2,}')


def test_locs(locs):
for loc in locs:
loc = loc.strip().lower().capitalize()
loc = loc.strip().lower()
loctitle = loc.title()
locupper = loc.upper()
if loc in countries_by_names:
return loc
elif locupper in us_states:
elif loctitle in countries_by_names:
return loctitle
elif 2 == len(loc) and locupper in us_states:
return 'United States'
elif locupper in us_states_by_names:
elif loc in us_states_by_names or loctitle in us_states_by_names:
return 'United States'
elif locupper in ['USA', 'US']:
return 'United States'
Expand All @@ -36,21 +41,33 @@ def test_locs(locs):
elif locupper in countries:
return countries[locupper]['name']
else:
cities = gc.get_cities_by_name(loc)
# only consider unique city names
if 1 == len(cities):
return countries[cities[0].values()[0]['countrycode']]['name']
for ll in [loc, loctitle]:
cities = gc.get_cities_by_name(ll)
# unique city names
lencities = len(cities)
if 1 == lencities:
return countries[cities[0].values()[0]['countrycode']]['name']
# assume the largest city
elif lencities > 1:
largestcity = sorted([(city['population'], city['countrycode']) for cdict in cities for gid, city in cdict.items()])[-1]
return countries[largestcity[-1]]['name']


def determine_country(locstr, langcnt):
    """Try to determine country from given location string.

    Every match of ``re_ignore`` in the string is replaced by a space and
    every match of ``re_ws`` by a single space. The cleaned string is then
    looked up in the ``countries_by_locstr`` cache; on a miss it is split on
    a series of separators and the parts are handed to ``test_locs`` until
    one split yields a country.

    Args:
        locstr: Raw location string.
        langcnt: Integer count belonging to this location. Only used when
            reporting a location that could not be resolved.

    Returns:
        The country name, or None if no country could be determined. In the
        latter case a line ``<raw location>, <count>`` is printed.
    """
    # Keep the untouched input for the "unresolved" report below.
    raw_locstr = locstr
    locstr = re.sub(re_ws, ' ', re.sub(re_ignore, ' ', locstr))

    # Successful lookups are cached under the cleaned string.
    if locstr in countries_by_locstr:
        return countries_by_locstr[locstr]

    # try different split chars
    for sc in [',', '/', '-', ' ', ':', '#', '->']:
        country = test_locs(locstr.split(sc))
        if country is not None:
            countries_by_locstr[locstr] = country
            return country

    # Report the raw argument rather than the caller's module-level `loc`,
    # so the function does not depend on a global being set.
    print('%s, %d' % (raw_locstr, langcnt))
    return None


Expand All @@ -59,17 +76,22 @@ def determine_country(locstr, langcnt):
headers = reader.next()
for record in reader:
loc, langcnt, repository_language = record
if loc.startswith('http://'): continue
langcnt = int(langcnt)
country = determine_country(loc, langcnt)
if country is not None:
if country not in commits_by_countries:
commits_by_countries[country] = {'commits': 0}
commits_by_countries[country]['commits'] += int(langcnt)
commits_by_countries[country]['commits'] += langcnt
fcsv.close()

# calc commit ratio per capita
for c in commits_by_countries:
if c not in countries_by_names:
print '### %s' % c
continue
popcnt = float(countries_by_names[c]['population'])
if popcnt > 0
if popcnt > 0:
by_capita = commits_by_countries[c]['commits'] / popcnt
by_100k = round(by_capita * 100000, 2)
else:
Expand Down
8 changes: 5 additions & 3 deletions unresolved_by_count.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
# -*- coding: utf-8 -*-
import re, operator
locations = {}
for line in open('unresovled_locations.txt','r'):
loc = line.strip().lower().capitalize()
for line in open('unresolved_locations.txt','r'):
loc, cnt = line.rsplit(',', 1)
cnt = int(cnt.strip())
loc = loc.strip().lower()
if '' == loc: continue
if loc not in locations:
locations[loc] = 0
locations[loc] += 1
locations[loc] += cnt

locsorted = sorted(locations.iteritems(), key=operator.itemgetter(1), reverse=True)
for loc, cnt in locsorted:
Expand Down

0 comments on commit ae691ae

Please sign in to comment.