#!/usr/bin/env python
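# Read a CSV file listing Who's On First records, fetch each record's
# GeoJSON document (from the local filesystem or over HTTP) and write them
# all out as a single GeoJSON FeatureCollection, optionally paged across
# multiple files and optionally with a trimmed ("slim") set of properties.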
import sys
import logging
import os.path
import csv
import pprint
import geojson
import json
import mapzen.whosonfirst.uri
import mapzen.whosonfirst.utils
import requests
import StringIO
import boto.s3.connection
import boto.s3.key
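# Note: this is a Python 2 era script (StringIO, boto2); the boto and
# pprint imports appear to be unused here.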
if __name__ == '__main__':

    import optparse
    opt_parser = optparse.OptionParser()

    opt_parser.add_option('-s', '--source', dest='source', action='store', default='fs', help='Where to read files from (default is "fs")')
    opt_parser.add_option('-p', '--prefix', dest='prefix', action='store', default=None, help='Prefix for paths in CSV file (optional)')
    opt_parser.add_option('-m', '--max', dest='max', action='store', default=None, help='The maximum number of records in a single GeoJSON file')
    opt_parser.add_option('-l', '--slim', dest='slim', action='store_true', default=False, help='Limit property export to a subset (roughly those in the CSV file) to reduce file size')
    opt_parser.add_option('-k', '--slim-keys', dest='keys', action='store', default=None, help='Ordered list of keys (properties) that are exported in slim mode')
    opt_parser.add_option('-e', '--slim-template', dest='keys_template', action='store', default=None, help='Trim key names to fit the Esri Shapefile 8-character length limit')
    opt_parser.add_option('-t', '--slim-placetype', dest='placetype', action='store', default=None, help='Add additional placetype-specific keys (neighbourhood, venue)')
    opt_parser.add_option('-f', '--slim-filter-belongs-to', dest='place_filter', action='store', default=None, help='Limit output to records whose wof:belongsto contains this WOF ID')
    opt_parser.add_option('-i', '--infer-path', dest='infer_path', action='store_true', default=False, help='Infer (relative) path from the wof:id column (takes precedence over any pre-existing path column)')
    opt_parser.add_option('-c', '--csv', dest='csv', action='store', default=None, help='CSV file to read')
    opt_parser.add_option('-o', '--out', dest='out', action='store', default=None, help='Where to write the feature collection (default is STDOUT)')
    opt_parser.add_option('-a', '--aliases', dest='aliases_path', action='store', default=None, help='Path to a property aliases JSON file for slim mode')
    opt_parser.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False, help='Be chatty (default is false)')

    options, args = opt_parser.parse_args()
    if options.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    first = True
    counter = 0	# number of records written to the current page so far
    page = 1

    max_records = 0

    if options.max:
        max_records = int(options.max)
        max_records = max(0, max_records)

    logging.debug("max records per geojson file: %s" % max_records)
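    # Slim mode: load the property aliases file, which maps short output
    # key names to fully-qualified WOF property names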
    if options.slim:

        # Default to a reasonable path, in the absence of an alias argument
        path = "/usr/local/mapzen/whosonfirst-properties/aliases/property_aliases.json"

        if options.aliases_path:
            path = options.aliases_path

        try:
            if not os.path.exists(path):
                logging.error("%s does not exist, but is required for slim mode" % path)
                sys.exit(1)

            fdata = open(path, 'r')
            property_aliases = json.load(fdata)
        except Exception as e:
            logging.error("could not load %s, because %s" % (path, e))
            sys.exit(1)

        # Add complex property aliases
        property_aliases['country_id'] = 'wof:hierarchy[0]["country_id"]'
        property_aliases['country'] = 'wof:hierarchy[0]["country_id"]'
        property_aliases['region_id'] = 'wof:hierarchy[0]["region_id"]'
        property_aliases['region'] = 'wof:hierarchy[0]["region_id"]'
        property_aliases['locality_id'] = 'wof:hierarchy[0]["locality_id"]'
        property_aliases['locality'] = 'wof:hierarchy[0]["locality_id"]'
        property_aliases['supersedes'] = 'wof:supersedes[0]'
        property_aliases['super'] = 'wof:supersedes[0]'
        property_aliases['superseded_by'] = 'wof:superseded_by[0]'
        property_aliases['super_by'] = 'wof:superseded_by[0]'
        property_aliases['classifiers_cat'] = 'sg:classifiers["category"]'
        property_aliases['classifiers_subcat'] = 'sg:classifiers["subcategory"]'
        property_aliases['classifiers_type'] = 'sg:classifiers["type"]'
        property_aliases['cl_cat'] = 'sg:classifiers["category"]'
        property_aliases['cl_subcat'] = 'sg:classifiers["subcategory"]'
        property_aliases['cl_type'] = 'sg:classifiers["type"]'
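        # Decide which keys get exported: an explicit --slim-keys list wins,
        # otherwise fall back to the requested --slim-template (or a default
        # set), plus any placetype-specific extras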
        slim_keys = []

        if options.keys:
            slim_keys = options.keys.split(',')
        else:
            if options.keys_template:
                if options.keys_template == 'shapefile':
                    slim_keys = ['name', 'id', 'placetype', 'parent', 'country', 'region', 'locality', 'wof_a3', 'lbl_lat', 'lbl_long']
                elif options.keys_template == 'external_editor':
                    slim_keys = ['name', 'id', 'placetype', 'parent_id', 'is_current', 'is_landuse_aoi', 'supersedes', 'superseded_by', 'deprecated', 'cessation', 'eng_preferred_name', 'eng_variant_name', 'max_zoom', 'min_zoom', 'controlled', 'geom_id', 'rev_lat', 'rev_long', 'lbl_latitude', 'lbl_longitude', 'is_funky', 'is_hard_boundary', 'is_official', 'tier_locality', 'country_id', 'placetype_alt', 'region_id', 'locality_id', 'wof_country', 'iso_country', 'mz_note', 'src_geom']
                elif options.keys_template == 'more':
                    slim_keys = ['name', 'id', 'placetype', 'parent_id', 'country_id', 'region_id', 'locality_id', 'wof_country', 'iso_country', 'lbl_latitude', 'lbl_longitude', 'inception', 'cessation', 'deprecated', 'supersedes', 'superseded_by', 'lastmodified', 'source']
            else:
                slim_keys = ['name', 'id', 'placetype', 'parent_id', 'country_id', 'region_id', 'locality_id', 'wof_country', 'lbl_latitude', 'lbl_longitude']

            if options.placetype == 'neighbourhood' and options.keys_template != 'external_editor':
                if options.keys_template == 'shapefile':
                    slim_keys.extend(['is_funky', 'is_hard', 'is_aoi', 'is_offic', 'max_zoom', 'min_zoom', 'tier_local'])
                else:
                    slim_keys.extend(['is_funky', 'is_hard_boundary', 'is_landuse_aoi', 'is_official', 'max_zoom', 'min_zoom', 'tier_locality'])

            if options.placetype == 'venue':
                if options.keys_template == 'shapefile':
                    slim_keys.extend(['address', 'city', 'cl_cat', 'cl_subcat', 'cl_type', 'owner', 'phone', 'postcode', 'province', 'tags'])
                else:
                    slim_keys.extend(['address', 'city', 'classifiers_cat', 'classifiers_subcat', 'classifiers_type', 'owner', 'phone', 'postcode', 'province', 'tags'])

    if options.slim:
        logging.debug("slim properties export mode")
        logging.debug("\t%s" % slim_keys)
    out = None

    path = os.path.abspath(options.csv)
    fh = open(path, 'r')

    if max_records:

        # If everything fits in a single file there is no need to page the
        # output (note: this count includes the CSV header row)
        lines = 0

        for ln in fh.readlines():
            lines += 1
            if lines == (max_records + 1):
                break

        if lines <= max_records:
            max_records = 0

        fh.seek(0)

    reader = csv.DictReader(fh)
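    # Main loop: resolve each CSV row to a path, fetch and validate the
    # GeoJSON document, and append it to the output feature collection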
    for row in reader:

        path = row.get('path', None)

        if options.infer_path:
            wof_id = row.get('wof:id', None)

            if wof_id is None:
                logging.error("missing wof:id column... how can --infer-path work... no, I mean really?")
                sys.exit(1)

            path = mapzen.whosonfirst.uri.id2relpath(wof_id)
        else:
            if not path or path == '':
                logging.error("missing path column, perhaps you want to run this script with --infer-path ?")
                sys.exit(1)

        if options.prefix:
            path = os.path.join(options.prefix, path)

        logging.debug("fetching %s, with %s" % (path, options.source))
        fdata = None

        if options.source == 's3':

            try:
                rsp = requests.get(path)
            except Exception as e:
                logging.error("failed to retrieve %s, because %s" % (path, e))
                continue

            code = rsp.status_code

            if code != 200:
                logging.error("%s returned an unexpected status code: %s" % (path, code))
                continue

            fdata = StringIO.StringIO()
            fdata.write(rsp.content)
            fdata.seek(0)
        else:
            if not os.path.exists(path):
                logging.error("%s does not exist, skipping" % path)
                continue

            fdata = open(path, 'r')

        # Make sure the record actually parses as GeoJSON before appending it
        try:
            geojson.load(fdata)
            fdata.seek(0)
        except Exception as e:
            logging.error("failed to parse %s, because %s" % (path, e))
            raise Exception("SAD FACE")
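        # Lazily open the output on the first record; when paging, rewrite
        # the filename as name-<page>.ext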
        if not out:

            if options.out:
                outpath = options.out
                outpath = os.path.abspath(outpath)

                if max_records:
                    root = os.path.dirname(outpath)
                    fname = os.path.basename(outpath)
                    fname, ext = os.path.splitext(fname)
                    ext = ext.lstrip(".")
                    fname = "%s-%s.%s" % (fname, page, ext)
                    outpath = os.path.join(root, fname)

                logging.debug("write geojson to %s" % outpath)
                out = open(outpath, 'w')
            else:
                out = sys.stdout

            # See this? Yeah... it makes you sad, doesn't it. That's okay.
            # Compare your sadness at this to your sadness at ogr2ogr FREAKING
            # OUT AND DYING on records with too many features in them triggering
            # the E_EXCESSIVE_GIGABYTE error and then just move on.
            out.write("""{"type": "FeatureCollection", "features": [""")
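        # Slim mode: rebuild the feature's properties from slim_keys, using
        # property_aliases to map them to fully-qualified WOF names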
        if options.slim:
            data = geojson.load(fdata)
            props = data['properties']

            if options.place_filter:
                place_filter = int(options.place_filter)

                if place_filter not in props["wof:belongsto"]:
                    continue

            if first:
                first = False
            else:
                # add a newline between records: easier to debug, larger file size
                out.write(",\r")

            fresh_props = {}

            for key in slim_keys:

                try:
                    # nested properties only get their first element, per the
                    # property_aliases logic above
                    if key in ('country_id', 'region_id', 'locality_id'):
                        fresh_props.update({key: props["wof:hierarchy"][0][key]})
                    elif key in ('country', 'region', 'locality'):
                        key = "%s_id" % key
                        fresh_props.update({key: props["wof:hierarchy"][0][key]})
                    elif key in ('classifiers_cat', 'cl_cat'):
                        fresh_props.update({key: props["sg:classifiers"][0]["category"]})
                    elif key in ('classifiers_subcat', 'cl_subcat'):
                        fresh_props.update({key: props["sg:classifiers"][0]["subcategory"]})
                    elif key in ('classifiers_type', 'cl_type'):
                        fresh_props.update({key: props["sg:classifiers"][0]["type"]})
                    else:
                        fresh_props.update({key: props[property_aliases[key]]})
                except Exception:
                    logging.debug("\tmissing key: %s for %s" % (key, property_aliases.get(key, key)))
                    fresh_props.update({key: ''})

            # note: fresh_props won't preserve the order of slim_keys, sadly
            if options.keys_template != "full":
                data['properties'] = fresh_props

            geojson.dump(data, out)
        else:
            if first:
                first = False
            else:
                out.write(",")

            out.write(fdata.read())

        counter += 1

        if max_records and counter == max_records:
            logging.debug("reached max limit for records")

            out.write("""]}""")
            out.close()

            out = None
            counter = 0
            first = True
            page += 1
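    # Close out the final (or only) feature collection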
    if out:
        out.write("""]}""")
        out.close()
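    # Example usage (hypothetical file and path names, assuming a local
    # whosonfirst-data checkout):
    #
    #     python wof-csv-to-feature-collection.py \
    #         --csv wof-neighbourhood-latest.csv \
    #         --prefix /usr/local/data/whosonfirst-data/data \
    #         --infer-path \
    #         --slim --slim-placetype neighbourhood \
    #         --out neighbourhoods.geojson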