
address coordinates

1 parent b1ddba7 commit 27f46917cb893eaba9cce8e2b81733c8428e3d12 @timrdf committed Oct 5, 2012
Showing with 260 additions and 0 deletions.
  1. +106 −0 bin/secondary/cr-address-coordinates.py
  2. +154 −0 bin/secondary/cr-address-coordinates.sh
106 bin/secondary/cr-address-coordinates.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python
+#
+# Requires: http://pypi.python.org/pypi/googlemaps
+# easy_install http://pypi.python.org/packages/source/g/googlemaps/googlemaps-1.0.2.tar.gz
+#
+#3> <> prov:wasDerivedFrom <https://github.com/jimmccusker/twc-healthdata/tree/master/data/source/healthdata-tw-rpi-edu/address-coordinates/version>;
+#3> prov:wasAttributedTo <http://tw.rpi.edu/instances/JamesMcCusker>;
+#3> .
+#
+# Usage:
+#
+# 1) Retrieve the results, and store their provenance in a separate file:
+# cr-address-coordinates.py http://healthdata.tw.rpi.edu/sparql > b.ttl
+# cr-address-coordinates.py http://healthdata.tw.rpi.edu/sparql --prov b.ttl > b.ttl.prov.ttl
+#
+# 2) Retrieve the results, and embed their provenance within the same file:
+# cr-address-coordinates.py http://healthdata.tw.rpi.edu/sparql > c.ttl
+# cr-address-coordinates.py http://healthdata.tw.rpi.edu/sparql --prov | awk '{print "#3> "$0}' >> c.ttl
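+#
+# The Google Maps API key is read from the environment variable X_GOOGLE_MAPS_API_Key:
+#   export X_GOOGLE_MAPS_API_Key=<your-key>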
+
+from googlemaps import GoogleMaps, GoogleMapsError
+import csv, sys, urllib, os, datetime
+
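+# Select up to 100 vcard:Address resources that do not already have WGS84 latitude/longitude.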
+query = '''prefix vcard: <http://www.w3.org/2006/vcard/ns#>
+prefix wgs: <http://www.w3.org/2003/01/geo/wgs84_pos#>
+
+select distinct ?address ?streetAddress ?streetAddress2 ?locality ?region ?postalCode ?country
+where {
+  ?address a vcard:Address.
+  OPTIONAL { ?address vcard:street-address ?streetAddress }
+  OPTIONAL { ?address vcard:extended-address ?streetAddress2 }
+  OPTIONAL { ?address vcard:locality ?locality }
+  OPTIONAL { ?address vcard:region ?region }
+  OPTIONAL { ?address vcard:postal-code ?postalCode }
+  OPTIONAL { ?address vcard:country-name ?country }
+
+  OPTIONAL { ?address wgs:latitude ?lat; wgs:longitude ?long }
+  FILTER (!bound(?lat) && !bound(?long))
+} limit 100'''
+
+def retrieve(endpoint, api_key):
+    """Geocode each address returned by the SPARQL query, printing 'addressURI,lat,lng' CSV to stdout."""
+
+    gmaps = GoogleMaps(api_key)
+    url = endpoint + '?' + urllib.urlencode([("query",query)]) + '&format=text%2Fcsv'
+    header = None
+    print >> sys.stderr, url
+
+    for line in csv.reader(urllib.urlopen(url), delimiter=","):
+        if header is None:
+            header = line # Skip the CSV header row.
+            continue
+        addressURI = line[0]
+        # Concatenate the non-empty address components into a single geocodable string.
+        address = ", ".join([x for x in line[1:] if x != ""])
+        try:
+            lat, lng = gmaps.address_to_latlng(address)
+        except GoogleMapsError:
+            print >> sys.stderr, 'GoogleMapsError: ' + address
+            continue # Skip addresses that fail to geocode, rather than reusing a stale lat/lng.
+
+        print '{},{},{}'.format(addressURI, lat, lng)
+
+if __name__=='__main__':
+
+    USAGE = '''usage: cr-address-coordinates.py <endpoint> [--prov <output-file>]
+
+   endpoint             : URI of a SPARQL endpoint,
+                          e.g. http://healthdata.tw.rpi.edu/sparql
+
+   --prov <output-file> : Print provenance describing <output-file>, which was created by calling without --prov.
+'''
+
+    if len(sys.argv) not in [2,3,4] or sys.argv[1] == "--help":
+        sys.stderr.write(USAGE+'\n')
+        sys.exit(1)
+
+    endpoint = sys.argv[1] # e.g. http://healthdata.tw.rpi.edu/sparql
+
+    if len(sys.argv) == 2:
+
+        api_key = os.environ['X_GOOGLE_MAPS_API_Key'] # Required to call the Google Maps geocoder.
+        retrieve(endpoint, api_key)
+
+    elif len(sys.argv) > 2 and sys.argv[2] == '--prov':
+        # Emit PROV-O (Turtle) describing how the output file was generated.
+        print '''@prefix prov: <http://www.w3.org/ns/prov#> .
+@prefix foaf: <http://xmlns.com/foaf/0.1/> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+
+<{outputfile}>
+ prov:wasGeneratedBy [
+ a prov:Activity, <https://raw.github.com/jimmccusker/twc-healthdata/master/data/source/healthdata-tw-rpi-edu/address-coordinates/version/retrieve.py>;
+
+ prov:qualifiedAssociation [
+ a prov:Association;
+ prov:hadPlan <https://raw.github.com/jimmccusker/twc-healthdata/master/data/source/healthdata-tw-rpi-edu/address-coordinates/version/retrieve.py>;
+ ];
+ prov:used [
+ prov:value """{sparql}""";
+ ];
+ prov:used <http://maps.googleapis.com/maps/api/geocode/>;
+ prov:endedAtTime "{end}"^^xsd:dateTime;
+ ];
+.
+
+<https://raw.github.com/jimmccusker/twc-healthdata/master/data/source/healthdata-tw-rpi-edu/address-coordinates/version/retrieve.py> a prov:Plan;
+ foaf:homepage <https://github.com/jimmccusker/twc-healthdata/blob/master/data/source/healthdata-tw-rpi-edu/address-coordinates/version/retrieve.py> .
+'''.format(outputfile=sys.argv[3] if len(sys.argv) > 3 else '', sparql=query, end=datetime.datetime.now().isoformat())
154 bin/secondary/cr-address-coordinates.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+#
+# https://github.com/timrdf/csv2rdf4lod-automation/blob/master/bin/cr-publish-isdefinedby-to-endpoint.sh
+#
+# See also:
+# https://github.com/timrdf/csv2rdf4lod-automation/wiki/Aggregating-subsets-of-converted-datasets
+#
+# Environment variables used:
+#
+# (see https://github.com/timrdf/csv2rdf4lod-automation/wiki/CSV2RDF4LOD-environment-variables)
+#
+# Usage:
+#
+#    cr-address-coordinates.sh [--target] [-n] --clear-graph <named_graph_URI | cr:auto | .>
+#
+# Example usage:
+#
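+#    cr-address-coordinates.sh --target
+#    cr-address-coordinates.sh -n --clear-graph cr:auto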
+
+see="https://github.com/timrdf/csv2rdf4lod-automation/wiki/CSV2RDF4LOD-not-set"
+CSV2RDF4LOD_HOME=${CSV2RDF4LOD_HOME:?"not set; source csv2rdf4lod/source-me.sh or see $see"}
+
+# cr:data-root cr:source cr:directory-of-datasets cr:dataset cr:directory-of-versions cr:conversion-cockpit
+ACCEPTABLE_PWDs="cr:data-root cr:source cr:dataset cr:directory-of-versions"
+if [ `${CSV2RDF4LOD_HOME}/bin/util/is-pwd-a.sh $ACCEPTABLE_PWDs` != "yes" ]; then
+   ${CSV2RDF4LOD_HOME}/bin/util/pwd-not-a.sh $ACCEPTABLE_PWDs
+   exit 1
+fi
+
+TEMP="_"`basename $0``date +%s`_$$.tmp
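+# Scratch file name unique to this invocation (not currently used below).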
+
+sourceID=$CSV2RDF4LOD_PUBLISH_OUR_SOURCE_ID
+datasetID=`basename $0 | sed 's/.sh$//'`
+versionID=`date +%Y-%b-%d`
+
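+# The target named graph combines this script's name with today's date, e.g.
+# $CSV2RDF4LOD_BASE_URI/source/$sourceID/dataset/cr-address-coordinates/version/2012-Oct-05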
+graphName=${CSV2RDF4LOD_BASE_URI_OVERRIDE:-$CSV2RDF4LOD_BASE_URI}/source/$sourceID/dataset/$datasetID/version/$versionID
+
+if [[ $# -lt 1 || "$1" == "--help" ]]; then
+ echo "usage: `basename $0` [--target] [-n] --clear-graph <named_graph_URI | cr:auto | .>"
+ echo ""
+ echo " Query CSV2RDF4LOD_PUBLISH_SPARQL_ENDPOINT for all classes and predicates used,"
+ echo " assert rdfs:isDefinedBy to its namespace and prov:wasAttributedTo to its domain."
+ echo " load it into a virtuoso sparql endpoint."
+ echo ""
+ echo " --target : return the name of graph that will be loaded; then quit."
+ echo " -n : perform dry run only; do not load named graph."
+ echo " --clear-graph : clear the named graph."
+ echo
+ echo " named_graph_URI : use graph name given"
+ echo " cr:auto : use named graph $graphName"
+ echo " . : print to stdout"
+ exit 1
+fi
+
+if [ "$1" == "--target" ]; then
+   # a conversion:VersionedDataset:
+   # e.g. http://purl.org/twc/health/source/tw-rpi-edu/dataset/cr-publish-tic-to-endpoint/version/2012-Sep-07
+   echo $graphName
+   exit 0
+fi
+
+dryRun="false"
+if [ "$1" == "-n" ]; then
+ dryRun="true"
+ dryrun.sh $dryrun beginning
+ shift
+fi
+
+clearGraph="false"
+if [ "$1" == "--clear-graph" ]; then
+ clearGraph="true"
+ shift
+fi
+
+if [ "$1" != "cr:auto" ]; then
+ graphName="$1"
+ shift
+fi
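+# At this point, graphName is the user-supplied URI, ".", or the cr:auto default computed above.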
+
+if [[ `is-pwd-a.sh cr:directory-of-versions` == "yes" ]]; then
+
+ endpoint="$CSV2RDF4LOD_PUBLISH_SPARQL_ENDPOINT"
+ if [ ${#endpoint} -eq 0 ]; then
+ endpoint="$CSV2RDF4LOD_PUBLISH_VIRTUOSO_SPARQL_ENDPOINT"
+ fi
+ if [ ${#endpoint} -eq 0 ]; then
+ echo "ERROR: no endpoint defined. Define CSV2RDF4LOD_PUBLISH_SPARQL_ENDPOINT"
+ exit 1
+ fi
+
+ cockpit="$versionID"
+ if [ ! -d $cockpit/source ]; then
+ mkdir -p $cockpit/automatic
+ fi
+ rm -rf $cockpit/source/*
+
+   echo python $CSV2RDF4LOD_HOME/bin/secondary/cr-address-coordinates.py $endpoint
+   if [ "$dryRun" != "true" ]; then
+      python $CSV2RDF4LOD_HOME/bin/secondary/cr-address-coordinates.py $endpoint > $cockpit/automatic/address-coordinates.csv
+      pushd $cockpit &> /dev/null
+         aggregate-source-rdf.sh --link-as-latest automatic/*
+         # ^^ publishes if CSV2RDF4LOD_PUBLISH_VIRTUOSO
+      popd &> /dev/null
+   fi
+
+ if [ "$clearGraph" == "true" ]; then
+ echo ""
+ echo "Deleting $graphName" >&2
+ if [ "$dryRun" != "true" ]; then
+ publish/bin/virtuoso-delete-$sourceID-$datasetID-$versionID.sh
+ fi
+ fi
+
+ if [ "$dryRun" != "true" ]; then
+ pushd $cockpit &> /dev/null
+ publish/bin/virtuoso-load-$sourceID-$datasetID-$versionID.sh
+ popd &> /dev/null
+ fi
+
+ # if [ "$CSV2RDF4LOD_PUBLISH_COMPRESS" == "true" ]; then
+ # fi
+
+ dryrun.sh $dryrun ending
+elif [[ `is-pwd-a.sh cr:dataset` == "yes" ]]; then
+   if [ ! -e version ]; then
+      mkdir version # See https://github.com/timrdf/csv2rdf4lod-automation/wiki/Directory-Conventions
+   fi
+   pushd version &> /dev/null
+      $0 $* # Recursive call
+   popd &> /dev/null
+elif [[ `is-pwd-a.sh cr:source` == "yes" ]]; then
+   if [ -d dataset ]; then
+      # This would conform to the directory structure if
+      # we had included 'dataset' in the convention.
+      # This is here in case we ever fully support it.
+      pushd dataset > /dev/null
+         $0 $* # Recursive call
+      popd > /dev/null
+   else
+      # Handle the original (3-year-old) directory structure
+      # that does not include 'dataset' as a directory.
+      datasetID=`basename $0`
+      if [ ! -d ${datasetID%.*} ]; then
+         mkdir ${datasetID%.*}
+      fi
+      pushd ${datasetID%.*} > /dev/null
+         $0 $* # Recursive call
+      popd > /dev/null
+   fi
+elif [[ `is-pwd-a.sh cr:data-root` == "yes" ]]; then
+   see="https://github.com/timrdf/csv2rdf4lod-automation/wiki/Aggregating-subsets-of-converted-datasets"
+   CSV2RDF4LOD_PUBLISH_OUR_SOURCE_ID=${CSV2RDF4LOD_PUBLISH_OUR_SOURCE_ID:?"not set; see $see"}
+
+   pushd $CSV2RDF4LOD_PUBLISH_OUR_SOURCE_ID > /dev/null
+      $0 $* # Recursive call
+   popd > /dev/null
+fi
