In [1]:
""" 
1. References:
http://linkedgeodata.org/OnlineAccess/SparqlEndpoints
http://linkedgeodata.org/OSM
http://linkedgeodata.org/sparql


2. Use case-sensitive exact matching to keep the query time manageable.
'BLACK MOUNTAIN' and 'black mountain' return no results;
only "Black Mountain" returns results.

note:
1) case-insensitive, partial matching query
  FILTER regex (?o, 'black mountain' , "i")

2) case-insensitive, exact-matching query
  FILTER (lcase(str(?o)) = \"%s\")
"""

# case-sensitive exact matching
# get all uri having exact names
from SPARQLWrapper import SPARQLWrapper, JSON

def sparql_query (name):
    """Return the LinkedGeoData resources whose name is exactly ``name``.

    The query is sent to http://linkedgeodata.org/sparql. A resource matches
    when its own ``rdfs:label`` equals ``name``, or when it is linked through
    ``owl:sameAs`` to a resource whose ``gn:name`` equals ``name``. The
    comparison is an exact, case-sensitive literal match.

    Parameters
    ----------
    name : str
        Name to look up. It is escaped before being placed in the query, so
        names containing quotes, backslashes or line breaks are matched
        literally instead of corrupting the query.

    Returns
    -------
    The endpoint's JSON response as converted by SPARQLWrapper. Matches are
    listed under ``["results"]["bindings"]``; every binding has an ``s`` entry
    and bindings from the ``owl:sameAs`` branch also have a ``geo`` entry.

    Errors raised by SPARQLWrapper while querying or converting are not
    caught here and propagate to the caller.
    """
    # Escape the characters that would otherwise terminate or alter a
    # single-quoted SPARQL string literal. Backslash must be handled first so
    # the backslashes added by the later replacements are not doubled.
    literal = (
        str(name)
        .replace("\\", "\\\\")
        .replace("'", "\\'")
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )

    sparql = SPARQLWrapper("http://linkedgeodata.org/sparql")

    # rdfs is declared explicitly so the query does not depend on prefixes the
    # endpoint happens to predefine; owl is declared once only.
    sparql.setQuery("""

Prefix lgdo: <http://linkedgeodata.org/ontology/>
Prefix geom: <http://geovocab.org/geometry#>
Prefix ogc: <http://www.opengis.net/ont/geosparql#>
Prefix owl: <http://www.w3.org/2002/07/owl#>
Prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
Prefix wgs84_pos: <http://www.w3.org/2003/01/geo/wgs84_pos#>
Prefix gn: <http://www.geonames.org/ontology#>

Select * {
    {?s rdfs:label '%(literal)s'} UNION 
    {?s owl:sameAs ?geo .
    ?geo gn:name '%(literal)s' }    
} 
    """ % {"literal": literal})

    sparql.setReturnFormat(JSON)

    return sparql.query().convert()

In [2]:
# Example: fetch every resource whose rdfs:label, or the gn:name of its
# owl:sameAs counterpart, is exactly 'El Centro'. This sends a live request
# to the LinkedGeoData endpoint.
sparql_query ('El Centro')

{'head': {'link': [], 'vars': ['s', 'geo']},
 'results': {'distinct': False,
  'ordered': True,
  'bindings': [{'s': {'type': 'uri',
     'value': 'http://linkedgeodata.org/triplify/node2623608640'}},
   {'s': {'type': 'uri',
     'value': 'http://linkedgeodata.org/triplify/node3802571884'}},
   {'s': {'type': 'uri',
     'value': 'http://linkedgeodata.org/triplify/way71745414'}},
   {'s': {'type': 'uri',
     'value': 'http://linkedgeodata.org/triplify/node1243747194'}},
   {'s': {'type': 'uri',
     'value': 'http://linkedgeodata.org/triplify/node2464414044'}},
   {'s': {'type': 'uri',
     'value': 'http://linkedgeodata.org/triplify/node2682445513'}},
   {'s': {'type': 'uri',
     'value': 'http://linkedgeodata.org/triplify/node3732231293'}},
   {'s': {'type': 'uri',
     'value': 'http://linkedgeodata.org/triplify/node676126734'}},
   {'s': {'type': 'uri',
     'value': 'http://linkedgeodata.org/triplify/node703341627'}},
   {'s': {'type': 'uri',
     'value': 'http://linkedgeodata

In [3]:
"""
map_phrase_USGS(GT).json: ground truth phrases on USGS maps
-> output of extractPhrase_USGS(GT).ipynb in "makeRDF" folder
""" 
import json
# Read the ground-truth phrases. The encoding is stated explicitly so the
# load does not depend on the platform's default locale encoding.
with open('map_phrase_USGS(GT).json', 'r', encoding='utf-8') as f:
    phrase_dic = json.load(f)

In [4]:
# Title-case every word of every phrase (e.g. "black mountain" becomes
# "Black Mountain"), because the SPARQL lookup is an exact, case-sensitive match.
new_dic = {
    map_name: [
        " ".join(word.capitalize() for word in raw_phrase.split())
        for raw_phrase in raw_phrases
    ]
    for map_name, raw_phrases in phrase_dic.items()
}

In [5]:
# Display the capitalised phrases, keyed by map image file name.
new_dic

{'USGS-15-CA-brawley-e1957-s1957-p1961.jpg': ['Fairgrounds',
  'Lilac Drain',
  'El Centro',
  'Southern Pacific',
  'Cotton Gin',
  'Sunbeam Recreation Area',
  'Rio Vista Ave',
  '10th St',
  'Sumac Canal',
  'Lateral Drain',
  'Bm 107',
  'Mc Call Drain One',
  'United States Department Of The Interior Geological Survey',
  'Bryant Canal',
  'Sumac Lateral',
  'Rubber Drain',
  'Mound',
  'Tokay Canal',
  'Fig Canal',
  'Westside Main Sandal',
  'Dahlia Canal',
  'Central Drain',
  'Landing Field',
  'Thorn One Canal',
  'Sunbeam Lake',
  'Flax Canal',
  'Reid Sch',
  'Thorn Canal',
  'Wt',
  'Radio Facility',
  'Edgar',
  'Palm Ave',
  'Purification Plant',
  'Pearson Drain',
  'Witter Sch',
  'Imperial',
  'Pioneer Memorial Hospital',
  'Bm 115',
  'Hotville',
  'Best Canal',
  'Rose Drain',
  'Tuberose Canal',
  'Imperial Cem',
  'Bm 111',
  'Labor Camp',
  'Rockwood Canal',
  'Dandelion Canal',
  'Dahlia Lateral',
  'Date Canal',
  'Thistle Lateral',
  'Imperial Valley',
  'Po',

In [6]:
import time

In [7]:
def getURI (key):
    """Look up LinkedGeoData URIs for every phrase recorded for one map.

    Each phrase in ``new_dic[key]`` is passed to ``sparql_query``. Lookups are
    best-effort: a phrase whose query fails is reported with a warning and
    skipped, so one bad request does not abort the whole run.

    Parameters
    ----------
    key
        Key into the module-level ``new_dic`` identifying the map.

    Returns
    -------
    dict
        Maps each successfully queried phrase to a list of matches. A match is
        ``[s_uri]``, or ``[s_uri, geo_uri]`` when the binding also carries a
        ``geo`` value. A phrase with no matches maps to an empty list; a
        phrase whose query failed before any result was read is absent.

    Raises
    ------
    KeyError
        If ``key`` is not in ``new_dic``.
    """
    import warnings

    tmp_dic = dict()
    for phrase in new_dic[key]:
        try:
            results = sparql_query(phrase)

            # Register the list before filling it, so matches collected before
            # a mid-way failure are still kept.
            matches = list()
            tmp_dic[phrase] = matches
            for result in results["results"]["bindings"]:
                # NOTE: an earlier draft filtered matches by a lat/long bounding
                # box here, but the current query returns no coordinates.
                match = [result['s']['value']]
                geo = result.get('geo')
                if geo is not None:
                    match.append(geo['value'])
                matches.append(match)

        # Only ordinary errors are caught: KeyboardInterrupt and SystemExit
        # must still be able to stop this long-running loop.
        except Exception as exc:
            warnings.warn("SPARQL lookup failed for %r: %r" % (phrase, exc))
            continue
    return tmp_dic

In [9]:
# Query the endpoint for every map and report the wall-clock time in seconds.
# One request is sent per phrase, so this cell is slow.
start_time = time.time()
uri_dic = {map_name: getURI(map_name) for map_name in new_dic}
elapsed_seconds = int(time.time() - start_time)

print("Duration: %d" % elapsed_seconds)

Duration: 940


In [10]:
# Display the URIs found for each phrase, grouped by map image file name.
uri_dic 

{'USGS-15-CA-brawley-e1957-s1957-p1961.jpg': {'Fairgrounds': [['http://linkedgeodata.org/triplify/node472459712'],
   ['http://linkedgeodata.org/triplify/node1621257769'],
   ['http://linkedgeodata.org/triplify/node1621264958'],
   ['http://linkedgeodata.org/triplify/node570580091'],
   ['http://linkedgeodata.org/triplify/way136245036'],
   ['http://linkedgeodata.org/triplify/way136247824'],
   ['http://linkedgeodata.org/triplify/way147612875'],
   ['http://linkedgeodata.org/triplify/way147612880'],
   ['http://linkedgeodata.org/triplify/node983913294']],
  'Lilac Drain': [],
  'El Centro': [['http://linkedgeodata.org/triplify/node2623608640'],
   ['http://linkedgeodata.org/triplify/node3802571884'],
   ['http://linkedgeodata.org/triplify/way71745414'],
   ['http://linkedgeodata.org/triplify/node1243747194'],
   ['http://linkedgeodata.org/triplify/node2464414044'],
   ['http://linkedgeodata.org/triplify/node2682445513'],
   ['http://linkedgeodata.org/triplify/node3732231293'],
   ['htt

In [12]:
import pickle
from pathlib import Path

intermediate_output = Path("intermediate_output")
# Create the output folder if it is missing; otherwise open() below raises
# FileNotFoundError on a fresh checkout.
intermediate_output.mkdir(parents=True, exist_ok=True)

# Persist the phrase-to-URI lookup so the slow query loop need not be re-run.
with open(intermediate_output / 'URIwithPhrase.pkl', 'wb') as handle:
    pickle.dump(uri_dic, handle)