# Automatic creation of short URLs of Rome drinking fountains in Wikidata
The following script adds h2o short Urls to fountain items as per https://github.com/water-fountains/proximap/issues/244
(and https://github.com/water-fountains/proximap/issues/133)

## Initialize environment

In [36]:
from datetime import datetime as dt
dtFmt = "%y%m%d_%H%M%S"
print (dt.now().strftime(dtFmt))
import pandas as pd
import io
import numpy as np
from urllib.request import urlopen
import json
from math import *
from platform import python_version
print("Python v "+python_version())
#https://github.com/paulhoule/gastrodon/issues/7 
from gastrodon import RemoteEndpoint,QName,ttl,URIRef,inline
from matplotlib import pyplot


191201_101809
Python v 3.6.5


In [37]:
#@prefix wikibase: <wikibase: <http://wikiba.se/ontology#> .
prefixes=inline("""
   @prefix wd: <http://www.wikidata.org/entity/> .
   @prefix wdt: <http://www.wikidata.org/prop/direct/> .
   @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
   @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
""").graph
endpoint=RemoteEndpoint(
   #"https://query.wikidata.org/sparql"
    "https://query.wikidata.org/bigdata/namespace/wdq/sparql"
   ,prefixes=prefixes
)

## Load data

In [38]:
df = pd.read_csv("osmFountainsRome191201_082705.csv")

In [39]:
df.head()

Unnamed: 0,type,id,lat,lon,amenity,fountain,created_by,wheelchair,flow,architect,...,operator,designation,level,location,covered,description:de,website,artwork_type,subject:wikidata,drinking_water:legal
0,node,246569213,41.824508,12.485546,drinking_water,nasone,,,,,...,,,,,,,,,,
1,node,246569214,41.852621,12.478328,drinking_water,nasone,,,,,...,,,,,,,,,,
2,node,246569215,41.854102,12.476834,drinking_water,nasone,,,,,...,,,,,,,,,,
3,node,246569216,41.863749,12.478948,drinking_water,nasone,,,,,...,,,,,,,,,,
4,node,246571139,41.904287,12.513278,drinking_water,,JOSM,,,,...,,,,,,,,,,


### Rename columns to make them easier to work with

In [40]:
# remove not needed columns
df = df.drop(columns=['type','id'])

In [41]:
# rename columns
df = df.rename(index=str, columns=
               {"lat": "Y"})

In [42]:
# rename columns
df = df.rename(index=str, columns=
               {"lon": "X"})

In [43]:
df.head()

Unnamed: 0,Y,X,amenity,fountain,created_by,wheelchair,flow,architect,description,drinking_water,...,operator,designation,level,location,covered,description:de,website,artwork_type,subject:wikidata,drinking_water:legal
0,41.824508,12.485546,drinking_water,nasone,,,,,,,...,,,,,,,,,,
1,41.852621,12.478328,drinking_water,nasone,,,,,,,...,,,,,,,,,,
2,41.854102,12.476834,drinking_water,nasone,,,,,,,...,,,,,,,,,,
3,41.863749,12.478948,drinking_water,nasone,,,,,,,...,,,,,,,,,,
4,41.904287,12.513278,drinking_water,,JOSM,,,,,,...,,,,,,,,,,


In [44]:
len(df)

1645

## Identify already existing fountains
### Query fountains from Wikidata

In [45]:
# Find the geographic extent of the data

buffer = 0.0003  # in degrees, corresponds to about 20-30 meters)
bounds = {
    'minX': df['X'].min() - buffer,
    'minY': df['Y'].min() - buffer,
    'maxX': df['X'].max() + buffer,
    'maxY': df['Y'].max() + buffer
}
print("bounds: ")
for key,value in bounds.items():
    print(key+ ": "+str(value))

bounds: 
minX: 12.3238906
minY: 41.802459999999996
maxX: 12.6220409
maxY: 41.99526720000001


In [46]:
# Query fountains (both water wells and fountains) from Wikidata within bounding box found above
# placeLabel 

query_string = """SELECT ?place ?placeLabel ?location ?date ?catalog_code ?catalogLabel ?operator ?describeAtUrl
WHERE
{{
  # Enter coordinates
  SERVICE wikibase:box {{
    ?place wdt:P625 ?location .
    bd:serviceParam wikibase:cornerWest "Point({minX} {minY})"^^geo:wktLiteral.
    bd:serviceParam wikibase:cornerEast "Point({maxX} {maxY})"^^geo:wktLiteral.
  }} .
  # Is a water well or fountain or subclass of fountain
  FILTER (EXISTS {{ ?place wdt:P31/wdt:P279* wd:Q43483 }} || EXISTS {{ ?place wdt:P31/wdt:P279* wd:Q483453 }}).
  SERVICE wikibase:label {{
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE],de" .
  }} 
  OPTIONAL {{ ?place p:P528 ?catalog_code.
            ?catalog_code pq:P972 ?catalog.}}
  OPTIONAL {{ ?place wdt:P571 ?date.}}
  OPTIONAL {{ ?place wdt:P973 ?describeAtUrl.}}
  OPTIONAL {{ ?place wdt:P137 ?operator.}}
  OPTIONAL {{ ?place rdfs:placeLabel ?placeLabel}}
}}
  """.format(**bounds)

print(query_string)

SELECT ?place ?placeLabel ?location ?date ?catalog_code ?catalogLabel ?operator ?describeAtUrl
WHERE
{
  # Enter coordinates
  SERVICE wikibase:box {
    ?place wdt:P625 ?location .
    bd:serviceParam wikibase:cornerWest "Point(12.3238906 41.802459999999996)"^^geo:wktLiteral.
    bd:serviceParam wikibase:cornerEast "Point(12.6220409 41.99526720000001)"^^geo:wktLiteral.
  } .
  # Is a water well or fountain or subclass of fountain
  FILTER (EXISTS { ?place wdt:P31/wdt:P279* wd:Q43483 } || EXISTS { ?place wdt:P31/wdt:P279* wd:Q483453 }).
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE],de" .
  } 
  OPTIONAL { ?place p:P528 ?catalog_code.
            ?catalog_code pq:P972 ?catalog.}
  OPTIONAL { ?place wdt:P571 ?date.}
  OPTIONAL { ?place wdt:P973 ?describeAtUrl.}
  OPTIONAL { ?place wdt:P137 ?operator.}
  OPTIONAL { ?place rdfs:placeLabel ?placeLabel}
}
  


In [47]:
# Perform query
query_result = endpoint.select(query_string)

In [17]:
#print(query_string)
print("\n\nTotal number of rows incl. duplicates "+str(len(query_result))+" size "+str(query_result.size))



Total number of rows incl. duplicates 219 size 1752


### Tidy up data

In [48]:
# Extract coordinates from Wikidata results

query_result['X'] = query_result['location'].apply(lambda l:float(l.split('(')[1].split(' ')[0]))
query_result['Y'] = query_result['location'].apply(lambda l:float(l.split(' ')[1].split(')')[0]))

In [49]:
query_result.head(100)

Unnamed: 0,place,placeLabel,location,date,catalog_code,catalogLabel,operator,describeAtUrl,X,Y
0,wd:Q74574057,Brunnen,Point(12.519195 41.8814447),,,,,https://h2o.do/Q74574057,12.519195,41.881445
1,wd:Q76937644,Brunnen,Point(12.485546 41.8245081),,,,,https://h2o.do/Q76937644,12.485546,41.824508
2,wd:Q76937896,Brunnen,Point(12.4783285 41.8526205),,,,,https://h2o.do/Q76937896,12.478328,41.852621
3,wd:Q76937912,Brunnen,Point(12.4768335 41.8541022),,,,,https://h2o.do/Q76937912,12.476833,41.854102
4,wd:Q76937931,Brunnen,Point(12.4789476 41.8637486),,,,,https://h2o.do/Q76937931,12.478948,41.863749
...,...,...,...,...,...,...,...,...,...,...
95,wd:Q76941464,Brunnen,Point(12.5085975 41.9004915),,,,,,12.508598,41.900492
96,wd:Q76941471,Brunnen,Point(12.5211585 41.8514972),,,,,,12.521159,41.851497
97,wd:Q76941484,Brunnen,Point(12.4807003 41.8488312),,,,,,12.480700,41.848831
98,wd:Q76941501,Brunnen,Point(12.4832348 41.8457644),,,,,,12.483235,41.845764


### Compute distances between fountains

In [50]:
# helper function to compute distances on the globe, returns distances in meters
def spherical_dist(pos1, pos2, r=6371000):
    pos1 = pos1 * np.pi / 180
    pos2 = pos2 * np.pi / 180
    cos_lat1 = np.cos(pos1[..., 0])
    cos_lat2 = np.cos(pos2[..., 0])
    cos_lat_d = np.cos(pos1[..., 0] - pos2[..., 0])
    cos_lon_d = np.cos(pos1[..., 1] - pos2[..., 1])
    return r * np.arccos(cos_lat_d - cos_lat1 * cos_lat2 * (1 - cos_lon_d))


# compute distances from each ODZ fountain to each Wikidata fountain
distances = spherical_dist(df[['X','Y']].values[:, None], query_result[['X','Y']].values)

### Identify nearest and second nearest matches for each wikidata Rome fountain

In [51]:
# indexes of nearest fountains
nearest_idx = np.argmin(distances, axis=1).tolist()

# QID of nearest fountains
df['nearest_qid'] = query_result.iloc[nearest_idx]['place'].apply(lambda id:id[3:]).tolist()

# distance to nearest fountain
df['nearest_distance'] = np.min(distances, axis=1).tolist()


# then remove nearest
i_line=0
for i_col in nearest_idx:
    distances[i_line, i_col] = 100000
    i_line += 1
# find distance to second nearest
df['2nd_nearest_distance'] = np.min(distances, axis=1).tolist()

df.head(100)

Unnamed: 0,Y,X,amenity,fountain,created_by,wheelchair,flow,architect,description,drinking_water,...,location,covered,description:de,website,artwork_type,subject:wikidata,drinking_water:legal,nearest_qid,nearest_distance,2nd_nearest_distance
0,41.824508,12.485546,drinking_water,nasone,,,,,,,...,,,,,,,,Q76937644,0.0,612.792877
1,41.852621,12.478328,drinking_water,nasone,,,,,,,...,,,,,,,,Q76937896,0.0,129.883666
2,41.854102,12.476834,drinking_water,nasone,,,,,,,...,,,,,,,,Q76937912,0.0,102.431975
3,41.863749,12.478948,drinking_water,nasone,,,,,,,...,,,,,,,,Q76937931,0.0,254.841443
4,41.904287,12.513278,drinking_water,,JOSM,,,,,,...,,,,,,,,Q76937949,0.0,298.853091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,41.901662,12.485754,drinking_water,,,,,,,,...,,,,,,,,Q76939583,0.0,97.237198
96,41.911419,12.487827,drinking_water,,JOSM,,,,,,...,,,,,,,,Q76939597,0.0,342.999629
97,41.892383,12.512610,drinking_water,nasone,,,,,,,...,,,,,,,,Q76939614,0.0,79.473433
98,41.893211,12.509447,drinking_water,nasone,,,push-button,,,,...,,,,,,,,Q76939633,0.0,139.503504


### Find out what information already exists for the nearest fountains

In [52]:
# does nearest have describedAtUrl?
df['nearest_has_described_at_url'] = query_result.iloc[nearest_idx]['describeAtUrl'].apply(lambda id:id is not None).tolist()

# does nearest have label in german?
df['nearest_has_label_de'] = (query_result.iloc[nearest_idx]['place'].apply(lambda p:p[3:]) != query_result.iloc[nearest_idx]['placeLabel']).tolist()

# does nearest have date?
df['nearest_has_date'] = query_result.iloc[nearest_idx]['date'].apply(lambda d:d is not None).tolist()

# does nearest have operator?
df['nearest_has_operator'] = query_result.iloc[nearest_idx]['operator'].apply(lambda id:id is not None).tolist()

# does nearest have catalog code?
df['nearest_has_code'] = query_result.iloc[nearest_idx]['catalog_code'].apply(lambda id:id is not None).tolist()

### Decide on whether nearest fountain should be considered a match

In [53]:
# The nearest fountain is a match if: 
# - no further than x m away
# - 2nd nearest fountain at nearest least ratio_min further away than the nearest fountain
def validate_proposal(qid, d1, d2, dmax=10, ratio_min=0.5):
    
    if d1 == 0 or (d1<=dmax and d2/d1-1 >= ratio_min):
        return 'match'
    elif d1<=dmax and d2/d1-1 < ratio_min:
        return 'unclear'
    else:
        #print("Q-# "+qid+"\td1 "+str(d1)+"\td2 "+str(d2))
        return 'no match'
    
for index, row in df.iterrows():
    df.loc[index, 'match_found'] = validate_proposal(
        row['nearest_qid'], 
        row['nearest_distance'], 
        row['2nd_nearest_distance'],
        dmax=15
    )
dffinal = df.drop(columns=['nearest_distance', '2nd_nearest_distance'])

In [54]:
len(dffinal)

1645

In [55]:
dffinal

Unnamed: 0,Y,X,amenity,fountain,created_by,wheelchair,flow,architect,description,drinking_water,...,artwork_type,subject:wikidata,drinking_water:legal,nearest_qid,nearest_has_described_at_url,nearest_has_label_de,nearest_has_date,nearest_has_operator,nearest_has_code,match_found
0,41.824508,12.485546,drinking_water,nasone,,,,,,,...,,,,Q76937644,True,True,False,False,False,match
1,41.852621,12.478328,drinking_water,nasone,,,,,,,...,,,,Q76937896,True,True,False,False,False,match
2,41.854102,12.476834,drinking_water,nasone,,,,,,,...,,,,Q76937912,True,True,False,False,False,match
3,41.863749,12.478948,drinking_water,nasone,,,,,,,...,,,,Q76937931,True,True,False,False,False,match
4,41.904287,12.513278,drinking_water,,JOSM,,,,,,...,,,,Q76937949,True,True,False,False,False,match
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1640,41.913437,12.461680,drinking_water,nasone,,,,,,,...,,,,Q3747400,False,False,False,False,False,no match
1641,41.902114,12.590335,drinking_water,,,yes,,,,,...,,,,Q76939216,True,True,False,False,False,no match
1642,41.888007,12.461600,drinking_water,nasone,,,,,,,...,,,,Q76938934,True,True,False,False,False,no match
1643,41.903375,12.512719,drinking_water,,,,,,,,...,,,,Q76937949,True,True,False,False,False,no match


In [56]:
print("\n\nTotal number of rows "+str(len(dffinal)))



Total number of rows 1645


## Create Quickstatement commands from data
### Helper functions to format content according to Quickstatements v1 syntax

In [57]:
def createline(lines, item, prop, value, extra, qualifiers=[]):
    # general function to create Quickstatement v1 commands
    if value != '' and value != '""':
        statement = '{}\t{}\t{}'.format(item, prop, value)
        if len(qualifiers):
            # append qualifiers if applicable
            for q in qualifiers:
                statement += '\t{}\t{}'.format(q['prop'], q['value'])
        statement += extra
        statement += '\n'
        lines.append(statement)
    return lines

### Create statements, taking care not to overwrite existing data

In [59]:
# initialize command storage list
lines = []
i=0
for index, row in dffinal.iterrows():
    i+=1
    # either create new or edit existing entity
    if row['match_found'] == 'no match':
        continue
    elif row['match_found'] == 'unclear':
        print('unclear match')
        print(row)
        continue
    elif row['match_found'] == 'match':
        # update existing fountain
        item = row['nearest_qid']
        
        
    # Add this basic information only if creating a new entity
    if row['nearest_has_described_at_url']:
        print(str(i)+' '+item+' has ready a shortUrl')
    else:
        #short URL as per https://github.com/water-fountains/proximap/issues/244
        lines = createline(lines, item, 'P973', '"https://h2o.do/'+item+'"','')        


1 Q76937644 has ready a shortUrl
2 Q76937896 has ready a shortUrl
3 Q76937912 has ready a shortUrl
4 Q76937931 has ready a shortUrl
5 Q76937949 has ready a shortUrl
6 Q76937968 has ready a shortUrl
7 Q76937986 has ready a shortUrl
8 Q76938003 has ready a shortUrl
9 Q76938024 has ready a shortUrl
10 Q76938042 has ready a shortUrl
11 Q76938058 has ready a shortUrl
12 Q76938075 has ready a shortUrl
13 Q76938093 has ready a shortUrl
14 Q76938110 has ready a shortUrl
15 Q76938125 has ready a shortUrl
16 Q76938144 has ready a shortUrl
17 Q76938161 has ready a shortUrl
18 Q76938177 has ready a shortUrl
19 Q76938196 has ready a shortUrl
20 Q76938214 has ready a shortUrl
21 Q76938234 has ready a shortUrl
22 Q76938253 has ready a shortUrl
23 Q76938275 has ready a shortUrl
24 Q76938291 has ready a shortUrl
25 Q76938312 has ready a shortUrl
26 Q76938331 has ready a shortUrl
27 Q76938351 has ready a shortUrl
28 Q76938361 has ready a shortUrl
29 Q76938377 has ready a shortUrl
30 Q76938390 has ready 

# Write commands to file

In [60]:
quickStatFileName = "quickstatement_commands_Rome_fountain_shortUrls_"+dt.now().strftime(dtFmt)+".txt"
with io.open(quickStatFileName, "w", encoding='utf8') as f:
    f.writelines(lines)
print("wrote '"+quickStatFileName+"' with "+str(len(lines))+" lines")

wrote 'quickstatement_commands_Rome_fountain_shortUrls_191201_102001.txt' with 155 lines


# Import into Wikidata
- Go to https://tools.wmflabs.org/wikidata-todo/quick_statements.php.
- Authenticate yourself with your Wikidata account.
- Copy and paste the contents of quickstatement_commands*.txt into the blank field, and run the commands

see ../20191030_1600_import.png

...
58. Processing Q72935495 (Q72935495 Lde "Brunnen (Seelöwe-Planschbecken )")
59. Processing Q72935495 (Q72935495 P137 Q27229237)

All done!.

In [15]:
# it may well take half an hour until it works https://query.wikidata.org/