### Initialize env

In [1]:
from datetime import datetime as dt
dtFmt = "%y%m%d_%H%M%S"
print (dt.now().strftime(dtFmt))
import pandas as pd
import io
import numpy as np
from urllib.request import urlopen
import json
import codecs
from math import *
from gastrodon import RemoteEndpoint,QName,ttl,URIRef,inline
from matplotlib import pyplot

200106_090124


In [2]:
#@prefix wikibase: <wikibase: <http://wikiba.se/ontology#> .
prefixes=inline("""
   @prefix wd: <http://www.wikidata.org/entity/> .
   @prefix wdt: <http://www.wikidata.org/prop/direct/> .
   @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
   @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
""").graph
endpoint=RemoteEndpoint(
   "http://query.wikidata.org/sparql"
   ,prefixes=prefixes
)

### Load data

In [3]:
with codecs.open('201911lucerneWaterFountains.enabled.json', 'r', encoding='utf8') as f:
    df = pd.read_json(f)
    df = df.transpose()
df_filtered = df.filter(items=['Name', 'GeoLocation', 'Typ', 'Quality'])

### Rename columns

In [5]:
latitudes = []
longitudes = []
for geolocation in df_filtered['GeoLocation']:
    latitudes.append(float(geolocation['latitude']))
    longitudes.append(float(geolocation['longitude']))

In [11]:
df_filtered['Y'] = latitudes
df_filtered['X'] = longitudes
df = df_filtered.filter(items=['Name', 'Y', 'X', 'Typ', 'Quality'])
df = df.rename(index=str, columns={"Name": "label_de"})

In [12]:
df

Unnamed: 0,label_de,Y,X,Typ,Quality
45,Kornmarktbrunnen,47.052190,8.306028,FOUNTAIN_TYPE_DRINKING,WATER_QUALITY_GOOD
34,Josefbrunnen,47.042078,8.299762,type_springbrunnen,WATER_QUALITY_BAD
169,Friedhofshallenbrunnen,47.060417,8.295880,type_sprudelbrunnen,WATER_QUALITY_GOOD
120,Wandbrunnen Moosmattstrasse,47.039483,8.304261,FOUNTAIN_TYPE_DRINKING,WATER_QUALITY_GOOD
193,Relibrunnen,47.035410,8.324500,type_zierbrunnen,WATER_QUALITY_BAD
...,...,...,...,...,...
229,Salzfassbrunnen,47.045223,8.347224,type_zierbrunnen,WATER_QUALITY_BAD
200,Krematoriumsbrunnen Süd II,47.061504,8.289488,FOUNTAIN_TYPE_DRINKING,WATER_QUALITY_GOOD
28,Pavarottibrunnen,47.051033,8.302585,type_zierbrunnen,WATER_QUALITY_BAD
136,Försterbrunnen Gütsch,47.049765,8.290351,FOUNTAIN_TYPE_DRINKING,WATER_QUALITY_GOOD


In [13]:
df.Typ.unique()
df.Quality.unique()

array(['WATER_QUALITY_GOOD', 'WATER_QUALITY_BAD'], dtype=object)

## Identify already existing fountains

### Query fountains from Wikidata

In [221]:
buffer = 0.0003  # in degrees, corresponds to about 20-30 meters)
bounds = {
    'minX': float(df['X'].min()) - buffer,
    'minY': float(df['Y'].min()) - buffer,
    'maxX': float(df['X'].max()) + buffer,
    'maxY': float(df['Y'].max()) + buffer
}

In [222]:
# Query fountains (both water wells and fountains) from Wikidata within bounding box found above

query_string = """ SELECT ?place ?placeLabel ?location ?date ?catalog_code ?catalogLabel ?operator ?water_supply_type
WHERE
{{
  # Enter coordinates
  SERVICE wikibase:box {{
    ?place wdt:P625 ?location .
    bd:serviceParam wikibase:cornerWest "Point(8.20651 46.998006)"^^geo:wktLiteral.
    bd:serviceParam wikibase:cornerEast "Point(8.3914 47.083523)"^^geo:wktLiteral.
  }} .
  # Is a water well or fountain or subclass of fountain
  FILTER (EXISTS {{ ?place wdt:P31/wdt:P279* wd:Q43483 }} || EXISTS {{ ?place wdt:P31/wdt:P279* wd:Q483453 }}).
  SERVICE wikibase:label {{
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE],de" .
  }} 
  OPTIONAL {{ ?place p:P528 ?catalog_code.
            ?catalog_code pq:P972 ?catalog.}}
  OPTIONAL {{ ?place wdt:P571 ?date.}}
  OPTIONAL {{ ?place wdt:P5623 ?water_supply_type}}
  OPTIONAL {{ ?place wdt:P137 ?operator.}}
}}
  """.format(**bounds)

# Perform query
query_result = endpoint.select(query_string)

In [223]:
print(query_string)

 SELECT ?place ?placeLabel ?location ?date ?catalog_code ?catalogLabel ?operator ?water_supply_type
WHERE
{
  # Enter coordinates
  SERVICE wikibase:box {
    ?place wdt:P625 ?location .
    bd:serviceParam wikibase:cornerWest "Point(8.20651 46.998006)"^^geo:wktLiteral.
    bd:serviceParam wikibase:cornerEast "Point(8.3914 47.083523)"^^geo:wktLiteral.
  } .
  # Is a water well or fountain or subclass of fountain
  FILTER (EXISTS { ?place wdt:P31/wdt:P279* wd:Q43483 } || EXISTS { ?place wdt:P31/wdt:P279* wd:Q483453 }).
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE],de" .
  } 
  OPTIONAL { ?place p:P528 ?catalog_code.
            ?catalog_code pq:P972 ?catalog.}
  OPTIONAL { ?place wdt:P571 ?date.}
  OPTIONAL { ?place wdt:P5623 ?water_supply_type}
  OPTIONAL { ?place wdt:P137 ?operator.}
}
  


###  Tidy up data

In [224]:
# Extract coordinates from Wikidata results

query_result['X'] = query_result['location'].apply(lambda l:float(l.split('(')[1].split(' ')[0]))
query_result['Y'] = query_result['location'].apply(lambda l:float(l.split(' ')[1].split(')')[0]))

In [225]:
# remove duplicate entries

# duplicate entries are caused when e.g. a fountain has catalog codes from two catalogs
query_result = query_result.drop_duplicates('place')


### Compute distances between fountains

In [226]:
# helper function to compute distances on the globe, returns distances in meters
def spherical_dist(pos1, pos2, r=6371000):
    pos1 = pos1 * np.pi / 180
    pos2 = pos2 * np.pi / 180
    cos_lat1 = np.cos(pos1[..., 0])
    cos_lat2 = np.cos(pos2[..., 0])
    cos_lat_d = np.cos(pos1[..., 0] - pos2[..., 0])
    cos_lon_d = np.cos(pos1[..., 1] - pos2[..., 1])
    return r * np.arccos(cos_lat_d - cos_lat1 * cos_lat2 * (1 - cos_lon_d))


# compute distances from each ODZ fountain to each Wikidata fountain
distances = spherical_dist(df[['X','Y']].values[:, None], query_result[['X','Y']].values)
print(distances)

[[2037.8597713  2074.53452947 1858.04373256 ... 2266.55670483
  7690.10475858 7011.66933872]
 [2037.8597713  2074.53452947 1858.04373256 ... 2266.55670483
  7690.10475858 7011.66933872]
 [2037.8597713  2074.53452947 1858.04373256 ... 2266.55670483
  7690.10475858 7011.66933872]
 ...
 [2037.8597713  2074.53452947 1858.04373256 ... 2266.55670483
  7690.10475858 7011.66933872]
 [2037.8597713  2074.53452947 1858.04373256 ... 2266.55670483
  7690.10475858 7011.66933872]
 [2037.8597713  2074.53452947 1858.04373256 ... 2266.55670483
  7690.10475858 7011.66933872]]


### Identify nearest and second nearest matches for each fountain

In [227]:
# indexes of nearest fountains
nearest_idx = np.argmin(distances, axis=1).tolist()

# QID of nearest fountains
df['nearest_qid'] = query_result.iloc[nearest_idx]['place'].apply(lambda id:id[3:]).tolist()

# distance to nearest fountain
df['nearest_distance'] = np.min(distances, axis=1).tolist()


# then remove nearest
i_line=0
for i_col in nearest_idx:
    distances[i_line, i_col] = 100000
    i_line += 1
# find distance to second nearest
df['2nd_nearest_distance'] = np.min(distances, axis=1).tolist()

df.head(100)

Unnamed: 0,label_de,Y,X,Typ,Quality,nearest_qid,nearest_distance,2nd_nearest_distance
45,Kornmarktbrunnen,47.036915,8.314452,FOUNTAIN_TYPE_DRINKING,WATER_QUALITY_GOOD,Q29785731,1551.844862,1858.043733
34,Josefbrunnen,47.036915,8.314452,type_springbrunnen,WATER_QUALITY_BAD,Q29785731,1551.844862,1858.043733
169,Friedhofshallenbrunnen,47.036915,8.314452,type_sprudelbrunnen,WATER_QUALITY_GOOD,Q29785731,1551.844862,1858.043733
120,Wandbrunnen Moosmattstrasse,47.036915,8.314452,FOUNTAIN_TYPE_DRINKING,WATER_QUALITY_GOOD,Q29785731,1551.844862,1858.043733
193,Relibrunnen,47.036915,8.314452,type_zierbrunnen,WATER_QUALITY_BAD,Q29785731,1551.844862,1858.043733
...,...,...,...,...,...,...,...,...
122,Schöttibrunnen,47.036915,8.314452,FOUNTAIN_TYPE_DRINKING,WATER_QUALITY_GOOD,Q29785731,1551.844862,1858.043733
230,Spielbrunnen Mattstrasse 13,47.036915,8.314452,FOUNTAIN_TYPE_DRINKING,WATER_QUALITY_GOOD,Q29785731,1551.844862,1858.043733
111,Sälihaldebrunnen,47.036915,8.314452,type_spielbrunnen,WATER_QUALITY_GOOD,Q29785731,1551.844862,1858.043733
31,Baigneuse-Brunnen,47.036915,8.314452,FOUNTAIN_TYPE_DRINKING,WATER_QUALITY_GOOD,Q29785731,1551.844862,1858.043733


### Find out what information already exists for the nearest fountains

In [228]:
# does nearest have label in german?
df['nearest_has_label_de'] = (query_result.iloc[nearest_idx]['place'].apply(lambda p:p[3:]) != query_result.iloc[nearest_idx]['placeLabel']).tolist()

# does nearest have date?
df['nearest_has_date'] = query_result.iloc[nearest_idx]['date'].apply(lambda d:d is not None).tolist()

# does nearest have operator?
df['nearest_has_operator'] = query_result.iloc[nearest_idx]['operator'].apply(lambda id:id is not None).tolist()

# does nearest have catalog code?
df['nearest_has_code'] = query_result.iloc[nearest_idx]['catalog_code'].apply(lambda id:id is not None).tolist()

# does nearest have water type?
df['nearest_has_water_type'] = query_result.iloc[nearest_idx]['water_supply_type'].apply(lambda id:id is not None).tolist()

### Decide on whether nearest fountain should be considered a match

In [229]:
# The nearest fountain is a match if: 
# - no further than x m away
# - 2nd nearest fountain at nearest least ratio_min further away than the nearest fountain
def validate_proposal(qid, d1, d2, dmax=10, ratio_min=0.5):
    
    if d1 == 0 or (d1<=dmax and d2/d1-1 >= ratio_min):
        return 'match'
    elif d1<=dmax and d2/d1-1 < ratio_min:
        return 'unclear'
    else:
        return 'no match'
    
for index, row in df.iterrows():
    df.loc[index, 'match_found'] = validate_proposal(
        row['nearest_qid'], 
        row['nearest_distance'], 
        row['2nd_nearest_distance'],
        dmax=15
    )
dffinal = df.drop(columns=['nearest_distance', '2nd_nearest_distance'])

In [None]:
### Optional: Write matches to a text file

## Create Quickstatement commands from data

### Helper functions to format content according to Quickstatements v1 syntax

In [231]:
def process_coordinates(x, y):
    # format geographic coordinates
    return '@{1:1.8f}/{0:1.8f}'.format(x,y)


def process_year(date):
    # format date
    if np.isnan(date):
        return ''
    else:
        return '+{0:4d}-00-00T00:00:00Z/9'.format(int(date))

    
fountain_type_map = {
    'öffentlicher Brunnen': 'Q53628296',
    'Notwasserbrunnen': 'Q53628522',
    'privater Brunnen': 'Q53629707',
    'Brunnen in städtischer Liegenschaft': 'Q53628618',
    'Brunnen des Verschönerungsvereins': 'Q53628761',
    'Brunnen mit eigener Versorgung': 'Q53630002'
}

water_type_map = {
    'Verteilnetz': 'Q53633635',
    'Quellwasser': 'Q1881858',
    'eigene Versorgung': 'Q53634173',
    'Grundwasser': 'Q161598'
}

def process_fountain_type(type):
    # translate fountain types to wikidata values
    return fountain_type_map[type]


def process_water_type(type):
    # translate water types to wikidata values
    return water_type_map[type]


def process_label_de(text):
    # process German language labels
    if text is None:
        return ''
    elif 'brunnen' in text.lower():
        return '"{}"'.format(text)
    else:
        return '"Brunnen ({})"'.format(text)
    

def createline(lines, item, prop, value, extra, qualifiers=[]):
    # general function to create Quickstatement v1 commands
    if value != '' and value != '""':
        statement = '{}\t{}\t{}'.format(item, prop, value)
        if len(qualifiers):
            # append qualifiers if applicable
            for q in qualifiers:
                statement += '\t{}\t{}'.format(q['prop'], q['value'])
        statement += extra
        statement += '\n'
        lines.append(statement)
    return lines

### Create statements, taking care not to overwrite existing data

In [233]:
# initialize command storage list
lines = []
statedId = "\tS248\tQ76822625"
for index, row in dffinal.iterrows():
    
    # either create new or edit existing entity
    if row['match_found'] == 'no match':
        # create a new fountain
        lines.append('CREATE\n')
        item = 'LAST'
    elif row['match_found'] == 'unclear':
        print('unclear match')
        print(row)
        continue
    elif row['match_found'] == 'match':
        # update existing fountain
        item = row['nearest_qid']
        
        
    # Add this basic information only if creating a new entity
    if item == 'LAST':
        # instance of drinking fountain
        lines = createline(lines, item, 'P31', 'Q1630622', statedId)

        # instance of specific water fountain type
        #lines = createline(lines, item, 'P31', process_fountain_type(row['fountain_type']), statedId)

        # coordinates
        lines = createline(lines, item, 'P625', process_coordinates(row['X'], row['Y']), statedId)
    else:
        #lines = createline(lines, item, 'P31', process_fountain_type(row['fountain_type']), statedId)
        #short URL as per https://github.com/water-fountains/proximap/issues/244
        lines = createline(lines, item, 'P973', '"https://h2o.do/'+item+'"','')
        
        
    # For other properties, add information if the entity is new or if property does not yet exist
    
    # label in german
    if item == 'LAST' or not row['nearest_has_label_de']:
        lines = createline(lines, item, 'Lde', process_label_de(row['label_de']),statedId)
    
    # creation date
#     if item == 'LAST' or not row['nearest_has_date']:
#         lines = createline(lines, item, 'P571', process_year(row['date']),statedId)

    # operated by EWL    
#     if item == 'LAST' or not row['nearest_has_operator']:
#         lines = createline(lines, item, 'P137', 'Q27229237',statedId)
    
    
    # catalog number can always be added (it is hard to check for)
    #lines = createline(lines, item, 'P528', '"{}"'.format(row['operator_id']),statedId, [{
    #    'prop': 'P972',
    #    'value': 'Q53629101'
    #}])

## Write commands to file

In [234]:
quickStatFileName = "quickstatement_commands_lucenrne_water_fountain_"+dt.now().strftime(dtFmt)+".txt"
with io.open(quickStatFileName, "w", encoding='utf8') as f:
    f.writelines(lines)
print("wrote '"+quickStatFileName+"' with "+str(len(lines))+" lines")

wrote 'quickstatement_commands_lucenrne_water_fountain_191130_131757.txt' with 596 lines


## Import into Wikidata
* Go to https://tools.wmflabs.org/wikidata-todo/quick_statements.php.
* Authenticate yourself with your Wikidata account.
* Copy and paste the contents of quickstatement_commands*.txt into the blank field, and run the commands
