In [1]:
import pandas as pd
import io
import numpy as np
from urllib.request import urlopen
import json
from math import *
from gastrodon import RemoteEndpoint,QName,ttl,URIRef,inline
from matplotlib import pyplot

In [2]:
# helper function to replace None with ''
def none2str(string):
    if string is None:
        return ''
    else:
        return string

In [3]:
# helper to compute distances on the globe
def spherical_dist(pos1, pos2, r=6371000):
    pos1 = pos1 * np.pi / 180
    pos2 = pos2 * np.pi / 180
    cos_lat1 = np.cos(pos1[..., 0])
    cos_lat2 = np.cos(pos2[..., 0])
    cos_lat_d = np.cos(pos1[..., 0] - pos2[..., 0])
    cos_lon_d = np.cos(pos1[..., 1] - pos2[..., 1])
    return r * np.arccos(cos_lat_d - cos_lat1 * cos_lat2 * (1 - cos_lon_d))

In [4]:
#@prefix wikibase: <wikibase: <http://wikiba.se/ontology#> .
prefixes=inline("""
   @prefix wd: <http://www.wikidata.org/entity/> .
   @prefix wdt: <http://www.wikidata.org/prop/direct/> .
   @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
   @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
""").graph
endpoint=RemoteEndpoint(
   "http://query.wikidata.org/sparql"
   ,prefixes=prefixes
)

## Load data

In [5]:
data_url = 'https://data.stadt-zuerich.ch/dataset/brunnen/resource/d741cf9c-63be-495f-8c3e-9418168fcdbf/download/brunnen.json'

In [6]:
data_js = json.loads(urlopen(data_url).read())

## Modify columns

In [7]:
# convert to dataframe
df = pd.io.json.json_normalize(data_js['features'])
# extract coords
df['X'] = df['geometry.coordinates'].apply(lambda crds:crds[0])
df['Y'] = df['geometry.coordinates'].apply(lambda crds:crds[1])
# remove not needed columns
df = df.drop(columns=['geometry.coordinates', 'geometry.type', 'properties.objectid', 'type'])
# rename columns
df = df.rename(index=str, columns=
               {"properties.bezeichnung": "label_de", 
                "properties.brunnenart_txt": "fountain_type", 
                "properties.wasserart_txt": "water_type", 
                "properties.nummer":"operator_id",
                "properties.historisches_baujahr": "date"
               })
# remove "None" from appelation column
df['label_de'] = df['label_de'].apply(lambda a:none2str(a))
df.head()

Unnamed: 0,label_de,fountain_type,date,operator_id,water_type,X,Y
0,Aussichtsturm,öffentlicher Brunnen,1970.0,510,Verteilnetz,8.599255,47.369752
1,,öffentlicher Brunnen,1933.0,349,Verteilnetz,8.590811,47.369293
2,Biberlinterrasse,öffentlicher Brunnen,1965.0,365,Quellwasser,8.575754,47.36613
3,,öffentlicher Brunnen,1910.0,338,Quellwasser,8.564845,47.370993
4,,Notwasserbrunnen,1988.0,6069,Quellwasser,8.56439,47.369327


## Identify duplicates

In [8]:
# Get bounding box of data
buffer = 0.0003  #(about 20-30 meters)
bounds = {
    'minX': df['X'].min() - buffer,
    'minY': df['Y'].min() - buffer,
    'maxX': df['X'].max() + buffer,
    'maxY': df['Y'].max() + buffer
}

In [79]:
# query fountains within bounding box
query_string = """ SELECT ?place ?placeLabel ?location ?date ?catalog_code ?catalogLabel ?operator
WHERE
{{
  # Enter coordinates
  SERVICE wikibase:box {{
    ?place wdt:P625 ?location .
    bd:serviceParam wikibase:cornerWest "Point({minX} {minY})"^^geo:wktLiteral.
    bd:serviceParam wikibase:cornerEast "Point({maxX} {maxY})"^^geo:wktLiteral.
  }} .
  # Is a water well or fountain or subclass of fountain
  FILTER (EXISTS {{ ?place wdt:P31/wdt:P279* wd:Q43483 }} || EXISTS {{ ?place wdt:P31/wdt:P279* wd:Q483453 }}).
  SERVICE wikibase:label {{
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE],de" .
  }} 
  OPTIONAL {{ ?place p:P528 ?catalog_code.
            ?catalog_code pq:P972 ?catalog.}}
  OPTIONAL {{ ?place wdt:P571 ?date.}}
  OPTIONAL {{ ?place wdt:P137 ?operator.}}
}}
  """.format(**bounds)

# Perform query
query_result = endpoint.select(query_string)

In [80]:
# Extract lat and lon
query_result['X'] = query_result['location'].apply(lambda l:float(l.split('(')[1].split(' ')[0]))
query_result['Y'] = query_result['location'].apply(lambda l:float(l.split(' ')[1].split(')')[0]))

In [81]:
len(query_result)

1294

In [82]:
query_result.head()

Unnamed: 0,place,placeLabel,location,date,catalog_code,catalogLabel,operator,X,Y
0,wd:Q27229836,Brunnen mit separater Steinfigur,Point(8.535417658 47.331831493),1933-01-01T00:00:00Z,http://www.wikidata.org/entity/statement/Q2722...,Kunst im Stadtraum,,8.535418,47.331831
1,wd:Q27229839,Trinkbrunnen Tramendstation Wollishofen,Point(8.530696166 47.338459928),1965-01-01T00:00:00Z,http://www.wikidata.org/entity/statement/Q2722...,Kunst im Stadtraum,,8.530696,47.33846
2,wd:Q27229840,"Wandbrunnen ""Kauernde"" und versch. Steinreliefs",Point(8.569822286 47.372759402),1910-01-01T00:00:00Z,http://www.wikidata.org/entity/statement/Q2722...,Kunst im Stadtraum,,8.569822,47.372759
3,wd:Q27229841,Toblerbrunnen,Point(8.559929469 47.379465936),1914-01-01T00:00:00Z,http://www.wikidata.org/entity/statement/Q2722...,Kunst im Stadtraum,,8.559929,47.379466
4,wd:Q27229842,"Brunnen ""Frau mit Zicklein""",Point(8.57142559 47.382487794),1955-01-01T00:00:00Z,http://www.wikidata.org/entity/statement/Q2722...,Kunst im Stadtraum,,8.571426,47.382488


In [83]:
# remove duplicate entries
query_result = query_result.drop_duplicates('place')

In [84]:
len(query_result)

1292

In [85]:
# compute distance matrix
distances = spherical_dist(df[['X','Y']].values[:, None], query_result[['X','Y']].values)

# find nearest existing fountain for each fountain to import
#indexes of nearest
nearest_idx = np.argmin(distances, axis=1).tolist()
# qid of nearest
df['nearest_qid'] = query_result.iloc[nearest_idx]['place'].apply(lambda id:id[3:]).tolist()
# does nearest have label in german?
df['nearest_has_label_de'] = (query_result.iloc[nearest_idx]['place'].apply(lambda p:p[3:]) != query_result.iloc[nearest_idx]['placeLabel']).tolist()
# does nearest have date?
df['nearest_has_date'] = query_result.iloc[nearest_idx]['date'].apply(lambda d:d is not None).tolist()
# does nearest have operator?
df['nearest_has_operator'] = query_result.iloc[nearest_idx]['operator'].apply(lambda id:id is not None).tolist()
# does nearest have catalog code?
df['nearest_has_code'] = query_result.iloc[nearest_idx]['catalog_code'].apply(lambda id:id is not None).tolist()
# distance of nearest
df['nearest_distance'] = np.min(distances, axis=1).tolist()


# then remove nearest
i_line=0
for i_col in nearest_idx:
    distances[i_line, i_col] = 100000
    i_line += 1
# find distance to second nearest
df['2nd_nearest_distance'] = np.min(distances, axis=1).tolist()

df.head(100)

Unnamed: 0,label_de,fountain_type,date,operator_id,water_type,X,Y,nearest_qid,nearest_has_date,nearest_has_operator,nearest_has_code,nearest_distance,2nd_nearest_distance,qid,nearest_has_label_de
0,Aussichtsturm,öffentlicher Brunnen,1970.0,510,Verteilnetz,8.599255,47.369752,Q55165903,False,True,False,0.000000,342.913025,Q55165903,True
1,,öffentlicher Brunnen,1933.0,349,Verteilnetz,8.590811,47.369293,Q55165905,False,True,False,0.000000,541.765153,Q55165905,False
2,Biberlinterrasse,öffentlicher Brunnen,1965.0,365,Quellwasser,8.575754,47.366130,Q55165919,False,True,False,0.000000,360.590819,Q55165919,True
3,,öffentlicher Brunnen,1910.0,338,Quellwasser,8.564845,47.370993,Q27230192,True,False,True,1.980033,190.081520,Q27230192,True
4,,Notwasserbrunnen,1988.0,6069,Quellwasser,8.564390,47.369327,Q55165936,False,True,False,0.000000,188.767471,Q55165936,False
5,,öffentlicher Brunnen,1903.0,317,Quellwasser,8.560110,47.365197,Q55165939,False,True,False,0.000000,179.860503,Q55165939,False
6,,öffentlicher Brunnen,1907.0,329,Quellwasser,8.562215,47.365124,Q55165953,False,True,False,0.000000,213.801415,Q55165953,False
7,,öffentlicher Brunnen,1906.0,327,Quellwasser,8.560303,47.366987,Q55166016,False,True,False,0.000000,156.849604,Q55166016,False
8,,öffentlicher Brunnen,1903.0,315,Quellwasser,8.556960,47.368494,Q55166017,False,True,False,0.000000,133.190434,Q55166017,False
9,,öffentlicher Brunnen,1892.0,313,Quellwasser,8.555891,47.369544,Q55166032,False,True,False,0.000000,15.021392,Q55166032,False


In [86]:
# use the identified qid only if criteria are met: 
# - no further than 10 m away
# - next closest fountain at neares least 50% further away
def validate_proposal(qid, d1, d2, dmax=10, ratio_min=0.5):
    
    if d1 == 0 or (d1<=dmax and d2/d1-1 >= ratio_min):
        return qid
    elif d1<=dmax and d2/d1-1 < ratio_min:
        return 'unclear'
    else:
        return ''

In [87]:
for index, row in df.iterrows():
    df.loc[index, 'qid'] = validate_proposal(
        row['nearest_qid'], 
        row['nearest_distance'], 
        row['2nd_nearest_distance'],
        dmax=15
    )
dffinal = df.drop(columns=['nearest_qid', 'nearest_distance', '2nd_nearest_distance'])

In [88]:
dffinal.head()

Unnamed: 0,label_de,fountain_type,date,operator_id,water_type,X,Y,nearest_has_date,nearest_has_operator,nearest_has_code,qid,nearest_has_label_de
0,Aussichtsturm,öffentlicher Brunnen,1970.0,510,Verteilnetz,8.599255,47.369752,False,True,False,Q55165903,True
1,,öffentlicher Brunnen,1933.0,349,Verteilnetz,8.590811,47.369293,False,True,False,Q55165905,False
2,Biberlinterrasse,öffentlicher Brunnen,1965.0,365,Quellwasser,8.575754,47.36613,False,True,False,Q55165919,True
3,,öffentlicher Brunnen,1910.0,338,Quellwasser,8.564845,47.370993,True,False,True,Q27230192,True
4,,Notwasserbrunnen,1988.0,6069,Quellwasser,8.56439,47.369327,False,True,False,Q55165936,False


## Format for wikidata import

In [89]:
def process_coordinates(x, y):
    return '@{1:1.8f}/{0:1.8f}'.format(x,y)

def process_year(date):
    if np.isnan(date):
        return ''
    else:
        return '+{0:4d}-00-00T00:00:00Z/9'.format(int(date))

fountain_type_map = {
    'öffentlicher Brunnen': 'Q53628296',
    'Notwasserbrunnen': 'Q53628522',
    'privater Brunnen': 'Q53629707',
    'Brunnen in städtischer Liegenschaft': 'Q53628618',
    'Brunnen des Verschönerungsvereins': 'Q53628761',
    'Brunnen mit eigener Versorgung': 'Q53630002'
}

def process_fountain_type(type):
    return fountain_type_map[type]

def createline(lines, item, prop, value, qualifiers=[]):
    if value != '' and value != '""':
        statement = '{}\t{}\t{}'.format(item, prop, value)
        if len(qualifiers):
            # append qualifiers if applicable
            for q in qualifiers:
                statement += '\t{}\t{}'.format(q['prop'], q['value'])
        statement += '\n'
        lines.append(statement)
    return lines

def process_label_de(text):
    if text == '':
        return text
    elif 'brunnen' in text.lower():
        return '"{}"'.format(text)
    else:
        return '"Brunnen ({})"'.format(text)

In [94]:
lines = []

for index, row in dffinal.iterrows():
    if row['qid'] == '':
        # create a new fountain
        lines.append('CREATE\n')
        item = 'LAST'
    elif row['qid'] == 'unclear':
        print('unclear match')
        print(row)
        continue
    else:
        # update existing fountain
        item = row['qid']
        
    if item == 'LAST' or not row['nearest_has_label_de']:
        # label in german
        lines = createline(lines, item, 'Lde', process_label_de(row['label_de']))
    
    if item == 'LAST':
        # instance of drinking fountain
        lines = createline(lines, item, 'P31', 'Q1630622')

        # instance of specific water fountain type
        lines = createline(lines, item, 'P31', process_fountain_type(row['fountain_type']))

        # coordinates
        lines = createline(lines, item, 'P625', process_coordinates(row['X'], row['Y']))
    
    if item == 'LAST' or not row['nearest_has_date']:
        # creation date
        lines = createline(lines, item, 'P571', process_year(row['date']))
    
    if item == 'LAST' or not row['nearest_has_operator']:
        # operated by WVZ
        lines = createline(lines, item, 'P137', 'Q27229237')
    
    # if item == 'LAST' or not row['nearest_has_code']:
    # catalog number
    lines = createline(lines, item, 'P528', '"{}"'.format(row['operator_id']), [{
        'prop': 'P972',
        'value': 'Q53629101'
    }])

# Write commands to file

In [95]:
with io.open("quickstatement_commands.txt", "w", encoding='utf8') as f:
    f.writelines(lines)

In [92]:
len(dffinal['qid'].unique())

1272

In [93]:
df.iloc[904]

label_de                      Manessebrunnen
fountain_type           öffentlicher Brunnen
date                                    1931
operator_id                               53
water_type                       Verteilnetz
X                                    8.54677
Y                                    47.3709
nearest_qid                        Q27229690
nearest_has_date                        True
nearest_has_operator                   False
nearest_has_code                        True
nearest_distance                    0.284806
2nd_nearest_distance                 1.95253
qid                                Q27229690
nearest_has_label_de                    True
Name: 904, dtype: object