# Automatic import of Rome drinking fountains data to Wikidata
The following script downloads fountain data from osm as per https://github.com/water-fountains/proximap/issues/133, compares it to existing fountains in Wikidata for the same region, and creates Wikidata Quickstatement commands to complete the entries in Wikidata. New entities are created if no matching fountains are found.

## Initialize environment

In [1]:
from datetime import datetime as dt
dtFmt = "%y%m%d_%H%M%S"
print (dt.now().strftime(dtFmt))
import pandas as pd
import io
import numpy as np
from urllib.request import urlopen
import json
from math import *
from platform import python_version
print("Python v "+python_version())
#https://github.com/paulhoule/gastrodon/issues/7 
from gastrodon import RemoteEndpoint,QName,ttl,URIRef,inline
from matplotlib import pyplot


191201_082918
Python v 3.6.5


In [2]:
#@prefix wikibase: <wikibase: <http://wikiba.se/ontology#> .
prefixes=inline("""
   @prefix wd: <http://www.wikidata.org/entity/> .
   @prefix wdt: <http://www.wikidata.org/prop/direct/> .
   @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
   @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
""").graph
endpoint=RemoteEndpoint(
   #"https://query.wikidata.org/sparql"
    "https://query.wikidata.org/bigdata/namespace/wdq/sparql"
   ,prefixes=prefixes
)

## Load data

In [3]:
df = pd.read_csv("osmFountainsRome191201_082705.csv")

In [4]:
df.head()

Unnamed: 0,type,id,lat,lon,amenity,fountain,created_by,wheelchair,flow,architect,...,operator,designation,level,location,covered,description:de,website,artwork_type,subject:wikidata,drinking_water:legal
0,node,246569213,41.824508,12.485546,drinking_water,nasone,,,,,...,,,,,,,,,,
1,node,246569214,41.852621,12.478328,drinking_water,nasone,,,,,...,,,,,,,,,,
2,node,246569215,41.854102,12.476834,drinking_water,nasone,,,,,...,,,,,,,,,,
3,node,246569216,41.863749,12.478948,drinking_water,nasone,,,,,...,,,,,,,,,,
4,node,246571139,41.904287,12.513278,drinking_water,,JOSM,,,,...,,,,,,,,,,


### Rename columns to make them easier to work with

In [5]:
# remove not needed columns
df = df.drop(columns=['type','id'])

In [6]:
# rename columns
df = df.rename(index=str, columns=
               {"lat": "Y"})

In [7]:
# rename columns
df = df.rename(index=str, columns=
               {"lon": "X"})

In [8]:
df.head()

Unnamed: 0,Y,X,amenity,fountain,created_by,wheelchair,flow,architect,description,drinking_water,...,operator,designation,level,location,covered,description:de,website,artwork_type,subject:wikidata,drinking_water:legal
0,41.824508,12.485546,drinking_water,nasone,,,,,,,...,,,,,,,,,,
1,41.852621,12.478328,drinking_water,nasone,,,,,,,...,,,,,,,,,,
2,41.854102,12.476834,drinking_water,nasone,,,,,,,...,,,,,,,,,,
3,41.863749,12.478948,drinking_water,nasone,,,,,,,...,,,,,,,,,,
4,41.904287,12.513278,drinking_water,,JOSM,,,,,,...,,,,,,,,,,


In [23]:
len(df)

127

## Identify already existing fountains
### Query fountains from Wikidata

In [9]:
# Find the geographic extent of the data

buffer = 0.0003  # in degrees, corresponds to about 20-30 meters)
bounds = {
    'minX': df['X'].min() - buffer,
    'minY': df['Y'].min() - buffer,
    'maxX': df['X'].max() + buffer,
    'maxY': df['Y'].max() + buffer
}
print("bounds: ")
for key,value in bounds.items():
    print(key+ ": "+str(value))

bounds: 
minX: 12.3238906
minY: 41.802459999999996
maxX: 12.6220409
maxY: 41.99526720000001


In [10]:
# Query fountains (both water wells and fountains) from Wikidata within bounding box found above
# placeLabel 

query_string = """SELECT ?place ?placeLabel ?location ?date ?catalog_code ?catalogLabel ?operator ?water_supply_type
WHERE
{{
  # Enter coordinates
  SERVICE wikibase:box {{
    ?place wdt:P625 ?location .
    bd:serviceParam wikibase:cornerWest "Point({minX} {minY})"^^geo:wktLiteral.
    bd:serviceParam wikibase:cornerEast "Point({maxX} {maxY})"^^geo:wktLiteral.
  }} .
  # Is a water well or fountain or subclass of fountain
  FILTER (EXISTS {{ ?place wdt:P31/wdt:P279* wd:Q43483 }} || EXISTS {{ ?place wdt:P31/wdt:P279* wd:Q483453 }}).
  SERVICE wikibase:label {{
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE],de" .
  }} 
  OPTIONAL {{ ?place p:P528 ?catalog_code.
            ?catalog_code pq:P972 ?catalog.}}
  OPTIONAL {{ ?place wdt:P571 ?date.}}
  OPTIONAL {{ ?place wdt:P5623 ?water_supply_type}}
  OPTIONAL {{ ?place wdt:P137 ?operator.}}
  OPTIONAL {{ ?place rdfs:placeLabel ?placeLabel}}
}}
  """.format(**bounds)

print(query_string)

SELECT ?place ?placeLabel ?location ?date ?catalog_code ?catalogLabel ?operator ?water_supply_type
WHERE
{
  # Enter coordinates
  SERVICE wikibase:box {
    ?place wdt:P625 ?location .
    bd:serviceParam wikibase:cornerWest "Point(12.3238906 41.802459999999996)"^^geo:wktLiteral.
    bd:serviceParam wikibase:cornerEast "Point(12.6220409 41.99526720000001)"^^geo:wktLiteral.
  } .
  # Is a water well or fountain or subclass of fountain
  FILTER (EXISTS { ?place wdt:P31/wdt:P279* wd:Q43483 } || EXISTS { ?place wdt:P31/wdt:P279* wd:Q483453 }).
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE],de" .
  } 
  OPTIONAL { ?place p:P528 ?catalog_code.
            ?catalog_code pq:P972 ?catalog.}
  OPTIONAL { ?place wdt:P571 ?date.}
  OPTIONAL { ?place wdt:P5623 ?water_supply_type}
  OPTIONAL { ?place wdt:P137 ?operator.}
  OPTIONAL { ?place rdfs:placeLabel ?placeLabel}
}
  


In [11]:
# Perform query
query_result = endpoint.select(query_string)

In [12]:
#print(query_string)
print("\n\nTotal number of rows incl. duplicates "+str(len(query_result))+" size "+str(query_result.size))



Total number of rows incl. duplicates 47 size 376


### Tidy up data

In [13]:
# Extract coordinates from Wikidata results

query_result['X'] = query_result['location'].apply(lambda l:float(l.split('(')[1].split(' ')[0]))
query_result['Y'] = query_result['location'].apply(lambda l:float(l.split(' ')[1].split(')')[0]))

In [14]:
query_result.head(100)

Unnamed: 0,place,placeLabel,location,date,catalog_code,catalogLabel,operator,water_supply_type,X,Y
0,wd:Q185382,Trevi-Brunnen,Point(12.483166666 41.900875),1735-01-01T00:00:00Z,,,,,12.483167,41.900875
1,wd:Q267971,Tritonenbrunnen,Point(12.488333333 41.903611111),,,,,,12.488333,41.903611
2,wd:Q655470,Vierströmebrunnen,Point(12.473075 41.898953),1640-01-01T00:00:00Z,,,,,12.473075,41.898953
3,wd:Q983774,Mosesbrunnen,Point(12.49441667 41.90436111),,,,,,12.494417,41.904361
4,wd:Q1085738,Q1085738,Point(12.479643 41.907771),1581-01-01T00:00:00Z,,,,,12.479643,41.907771
5,wd:Q1114231,Fontana della Barcaccia,Point(12.482212 41.905806),1629-01-01T00:00:00Z,,,,,12.482212,41.905806
6,wd:Q1217302,Die Vier Brunnen,Point(12.490736 41.901962),,,,,,12.490736,41.901962
7,wd:Q1227084,Dioskurenbrunnen,Point(12.48669167 41.89911389),1588-01-01T00:00:00Z,,,,,12.486692,41.899114
8,wd:Q1234446,Septizodium,Point(12.488719 41.885633),,,,,,12.488719,41.885633
9,wd:Q1303855,Galeerenbrunnen,Point(12.45516389 41.90633333),,,,,,12.455164,41.906333


In [128]:
# show duplicates
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.duplicated.html#pandas.DataFrame.duplicated

ids = query_result['placeLabel']
#ids = query_result['label']
duplicates = query_result[ids.isin(ids[ids.duplicated(keep=False)])]
dupli = duplicates.drop(columns=['catalog_code', 'catalogLabel', 'water_supply_type', 'date', 'catalogLabel', 'location', 'operator'])
dupli.sort_values(by=['placeLabel'],inplace=True)
print ("Duplicates: "+str(len(dupli))+"\n\n")
print(dupli.to_string())

Duplicates: 0


Empty DataFrame
Columns: [place, placeLabel, X, Y]
Index: []


In [129]:
def createMerge(dupli, extraText):
    #Create MERGE Command
    linesDup = []
    prevQ= ""
    prevX=0
    prevY=0
    for index, row in dupli.iterrows():
        
        # either create new or edit existing entity
        if row['X'] == prevX:
            if row['Y'] == prevY:
                lineDup = "MERGE\t"+prevQ[3:]+"\t"+row['place'][3:]+"\n"
                linesDup.append(lineDup)
            else:
                prevX=row['X']
                prevY=row['Y']
                prevQ=row['place']
        else:
            prevX=row['X']
            prevY=row['Y']
            prevQ=row['place']
            
    print("Merge commands"+extraText+" total: "+str(len(linesDup))+"\n\n")
    print(linesDup)
    
createMerge(dupli, "")    

Merge commands total: 0


[]


In [130]:
#write MERGE command to File
quickStatDupliFileName = "quickstatement_commands_Hamburg_drink_DUPLI_"+dt.now().strftime(dtFmt)+".txt"
with io.open(quickStatDupliFileName, "w", encoding='utf8') as f:
    f.writelines(linesDup)
print("wrote '"+quickStatDupliFileName+"' with "+str(len(linesDup))+" lines")

NameError: name 'linesDup' is not defined

In [37]:
# show duplicates STRICT
#idsS = query_result['placeLabel','X','Y'] #this does not work yet
#duplicatesS = query_result[ids.isin(ids[ids.duplicated(keep=False)])]
#dupliS = duplicatesS.drop(columns=['catalog_code', 'catalogLabel', 'water_supply_type', 'date', 'catalogLabel', 'location', 'operator'])
#dupliS.sort_values(by=['placeLabel'],inplace=True)
#print(dupliS.to_string())
#createMerge(dupliS, " - strict")  

In [132]:
# remove duplicate entries

# duplicate entries are caused when e.g. a fountain has catalog codes from two catalogs
query_result = query_result.drop_duplicates('place')
print("\n\nTotal number of rows without duplicates "+str(len(query_result)))



Total number of rows without duplicates 130


In [133]:
query_result.head(100)

Unnamed: 0,place,placeLabel,location,date,catalog_code,catalogLabel,operator,water_supply_type,X,Y
0,wd:Q1454814,Minervabrunnen,Point(9.95166667 53.54583333),,,,,,9.951667,53.545833
1,wd:Q814528,Behnbrunnen,Point(9.94194444 53.54805556),,,,,,9.941944,53.548056
2,wd:Q2358953,Stuhlmannbrunnen,Point(9.93527778 53.54972222),,,,,,9.935278,53.549722
3,wd:Q2651496,Alsterfontäne,Point(9.99495278 53.55461667),,,,,,9.994953,53.554617
4,wd:Q18632417,Vierländerin-Brunnen,Point(9.99014 53.54766),,,,,,9.990140,53.547660
...,...,...,...,...,...,...,...,...,...,...
95,wd:Q76308142,Q76308142,Point(10.2961728 53.4268728),,,,,,10.296173,53.426873
96,wd:Q76308157,Q76308157,Point(9.9418923 53.548183),,,,,,9.941892,53.548183
97,wd:Q76308173,Q76308173,Point(10.0831994 53.626006),,,,,,10.083199,53.626006
98,wd:Q76308188,Q76308188,Point(9.9919829 53.5509746),,,,,,9.991983,53.550975


### Compute distances between fountains

In [15]:
# helper function to compute distances on the globe, returns distances in meters
def spherical_dist(pos1, pos2, r=6371000):
    pos1 = pos1 * np.pi / 180
    pos2 = pos2 * np.pi / 180
    cos_lat1 = np.cos(pos1[..., 0])
    cos_lat2 = np.cos(pos2[..., 0])
    cos_lat_d = np.cos(pos1[..., 0] - pos2[..., 0])
    cos_lon_d = np.cos(pos1[..., 1] - pos2[..., 1])
    return r * np.arccos(cos_lat_d - cos_lat1 * cos_lat2 * (1 - cos_lon_d))


# compute distances from each ODZ fountain to each Wikidata fountain
distances = spherical_dist(df[['X','Y']].values[:, None], query_result[['X','Y']].values)

### Identify nearest and second nearest matches for each osm hamburg fountain

In [16]:
# indexes of nearest fountains
nearest_idx = np.argmin(distances, axis=1).tolist()

# QID of nearest fountains
df['nearest_qid'] = query_result.iloc[nearest_idx]['place'].apply(lambda id:id[3:]).tolist()

# distance to nearest fountain
df['nearest_distance'] = np.min(distances, axis=1).tolist()


# then remove nearest
i_line=0
for i_col in nearest_idx:
    distances[i_line, i_col] = 100000
    i_line += 1
# find distance to second nearest
df['2nd_nearest_distance'] = np.min(distances, axis=1).tolist()

df.head(100)

Unnamed: 0,Y,X,amenity,fountain,created_by,wheelchair,flow,architect,description,drinking_water,...,location,covered,description:de,website,artwork_type,subject:wikidata,drinking_water:legal,nearest_qid,nearest_distance,2nd_nearest_distance
0,41.824508,12.485546,drinking_water,nasone,,,,,,,...,,,,,,,,Q3747399,6050.595665,6645.370926
1,41.852621,12.478328,drinking_water,nasone,,,,,,,...,,,,,,,,Q3747399,2927.877099,3765.661694
2,41.854102,12.476834,drinking_water,nasone,,,,,,,...,,,,,,,,Q3747399,2763.708809,3620.838026
3,41.863749,12.478948,drinking_water,nasone,,,,,,,...,,,,,,,,Q3747399,1729.133230,2612.556528
4,41.904287,12.513278,drinking_water,,JOSM,,,,,,...,,,,,,,,Q3747439,1739.133915,1895.744243
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,41.901662,12.485754,drinking_water,,,,,,,,...,,,,,,,,Q1227084,295.652129,300.123782
96,41.911419,12.487827,drinking_water,,JOSM,,,,,,...,,,,,,,,Q857540,778.610155,849.481370
97,41.892383,12.512610,drinking_water,nasone,,,,,,,...,,,,,,,,Q3747378,1056.359741,1395.008816
98,41.893211,12.509447,drinking_water,nasone,,,push-button,,,,...,,,,,,,,Q3747378,862.456277,1204.654015


### Find out what information already exists for the nearest fountains

In [17]:
# does nearest have label in german?
df['nearest_has_label_de'] = (query_result.iloc[nearest_idx]['place'].apply(lambda p:p[3:]) != query_result.iloc[nearest_idx]['placeLabel']).tolist()

# does nearest have date?
df['nearest_has_date'] = query_result.iloc[nearest_idx]['date'].apply(lambda d:d is not None).tolist()

# does nearest have operator?
df['nearest_has_operator'] = query_result.iloc[nearest_idx]['operator'].apply(lambda id:id is not None).tolist()

# does nearest have catalog code?
df['nearest_has_code'] = query_result.iloc[nearest_idx]['catalog_code'].apply(lambda id:id is not None).tolist()

# does nearest have water type?
df['nearest_has_water_type'] = query_result.iloc[nearest_idx]['water_supply_type'].apply(lambda id:id is not None).tolist()

### Decide on whether nearest fountain should be considered a match

In [18]:
# The nearest fountain is a match if: 
# - no further than x m away
# - 2nd nearest fountain at nearest least ratio_min further away than the nearest fountain
def validate_proposal(qid, d1, d2, dmax=10, ratio_min=0.5):
    
    if d1 == 0 or (d1<=dmax and d2/d1-1 >= ratio_min):
        return 'match'
    elif d1<=dmax and d2/d1-1 < ratio_min:
        return 'unclear'
    else:
        print("Q-# "+qid+"\td1 "+str(d1)+"\td2 "+str(d2))
        return 'no match'
    
for index, row in df.iterrows():
    df.loc[index, 'match_found'] = validate_proposal(
        row['nearest_qid'], 
        row['nearest_distance'], 
        row['2nd_nearest_distance'],
        dmax=15
    )
dffinal = df.drop(columns=['nearest_distance', '2nd_nearest_distance'])

Q-# Q3747399	d1 6050.595665472956	d2 6645.370926107748
Q-# Q3747399	d1 2927.877099271093	d2 3765.6616937942076
Q-# Q3747399	d1 2763.7088094028068	d2 3620.838025695476
Q-# Q3747399	d1 1729.133230091919	d2 2612.556528112138
Q-# Q3747439	d1 1739.1339154385607	d2 1895.7442430742722
Q-# Q3747439	d1 2060.63955208273	d2 2158.642702567936
Q-# Q7489878	d1 509.65950843993915	d2 683.2036843705455
Q-# Q74574057	d1 893.9491792650444	d2 1116.0454057674542
Q-# Q3747378	d1 1004.3358321320073	d2 1103.2445450044297
Q-# Q3747378	d1 762.6829404459017	d2 776.3531453657797
Q-# Q3747378	d1 772.1528659300075	d2 780.1462213885712
Q-# Q3747439	d1 465.26849091468034	d2 939.219047380278
Q-# Q3747439	d1 461.37868195931287	d2 934.2749619591492
Q-# Q3747439	d1 300.1468141026485	d2 960.1098778500419
Q-# Q3747443	d1 124.12821975323716	d2 528.8911228977391
Q-# Q3747443	d1 16.26333372481265	d2 522.412253708795
Q-# Q3747443	d1 343.14870038838325	d2 375.2712475047008
Q-# Q3747388	d1 3268.845123603653	d2 3498.396092621199


Q-# Q3747399	d1 4420.815886645879	d2 4846.889306660766
Q-# Q3747399	d1 4574.366745313418	d2 4959.513616984284
Q-# Q74574057	d1 5123.4671209685575	d2 5489.6134233006205
Q-# Q74574057	d1 4444.155921171953	d2 5010.608769876672
Q-# Q74574057	d1 3388.075953696482	d2 5087.382364698026
Q-# Q74574057	d1 8763.009508610654	d2 10428.71022265078
Q-# Q74574057	d1 8437.551694879958	d2 10100.57567460093
Q-# Q3747439	d1 669.3239064046525	d2 890.8431842396833
Q-# Q3747378	d1 1045.6289920078632	d2 1328.0307986935738
Q-# Q74574057	d1 2435.474624725843	d2 3686.7066305330754
Q-# Q74574057	d1 2359.879207255961	d2 3618.8089662764164
Q-# Q3747442	d1 123.31974617825855	d2 450.71096377943314
Q-# Q18379990	d1 571.5420803501823	d2 656.0994723464371
Q-# Q3747399	d1 1520.1772813560349	d2 1992.3129990775312
Q-# Q3747399	d1 2218.8696915997175	d2 2912.8665253609793
Q-# Q3747399	d1 2194.5320734455204	d2 2648.9763855943324
Q-# Q3747378	d1 922.4538541000771	d2 1002.8752709700132
Q-# Q3747378	d1 558.6488563439218	d2 1171.

Q-# Q74574057	d1 1928.5521794721744	d2 2243.925577750905
Q-# Q74574057	d1 2599.8759644367115	d2 3088.856153267219
Q-# Q3747444	d1 22.0489145596231	d2 147.37306902498113
Q-# Q3747388	d1 859.6760276632426	d2 938.0932796692828
Q-# Q3747378	d1 956.1164863193859	d2 1000.0539858522812
Q-# Q74574057	d1 758.5951049350039	d2 1117.3004846098686
Q-# Q18379990	d1 131.61929137196915	d2 136.84006912397604
Q-# Q3747429	d1 109.46134543490332	d2 350.9363129267426
Q-# Q74574057	d1 1310.401236182135	d2 2627.5996635683077
Q-# Q3747364	d1 380.56243171504826	d2 555.9963191310479
Q-# Q3747402	d1 1081.1459702695681	d2 1281.5059160368182
Q-# Q3747402	d1 2380.1439986039686	d2 3702.014450188625
Q-# Q3747402	d1 2299.9257633252414	d2 3591.0556269607887
Q-# Q3747402	d1 2124.6437404020003	d2 3701.9665246931277
Q-# Q3747402	d1 2763.5828272528797	d2 4200.634125874639
Q-# Q3747402	d1 2854.626340710208	d2 4245.56409829012
Q-# Q3747402	d1 4180.140822639084	d2 5620.284299482994
Q-# Q74574057	d1 1889.0046526765225	d2 3361.

Q-# Q12042914	d1 1949.322189178505	d2 2215.894117168742
Q-# Q3747402	d1 1406.731246086064	d2 1426.6119057663345
Q-# Q7489878	d1 709.3755856649527	d2 734.2125937233101
Q-# Q12042914	d1 2541.063187944867	d2 3004.2690743122653
Q-# Q12042914	d1 4505.024546283105	d2 4968.45804535188
Q-# Q29582887	d1 618.8242544100268	d2 628.5962952790434
Q-# Q1303855	d1 246.84141412292811	d2 391.644085873153
Q-# Q1303855	d1 1166.47683406544	d2 1264.6762080680164
Q-# Q1303855	d1 443.67236291389077	d2 584.4092337073696
Q-# Q12042914	d1 4720.676226021673	d2 4806.114472448308
Q-# Q12042914	d1 4784.094731054716	d2 4903.06081025441
Q-# Q12042914	d1 4851.129331412338	d2 4919.716921486064
Q-# Q12042914	d1 4622.457725286382	d2 4789.432349097709
Q-# Q12042914	d1 15260.41784753737	d2 15509.65483476182
Q-# Q12042914	d1 3041.6710318238675	d2 3402.8542446108877
Q-# Q12042914	d1 2801.775355983583	d2 3204.8792629102404
Q-# Q3747388	d1 850.9110501185522	d2 947.5321649281603
Q-# Q3747388	d1 741.9441716268927	d2 928.969294669

Q-# Q3747402	d1 5923.712292927162	d2 6495.374327062587
Q-# Q3747400	d1 241.71505654291184	d2 245.79172164908925
Q-# Q74574057	d1 565.0556672606425	d2 2038.3045359624086
Q-# Q3747370	d1 4808.54834426924	d2 5107.36796739834
Q-# Q3747370	d1 5201.074517131766	d2 5350.8003101771765
Q-# Q3747402	d1 5502.362493895327	d2 5567.883320398675
Q-# Q3747402	d1 6068.818687655079	d2 6325.4487154965445
Q-# Q3747402	d1 5775.485007735416	d2 5898.912421994513
Q-# Q3747402	d1 5658.062211364367	d2 5660.350316024066
Q-# Q3747402	d1 3262.2469557028485	d2 4627.20197392616
Q-# Q3747402	d1 2388.2444106200314	d2 3233.0278221046506
Q-# Q3747370	d1 4841.343769181312	d2 5299.1696310826555
Q-# Q3747370	d1 5452.598451202818	d2 5556.648346856904
Q-# Q3747370	d1 3801.5631016955626	d2 4089.7844848454097
Q-# Q3747402	d1 5766.4978743534375	d2 7013.724089589709
Q-# Q3747402	d1 6107.128500170488	d2 7247.657028408647
Q-# Q3747402	d1 9677.623121176854	d2 9997.467000607336
Q-# Q3747378	d1 474.2889577565114	d2 1107.4893212418817

Q-# Q3747378	d1 318.5432794249516	d2 754.0017589401331
Q-# Q74574057	d1 6899.890114822702	d2 8381.731446903046
Q-# Q29582887	d1 465.6818391763615	d2 495.1813949657336
Q-# Q1217302	d1 136.2256359632462	d2 381.3297443411073
Q-# Q1759793	d1 191.36330571701754	d2 232.11379432364834
Q-# Q5465141	d1 189.6232232785271	d2 211.98339903803736
Q-# Q3747364	d1 19.704049224414177	d2 359.7854786505623
Q-# Q3747399	d1 895.9141556356645	d2 1336.9045604243395
Q-# Q74574057	d1 1143.2119042277407	d2 2824.9596681326593
Q-# Q74574057	d1 4346.079947919572	d2 5973.77171537559
Q-# Q74574057	d1 4396.874900819361	d2 6022.079433046234
Q-# Q74574057	d1 1324.172236695328	d2 2815.1709976343063
Q-# Q3747402	d1 7555.520659913934	d2 8472.462435755357
Q-# Q74574057	d1 8304.151782817726	d2 9992.674853033817
Q-# Q12042914	d1 1925.8309690961944	d2 2084.132513133117
Q-# Q3747437	d1 186.73956512899963	d2 261.50092829767925
Q-# Q3747370	d1 1027.8440635752963	d2 3118.786654489706
Q-# Q3747402	d1 2095.2068617646146	d2 2713.788

Q-# Q74574057	d1 7818.096495919169	d2 9443.446805006874
Q-# Q74574057	d1 5350.310084686857	d2 6936.414551576518
Q-# Q74574057	d1 4371.356264962666	d2 5495.611825787394
Q-# Q5465137	d1 236.45200215328686	d2 424.3219671784401
Q-# Q74574057	d1 5930.475134623811	d2 6519.887402793731
Q-# Q74574057	d1 6060.376762074061	d2 6809.5426220379595
Q-# Q74574057	d1 6124.034609231968	d2 6998.278086195742
Q-# Q3747370	d1 5223.059316978547	d2 5443.432043462845
Q-# Q74574057	d1 2667.259979671513	d2 4351.310538696706
Q-# Q74574057	d1 1029.5426336805317	d2 2373.9521600929065
Q-# Q74574057	d1 6152.793155323108	d2 6459.745778215572
Q-# Q74574057	d1 6676.360697283486	d2 8372.922192462758
Q-# Q74574057	d1 3040.8640991544476	d2 4741.478711655458
Q-# Q3747399	d1 7750.277211645212	d2 8163.969790186759
Q-# Q74574057	d1 11301.27535890607	d2 12994.705539807379
Q-# Q74574057	d1 8402.778534123705	d2 10108.877597992367
Q-# Q74574057	d1 6444.365851756383	d2 8038.314395563418
Q-# Q74574057	d1 3709.957101532674	d2 4315.7

Q-# Q74574057	d1 2656.0843482632417	d2 3207.110856929606
Q-# Q74574057	d1 2668.264057545888	d2 3228.067687542395
Q-# Q74574057	d1 11352.268995159551	d2 13004.586700076838
Q-# Q7489878	d1 311.9310106959028	d2 522.4061205496319
Q-# Q3747399	d1 5793.662422994939	d2 6542.8390545692555
Q-# Q74574057	d1 5750.870115048296	d2 7333.549583550529
Q-# Q3747443	d1 215.16016634262104	d2 430.49310857302953
Q-# Q74574057	d1 1988.4118284465808	d2 3605.1131329623704
Q-# Q12042914	d1 32.83677331848073	d2 460.6982986509668
Q-# Q3747399	d1 615.6363543181733	d2 629.4861883917074
Q-# Q3747399	d1 4561.745920314292	d2 4738.3637013113375
Q-# Q74574057	d1 5097.703111728399	d2 5624.909895236947
Q-# Q1714672	d1 55.157163018139094	d2 172.61806185633733
Q-# Q3747402	d1 3891.063801895329	d2 5295.17491178399
Q-# Q3747400	d1 2794.7858281503045	d2 2812.346796869539
Q-# Q3747400	d1 2854.0208319142566	d2 2871.5364086613313
Q-# Q3747400	d1 2745.443884457385	d2 2763.0463514995417
Q-# Q1234446	d1 269.62520477778594	d2 321.47

In [19]:
len(dffinal)

1645

In [20]:
dffinal

Unnamed: 0,Y,X,amenity,fountain,created_by,wheelchair,flow,architect,description,drinking_water,...,artwork_type,subject:wikidata,drinking_water:legal,nearest_qid,nearest_has_label_de,nearest_has_date,nearest_has_operator,nearest_has_code,nearest_has_water_type,match_found
0,41.824508,12.485546,drinking_water,nasone,,,,,,,...,,,,Q3747399,True,False,False,False,False,no match
1,41.852621,12.478328,drinking_water,nasone,,,,,,,...,,,,Q3747399,True,False,False,False,False,no match
2,41.854102,12.476834,drinking_water,nasone,,,,,,,...,,,,Q3747399,True,False,False,False,False,no match
3,41.863749,12.478948,drinking_water,nasone,,,,,,,...,,,,Q3747399,True,False,False,False,False,no match
4,41.904287,12.513278,drinking_water,,JOSM,,,,,,...,,,,Q3747439,False,False,False,False,False,no match
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1640,41.913437,12.461680,drinking_water,nasone,,,,,,,...,,,,Q3747400,False,False,False,False,False,no match
1641,41.902114,12.590335,drinking_water,,,yes,,,,,...,,,,Q74574057,False,False,False,False,False,no match
1642,41.888007,12.461600,drinking_water,nasone,,,,,,,...,,,,Q1435809,True,False,False,False,False,no match
1643,41.903375,12.512719,drinking_water,,,,,,,,...,,,,Q3747439,False,False,False,False,False,no match


In [21]:
print("\n\nTotal number of rows "+str(len(dffinal)))



Total number of rows 1645


## Create Quickstatement commands from data
### Helper functions to format content according to Quickstatements v1 syntax

In [45]:
def process_coordinates(x, y):
    # format geographic coordinates
    return '@{1:1.8f}/{0:1.8f}'.format(x,y)


def process_year(date):
    # format date
    if np.isnan(date):
        return ''
    else:
        return '+{0:4d}-00-00T00:00:00Z/9'.format(int(date))

    
fountain_type_map = {
    'öffentlicher Brunnen': 'Q53628296',
    'Notwasserbrunnen': 'Q53628522',
    'privater Brunnen': 'Q53629707',
    'Brunnen in städtischer Liegenschaft': 'Q53628618',
    'Brunnen des Verschönerungsvereins': 'Q53628761',
    'Brunnen mit eigener Versorgung': 'Q53630002'
}

water_type_map = {
    'Verteilnetz': 'Q53633635',
    'Quellwasser': 'Q1881858',
    'eigene Versorgung': 'Q53634173',
    'Grundwasser': 'Q161598'
}

def process_fountain_type(type):
    # translate fountain types to wikidata values
    return fountain_type_map[type]


def process_water_type(type):
    # translate water types to wikidata values
    return water_type_map[type]


def process_label(text, dfltLabel):
    # process German language labels
    #print (text + ' '+dfltLabel)
    if text is None:
        return dfltLabel
    elif 'nan' == text.lower():
        return '"'+dfltLabel+'"'
    elif 'none' == text.lower():
        return '"'+dfltLabel+'"'.format(text)
    elif 'brunnen' in text.lower():
        return '"{}"'.format(text)
    elif 'fountain' in text.lower():
        return '"{}"'.format(text)
    else:
        return '"'+dfltLabel+' ({})"'.format(text)

    
def process_label_de(text):
    dfltLabel = 'Brunnen'
    return process_label(text, dfltLabel)
    
def process_label_en(text):
    dfltLabel = 'Fountain'
    return process_label(text, dfltLabel)
    

def createline(lines, item, prop, value, extra, qualifiers=[]):
    # general function to create Quickstatement v1 commands
    if value != '' and value != '""':
        statement = '{}\t{}\t{}'.format(item, prop, value)
        if len(qualifiers):
            # append qualifiers if applicable
            for q in qualifiers:
                statement += '\t{}\t{}'.format(q['prop'], q['value'])
        statement += extra
        statement += '\n'
        lines.append(statement)
    return lines

### Create statements, taking care not to overwrite existing data

In [46]:
# initialize command storage list
lines = []
statedId = "\tS248\tQ1224853"
i=0
for index, row in dffinal.iterrows():
    i+=1
    # either create new or edit existing entity
    if row['match_found'] == 'no match':
        # create a new fountain
        lines.append('CREATE\n')
        item = 'LAST'
    elif row['match_found'] == 'unclear':
        print('unclear match')
        print(row)
        continue
    elif row['match_found'] == 'match':
        # update existing fountain
        item = row['nearest_qid']
        
        
    # Add this basic information only if creating a new entity
    if item == 'LAST':
        # instance of  fountain
        lines = createline(lines, item, 'P31', 'Q483453',statedId)

        # coordinates
        lines = createline(lines, item, 'P625', process_coordinates(row['X'], row['Y']),statedId)
        
    # For other properties, add information if the entity is new or if property does not yet exist
    
    # label in german
    if item == 'LAST' or not row['nearest_has_label_de']:
        lines = createline(lines, item, 'Lde', process_label_de(str(row['name'])),statedId)

    # label in english
    #if item == 'LAST' or not row['nearest_has_label_en']:
    #    lines = createline(lines, item, 'Len', process_label_en(str(row['name'])),statedId)
        
    # creation date
    #if item == 'LAST' or not row['nearest_has_date']:
    #    lines = createline(lines, item, 'P571', process_year(row['date']))

    # operated by  Hamburg Wasser
    #print(str(i)+": "+str(row['operator']))
    if  pd.isna(row['operator']):
        print(str(i)+": "+"operator none")
    else:
         lines = createline(lines, item, 'P137', 'Q1572943',statedId)
            
    # catalog number can always be added (it is hard to check for)
    #lines = createline(lines, item, 'P528', '"{}"'.format(row['operator_id']), [{
    #    'prop': 'P972',
    #    'value': 'Q53629101'
    #}])

1: operator none
2: operator none
3: operator none
4: operator none
5: operator none
6: operator none
7: operator none
8: operator none
9: operator none
10: operator none
11: operator none
12: operator none
13: operator none
14: operator none
15: operator none
16: operator none
17: operator none
18: operator none
19: operator none
20: operator none
21: operator none
22: operator none
23: operator none
24: operator none
25: operator none
26: operator none
27: operator none
28: operator none
29: operator none
30: operator none
31: operator none
32: operator none
33: operator none
34: operator none
35: operator none
36: operator none
37: operator none
38: operator none
39: operator none
40: operator none
41: operator none
42: operator none
43: operator none
44: operator none
45: operator none
46: operator none
47: operator none
48: operator none
49: operator none
50: operator none
51: operator none
52: operator none
53: operator none
54: operator none
55: operator none
56: operator none
5

608: operator none
609: operator none
610: operator none
611: operator none
612: operator none
613: operator none
614: operator none
615: operator none
616: operator none
617: operator none
618: operator none
619: operator none
620: operator none
621: operator none
622: operator none
623: operator none
624: operator none
625: operator none
626: operator none
627: operator none
628: operator none
629: operator none
630: operator none
631: operator none
632: operator none
633: operator none
634: operator none
635: operator none
636: operator none
637: operator none
638: operator none
639: operator none
640: operator none
641: operator none
642: operator none
643: operator none
644: operator none
645: operator none
646: operator none
647: operator none
648: operator none
649: operator none
650: operator none
651: operator none
652: operator none
653: operator none
654: operator none
655: operator none
656: operator none
657: operator none
658: operator none
659: operator none
660: operato

1250: operator none
1251: operator none
1252: operator none
1253: operator none
1254: operator none
1255: operator none
1256: operator none
1257: operator none
1258: operator none
1259: operator none
1260: operator none
1261: operator none
1262: operator none
1263: operator none
1264: operator none
1265: operator none
1266: operator none
1267: operator none
1268: operator none
1269: operator none
1270: operator none
1271: operator none
1272: operator none
1273: operator none
1274: operator none
1275: operator none
1276: operator none
1277: operator none
1278: operator none
1279: operator none
1280: operator none
1281: operator none
1282: operator none
1283: operator none
1284: operator none
1285: operator none
1286: operator none
1287: operator none
1288: operator none
1289: operator none
1290: operator none
1291: operator none
1292: operator none
1293: operator none
1294: operator none
1295: operator none
1296: operator none
1297: operator none
1298: operator none
1299: operator none


# Write commands to file

In [47]:
quickStatFileName = "quickstatement_commands_Rome_fountain_"+dt.now().strftime(dtFmt)+".txt"
with io.open(quickStatFileName, "w", encoding='utf8') as f:
    f.writelines(lines)
print("wrote '"+quickStatFileName+"' with "+str(len(lines))+" lines")

wrote 'quickstatement_commands_Rome_fountain_191201_085501.txt' with 8191 lines


# Import into Wikidata
- Go to https://tools.wmflabs.org/wikidata-todo/quick_statements.php.
- Authenticate yourself with your Wikidata account.
- Copy and paste the contents of quickstatement_commands*.txt into the blank field, and run the commands

see ../20191030_1600_import.png

...
58. Processing Q72935495 (Q72935495 Lde "Brunnen (Seelöwe-Planschbecken )")
59. Processing Q72935495 (Q72935495 P137 Q27229237)

All done!.

In [15]:
# it may well take half an hour until it works https://query.wikidata.org/