# Pandas2QuickStatements

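The goal of these modules is to streamline checking whether water fountains from a data set already exist as items in Wikidata, and to generate QuickStatements commands for the fountains that need to be created or updated.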

In [1]:
from datetime import datetime as dt
# Timestamp pattern (yymmdd_HHMMSS); it is also appended to the names of the
# command files written further down in this notebook.
dtFmt = "%y%m%d_%H%M%S"
# Print the time at which this cell was run
print (dt.now().strftime(dtFmt))
import pandas as pd
import io
import numpy as np
from urllib.request import urlopen
import json
from math import *
from platform import python_version
# Print the interpreter version the notebook is running on
print("Python v "+python_version())
# See https://github.com/paulhoule/gastrodon/issues/7
# (presumably relevant to installing/importing gastrodon -- TODO confirm)
from gastrodon import RemoteEndpoint,QName,ttl,URIRef,inline
from matplotlib import pyplot

191124_192953
Python v 3.6.5


In [2]:
# Namespace prefixes handed to gastrodon together with every query.
# Not declared here (commented out in the original notebook):
#   @prefix wikibase: <http://wikiba.se/ontology#> .
# NOTE(review): the queries below also use wikibase:, bd:, geo:, p: and pq:;
# presumably the Wikidata query service predefines them -- confirm.
prefixes = inline("""
   @prefix wd: <http://www.wikidata.org/entity/> .
   @prefix wdt: <http://www.wikidata.org/prop/direct/> .
   @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
   @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
""").graph

# SPARQL endpoint object used for all Wikidata queries in this notebook
endpoint = RemoteEndpoint("http://query.wikidata.org/sparql", prefixes=prefixes)

In [3]:
# helper function to compute distances on the globe, returns distances in meters
def spherical_dist(pos1, pos2, r=6371000):
    """Return the great-circle distance between points on a sphere.

    The haversine formula is used. It is algebraically identical to the
    spherical law of cosines but stays accurate for points that are only
    centimetres apart, where the cosine of the tiny central angle rounds
    to exactly 1.0 and an arccos-based result collapses to 0.

    Args:
        pos1, pos2: array-likes whose last axis holds
            (latitude, longitude) in degrees. Leading axes are broadcast
            against each other, so shapes (n, 1, 2) and (m, 2) produce an
            (n, m) matrix of pairwise distances.
        r: radius of the sphere; the result has the same unit. The default
            is the mean Earth radius in meters.

    Returns:
        The distances as a NumPy array (a NumPy scalar for two single
        points).
    """
    pos1 = np.radians(np.asarray(pos1, dtype=float))
    pos2 = np.radians(np.asarray(pos2, dtype=float))
    lat1, lon1 = pos1[..., 0], pos1[..., 1]
    lat2, lon2 = pos2[..., 0], pos2[..., 1]

    sin_half_dlat = np.sin((lat1 - lat2) / 2)
    sin_half_dlon = np.sin((lon1 - lon2) / 2)
    hav = sin_half_dlat ** 2 + np.cos(lat1) * np.cos(lat2) * sin_half_dlon ** 2

    # Rounding can push the value marginally outside [0, 1] (e.g. for
    # antipodal points), which would turn the arcsin into NaN.
    hav = np.clip(hav, 0.0, 1.0)
    return 2 * r * np.arcsin(np.sqrt(hav))

In [4]:
# The nearest fountain is a match if:
# - it is no further than dmax meters away, and
# - the 2nd nearest fountain is at least ratio_min (as a fraction of the
#   nearest distance) further away than the nearest fountain
def validate_proposal(qid, d1, d2, dmax=10, ratio_min=0.5):
    """Classify the nearest candidate as 'match', 'unclear' or 'no match'.

    qid: identifier of the nearest candidate (not used in the decision).
    d1: distance to the nearest candidate.
    d2: distance to the second nearest candidate.
    dmax: largest distance at which the nearest candidate is still accepted.
    ratio_min: minimum value of d2/d1 - 1 for an unambiguous match.
    """
    # A candidate at exactly the same position always matches
    if d1 == 0:
        return 'match'

    if d1 <= dmax:
        # relative extra distance of the runner-up
        margin = d2 / d1 - 1
        if margin >= ratio_min:
            return 'match'
        if margin < ratio_min:
            return 'unclear'

    return 'no match'

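The incoming data frame should have the following columns:
    'X', 'Y' and 'name'
where the X and Y columns are the longitudinal and latitudinal coordinates respectively, and the name column is a unique identifier for the fountain (it could just be a number). The returned data frame will contain the same rows plus the columns 'nearest_qid', 'nearest_has_label_de', 'nearest_has_date', 'nearest_has_operator', 'nearest_has_code', 'nearest_has_water_type' and 'match_found' (one of 'match', 'unclear' or 'no match').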

In [5]:
def _duplicate_merge_lines(fountains):
    """Build QuickStatements MERGE commands for probable duplicate items.

    Two items count as duplicates when they share the same label and
    exactly the same coordinates. Each later item of such a group is merged
    with the first item of the group.

    Args:
        fountains: data frame with one row per Wikidata item and the
            columns 'place', 'placeLabel', 'X' and 'Y'. 'place' is assumed
            to be a prefixed QID such as "wd:Q42" (the first three
            characters are cut off).

    Returns:
        A list of tab-separated "MERGE <qid> <qid>" lines, each ending in a
        newline.
    """
    labels = fountains['placeLabel']
    candidates = fountains.loc[
        labels.duplicated(keep=False), ['place', 'placeLabel', 'X', 'Y']
    ].sort_values(by=['placeLabel', 'X', 'Y'])  # equal positions become adjacent

    merge_lines = []
    first_key = None
    first_place = ""
    for row in candidates.itertuples(index=False):
        key = (row.placeLabel, row.X, row.Y)
        if key == first_key:
            merge_lines.append("MERGE\t" + first_place[3:] + "\t" + row.place[3:] + "\n")
        else:
            first_key = key
            first_place = row.place
    return merge_lines


def identify_nearest_fountains(df, location):
    """Find, for every fountain in df, the nearest fountain known to Wikidata.

    Queries the module-level ``endpoint`` for all fountains / water wells
    inside the bounding box of df, writes a QuickStatements file with MERGE
    commands for duplicated Wikidata items to the working directory, and
    classifies the nearest Wikidata item of every row with
    ``validate_proposal`` (dmax=15).

    Args:
        df: data frame with the columns 'X' (longitude) and 'Y' (latitude)
            in degrees. Note that df is modified in place: the result
            columns (and two temporary distance columns) are added to it.
        location: text inserted into the name of the MERGE command file.

    Returns:
        A data frame with the rows of df plus the columns 'nearest_qid',
        'nearest_has_label_de', 'nearest_has_date', 'nearest_has_operator',
        'nearest_has_code', 'nearest_has_water_type' and 'match_found'.
    """
    # Find the geographic extent of the data
    buffer = 0.0003  # in degrees, corresponds to about 20-30 meters
    bounds = {
        'minX': df['X'].min() - buffer,
        'minY': df['Y'].min() - buffer,
        'maxX': df['X'].max() + buffer,
        'maxY': df['Y'].max() + buffer,
    }

    query_string = """ SELECT ?place ?placeLabel ?location ?date ?catalog_code ?catalogLabel ?operator ?water_supply_type
        WHERE
        {{
          # Enter coordinates
          SERVICE wikibase:box {{
            ?place wdt:P625 ?location .
            bd:serviceParam wikibase:cornerWest "Point({minX} {minY})"^^geo:wktLiteral.
            bd:serviceParam wikibase:cornerEast "Point({maxX} {maxY})"^^geo:wktLiteral.
          }} .
          # Is a water well or fountain or subclass of fountain
          FILTER (EXISTS {{ ?place wdt:P31/wdt:P279* wd:Q43483 }} || EXISTS {{ ?place wdt:P31/wdt:P279* wd:Q483453 }}).
          SERVICE wikibase:label {{
            bd:serviceParam wikibase:language "[AUTO_LANGUAGE],de" .
          }} 
          OPTIONAL {{ ?place p:P528 ?catalog_code.
                    ?catalog_code pq:P972 ?catalog.}}
          OPTIONAL {{ ?place wdt:P571 ?date.}}
          OPTIONAL {{ ?place wdt:P5623 ?water_supply_type}}
          OPTIONAL {{ ?place wdt:P137 ?operator.}}
        }}
          """.format(**bounds)

    # Perform query
    query_result = endpoint.select(query_string)

    merge_lines = []
    found_any = len(query_result) > 0
    if found_any:
        # Extract coordinates from the "Point(<X> <Y>)" literals
        query_result['X'] = query_result['location'].apply(
            lambda wkt: float(wkt.split('(')[1].split(' ')[0]))
        query_result['Y'] = query_result['location'].apply(
            lambda wkt: float(wkt.split(' ')[1].split(')')[0]))

        # An item with several values for an OPTIONAL property appears on
        # several rows. Keep one row per item *before* looking for
        # duplicates, otherwise an item would be merged with itself.
        query_result = query_result.drop_duplicates('place')
        merge_lines = _duplicate_merge_lines(query_result)

    quickStatDupliFileName = "quickstatement_commands_{}_drink_DUPLI_".format(location)+dt.now().strftime(dtFmt)+".txt"
    with io.open(quickStatDupliFileName, "w", encoding='utf8') as f:
        f.writelines(merge_lines)

    if not found_any:
        # Nothing in Wikidata for this area: every fountain is new
        df['nearest_qid'] = None
        for flag in ('nearest_has_label_de', 'nearest_has_date', 'nearest_has_operator',
                     'nearest_has_code', 'nearest_has_water_type'):
            df[flag] = False
        df['match_found'] = 'no match'
        return df

    # spherical_dist expects (latitude, longitude) pairs, i.e. (Y, X)
    distances = spherical_dist(df[['Y', 'X']].values[:, None], query_result[['Y', 'X']].values)

    # indexes of nearest fountains
    nearest_idx = np.argmin(distances, axis=1).tolist()
    nearest = query_result.iloc[nearest_idx]

    # QID of nearest fountains
    nearest_qids = nearest['place'].apply(lambda qname: qname[3:])
    df['nearest_qid'] = nearest_qids.tolist()

    # distance to nearest fountain
    df['nearest_distance'] = np.min(distances, axis=1).tolist()

    # then mask out the nearest one to find the distance to the second nearest
    distances[np.arange(len(nearest_idx)), nearest_idx] = np.inf
    df['2nd_nearest_distance'] = np.min(distances, axis=1).tolist()

    # Find out what information already exists for the nearest fountains

    # does nearest have label in german? (the label differs from the bare QID)
    df['nearest_has_label_de'] = (nearest_qids != nearest['placeLabel']).tolist()

    # notnull() treats both None and NaN as "missing"; which of the two the
    # endpoint returns for unbound variables is not visible here.
    df['nearest_has_date'] = nearest['date'].notnull().tolist()
    df['nearest_has_operator'] = nearest['operator'].notnull().tolist()
    df['nearest_has_code'] = nearest['catalog_code'].notnull().tolist()
    df['nearest_has_water_type'] = nearest['water_supply_type'].notnull().tolist()

    # Decide whether nearest fountain should be considered a match
    df['match_found'] = [
        validate_proposal(qid, d1, d2, dmax=15)
        for qid, d1, d2 in zip(df['nearest_qid'], df['nearest_distance'], df['2nd_nearest_distance'])
    ]

    return df.drop(columns=['nearest_distance', '2nd_nearest_distance'])


Create QuickStatements commands from the data. Below are some helper functions for formatting coordinates, dates and labels, and for assembling individual command lines.



In [6]:
def process_coordinates(x, y):
    """Format a position as '@<y>/<x>' with eight decimals each.

    Note the swapped order: y is written first, x second.
    """
    template = '@{1:1.8f}/{0:1.8f}'
    return template.format(x, y)


def process_year(date):
    """Format a year as a time value with year precision ('/9').

    Args:
        date: the year as a number (or anything int() accepts); None and
            NaN mean "unknown".

    Returns:
        '' for an unknown year, otherwise e.g. '+1856-00-00T00:00:00Z/9'.
        The year is signed and zero-padded to four digits, so 800 becomes
        '+0800' rather than '+ 800'.
    """
    if pd.isnull(date):
        return ''
    # width 5 = sign + four digits
    return '{0:+05d}-00-00T00:00:00Z/9'.format(int(date))
    
    
def process_label_de(text):
    """Turn a fountain name into a quoted German-language label.

    Args:
        text: the name; None, NaN and blank strings mean "no name".

    Returns:
        '' when there is no name; the quoted name when it already contains
        "brunnen" (case-insensitive); otherwise '"Brunnen (<name>)"'.
    """
    # pandas represents missing text as NaN, which has no .lower()
    if pd.isnull(text) or not text.strip():
        return ''
    if 'brunnen' in text.lower():
        return '"{}"'.format(text)
    return '"Brunnen ({})"'.format(text)
    

def createline(lines, item, prop, value, ref, qualifiers=None):
    """Append one Quickstatement v1 command to lines.

    Nothing is appended when value is '' or '""'.

    Args:
        lines: list of command strings; it is extended in place.
        item: the item the statement is about.
        prop: the property of the statement.
        value: the already formatted value.
        ref: text appended verbatim after the statement and its qualifiers;
            no separator is inserted, so it has to bring its own leading
            tab (presumably the reference columns -- verify with callers).
        qualifiers: optional iterable of dicts with the keys 'prop' and
            'value'.

    Returns:
        The same lines list.
    """
    if value == '' or value == '""':
        return lines

    fields = [str(item), str(prop), str(value)]
    for qualifier in qualifiers or ():
        fields.append(str(qualifier['prop']))
        fields.append(str(qualifier['value']))

    lines.append('\t'.join(fields) + ref + '\n')
    return lines

In [7]:
def write_query(df, location, ref):
    """Write a file of Quickstatement commands for the fountains in df.

    For every row whose 'match_found' is 'no match' a new item is created
    (instance of Q1630622 plus coordinates). Rows marked 'unclear' are
    printed and skipped. Rows marked 'match' currently produce no commands,
    because the property updates below are still commented out.

    Args:
        df: data frame with the columns 'match_found', 'nearest_qid', 'X'
            and 'Y'.
        location: text inserted into the output file name.
        ref: reference text handed to createline() for every statement.

    Returns:
        None. The file is written to the working directory and its name is
        printed.
    """
    lines = []

    for _, row in df.iterrows():
        status = row['match_found']

        if status == 'unclear':
            # ambiguous candidates are reported and left for manual review
            print('unclear match')
            print(row)
            continue

        # either create new or edit existing entity
        if status == 'no match':
            # create a new fountain; LAST refers to the item just created
            lines.append('CREATE\n')
            item = 'LAST'

            # Add this basic information only if creating a new entity
            # instance of drinking fountain
            lines = createline(lines, item, 'P31', 'Q1630622', ref)

            # coordinates
            lines = createline(lines, item, 'P625', process_coordinates(row['X'], row['Y']), ref)
        elif status == 'match':
            # update existing fountain
            item = row['nearest_qid']
        else:
            # Without this branch `item` would be unbound, or silently
            # reused from the previous row.
            print('skipping row with unexpected match_found value: {!r}'.format(status))
            print(row)
            continue

        # For other properties, add information if the entity is new or if property does not yet exist
        # NOTE: the templates below predate the `ref` parameter of
        # createline() and need it added before they can be re-enabled.

        # label in german
        #if item == 'LAST' or not row['nearest_has_label_de']:
        #    lines = createline(lines, item, 'Lde', process_label_de(row['label_de']))

        # creation date
        #if item == 'LAST' or not row['nearest_has_date']:
        #    lines = createline(lines, item, 'P571', process_year(row['date']))

        # operated by t.b.d. operator per location
        # lines = createline(lines, item, 'P137', 'Q72936279')

        # catalog number can always be added (it is hard to check for)
        #lines = createline(lines, item, 'P528', '"{}"'.format(row['operator_id']), [{
        #    'prop': 'P972',
        #    'value': 'Q53629101'
        #}])

    # Write the data
    quickStatFileName = "quickstatement_commands_{}_drink_".format(location)+dt.now().strftime(dtFmt)+".txt"
    with io.open(quickStatFileName, "w", encoding='utf8') as f:
        f.writelines(lines)
    print("wrote '"+quickStatFileName+"' with "+str(len(lines))+" lines")

    return