# Reconciliation of Place Names using OpenRefine

This notebooks reconciles place names in Linked Art JSON-LD files with identifiers defined in a name authority 
- the Getty Thesaurus of Geographic Names® Online (TGN) 
- http://www.getty.edu/research/tools/vocabularies/tgn

The input data is a collection of Linked Art JSON-LD files created with another notebook for the artist, John Ruskin.

In [64]:
try:
    import ipywidgets as widgets
except:
    !pip install ipywidgets
    import ipywidgets as widgets

from ipywidgets import Layout
from ipywidgets import FileUpload

try:
    import os
except:
    !pip install os
    import os

try:
    import IPython
except:
    !pip install IPython
    import IPython   
    
from IPython.display import display, IFrame, HTML

try:
    import xmltodict
except:
    !pip install xmltodict
    import xmltodict

try:
    import json
except:
    !pip install json
    import json 
    
    
try:
    import requests
except:
    !pip install requests
    import requests

import csv

try:
    import cromulent 
except:
    !pip install cromulent
    import cromulent
    
from cromulent.model import factory

from cromulent.model import factory, Actor, Production, BeginningOfExistence, EndOfExistence, TimeSpan, Place
from cromulent.model import InformationObject, Phase, VisualItem 
from cromulent.vocab import Painting, Drawing,Miniature,add_art_setter, PrimaryName, Name, CollectionSet, instances, Sculpture 
from cromulent.vocab import aat_culture_mapping, AccessionNumber, Height, Width, SupportPart, Gallery, MuseumPlace 
from cromulent.vocab import BottomPart, Description, RightsStatement, MuseumOrg, Purchase
from cromulent.vocab import Furniture, Mosaic, Photograph, Coin, Vessel, Graphic, Enamel, Embroidery, PhotographPrint
from cromulent.vocab import PhotographAlbum, PhotographBook, PhotographColor, PhotographBW, Negative, Map, Clothing, Furniture
from cromulent.vocab import Sample, Architecture, Armor, Book, DecArts, Implement, Jewelry, Manuscript, SiteInstallation, Text, Print
from cromulent.vocab import TimeBasedMedia, Page, Folio, Folder, Box, Envelope, Binder, Case, FlatfileCabinet
from cromulent.vocab import HumanMadeObject,Tapestry,LocalNumber
from cromulent.vocab import Type,Set
from cromulent.vocab import TimeSpan, Actor, Group, Acquisition, Place
from cromulent.vocab import Production, TimeSpan, Actor
from cromulent.vocab import LinguisticObject,DigitalObject, DigitalService

from cromulent import reader




from lib import linkedart as la


try:
    import pandas as pd
except:
    !pip install pandas
    import pandas as pd
    
import requests


## Create CSV file from Linked Art JSON-LD

- Iterate through the Linked Art JSON-LD files
- Get artwork `id` and `_label` properties
- Save properties to CSV file


In [65]:
#vars 
# Linked Art JSON-LD file location
file_dir = "./data/ruskin/output/json/"

# output CSV file
csv_file = "./data/ruskin/ruskin-places.csv"

In [66]:
titles = []
file_list=os.listdir(file_dir)

# iterate Linked Art JSON-LD files
for file in file_list:
    # read file and append to 
    with open( file_dir + file) as json_file:
        artwork = json.load(json_file)
        titles.append({"id":artwork["id"], "place" : artwork["_label"], "place_modified": " ", "coords": " "})

# create CSV file
with open(csv_file, 'w') as f:  
    w = csv.DictWriter(f, ["id","place","place_modified","coords"])
    w.writeheader()
    w.writerows(titles)
     
# display CSV file
df = pd.read_csv(csv_file,low_memory=False)
display(df)

Unnamed: 0,id,place,place_modified,coords
0,https://www.harvardartmuseums.org/collections/...,Study of a Venetian Capital,,
1,https://www.harvardartmuseums.org/collections/...,"Tom Tower, Christ Church, Oxford",,
2,https://www.harvardartmuseums.org/collections/...,Study of a Venetian Capital,,
3,https://www.tate.org.uk/art/artworks/13033,View of Bologna,,
4,https://www.harvardartmuseums.org/collections/...,Fragment of the Alps,,
...,...,...,...,...
80,https://www.harvardartmuseums.org/collections/...,Withered Rush-blossom (Exercise in Lamp-Black),,
81,https://www.harvardartmuseums.org/collections/...,Study of a Magnified Pheasant's Feather,,
82,https://www.nga.gov/collection/72870,The Garden of San Miniato near Florence,,
83,https://www.harvardartmuseums.org/collections/...,Part of a Sketch of the Northwest Porch of St....,,


## Parse Place to Extract Potential Place Names

In [67]:
df = pd.read_csv(csv_file,low_memory=False)

places2 = ["Florence","Bologna","Lucca","Alps","Oxford","Rome", 
           "Venice","Fribourg","Neuchâtel","Sestri","Visp","Chamonix",
           "Abbeville","Schaffhausen","Verona","Vorarlberg","Baden","Schaffhausen","Faido","Normandy","Genève"
          
          ]
places = {"Venezia":["Venice","Venetian","St Mark","St. Mark"],
         }

for index,row in df.iterrows():
    # check if any value in places2 is present in rowp
    for place in places2:
        if place in row["place"]:
            df.at[index,"place_modified"] = place
    
    for place in places["Venezia"]:
        if place in row["place"]:
            df.at[index,"place_modified"] = "Venezia"

# remove records where place_modified is blank
df = df[df.place_modified != " "]
df.to_csv(csv_file, index=False) 

df = pd.read_csv(csv_file,low_memory=False)
display(df)

Unnamed: 0,id,place,place_modified,coords
0,https://www.harvardartmuseums.org/collections/...,Study of a Venetian Capital,Venezia,
1,https://www.harvardartmuseums.org/collections/...,"Tom Tower, Christ Church, Oxford",Oxford,
2,https://www.harvardartmuseums.org/collections/...,Study of a Venetian Capital,Venezia,
3,https://www.tate.org.uk/art/artworks/13033,View of Bologna,Bologna,
4,https://www.harvardartmuseums.org/collections/...,Fragment of the Alps,Alps,
5,https://www.harvardartmuseums.org/collections/...,Looking down from Florence towards Lucca,Lucca,
6,https://www.harvardartmuseums.org/collections/...,"Entrance to Feldkirch, the Vorarlberg",Vorarlberg,
7,https://www.harvardartmuseums.org/collections/...,Sketch of Verona,Verona,
8,https://www.harvardartmuseums.org/collections/...,Falls of Schaffhausen,Schaffhausen,
9,https://www.harvardartmuseums.org/collections/...,Towers at Baden,Baden,


## Use OpenRefine to Reconcile Place Names

https://openrefine.org

    "OpenRefine (previously Google Refine) is a powerful tool for working with messy data: cleaning it; 
    transforming it from one format into another; and extending it with web services and external data."

*Method*
    
- Install OpenRefine
- Open OpenRefine in browser
- Create project
- Upload places CSV file
- Review data

    
<img src="docs/media/img/openrefine.png"/>

### Reconcile data on `place_modified` column

- Right-click on `place_modified` column header
- Select `Start reconciling`

<img src="docs/media/img/openrefine2.png"/>

### Choose a service to reconcile data with

- Choose a service to reconcile data with from reconciliation services known to Wikidata - https://reconciliation-api.github.io/testbench/
- Getty vocab services was chosen due to the Getty Thesaurus of Geographic Names® Online (TGN) that's included https://www.getty.edu/research/tools/vocabularies/tgn/

<img src="docs/media/img/reconcileserv.png"/>

### Review reconciliation search results

- review reconciliation search results and select relevant match if found
- Create new column to hold the tgn identifiers

<img src="docs/media/img/tgncol.png"/>

### Manual reconciliation

- Some additional manual reconciliation was required using the TGN search form at http://www.getty.edu/research/tools/vocabularies/tgn
    
<img src="docs/media/img/tgn.png"/>
    

## Export CSV file from OpenRefine and review

- export CSV file from OpenRefine with column new containing TGN identifiers
- save as [data/ruskin/ruskin-places-rec.csv](data/ruskin/ruskin-places-rec.csv) `
- remove lines that do not have entry in tgn column and save file


In [68]:
file = "data/ruskin/ruskin-places-rec.csv" 

df = pd.read_csv(file,low_memory=False)
df = df[df.place_modified != "Sestri"]
df.to_csv(file, index=False) 

df = pd.read_csv(file,low_memory=False)
display(df)

Unnamed: 0,id,place,place_modified,tgn,coords
0,https://www.harvardartmuseums.org/collections/...,Study of a Venetian Capital,Venice,tgn/7018159,
1,https://www.harvardartmuseums.org/collections/...,"Tom Tower, Christ Church, Oxford",Oxford,tgn/7011931,
2,https://www.harvardartmuseums.org/collections/...,Study of a Venetian Capital,Venice,tgn/7018159,
3,https://www.tate.org.uk/art/artworks/13033,View of Bologna,Bologna,tgn/7003127,
4,https://www.harvardartmuseums.org/collections/...,Fragment of the Alps,Alps,tgn/7007746,
5,https://www.harvardartmuseums.org/collections/...,Looking down from Florence towards Lucca,Lucca,tgn/7003165,
6,https://www.harvardartmuseums.org/collections/...,"Entrance to Feldkirch, the Vorarlberg",Vorarlberg,tgn/7018001,
7,https://www.harvardartmuseums.org/collections/...,Sketch of Verona,Verona,tgn/7003262,
8,https://www.harvardartmuseums.org/collections/...,Falls of Schaffhausen,Schaffhausen,tgn/7106739,
9,https://www.harvardartmuseums.org/collections/...,Towers at Baden,Baden,tgn/8707496,


## Get Geocoordinates for TGN identifiers and add to CSV file

- Get geocoordinates for TGN identifiers
- request JSON file from http://vocab.getty.edu/tgn/ using TGN identifier
- add to CSV file

In [69]:
file = "./data/ruskin/ruskin-places-rec.csv" 
filecoord = "./data/ruskin/ruskin-places-rec-coords.csv" 

df = pd.read_csv(file,low_memory=False)
df['coords'] = df['coords'].astype(str)

for index,row in df.iterrows():  
    gid = row["tgn"]
    if "tgn" in str(gid):
        infof = "http://vocab.getty.edu/tgn/" + gid.split("tgn/",1)[1] +"-place.json"
        response = requests.get(infof)
        json_data = response.json()
        for prop in json_data:
            lat= json_data[prop]["http://www.w3.org/2003/01/geo/wgs84_pos#lat"][0]["value"]
            lng = json_data[prop]["http://www.w3.org/2003/01/geo/wgs84_pos#long"][0]["value"]
            latlng = str(lat) + "," + str(lng)
            df.at[index, "coords"] =  latlng
            print(gid + " " + latlng)
            
# save coords to file
df.to_csv(filecoord, index=False)


tgn/7018159 45.438611,12.326667
tgn/7011931 51.75,-1.25
tgn/7018159 45.438611,12.326667
tgn/7003127 44.466667,11.433333
tgn/7007746 46.416667,10
tgn/7003165 44.033333,10.45
tgn/7018001 47.25,9.9167
tgn/7003262 45.45,11
tgn/7106739 48.766667,10.633333
tgn/8707496 47.452702,8.309969
tgn/7010587 50.106602,1.832691
tgn/7106739 48.766667,10.633333
tgn/1032562 45.924308,6.867316
tgn/7018159 45.438611,12.326667
tgn/1064047 46.479417,8.797659
tgn/7007494 46.294803,7.880048
tgn/1064047 46.479417,8.797659
tgn/7018159 45.438611,12.326667
tgn/7018159 45.438611,12.326667
tgn/1064047 46.479417,8.797659
tgn/7018159 45.438611,12.326667
tgn/7003746 46.196732,6.110443
tgn/7003751 46.990867,6.797675
tgn/7018159 45.438611,12.326667
tgn/7018159 45.438611,12.326667
tgn/7018159 45.438611,12.326667
tgn/7018159 45.438611,12.326667
tgn/7106739 48.766667,10.633333
tgn/7007278 46.79572,7.154748
tgn/7018159 45.438611,12.326667
tgn/7018159 45.438611,12.326667
tgn/7002886 49,0
tgn/7018159 45.438611,12.326667
tgn/700

In [81]:
df = pd.read_csv(filecoord,low_memory=False)

# drop rows that do not have coords value
df.dropna(subset=['coords'])  # drop rows that have nan     
# sort records by coords
df.sort_values(by=['coords'])

# save coords to file
df.to_csv(filecoord, index=False)

display(df)

Unnamed: 0,id,place,place_modified,tgn,coords
0,https://www.harvardartmuseums.org/collections/...,Study of a Venetian Capital,Venice,tgn/7018159,"45.438611,12.326667"
1,https://www.harvardartmuseums.org/collections/...,"Tom Tower, Christ Church, Oxford",Oxford,tgn/7011931,"51.75,-1.25"
2,https://www.harvardartmuseums.org/collections/...,Study of a Venetian Capital,Venice,tgn/7018159,"45.438611,12.326667"
3,https://www.tate.org.uk/art/artworks/13033,View of Bologna,Bologna,tgn/7003127,"44.466667,11.433333"
4,https://www.harvardartmuseums.org/collections/...,Fragment of the Alps,Alps,tgn/7007746,"46.416667,10"
5,https://www.harvardartmuseums.org/collections/...,Looking down from Florence towards Lucca,Lucca,tgn/7003165,"44.033333,10.45"
6,https://www.harvardartmuseums.org/collections/...,"Entrance to Feldkirch, the Vorarlberg",Vorarlberg,tgn/7018001,"47.25,9.9167"
7,https://www.harvardartmuseums.org/collections/...,Sketch of Verona,Verona,tgn/7003262,"45.45,11"
8,https://www.harvardartmuseums.org/collections/...,Falls of Schaffhausen,Schaffhausen,tgn/7106739,"48.766667,10.633333"
9,https://www.harvardartmuseums.org/collections/...,Towers at Baden,Baden,tgn/8707496,"47.452702,8.309969"


## Incorporate Place Name and Coordinates into Linked Art JSON-LD Files

The next step is to update the Linked Art JSON-LD files with the place name and coordinate information.
- relevant parts of the Linked Art model are 
 - depiction
 - geospatial approximation
 - depiction of place with approximate location

-----

*Further information*

https://linked.art/model/object/aboutness/#depiction

https://linked.art/model/place/#geospatial-approximation


### Depiction

Many sorts of artwork depict things that can be pointed out in the artwork. These could be identifiable entities, such as a known Person or Object with a name or identifier, or unidentifiable (perhaps fictional) instances of a class of entity, such as a depiction of a battle but not any particular battle. For example a portrait depicts the person sitting for it, or a sketch of a generic landscape depicts a place even if it's not a particular, known location. The depiction pattern describes what is in the artwork's image.

This is modeled using the represents property on the VisualItem, which refers to the entity that is being depicted.

`{
  "@context": "https://linked.art/ns/v1/linked-art.json",
  "id": "https://linked.art/example/object/34",
  "type": "HumanMadeObject",
  "_label": "Self Portrait",
    "shows": [
    {
      "type": "VisualItem",
      "represents": [
        {
          "type": "Place",
          "_label": "Artist"
        }
      ]
    }
  ]}`

### Geospatial approximation

All recorded locations are approximate to some degree. It may be desirable to capture this approximation separately from the actual place, especially when that approximation is very uncertain. Especially if the place is the exact location of several events, and perhaps an address or other information is known, but not the exact geospatial coordinates.

Secondly, as a place is defined by exactly one definition, but there might be multiple approximations such as a polygon as well as the central point, the real place that an activity occured at can be related to multiple approximate places to capture these different approximations.

`{
  "@context": "https://linked.art/ns/v1/linked-art.json",
  "id": "https://linked.art/example/place/4",
  "type": "Place",
  "_label": "True Auction House Location",
  "approximated_by": [
    {
      "type": "Place",
      "_label": "Auction House Location Approximation",
      "defined_by": "POINT(-0.0032937526703165 51.515107154846)"
    }
  ]
}`


### Depiction of place with approximate location

`{
  "@context": "https://linked.art/ns/v1/linked-art.json",
  "id": "https://linked.art/example/object/34",
  "type": "HumanMadeObject",
  "_label": "geographical place name",
    "shows": [
    {
      "type": "VisualItem",
      "represents": [
        {
          "type": "Place",
          "_label": "Lucca",
          "approximated_by": [
                {
                  "type": "Place",
                  "_label": "Lucca - Location Approximation",
                  "defined_by": "POINT(-0.0032937526703165 51.515107154846)"
                }
              ]
        }
      ]
    }
  ]}`


### Update Linked Art JSON-LD files

In [92]:
ruskindir = "data/ruskin/output/json"
storyvisdir = "data/ruskin/storyvis/json"

# open file containing reconciled data with geo coordinates
df = pd.read_csv(filecoord,low_memory=False)

file_list=os.listdir(ruskindir)

cnt=1
    
for file in file_list:
    with open( ruskindir + "/" + file) as json_file:
        artwork = json.load(json_file)
        if artwork["id"] in df["id"].tolist():
            rows = df.loc[df['id'] == artwork["id"]]
            row=rows.iloc[0]
            
            pl = row["place"]
            coords = row["coords"]
            coords = coords.replace(",", " ")
           
            cnt = cnt+1
 
            approx_place = Place()
            approx_place._label = pl
            approx_place.defined_by = "POINT(" + coords + ")"
            
            place = Place()
            place._label = pl
            place.approximated_by = approx_place
            
            vi = VisualItem()
            vi.represents = place
            artwork["shows"] = factory.toJSON(vi)
            text_file = open(storyvisdir + "/" + str(cnt) + ".json", "wt")
            n = text_file.write(json.dumps(artwork,indent=2))
            text_file.close()
              