In [1]:
import csv
import json
import yaml
from SPARQLWrapper import SPARQLWrapper, JSON

Path to JSON file containing classifications, created with [RunwayML](https://runwayml.com/)

In [2]:
# Annotation file created with RunwayML; its category definitions are reused in the output
inputFileRunway = '../data/Annotation Group - Images and Colour Bars for BSO Examples.json'
# Destination of the converted manual annotations (written in the last cell of the notebook)
outputFile = '../data/manualAnnotations.json'

Path to the YAML configuration file of the pipeline

In [3]:
# Pipeline configuration (YAML); the SPARQL endpoint and the data CSV path are read from it
configFile = '../pipeline/config.yml'

Load image categories

In [4]:
# Read the RunwayML annotation file. JSON is UTF-8 encoded, so decode it
# explicitly instead of relying on the platform's default encoding
# (the result is written back out as UTF-8 at the end of the notebook).
with open(inputFileRunway, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [5]:
# Pick the first category named 'Image'; its id is attached to every converted region.
imageCategory = next(
    (category for category in data['categories'] if category['name'] == 'Image'),
    None
)
if imageCategory is None:
    # Same exception type as indexing an empty list, but with an actionable message
    raise IndexError("No category named 'Image' found in " + inputFileRunway)

We reuse some of the configuration from the pipeline, specifically the SPARQL endpoint and the path to the data CSV.

In [6]:
# Load the pipeline configuration (YAML) into `config`.
try:
    with open(configFile, 'r') as f:
        config = yaml.safe_load(f)
except Exception as e:
    # Catch Exception rather than everything so that KeyboardInterrupt and
    # SystemExit still propagate, and chain the original error so that the
    # actual cause (missing file, invalid YAML, ...) stays visible.
    raise Exception("Could not load config file at " + configFile) from e

### Define helpers

In [7]:
def sparqlResultToDict(results):
    """Flatten a SPARQL JSON result into a list of plain dictionaries.

    Each binding in results["results"]["bindings"] becomes one dictionary
    keyed by the variable names listed in results["head"]["vars"]. The
    value is the binding's "value" entry, or None when the variable is
    not bound in that row.
    """
    return [
        {
            name: (binding[name]["value"] if name in binding else None)
            for name in results["head"]["vars"]
        }
        for binding in results["results"]["bindings"]
    ]

## Retrieve manually defined regions from SPARQL endpoint

The automatically created image regions contain the statement `?region crm:P33_used_specific_technique <https://github.com/swiss-art-research-net/bso-image-segmentation>`. We can discriminate between manually and automatically created regions by looking at regions where this statement is absent, but the statement `?region crm:P2_has_type <https://resource.swissartresearch.net/type/imageRegion>` (in the named graph `<https://platform.swissartresearch.net/imageRegions>`) is present.


In [8]:
# SPARQL query for the manually defined image regions: areas typed as
# type:imageRegion that carry a bounding box and do NOT have the
# P33_used_specific_technique statement pointing to the bso-image-segmentation
# repository. For each region it returns the image, the artwork carrying that
# image, the artwork id (the part of the artwork URI after '/artwork/') and the
# coordinates (the part of the bounding box string after 'xywh=').
query = """
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX crmdig: <http://www.ics.forth.gr/isl/CRMdig/>
PREFIX la: <https://linked.art/ns/terms/>
PREFIX type: <https://resource.swissartresearch.net/type/>
PREFIX rso: <http://www.researchspace.org/ontology/>
SELECT ?region ?coordinates ?image ?artwork ?id WHERE {
    ?region a crmdig:D35_Area ;
        crm:P2_has_type type:imageRegion ;
        crmdig:L49_is_primary_area_of ?image ;
        rso:boundingBox ?bbox .
    ?artwork crm:P128_carries/la:digitally_shown_by/la:digitally_available_via/la:access_point ?image .
    FILTER NOT EXISTS {
        ?region crm:P33_used_specific_technique <https://github.com/swiss-art-research-net/bso-image-segmentation> .
    }
    BIND(STRAFTER(STR(?artwork), '/artwork/') as ?id)
    BIND(STRAFTER(?bbox, 'xywh=') as ?coordinates)
}
"""

In [9]:
# Run the query against the endpoint configured for the pipeline and flatten
# the JSON result into a list of dictionaries (one per region).
sparql = SPARQLWrapper(config['endpoint'], returnFormat=JSON)
sparql.setQuery(query)
try:
    ret = sparql.query().convert()
except Exception as e:
    # Catch Exception rather than everything so that an interrupted kernel
    # (KeyboardInterrupt) is not reported as a failed query, and chain the
    # original error so the underlying cause stays visible.
    raise Exception("Could not execute query against endpoint " + str(config['endpoint'])) from e
manuallyDefinedRegions = sparqlResultToDict(ret)

In [10]:
# Report how many manually defined regions the query returned
print(f"Found {len(manuallyDefinedRegions)} manually defined regions")

Found 248 manually defined regions


## Get image sizes

The training data is defined using relative coordinates. In order to calculate the relative coordinates of the regions from the absolute ones stored in the database, we need to know the original sizes of the images. As part of the data pipeline, image sizes are retrieved from the IIIF manifests and stored in the data CSV. We can hence retrieve the sizes from the CSV where available. Where they are not available, the sizes would have to be obtained by querying the IIIF manifest via the URL of the image; note that the code below only uses the sizes stored in the CSV.

In [11]:
# Read the pipeline's data CSV into a list of rows and build an index from
# the 'id' column to the row position for fast lookup by artwork id.
pipelineData = []
pipelineDataIndex = {}
# newline='' is what the csv module expects for file objects; without it,
# newlines embedded in quoted fields are not interpreted correctly.
with open(config['dataFile'], 'r', newline='') as f:
    for i, row in enumerate(csv.DictReader(f)):
        pipelineData.append(row)
        # If an id occurs more than once, the last row with that id wins
        pipelineDataIndex[row['id']] = i

In [12]:
# Attach the image size from the pipeline CSV to every region whose artwork id
# is listed there. Rows with a blank size are treated as "size not available"
# instead of failing on float(''), and regions left without a size are reported.
regionsWithoutSize = []
for region in manuallyDefinedRegions:
    rowIndex = pipelineDataIndex.get(region['id'])
    d = pipelineData[rowIndex] if rowIndex is not None else {}
    width = d.get('width')
    height = d.get('height')
    if width and height:
        region['width'] = float(width)
        region['height'] = float(height)
    else:
        regionsWithoutSize.append(region['region'])

if regionsWithoutSize:
    print("No image size available for", len(regionsWithoutSize), "regions:", regionsWithoutSize)

## Convert regions to RunwayML format

In [13]:
# Convert the absolute xywh coordinates of every region into coordinates
# relative to the image size and group the resulting bounding boxes by file.
files = {}
for region in manuallyDefinedRegions:
    if 'width' not in region or 'height' not in region:
        # Without the image size the relative coordinates cannot be computed;
        # fail with the affected region instead of an opaque KeyError: 'width'
        raise KeyError("No image size known for region " + str(region['region']))
    filename = region['id'] + '.jpg'
    x, y, w, h = (float(value) for value in region['coordinates'].split(','))
    annotation = {
        "type": "BOUNDING_BOX",
        "categoryId": imageCategory['id'],
        # x and w are scaled by the image width, y and h by the image height
        "boundingBox": [
            x / region['width'],
            y / region['height'],
            w / region['width'],
            h / region['height']
        ]
    }
    # An image can contain several regions: collect them all rather than
    # overwriting the previous one, but do not add the same box twice.
    annotations = files.setdefault(filename, [])
    if annotation not in annotations:
        annotations.append(annotation)

In [14]:
# Assemble the output document: the category definitions are taken over
# unchanged from the input file, the annotations are the converted regions.
output = dict(categories=data['categories'], files=files)

In [15]:
# Write the annotations as indented UTF-8 JSON, keeping non-ASCII characters unescaped
with open(outputFile, mode='w', encoding='utf-8') as outputHandle:
    json.dump(output, outputHandle, indent=4, ensure_ascii=False)