In [1]:
import csv
import json
import time
import urllib.request
import uuid
from datetime import datetime, timezone
from os import path
from pathlib import Path
from string import Template

import cv2
import requests
import yaml
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm

In [2]:
# Location of the YAML configuration file; a relative path is resolved against
# the working directory of the kernel.
configFile = '../pipeline/config.yml'

### Load Configuration

In [3]:
# Load the YAML configuration; later cells read their settings from `config`.
try:
    with open(configFile, 'r') as f:
        config = yaml.safe_load(f)
except Exception as e:
    # Catch ordinary errors only (not KeyboardInterrupt/SystemExit) and chain the
    # original exception so the real cause stays visible in the traceback.
    raise Exception("Could not load config file at", configFile) from e

### Define helpers

In [4]:
# Constants
# NOTE(review): presumably mode flags for a data source (SPARQL endpoint vs. CSV
# file); neither name is referenced by any code visible in this notebook.
SPARQL = 0
CSV = 1

def sparqlResultToDict(results):
    """Flatten a SPARQL JSON result into a list of plain dictionaries.

    Every entry of results["results"]["bindings"] becomes one dictionary keyed
    by the variable names listed in results["head"]["vars"]. A variable that is
    not bound in a given entry is mapped to None.
    """
    return [
        {
            variable: (binding[variable]["value"] if variable in binding else None)
            for variable in results["head"]["vars"]
        }
        for binding in results["results"]["bindings"]
    ]

## Step 1: Get input data

Read data from input file. This assumes that it is already fully populated with the necessary information (image sizes) as well as document coordinates.

In [5]:
# Collects one dictionary per row of the input CSV; filled by the following cell.
inputData = []

In [6]:
# Append every row of the configured data file to inputData.
# newline='' is the mode the csv module asks for when reading from a file object.
try:
    with open(config['dataFile'], 'r', newline='') as f:
        for row in csv.DictReader(f):
            inputData.append({
                "id": row['id'],
                "image": row['image'],
                "width": row['width'],
                "height": row['height'],
                # Optional column: None when the file has no such column
                "documentCoordinates": row.get('documentCoordinates')
            })
except FileNotFoundError:
    # Only a missing file is reported this way; any other failure (for example a
    # missing required column) now surfaces instead of being mislabelled.
    print("No prior input file found")


In [7]:
# Name used by the remaining steps. This binds a second name to the same list
# object (no copy), so changes made through `data` also show up in `inputData`.
data = inputData

## Step 2: Get problematic images

Query for the images that likely have been assigned the wrong region

In [8]:
# SPARQL query that counts, per image carrying a segmentation region, the similarity
# classifications attached to it and keeps the images above a threshold. It returns
# ?id, ?image and the region's xywh string (as ?documentCoordinates), ordered by the
# classification count in descending order.
# NOTE(review): the comment embedded in the query speaks of "more than 99" similar
# images, while the FILTER uses a threshold of 15 — confirm which value is intended.
problematicImagesQuery = """

# Select images that have been assigned more than 99 similar images, indicating that the similarity has 
# been assigned based on the colour bar

PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX la: <https://linked.art/ns/terms/>
PREFIX search: <https://platform.swissartresearch.net/search/>
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX crmdig: <http://www.ics.forth.gr/isl/CRMdig/>
PREFIX rso: <http://www.researchspace.org/ontology/>

SELECT ?id ?image (?coordinates as ?documentCoordinates) WHERE {
    {
        SELECT DISTINCT ?image ?subject ?coordinates(COUNT(?classification) as ?num_classification) WHERE {
            ?subject crm:P128_carries/la:digitally_shown_by/la:digitally_available_via/la:access_point ?image .
            ?region crmdig:L49_is_primary_area_of ?image ;
              crm:P2_has_type <https://resource.swissartresearch.net/type/imageRegion> ;
              crm:P33_used_specific_technique <https://github.com/swiss-art-research-net/bso-image-segmentation> ;
              rso:boundingBox ?bbox .
            ?classification crm:P140_assigned_attribute_to ?image ;
                crm:P33_used_specific_technique <https://github.com/swiss-art-research-net/bso-image-similarity> .
            BIND(STRAFTER(?bbox, "xywh=") as ?coordinates)
        } 
        GROUP BY ?image ?subject ?coordinates
    }
    FILTER(?num_classification > 15)
      ?subject a search:Object ;
        crm:P128_carries/la:digitally_shown_by/la:digitally_available_via ?iiif . 
      ?iiif dcterms:conformsTo <http://iiif.io/api/image> ;
            la:access_point ?image .
      BIND(STRAFTER(STR(?subject), "https://resource.swissartresearch.net/artwork/") as ?id)
    
} ORDER BY DESC(?num_classification) ?subject
"""

In [9]:
# Client for the endpoint named in the configuration, requesting JSON results
# (the structure sparqlResultToDict reads), loaded with the query defined above.
sparql = SPARQLWrapper(config['endpoint'], returnFormat=JSON)
sparql.setQuery(problematicImagesQuery)

In [10]:
# Execute the query. A failure is re-raised with the endpoint named; the original
# exception is chained so that the actual cause (network, syntax, ...) is not lost.
try:
    ret = sparql.query().convert()
except Exception as e:
    raise Exception("Could not execute query against endpoint", config['endpoint']) from e
queriedData = sparqlResultToDict(ret)

## Step 3: Select input rows with problematic coordinates

In [11]:
# Index the input rows once by (id, image) so that each queried row is matched by
# lookup rather than by scanning all of `data` again.
indicesByKey = {}
for i, inputRow in enumerate(data):
    indicesByKey.setdefault((inputRow['id'], inputRow['image']), []).append(i)

# Queried rows that have a counterpart in the input data, annotated with the
# position of that counterpart in `data` and with its image dimensions.
problematicRows = []
for row in queriedData:
    for i in indicesByKey.get((row['id'], row['image']), []):
        inputRow = data[i]
        row['index'] = i
        row['width'] = inputRow['width']
        row['height'] = inputRow['height']
        problematicRows.append(row)

In [12]:
# Number of queried images that were matched to a row of the input data
print("Found", len(problematicRows), "problematic images")

Found 328 problematic images


## Step X: Delete problematic regions without fixing

Note: jump straight to output after this

In [13]:
# Discard the stored region of every flagged image; row['index'] is the position
# in `data` that was recorded when the rows were matched in Step 3.
for row in problematicRows:
    data[row['index']]['documentCoordinates'] = None

## Step 4: Fix Images

Crops can be problematic in different ways. Some crops contain only the calibration bar, either horizontally or vertically (example 1). Some crops include a part of the colour bar, such as when it has been positioned too close to the image (example 2). And some images are cropped to the bottom half of the image, including part of the image and part of the colour bar (example 3).

<ol>
    <li> <img src="https://www.e-manuscripta.ch/zuzneb/i3f/v20/1534179/1658,119,529,1591/,150/0/default.jpg">
    <img src="https://www.e-manuscripta.ch/zuzneb/i3f/v20/1642886/0,2333,3738,1047/150,/0/default.jpg"></li>
    <li><img src="https://www.e-rara.ch/zuz/i3f/v20/13908787/0,0,3196,2902/150,/0/default.jpg"></li>
    <li><img src="https://www.e-rara.ch/zuz/i3f/v20/14502046/0,1374,2527,1198/150,/0/default.jpg"></li>
</ol>

In [None]:
import os
import sys
import inspect
# Put the parent of the current directory at the front of the module search path so
# that the project-local `openCV` package imported below can be found.
# NOTE(review): the directory is derived from the file name of the current frame;
# inside a notebook kernel that is presumably not a real file on disk — confirm that
# this resolves to the intended directory.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir) 

from openCV.BSOImageCropping import BSOImageCropping
from matplotlib import pyplot as plt


In [None]:
# Cropping detector shared by the helper functions defined below
detector = BSOImageCropping(showImages=False)

def showImage(row):
    """Show the locally stored image of a row in a new matplotlib figure.

    The file is looked up as <imageDirectory>/<id>.jpg. If it cannot be read, a
    message is printed and nothing is drawn.
    """
    filename = path.join(config['imageDirectory'], row['id'] + '.jpg')
    image = cv2.imread(filename)
    if image is None:
        # cv2.imread reports a missing or unreadable file by returning None
        print("Could not open", filename)
        return
    # Reverse the channel axis (OpenCV loads BGR, matplotlib expects RGB)
    image = image[:,:,::-1]
    plt.figure()
    plt.imshow(image)

def showCrop(row, coordinates):
    """Show the part of a row's local image described by an "x,y,w,h" string.

    The coordinates refer to an image that is row['width'] pixels wide; they are
    scaled down to the size of the local file before cropping. If the file
    cannot be read, a message is printed and nothing is drawn.
    """
    coordinates = [float(d) for d in coordinates.split(",")]
    filename = path.join(config['imageDirectory'], row['id'] + '.jpg')
    image = cv2.imread(filename)
    if image is None:
        # cv2.imread reports a missing or unreadable file by returning None
        print("Could not open", filename)
        return
    # Reverse the channel axis (OpenCV loads BGR, matplotlib expects RGB)
    image = image[:,:,::-1]
    # Ratio between the width recorded for the row and the width of the local file
    scaleFactor = int(row['width'])/image.shape[1]
    y0 = int(coordinates[1] / scaleFactor)
    y1 = y0 + int(coordinates[3] / scaleFactor)
    x0 = int(coordinates[0] / scaleFactor)
    x1 = x0 + int(coordinates[2] / scaleFactor)
    # A slice is enough for display; no copy of the whole image is needed
    croppedImage = image[y0:y1, x0:x1]
    plt.figure()
    plt.imshow(croppedImage)

def getDocumentCoordinates(row, mode=0):
    """Detect the document region in a row's local image.

    Parameters:
        row: dictionary with at least 'id' (used to build the file name
            <imageDirectory>/<id>.jpg) and 'width' (width the result is scaled to).
        mode: selects how the detector is called:
            0 - largest candidate
            1 - squarest candidate
            2 - squarest candidate on the inverted image
            3 - largest candidate on the inverted image

    Returns:
        "x,y,w,h" as a string, scaled from the local file to row['width'], or
        False if the image file is missing or cannot be read.

    Raises:
        ValueError: if mode is not one of 0, 1, 2, 3.
    """
    # Keyword arguments passed to the detector for each mode
    cropArguments = {
        0: dict(selectMethod=BSOImageCropping.SELECT_LARGEST),
        1: dict(selectMethod=BSOImageCropping.SELECT_SQUAREST),
        2: dict(invertImage=True, selectMethod=BSOImageCropping.SELECT_SQUAREST),
        3: dict(invertImage=True, selectMethod=BSOImageCropping.SELECT_LARGEST),
    }
    if mode not in cropArguments:
        # Previously an unknown mode left x, y, w, h undefined further down
        raise ValueError("Unknown mode: %r" % (mode,))

    filename = path.join(config['imageDirectory'], row['id'] + '.jpg')
    # cv2.imread returns None for unreadable files, so check the result as well
    image = cv2.imread(filename) if path.isfile(filename) else None
    if image is None:
        print("Could not open", filename)
        return False

    # Reverse the channel axis (OpenCV loads BGR)
    image = image[:,:,::-1]
    x, y, w, h = detector.cropImage(image, **cropArguments[mode])

    # Upscale to original size
    scaleFactor = int(row['width'])/image.shape[1]
    return "%d,%d,%d,%d" % (int(x * scaleFactor),
                            int(y * scaleFactor),
                            int(w * scaleFactor),
                            int(h * scaleFactor))
    
def checkSizeOfRegion(row, documentCoordinates, minCoverage=0.2):
    """Tell whether a region is large enough relative to its image.

    Parameters:
        row: dictionary with the image 'width' and 'height'.
        documentCoordinates: region as an "x,y,w,h" string. A falsy value (such
            as the False returned by getDocumentCoordinates on failure) counts
            as "not large enough".
        minCoverage: minimum fraction of the image width and of the image height
            that the region has to span.

    Returns:
        True if the region spans at least minCoverage of both dimensions,
        otherwise False.
    """
    if not documentCoordinates:
        return False

    coordinates = [int(d) for d in documentCoordinates.split(",")]
    width, height = int(row['width']), int(row['height'])
    wideEnough = coordinates[2]/width >= minCoverage
    tallEnough = coordinates[3]/height >= minCoverage
    return wideEnough and tallEnough

def correctDocumentCoordinates(row, documentCoordinates):
    """Replace a detected region that spans the whole image by a fallback region.

    Parameters:
        row: dictionary with the image 'width' and 'height' and the previous
            (problematic) region under 'documentCoordinates' as "x,y,w,h".
        documentCoordinates: newly detected region as an "x,y,w,h" string.

    Returns:
        documentCoordinates unchanged, unless the region starts at the origin
        and comes within 1% of the full width and height. In that case a
        full-width region is returned that leaves out the previous region:
        the part above it if it started in the lower half of the image,
        otherwise the part below its height.
    """
    coordinates = [int(d) for d in documentCoordinates.split(",")]
    width, height = int(row['width']), int(row['height'])
    # A dimension counts as fully covered when the region comes within 1% of it.
    # (The earlier max(..., width) always produced the full size, so every region
    # starting at the origin was treated as covering the entire image.)
    spansWidth = coordinates[2] >= width - width/100
    spansHeight = coordinates[3] >= height - height/100
    if coordinates[0] == 0 and coordinates[1] == 0 and spansWidth and spansHeight:
        # Region covers the entire image.
        # Return an adjusted region that excludes the previous (problematic)
        # region at the top or bottom.
        previousRegion = row['documentCoordinates'].split(",")
        regionY = int(previousRegion[1])
        regionHeight = int(previousRegion[3])
        if regionY > height/2:
            # Previous region starts in the lower half: keep everything above it
            return ','.join(["0", "0", str(width), str(regionY)])
        else:
            # Previous region starts in the upper half: keep what lies below its height
            return ','.join(["0", str(regionHeight), str(width), str(height-regionHeight)])

    return documentCoordinates

In [None]:
# Skip fixing


# debug = False

# for row in problematicRows:
    
#     if row['documentCoordinates']:
#         numModes = 3

#         for mode in range(0, numModes + 1):
#             documentCoordinates = getDocumentCoordinates(row, mode)
#             if not documentCoordinates == row['documentCoordinates'] and checkSizeOfRegion(row, documentCoordinates):
#                 break

#         try:
#             documentCoordinates = correctDocumentCoordinates(row, documentCoordinates)
#         except:
#             print(row)
#             correctDocumentCoordinates(row, documentCoordinates)
#             break

#         if mode == numModes and not checkSizeOfRegion(row, documentCoordinates):
#             # All modes have been tried and region is too small
#             # Revert to previous region
#             documentCoordinates = row['documentCoordinates']
#             row['updated'] = False
#         else:
#             row['updated'] = True

#         if debug:
#             showImage(row)
#             showCrop(row, row['coordinates'])
#             showCrop(row, documentCoordinates)
            
#         row['newCoordinates'] = documentCoordinates

In [None]:
# NOTICE: Problematic regions are checked by comparing the new coordinates with the coordinates stored in the data.
# When a good region is identified, the old coordinates are replaced in the data.
# Therefore, if this cell is run, it will subsequently check against the corrected coordinates in the data
# and lead to wrong results. Run this cell only once, and do not re-run the cells above after running it.

# for index, row in problematicRows:
#     data[index]['documentCoordinates'] = row['newCoordinates']


In [None]:
# Store coordinates in separate file
# Write all rows to "<dataFile without its last four characters>-corrected.csv".
# newline='' leaves line endings to the csv writer, as the csv module asks for.
with open(config['dataFile'][:-4] + '-corrected.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['id','image','width','height','documentCoordinates'])
    writer.writeheader()
    for row in data:
        # Rows without the key still need a value for the column
        row.setdefault('documentCoordinates', None)
        writer.writerow(row)


## Step 5: Remove regions that are either too big or too small

In [None]:
# If starting notebook from here, uncomment below
# data = []
# with open(config['dataFile'][:-4] + '-corrected.csv', 'r') as f:
#     reader = csv.DictReader(f)
#     for row in reader:
#         data.append(row)

In [None]:
# Fraction of an image dimension above which a region counts as covering it
maxCoverage = .99

# Positions in `data` of regions that exceed the limit in width and in height
regionsThatAreTooBig = []

for i, row in enumerate(data):
    # Rows without stored coordinates are not examined
    if not row['documentCoordinates']:
        continue
    imageWidth = int(row['width'])
    imageHeight = int(row['height'])
    x, y, regionWidth, regionHeight = map(int, row['documentCoordinates'].split(','))
    exceedsWidth = regionWidth > imageWidth * maxCoverage
    exceedsHeight = regionHeight > imageHeight * maxCoverage
    if exceedsWidth and exceedsHeight:
        regionsThatAreTooBig.append(i)

print("Found %d regions that cover almost the entire image" % len(regionsThatAreTooBig))

In [None]:
# Fraction of an image dimension below which a region counts as too small
minCoverage = .2

# Positions in `data` of regions that fall below the limit in width and in height.
# NOTE(review): both dimensions must be too small here, whereas checkSizeOfRegion
# rejects a region when either one is — confirm which rule is intended.
regionsThatAreTooSmall = []

for i, row in enumerate(data):
    # Rows without stored coordinates are not examined
    if not row['documentCoordinates']:
        continue
    imageWidth = int(row['width'])
    imageHeight = int(row['height'])
    x, y, regionWidth, regionHeight = map(int, row['documentCoordinates'].split(','))
    tooNarrow = regionWidth < imageWidth * minCoverage
    tooShort = regionHeight < imageHeight * minCoverage
    if tooNarrow and tooShort:
        regionsThatAreTooSmall.append(i)

print("Found %d regions that are too small" % len(regionsThatAreTooSmall))

In [None]:
# Remove the stored coordinates of every region flagged as too big or too small
for i in regionsThatAreTooBig + regionsThatAreTooSmall:
    data[i]['documentCoordinates'] = None

In [None]:
# Store coordinates in separate file
# Write all rows to "<dataFile without its last four characters>-corrected.csv".
# newline='' leaves line endings to the csv writer, as the csv module asks for.
with open(config['dataFile'][:-4] + '-corrected.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['id','image','width','height','documentCoordinates'])
    writer.writeheader()
    for row in data:
        # Rows without the key still need a value for the column
        row.setdefault('documentCoordinates', None)
        writer.writerow(row)


## Step 6: Output as CIDOC-CRM RDF

Output as a Trig file that can be displayed and edited in the Mirador component of ResearchSpace & Metaphacts

In [14]:
# Prefix declarations written once at the top of the generated file
namespaces = """
PREFIX Platform: <http://www.metaphacts.com/ontologies/platform#> 
PREFIX User: <http://www.metaphacts.com/resource/user/> 
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 
PREFIX crmdig: <http://www.ics.forth.gr/isl/CRMdig/> 
PREFIX rso: <http://www.researchspace.org/ontology/> 
PREFIX prov: <http://www.w3.org/ns/prov#> 
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX ldp: <http://www.w3.org/ns/ldp#> 
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
"""

# Statements written once, independent of the data: the definition of the
# "Image Region" type inside the imageRegions named graph
static = """

<https://platform.swissartresearch.net/imageRegions> {
    <https://resource.swissartresearch.net/type/imageRegion> a crm:E55_Type ;
    rdfs:label "Image Region" ;
    crm:P3_has_note "A region defining the visual image represented within a digital image. For example, the region denotes the visual item that is reproduced on a document which is photographed.".
}
"""

# Template for a single image region. It produces a named graph containing the region
# resource and its LDP container, plus one statement in the imageRegions graph that
# assigns the region its type.
# Placeholders: $uri, $iiifImage, $x / $y / $w / $h (bounding box), $x0 / $y0 / $halfW /
# $halfH (SVG path drawn in half-width and half-height steps) and $dateTime.
# NOTE(review): the id attribute of the SVG path is a fixed literal, so every generated
# region carries the same id — confirm that this is acceptable.
regionTemplate = Template('''<$uri/container/context> {
  Platform:formContainer ldp:contains <$uri/container> .
  
  <$uri>
    a crmdig:D35_Area, rso:EX_Digital_Image_Region;
    crmdig:L49_is_primary_area_of <$iiifImage>;
    crm:P33_used_specific_technique <https://github.com/swiss-art-research-net/bso-image-segmentation> ;
    rso:boundingBox "xywh=$x,$y,$w,$h";
    rso:displayLabel "image";
    rso:viewport "xywh=0,0,0,0";
    rdf:value "<svg xmlns='http://www.w3.org/2000/svg'><path xmlns=\\"http://www.w3.org/2000/svg\\" d=\\"M${x0},${y0}l${halfW},0l0,0l${halfW},0l 0,${halfH}l 0,${halfH}l -${halfW},0l -${halfW},0l 0,-${halfH}z\\" data-paper-data=\\"{&quot;defaultStrokeValue&quot;:1,&quot;editStrokeValue&quot;:5,&quot;currentStrokeValue&quot;:1,&quot;rotation&quot;:0,&quot;deleteIcon&quot;:null,&quot;rotationIcon&quot;:null,&quot;group&quot;:null,&quot;editable&quot;:true,&quot;annotation&quot;:null}\\" id=\\"rectangle_e880ad36-1fef-4ce3-835d-716ba7db628a\\" fill-opacity=\\"0\\" fill=\\"#00bfff\\" fill-rule=\\"nonzero\\" stroke=\\"#00bfff\\" stroke-width=\\"4.04992\\" stroke-linecap=\\"butt\\" stroke-linejoin=\\"miter\\" stroke-miterlimit=\\"10\\" stroke-dasharray=\\"\\" stroke-dashoffset=\\"0\\" font-family=\\"none\\" font-weight=\\"none\\" font-size=\\"none\\" text-anchor=\\"none\\" style=\\"mix-blend-mode: normal\\"/></svg>" .
  
  <$uri/container>
    a ldp:Resource, prov:Entity;
    prov:generatedAtTime "$dateTime"^^xsd:dateTime;
    prov:wasAttributedTo User:admin .
}

<https://platform.swissartresearch.net/imageRegions> {
    <$uri> crm:P2_has_type <https://resource.swissartresearch.net/type/imageRegion> .
}

''')

In [15]:
# Timestamp shared by all generated regions. xsd:dateTime allows a single zone
# designator ('Z' or an offset), so the current UTC time is written with a plain 'Z'
# (the earlier format appended 'z' after '+00:00' and labelled local time as UTC).
dateTime = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

# Fragments of the output file; joined once after the loop
outputParts = [namespaces, static]

# Rows for which no usable region is stored
missingDocumentCoordinates = []

for row in tqdm(data):
    # None, an empty string and incomplete values all count as missing
    docCoords = (row['documentCoordinates'] or '').split(',')
    if len(docCoords) < 4:
        missingDocumentCoordinates.append(row)
        continue

    x, y, w, h = (int(d) for d in docCoords[:4])

    iiifImage = row['image']
    # Deterministic identifier derived from the image URL, so that re-running the
    # notebook yields the same region URIs
    identifier = str(uuid.uuid3(uuid.NAMESPACE_DNS, iiifImage))
    uri = "https://resource.swissartresearch.net/digitalobject/" + identifier
    outputParts.append(regionTemplate.substitute(
        uri=uri,
        iiifImage=iiifImage,
        x=x,
        y=y,
        w=w,
        h=h,
        # The SVG path starts at the top left corner of the region
        x0=x,
        y0=y,
        halfW=float(w/2),
        halfH=float(h/2),
        dateTime=dateTime
    ))

output = ''.join(outputParts)

# Write summary of missing coordinates
if len(missingDocumentCoordinates) > 0:
    print("Could not detect coordinates in %d images:" % len(missingDocumentCoordinates))
    print('\n'.join([d['id'] for d in missingDocumentCoordinates]))

filename = config['trigFile']
# Written as UTF-8 regardless of the platform's default encoding
with open(filename, 'w', encoding='utf-8') as f:
    f.write(output)

100%|██████████| 28103/28103 [00:01<00:00, 22518.25it/s]


Could not detect coordinates in 2386 images:
zbz-990109044120205508
zbz-990109044900205508
zbz-990109045940205508
zbz-990109046110205508
zbz-990109047090205508
zbz-990109047970205508
zbz-990109048200205508
zbz-990109054790205508
zbz-990109055390205508
zbz-990109055450205508
zbz-990109055600205508
zbz-990109056190205508
zbz-990109056340205508
zbz-990109061840205508
zbz-990109065670205508
zbz-990109072630205508
zbz-990109111330205508
zbz-990109111980205508
zbz-990109113640205508
zbz-990109117140205508
zbz-990109117460205508
zbz-990109118470205508
zbz-990109119630205508
zbz-990053006550205508
zbz-990053196900205508
zbz-990053196950205508
zbz-990053197430205508
zbz-990053357000205508
zbz-990053358820205508
zbz-990053366110205508
zbz-990053693940205508
zbz-990053720580205508
zbz-990053746190205508
zbz-990053746260205508
zbz-990053746420205508
zbz-990053799800205508
zbz-990053846880205508
zbz-990053891260205508
zbz-990053935410205508
zbz-990054280370205508
zbz-990054336780205508
zbz-99005501