In [1]:
import csv
import json
import time
import urllib.request
import uuid
from datetime import datetime, timezone
from os import path
from pathlib import Path
from string import Template

import requests
import yaml
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm

In [2]:
# Location of the pipeline's YAML configuration file. The path is relative,
# so it is resolved against the kernel's current working directory.
configFile = '../pipeline/config.yml'

### Load Configuration

In [3]:
# Parse the YAML configuration into `config`, which every later cell reads.
# `except Exception` (rather than a bare `except`) lets KeyboardInterrupt and
# SystemExit pass through, and `from error` keeps the underlying cause
# (missing file, YAML syntax error, ...) attached to the traceback.
try:
    with open(configFile, 'r') as f:
        config = yaml.safe_load(f)
except Exception as error:
    raise Exception("Could not load config file at", configFile) from error

# An empty (or non-mapping) YAML document would otherwise only fail later,
# with an unrelated-looking error, on the first config[...] lookup.
if not isinstance(config, dict):
    raise Exception("Could not load config file at", configFile)

### Define helpers

In [4]:
# Constants identifying the source of the input data. `mode` is set to one of
# these values according to config['mode'].
SPARQL = 0  # rows are queried from a SPARQL endpoint and merged with the CSV file
CSV = 1     # rows come from the CSV file only

def sparqlResultToDict(results):
    """Flatten a SPARQL JSON result into a list of plain dictionaries.

    Each binding becomes one dict keyed by the variable names listed in
    results["head"]["vars"]. The dict holds the binding's "value" for every
    variable, or None when the binding has no entry for that variable.
    """
    return [
        {
            name: (binding[name]["value"] if name in binding else None)
            for name in results["head"]["vars"]
        }
        for binding in results["results"]["bindings"]
    ]

def writeData(data):
    """Write all rows to the CSV file at config['dataFile'], replacing its content.

    The file always has the columns id, image, width, height and
    documentCoordinates.

    Side effect: rows that have no 'documentCoordinates' key get one (set to
    None) in place. Other cells read row['documentCoordinates'] directly, so
    this mutation is kept on purpose.

    Raises:
        Exception: if the file cannot be written; the original error is
            attached as the cause.
    """
    fieldnames = ['id', 'image', 'width', 'height', 'documentCoordinates']
    try:
        # newline='' hands line-ending control to the csv module, as its
        # documentation requires; without it, platforms that translate
        # newlines end up with an extra blank line after every row.
        with open(config['dataFile'], 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            for row in data:
                row.setdefault('documentCoordinates', None)
                writer.writerow(row)
    except Exception as error:
        # `except Exception` instead of a bare `except`, so that interrupting
        # the kernel is not reported as a write failure.
        raise Exception("Could not write to", config['dataFile']) from error

## Step 1: Get input data

In [5]:
# Translate the configured mode name into one of the SPARQL / CSV constants.
# The placeholder is None rather than False: False == 0 == SPARQL in Python,
# so a leftover False would silently pass every `mode == SPARQL` check.
mode = None
# .get() lets a missing 'mode' key reach the intended error below instead of
# surfacing as a bare KeyError.
configuredMode = (config or {}).get('mode')
if configuredMode == "SPARQL":
    mode = SPARQL
elif configuredMode == "CSV":
    mode = CSV
else:
    raise Exception("mode not specified or invalid (should be SPARQL or CSV)")

Read data from the input file, if present. This is done for both CSV and SPARQL mode, as the SPARQL results are cached in the CSV file and updated when the data changes.

In [6]:
# Rows stored by a previous run; stays empty when no data file exists yet.
inputData = []
try:
    # newline='' is what the csv module expects for files it reads or writes.
    with open(config['dataFile'], 'r', newline='') as f:
        for row in csv.DictReader(f):
            inputData.append({
                "id": row['id'],
                "image": row['image'],
                "width": row['width'],
                "height": row['height'],
                # Older files may not have this column yet.
                "documentCoordinates": row.get('documentCoordinates')
            })
except FileNotFoundError:
    # Only a missing file means "no prior input". Any other problem (for
    # example a malformed file or a missing column) is allowed to propagate:
    # carrying on with a partially read list would let later cells overwrite
    # the data file with incomplete data.
    print("No prior input file found")


If in SPARQL mode, get data from SPARQL endpoint

In [7]:
# In SPARQL mode, run the configured query and flatten the result into
# `queriedData` (a list of dicts, one per binding).
if mode == SPARQL:
    # .get() so that a missing key produces the intended message below rather
    # than a KeyError.
    endpoint = config.get('endpoint')
    query = config.get('query')
    if not endpoint or not query:
        raise Exception("incomplete configuration for SPARQL mode")

    sparql = SPARQLWrapper(endpoint, returnFormat=JSON)
    sparql.setQuery(query)
    try:
        ret = sparql.query().convert()
    except Exception as error:
        # Keep the underlying error (network, HTTP status, malformed query)
        # attached as the cause.
        raise Exception("Could not execute query against endpoint", endpoint) from error
    queriedData = sparqlResultToDict(ret)

If in SPARQL mode, merge queried data with data stored in CSV file.
- add entries that exist in SPARQL result, but not in the CSV file
- add width/height information when it is only available in either the CSV file or the SPARQL output (prioritising the SPARQL data)

Finally, store the merged data in the CSV file.

In [8]:
# `data` is the working list used by all later steps. It is the same list
# object as inputData, so rows appended below are visible through both names.
data = inputData

if mode == SPARQL:
    # Index both sources by id (for duplicate ids the last row wins).
    inputDataHash = {row['id']: row for row in inputData}
    queriedDataHash = {row['id']: row for row in queriedData}

    # Add rows that exist in the SPARQL result but not in the CSV file.
    # Membership is checked against the dict built above instead of a list,
    # which keeps this step linear in the number of rows.
    for row in queriedData:
        if row['id'] not in inputDataHash:
            data.append(row)

    # Fill in missing dimensions, preferring the SPARQL value over the CSV
    # value. Width and height share one code path; the earlier hand-copied
    # height branch checked the CSV *width* before copying the CSV height.
    for row in data:
        for dimension in ('width', 'height'):
            if row.get(dimension):
                continue
            # Make sure the key exists even if nothing can be filled in,
            # because later cells read row['width'] / row['height'] directly.
            row.setdefault(dimension, None)
            for source in (queriedDataHash, inputDataHash):
                value = source.get(row['id'], {}).get(dimension)
                if value:
                    row[dimension] = value
                    break

    writeData(data)

## Step 2: Get (missing) image sizes

If the original image size is not specified, call the IIIF Image API to read the size from the JSON response.

In [9]:
# For every row without a complete size, request <image>/info.json and copy
# its 'width' and 'height' into the row.
for row in tqdm(data):
    if not row['width'] or not row['height']:
        uri = row['image'] + '/info.json'
        try:
            with urllib.request.urlopen(uri) as response:
                manifestData = json.loads(response.read().decode())
            width = manifestData['width']
            height = manifestData['height']
        except Exception:
            print("Could not open", uri)
            # Skip this row. The former bare `next` was a no-op expression, so
            # execution fell through and either hit a NameError or assigned
            # the size of the previously fetched image to this row.
            continue
        row['width'] = width
        row['height'] = height
        # Persist after every successful lookup so an interrupted run keeps
        # the sizes fetched so far.
        writeData(data)

100%|██████████| 28103/28103 [00:00<00:00, 269716.37it/s]


Write data to file

In [10]:
# Persist the current rows, including any sizes set by the previous cell,
# to the CSV file at config['dataFile'].
writeData(data)

## Step 3: Download images

Download the images that do not yet exist in the image folder. The images will be downloaded resized to a width of 1024 pixels.

In [11]:
# Make sure the image folder exists (creating missing parents); an already
# existing folder is not an error.
try:
    Path(config['imageDirectory']).mkdir(parents=True, exist_ok=True)
except Exception as error:
    # `except Exception` instead of a bare `except`, with the original error
    # kept as the cause.
    raise Exception("Could not add/access folder", config['imageDirectory']) from error

## Step 4: Detect Images

In [12]:
import cv2
import random
import numpy as np
from PIL import Image

from matplotlib import pyplot as plt
from os import listdir
from os.path import isfile, join

from IPython.display import display

In [17]:
class BSOImageCropping:
    """Finds the bounding box of the main region (e.g. a document) in an image.

    cropImage() pads the image, converts it to greyscale, closes small gaps,
    binarises it, detects the external contours and returns the bounding box
    of the selected contour in the coordinates of the original image.

    Set showImages=True to plot intermediate images with matplotlib.
    """

    showImages = False
    # Width in pixels of the replicated border added on every side before
    # detection (see extendImage).
    extension = 50

    # Pre-processing methods accepted by cropImage.
    METHOD_TRHESH = 0
    METHOD_CANNY = 1
    # Correctly spelled alias; METHOD_TRHESH is kept for existing callers.
    METHOD_THRESH = METHOD_TRHESH

    # Contour selection methods accepted by cropImage.
    SELECT_SQUAREST = 0
    SELECT_LARGEST = 1

    def __init__(self, showImages=False):
        self.showImages = showImages

    def applyMorphologyClose(self, image):
        """Apply a morphological close with a 2x10 kernel to a greyscale image.

        The kernel "smears" the image horizontally and slightly downwards,
        which can help to close gaps between shapes, e.g. text. If the pixel
        15 px in from the bottom-left corner is brighter than 127, the image
        is inverted before the operation.
        """
        padding = 15
        kernel = np.ones((2,10), np.uint8)
        if image[image.shape[0]-padding][padding] > 127:
            return cv2.morphologyEx(255-image.copy(), cv2.MORPH_CLOSE, kernel)
        else:
            return cv2.morphologyEx(image.copy(), cv2.MORPH_CLOSE, kernel)

    def blurImage(self, image, amount=5):
        """Return the image blurred with an amount x amount box filter."""
        return cv2.blur(image.copy(), (amount, amount))

    def cannyImage(self, image):
        """Return the Canny edge map of the image (thresholds 10 and 120)."""
        return cv2.Canny(image.copy(), 10, 120)

    def displayImage(self, image):
        """Show the image in a new matplotlib figure."""
        plt.figure()
        plt.imshow(image)

    def erodeImage(self, image, iterations=5):
        """Return the image eroded `iterations` times with a 5x5 kernel."""
        kernel = np.ones((5,5),np.uint8)
        # `iterations` must be passed by keyword: the third positional
        # parameter of cv2.erode is the destination array, not the count.
        return cv2.erode(image.copy(), kernel, iterations=iterations)

    def extendImage(self, image):
        """Return the image padded by self.extension pixels on every side,
        replicating the edge pixels."""
        return cv2.copyMakeBorder(image.copy(), self.extension, self.extension, self.extension, self.extension, cv2.BORDER_REPLICATE)

    def makeBW(self, image):
        """Return the greyscale version of an RGB image."""
        return cv2.cvtColor(image.copy(), cv2.COLOR_RGB2GRAY)

    def thresholdImage(self, image):
        """Binarise a greyscale image using Otsu's threshold.

        The pixel 5 px in from the bottom-right corner decides the polarity:
        if it is brighter than 127 the inverted binary threshold is used,
        otherwise the regular one.
        """
        padding = 5
        if image[image.shape[0]-padding][image.shape[1]-padding] > 127:
            thresholdMethod = cv2.THRESH_BINARY_INV+cv2.THRESH_OTSU
        else:
            thresholdMethod = cv2.THRESH_BINARY+cv2.THRESH_OTSU

        ret, thresh = cv2.threshold(image,0,255,thresholdMethod)
        return thresh

    @staticmethod
    def _largestContourIndices(areas, count=2):
        """Return the indices of the `count` largest areas, largest first.

        Indices are sorted directly (instead of looking values up with
        list.index), so contours with equal areas yield distinct indices.
        """
        return sorted(range(len(areas)), key=lambda i: areas[i], reverse=True)[:count]

    def _removeBorder(self, x, y, w, h, imageShape):
        """Map a rectangle from extended-image to original-image coordinates.

        Both corners are shifted by self.extension and clamped to the original
        image independently, so a rectangle that starts inside the added
        border is shortened by the part that lay in the border.

        Returns:
            (x, y, width, height) within the original image.
        """
        imageHeight, imageWidth = imageShape[0], imageShape[1]
        x0 = min(max(0, x - self.extension), imageWidth)
        y0 = min(max(0, y - self.extension), imageHeight)
        x1 = min(max(0, x + w - self.extension), imageWidth)
        y1 = min(max(0, y + h - self.extension), imageHeight)
        return x0, y0, x1 - x0, y1 - y0

    def cropImage(self, image, preprocessMethod=METHOD_TRHESH, selectMethod=SELECT_SQUAREST):
        """Detect the main region of an RGB image.

        Args:
            image: RGB image as a numpy array (height, width, channels).
            preprocessMethod: METHOD_CANNY to binarise with Canny edges; any
                other value uses Otsu thresholding.
            selectMethod: SELECT_LARGEST picks the largest contour;
                SELECT_SQUAREST picks, among the two largest contours, the one
                whose bounding box is closest to a square. The ratio method
                can be useful to discard a colour calibration bar which might
                be detected.

        Returns:
            (x, y, width, height) of the detected region in the coordinates of
            `image`. If no contour is found, the whole image is returned.

        Raises:
            ValueError: if selectMethod is not one of the SELECT_* constants.
        """
        if selectMethod not in (self.SELECT_SQUAREST, self.SELECT_LARGEST):
            raise ValueError("Unknown selectMethod: %r" % (selectMethod,))

        # Extend image to improve recognition of (document) edges that
        # are close to the image edge
        extendedImage = self.extendImage(image)

        # Convert image to Black and White
        grayImage = self.makeBW(extendedImage)

        # Smear image to remove detail and close gaps
        morphImage = self.applyMorphologyClose(grayImage)

        # Binarise image
        if preprocessMethod == self.METHOD_CANNY:
            thresh = self.cannyImage(morphImage)
        else:
            thresh = self.thresholdImage(morphImage)

        # Detect contours. Indexing with [-2] works for both OpenCV 4
        # (contours, hierarchy) and OpenCV 3 (image, contours, hierarchy).
        contours = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]

        if len(contours) == 0:
            # Nothing to choose from: report the whole image instead of
            # failing on an empty candidate list.
            return 0, 0, image.shape[1], image.shape[0]

        # Retrieve the areas of the contours and select the two largest
        areas = [cv2.contourArea(c) for c in contours]
        indicesOfLargestContours = self._largestContourIndices(areas)

        # Pick contour based on selection method
        if selectMethod == self.SELECT_SQUAREST:
            ratios = []
            for i in indicesOfLargestContours:
                x,y,w,h = cv2.boundingRect(contours[i])
                ratios.append(max(w,h)/min(w,h))
            indexToPick = int(np.argmin(ratios))
        else:
            indexToPick = 0

        # Retrieve coordinates of selected contour
        chosenX, chosenY, chosenW, chosenH = cv2.boundingRect(contours[indicesOfLargestContours[indexToPick]])

        # Remove border that has been added in the first step
        x0, y0, width, height = self._removeBorder(chosenX, chosenY, chosenW, chosenH, image.shape)

        # (for debugging) The intermediate images are only built when they are
        # actually going to be shown.
        if self.showImages:
            contourImage = np.zeros(extendedImage.shape)
            cv2.drawContours(contourImage, contours, -1, (0,255,0), 3)
            croppedImage = image.copy()[y0:y0 + height, x0:x0 + width]

            self.displayImage(image)
            self.displayImage(thresh)
            self.displayImage(contourImage)
            if croppedImage.size > 0:
                self.displayImage(croppedImage)

        return x0, y0, width, height

In [14]:
# Detect the document region for every row that has no coordinates yet and
# store it as "x,y,w,h", scaled to the original image size.
detector = BSOImageCropping()

for row in tqdm(data):
    if row.get('documentCoordinates'):
        # Already detected in an earlier run.
        continue

    filename = path.join(config['imageDirectory'], row['id'] + '.jpg')
    if not isfile(filename):
        print("Could not open", filename)
        continue

    image = cv2.imread(filename)
    if image is None:
        # cv2.imread returns None (it does not raise) for unreadable or
        # corrupt files.
        print("Could not open", filename)
        continue
    # OpenCV loads BGR; reverse the channel axis to get RGB.
    image = image[:,:,::-1]

    # NOTE(review): the marker is looked up in the row id rather than in the
    # full path, so a 'zbz' in the directory name cannot change the method for
    # every image. Presumably 'zbz' identifies one image source - confirm.
    if 'zbz' in row['id']:
        selectMethod = BSOImageCropping.SELECT_SQUAREST
    else:
        selectMethod = BSOImageCropping.SELECT_LARGEST
    x,y,w,h = detector.cropImage(image, selectMethod=selectMethod)

    # Upscale to original size
    try:
        scaleFactor = int(row['width'])/image.shape[1]
    except (TypeError, ValueError):
        # The width is empty when it was neither supplied nor retrievable.
        print("Missing original width for", row['id'])
        continue

    row['documentCoordinates'] = "%d,%d,%d,%d" % (int( x * scaleFactor),
                                                  int( y * scaleFactor),
                                                  int( w * scaleFactor),
                                                  int( h * scaleFactor))

    # Store coordinates in data after every prediction
    writeData(data)

 87%|████████▋ | 24513/28103 [1:58:05<18:19,  3.27it/s]  

Could not open ../data/images/nb-812808.jpg


 88%|████████▊ | 24737/28103 [1:59:12<16:19,  3.44it/s]

Could not open ../data/images/nb-815037.jpg


 88%|████████▊ | 24743/28103 [1:59:14<15:54,  3.52it/s]

Could not open ../data/images/nb-815050.jpg


 88%|████████▊ | 24746/28103 [1:59:14<13:47,  4.05it/s]

Could not open ../data/images/nb-815054.jpg


 88%|████████▊ | 24748/28103 [1:59:15<11:35,  4.82it/s]

Could not open ../data/images/nb-815062.jpg


 88%|████████▊ | 24760/28103 [1:59:18<16:14,  3.43it/s]

Could not open ../data/images/nb-815093.jpg


 88%|████████▊ | 24762/28103 [1:59:18<12:57,  4.30it/s]

Could not open ../data/images/nb-815097.jpg


 88%|████████▊ | 24765/28103 [1:59:19<12:36,  4.41it/s]

Could not open ../data/images/nb-815102.jpg


 88%|████████▊ | 24773/28103 [1:59:21<15:47,  3.51it/s]

Could not open ../data/images/nb-815125.jpg


 88%|████████▊ | 24832/28103 [1:59:39<15:55,  3.42it/s]

Could not open ../data/images/nb-815670.jpg


 90%|█████████ | 25395/28103 [2:02:29<13:29,  3.35it/s]

Could not open ../data/images/nb-822350.jpg


 92%|█████████▏| 25739/28103 [2:04:13<11:13,  3.51it/s]

Could not open ../data/images/nb-838092.jpg


 92%|█████████▏| 25754/28103 [2:04:17<11:50,  3.31it/s]

Could not open ../data/images/nb-838152.jpg
Could not open ../data/images/nb-838155.jpg
Could not open ../data/images/nb-838157.jpg
Could not open ../data/images/nb-838160.jpg
Could not open ../data/images/nb-838162.jpg
Could not open ../data/images/nb-838164.jpg
Could not open ../data/images/nb-838166.jpg
Could not open ../data/images/nb-838168.jpg
Could not open ../data/images/nb-838170.jpg
Could not open ../data/images/nb-838172.jpg
Could not open ../data/images/nb-838174.jpg
Could not open ../data/images/nb-838176.jpg
Could not open ../data/images/nb-838178.jpg
Could not open ../data/images/nb-838180.jpg
Could not open ../data/images/nb-838182.jpg


 92%|█████████▏| 25879/28103 [2:04:50<11:03,  3.35it/s]

Could not open ../data/images/nb-841831.jpg


 92%|█████████▏| 25886/28103 [2:04:52<10:10,  3.63it/s]

Could not open ../data/images/nb-841890.jpg


 92%|█████████▏| 25958/28103 [2:05:13<10:44,  3.33it/s]

Could not open ../data/images/nb-861242.jpg


 93%|█████████▎| 26034/28103 [2:05:36<11:09,  3.09it/s]

Could not open ../data/images/nb-870419.jpg
Could not open ../data/images/nb-870422.jpg


100%|██████████| 28103/28103 [2:15:54<00:00,  3.45it/s]


## Step 5: Output as CIDOC-CRM RDF

Output as a Trig file that can be displayed and edited in the Mirador component of ResearchSpace & Metaphacts

In [15]:
# Prefix declarations for the generated output; this string forms the start
# of the text that is written to the Trig file.
namespaces = """
PREFIX Platform: <http://www.metaphacts.com/ontologies/platform#> 
PREFIX User: <http://www.metaphacts.com/resource/user/> 
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 
PREFIX crmdig: <http://www.ics.forth.gr/isl/CRMdig/> 
PREFIX rso: <http://www.researchspace.org/ontology/> 
PREFIX prov: <http://www.w3.org/ns/prov#> 
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX ldp: <http://www.w3.org/ns/ldp#> 
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
"""

# Fixed part of the output, emitted once after the prefixes: it declares the
# "Image Region" type (a crm:E55_Type) inside the imageRegions graph. Every
# generated region is linked to this type by regionTemplate.
static = """

<https://platform.swissartresearch.net/imageRegions> {
    <https://resource.swissartresearch.net/type/imageRegion> a crm:E55_Type ;
    rdfs:label "Image Region" ;
    crm:P3_has_note "A region defining the visual image represented within a digital image. For example, the region denotes the visual item that is reproduced on a document which is photographed.".
}
"""

# Template for the output of a single image region. Placeholders:
#   $uri                URI of the region; also the base of its container and context graph
#   $iiifImage          URI of the image the region belongs to
#   $x, $y, $w, $h      bounding box, used in rso:boundingBox
#   ${x0}, ${y0}        start point of the SVG path
#   ${halfW}, ${halfH}  half of the width/height; the path is drawn in relative
#                       half-length segments
#   $dateTime           value of prov:generatedAtTime
# A doubled backslash in front of a quote produces a single backslash in the
# output, i.e. an escaped quote inside the rdf:value literal.
# NOTE(review): the id attribute of the SVG path is a fixed value, so all
# regions share the same id - confirm that this is intended.
regionTemplate = Template('''<$uri/container/context> {
  Platform:formContainer ldp:contains <$uri/container> .
  
  <$uri>
    a crmdig:D35_Area, rso:EX_Digital_Image_Region;
    crmdig:L49_is_primary_area_of <$iiifImage>;
    crm:P33_used_specific_technique <https://github.com/swiss-art-research-net/bso-image-segmentation> ;
    rso:boundingBox "xywh=$x,$y,$w,$h";
    rso:displayLabel "image";
    rso:viewport "xywh=0,0,0,0";
    rdf:value "<svg xmlns='http://www.w3.org/2000/svg'><path xmlns=\\"http://www.w3.org/2000/svg\\" d=\\"M${x0},${y0}l${halfW},0l0,0l${halfW},0l 0,${halfH}l 0,${halfH}l -${halfW},0l -${halfW},0l 0,-${halfH}z\\" data-paper-data=\\"{&quot;defaultStrokeValue&quot;:1,&quot;editStrokeValue&quot;:5,&quot;currentStrokeValue&quot;:1,&quot;rotation&quot;:0,&quot;deleteIcon&quot;:null,&quot;rotationIcon&quot;:null,&quot;group&quot;:null,&quot;editable&quot;:true,&quot;annotation&quot;:null}\\" id=\\"rectangle_e880ad36-1fef-4ce3-835d-716ba7db628a\\" fill-opacity=\\"0\\" fill=\\"#00bfff\\" fill-rule=\\"nonzero\\" stroke=\\"#00bfff\\" stroke-width=\\"4.04992\\" stroke-linecap=\\"butt\\" stroke-linejoin=\\"miter\\" stroke-miterlimit=\\"10\\" stroke-dasharray=\\"\\" stroke-dashoffset=\\"0\\" font-family=\\"none\\" font-weight=\\"none\\" font-size=\\"none\\" text-anchor=\\"none\\" style=\\"mix-blend-mode: normal\\"/></svg>" .
  
  <$uri/container>
    a ldp:Resource, prov:Entity;
    prov:generatedAtTime "$dateTime"^^xsd:dateTime;
    prov:wasAttributedTo User:admin .
}

<https://platform.swissartresearch.net/imageRegions> {
    <$uri> crm:P2_has_type <https://resource.swissartresearch.net/type/imageRegion> .
}

''')

In [16]:
# Timestamp shared by all regions of this run. It is taken in UTC so that the
# "+00:00" offset is true, and it no longer carries the extra trailing "z"
# that made the value an invalid xsd:dateTime.
dateTime = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")

# Collect the pieces in a list and join once at the end instead of growing a
# string inside the loop.
outputParts = [namespaces, static]

# Rows without usable coordinates; reported after the loop.
missingDocumentCoordinates = []

for row in tqdm(data):
    # Coordinates are stored as "x,y,w,h"; the value is empty or None when
    # no region was detected for the image.
    docCoords = (row.get('documentCoordinates') or '').split(',')

    if len(docCoords) < 4:
        missingDocumentCoordinates.append(row)
        continue

    try:
        x, y, w, h = [int(value) for value in docCoords[:4]]
    except ValueError:
        # Non-numeric content is treated like missing coordinates rather than
        # aborting the export.
        missingDocumentCoordinates.append(row)
        continue

    iiifImage = row['image']
    # uuid3 is deterministic, so the same image URI always yields the same
    # region URI.
    identifier = str(uuid.uuid3(uuid.NAMESPACE_DNS, iiifImage))
    uri = "https://resource.swissartresearch.net/digitalobject/" + identifier

    outputParts.append(regionTemplate.substitute(
        uri=uri,
        iiifImage=iiifImage,
        x=x,
        y=y,
        w=w,
        h=h,
        x0=x,
        y0=y,
        halfW=float(w/2),
        halfH=float(h/2),
        dateTime=dateTime
    ))

output = ''.join(outputParts)

# Write summary of missing coordinates
if len(missingDocumentCoordinates) > 0:
    print("Could not detect coordinates in %d images:" % len(missingDocumentCoordinates))
    print('\n'.join([d['id'] for d in missingDocumentCoordinates]))

filename = config['trigFile']
# Explicit UTF-8 so the encoding does not depend on the platform default.
with open(filename, 'w', encoding='utf-8') as f:
    f.write(output)

100%|██████████| 28103/28103 [00:01<00:00, 19079.31it/s]


Could not detect coordinates in 32 images:
nb-812808
nb-815037
nb-815050
nb-815054
nb-815062
nb-815093
nb-815097
nb-815102
nb-815125
nb-815670
nb-822350
nb-838092
nb-838152
nb-838155
nb-838157
nb-838160
nb-838162
nb-838164
nb-838166
nb-838168
nb-838170
nb-838172
nb-838174
nb-838176
nb-838178
nb-838180
nb-838182
nb-841831
nb-841890
nb-861242
nb-870419
nb-870422
