In [13]:
import csv
import json
import urllib.request
import requests
import uuid
import time
import yaml
from os import path
from pathlib import Path
from string import Template
from SPARQLWrapper import SPARQLWrapper, JSON
from datetime import datetime
from tqdm import tqdm

In [2]:
configFile = '../pipeline/config.yml'

### Load Configuration

In [3]:
try:
    with open(configFile, 'r') as f:
        config = yaml.safe_load(f)
except:
    raise Exception("Could not load config file at", configFile)

### Define helpers

In [4]:
# Constants
SPARQL = 0
CSV = 1

def sparqlResultToDict(results):
    rows = []
    for result in results["results"]["bindings"]:
        row = {}
        for key in results["head"]["vars"]:
            if key in result:
                row[key] = result[key]["value"]
            else:
                row[key] = None
        rows.append(row)
    return rows

def writeData(data):
    try:
        with open(config['dataFile'], 'w') as f:
            writer = csv.DictWriter(f, fieldnames=['id','image','width','height','documentCoordinates'])
            writer.writeheader()
            for row in data:
                if not 'documentCoordinates' in row:
                    row['documentCoordinates'] = None
                writer.writerow(row)
    except:
        raise Exception("Could not write to", config['dataFile'])

## Step 1: Get input data

In [5]:
mode = False
if config['mode'] == "SPARQL":
    mode = SPARQL
elif config['mode'] == "CSV":
    mode  = CSV
else:
    raise Exception("mode not specified or invalid (should be SPARQL or CSV)")

Read data from input file, if present. This is being done for both CSV and SPARQL mode as the SPARQL results will be cashed in the CSV file and updated when data is changed.

In [6]:
inputData = []
try:
    with open(config['dataFile'], 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            inputData.append({
                "id": row['id'],
                "image": row['image'],
                "width": row['width'],
                "height": row['height'],
                "documentCoordinates": row['documentCoordinates'] if 'documentCoordinates' in row else None
            })
except:
    print("No prior input file found")


If in SPARQL mode, get data from SPARQL endpoint

In [7]:
if mode == SPARQL:
    if not config['endpoint'] or not config['query']:
        raise Exception("incomplete configuration for SPARQL mode")
        
    sparql = SPARQLWrapper(config['endpoint'], returnFormat=JSON)
    sparql.setQuery(config['query'])
    try:
        ret = sparql.query().convert()
    except:
        raise Exception("Could not execute query against endpoint", config['endpoint'])
    queriedData = sparqlResultToDict(ret)

If in SPARQL mode, merge queried data with data stored in CSV file.
- add entries that exist in SPARQL result, but not in the CSV file
- add width/height information when it is only available in either the CSV file or the SPARQL output (prioritising the SPARQL data)
Store merged data in CSV file

In [8]:
data = inputData

if mode == SPARQL:
    inputDataHash = {}
    queriedDataHash = {}

    for row in inputData:
        inputDataHash[row['id']] = row
    for row in queriedData:
        queriedDataHash[row['id']] = row

    idsInInputData = [d['id'] for d in inputData]
    for row in queriedData:
        if row['id'] not in idsInInputData:
            data.append(row)

    for row in data:
        if not row['width']:
            if row['id'] in queriedDataHash and queriedDataHash[row['id']]['width']:
                row['width'] = queriedDataHash[row['id']]['width']
            elif row['id'] in inputDataHash and inputDataHash[row['id']]['width']:
                row['width'] = inputDataHash[row['id']]['width']
        if not row['height']:
            if row['id'] in queriedDataHash and queriedDataHash[row['id']]['height']:
                row['height'] = queriedDataHash[row['id']]['height']
            elif row['id'] in inputDataHash and inputDataHash[row['id']]['width']:
                row['height'] = inputDataHash[row['id']]['height']
    
    writeData(data)

## Step 2: Get (missing) image sizes

If the original image size is not specified, call the IIIF Image API to read the size from the JSON rsponse

In [9]:
for row in tqdm(data):
    if not row['width'] or not row['height']:
        uri = row['image'] + '/info.json'
        try:
            with urllib.request.urlopen(uri) as url:
                manifestData = json.loads(url.read().decode())
                
        except:
            print("Could not open", uri)
            next
        row['width'] = manifestData['width']
        row['height'] = manifestData['height']

 55%|█████▌    | 15126/27450 [00:00<00:00, 151218.32it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-1000691/info.json


 71%|███████   | 19452/27450 [03:04<07:14, 18.39it/s]    

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-1098612/info.json


 72%|███████▏  | 19809/27450 [04:50<20:13,  6.29it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-1344065/info.json


 73%|███████▎  | 19937/27450 [06:10<42:15,  2.96it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-1344568/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-162210/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-162781/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-162857/info.json


 74%|███████▍  | 20323/27450 [07:57<32:14,  3.68it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-477148/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-477149/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-477150/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-477151/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-477152/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-477153/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-477154/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-477155/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-477156/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-477157/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-477158/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-477159/info.json
Could not open https://bso-iiif.swissart

 74%|███████▍  | 20415/27450 [08:06<23:21,  5.02it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-477174/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-477175/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-477176/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-477177/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-477178/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-477179/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-477180/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-477181/info.json


 80%|███████▉  | 21930/27450 [17:29<09:02, 10.18it/s]   

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-479323/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-479324/info.json


 87%|████████▋ | 23786/27450 [29:42<06:03, 10.08it/s]   

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-481966/info.json


 88%|████████▊ | 24291/27450 [32:49<05:11, 10.13it/s]   

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-802241/info.json


 89%|████████▉ | 24515/27450 [34:23<05:26,  9.00it/s]   

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-812808/info.json


 90%|█████████ | 24739/27450 [35:55<04:56,  9.15it/s]   

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-815037/info.json


 90%|█████████ | 24745/27450 [35:56<05:20,  8.45it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-815050/info.json


 90%|█████████ | 24748/27450 [35:56<04:56,  9.12it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-815054/info.json


 90%|█████████ | 24750/27450 [35:56<04:46,  9.41it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-815062/info.json


 90%|█████████ | 24762/27450 [35:58<04:40,  9.60it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-815093/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-815097/info.json


 90%|█████████ | 24767/27450 [35:58<04:31,  9.87it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-815102/info.json


 90%|█████████ | 24775/27450 [35:59<04:31,  9.85it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-815125/info.json


 90%|█████████ | 24834/27450 [36:05<04:29,  9.70it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-815670/info.json


 93%|█████████▎| 25397/27450 [39:24<03:25, 10.01it/s]   

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-822350/info.json


 93%|█████████▎| 25439/27450 [40:35<6:06:42, 10.94s/it] 

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-824913/info.json


 94%|█████████▍| 25742/27450 [42:14<02:48, 10.14it/s]  

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-838092/info.json


 94%|█████████▍| 25756/27450 [42:15<02:48, 10.06it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-838152/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-838155/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-838157/info.json


 94%|█████████▍| 25760/27450 [42:16<02:43, 10.36it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-838160/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-838162/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-838164/info.json


 94%|█████████▍| 25762/27450 [42:16<02:41, 10.45it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-838166/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-838168/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-838170/info.json


 94%|█████████▍| 25764/27450 [42:17<04:39,  6.04it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-838172/info.json


 94%|█████████▍| 25765/27450 [42:17<05:14,  5.37it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-838174/info.json


 94%|█████████▍| 25766/27450 [42:17<05:22,  5.22it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-838176/info.json


 94%|█████████▍| 25767/27450 [42:17<05:49,  4.82it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-838178/info.json


 94%|█████████▍| 25768/27450 [42:18<05:47,  4.83it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-838180/info.json


 94%|█████████▍| 25769/27450 [42:18<06:42,  4.17it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-838182/info.json


 94%|█████████▍| 25881/27450 [42:31<02:46,  9.40it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-841831/info.json


 94%|█████████▍| 25888/27450 [42:32<02:42,  9.62it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-841890/info.json


 95%|█████████▍| 25960/27450 [43:46<07:46,  3.19it/s]  

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-861242/info.json


 95%|█████████▍| 26035/27450 [43:54<02:58,  7.91it/s]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-870419/info.json
Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-870422/info.json


 96%|█████████▋| 26423/27450 [46:50<2:35:58,  9.11s/it]

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-912863/info.json


 97%|█████████▋| 26539/27450 [47:01<01:30, 10.05it/s]  

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-919765/info.json


 97%|█████████▋| 26704/27450 [48:27<03:24,  3.64it/s]  

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-932814/info.json


 99%|█████████▉| 27142/27450 [50:19<00:31,  9.64it/s]  

Could not open https://bso-iiif.swissartresearch.net/iiif/2/nb-960111/info.json


100%|██████████| 27450/27450 [52:04<00:00,  8.79it/s]  


Write data to file

In [10]:
writeData(data)

## Step 3: Download images

Download the images that do not yet exist in the image folder. The images will be downloaded resized to a width of 1024 pixels.

In [11]:
try:
    Path(config['imageDirectory']).mkdir(parents=True, exist_ok=True)
except:
    raise Exception("Could not add/access folder", config['imageDirectory'])

In [15]:
maxRetries = 5
for row in tqdm(data):
    filename = path.join(config['imageDirectory'], row['id'] + '.jpg')
    if not path.exists(filename):
        url = row['image'] + '/full/1024,/0/default.jpg'
        r = requests.get(url, allow_redirects = True)
        retries = 1
        while not 'image' in r.headers['Content-Type'] and retries <= maxRetries:
            # Try again if no image comes back
            time.sleep(1)
            r = requests.get(url, allow_redirects = True)
            retries += 1
        if retries >= maxRetries:
            print("Could not download", row['id'])
        else:
            with open(filename, 'wb') as f:
                f.write(r.content)

 68%|██████▊   | 18557/27450 [00:00<00:00, 97513.30it/s]

Could not download nb-1000691
Could not download nb-1098612


 73%|███████▎  | 19926/27450 [00:19<00:11, 669.54it/s]  

Could not download nb-1344065


 73%|███████▎  | 19934/27450 [00:26<00:16, 448.28it/s]

Could not download nb-1344568
Could not download nb-162210


 73%|███████▎  | 20050/27450 [00:39<00:31, 234.97it/s]

Could not download nb-162781


 73%|███████▎  | 20067/27450 [00:46<00:42, 173.93it/s]

Could not download nb-162857
Could not download nb-477148


 74%|███████▍  | 20389/27450 [01:03<01:12, 97.85it/s] 

Could not download nb-477149
Could not download nb-477150


 74%|███████▍  | 20391/27450 [01:17<01:50, 63.85it/s]

Could not download nb-477151


 74%|███████▍  | 20392/27450 [01:24<02:18, 51.08it/s]

Could not download nb-477152
Could not download nb-477153
Could not download nb-477154


 74%|███████▍  | 20395/27450 [01:44<04:15, 27.62it/s]

Could not download nb-477155


 74%|███████▍  | 20396/27450 [01:51<05:11, 22.66it/s]

Could not download nb-477156
Could not download nb-477157
Could not download nb-477158


 74%|███████▍  | 20399/27450 [02:11<09:08, 12.86it/s]

Could not download nb-477159


 74%|███████▍  | 20400/27450 [02:17<10:59, 10.68it/s]

Could not download nb-477160
Could not download nb-477161


 74%|███████▍  | 20402/27450 [02:31<16:17,  7.21it/s]

Could not download nb-477162


 74%|███████▍  | 20403/27450 [02:38<20:01,  5.86it/s]

Could not download nb-477163
Could not download nb-477164


 74%|███████▍  | 20405/27450 [02:51<30:30,  3.85it/s]

Could not download nb-477165


 74%|███████▍  | 20407/27450 [02:58<37:28,  3.13it/s]

Could not download nb-477167
Could not download nb-477168


 74%|███████▍  | 20409/27450 [03:11<57:18,  2.05it/s]

Could not download nb-477169


 74%|███████▍  | 20410/27450 [03:18<1:10:53,  1.65it/s]

Could not download nb-477170
Could not download nb-477171


 74%|███████▍  | 20412/27450 [03:31<1:47:02,  1.10it/s]

Could not download nb-477172


 74%|███████▍  | 20413/27450 [03:38<2:10:44,  1.11s/it]

Could not download nb-477173
Could not download nb-477174


 74%|███████▍  | 20415/27450 [03:52<3:11:42,  1.64s/it]

Could not download nb-477175


 74%|███████▍  | 20416/27450 [03:59<3:48:58,  1.95s/it]

Could not download nb-477176
Could not download nb-477177
Could not download nb-477178


 74%|███████▍  | 20419/27450 [04:20<5:51:49,  3.00s/it]

Could not download nb-477179


 74%|███████▍  | 20420/27450 [04:27<6:30:32,  3.33s/it]

Could not download nb-477180


 75%|███████▌  | 20699/27450 [04:34<10:28, 10.74it/s]  

Could not download nb-477181


 80%|███████▉  | 21925/27450 [05:06<03:12, 28.77it/s]

Could not download nb-479323


 80%|████████  | 22091/27450 [05:19<04:08, 21.55it/s]

Could not download nb-479324


 87%|████████▋ | 23886/27450 [05:43<01:31, 38.98it/s] 

Could not download nb-481966


 89%|████████▉ | 24436/27450 [06:08<01:25, 35.32it/s]

Could not download nb-802241


 90%|████████▉ | 24656/27450 [06:14<01:12, 38.35it/s]

Could not download nb-812808


 90%|█████████ | 24738/27450 [06:21<01:45, 25.61it/s]

Could not download nb-815037
Could not download nb-815050
Could not download nb-815054


 90%|█████████ | 24749/27450 [06:41<05:16,  8.55it/s]

Could not download nb-815062


 90%|█████████ | 24761/27450 [06:47<06:29,  6.90it/s]

Could not download nb-815093
Could not download nb-815097


 90%|█████████ | 24766/27450 [07:02<10:55,  4.10it/s]

Could not download nb-815102


 90%|█████████ | 24774/27450 [07:08<12:54,  3.46it/s]

Could not download nb-815125


 91%|█████████ | 24988/27450 [07:15<03:13, 12.71it/s]

Could not download nb-815670


 93%|█████████▎| 25396/27450 [07:22<01:12, 28.28it/s]

Could not download nb-822350


 93%|█████████▎| 25631/27450 [07:28<00:51, 35.15it/s]

Could not download nb-824913


 94%|█████████▍| 25740/27450 [07:35<01:04, 26.70it/s]

Could not download nb-838092
Could not download nb-838152
Could not download nb-838155


 94%|█████████▍| 25757/27450 [07:55<02:57,  9.54it/s]

Could not download nb-838157


 94%|█████████▍| 25758/27450 [08:02<03:52,  7.28it/s]

Could not download nb-838160
Could not download nb-838162
Could not download nb-838164


 94%|█████████▍| 25761/27450 [08:22<07:40,  3.66it/s]

Could not download nb-838166


 94%|█████████▍| 25762/27450 [08:28<09:26,  2.98it/s]

Could not download nb-838168
Could not download nb-838170


 94%|█████████▍| 25764/27450 [08:42<14:15,  1.97it/s]

Could not download nb-838172


 94%|█████████▍| 25765/27450 [08:48<17:32,  1.60it/s]

Could not download nb-838174
Could not download nb-838176


 94%|█████████▍| 25767/27450 [09:02<26:16,  1.07it/s]

Could not download nb-838178


 94%|█████████▍| 25768/27450 [09:08<31:56,  1.14s/it]

Could not download nb-838180


 94%|█████████▍| 25783/27450 [09:15<23:18,  1.19it/s]

Could not download nb-838182


 94%|█████████▍| 25880/27450 [09:22<05:49,  4.49it/s]

Could not download nb-841831


 94%|█████████▍| 25888/27450 [09:28<07:09,  3.64it/s]

Could not download nb-841890


 95%|█████████▍| 25959/27450 [09:36<04:37,  5.38it/s]

Could not download nb-861242


 95%|█████████▍| 26035/27450 [09:42<03:21,  7.02it/s]

Could not download nb-870419


 95%|█████████▌| 26131/27450 [09:49<02:03, 10.69it/s]

Could not download nb-870422


 96%|█████████▋| 26464/27450 [09:57<00:45, 21.83it/s]

Could not download nb-912863


 97%|█████████▋| 26676/27450 [10:05<00:27, 28.60it/s]

Could not download nb-919765


 99%|█████████▉| 27225/27450 [10:11<00:04, 45.63it/s] 

Could not download nb-960111


100%|██████████| 27450/27450 [10:49<00:00, 42.25it/s]


## Step 4: Apply model

Apply the model. This step is based on the code provided in the DH Segment example at https://github.com/dhlab-epfl/dhSegment/blob/master/demo.py

In [16]:
import os
from glob import glob

import cv2
import numpy as np
import tensorflow as tf
from imageio import imread, imsave
from tqdm import tqdm

from dh_segment.io import PAGE
from dh_segment.inference import LoadedModel
from dh_segment.post_processing import boxes_detection, binarization

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [17]:
def page_make_binary_mask(probs: np.ndarray, threshold: float=-1) -> np.ndarray:
    """
    Computes the binary mask of the detected Page from the probabilities outputed by network
    :param probs: array with values in range [0, 1]
    :param threshold: threshold between [0 and 1], if negative Otsu's adaptive threshold will be used
    :return: binary mask
    """

    mask = binarization.thresholding(probs, threshold)
    mask = binarization.cleaning_binary(mask, kernel_size=5)
    return mask


def format_quad_to_string(quad):
    """
    Formats the corner points into a string.
    :param quad: coordinates of the quadrilateral
    :return:
    """
    s = ''
    for corner in quad:
        s += '{},{},'.format(corner[0], corner[1])
    return s[:-1]


In [18]:
modelDir = '../pretrained_models/bso_model/'

In [None]:
with tf.Session():
    # Load model
    m = LoadedModel(modelDir, predict_mode='filename')
    for row in tqdm(data):
        if not row['documentCoordinates'] or len(row['documentCoordinates']) == 0:
            filename = path.join(config['imageDirectory'], row['id'] + '.jpg')
            if not path.isfile(filename):
                print("File does not exist:", filename)
            else:
                # For each image, predict each pixel's label
                prediction_outputs = m.predict(filename)
                probs = prediction_outputs['probs'][0]
                probs = probs[:, :, 2]  # Take only class '2' (class 0 is the background, class 1 is the document, class 2 is the image)
                probs = probs / np.max(probs)  # Normalize to be in [0, 1]

                # Binarize the predictions
                page_bin = page_make_binary_mask(probs)

                # Upscale to have full resolution image (cv2 uses (w,h) and not (h,w) for giving shapes)        
                original_shape = prediction_outputs['original_shape']
                original_size = tuple(original_shape[::-1])
                original_size = (round(original_size[0] / 1024 * int(row['width'])), round(original_size[1] / 1024 * int(row['height'])))
                bin_upscaled = cv2.resize(page_bin.astype(np.uint8, copy=False),
                                          original_size, interpolation=cv2.INTER_NEAREST)

                # Find quadrilateral enclosing the page
                pred_page_coords = boxes_detection.find_boxes(bin_upscaled.astype(np.uint8, copy=False),
                                                              mode='min_rectangle', min_area=0.2, n_max_boxes=1)

                # Rescale coordinates
                if pred_page_coords is not None:
                    row['documentCoordinates'] = format_quad_to_string(pred_page_coords)

                    # Store coordinates in data after every prediction
                    writeData(data)

Loading ../pretrained_models/bso_model/
INFO:tensorflow:Restoring parameters from ../pretrained_models/bso_model/variables/variables


 61%|██████    | 16780/27450 [03:28<03:37, 49.09it/s] 

File does not exist: ../data/images/nb-1000691.jpg


 71%|███████   | 19360/27450 [03:54<02:35, 52.12it/s] 

## Step 5: Output as CIDOC-CRM RDF

Output as a Trig file that can be displayed and edited in the Mirador component of ResearchSpace & Metaphacts

In [35]:
namespaces = """
@prefix Platform: <http://www.metaphacts.com/ontologies/platform#> .
@prefix User: <http://www.metaphacts.com/resource/user/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix crmdig: <http://www.ics.forth.gr/isl/CRMdig/> .
@prefix rso: <http://www.researchspace.org/ontology/> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix ldp: <http://www.w3.org/ns/ldp#> .
@prefix crm: <http://www.cidoc-crm.org/cidoc-crm/>.
"""

regionTemplate = Template('''<$uri/container/context> {
  Platform:formContainer ldp:contains <$uri/container> .
  
  <$uri>
    a crmdig:D35_Area, rso:EX_Digital_Image_Region;
    crmdig:L49_is_primary_area_of <$iiifImage>;
    rso:boundingBox "xywh=$x,$y,$w,$h";
    rso:displayLabel "image";
    rso:viewport "xywh=0,0,0,0";
    rdf:value "<svg xmlns='http://www.w3.org/2000/svg'><path xmlns=\\"http://www.w3.org/2000/svg\\" d=\\"M${x0},${y0}l${halfW},0l0,0l${halfW},0l 0,${halfH}l 0,${halfH}l -${halfW},0l -${halfW},0l 0,-${halfH}z\\" data-paper-data=\\"{&quot;defaultStrokeValue&quot;:1,&quot;editStrokeValue&quot;:5,&quot;currentStrokeValue&quot;:1,&quot;rotation&quot;:0,&quot;deleteIcon&quot;:null,&quot;rotationIcon&quot;:null,&quot;group&quot;:null,&quot;editable&quot;:true,&quot;annotation&quot;:null}\\" id=\\"rectangle_e880ad36-1fef-4ce3-835d-716ba7db628a\\" fill-opacity=\\"0\\" fill=\\"#00bfff\\" fill-rule=\\"nonzero\\" stroke=\\"#00bfff\\" stroke-width=\\"4.04992\\" stroke-linecap=\\"butt\\" stroke-linejoin=\\"miter\\" stroke-miterlimit=\\"10\\" stroke-dasharray=\\"\\" stroke-dashoffset=\\"0\\" font-family=\\"none\\" font-weight=\\"none\\" font-size=\\"none\\" text-anchor=\\"none\\" style=\\"mix-blend-mode: normal\\"/></svg>" .
  
  <$uri/container>
    a ldp:Resource, prov:Entity;
    prov:generatedAtTime "$dateTime"^^xsd:dateTime;
    prov:wasAttributedTo User:admin .
}

<https://platform.swissartresearch.net/documentRegions> {
    <$uri> crm:P2_has_type <https://resource.swissartresearch.net/type/documentRegion> .
}

''')

In [36]:
dateTime = datetime.now().strftime("%Y-%m-%dT%H:%M:%Sz")

output = namespaces

missingDocumentCoordinates = []

for row in tqdm(data):
    if row['documentCoordinates'] is not None:
        docCoords = row['documentCoordinates'].split(',')
        if len(docCoords) < 8:
            missingDocumentCoordinates.append(row)
            continue

        xCoords = [int(docCoords[0]), int(docCoords[2]), int(docCoords[4]), int(docCoords[6])]
        yCoords = [int(docCoords[1]), int(docCoords[3]), int(docCoords[5]), int(docCoords[7])]
        edges = {
            "topLeft": (min(xCoords), min(yCoords)),
            "topRight": (max(xCoords), min(yCoords)),
            "bottomRight": (max(xCoords), max(yCoords)),
            "bottomLeft": (min(xCoords), max(yCoords))
        }
        iiifImage = row['image']
        identifier = str(uuid.uuid3(uuid.NAMESPACE_DNS, iiifImage))
        uri = "https://resource.swissartresearch.net/digitalobject/" + identifier
        x0 = edges['topLeft'][0]
        y0 = edges['topLeft'][1]
        x1 = edges['bottomRight'][0]
        y1 = edges['bottomRight'][1]
        x = x0
        y = y0
        w = x1 - x0
        h = y1 - y0
        output += regionTemplate.substitute(
            uri=uri,
            iiifImage=iiifImage,
            x=int(x),
            y=int(y),
            w=int(w),
            h=int(h),
            x0=x0,
            y0=y0,
            halfW=float(w/2),
            halfH=float(h/2),
            dateTime=dateTime
        )

# Write summary of missing corodinates
if len(missingDocumentCoordinates) > 0:
    print("Could not detect coordinates in %d images:" % len(missingDocumentCoordinates))
    print('\n'.join([d['id'] for d in missingDocumentCoordinates]))
    
filename = path.join(config['trigFile'])
with open(filename, 'w') as f:
    f.write(output)

100%|██████████| 27450/27450 [00:01<00:00, 24820.21it/s]


Could not detect coordinates in 39 images:
zbz-990100839710205508
zbz-990101416840205508
zbz-990101521100205508
zbz-990102101150205508
zbz-990102101150205508
zbz-990102782640205508
zbz-990103614330205508
zbz-990104876180205508
zbz-990105616830205508
zbz-990106138050205508
zbz-990106138320205508
zbz-990106141270205508
zbz-990106262790205508
zbz-990106263300205508
zbz-990107379980205508
zbz-990107401490205508
zbz-990107405070205508
zbz-990107405210205508
zbz-990107405300205508
zbz-990107454510205508
zbz-990107778670205508
zbz-990108007420205508
zbz-990108007920205508
zbz-990108008120205508
zbz-990108014810205508
zbz-990108015010205508
zbz-990108015040205508
zbz-990108015120205508
zbz-990108015130205508
zbz-990108978310205508
zbz-990109329750205508
zbz-990109597470205508
zbz-990110251690205508
zbz-990110251830205508
zbz-990110251900205508
zbz-990110252020205508
zbz-990110252050205508
zbz-990110607760205508
zbz-990110608440205508
