In [1]:
import csv
import json
import urllib.request
import requests
import uuid
import yaml
from os import path
from pathlib import Path
from string import Template
from SPARQLWrapper import SPARQLWrapper, JSON
from datetime import datetime
from tqdm import tqdm

In [2]:
configFile = '../pipeline/config.yml'

### Load Configuration

In [3]:
try:
    with open(configFile, 'r') as f:
        config = yaml.safe_load(f)
except:
    raise Exception("Could not load config file at", configFile)

### Define helpers

In [4]:
# Constants
SPARQL = 0
CSV = 1

def sparqlResultToDict(results):
    rows = []
    for result in results["results"]["bindings"]:
        row = {}
        for key in results["head"]["vars"]:
            if key in result:
                row[key] = result[key]["value"]
            else:
                row[key] = None
        rows.append(row)
    return rows

def writeData(data):
    try:
        with open(config['dataFile'], 'w') as f:
            writer = csv.DictWriter(f, fieldnames=['id','image','width','height','documentCoordinates'])
            writer.writeheader()
            for row in data:
                if not 'documentCoordinates' in row:
                    row['documentCoordinates'] = None
                writer.writerow(row)
    except:
        raise Exception("Could not write to", config['dataFile'])

## Step 1: Get input data

In [5]:
mode = False
if config['mode'] == "SPARQL":
    mode = SPARQL
elif config['mode'] == "CSV":
    mode  = CSV
else:
    raise Exception("mode not specified or invalid (should be SPARQL or CSV)")

Read data from input file, if present. This is being done for both CSV and SPARQL mode as the SPARQL results will be cashed in the CSV file and updated when data is changed.

In [6]:
inputData = []
try:
    with open(config['dataFile'], 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            inputData.append({
                "id": row['id'],
                "image": row['image'],
                "width": row['width'],
                "height": row['height'],
                "documentCoordinates": row['documentCoordinates'] if 'documentCoordinates' in row else None
            })
except:
    print("No prior input file found")


If in SPARQL mode, get data from SPARQL endpoint

In [7]:
if mode == SPARQL:
    if not config['endpoint'] or not config['query']:
        raise Exception("incomplete configuration for SPARQL mode")
        
    sparql = SPARQLWrapper(config['endpoint'], returnFormat=JSON)
    sparql.setQuery(config['query'])
    try:
        ret = sparql.query().convert()
    except:
        raise Exception("Could not execute query against endpoint", config['endpoint'])
    queriedData = sparqlResultToDict(ret)

If in SPARQL mode, merge queried data with data stored in CSV file.
- add entries that exist in SPARQL result, but not in the CSV file
- add width/height information when it is only available in either the CSV file or the SPARQL output (prioritising the SPARQL data)
Store merged data in CSV file

In [8]:
data = inputData

if mode == SPARQL:
    inputDataHash = {}
    queriedDataHash = {}

    for row in inputData:
        inputDataHash[row['id']] = row
    for row in queriedData:
        queriedDataHash[row['id']] = row

    idsInInputData = [d['id'] for d in inputData]
    for row in queriedData:
        if row['id'] not in idsInInputData:
            data.append(row)

    for row in data:
        if not row['width']:
            if row['id'] in queriedDataHash and queriedDataHash[row['id']]['width']:
                row['width'] = queriedDataHash[row['id']]['width']
            elif row['id'] in inputDataHash and inputDataHash[row['id']]['width']:
                row['width'] = inputDataHash[row['id']]['width']
        if not row['height']:
            if row['id'] in queriedDataHash and queriedDataHash[row['id']]['height']:
                row['height'] = queriedDataHash[row['id']]['height']
            elif row['id'] in inputDataHash and inputDataHash[row['id']]['width']:
                row['height'] = inputDataHash[row['id']]['height']
    
    writeData(data)

## Step 2: Get (missing) image sizes

If the original image size is not specified, call the IIIF Image API to read the size from the JSON rsponse

In [9]:
for row in tqdm(data):
    if not row['width'] or not row['height']:
        uri = row['image'] + '/info.json'
        try:
            with urllib.request.urlopen(uri) as url:
                manifestData = json.loads(url.read().decode())
                
        except:
            print("Could not open", uri)
            next
        row['width'] = manifestData['width']
        row['height'] = manifestData['height']

100%|██████████| 100/100 [00:00<00:00, 128817.69it/s]


Write data to file

In [10]:
writeData(data)

## Step 3: Download images

Download the images that do not yet exist in the image folder. The images will be downloaded resized to a width of 1024 pixels.

In [11]:
try:
    Path(config['imageDirectory']).mkdir(parents=True, exist_ok=True)
except:
    raise Exception("Could not add/access folder", config['imageDirectory'])

In [12]:
maxRetries = 5
for row in tqdm(data):
    filename = path.join(config['imageDirectory'], row['id'] + '.jpg')
    if not path.exists(filename):
        url = row['image'] + '/full/1024,/0/default.jpg'
        r = requests.get(url, allow_redirects = True)
        retries = 1
        while not 'image' in r.headers['Content-Type'] and retries <= maxRetries:
            # Try again if no image comes back
            time.sleep(1)
            r = requests.get(url, allow_redirects = True)
            retries += 1
        if retries >= maxRetries:
            raise Exception("Could not download", row['id'])
        else:
            with open(filename, 'wb') as f:
                f.write(r.content)

100%|██████████| 100/100 [00:00<00:00, 30073.16it/s]


## Step 4: Apply model

Apply the model. This step is based on the code provided in the DH Segment example at https://github.com/dhlab-epfl/dhSegment/blob/master/demo.py

In [13]:
import os
from glob import glob

import cv2
import numpy as np
import tensorflow as tf
from imageio import imread, imsave
from tqdm import tqdm

from dh_segment.io import PAGE
from dh_segment.inference import LoadedModel
from dh_segment.post_processing import boxes_detection, binarization

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [14]:
def page_make_binary_mask(probs: np.ndarray, threshold: float=-1) -> np.ndarray:
    """
    Computes the binary mask of the detected Page from the probabilities outputed by network
    :param probs: array with values in range [0, 1]
    :param threshold: threshold between [0 and 1], if negative Otsu's adaptive threshold will be used
    :return: binary mask
    """

    mask = binarization.thresholding(probs, threshold)
    mask = binarization.cleaning_binary(mask, kernel_size=5)
    return mask


def format_quad_to_string(quad):
    """
    Formats the corner points into a string.
    :param quad: coordinates of the quadrilateral
    :return:
    """
    s = ''
    for corner in quad:
        s += '{},{},'.format(corner[0], corner[1])
    return s[:-1]


In [15]:
modelDir = '../pretrained_models/bso_model/'

In [16]:
with tf.Session():
    # Load model
    m = LoadedModel(modelDir, predict_mode='filename')
    for row in tqdm(data):
        if not row['documentCoordinates'] or len(row['documentCoordinates']) == 0:
            filename = path.join(config['imageDirectory'], row['id'] + '.jpg')
             # For each image, predict each pixel's label
            prediction_outputs = m.predict(filename)
            probs = prediction_outputs['probs'][0]
            probs = probs[:, :, 2]  # Take only class '2' (class 0 is the background, class 1 is the document, class 2 is the image)
            probs = probs / np.max(probs)  # Normalize to be in [0, 1]

            # Binarize the predictions
            page_bin = page_make_binary_mask(probs)

            # Upscale to have full resolution image (cv2 uses (w,h) and not (h,w) for giving shapes)        
            original_shape = prediction_outputs['original_shape']
            original_size = tuple(original_shape[::-1])
            original_size = (round(original_size[0] / 1024 * int(row['width'])), round(original_size[1] / 1024 * int(row['height'])))
            bin_upscaled = cv2.resize(page_bin.astype(np.uint8, copy=False),
                                      original_size, interpolation=cv2.INTER_NEAREST)

            # Find quadrilateral enclosing the page
            pred_page_coords = boxes_detection.find_boxes(bin_upscaled.astype(np.uint8, copy=False),
                                                          mode='min_rectangle', min_area=0.2, n_max_boxes=1)

            # Rescale coordinates
            row['documentCoordinates'] = format_quad_to_string(pred_page_coords)

Loading ../pretrained_models/bso_model/
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.loader.load or tf.compat.v1.saved_model.load. There will be a new function for importing SavedModels in Tensorflow 2.0.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from ../pretrained_models/bso_model/variables/variables


100%|██████████| 100/100 [05:03<00:00,  3.03s/it]


Store coordinates in data

In [17]:
writeData(data)

## Step 5: Output as CIDOC-CRM RDF

Output as Trig files that can be displayed and edited in the Mirador component of ResearchSpace & Metaphact

In [18]:
regionTemplate = Template('''
@prefix Platform: <http://www.metaphacts.com/ontologies/platform#> .
@prefix User: <http://www.metaphacts.com/resource/user/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix crmdig: <http://www.ics.forth.gr/isl/CRMdig/> .
@prefix rso: <http://www.researchspace.org/ontology/> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix ldp: <http://www.w3.org/ns/ldp#> .
@prefix crm: <http://www.cidoc-crm.org/cidoc-crm/>.


<$uri/container/context> {
  Platform:formContainer ldp:contains <$uri/container> .
  
  <$uri>
    a crmdig:D35_Area, rso:EX_Digital_Image_Region;
    crm:P2_has_type <https://resource.swissartresearch.net/type/documentRegion> ;
    crmdig:L49_is_primary_area_of <$iiifImage>;
    rso:boundingBox "xywh=$x,$y,$w,$h";
    rso:displayLabel "image";
    rso:viewport "xywh=0,0,0,0";
    rdf:value "<svg xmlns='http://www.w3.org/2000/svg'><path xmlns=\\"http://www.w3.org/2000/svg\\" d=\\"M${x0},${y0}L${x1},${y0}L${x1},${y1}L${x0},${y1}z\\" data-paper-data=\\"{&quot;defaultStrokeValue&quot;:1,&quot;editStrokeValue&quot;:5,&quot;currentStrokeValue&quot;:1,&quot;rotation&quot;:0,&quot;deleteIcon&quot;:null,&quot;rotationIcon&quot;:null,&quot;group&quot;:null,&quot;editable&quot;:true,&quot;annotation&quot;:null}\\" id=\\"rectangle_e880ad36-1fef-4ce3-835d-716ba7db628a\\" fill-opacity=\\"0\\" fill=\\"#00bfff\\" fill-rule=\\"nonzero\\" stroke=\\"#00bfff\\" stroke-width=\\"4.04992\\" stroke-linecap=\\"butt\\" stroke-linejoin=\\"miter\\" stroke-miterlimit=\\"10\\" stroke-dasharray=\\"\\" stroke-dashoffset=\\"0\\" font-family=\\"none\\" font-weight=\\"none\\" font-size=\\"none\\" text-anchor=\\"none\\" style=\\"mix-blend-mode: normal\\"/></svg>" .
  
  <$uri/container>
    a ldp:Resource, prov:Entity;
    prov:generatedAtTime "$dateTime"^^xsd:dateTime;
    prov:wasAttributedTo User:admin .
}
''')

In [19]:
try:
    Path(config['trigDirectory']).mkdir(parents=True, exist_ok=True)
except:
    raise Exception("Could not add/access folder", config['trigDirectory'])

In [20]:
dateTime = datetime.now().strftime("%Y-%m-%dT%H:%M:%Sz")

for row in tqdm(data):
    docCoords = row['documentCoordinates'].split(',')
    xCoords = [int(docCoords[0]), int(docCoords[2]), int(docCoords[4]), int(docCoords[6])]
    yCoords = [int(docCoords[1]), int(docCoords[3]), int(docCoords[5]), int(docCoords[7])]
    edges = {
        "topLeft": (min(xCoords), min(yCoords)),
        "topRight": (max(xCoords), min(yCoords)),
        "bottomRight": (max(xCoords), max(yCoords)),
        "bottomLeft": (min(xCoords), max(yCoords))
    }
    iiifImage = row['image']
    identifier = str(uuid.uuid3(uuid.NAMESPACE_DNS, iiifImage))
    uri = "https://resource.swissartresearch.net/digitalobject/" + identifier
    x0 = edges['topLeft'][0]
    y0 = edges['topLeft'][1]
    x1 = edges['bottomRight'][0]
    y1 = edges['bottomRight'][1]
    x = x0
    y = y0
    w = x1 - x0
    h = y1 - y0
    output = regionTemplate.substitute(
        uri=uri,
        iiifImage=iiifImage,
        x=int(x),
        y=int(y),
        w=int(w),
        h=int(h),
        x0=x0,
        y0=y0,
        x1=x1,
        y1=y1,
        dateTime=dateTime
    )
    filename = path.join(config['trigDirectory'], identifier + ".trig")
    with open(filename, 'w') as f:
        f.write(output)

100%|██████████| 100/100 [00:00<00:00, 394.61it/s]


In [21]:
edges

{'topLeft': (94, 97),
 'topRight': (1812, 97),
 'bottomRight': (1812, 1418),
 'bottomLeft': (94, 1418)}