In [1]:
import csv
import json
import urllib.request
import uuid
from datetime import datetime, timezone
from os.path import join
from pathlib import Path
from string import Template

from PIL import Image
from tqdm import tqdm

In [2]:
# Input: CSV with a header row describing the source images.
imageCSV = "../data/images.csv"
# Input: comma-separated predictions, one region per row.
predictionsCSV = "../output/pages.txt"
# Output: directory that receives the generated .trig files.
outputDirectory = "../output/rdf/"

In [3]:
# Read the image CSV into a list with one dict per row, keyed by the header names.
with open(imageCSV, 'r') as csvFile:
    images = list(csv.DictReader(csvFile))

In [4]:
# Parse the predictions file: column 0 is the image path, columns 1-8 are
# four (x, y) corner pairs. Each region is normalised to its axis-aligned
# bounding box, so the corner order in the file does not matter.
X_COLUMNS = (1, 3, 5, 7)
Y_COLUMNS = (2, 4, 6, 8)

predictions = []
with open(predictionsCSV, 'r') as predictionsFile:
    for record in csv.reader(predictionsFile):
        xs = [int(record[column]) for column in X_COLUMNS]
        ys = [int(record[column]) for column in Y_COLUMNS]
        left, right = min(xs), max(xs)
        top, bottom = min(ys), max(ys)
        predictions.append({
            "image": record[0],
            "topLeft": (left, top),
            "topRight": (right, top),
            "bottomRight": (right, bottom),
            "bottomLeft": (left, bottom)
        })


In [5]:
# Record the pixel size of the local image each prediction was made on.
# These dimensions are the reference frame of the predicted coordinates.
for p in tqdm(predictions):
    # Use the context manager so each file handle is released immediately;
    # otherwise thousands of images stay open until garbage collection.
    with Image.open(join('..', p['image'])) as image:
        p['imageSize'] = image.size

100%|██████████| 26733/26733 [00:14<00:00, 1784.88it/s]


In [6]:
# Express each bounding box as "pct:x,y,w,h": offset and extent as
# percentages of the local image size, with one decimal place.
for p in predictions:
    imageWidth, imageHeight = p['imageSize'][0], p['imageSize'][1]
    left, top = p['topLeft'][0], p['topLeft'][1]
    right, bottom = p['bottomRight'][0], p['bottomRight'][1]
    percentages = (
        left / imageWidth * 100,
        top / imageHeight * 100,
        (right - left) / imageWidth * 100,
        (bottom - top) / imageHeight * 100,
    )
    p['pct'] = "pct:%.1f,%.1f,%.1f,%.1f" % percentages

In [7]:
# Derive the IIIF base URL of every image. Ids containing 'nb-' are mapped
# onto the bso-iiif server; all others reuse their 'image' URL up to '/full/'.
for image in images:
    image['iiif'] = (
        "https://bso-iiif.swissartresearch.net/iiif/2/" + image['id']
        if 'nb-' in image['id']
        else image['image'].split('/full/')[0]
    )

In [8]:
# Index the image rows by id for constant-time lookup; if an id occurs more
# than once, the last row wins.
imagesById = {record['id']: record for record in images}

In [9]:
# Attach a copy of the matching image row to every prediction. The image id
# is the part of the prediction's path between '/images/' and '.jpg'.
for p in predictions:
    imageId = p['image'].split('/images/')[1].split('.jpg')[0]
    imageRow = imagesById.get(imageId)
    if imageRow is None:
        # Report only the expected failure (unknown id). Unlike a bare
        # `except:`, this lets real errors and Ctrl-C propagate.
        print("Could not find", imageId)
        continue
    # Copy the row so that predictions sharing an image do not share a dict.
    p['imageData'] = dict(imageRow)

In [12]:
#predictions[:10]
#selection = [d for d in predictions if '1717326' in d['imageData']['iiif']]

In [None]:
# Look up the full-resolution dimensions of every image from the JSON
# document served at its IIIF base URL and store them as
# p['originalSize'] = (width, height).
sizeByUri = {}  # fetch each distinct URI once, even when predictions share it
for p in tqdm(predictions):
    imageData = p.get('imageData')
    if imageData is None:
        # No image row was attached to this prediction, so there is nothing to query.
        continue
    uri = imageData['iiif']
    if uri not in sizeByUri:
        try:
            with urllib.request.urlopen(uri) as response:
                data = json.loads(response.read().decode())
        except Exception:
            # Best-effort: report and move on. `Exception` (not a bare
            # `except:`) keeps KeyboardInterrupt able to stop the long loop.
            print("Could not open", uri)
            # `continue` is required here: the original bare `next` was a
            # no-op, so a failed request fell through and reused the previous
            # image's `data` (or raised NameError on the first iteration).
            continue
        sizeByUri[uri] = (data['width'], data['height'])
    p['originalSize'] = sizeByUri[uri]

  0%|          | 5/26733 [00:02<3:12:41,  2.31it/s]

In [None]:
# string.Template for one image-region document (written to a .trig file by
# the export loop). Placeholders filled in by `substitute`:
#   $uri         - URI of the region; also used for its container and context
#   $iiifImage   - IIIF URL of the image the region belongs to
#   $x $y $w $h  - bounding box written as "xywh=..."
#   ${x0} ${y0} ${x1} ${y1} - corner coordinates of the rectangular SVG path
#   $dateTime    - value of prov:generatedAtTime (typed xsd:dateTime)
# The `\\"` sequences are deliberate: Python turns them into `\"`, which
# escapes the quotes inside the double-quoted rdf:value literal.
# NOTE(review): the SVG `id` attribute is a fixed value shared by every
# generated region - confirm that consumers do not need it to be unique.
regionTemplate = Template('''
@prefix Platform: <http://www.metaphacts.com/ontologies/platform#> .
@prefix User: <http://www.metaphacts.com/resource/user/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix crmdig: <http://www.ics.forth.gr/isl/CRMdig/> .
@prefix rso: <http://www.researchspace.org/ontology/> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix ldp: <http://www.w3.org/ns/ldp#> .
@prefix crm: <http://www.cidoc-crm.org/cidoc-crm/>.


<$uri/container/context> {
  Platform:formContainer ldp:contains <$uri/container> .
  
  <$uri>
    a crmdig:D35_Area, rso:EX_Digital_Image_Region;
    crm:P2_has_type <https://resource.swissartresearch.net/type/documentRegion> ;
    crmdig:L49_is_primary_area_of <$iiifImage>;
    rso:boundingBox "xywh=$x,$y,$w,$h";
    rso:displayLabel "image";
    rso:viewport "xywh=0,0,0,0";
    rdf:value "<svg xmlns='http://www.w3.org/2000/svg'><path xmlns=\\"http://www.w3.org/2000/svg\\" d=\\"M${x0},${y0}L${x1},${y0}L${x1},${y1}L${x0},${y1}z\\" data-paper-data=\\"{&quot;defaultStrokeValue&quot;:1,&quot;editStrokeValue&quot;:5,&quot;currentStrokeValue&quot;:1,&quot;rotation&quot;:0,&quot;deleteIcon&quot;:null,&quot;rotationIcon&quot;:null,&quot;group&quot;:null,&quot;editable&quot;:true,&quot;annotation&quot;:null}\\" id=\\"rectangle_e880ad36-1fef-4ce3-835d-716ba7db628a\\" fill-opacity=\\"0\\" fill=\\"#00bfff\\" fill-rule=\\"nonzero\\" stroke=\\"#00bfff\\" stroke-width=\\"4.04992\\" stroke-linecap=\\"butt\\" stroke-linejoin=\\"miter\\" stroke-miterlimit=\\"10\\" stroke-dasharray=\\"\\" stroke-dashoffset=\\"0\\" font-family=\\"none\\" font-weight=\\"none\\" font-size=\\"none\\" text-anchor=\\"none\\" style=\\"mix-blend-mode: normal\\"/></svg>" .
  
  <$uri/container>
    a ldp:Resource, prov:Entity;
    prov:generatedAtTime "$dateTime"^^xsd:dateTime;
    prov:wasAttributedTo User:admin .
}
''')

In [None]:
# Write one .trig file per prediction, with the bounding box rescaled from
# the local image's pixel grid to the full-resolution IIIF image.

# xsd:dateTime only accepts an upper-case "Z" zone designator, and "Z" means
# UTC, so the timestamp is taken in UTC instead of local time.
dateTime = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

# Create the target directory up front so a fresh checkout does not fail on
# the first write.
Path(outputDirectory).mkdir(parents=True, exist_ok=True)

for p in tqdm(predictions):
    # Earlier cells are best-effort and may leave a prediction without its
    # image row or remote size; skip those instead of dying on a KeyError.
    if 'imageData' not in p or 'originalSize' not in p:
        print("Skipping incomplete prediction for", p['image'])
        continue

    iiifImage = p['imageData']['iiif']
    # NOTE(review): the identifier is derived from the image URL alone, so
    # two predictions on the same image get the same URI and file name and
    # the later one overwrites the earlier - confirm one region per image.
    identifier = str(uuid.uuid3(uuid.NAMESPACE_DNS, iiifImage))
    uri = "https://resource.swissartresearch.net/digitalobject/" + identifier

    # Rescale the corners: local pixels -> fraction of image -> IIIF pixels.
    x0 = p['topLeft'][0] / p['imageSize'][0] * p['originalSize'][0]
    y0 = p['topLeft'][1] / p['imageSize'][1] * p['originalSize'][1]
    x1 = p['bottomRight'][0] / p['imageSize'][0] * p['originalSize'][0]
    y1 = p['bottomRight'][1] / p['imageSize'][1] * p['originalSize'][1]
    x = x0
    y = y0
    w = x1 - x0
    h = y1 - y0

    output = regionTemplate.substitute(
        uri=uri,
        iiifImage=iiifImage,
        # The xywh bounding box uses whole pixels; the SVG path keeps floats.
        x=int(x),
        y=int(y),
        w=int(w),
        h=int(h),
        x0=x0,
        y0=y0,
        x1=x1,
        y1=y1,
        dateTime=dateTime
    )
    filename = join(outputDirectory, identifier + ".trig")
    # Write UTF-8 explicitly rather than relying on the platform default.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(output)