In [7]:
import re
import requests
from PIL import Image
from urllib import request
from os import path
from configparser import ConfigParser
from hashlib import blake2b
from string import Template
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm

In [8]:
# Properties file that getThumbnailQueries reads the `preferredThumbnails` entry from.
propsFile = "../services/researchplatform/apps/bso/config/ui.prop"
# Local directory the downloaded thumbnail files are written to.
outputDir = "../services/researchplatform/apps/static/assets/no_auth"

# SPARQL endpoint used both for querying thumbnails and for ingesting the triples.
endpoint = "http://localhost:7776/blazegraph/sparql"
# Named graph handed to the endpoint as `context-uri` on ingest.
namedGraph = "https://resource.swissartresearch.net/graph/thumbnails"

# Base URL, file name prefix and predicate used to build the thumbnail triples.
thumbnailLocation = "http://localhost:7778/assets/no_auth"
thumbnailPrefix = "thumbnail-"
thumbnailPredicate = "http://schema.org/thumbnail"

In [9]:
def downloadAsThumbnail(*, url, directory, prefix, targetWidth=400):
    """Download an image and store it as a JPEG thumbnail, unless already present.

    The file name is derived from ``url`` via ``generateFilename``. If a file
    with that name already exists in ``directory``, nothing is downloaded and
    the existing path is returned.

    Args:
        url: Location of the source image.
        directory: Directory the thumbnail is written to.
        prefix: File name prefix passed on to ``generateFilename``.
        targetWidth: Maximum width in pixels. Wider images are scaled down
            proportionally; narrower images keep their dimensions.

    Returns:
        The path of the thumbnail file.

    Raises:
        Any exception raised by ``urlretrieve`` or by Pillow while opening,
        converting or saving the image. On such a failure the downloaded
        file is removed so a later run retries instead of treating it as done.
    """
    import os  # local import: only ``os.path`` is imported at notebook level

    # Honour the ``directory`` argument rather than the notebook-level outputDir.
    filepath = path.join(directory, generateFilename(url, prefix))
    if path.exists(filepath):
        return filepath

    request.urlretrieve(url, filepath)
    try:
        # The context manager closes the underlying file; resize()/copy()
        # both return images that no longer depend on it.
        with Image.open(filepath, 'r') as source:
            width, height = source.size
            if width > targetWidth:
                scaledHeight = max(1, int(height / width * targetWidth))
                img = source.resize((targetWidth, scaledHeight))
            else:
                img = source.copy()
        # The JPEG writer rejects palette/alpha modes (P, RGBA, LA, ...), so
        # convert anything it cannot encode directly.
        if img.mode not in ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr'):
            img = img.convert('RGB')
        img.save(filepath, 'jpeg', quality=75, optimize=True)
    except Exception:
        # Do not leave an unprocessed download behind: the exists() check
        # above would otherwise accept it as a valid thumbnail next time.
        if path.exists(filepath):
            os.remove(filepath)
        raise

    return filepath

def downloadAll(*,data,directory,prefix):
    """Download a thumbnail for every entry of ``data``, showing a progress bar.

    Args:
        data: Iterable of mappings; each must provide a ``'thumbnail'`` URL.
        directory: Passed through to ``downloadAsThumbnail``.
        prefix: Passed through to ``downloadAsThumbnail``.
    """
    progress = tqdm(data)
    for entry in progress:
        downloadAsThumbnail(
            url=entry['thumbnail'],
            directory=directory,
            prefix=prefix,
        )

def generateFilename(url, prefix):
    """Build a file name for ``url``.

    The name is ``prefix`` followed by the hex form of a 20-byte BLAKE2b
    digest of the encoded URL and the extension ``.jpg``.

    Args:
        url: String the digest is computed from.
        prefix: String placed in front of the digest.

    Returns:
        The generated file name.
    """
    digest = blake2b(url.encode(), digest_size=20).hexdigest()
    return prefix + digest + '.jpg'

def generateTTLdata(data, filenamePrefix, location, predicate):
    """Render one triple linking a subject to its thumbnail file.

    Args:
        data: Mapping with the keys ``'subject'`` and ``'thumbnail'``.
        filenamePrefix: Prefix passed to ``generateFilename``.
        location: Base URL the generated file name is appended to.
        predicate: Predicate IRI placed between subject and object.

    Returns:
        A string of the form ``<subject> <predicate> <location/filename> .``
        surrounded by the template's whitespace.
    """
    values = {
        'subject': data['subject'],
        'filename': generateFilename(data['thumbnail'], filenamePrefix),
        'predicate': predicate,
        'location': location,
    }
    tripleTemplate = Template("""
        <$subject> <$predicate> <$location/$filename> .
    """)
    return tripleTemplate.substitute(values)

def getThumbnailQueries(propsfile,*, filterCondition=None):
    """Read the ``preferredThumbnails`` query patterns from a properties file.

    The file content is parsed with ``ConfigParser`` after prepending a
    ``[ui]`` section header. The ``preferredThumbnails`` value is split on
    every comma that is not directly preceded by a backslash.

    Args:
        propsfile: Path of the properties file to read.
        filterCondition: Optional substring. When given (and non-empty), only
            patterns containing it are returned; otherwise all patterns are.

    Returns:
        List of query pattern strings.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If the file has no ``preferredThumbnails`` entry.
    """
    # Read the file that was passed in, not the notebook-level propsFile.
    with open(propsfile, 'r') as f:
        rawConfig = f.read()

    # The file has no section header of its own; ConfigParser needs one.
    config = ConfigParser()
    config.read_string("[ui]\n" + rawConfig)

    queries = re.split(r'(?<!\\),', config['ui']['preferredThumbnails'])
    # Strip literal backslash-n sequences and double-backslash-comma sequences.
    queries = [re.sub(r'\\n|\\\\,', '', d) for d in queries]

    if filterCondition:
        queries = [d for d in queries if filterCondition in d]
    # Without a filter the full list is returned (previously an UnboundLocalError).
    return queries

def ingestToTriplestore(*, endpoint, data, graph, prefix, location, predicate):
    """POST the thumbnail triples for all rows to the SPARQL endpoint.

    Args:
        endpoint: URL the Turtle payload is posted to.
        data: Iterable of mappings accepted by ``generateTTLdata``.
        graph: Value sent as the ``context-uri`` request parameter.
        prefix: File name prefix passed to ``generateTTLdata``.
        location: Base URL passed to ``generateTTLdata``.
        predicate: Predicate IRI passed to ``generateTTLdata``.

    Returns:
        The ``requests`` response object; its status is not checked here.
    """
    output = ''.join(
        generateTTLdata(row, prefix, location, predicate) for row in data
    )
    r = requests.post(
        url=endpoint,
        # Encode explicitly: Turtle is UTF-8, and a str body would leave the
        # encoding to the HTTP stack (latin-1 in http.client), which breaks
        # on IRIs containing non-ASCII characters.
        data=output.encode('utf-8'),
        params={"context-uri": graph},
        headers={"Content-Type": "application/x-turtle"})
    return r
    
def queryThumbnails(*,endpoint, queries, limit=None):
    """Run the thumbnail query patterns against a SPARQL endpoint.

    Every pattern has its ``?value`` variable renamed to a numbered variable
    (``?p0``, ``?p1``, ...). The patterns are joined with ``UNION`` and
    selected together with ``?subject``.

    Args:
        endpoint: URL of the SPARQL endpoint.
        queries: Sequence of graph pattern strings using ``?value``.
        limit: Optional row limit; a falsy value means no ``LIMIT`` clause.

    Returns:
        List of dicts with the keys ``'subject'`` and ``'thumbnail'``, one per
        numbered variable bound in each result row.
    """
    client = SPARQLWrapper(endpoint)
    client.setReturnFormat(JSON)

    template = Template("""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX crmdig: <http://www.ics.forth.gr/isl/CRMdig/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
        PREFIX search: <https://platform.swissartresearch.net/search/>
        SELECT $select WHERE {
            $queryParts
        }
    """)

    # One numbered variable per pattern, in pattern order.
    bindingNames = []
    unionBranches = []
    for position, pattern in enumerate(queries):
        name = "p%d" % position
        bindingNames.append(name)
        unionBranches.append(pattern.replace("?value", "?" + name))

    projection = ['?subject'] + ["?" + name for name in bindingNames]
    sparqlText = template.substitute(
        select=' '.join(projection),
        queryParts=' UNION '.join(unionBranches),
    )
    if limit:
        sparqlText += " LIMIT %d" % limit

    client.setQuery(sparqlText)
    rows = sparqlResultToDict(client.queryAndConvert())

    # Emit an entry for each numbered variable present in a row.
    return [
        {'subject': row['subject'], 'thumbnail': row[name]}
        for row in rows
        for name in bindingNames
        if name in row
    ]

def sparqlResultToDict(results):
    """Flatten a SPARQL JSON result into a list of plain dicts.

    Args:
        results: Mapping with a ``results`` -> ``bindings`` list, where each
            binding maps a variable name to a mapping that has a ``value`` key.

    Returns:
        One dict per binding, mapping each variable name to its ``value``.
    """
    return [
        {name: term["value"] for name, term in binding.items()}
        for binding in results["results"]["bindings"]
    ]

In [10]:
# Keep only the configured query patterns that mention wdt:P18.
queries = getThumbnailQueries(propsFile, filterCondition='wdt:P18')
thumbnails = queryThumbnails(queries=queries, endpoint=endpoint)

# Fetch the thumbnail files first, then register them in the triplestore.
downloadAll(directory=outputDir, prefix=thumbnailPrefix, data=thumbnails)

r = ingestToTriplestore(
    endpoint=endpoint,
    graph=namedGraph,
    data=thumbnails,
    prefix=thumbnailPrefix,
    location=thumbnailLocation,
    predicate=thumbnailPredicate,
)
print(r.text)

100%|██████████| 18/18 [00:00<00:00, 13016.81it/s]

<?xml version="1.0"?><data modified="18" milliseconds="14"/>



