In [1]:
import sys
!{sys.executable} -m pip install strsimpy

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import datetime
import json
import os
import uuid
from string import Template

import strsimpy
from SPARQLWrapper import SPARQLWrapper, JSON
from strsimpy import Jaccard
from tqdm import tqdm

In [3]:
def sparqlResultToDict(results):
    """Flatten a SPARQL JSON result document into a list of plain dicts.

    Every entry of ``results["results"]["bindings"]`` yields one dict that has
    a key for each variable named in ``results["head"]["vars"]``. The value is
    the binding's ``"value"`` field, or ``None`` when the variable is not
    present in that binding.
    """
    return [
        {
            name: (binding[name]["value"] if name in binding else None)
            for name in results["head"]["vars"]
        }
        for binding in results["results"]["bindings"]
    ]

In [4]:
# JSON cache of the label records; it is read back into labelsData below.
outputJsonFile = 'output/similarityResult.json'
# Turtle file that receives the generated attribute assignments.
outputTtlFile = 'output/titleSimilarities.ttl'
# Passed to the Turtle template substitution as `graph`; the template visible
# in this notebook has no $graph placeholder, so the value is not written out.
outputGraph = 'http://resource.swissartresearch.net/graph/titleSimilarities'

In [5]:
# SPARQL endpoint URL (a local Blazegraph instance) and a client for it that
# is configured to return results as JSON.
endpoint = "http://localhost:8887/blazegraph/sparql"    
sparql = SPARQLWrapper(endpoint, returnFormat=JSON)

In [6]:
# Selects every resource typed search:Object together with its rdfs:label,
# giving one result row per (subject, label) pair. The crm: prefix is declared
# but not used by the query body.
labelsQuery = """
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX search: <https://platform.swissartresearch.net/search/>
SELECT ?subject ?label WHERE {
    ?subject a search:Object ;
        rdfs:label ?label
}
"""

In [7]:
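# Disabled: the labels are read from the cached JSON file further down instead
# of being fetched live. Uncomment to re-run the query against the endpoint.
# sparql.setQuery(labelsQuery)
# try:
#     ret = sparql.query().convert()
# except:
#     raise Exception("Could not execute query against endpoint", endpoint)
# labelsData = sparqlResultToDict(ret)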

In [8]:
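# Disabled: writes the freshly queried labels to the JSON cache file.
# Uncomment together with the query cell above when refreshing the cache.
# with open(outputJsonFile, 'w') as fp:
#     json.dump(labelsData, fp, indent=4)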

In [9]:
# Load the cached label records. Entries that already contain a 'candidates'
# key are skipped by the similarity loop below.
with open(outputJsonFile, 'r') as f:
    labelsData = json.load(f)

In [10]:
# strsimpy Jaccard index with shingle size 2.
j = Jaccard(2)
# Only pairs scoring strictly above this threshold are recorded.
cutoff = 0.5
# Upper bound on the number of candidates stored per label record.
maxSimilarCandidates = 100

# Records that already carry a 'candidates' list (e.g. loaded from a previous,
# partially completed run) are skipped, so the computation can be resumed.
for test in tqdm([d for d in labelsData if 'candidates' not in d]):
    test['candidates'] = []
    testLabel = test['label']
    for candidate in labelsData:
        # Skip the record itself; an identity check avoids a deep dict comparison.
        if candidate is test:
            continue
        similarity = j.similarity(testLabel, candidate['label'])
        if similarity > cutoff:
            test['candidates'].append({
                'subject': candidate['subject'],
                'label': candidate['label'],
                'similarity': similarity
            })
            # Stop scanning once the cap is reached. The original used
            # `continue` here, which had no effect, so the cap was never applied.
            if len(test['candidates']) >= maxSimilarCandidates:
                break
    # Checkpoint after every record into the same file the notebook loads
    # labelsData from, so an interrupted run resumes where it stopped. Writing
    # to a temporary file and renaming keeps the cache intact if the kernel is
    # interrupted in the middle of a dump.
    checkpointTmp = outputJsonFile + '.tmp'
    with open(checkpointTmp, 'w') as fp:
        json.dump(labelsData, fp, indent=4)
    os.replace(checkpointTmp, outputJsonFile)

0it [00:00, ?it/s]


In [11]:
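# Disabled: manual save of labelsData (including any computed 'candidates')
# back to the JSON cache file.
# import json
# with open(outputJsonFile, 'w') as fp:
#     json.dump(labelsData, fp, indent=4)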

In [12]:
# Turtle prefix declarations written at the start of the output file. The
# string begins with a newline, so the file starts with an empty line.
namespaces = """
@prefix crm: <http://www.cidoc-crm.org/cidoc-crm/>.
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>.
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>.
@prefix xsd: <http://www.w3.org/2001/XMLSchema#>.
"""


In [15]:
# A single timestamp is taken up front and stamped on every assignment below.
dateTime = datetime.datetime.now()

# URL recorded as the specific technique (crm:P33) of each assignment.
technique = 'https://github.com/swiss-art-research-net/bso-data-pipeline/blob/main/experiments/title-similarities.ipynb'

# Turtle for one E13 attribute assignment linking a subject to a similar
# object, with the similarity as rdf:value and a time-span node at
# <classification URI>/date.
classificationTemplate = Template("""
<$classification> a crm:E13_Attribute_Assignment .
<$classification> crm:P140_assigned_attribute_to <$subject> .
<$classification> crm:P141_assigned <$objectUri> .
<$classification> crm:P4_has_time-span <$classification/date> .
<$classification> crm:P33_used_specific_technique <$technique> .
<$classification> rdf:value "$confidence"^^xsd:float .
<$classification/date> a crm:E52_Time-Span .
<$classification/date> crm:P82_at_some_time_within "$dateTime"^^xsd:dateTime .
""")

# Placeholder values that are the same for every assignment.
constantFields = dict(
    graph=outputGraph,
    technique=technique,
    dateTime=dateTime.strftime("%Y-%m-%dT%H:%M:%S"),
)

# Start the output file afresh with the prefix declarations ...
with open(outputTtlFile, 'w') as f:
    f.write(namespaces)

# ... then reopen it in append mode and add one assignment per
# (record, candidate) pair, each under a freshly generated UUID-based URI.
with open(outputTtlFile, 'a') as f:
    for row in tqdm(labelsData):
        for candidate in row['candidates']:
            classificationUri = 'https://resource.swissartresearch.net/classification/' + str(uuid.uuid4())
            subjectUri = row['subject']
            objectUri = candidate['subject']
            turtle = classificationTemplate.substitute(
                constantFields,
                classification=classificationUri,
                subject=subjectUri,
                objectUri=objectUri,
                confidence=candidate['similarity'],
            )
            f.write(turtle)

100%|██████████| 28836/28836 [00:14<00:00, 1928.78it/s]
