In [1]:
import re
import urllib.parse
import pandas as pd
import time
from SPARQLWrapper import SPARQLWrapper, JSON
from IPython.display import display
from itables import show
from tqdm import tqdm

In [2]:
# Path of the captured log file the queries are extracted from.
logFile = '../logs/sniffer.txt'
# URL of the SPARQL endpoint the extracted queries are replayed against.
endpoint = "http://localhost:8887/blazegraph/sparql"

In [3]:
# Read the whole log into memory as text.
# The encoding is pinned so decoding does not depend on the platform's locale
# default, and undecodable bytes are replaced instead of raising, so a stray
# byte in the captured traffic cannot abort the analysis.
with open(logFile, 'r', encoding='utf-8', errors='replace') as f:
    logRaw = f.read()

In [4]:
# Delete every separator of the form: an empty line, followed by a line that
# starts with "T", any single character, "<IPv4 address>:<port>" and then runs
# to the end of that line. Removing these splices the text before and after the
# separator together, so a query interrupted by one becomes contiguous again.
# NOTE(review): these look like per-packet header lines written by the sniffer —
# confirm against the actual log format.
logCleaned = re.sub(r'\n\nT.(\d{1,3}\.){3}\d{1,3}:\d{1,5}.*\n','',logRaw)

In [5]:
# Matches the literal "query=" followed by the longest run of non-whitespace
# characters, i.e. the (still URL-encoded) value of a query parameter.
queryPattern = r'query=\S*'

In [6]:
# Collect the full text of every non-overlapping match of queryPattern, in log order.
queriesRaw = [match.group(0) for match in re.finditer(queryPattern, logCleaned)]

In [7]:
# Report how many raw "query=..." matches were extracted from the cleaned log.
print("Found %d queries" % len(queriesRaw))

Found 1016 queries


In [11]:
# Turn each raw "query=..." match into a plain query string.
# Tokens that may be glued to the end of a match; they are checked in this
# order, each against the already-trimmed string.
trailingTokens = ('GET', 'POST', 'HTTP/1.1')

queries = []
for query in queriesRaw:
    # Drop the "query=" prefix, URL-decode ("+" becomes a space) and remove newlines.
    query = urllib.parse.unquote_plus(query[len("query="):]).replace("\n", "")
    for token in trailingTokens:
        if query.endswith(token):
            query = query[:-len(token)]
    queries.append(query)

In [12]:
# Client for the configured endpoint: results are requested as JSON and
# queries are sent with HTTP POST.
sparql = SPARQLWrapper(endpoint)
sparql.setReturnFormat(JSON)
sparql.setMethod("POST")

In [13]:
# Replay every extracted query against the endpoint and record, for each one
# that yields a result, the query text, the converted result and the elapsed time.
analysis = []
for query in tqdm(queries):
    sparql.setQuery(query)
    start = time.perf_counter()
    try:
        ret = sparql.queryAndConvert()
    except Exception as e:
        # Best effort: report the failing query and carry on with the next one.
        print("Query:", query)
        print("Error:", e)
        ret = False
    else:
        # Elapsed time covers the request plus the conversion of the response.
        responseTime = time.perf_counter() - start
    if not ret:
        continue
    analysis.append({
        "query": query,
        "return": ret,
        "responseTime": responseTime
    })

100%|██████████| 1016/1016 [01:37<00:00, 10.45it/s]


In [14]:
# Build one summary row per recorded result and load the rows into a DataFrame.
table = []
for i, d in enumerate(analysis):
    result = d['return']
    if 'boolean' in result:
        # Results carrying a 'boolean' key have no variables/bindings to count;
        # both columns get the marker string instead.
        numVars = numResults = 'boolean'
    else:
        numVars = len(result['head']['vars'])
        numResults = len(result['results']['bindings'])
    table.append({
        "index": i,
        "query": d['query'],
        "responseTime": d['responseTime'],
        "numVars": numVars,
        "numResults": numResults,
    })
df = pd.DataFrame(table)

In [15]:
# Display the per-query summary table using itables' show().
show(df)

<IPython.core.display.Javascript object>

index,numResults,numVars,query,responseTime


In [13]:
# Sum the measured response times over all recorded results.
# "%d" formats the float total as an integer, so the fractional part is dropped.
print("Total response time is %d seconds" % sum(d['responseTime'] for d in analysis))

Total response time is 43 seconds


In [14]:
# Recorded results whose query text contains the substring 'fieldDefinition'
# (the match is case-sensitive).
queriesRelatedToFielddefinitions = [d for d in analysis if 'fieldDefinition' in d['query']]

In [15]:
# The matches were filtered from `analysis` (queries that returned a result),
# so the total is taken from `analysis` as well; using len(queries) would mix
# two populations whenever some queries failed.
print("There are %d out of %d queries that do refer to fielddefinitions" % (len(queriesRelatedToFielddefinitions), len(analysis)))
  

There are 0 out of 982 queries that do refer to fielddefinitions


In [16]:
# Recorded results whose query text contains the substring '?p0 ?p1'.
# NOTE(review): presumably this variable pattern identifies label/thumbnail
# lookups, as the variable name suggests — confirm.
queriesRelatedToLabelService = [d for d in analysis if '?p0 ?p1' in d['query']]

In [17]:
# The matches were filtered from `analysis` (queries that returned a result),
# so the total is taken from `analysis` as well; using len(queries) would mix
# two populations whenever some queries failed.
print("There are %d out of %d queries that do refer to labels or thumbnails" % (len(queriesRelatedToLabelService), len(analysis)))
  

There are 152 out of 982 queries that do refer to labels or thumbnails


In [16]:
# Print one cleaned query as an example; 98 is a hard-coded position in `queries`.
print(queries[98])

SELECT ?subject ?p0 ?p1 ?p2 ?p3 ?p4 ?p5 WHERE {{{ 	?subject <http://schema.org/thumbnail> ?p0 .  }VALUES (?subject) { (<https://resource.swissartresearch.net/artwork/zbz-990101540730205508>)(<https://resource.swissartresearch.net/artwork/zbz-990107347230205508>)(<https://resource.swissartresearch.net/artwork/zbz-990101531380205508>)(<https://resource.swissartresearch.net/artwork/zbz-990101849710205508>)(<https://resource.swissartresearch.net/artwork/zbz-990107781740205508>)(<https://resource.swissartresearch.net/artwork/zbz-990101844240205508>)(<https://resource.swissartresearch.net/artwork/zbz-990107344720205508>)(<https://resource.swissartresearch.net/artwork/zbz-990102358080205508>)(<https://resource.swissartresearch.net/artwork/zbz-990101840970205508>)(<https://resource.swissartresearch.net/artwork/zbz-990101449840205508>) }}UNION{{ 	?subject <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://platform.swissartresearch.net/search/Object> . 	?subject <http://www.cidoc-crm.org