# Benchmarks
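In this notebook we will profile the synchronization process to find the bottlenecks of the application, and compare the performance of basic and batch operations.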

## Setup

In [1]:
import logging
import os
import sys

# Make the parent directory and its hercules_sync subdirectory importable
# from this notebook.
module_path = os.path.abspath('..')
hercules_sync_path = os.path.abspath(os.path.join('..', 'hercules_sync'))
sys.path.extend([module_path, hercules_sync_path])

# Lower the root logger threshold to INFO and emit a first record.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.info("Starting logger")

INFO:root:Starting logger


## Datasets used

In [2]:
import bz2
import urllib.request

def read_zipped_dataset(url, decompressor=bz2):
    """Download a compressed resource and return its decompressed content.

    Parameters
    ----------
    url : str
        Location of the compressed file; anything accepted by
        ``urllib.request.urlopen``.
    decompressor : optional
        Object exposing ``decompress(data: bytes) -> bytes``, such as the
        ``bz2``, ``gzip`` or ``lzma`` modules. Defaults to ``bz2``.

    Returns
    -------
    bytes
        The decompressed payload.
    """
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(url) as response:
        compressed = response.read()
    # Honour the caller-supplied decompressor instead of always assuming bz2.
    return decompressor.decompress(compressed)

def get_first_lines(string, num_lines):
    """Return the leading ``num_lines`` lines of a bytes object.

    The input is split on the newline byte, the first ``num_lines`` pieces
    are kept, and they are joined back with the same separator.
    """
    newline = b'\n'
    leading_lines = string.split(newline)[:num_lines]
    return newline.join(leading_lines)


In [3]:
# bz2-compressed DBpedia person-data dump used as benchmark input
# (the ".nt" file name suggests N-Triples — confirm against the DBpedia docs).
DBPEDIA_PERSONDATA_URL = 'http://downloads.dbpedia.org/3.4/en/persondata_en.nt.bz2'
# Number of leading lines of the dump kept for the final and the preview subsets
# (assumes one triple per line — TODO confirm).
NUM_TRIPLES_FINAL = 4000
NUM_TRIPLES_PREVIEW = 100

# Download and decompress the dump once, then cut both subsets from the same bytes object.
dbpedia_dataset = read_zipped_dataset(DBPEDIA_PERSONDATA_URL, bz2)
dbpedia_dataset_preview = get_first_lines(dbpedia_dataset, NUM_TRIPLES_PREVIEW)
dbpedia_dataset_final = get_first_lines(dbpedia_dataset, NUM_TRIPLES_FINAL)

## Analysing bottlenecks of the application

In [4]:
from hercules_sync.external.uri_factory_mock import URIFactory
from hercules_sync.git import GitFile
from hercules_sync.synchronization import GraphDiffSyncAlgorithm, OntologySynchronizer

# Build the synchronization pipeline: the synchronizer is constructed with the graph-diff algorithm.
algorithm = GraphDiffSyncAlgorithm()
synchronizer = OntologySynchronizer(algorithm)
# Mock URI factory, reset right after creation
# (presumably clears state left over from earlier runs — confirm in uri_factory_mock).
factory = URIFactory()
factory.reset_factory()

In [5]:
import paramiko

from hercules_sync.triplestore import WikibaseAdapter
from secret import SSH_USER, SSH_PASS, USERNAME, PASSWORD

# Host running the Wikibase instance used for the benchmarks; the SSH, MediaWiki API
# and SPARQL endpoints below are all on this machine.
wikibase_host = '156.35.94.149'
# NOTE(review): the port is stored as a string and later passed to paramiko's
# SSHClient.connect, whose documented port type is int — confirm this is intended.
ssh_port = '22'
mediawiki_api_url = f'http://{wikibase_host}:8181/w/api.php'
sparql_endpoint_url = f'http://{wikibase_host}:8282/proxy/wdqs/bigdata/namespace/wdq/sparql'
# Module-level adapter; reset_wb_state() replaces it with a fresh instance.
adapter = WikibaseAdapter(mediawiki_api_url, sparql_endpoint_url, USERNAME, PASSWORD)

def reset_wb_state(factory):
    """Restart the remote Wikibase and rebuild the local state that depends on it.

    Connects over SSH to ``wikibase_host``, runs ``clean_restart.sh`` inside
    the ``wikibase-docker`` directory and waits for the command to finish.
    Afterwards ``factory`` is reset and the module-level ``adapter`` is
    replaced with a newly constructed ``WikibaseAdapter``.

    Parameters
    ----------
    factory
        Object exposing a ``reset_factory()`` method.
    """
    global adapter
    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        # paramiko documents the port as an int; the notebook stores it as a string.
        ssh.connect(wikibase_host, int(ssh_port), SSH_USER, SSH_PASS)
        _, stdout, stderr = ssh.exec_command('cd wikibase-docker && sh clean_restart.sh')
        # exec_command returns as soon as the command is started. Drain the
        # output streams and wait for the exit status so the adapter is not
        # re-created while the restart script is still running.
        stdout.read()
        error_output = stderr.read()
        exit_status = stdout.channel.recv_exit_status()
        if exit_status != 0:
            logging.warning(
                "clean_restart.sh exited with status %s: %s",
                exit_status,
                error_output.decode(errors='replace'),
            )
    finally:
        # Always release the SSH connection, even if connecting or running fails.
        ssh.close()
    factory.reset_factory()
    adapter = WikibaseAdapter(mediawiki_api_url, sparql_endpoint_url, USERNAME, PASSWORD)


http://156.35.94.149:8181/w/api.php


INFO:hercules_sync.triplestore.wikibase_adapter:Mappings property was not found in the wikibase. Creating it...


Successfully logged in as WikibaseAdmin


Please set P2302 and Q21502410 in your wikibase or set `core_props` manually.
Continuing with no core_props
  "Please set P2302 and Q21502410 in your wikibase or set `core_props` manually.\n" +
INFO:hercules_sync.triplestore.wikibase_adapter:Mappings property has been created: P1


In [6]:
def execute_synchronization(source_content, target_content, synchronizer, adapter):
    """Synchronize two versions of a file's content and execute the resulting operations.

    A ``GitFile`` is built from the source and target contents (with ``None``
    as its first argument), ``synchronizer.synchronize`` turns it into a
    sequence of operations, and each operation is executed against
    ``adapter``. Unsuccessful results are reported on standard output.
    """
    git_file = GitFile(None, source_content, target_content)
    for operation in synchronizer.synchronize(git_file):
        result = operation.execute(adapter)
        if result.successful:
            continue
        print(f"Error synchronizing triple: {result.message}")

In [None]:
import cProfile

# Optional: restore the Wikibase to a clean state before profiling (left disabled here).
#reset_wb_state(factory)
# Profile execute_synchronization with an empty source content and the
# preview dataset as target content.
cProfile.run('execute_synchronization("", dbpedia_dataset_preview, synchronizer, adapter)')

INFO:hercules_sync.triplestore.wikibase_adapter:Create triple: URIElement: http://dbpedia.org/resource/Abel_Streight - Type: item - URIElement: http://www.w3.org/1999/02/22-rdf-syntax-ns#type - Type: item - URIElement: http://xmlns.com/foaf/0.1/Person - Type: item
INFO:hercules_sync.triplestore.wikibase_adapter:Create triple: URIElement: http://dbpedia.org/resource/Alan_Perlis - Type: item - URIElement: http://xmlns.com/foaf/0.1/surname - Type: item - LiteralElement: Perlis - Language: de
INFO:hercules_sync.triplestore.wikibase_adapter:Create triple: URIElement: http://dbpedia.org/resource/Aaron_Sprinkle - Type: item - URIElement: http://xmlns.com/foaf/0.1/name - Type: item - LiteralElement: Aaron Sprinkle - Language: de
INFO:hercules_sync.triplestore.wikibase_adapter:Create triple: URIElement: http://dbpedia.org/resource/A._J._Carlson - Type: item - URIElement: http://xmlns.com/foaf/0.1/surname - Type: item - LiteralElement: Carlson - Language: de
INFO:hercules_sync.triplestore.wikiba

As we can see above...

## Batch vs Basic operations

In [None]:
def _synchronize(source_content, target_content, synchronizer, adapter, ops_callback):
    """Execute the operations produced by ``ops_callback`` for a pair of file contents.

    Parameters
    ----------
    source_content, target_content
        Contents passed to ``GitFile`` (after ``None`` as first argument).
    synchronizer
        Not used by this helper; kept so existing callers keep working.
    adapter
        Object handed to each operation's ``execute`` method.
    ops_callback : callable
        Receives the ``GitFile`` and returns the iterable of operations to
        execute. The callback alone decides whether operations are optimized.
    """
    gitfile = GitFile(None, source_content, target_content)
    # No extra optimization step here: optimizing unconditionally would also
    # batch the "basic" run and optimize the "batch" run twice.
    ops = ops_callback(gitfile)

    for op in ops:
        res = op.execute(adapter)
        if not res.successful:
            print(f"Error synchronizing triple: {res.message}")

def execute_basic_synchronization(source_content, target_content, synchronizer, adapter):
    """Run a synchronization using the operations exactly as the synchronizer returns them."""
    def build_ops(git_file):
        return synchronizer.synchronize(git_file)

    return _synchronize(source_content, target_content, synchronizer, adapter, build_ops)
    
def execute_batch_synchronization(source_content, target_content, synchronizer, adapter):
    """Run a synchronization whose operations are first passed through ``optimize_ops``."""
    def build_optimized_ops(git_file):
        raw_ops = synchronizer.synchronize(git_file)
        return optimize_ops(raw_ops)

    return _synchronize(source_content, target_content, synchronizer, adapter, build_optimized_ops)
