# ICIJ Offshore Leaks Database

In [1]:
import os

from dtgraph import Neo4jGraph, Rule, Transformation
# Connection settings for the Neo4j instance used throughout this notebook.
# Every value can be overridden through an environment variable so that hosts and
# credentials never have to be written into the notebook itself; the defaults
# reproduce the original local, unauthenticated setup.
hostname = os.environ.get("NEO4J_HOST", "localhost")
uri = f"bolt://{hostname}:7687"
graph = Neo4jGraph(
    uri,
    database=os.environ.get("NEO4J_DATABASE", "neo4j"),
    username=os.environ.get("NEO4J_USERNAME", ""),
    password=os.environ.get("NEO4J_PASSWORD", ""),
)

## Experiment 1: Single rule refactoring
Motivate the rule. [TODO]
### Transformation rule
We measure the execution time of the transformation rule, averaged over repeated runs; the generated output is removed after each run:

In [2]:
# Rule under test. For every Address node whose sourceID is
# "Paradise Papers - Malta corporate registry" and whose country_code is not null,
# it generates a T_Address node (property `source` taken from a.sourceID) linked by a
# T_LOCATED relationship to a T_Country node (property `name` taken from a.country).
# The generated nodes are identified by `a` and by `a.country_code` respectively --
# presumably so that addresses sharing a country code share one T_Country node;
# confirm against the dtgraph rule semantics.
# NOTE(review): the variable name says "panama" while the filter selects the Paradise
# Papers Malta registry -- confirm which one is intended.
panama_address_to_global = Rule('''
MATCH (a:Address { sourceID: "Paradise Papers - Malta corporate registry" }) 
WHERE a.country_code IS NOT NULL
GENERATE 
(x = (a):T_Address {
    source = a.sourceID
})-[():T_LOCATED]->(y = (a.country_code):T_Country {
    name = a.country
})
''')

In [3]:
# Debugging aid, intentionally disabled: uncomment to compile the rule without
# diagnose support and print the result stored in its `_compiled` attribute.
#panama_address_to_global._compile(with_diagnose=False)
#print(panama_address_to_global._compiled)

In [4]:
# Benchmark the rule wrapped in a Transformation. Diagnose support is switched off
# because its specific functionalities are not needed for the VLDB experiment.
my_transform = Transformation([panama_address_to_global], with_diagnose=False)

rep = 2  # number of repetitions to average over
tt = 0   # accumulated value returned by apply_on (reported below in ms)
for i in range(rep):
    print(f"Iteration {i=}:")
    elapsed = my_transform.apply_on(graph)
    tt = tt + elapsed
    # Undo the run but keep the indexes. Reason given by the original author: a
    # strange bug makes indexes created with the Neo4j Python driver unusable in
    # subsequent query plans.
    my_transform.abort(keep_index=True)

avg_time = tt / rep
print(f"Average execution time: {avg_time:.3f} ms.")

Iteration i=0:
Index: Added 0 index, completed after 1 ms.
Rule: Added 246866 labels, created 123433 nodes, set 493153 properties, created 123240 relationships, completed after 3443 ms.
Abort: Deleted 123433 nodes, deleted 123240 relationships, completed after 797 ms.
Iteration i=1:
Index: Added 0 index, completed after 1 ms.
Rule: Added 246866 labels, created 123433 nodes, set 493153 properties, created 123240 relationships, completed after 3222 ms.
Abort: Deleted 123433 nodes, deleted 123240 relationships, completed after 804 ms.
Average execution time: 3332.500 ms.


### Input query
We now measure the execution time of the rule's input query and the number of rows it returns.

In [5]:
# The MATCH/WHERE part of the rule on its own, returning the matched nodes, so that
# the cost and the size of the rule's input can be measured separately.
panama_address_to_global_input_rule = '''
MATCH (a:Address { sourceID: "Paradise Papers - Malta corporate registry" }) 
WHERE a.country_code IS NOT NULL
RETURN a'''

# graph.query returns a pair, unpacked here as (row count, execution time).
query_stats = graph.query(panama_address_to_global_input_rule)
nb_rows, exec_time = query_stats

print("Execution time of the input rule: {} ms.".format(exec_time))
print("Size of intermediate data: {} rows.".format(nb_rows))

Execution time of the input rule: 890 ms.
Size of intermediate data: 123240 rows.


### Results

Present the statistics and give the number of conflicting elements in the output. [TODO]

In [6]:
# Apply the rule directly (not through the Transformation) to obtain its summary,
# and record how many nodes and relationships it created.
_, summary = panama_address_to_global.apply_on(graph)
counters = summary.counters
num_nodes, num_rels = counters.nodes_created, counters.relationships_created

# Clean the output data: point the Transformation at the graph and abort,
# keeping the indexes in place.
my_transform._graph = graph
my_transform.abort(keep_index=True)

Rule: Added 246866 labels, created 123433 nodes, set 493153 properties, created 123240 relationships, completed after 3282 ms.
Abort: Deleted 123433 nodes, deleted 123240 relationships, completed after 609 ms.


In [7]:
# Derived metrics. Each value is computed right before it is printed.
# Total number of output elements (nodes + relationships) created by the rule.
output_size = num_nodes + num_rels

# NOTE(review): the value printed is output elements divided by input rows.
size_ratio = output_size / nb_rows
print(f"Ratio between intermediate data and size of the output: {size_ratio:.3f}")

row_cost = exec_time / nb_rows
print(f"Time to compute the intermediary results, per row: {row_cost:.3f} ms.")

# NOTE(review): avg_time is the average time of the whole rule from the benchmark
# cell, not only of the output-construction phase -- confirm this is intended.
element_cost = avg_time / output_size
print(f"Time to construct the output, per element: {element_cost:.3f} ms.")

Ratio between intermediate data and size of the output: 2.002
Time to compute the intermediary results, per row: 0.007 ms.
Time to construct the output, per element: 0.014 ms.
