# Initial data ingestion to JanusGraph

In [1]:
%env THOTH_DEPLOYMENT_NAME=thoth-test-core

from thoth.storages.graph import GraphDatabase
from thoth.storages import SolverResultsStore
from thoth.storages import AnalysisResultsStore

from thoth.lab import obtain_location
from thoth.lab import GraphQueryResult as gqr

# Disabled alternative: point the graph adapter at a remote test deployment instead of localhost.
#graph_db = GraphDatabase.create(obtain_location('thoth-sbu-janusgraph-test1', verify=False, only_netloc=True), port=80)

# Instantiate the graph adapter (local instance, port 8182) and both result stores first...
graph_db = GraphDatabase.create('localhost', port=8182)
solver_results = SolverResultsStore()
analysis_results = AnalysisResultsStore()

# ...then connect them in the same fixed order: graph, solver results, analysis results.
for adapter in (graph_db, solver_results, analysis_results):
    adapter.connect()

env: THOTH_DEPLOYMENT_NAME=thoth-test-core


## Place for experiments:

In [2]:
# Drop every vertex in the graph, then check that the vertex count reads zero.
gqr(graph_db.g.V().drop().next())
remaining_vertices = gqr(graph_db.g.V().count().next()).result
assert remaining_vertices == 0

## Syncing image analysis results - with adapter cache

In [3]:
# Vertex count before any analysis document is synced.
vertices_before_sync = gqr(graph_db.g.V().count().next()).result
vertices_before_sync

0

In [4]:
%%time
import asyncio
from itertools import islice

async def sync_analysis_results():
    """Sync the first 7 listed analysis documents into the graph, one after another.

    For each document name the name is printed, the document is retrieved
    from ``analysis_results`` and then passed to
    ``graph_db.sync_analysis_result``, which is awaited before moving on.
    """
    first_documents = islice(analysis_results.get_document_listing(), 7)
    for name in first_documents:
        print("Syncing %s" % name)
        await graph_db.sync_analysis_result(analysis_results.retrieve_document(name))


# Run the coroutine to completion on the event loop returned by asyncio.
loop = asyncio.get_event_loop()
loop.run_until_complete(sync_analysis_results())

Syncing fridex-thoth-package-extract-base-notebook-hv8pt
Syncing fridex-thoth-package-extract-fedora-27-2shjq
Syncing fridex-thoth-package-extract-fedora-27-46c6z
Syncing fridex-thoth-package-extract-fedora-27-4bvfd
Syncing fridex-thoth-package-extract-fedora-27-4kbwk
Syncing fridex-thoth-package-extract-fedora-27-4x9j9
Syncing fridex-thoth-package-extract-fedora-27-6wg7n
CPU times: user 1min 10s, sys: 3.34 s, total: 1min 13s
Wall time: 4min


In [5]:
# Edge count once the analysis documents have been synced.
edges_after_analysis_sync = gqr(graph_db.g.E().count().next()).result
edges_after_analysis_sync

25831

In [6]:
# Vertex count once the analysis documents have been synced.
vertices_after_analysis_sync = gqr(graph_db.g.V().count().next()).result
vertices_after_analysis_sync

1602

Errors reported above are actually fine - the given artifacts do not carry package name information (these are `requirements.txt` files). We will filter them out at the source-code level since we do not want to sync such data anyway.

In [7]:
%%time
import asyncio
from itertools import islice

async def sync_solver_results():
    """Sync the first 7 listed solver documents into the graph, one after another.

    For each document name the name is printed, the document is retrieved
    from ``solver_results`` and then passed to
    ``graph_db.sync_solver_result``, which is awaited before moving on.
    """
    first_documents = islice(solver_results.get_document_listing(), 7)
    for name in first_documents:
        print("Syncing %s" % name)
        solver_document = solver_results.retrieve_document(name)
        await graph_db.sync_solver_result(solver_document)


# Run the coroutine to completion on the event loop returned by asyncio.
loop = asyncio.get_event_loop()
loop.run_until_complete(sync_solver_results())

Syncing fridex-thoth-solver-fc26-thoth-solver-fc26-222k6
Syncing fridex-thoth-solver-fc26-thoth-solver-fc26-22zfg
Syncing fridex-thoth-solver-fc26-thoth-solver-fc26-249zq
Syncing fridex-thoth-solver-fc26-thoth-solver-fc26-2564s
Syncing fridex-thoth-solver-fc26-thoth-solver-fc26-25qhp
Syncing fridex-thoth-solver-fc26-thoth-solver-fc26-27vw6
Syncing fridex-thoth-solver-fc26-thoth-solver-fc26-2882l
CPU times: user 2min, sys: 5.94 s, total: 2min 6s
Wall time: 10min 1s


In [9]:
# Report graph totals; each count is queried immediately before it is printed.
vertex_total = gqr(graph_db.g.V().count().next()).result
print(f"Number of vertexes present in graph: {vertex_total:d}")
edge_total = gqr(graph_db.g.E().count().next()).result
print(f"Number of edges present in graph: {edge_total:d}")
# Disabled: number of documents listed by each result store.
#print(f"Number of analysis files ingested: {len(list(analysis_results.get_document_listing()))}")
#print(f"Number of solver results ingested: {len(list(solver_results.get_document_listing()))}")

Number of vertexes present in graph: 2747
Number of edges present in graph: 41497


**Note:** JanusGraph was run in verbose mode on localhost with indexes and schema configured.