%%capture
!pip install neo4j

In [1]:
from os import pardir
from os.path import join

from utils.literature import get_files, get_document_title, get_authors, get_ref_entires, DataLoader, is_english
from utils.processing import create_chunks
from utils.metadata import CORDMetadata
from utils.grid.grid import GridLookup

from neo4j import GraphDatabase
from neomodel import StructuredNode, StringProperty, IntegerProperty, RelationshipTo, RelationshipFrom, config, db, UniqueProperty, UniqueIdProperty

from multiprocessing.pool import Pool
from tqdm._tqdm_notebook import tqdm
from IPython.display import clear_output

import math

In [2]:
auth = ('neo4j', 'password')
address = f'bolt://localhost:7687'

In [8]:
def add_new(tx, title, doc_id, author, institution, journal):
    tx.run("MERGE (a:Document {title: $title, doc_id: $doc_id})"
           "MERGE (b:Author {name: $author})"
           "MERGE (c:Institution {name: $institution_name})"
           "MERGE (d:Journal {name: $journal})"
           "MERGE (e:Country {name: $country, code: $code})"
           "MERGE (b)-[:WORKS_FOR]->(c)"
           "MERGE (c)-[:EMPLOYED]->(b)"
           "MERGE (a)-[:WROTE]->(b)"
           "MERGE (b)-[:WRITTEN_BY]->(a)"
           "MERGE (c)-[:LOCATED_IN]->(e)"
           "MERGE (e)-[:LOCATES]->(c)"
           "MERGE (a)-[:PUBLISHED_IN]->(d)"
           "MERGE (d)-[:PUBLISHED]->(a)",
           title=title, doc_id=doc_id, author=author, institution_name=institution['Name'], 
           country = institution['Country'], code=institution['Code'], journal=journal)

In [4]:
root_dir = join(pardir, 'dataset')
files = get_files(root_dir)
metadata_lookup = CORDMetadata()
grid_lookup = GridLookup()

In [5]:
undefined = {
    'Name': 'undefined', 
    'Country': 'undefined', 
    'Code': 'undefined'}

In [6]:
def process_chunk(args):
    with GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"), encrypted=False) as driver:        
        #iterate over all documents in chunk
        for fpath, doc_id in args:
            doc_id = int(doc_id)
            data_loader = DataLoader(fpath, grid_lookup)

            doc_title = get_document_title(fpath, data_loader)
            sha = data_loader.get_paper_id()
            journal = metadata_lookup.get_journal(sha)
            #database should only contain english documents with an valid document title
            if doc_title == '' or not is_english(doc_title):
                continue

            authors = get_authors(fpath, data_loader)
            for author, institution in authors:
                if author is None: continue
                institution = institution if institution is not None else undefined
                journal = journal if journal is not None and journal != 'nan' else 'undefined'
                with driver.session() as session:
                    session.write_transaction(add_new, doc_title, doc_id, author, institution, journal)

In [9]:
chunks = create_chunks(files)

#TODO: Fix error with multithreading to allow parallel processing of chunks
for i, chunk in enumerate(chunks):
    clear_output(wait=True)
    print(f'processing {i} of {len(chunks)}')
    process_chunk(chunk)

processing 384 of 385
