In [1]:
%%capture
!pip install neo4j

In [2]:
from os import pardir
from os.path import join
from utils.literature import get_files, get_document_title, get_authors, get_bib_entries, DataLoader, is_english
from utils.processing import create_chunks
from utils.metadata import CORDMetadata
from utils.grid.grid import GridLookup
from neo4j import GraphDatabase
from multiprocessing.pool import Pool
from tqdm._tqdm_notebook import tqdm
from IPython.display import clear_output
import math

In [3]:
# Connection settings for the local Neo4j instance (Bolt protocol).
# NOTE(review): the credentials are hardcoded; consider reading them from the environment.
auth = ('neo4j', 'password')
# Plain string literal: the former f-string had no placeholders.
address = 'bolt://localhost:7687'

In [4]:
#TODO: Split the query into smaller chunks
def add_new(tx, title, doc_id, author, institution, journal):
    """Send one Cypher statement that MERGEs a document and its related nodes.

    The statement merges a Document, an Author, an Institution, a Journal and
    a Country node and connects them with relationships in both directions
    (WORKS_FOR/EMPLOYED, WRITTEN_BY/WROTE, LOCATED_IN/LOCATES,
    PUBLISHED_IN/PUBLISHED).

    Args:
        tx: object exposing ``run(query, **parameters)``.
        title: value for the ``$title`` parameter.
        doc_id: value for the ``$doc_id`` parameter.
        author: value for the ``$author`` parameter.
        institution: mapping read through the keys ``'Name'``, ``'Country'``
            and ``'Code'``.
        journal: value for the ``$journal`` parameter.

    Returns:
        None; the result of ``tx.run`` is not returned.
    """
    clauses = (
        "MERGE (a:Document {title: $title, doc_id: $doc_id})",
        "MERGE (b:Author {name: $author})",
        "MERGE (c:Institution {name: $institution_name})",
        "MERGE (d:Journal {name: $journal})",
        "MERGE (e:Country {name: $country, code: $code})",
        "MERGE (b)-[:WORKS_FOR]->(c)",
        "MERGE (c)-[:EMPLOYED]->(b)",
        "MERGE (a)-[:WRITTEN_BY]->(b)",
        "MERGE (b)-[:WROTE]->(a)",
        "MERGE (c)-[:LOCATED_IN]->(e)",
        "MERGE (e)-[:LOCATES]->(c)",
        "MERGE (a)-[:PUBLISHED_IN]->(d)",
        "MERGE (d)-[:PUBLISHED]->(a)",
    )
    parameters = dict(
        title=title,
        doc_id=doc_id,
        author=author,
        institution_name=institution['Name'],
        country=institution['Country'],
        code=institution['Code'],
        journal=journal,
    )
    # The clauses are joined without a separator, which keeps the statement
    # text exactly as it has always been sent.
    tx.run("".join(clauses), **parameters)

In [5]:
def add_bib_query(tx, title, ref_title, author):
    """Link a document to a bibliography entry and that entry's author.

    The statement MATCHes the citing Document by title, MERGEs the referenced
    Document (always with ``doc_id`` -1) and the Author, and MERGEs
    REFERENCED/REFERENCED_BY and WRITTEN_BY/WROTE relationships.

    Args:
        tx: object exposing ``run(query, **parameters)``.
        title: title of the citing document (``$title``).
        ref_title: title of the referenced document (``$ref_title``).
        author: author name of the referenced document (``$author``).

    Returns:
        None.
    """
    statement = " ".join([
        "MATCH (a:Document {title: $title})",
        "MERGE (b:Document {title: $ref_title, doc_id: $doc_id})",
        "MERGE (c:Author {name: $author})",
        "MERGE (a)-[:REFERENCED]->(b)",
        "MERGE (b)-[:REFERENCED_BY]->(a)",
        "MERGE (b)-[:WRITTEN_BY]->(c)",
        "MERGE (c)-[:WROTE]->(b)",
    ])
    # Referenced documents created here carry the fixed doc_id -1.
    parameters = {
        'title': title,
        'ref_title': ref_title,
        'author': author,
        'doc_id': -1,
    }
    tx.run(statement, **parameters)

In [6]:
def mark_bib_query(tx, title, ref_title):
    """Link a citing document to a referenced document by title.

    The citing Document is MATCHed by ``$title`` so that ``a`` is bound before
    the relationship MERGEs. Without that binding ``MERGE (a)-[:REFERENCED]->(b)``
    would match or create an arbitrary node and ``$title`` would go unused.

    Args:
        tx: object exposing ``run(query, **parameters)``.
        title: title of the citing document (``$title``).
        ref_title: title of the referenced document (``$ref_title``).

    Returns:
        None.
    """
    tx.run("MATCH (a:Document {title: $title}) "
           "MERGE (b:Document {title: $ref_title}) "
           "MERGE (a)-[:REFERENCED]->(b) "
           "MERGE (b)-[:REFERENCED_BY]->(a)", title=title, ref_title=ref_title)

In [7]:
def add_author_query(tx, title, author):
    """Attach an author to an existing document.

    The Document is MATCHed by title, the Author is MERGEd by name, and the
    WRITTEN_BY/WROTE relationships between them are MERGEd.

    Args:
        tx: object exposing ``run(query, **parameters)``.
        title: title of the document (``$title``).
        author: name of the author (``$author``).

    Returns:
        None.
    """
    statement = " ".join((
        "MATCH (a:Document {title: $title})",
        "MERGE (b:Author {name: $author})",
        "MERGE (a)-[:WRITTEN_BY]->(b)",
        "MERGE (b)-[:WROTE]->(a)",
    ))
    tx.run(statement, title=title, author=author)

In [8]:
# Dataset location: the 'dataset' folder next to the current working directory's parent entry ('../dataset').
root_dir = join(pardir, 'dataset')
# Input files for the import -- presumably every document found under root_dir; see utils.literature.get_files.
files = get_files(root_dir)
# Metadata lookup; process_chunk_documents calls get_journal(paper_id) on it.
metadata_lookup = CORDMetadata()
# Lookup object passed to every DataLoader -- presumably resolves institutions via GRID; TODO confirm.
grid_lookup = GridLookup()

In [9]:
# Placeholder institution record, used when an author has no institution.
undefined = dict.fromkeys(('Name', 'Country', 'Code'), 'undefined')

In [10]:
def process_chunk_documents(args):
    """Import one chunk of documents into Neo4j.

    For every document in the chunk that has a non-empty English title and
    whose title is not yet recorded in ``added``, one ``add_new`` write
    transaction is executed per author.

    Args:
        args: iterable of ``(fpath, doc_id)`` pairs; ``doc_id`` is converted
            with ``int()``.

    Returns:
        None.

    Notes:
        Uses the notebook-level names ``address``, ``auth``, ``grid_lookup``,
        ``metadata_lookup``, ``undefined`` and ``added``.
        NOTE(review): when this function runs in ``multiprocessing`` workers,
        each worker process mutates its own copy of ``added``. Duplicates are
        then only filtered within one worker and the parent's dict stays
        empty -- confirm whether that is acceptable.
    """
    # Use the shared connection settings instead of repeating the literals.
    with GraphDatabase.driver(address, auth=auth, encrypted=False) as driver:
        with driver.session() as session:
            # iterate over all documents in chunk
            for fpath, doc_id in args:
                doc_id = int(doc_id)
                data_loader = DataLoader(fpath, grid_lookup)
                doc_title = get_document_title(fpath, data_loader)

                # database should only contain english documents with a valid document title
                if doc_title == '' or not is_english(doc_title):
                    continue

                # avoid duplicates from dataset
                if added.get(doc_title) is not None:
                    continue
                added[doc_title] = True

                # The journal is looked up only for documents that are actually
                # imported, and normalised once instead of once per author.
                sha = data_loader.get_paper_id()
                journal = metadata_lookup.get_journal(sha)
                if journal is None or journal == 'nan':
                    journal = 'undefined'

                # NOTE(review): the original comment asked for normalized names so that
                # ref entries can be matched with existing documents, yet
                # normalize_names is passed as False -- confirm which is intended.
                authors = get_authors(fpath, data_loader, plausibility_check=True,
                                      clean_names=True, normalize_names=False)

                for author, institution in authors:
                    if author is None:
                        continue
                    if institution is None:
                        institution = undefined
                    session.write_transaction(add_new, doc_title, doc_id, author, institution, journal)

In [11]:
def process_chunk_bib_ref(args):
    """Import the bibliography references of one chunk of documents.

    For every document with a non-empty English title, each bibliography
    entry is linked to the document. If the entry's title is already recorded
    in ``added``, ``mark_bib_query`` only links the two documents. Otherwise
    ``add_bib_query`` runs once per non-None author of the entry.

    Args:
        args: iterable of ``(fpath, doc_id)`` pairs; ``doc_id`` is not used
            here.

    Returns:
        None.

    Notes:
        Uses the notebook-level names ``address``, ``auth``, ``grid_lookup``
        and ``added``.
        NOTE(review): ``added`` is filled by ``process_chunk_documents``. If
        that ran in ``multiprocessing`` workers, the dict seen here may not
        contain their entries -- confirm.
    """
    # Use the shared connection settings instead of repeating the literals.
    with GraphDatabase.driver(address, auth=auth, encrypted=False) as driver:
        with driver.session() as session:
            # iterate over all documents in chunk
            for fpath, _doc_id in args:
                data_loader = DataLoader(fpath, grid_lookup)
                doc_title = get_document_title(fpath, data_loader)

                # database should only contain english documents with a valid document title
                if doc_title == '' or not is_english(doc_title):
                    continue

                ref_entries = get_bib_entries(fpath, data_loader)

                for bib_title, authors in ref_entries:
                    # Try to merge the ref entry with an existing document. The lookup
                    # uses the referenced title; checking doc_title here would test
                    # the citing document instead.
                    if added.get(bib_title) is not None:
                        session.write_transaction(mark_bib_query, doc_title, bib_title)
                        continue

                    for author, _ in authors:
                        if author is None:
                            continue
                        session.write_transaction(add_bib_query, doc_title, bib_title, author)

In [14]:
# Cache of added documents, used in process_chunk_bib_ref
# for checking if a ref entry already exists in the database
# --> improve performance
added = dict()

chunks = create_chunks(files)

# FIXME: Each worker process requires a large amount of RAM
# The context manager terminates the workers if the loop raises, so no
# worker processes are left behind after a failure.
with Pool() as pool:
    for _ in tqdm(pool.imap_unordered(process_chunk_documents, chunks), total=len(chunks)):
        pass

    pool.close()
    pool.join()

HBox(children=(FloatProgress(value=0.0, max=385.0), HTML(value='')))




In [13]:
"""
Currently not performant enough to run in acceptable time
"""

#pool = Pool() #FIXME: Each worker process requires a large amount of RAM
#chunks = create_chunks(files)
#
#for _ in tqdm(pool.imap_unordered(process_chunk_bib_ref, chunks), total=len(chunks)):
#    pass

#pool.close()
#pool.join()

'\nCurrently not performant enough to run in acceptable time\n'