- [ ] Fix error: WRITTEN_BY <-> WROTE
- [x] Normalize Names (A. B. Name)
- [ ] Add references to property graph
- [ ] Try Topic Modeling (https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/)
- [ ] Split add_new into smaller chunks

In [None]:
%%capture
!pip install neo4j

In [1]:
from os import pardir
from os.path import join
from utils.literature import get_files, get_document_title, get_authors, get_bib_entries, DataLoader, is_english
from utils.processing import create_chunks
from utils.metadata import CORDMetadata
from utils.grid.grid import GridLookup
from neo4j import GraphDatabase
from multiprocessing.pool import Pool
from tqdm._tqdm_notebook import tqdm
from IPython.display import clear_output
import math

In [2]:
# Neo4j connection settings: Bolt endpoint and (user, password) credentials.
address = 'bolt://localhost:7687'
auth = ('neo4j', 'password')

In [3]:
#TODO: Split Query in smaller chunks
def add_new(tx, title, doc_id, author, institution, journal):
    """Merge one document/author pair and its metadata into the graph.

    Issues a single Cypher statement on ``tx`` that MERGEs the Document,
    Author, Institution, Journal and Country nodes and both directions of
    every relationship between them.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        title: Document title.
        doc_id: Document id stored on the Document node.
        author: Author name.
        institution: Mapping with the keys ``'Name'``, ``'Country'`` and
            ``'Code'``.
        journal: Journal name.
    """
    clauses = (
        # nodes
        "MERGE (a:Document {title: $title, doc_id: $doc_id})",
        "MERGE (b:Author {name: $author})",
        "MERGE (c:Institution {name: $institution_name})",
        "MERGE (d:Journal {name: $journal})",
        "MERGE (e:Country {name: $country, code: $code})",
        # relationships (each stored in both directions)
        "MERGE (b)-[:WORKS_FOR]->(c)",
        "MERGE (c)-[:EMPLOYED]->(b)",
        "MERGE (a)-[:WRITTEN_BY]->(b)",
        "MERGE (b)-[:WROTE]->(a)",
        "MERGE (c)-[:LOCATED_IN]->(e)",
        "MERGE (e)-[:LOCATES]->(c)",
        "MERGE (a)-[:PUBLISHED_IN]->(d)",
        "MERGE (d)-[:PUBLISHED]->(a)",
    )
    # Join with a space so consecutive clauses never fuse into "...})MERGE (...".
    tx.run(" ".join(clauses),
           title=title, doc_id=doc_id, author=author,
           institution_name=institution['Name'],
           country=institution['Country'], code=institution['Code'],
           journal=journal)

In [4]:
def add_bib_query(tx, title, ref_title, author):
    """Attach a bibliography entry and one of its authors to a document.

    Matches the citing Document by ``title``, then MERGEs the referenced
    Document (stored with ``doc_id`` -1), its Author, and both directions of
    the reference and authorship relationships.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        title: Title of the citing document.
        ref_title: Title of the referenced document.
        author: Name of an author of the referenced document.
    """
    statement = " ".join([
        "MATCH (a:Document {title: $title})",
        "MERGE (b:Document {title: $ref_title, doc_id: $doc_id})",
        "MERGE (c:Author {name: $author})",
        "MERGE (a)-[:REFERENCED]->(b)",
        "MERGE (b)-[:REFERENCED_BY]->(a)",
        "MERGE (b)-[:WRITTEN_BY]->(c)",
        "MERGE (c)-[:WROTE]->(b)",
    ])
    bindings = {
        'title': title,
        'ref_title': ref_title,
        'author': author,
        'doc_id': -1,
    }
    tx.run(statement, **bindings)

In [5]:
def mark_bib_query(tx, title, ref_title):
    """Link a citing document to a referenced document by title.

    Matches the citing Document by ``title``, MERGEs the referenced Document
    by ``ref_title`` and MERGEs the reference relationship in both directions.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        title: Title of the citing document.
        ref_title: Title of the referenced document.
    """
    # Bind `a` explicitly: without this MATCH the $title parameter is unused and
    # the relationship MERGE operates on an unbound node instead of the citing
    # document.
    tx.run("MATCH (a:Document {title: $title}) "
           "MERGE (b:Document {title: $ref_title}) "
           "MERGE (a)-[:REFERENCED]->(b) "
           "MERGE (b)-[:REFERENCED_BY]->(a)",
           title=title, ref_title=ref_title)

In [6]:
def add_author_query(tx, title, author):
    """Attach an author to an existing document.

    Matches the Document by ``title``, MERGEs the Author node and MERGEs the
    authorship relationship in both directions.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        title: Title of the document.
        author: Author name.
    """
    statement = " ".join([
        "MATCH (a:Document {title: $title})",
        "MERGE (b:Author {name: $author})",
        "MERGE (a)-[:WRITTEN_BY]->(b)",
        "MERGE (b)-[:WROTE]->(a)",
    ])
    bindings = {'title': title, 'author': author}
    tx.run(statement, **bindings)

In [7]:
# Dataset location: a 'dataset' directory inside the parent of the working directory.
root_dir = join(pardir, 'dataset')
# Project helper; presumably enumerates the dataset files under root_dir — TODO confirm.
files = get_files(root_dir)
# Shared lookup helpers used by the chunk workers below.
metadata_lookup = CORDMetadata()  # queried via get_journal(paper_id) in process_chunk_documents
grid_lookup = GridLookup()  # handed to every DataLoader instance

In [8]:
# Placeholder institution record used when an author has no institution.
undefined = dict.fromkeys(('Name', 'Country', 'Code'), 'undefined')

In [14]:
def process_chunk_documents(args):
    """Insert the documents of one chunk, with authors and metadata, into Neo4j.

    Opens its own driver and session, then for every ``(fpath, doc_id)`` pair
    writes one ``add_new`` transaction per author of the document. Documents
    with an empty or non-English title, and titles already present in the
    module-level ``added`` cache, are skipped.

    Args:
        args: Iterable of ``(fpath, doc_id)`` pairs; ``doc_id`` must be
            convertible with ``int()``.

    Returns:
        list: Titles this call newly recorded in ``added``. When the function
        runs in a pool worker, the worker's ``added`` is a separate copy, so
        the caller has to merge these titles into its own cache.
    """
    new_titles = []
    # Reuse the notebook-level connection settings instead of repeating the literals.
    with GraphDatabase.driver(address, auth=auth, encrypted=False) as driver:
        with driver.session() as session:
            # iterate over all documents in chunk
            for fpath, doc_id in args:
                doc_id = int(doc_id)
                data_loader = DataLoader(fpath, grid_lookup)
                doc_title = get_document_title(fpath, data_loader)

                # database should only contain english documents with a valid document title
                if doc_title == '' or not is_english(doc_title):
                    continue

                # avoid duplicates from dataset
                if doc_title in added:
                    continue
                added[doc_title] = True
                new_titles.append(doc_title)

                # Resolve the journal only for documents that are actually inserted,
                # and normalise it once instead of once per author.
                sha = data_loader.get_paper_id()
                journal = metadata_lookup.get_journal(sha)
                if journal is None or journal == 'nan':
                    journal = 'undefined'

                # add document with metadata to database
                for author, institution in get_authors(fpath, data_loader):
                    if author is None:
                        continue
                    if institution is None:
                        institution = undefined
                    session.write_transaction(add_new, doc_title, doc_id, author, institution, journal)
    return new_titles


In [15]:
def process_chunk_bib_ref(args):
    """Insert the bibliography references of one chunk of documents into Neo4j.

    Opens its own driver and session. For every document with a non-empty
    English title, each bibliography entry is either linked to an already
    known document (``mark_bib_query``) or created together with its authors
    (``add_bib_query``, one transaction per author).

    Args:
        args: Iterable of ``(fpath, doc_id)`` pairs; only ``fpath`` is used.
    """
    # Reuse the notebook-level connection settings instead of repeating the literals.
    with GraphDatabase.driver(address, auth=auth, encrypted=False) as driver:
        with driver.session() as session:
            # iterate over all documents in chunk
            for fpath, _doc_id in args:
                data_loader = DataLoader(fpath, grid_lookup)
                doc_title = get_document_title(fpath, data_loader)

                # database should only contain english documents with a valid document title
                if doc_title == '' or not is_english(doc_title):
                    continue

                for bib_title, authors in get_bib_entries(fpath, data_loader):
                    # try to merge ref entry with existing document: the lookup has to
                    # use the referenced title, not the title of the citing document
                    if bib_title in added:
                        session.write_transaction(mark_bib_query, doc_title, bib_title)
                        continue

                    for author, _ in authors:
                        if author is None:
                            continue
                        session.write_transaction(add_bib_query, doc_title, bib_title, author)

In [12]:
# Cache of added document titles, used in process_chunk_bib_ref
# for checking if a ref entry already exists in the database
# --> improve performance
added = dict()

chunks = create_chunks(files)

# FIXME: Each worker process requires a large amount of RAM
# The context manager terminates the workers if the loop is interrupted or fails.
with Pool(16) as pool:
    for new_titles in tqdm(pool.imap_unordered(process_chunk_documents, chunks), total=len(chunks)):
        # Workers only update their own copy of `added`; merge any titles they
        # report back so the cache in this process is populated as well.
        for doc_title in new_titles or ():
            added[doc_title] = True

    pool.close()
    pool.join()

HBox(children=(FloatProgress(value=0.0, max=385.0), HTML(value='')))

Process ForkPoolWorker-4:
Traceback (most recent call last):
  File "/home/tobias/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/tobias/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/tobias/anaconda3/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "<ipython-input-10-bd590f9fdfea>", line 26, in process_chunk_documents
    authors = get_authors(fpath, data_loader)
  File "/home/tobias/Desktop/covid19-search/processing/utils/literature.py", line 182, in get_authors
    return dl.get_authors()
  File "/home/tobias/Desktop/covid19-search/processing/utils/literature.py", line 92, in get_authors
    plausibility_check=plausibility_check)
  File "/home/tobias/Desktop/covid19-search/processing/utils/literature.py", line 77, in __parse_institution
    is_institution = self.grid_lookup.get_instituti

KeyboardInterrupt: 

In [31]:
chunks = create_chunks(files)

# FIXME: Each worker process requires a large amount of RAM
# The context manager terminates the workers if the loop is interrupted or fails.
with Pool() as pool:
    for _ in tqdm(pool.imap_unordered(process_chunk_bib_ref, chunks), total=len(chunks)):
        pass

    pool.close()
    pool.join()

HBox(children=(FloatProgress(value=0.0, max=385.0), HTML(value='')))

Process ForkPoolWorker-64:
Traceback (most recent call last):
  File "/home/tobias/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/tobias/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Process ForkPoolWorker-74:
  File "/home/tobias/anaconda3/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))


KeyboardInterrupt: 