In [1]:
%%capture
!pip install python-arango
!pip install pycld2

In [2]:
from __future__ import print_function

import string
import re
import sys

import pycld2 as cld2

from multiprocessing.pool import Pool
from tqdm._tqdm_notebook import tqdm

from IPython.display import display, clear_output

import numpy as np
import pandas as pd

from os import getcwd, pardir
from os.path import join, basename

from utils.literature import DataLoader, get_document_title, is_english, get_section, get_sections
from utils.preprocessing import NLPPipeline, Tokenizer, Stemmer, ToLowercase, Lemmatizer, StopwordRemover, CitationRemover, SymbolRemover, ContentInBracketsRemover, NonAlphanumericRemover
from glob import glob

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer #TODO: Try lemmatizer instead of stemmer

from arango import ArangoClient

nltk.download()

## Specification of the document paths

- download from: [Kaggle](https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge)
- extract folder content to `{ProjectDir}/dataset`

In [3]:
root_dir = join(pardir, 'dataset')

json_paths = [
    join(root_dir, 'arxiv', 'arxiv', 'pdf_json'),
    join(root_dir, 'arxiv', 'arxiv', 'pdf_json'),
    join(root_dir, 'comm_use_subset', 'comm_use_subset', 'pdf_json'),
    join(root_dir, 'noncomm_use_subset', 'noncomm_use_subset', 'pdf_json'),
    join(root_dir, 'custom_license', 'custom_license', 'pdf_json'),
    join(root_dir, 'biorxiv_medrxiv', 'biorxiv_medrxiv', 'pdf_json'),
]

files = []
[files.extend(glob(join(path, '*.json'))) for path in json_paths];

## Building the document index

Each document gets stored in the document_index collection with the following layout: 
```json
{
    _id: ..., 
    document_title: ...
}
```

## Building the inverted index

```json
{
    'stem': ...,
    'doc_ids': [
        {
            {'doc_id': ..., 
             'count': {
                 'title': ...,
                 'abstract': ...,
                 'body_text': ...}
             },
             {'doc_id': ..., 
             'count': {
                 'title': ...,
                 'abstract': ...,
                 'body_text': ...}
             }
        }
    ]
}
```

In [4]:
##Create indices

In [5]:
pipeline = NLPPipeline([
    ToLowercase(),
    CitationRemover(),
    ContentInBracketsRemover(),
    Tokenizer(),
    NonAlphanumericRemover(),
    #SymbolRemover(),
    StopwordRemover(),
    Stemmer()
])

In [6]:
def initialize_database():
    client = ArangoClient(hosts='http://localhost:8529')
    sys_db = client.db('_system', username='root', password='')

    if 'covid_19' not in sys_db.databases():
        sys_db.create_database('covid_19')

    db = client.db('covid_19', username='root', password='')

    if not db.has_collection('inverted_index'):
        db.create_collection('inverted_index')
    if not db.has_collection('document_index'):
        db.create_collection('document_index')

    return db, db.collection('document_index'), db.collection('inverted_index')

In [7]:
def get_connection_instance():
    client = ArangoClient(hosts='http://localhost:8529')
    db = client.db('covid_19', username='root', password='')
    return db.collection('document_index'), db.collection('inverted_index')

In [8]:
db, document_index_collection, inverted_index_collection = initialize_database()

In [9]:
count_object = {
    'title': {'title': 1, 'abstract': 0, 'body_text': 0},
    'abstract': {'title': 0, 'abstract': 1, 'body_text': 0},
    'body_text': {'title': 0, 'abstract': 0, 'body_text': 1}
}

In [10]:
def try_get_doc_id(json_object, doc_id):
    # TODO: O(n) --> improve with b-tree ? 
    res = list(filter(lambda f: (f["doc_id"] == doc_id), json_object['doc_ids']))
    if len(res) == 0:
        return None
    return res[0]

In [11]:
def create_document(stem, doc_id, count_object):
    return {'_key': stem, 'doc_ids': [{'doc_id': doc_id, 'count': count_object}]}

In [12]:
def update_document_index(doc_id, document_title, document_index_collection):
    if str(doc_id) not in document_index_collection:
        document_index_collection.insert({'_key': str(doc_id), 'document_title': document_title})

In [13]:
def update_inverted_index(doc_id, stemmed_tokens, section, inverted_index_collection):
    #update inverted index
    for stem in stemmed_tokens:
        if stem is '' or stem is ' ': continue
        if stem in inverted_index_collection:
            #try to find current document id in stem entry
            tmp = inverted_index_collection[stem]
            doc_id_object = try_get_doc_id(tmp, doc_id)
            if doc_id_object is not None:
                # update occurrence of stem in document section
                try_get_doc_id(tmp, doc_id)['count']['title'] += 1
                inverted_index_collection.update(tmp)
            else:
                # add document id
                tmp['doc_ids'].append({'doc_id': doc_id, 'count': count_object[section]})
                inverted_index_collection.update(tmp)
        else:
            doc = create_document(stem, doc_id, count_object[section])
            inverted_index_collection.insert(doc)

In [14]:
%%timeit
get_connection_instance()

28.4 µs ± 227 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [15]:
def process_chunk(args, update_doc_idx=True, update_inv_indx=True):
    # open a connection to the database
    document_index_collection, inverted_index_collection = get_connection_instance()
    #iterate over all documents in chunk
    for fpath, doc_id in args:
        doc_id = int(doc_id)
        data_loader = DataLoader(fpath)

        doc_title = get_document_title(fpath, data_loader)

        #database should only contain english documents with an valid document title
        if doc_title == '' or not is_english(doc_title):
            continue

        if doc_title not in document_index_collection:
            update_document_index(
                doc_id, 
                doc_title, 
                document_index_collection)

        for section in get_sections():
            text = get_section(fpath, section, dl=data_loader)
            stemmed_tokens = pipeline.transform(text)
            update_inverted_index(doc_id, stemmed_tokens, section, inverted_index_collection)

In [16]:
def create_chunks(files, chunk_size=128):
    doc_ids = list(range(len(files)))
    chunks = list()
    for i in range(0, len(files), chunk_size):
        indices = np.array(doc_ids[i: min(i+chunk_size, len(files))])
        chunks.append(list(zip(
            files[i: min(i+chunk_size, len(files))], 
            doc_ids[i: min(i+chunk_size, len(files))])))
    return chunks

In [17]:
for chunk in create_chunks(files, chunk_size=1024):
    process_chunk(chunk)

DocumentParseError: bad collection name in document ID "Perception of emergent epidemic of COVID-2019 / SARS CoV-2 on the Polish Internet"

pool = Pool()
chunks = create_chunks(files)

for _ in tqdm(pool.imap_unordered(process_chunk, chunks), total=len(chunks)):
    pass

pool.close()
pool.join()