In [1]:
!pip install pymongo
!pip install pycld2



In [2]:
from __future__ import print_function

import string
import re
import sys

import pycld2 as cld2

from multiprocessing.pool import Pool
from tqdm._tqdm_notebook import tqdm

from IPython.display import display, clear_output

import numpy as np
import pandas as pd

from os import getcwd, pardir
from os.path import join, basename

from utils.literature import DataLoader
from utils.preprocessing import NLPPipeline, Tokenizer, Stemmer, ToLowercase, Lemmatizer, StopwordRemover, CitationRemover, SymbolRemover, ContentInBracketsRemover
from glob import glob

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer #TODO: Try lemmatizer instead of stemmer

import pymongo
from pymongo import MongoClient as DBClient

# Fetch NLTK data used by the preprocessing pipeline.
# NOTE(review): called without a resource id, this presumably opens NLTK's
# interactive downloader and would block an unattended "Run All" — confirm and
# consider passing the explicit resource ids that `utils.preprocessing` needs.
nltk.download()

## Specification of the document paths

- download from: [Kaggle](https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge)
- extract folder content to `{ProjectDir}/dataset`

In [3]:
# Root of the extracted Kaggle CORD-19 dataset, relative to this notebook.
root_dir = join(pardir, 'dataset')

# One `pdf_json` directory per dataset subset. Every directory is listed exactly
# once: a duplicated entry would put its files into `files` twice, so each of
# those papers would be indexed under two different doc ids.
json_paths = [
    join(root_dir, 'arxiv', 'arxiv', 'pdf_json'),
    join(root_dir, 'comm_use_subset', 'comm_use_subset', 'pdf_json'),
    join(root_dir, 'noncomm_use_subset', 'noncomm_use_subset', 'pdf_json'),
    join(root_dir, 'custom_license', 'custom_license', 'pdf_json'),
    join(root_dir, 'biorxiv_medrxiv', 'biorxiv_medrxiv', 'pdf_json'),
]

# Collect all paper files. `glob` returns entries in arbitrary order, and the
# doc ids are later derived from the position in `files` (see `create_chunks`),
# so each directory listing is sorted to keep the ids stable between runs.
files = []
for path in json_paths:
    files.extend(sorted(glob(join(path, '*.json'))))

## Building the document index

Each document is stored in the `document_index` collection with the following layout:
```json
{
    _id: ..., 
    document_title: ...
}
```

## Building the inverted index

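Each stem is stored in the `inverted_index` collection, with the stem itself used as the `_id`, in the following layout:
```json
{
    '_id': ...,
    'doc_ids': [
        {'doc_id': ..., 'count': ...},
        {'doc_id': ..., 'count': ...},
        {'doc_id': ..., 'count': ...},
        {'doc_id': ..., 'count': ...},
    ]
}
```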

In [4]:
# Prepare the MongoDB collections and indexes used by the indexing workers.
with DBClient('localhost', 27017, w=0) as client:
    db = client['covid_19']
    document_index_collection = db['document_index']
    inverted_index_collection = db['inverted_index']

    # MongoDB maintains a unique `_id` index on every collection automatically,
    # so only the secondary index on the posting doc ids has to be created.
    #
    # That index must NOT be unique: a unique index on an array field is
    # enforced across documents, which would allow each doc id to appear in the
    # posting list of one single stem only.
    posting_index_name = 'doc_ids.doc_id_1'  # default name of the ascending index
    existing_index = inverted_index_collection.index_information().get(posting_index_name)
    if existing_index is not None and existing_index.get('unique'):
        # Left over from an earlier run that created the index as unique.
        inverted_index_collection.drop_index(posting_index_name)

    # `create_index` is a no-op when an identical index already exists.
    inverted_index_collection.create_index('doc_ids.doc_id')

TODO: Test
```python
pattern = r'''(?x)          # set flag to allow verbose regexps
        (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
      | \.\.\.              # ellipsis
      | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    '''
```

In [5]:
# Preprocessing pipeline shared by the indexing workers; `process_chunk` runs
# every document's full text through `pipeline.transform`.
pipeline = NLPPipeline(
    [
        ToLowercase(),
        CitationRemover(),
        ContentInBracketsRemover(),
        Tokenizer(),
        SymbolRemover(),
        StopwordRemover(),
        Lemmatizer(),
    ]
)

In [6]:
def is_english(stemmed_tokens, count=20):
    """Tell whether a token sequence is reliably detected as English.

    The first `count` tokens are joined with single spaces and passed to
    `cld2.detect`. The result is True only if the detection is flagged as
    reliable and the language code of the first reported language is 'en'.
    """
    sample = ' '.join(stemmed_tokens[:count])
    reliable, _, languages = cld2.detect(sample)
    # Short-circuit keeps `languages` untouched when detection is unreliable.
    return bool(reliable and languages[0][1] == 'en')

In [7]:
def get_full_text(fpath, dl=None):
    """Return the full text of the document at `fpath`.

    An already constructed loader can be passed as `dl` to avoid building a
    new `DataLoader`; when `dl` is None one is created from `fpath`.
    """
    loader = DataLoader(fpath) if dl is None else dl
    return loader.get_full_text()

In [8]:
def get_document_title(fpath, dl=None):
    """Return the title of the document at `fpath`.

    An already constructed loader can be passed as `dl` to avoid building a
    new `DataLoader`; when `dl` is None one is created from `fpath`.
    """
    loader = DataLoader(fpath) if dl is None else dl
    return loader.get_title()

In [9]:
def create_post(stem, doc_id):
    """Build a new inverted-index entry for `stem`.

    The stem becomes the entry's `_id`, and its posting list starts with a
    single posting for `doc_id` with an occurrence count of 1.
    """
    first_posting = {'doc_id': doc_id, 'count': 1}
    return {'_id': stem, 'doc_ids': [first_posting]}

In [10]:
def process_chunk(args):
    """Index one chunk of documents into MongoDB.

    Intended to run inside a worker process: it opens its own client
    connection, and for every document in the chunk preprocesses the full
    text, skips documents that are not detected as English or have an empty
    title, and then updates the `document_index` and `inverted_index`
    collections of the `covid_19` database.

    Parameters
    ----------
    args : iterable of (fpath, doc_id) pairs
        Path of the document file and the id it is indexed under.

    Returns
    -------
    None
    """
    from collections import Counter  # local import keeps this cell self-contained

    with DBClient('localhost', 27017, w=0) as client:
        db = client['covid_19']
        document_index_collection = db['document_index']
        inverted_index_collection = db['inverted_index']

        for fpath, doc_id in args:
            doc_id = int(doc_id)

            # preprocessing
            dl = DataLoader(fpath)
            text = get_full_text(fpath, dl)
            stemmed_tokens = pipeline.transform(text)

            # get relevant metadata
            document_title = get_document_title(fpath, dl)

            if not is_english(stemmed_tokens) or document_title == '':
                continue

            # update document index (only `_id` is projected; existence is all we need)
            if document_index_collection.find_one({'_id': doc_id}, {'_id': 1}) is None:
                document_index_collection.insert_one(
                    {'_id': doc_id, 'document_title': document_title})

            # update inverted index
            # Occurrences are aggregated per document first, so each distinct
            # stem is written once with its total count. Besides saving round
            # trips, this avoids reading back a stem right after an
            # unacknowledged (w=0) write to it, which may not be visible yet.
            # NOTE: the check-then-write below is still not atomic across
            # worker processes handling the same stem concurrently.
            for stem, occurrences in Counter(stemmed_tokens).items():
                stem_filter = {'_id': stem}
                posting_filter = {
                    '_id': stem,
                    'doc_ids': {'$elemMatch': {'doc_id': doc_id}}}

                if inverted_index_collection.find_one(stem_filter, {'_id': 1}) is None:
                    # first time this stem is seen: create its entry
                    post = create_post(stem, doc_id)
                    post['doc_ids'][0]['count'] = occurrences
                    inverted_index_collection.insert_one(post)
                elif inverted_index_collection.find_one(posting_filter, {'_id': 1}) is not None:
                    # document already listed for this stem: add to its count
                    inverted_index_collection.update_one(
                        posting_filter,
                        {'$inc': {'doc_ids.$.count': occurrences}})
                else:
                    # stem known, document not yet listed: append a posting
                    inverted_index_collection.update_one(
                        stem_filter,
                        {'$push': {'doc_ids': {'doc_id': doc_id, 'count': occurrences}}})


In [11]:
def create_chunks(files, chunk_size=100):
    """Split `files` into chunks of `(file, doc_id)` pairs.

    Each file is paired with its position in `files`, which serves as its
    document id. The pairs are then grouped into consecutive lists of at most
    `chunk_size` elements; the last chunk may be shorter.

    Parameters
    ----------
    files : sequence
        File paths to distribute.
    chunk_size : int, default 100
        Maximum number of files per chunk.

    Returns
    -------
    list of list of (file, doc_id) tuples
        Empty when `files` is empty.
    """
    numbered_files = list(zip(files, range(len(files))))
    # Slicing clamps at the end of the list, so the last chunk needs no special case.
    return [numbered_files[start:start + chunk_size]
            for start in range(0, len(numbered_files), chunk_size)]

In [12]:
# Index all chunks in parallel worker processes, showing progress per chunk.
pool = Pool()
chunks = create_chunks(files)

try:
    # Results carry no information; the loop only drives the progress bar.
    for _ in tqdm(pool.imap_unordered(process_chunk, chunks), total=len(chunks)):
        pass
except BaseException:
    # A failing chunk or a kernel interrupt must not leave workers running.
    pool.terminate()
    raise
else:
    pool.close()
finally:
    pool.join()

HBox(children=(FloatProgress(value=0.0, max=492.0), HTML(value='')))


