In [1]:
!pip install pymongo



In [12]:
from __future__ import print_function

import string
import re
import sys

from multiprocessing import pool
from tqdm._tqdm_notebook import tqdm

from IPython.display import display, clear_output

import numpy as np
import pandas as pd

from os import getcwd, pardir
from os.path import join, basename

from utils.literature import DataLoader
from utils.preprocessing import NLPPipeline, Tokenizer, Stemmer, ToLowercase, Lemmatizer, StopwordRemover, CitationRemover, SymbolRemover, ContentInBracketsRemover
from glob import glob

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer #TODO: Try lemmatizer instead of stemmer

import pymongo
from pymongo import MongoClient as DBClient

nltk.download()

## Specification of the document paths

- download from: [Kaggle](https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge)
- extract folder content to `{ProjectDir}/dataset`

In [2]:
root_dir = join(pardir, 'dataset')

json_paths = [
    join(root_dir, 'arxiv', 'arxiv', 'pdf_json'),
    join(root_dir, 'arxiv', 'arxiv', 'pdf_json'),
    join(root_dir, 'comm_use_subset', 'comm_use_subset', 'pdf_json'),
    join(root_dir, 'noncomm_use_subset', 'noncomm_use_subset', 'pdf_json'),
    join(root_dir, 'custom_license', 'custom_license', 'pdf_json'),
    join(root_dir, 'biorxiv_medrxiv', 'biorxiv_medrxiv', 'pdf_json'),
]

files = []
[files.extend(glob(join(path, '*.json'))) for path in json_paths];

## Building the document index

Each document gets stored in the document_index collection with the following layout: 
```json
{
    _id: ..., 
    document_title: ...
}
```

## Building the reversed index

```json
{
    'stem': ...,
    'doc_ids': [
        {
            {'doc_id': ..., 'count': ...},
            {'doc_id': ..., 'count': ...},
            {'doc_id': ..., 'count': ...},
            {'doc_id': ..., 'count': ...},
        }
    ]
}
```

TODO: Test
pattern = r'''(?x)          # set flag to allow verbose regexps
        (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
      | \.\.\.              # ellipsis
      | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    '''

In [8]:
pipeline = NLPPipeline([
    ToLowercase(),
    CitationRemover(),
    ContentInBracketsRemover(),
    Tokenizer(),
    SymbolRemover(),
    StopwordRemover(),
    Lemmatizer()
])

In [6]:
def get_full_text(fpath, dl=None):
    if dl is None: dl = DataLoader(fpath)
    return dl.get_full_text()

In [7]:
def get_document_title(fpath, dl=None):
    if dl is None: dl = DataLoader(fpath)
    return dl.get_title()

In [8]:
def create_post(stem, doc_id):
    return {'_id': stem, 'doc_ids': [{'doc_id': doc_id, 'count': 1}]}

In [19]:
def process_chunk(args):
    with DBClient('localhost', 27017, w=0) as client:
        db = client['covid_19']
        document_index_collection = db['document_index']
        reversed_index_collection = db['reversed_index']

        for fpath, doc_id in args:
            doc_id = int(doc_id)

            #preprocessing
            dl = DataLoader(fpath)
            text = get_full_text(fpath, dl)
            stemmed_tokens = pipeline.transform(text)

            #get relevant metadata
            document_title = get_document_title(fpath, dl)

            #update document index
            doc_id_entry = document_index_collection.find({'_id': doc_id}).limit(1)
            if doc_id_entry.count() == 0:
                document_index_collection.insert_one({'_id': doc_id, 'document_title': document_title})

            #update reversed index
            for stem in stemmed_tokens:
                stem_entry = reversed_index_collection.find({'_id': stem}).limit(1)
                if stem_entry.count() > 0:
                    #try to find current document id in stem entry
                    doc_id_object = reversed_index_collection.find(
                        {'_id': stem, 
                        'doc_ids': {'$elemMatch': {'doc_id': doc_id}}}).limit(1)
                    if doc_id_object.count() > 0:
                        # update occurrence of stem in document
                        reversed_index_collection.update(
                            {'_id': stem, 
                            'doc_ids': {'$elemMatch': {'doc_id': doc_id}}},
                            {'$inc': {'doc_ids.$.count': 1}})
                    else:
                        # add document id
                        reversed_index_collection.update(
                            {'_id': stem},
                            {'$push': {'doc_ids': {'doc_id': doc_id, 'count': 1}}})
                else:
                    post = create_post(stem, doc_id)
                    reversed_index_collection.insert_one(post)


In [17]:
def create_chunks(files, chunk_size=100):
    doc_ids = list(range(len(files)))
    chunks = list()
    for i in range(0, len(files), chunk_size):
        indices = np.array(doc_ids[i: min(i+chunk_size, len(files))])
        chunks.append(list(zip(
            files[i: min(i+chunk_size, len(files))], 
            doc_ids[i: min(i+chunk_size, len(files))])))
    return chunks

In [20]:
pool = Pool()

for _ in tqdm(pool.imap_unordered(process_chunk, create_chunks(files)), total=len(files)):
    pass

pool.close()
pool.join()



  0%|          | 0/46528 [00:00<?, ?it/s][A[A

  0%|          | 1/46528 [03:31<2739:23:34, 211.96s/it][A[A

  0%|          | 2/46528 [03:38<1943:05:24, 150.35s/it][A[A

  0%|          | 3/46528 [03:46<1392:36:45, 107.76s/it][A[A

  0%|          | 4/46528 [03:50<989:43:04, 76.58s/it][A[A

  0%|          | 5/46528 [03:54<705:29:21, 54.59s/it][A[A

  0%|          | 6/46528 [03:57<507:50:01, 39.30s/it][A[A

  0%|          | 7/46528 [03:59<364:15:08, 28.19s/it][A[A

  0%|          | 8/46528 [04:17<323:31:24, 25.04s/it][A[A

  0%|          | 9/46528 [07:56<1073:38:30, 83.09s/it][A[A

  0%|          | 10/46528 [07:59<763:32:51, 59.09s/it][A[A

  0%|          | 11/46528 [07:59<535:31:39, 41.45s/it][A[A

  0%|          | 12/46528 [08:19<453:05:50, 35.07s/it][A[A

  0%|          | 13/46528 [08:21<323:33:13, 25.04s/it][A[A

  0%|          | 14/46528 [08:28<255:37:19, 19.78s/it][A[A

  0%|          | 15/46528 [08:34<200:07:42, 15.49s/it][A[A

  0%|          | 16/4

AutoReconnect: connection closed