In [1]:
!pip install pymongo



In [2]:
from __future__ import print_function

import string
import re
import sys

from IPython.display import display, clear_output

import numpy as np
import pandas as pd

from os import getcwd
from os.path import join, basename

from utils.literature import DataLoader
from glob import glob

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer #TODO: Try lemmatizer instead of stemmer

import pymongo
from pymongo import MongoClient as DBClient

nltk.download()

## Specification of the document paths

- download from: [Kaggle](https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge)
- extract folder content to `{ProjectDir}/dataset`

In [3]:
cwd = getcwd()

json_paths = [
    join(cwd, 'dataset', 'arxiv', 'arxiv', 'pdf_json'),
    join(cwd, 'dataset', 'arxiv', 'arxiv', 'pdf_json'),
    join(cwd, 'dataset', 'comm_use_subset', 'comm_use_subset', 'pdf_json'),
    join(cwd, 'dataset', 'noncomm_use_subset', 'noncomm_use_subset', 'pdf_json'),
    join(cwd, 'dataset', 'custom_license', 'custom_license', 'pdf_json'),
    join(cwd, 'dataset' 'biorxiv_medrxiv', 'biorxiv_medrxiv', 'pdf_json'),
]
string.punctuation
files = []
[files.extend(glob(join(path, '*.json'))) for path in json_paths];

## Loading and updating the metadata information

In [4]:
metadata = pd.read_csv(join(getcwd(), 'dataset', 'metadata.csv'))

In [5]:
metadata['doc_id'] = np.arange(len(metadata))

## Connecting to the database

In [6]:
client = DBClient('localhost', 27017, w=0)
db = client['covid_19']
document_index_collection = db['document_index']
reversed_index_collection = db['reversed_index']

## Building the document index

Each document gets stored in the document_index collection with the following layout: 
```json
{
    _id: ..., 
    document_title: ...
}
```

In [7]:
doc_idxs = dict(zip(metadata['doc_id'], metadata['title']))

In [8]:
for _id, document_title in doc_idxs.items():
    doc_id = document_index_collection.find({'_id': _id}).limit(1)
    if doc_id.count() == 0:
        document_index_collection.insert_one({'_id': _id, 'document_title': document_title})

## Building the reversed index

```json
{
    'stem': ...,
    'doc_ids': [
        {
            {'doc_id': ..., 'count': ...},
            {'doc_id': ..., 'count': ...},
            {'doc_id': ..., 'count': ...},
            {'doc_id': ..., 'count': ...},
        }
    ]
}
```

TODO: Test
pattern = r'''(?x)          # set flag to allow verbose regexps
        (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
      | \.\.\.              # ellipsis
      | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    '''

In [9]:
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")
tokenizer = RegexpTokenizer('\w+')

In [10]:
def get_metadata(metadata, fpath):
    sha = basename(fpath)[:-5]
    return sha, metadata.loc[metadata['sha'] == sha]

In [11]:
def get_full_text(fpath):
    dl = DataLoader(fpath)
    return dl.get_full_text()

In [12]:
def create_post(stem, doc_id):
    return {'_id': stem, 'doc_ids': [{'doc_id': doc_id, 'count': 1}]}

In [26]:
def process_file(args):
    fpath, doc_id = args
    doc_id = int(doc_id)
    #preprocessing
    text = get_full_text(fpath)
    text = text.strip()
    tokens = tokenizer.tokenize(text)
    filtered_tokens = [w for w in tokens if not w in stop_words] 
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

    for stem in stemmed_tokens:
        stem_entry = reversed_index_collection.find({'_id': stem}).limit(1)
        if stem_entry.count() > 0:
            #try to find current document id in stem entry
            doc_id_object = reversed_index_collection.find(
                {'_id': stem, 
                 'doc_ids': {'$elemMatch': {'doc_id': doc_id}}}).limit(1)
            if doc_id_object.count() > 0:
                # update occurrence of stem in document
                reversed_index_collection.update(
                    {'_id': stem, 
                     'doc_ids': {'$elemMatch': {'doc_id': doc_id}}},
                    {'$inc': {'doc_ids.$.count': 1}})
            else:
                 # add document id
                 reversed_index_collection.update(
                    {'_id': stem},
                    {'$push': {'doc_ids': {'doc_id': doc_id, 'count': 1}}})
        else:
            post = create_post(stem, doc_id)
            reversed_index_collection.insert_one(post)

In [40]:
from multiprocessing import Pool
import tqdm
pool = Pool(processes=8)
args = list(zip(files, np.arange(len(files))))

for _ in tqdm.tqdm(pool.imap_unordered(process_file, args), total=len(files)):
    pass

pool.close()








  0%|          | 0/46528 [00:00<?, ?it/s][A[A[A[A[A[A[A

InvalidDocument: cannot encode object: 7, of type: <class 'numpy.int64'>

In [41]:
print(args[0])

('/home/tobias/Desktop/covid19-search/dataset/arxiv/arxiv/pdf_json/f5a816cbca04dc4caa8f0f73e37ac7387d67b402.json', 0)
