In [1]:
import numpy as np
import scipy.sparse as sp
import argparse
import os
import math
import logging

from multiprocessing import Pool as ProcessPool
from multiprocessing.util import Finalize
from functools import partial
from collections import Counter

from drqa import retriever
from drqa import tokenizers

In [2]:
# Shared module-level state, all unset until configured:
#   DOC2IDX     - mapping from doc id to its column index (read by count()).
#   PROCESS_TOK - tokenizer instance used by tokenize().
#   PROCESS_DB  - document DB handle used by fetch_text().
DOC2IDX = PROCESS_TOK = PROCESS_DB = None

def init(tokenizer_class, db_class, db_opts):
    """Create the tokenizer and DB handle for the current process.

    Used as the worker-pool initializer in get_count_matrix. The created
    objects are stored in the PROCESS_TOK / PROCESS_DB globals, and a
    multiprocessing finalizer is registered for each so that
    ``shutdown()`` / ``close()`` is invoked on cleanup.

    Args:
        tokenizer_class: callable taking no arguments; returns a tokenizer
            exposing ``shutdown()``.
        db_class: callable accepting ``**db_opts``; returns an object
            exposing ``close()``.
        db_opts: dict of keyword arguments forwarded to ``db_class``.
    """
    global PROCESS_TOK, PROCESS_DB

    # Tokenizer first: its finalizer is registered before the DB is opened.
    tok = tokenizer_class()
    PROCESS_TOK = tok
    Finalize(tok, tok.shutdown, exitpriority=100)

    db = db_class(**db_opts)
    PROCESS_DB = db
    Finalize(db, db.close, exitpriority=100)


def fetch_text(doc_id):
    """Return the text of `doc_id` as provided by the PROCESS_DB handle."""
    db = PROCESS_DB
    return db.get_doc_text(doc_id)


def tokenize(text):
    """Run `text` through the tokenizer held in PROCESS_TOK and return its result."""
    tokenizer = PROCESS_TOK
    return tokenizer.tokenize(text)

In [3]:
# Open the sqlite-backed document store and assign every document id a
# column index (its position in the list returned by get_doc_ids()).
# NOTE(review): the DB path is an absolute path on one machine; adjust it
# before running elsewhere.
db_class = retriever.get_class('sqlite')
doc_db = db_class("/Users/zhengjiexu/Desktop/DrQA/data/jay_test/test_docs.db")
doc_ids = doc_db.get_doc_ids()
DOC2IDX = dict((doc_id, idx) for idx, doc_id in enumerate(doc_ids))

In [4]:
# Display the doc id -> column index mapping built in the previous cell.
DOC2IDX

{'car': 0, 'cat': 1, 'dog': 2, 'robot': 3}

In [5]:
# Instantiate the 'spacy' tokenizer directly in this process so that
# tokenize() can be called from the notebook without a worker pool.
tok_class = tokenizers.get_class('spacy')
PROCESS_TOK = tok_class()

In [6]:
# Reuse the already-open document DB as the handle read by fetch_text().
PROCESS_DB = doc_db

In [7]:
def count(ngram, hash_size, doc_id):
    """Fetch one document's text and tally its hashed n-grams.

    The text is fetched, normalized and tokenized; n-grams are extracted
    (uncased, filtered by ``retriever.utils.filter_ngram``), hashed with
    ``retriever.utils.hash(gram, hash_size)`` and counted. Intermediate
    sizes are printed for inspection.

    Args:
        ngram: passed as ``n`` to ``tokens.ngrams``.
        hash_size: second argument to ``retriever.utils.hash``.
        doc_id: id of the document; must be a key of DOC2IDX.

    Returns:
        (row, col, data) lists of equal length in sparse-matrix triplet
        form: hashed n-gram, document index from DOC2IDX, and count.
    """
    # Fetch -> normalize -> tokenize.
    raw_text = fetch_text(doc_id)
    tokens = tokenize(retriever.utils.normalize(raw_text))

    # Extract n-grams from the tokens, with stopword/punctuation filtering.
    grams = tokens.ngrams(
        n=ngram,
        uncased=True,
        filter_fn=retriever.utils.filter_ngram,
    )
    print('----ngrams----')
    print(grams[:10])
    print('\n----number of grams before hash----')
    print(len(grams))

    # Hash each n-gram and count occurrences per hash value; distinct
    # n-grams that hash to the same value are merged here.
    hashed_counts = Counter(
        retriever.utils.hash(gram, hash_size) for gram in grams
    )
    print('\n----number of grams after hash----')
    print(len(hashed_counts))

    # Emit the triplets: every entry belongs to this document's column.
    doc_index = DOC2IDX[doc_id]
    rows = list(hashed_counts.keys())
    cols = [doc_index] * len(hashed_counts)
    values = list(hashed_counts.values())
    return rows, cols, values

In [8]:
# Try count() on a single document ('robot') with n=2 and 2**24 hash buckets.
row, col, data = count(ngram=2, hash_size=2**24, doc_id='robot')

----ngrams----
['robot', 'machine', 'especially', 'especially one', 'one', 'one programmable', 'programmable', 'computer—', 'computer— capable', 'capable']

----number of grams before hash----
17

----number of grams after hash----
17


In [13]:
# Print each triplet component on its own line. Separate statements (rather
# than a comma-joined expression) avoid building a tuple of print() return
# values, which the notebook would echo as a spurious (None, None, None).
print(row)
print(col)
print(data)

[2599104, 10120610, 16177419, 5477921, 9537228, 4992930, 14898010, 14564878, 14259886, 3938108, 5633020, 11336432, 2498602, 4731736, 8237383, 14811187, 4120384]
[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


(None, None, None)

In [15]:
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.

    Args:
        args: object providing ``tokenizer``, ``num_workers``, ``ngram`` and
            ``hash_size`` attributes.
        db: name passed to ``retriever.get_class`` to select the DB class.
        db_opts: dict of keyword arguments used to construct the DB class
            (here and in every worker via ``init``).

    Returns:
        ``(count_matrix, (DOC2IDX, doc_ids))`` where ``count_matrix`` is a
        CSR matrix of shape ``(args.hash_size, len(doc_ids))``.

    Side effects:
        Rebinds the module-level DOC2IDX.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    # NOTE(review): init() only sets PROCESS_TOK / PROCESS_DB, so workers see
    # DOC2IDX only if they inherit this process's globals (fork start
    # method) -- confirm before running on a spawn-based platform.
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(
        args.num_workers,
        initializer=init,
        initargs=(tok_class, db_class, db_opts)
    )

    try:
        # Compute the count matrix in steps (to keep in memory)
        logger.info('Mapping...')
        row, col, data = [], [], []
        step = max(len(doc_ids) // 10, 1)
        batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
        _count = partial(count, args.ngram, args.hash_size)
        for i, batch in enumerate(batches):
            logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
            for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
                row.extend(b_row)
                col.extend(b_col)
                data.extend(b_data)
        # Normal completion: let workers finish and run their finalizers.
        workers.close()
    except BaseException:
        # A failure (or interrupt) while mapping must not leave worker
        # processes running in the background.
        workers.terminate()
        raise
    finally:
        # Valid in both paths: close() or terminate() has been called.
        workers.join()

    logger.info('Creating sparse matrix...')
    count_matrix = sp.csr_matrix(
        (data, (row, col)), shape=(args.hash_size, len(doc_ids))
    )
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)


In [27]:
# Route INFO-and-above records from the root logger to a console handler.
# The handler is tagged with a name and reused if it is already attached,
# so re-running this cell does not stack duplicate handlers (which would
# print every log message multiple times).
logger = logging.getLogger()
logger.setLevel(logging.INFO)
fmt = logging.Formatter('%(asctime)s: [ %(message)s ]', '%m/%d/%Y %I:%M:%S %p')
console = next(
    (h for h in logger.handlers if h.get_name() == 'notebook-console'),
    None,
)
if console is None:
    console = logging.StreamHandler()
    console.set_name('notebook-console')
    logger.addHandler(console)
console.setFormatter(fmt)

In [22]:
from collections import namedtuple

# Lightweight container for the options that get_count_matrix reads from
# `args`, plus the path of the document DB.
Args = namedtuple('Args', 'tokenizer num_workers ngram hash_size db_path')

In [23]:
# Settings for this run: spacy tokenizer, a single worker, n=2 n-grams,
# 2**24 hash buckets.
# NOTE(review): db_path is an absolute path on one machine; adjust before
# running elsewhere.
args = Args(
    tokenizer='spacy',
    num_workers=1,
    ngram=2,
    hash_size=2**24,
    db_path="/Users/zhengjiexu/Desktop/DrQA/data/jay_test/test_docs.db",
)

In [25]:
# Display every configured option as a plain tuple.
args.tokenizer, args.num_workers, args.ngram, args.hash_size, args.db_path

('spacy',
 1,
 2,
 16777216,
 '/Users/zhengjiexu/Desktop/DrQA/data/jay_test/test_docs.db')

In [28]:
# Build the hashed n-gram count matrix over every document in the sqlite DB;
# doc_dict is the (DOC2IDX, doc_ids) pair returned alongside it.
count_matrix, doc_dict = get_count_matrix(
    args,
    db='sqlite',
    db_opts=dict(db_path=args.db_path),
)

03/11/2019 04:28:53 PM: [ Mapping... ]
03/11/2019 04:28:53 PM: [ -------------------------Batch 1/4------------------------- ]


----ngrams----
['car', 'automobile', 'wheeled', 'wheeled motor', 'motor', 'motor vehicle', 'vehicle', 'vehicle used', 'used', 'transportation']

----number of grams before hash----
10

----number of grams after hash----
10


03/11/2019 04:28:55 PM: [ -------------------------Batch 2/4------------------------- ]


----ngrams----
['cat', 'felis', 'felis catus', 'catus', 'small', 'small carnivorous', 'carnivorous', 'carnivorous mammal.[1][2', 'mammal.[1][2', 'domesticated']

----number of grams before hash----
26

----number of grams after hash----
24


03/11/2019 04:28:55 PM: [ -------------------------Batch 3/4------------------------- ]


----ngrams----
['domestic', 'domestic dog', 'dog', 'canis', 'canis lupus', 'lupus', 'lupus familiaris', 'familiaris', 'considered', 'subspecies']

----number of grams before hash----
37

----number of grams after hash----
32


03/11/2019 04:28:55 PM: [ -------------------------Batch 4/4------------------------- ]


----ngrams----
['robot', 'machine', 'especially', 'especially one', 'one', 'one programmable', 'programmable', 'computer—', 'computer— capable', 'capable']

----number of grams before hash----
17

----number of grams after hash----
17


03/11/2019 04:28:55 PM: [ Creating sparse matrix... ]


In [31]:
# Shape is (hash_size, number of documents).
count_matrix.shape

(16777216, 4)

In [32]:
# Display the (DOC2IDX, doc_ids) pair returned by get_count_matrix.
doc_dict

({'car': 0, 'cat': 1, 'dog': 2, 'robot': 3}, ['car', 'cat', 'dog', 'robot'])

In [40]:
def get_tfidf_matrix(cnts):
    """Convert the word count matrix into tfidf one.

    tfidf = log(tf + 1) * log((N - Nt + 0.5) / (Nt + 0.5))
    * tf = term frequency in document
    * N = number of documents
    * Nt = number of documents the term occurs in

    Negative idf values are clipped to 0. Intermediate shapes are printed
    for inspection.

    Args:
        cnts: sparse count matrix indexed [term, document].

    Returns:
        Sparse matrix with the same shape as `cnts` holding tfidf weights.
    """
    # Report the shape of the argument itself, not of a notebook global.
    print('--count_matrix shape--:', cnts.shape)

    # Per-term document frequency (Nt), one entry per row of `cnts`.
    Ns = get_doc_freqs(cnts)
    print('--Ns shape--:', Ns.shape)

    idfs = np.log((cnts.shape[1] - Ns + 0.5) / (Ns + 0.5))
    print('--idfs shape--:', idfs.shape)
    # Terms present in more than half of the documents get a negative idf;
    # clip those to zero.
    idfs[idfs < 0] = 0

    # Diagonal matrix so that the product scales row i of tfs by idfs[i].
    idfs = sp.diags(idfs, 0)
    print('--diag shape--:', idfs.shape)

    tfs = cnts.log1p()
    print('--tfs shape--:', tfs.shape)

    tfidfs = idfs.dot(tfs)
    print('--tfidfs--:', tfidfs.shape)
    return tfidfs

In [41]:
def get_doc_freqs(cnts):
    """Return word --> # of docs it appears in.

    Args:
        cnts: sparse count matrix indexed [term, document].

    Returns:
        1-D integer array with one entry per row of `cnts`: the number of
        columns in which that row has a positive count.
    """
    binary = (cnts > 0).astype(int)
    # ravel() rather than squeeze(): squeeze would collapse the result for a
    # single-row matrix to a 0-d array, whereas callers expect a 1-D vector.
    freqs = np.asarray(binary.sum(1)).ravel()
    return freqs

In [42]:
# Convert the raw counts into tfidf weights (intermediate shapes are printed).
tfidf = get_tfidf_matrix(count_matrix)

--count_matrix shape--: (16777216, 4)
--Ns shape--: (16777216,)
--idfs shape--: (16777216,)
--diag shape--: (16777216, 16777216)
--tfs shape--: (16777216, 4)
--tfidfs--: (16777216, 4)


In [45]:
# The tfidf matrix keeps the (hash_size, number of documents) shape.
tfidf.shape

(16777216, 4)

In [43]:
# Per-term document frequencies for the count matrix.
doc_freqs = get_doc_freqs(count_matrix)

In [44]:
# One document-frequency entry per hash bucket.
doc_freqs.shape

(16777216,)