In [1]:
import numpy as np
import scipy.sparse as sp
import argparse
import os
import math
import logging

from multiprocessing import Pool as ProcessPool
from multiprocessing.util import Finalize
from functools import partial
from collections import Counter

from drqa import retriever
from drqa import tokenizers

In [2]:
# Handles read by fetch_text() and tokenize(); they start unset and are filled
# in either by init() or by direct assignment in later cells.
PROCESS_DB = None
PROCESS_TOK = None
# Document-id -> integer-index lookup read by count(); populated in a later cell.
DOC2IDX = None

def init(tokenizer_class, db_class, db_opts):
    """Create the tokenizer and document DB handles for the current process.

    ``tokenizer_class`` is instantiated with no arguments and ``db_class`` with
    ``db_opts`` expanded as keyword arguments. Both instances are stored in the
    module-level ``PROCESS_TOK`` / ``PROCESS_DB`` globals, and a ``Finalize``
    callback (exit priority 100) is registered for each: ``shutdown`` for the
    tokenizer and ``close`` for the database.
    """
    global PROCESS_TOK, PROCESS_DB

    # Tokenizer first: it is stored and registered before the DB is created.
    PROCESS_TOK = tokenizer = tokenizer_class()
    Finalize(tokenizer, tokenizer.shutdown, exitpriority=100)

    PROCESS_DB = database = db_class(**db_opts)
    Finalize(database, database.close, exitpriority=100)


def fetch_text(doc_id):
    """Return whatever ``PROCESS_DB.get_doc_text`` yields for ``doc_id``."""
    # The global is only read here, so no ``global`` declaration is required.
    document_db = PROCESS_DB
    return document_db.get_doc_text(doc_id)


def tokenize(text):
    """Return the result of ``PROCESS_TOK.tokenize`` applied to ``text``."""
    # The global is only read here, so no ``global`` declaration is required.
    tokenizer = PROCESS_TOK
    return tokenizer.tokenize(text)

In [3]:
# Path of the document database. Set the DRQA_DOC_DB environment variable to
# point at a different file; otherwise the original local path is used.
DOC_DB_PATH = os.environ.get(
    'DRQA_DOC_DB',
    '/Users/zhengjiexu/Desktop/DrQA/data/wikipedia/docs.db',
)

db_class = retriever.get_class('sqlite')
doc_db = db_class(DOC_DB_PATH)
doc_ids = doc_db.get_doc_ids()
# Map each document id to its position in doc_ids; count() reads this mapping
# to fill the `col` entries it returns.
DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

In [None]:
def get_class(name):
    """Return the tokenizer class registered under ``name``.

    Recognised names are ``'spacy'``, ``'corenlp'``, ``'regexp'`` and
    ``'simple'``. Only the class for the matching name is looked up, so the
    other class names do not need to be resolvable for a given call.

    NOTE(review): none of the tokenizer class names are defined in the visible
    part of this notebook; they must be in scope before this is called.

    Raises:
        RuntimeError: if ``name`` is not one of the recognised names.
    """
    if name == 'spacy':
        tokenizer_cls = SpacyTokenizer
    elif name == 'corenlp':
        tokenizer_cls = CoreNLPTokenizer
    elif name == 'regexp':
        tokenizer_cls = RegexpTokenizer
    elif name == 'simple':
        tokenizer_cls = SimpleTokenizer
    else:
        raise RuntimeError('Invalid tokenizer: %s' % name)
    return tokenizer_cls

In [4]:
# Look up the 'simple' tokenizer class and instantiate it directly in the
# notebook process; this is the handle that tokenize() reads.
tok_class = tokenizers.get_class('simple') 
PROCESS_TOK = tok_class()

In [5]:
# Reuse the already-created document DB as the handle that fetch_text() reads.
PROCESS_DB = doc_db

In [20]:
from sklearn.utils import murmurhash3_32
def hash(token, num_buckets):
    """Map ``token`` to a bucket index via an unsigned 32-bit murmurhash3.

    The digest is computed with ``murmurhash3_32(token, positive=True)`` and
    then reduced modulo ``num_buckets``.

    NOTE(review): defining this function rebinds the name ``hash``, hiding the
    built-in ``hash()`` from later cells in the same session.
    """
    unsigned_digest = murmurhash3_32(token, positive=True)
    return unsigned_digest % num_buckets

In [16]:
def count(ngram, hash_size, doc_id, verbose=True):
    """Fetch the text of a document and compute hashed ngram counts.

    Args:
        ngram: forwarded as ``n`` to ``tokens.ngrams``.
        hash_size: number of buckets forwarded to ``retriever.utils.hash``.
        doc_id: id of the document to process; must be a key of ``DOC2IDX``.
        verbose: when True (the default, matching the original behaviour),
            print the first ten ngrams and the number of ngrams before and
            after hashing. Pass False to run silently, e.g. when mapping this
            function over many documents.

    Returns:
        A ``(row, col, data)`` tuple of equal-length lists: ``row`` holds the
        distinct hashed ngram values, ``col`` repeats ``DOC2IDX[doc_id]`` once
        per entry, and ``data`` holds the occurrence count of the hash at the
        same position in ``row``.

    Raises:
        KeyError: if ``doc_id`` is not present in ``DOC2IDX``.
    """
    # Fetch, normalize and tokenize the document text.
    tokens = tokenize(retriever.utils.normalize(fetch_text(doc_id)))

    # Build lower-cased ngrams, filtered through retriever.utils.filter_ngram.
    ngrams = tokens.ngrams(
        n=ngram, uncased=True, filter_fn=retriever.utils.filter_ngram
    )
    if verbose:
        print('----ngrams----')
        print(ngrams[:10])
        print('\n----number of grams before hash----')
        print(len(ngrams))

    # Hash ngrams and count occurrences. Distinct ngrams that land in the same
    # bucket are merged, so len(counts) can be smaller than len(ngrams).
    counts = Counter(retriever.utils.hash(gram, hash_size) for gram in ngrams)
    if verbose:
        print('\n----number of grams after hash----')
        print(len(counts))

    # Return in sparse matrix data format: keys() and values() of the same
    # Counter iterate in matching order, so row[i] pairs with data[i].
    row = list(counts.keys())
    col = [DOC2IDX[doc_id]] * len(row)
    data = list(counts.values())
    return row, col, data

In [17]:
# Run count() on a single article with ngram=2 and 2**24 hash buckets; the
# function prints the diagnostics shown below and returns the triplet lists.
row, col, data = count(ngram=2, hash_size=2**24, doc_id='Black Forest')

----ngrams----
['black', 'black forest', 'forest', 'black', 'black forest', 'forest', 'large', 'large forested', 'forested', 'forested mountain']

----number of grams before hash----
6058

----number of grams after hash----
3198


In [21]:
# Hash one bigram with the notebook-local hash() using the same bucket count
# that was passed to count().
hash('black forest', 2**24)

13459119

In [22]:
# 13459119 is the bucket computed for 'black forest' in the previous cell;
# check that count() produced the same bucket.
13459119 in row

True

In [24]:
# row: the distinct hashed ngram values (keys of the Counter built in count())
row[:5]

[16396370, 13459119, 15897870, 12058803, 4973702]

In [25]:
# col: DOC2IDX index of the document, repeated once per entry in row
col[:5]

[736352, 736352, 736352, 736352, 736352]

In [27]:
# data: occurrence count of the hashed ngram at the same position in row
data[:5]

[170, 168, 186, 11, 1]