diff --git a/src/zope/index/text/okapiindex.py b/src/zope/index/text/okapiindex.py index de363ae..6e4c57b 100644 --- a/src/zope/index/text/okapiindex.py +++ b/src/zope/index/text/okapiindex.py @@ -191,21 +191,20 @@ """ import os import platform +from BTrees.Length import Length from zope.index.text.baseindex import BaseIndex from zope.index.text.baseindex import inverse_doc_frequency +_py_impl = getattr(platform, 'python_implementation', lambda: None) +_is_pypy = _py_impl() == 'PyPy' +PURE_PYTHON = os.environ.get('PURE_PYTHON') or _is_pypy try: from zope.index.text.okascore import score -except ImportError: #pragma NO COVERAGE +except ImportError: # pragma: no cover score = None -from BTrees.Length import Length +score = None if PURE_PYTHON else score -_py_impl = getattr(platform, 'python_implementation', lambda: None) -_is_pypy = _py_impl() == 'PyPy' -PURE_PYTHON = os.environ.get('PURE_PYTHON') or _is_pypy -if PURE_PYTHON: - score = None PY2 = str is bytes @@ -216,7 +215,7 @@ class OkapiIndex(BaseIndex): # BM25 free parameters. K1 = 1.2 - B = 0.75 + B = 0.75 assert K1 >= 0.0 assert 0.0 <= B <= 1.0 @@ -268,87 +267,88 @@ def _change_doc_len(self, delta): # D to TF(D,t)*IDF(t) directly, where the product is computed as a float. # NOTE: This may be overridden below, by a function that computes the # same thing but with the inner scoring loop in C. - if score is None: - def _search_wids(self, wids): - if not wids: - return [] - N = float(self.documentCount()) # total # of docs - try: - doclen = self._totaldoclen() - except TypeError: - # _totaldoclen has not yet been upgraded - doclen = self._totaldoclen - meandoclen = doclen / N - K1 = self.K1 - B = self.B - K1_plus1 = K1 + 1.0 - B_from1 = 1.0 - B - - # f(D, t) * (k1 + 1) - # TF(D, t) = ------------------------------------------- - # f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D))) - - L = [] - docid2len = self._docweight - for t in wids: - d2f = self._wordinfo[t] # map {docid -> f(docid, t)} - idf = inverse_doc_frequency(len(d2f), N) # an unscaled float - result = self.family.IF.Bucket() - for docid, f in d2f.items(): - lenweight = B_from1 + B * docid2len[docid] / meandoclen - tf = f * K1_plus1 / (f + K1 * lenweight) - result[docid] = tf * idf - L.append((result, 1)) - return L - - # Note about the above: the result is tf * idf. tf is - # small -- it can't be larger than k1+1 = 2.2. idf is - # formally unbounded, but is less than 14 for a term that - # appears in only 1 of a million documents. So the - # product is probably less than 32, or 5 bits before the - # radix point. If we did the scaled-int business on both - # of them, we'd be up to 25 bits. Add 64 of those and - # we'd be in overflow territory. That's pretty unlikely, - # so we *could* just store scaled_int(tf) in - # result[docid], and use scaled_int(idf) as an invariant - # weight across the whole result. But besides skating - # near the edge, it's not a speed cure, since the - # computation of tf would still be done at Python speed, - # and it's a lot more work than just multiplying by idf. - else: - # The same function as _search_wids above, but with the inner scoring - # loop written in C (module okascore, function score()). - # Cautions: okascore hardcodes the values of K, B1, and the scaled_int - # function. - def _search_wids(self, wids): - if not wids: - return [] - N = float(self.documentCount()) # total # of docs - try: - doclen = self._totaldoclen() - except TypeError: - # _totaldoclen has not yet been upgraded - doclen = self._totaldoclen - meandoclen = doclen / N - #K1 = self.K1 - #B = self.B - #K1_plus1 = K1 + 1.0 - #B_from1 = 1.0 - B - - # f(D, t) * (k1 + 1) - # TF(D, t) = ------------------------------------------- - # f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D))) - - L = [] - docid2len = self._docweight - for t in wids: - d2f = self._wordinfo[t] # map {docid -> f(docid, t)} - idf = inverse_doc_frequency(len(d2f), N) # an unscaled float - result = self.family.IF.Bucket() - items = d2f.items() if PY2 else list(d2f.items()) - score(result, items, docid2len, idf, meandoclen) - L.append((result, 1)) - return L + def _python_search_wids(self, wids): + if not wids: + return [] + N = float(self.documentCount()) # total # of docs + try: + doclen = self._totaldoclen() + except TypeError: + # _totaldoclen has not yet been upgraded + doclen = self._totaldoclen + meandoclen = doclen / N + K1 = self.K1 + B = self.B + K1_plus1 = K1 + 1.0 + B_from1 = 1.0 - B + + # f(D, t) * (k1 + 1) + # TF(D, t) = ------------------------------------------- + # f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D))) + + L = [] + docid2len = self._docweight + for t in wids: + d2f = self._wordinfo[t] # map {docid -> f(docid, t)} + idf = inverse_doc_frequency(len(d2f), N) # an unscaled float + result = self.family.IF.Bucket() + for docid, f in d2f.items(): + lenweight = B_from1 + B * docid2len[docid] / meandoclen + tf = f * K1_plus1 / (f + K1 * lenweight) + result[docid] = tf * idf + L.append((result, 1)) + return L + + # Note about the above: the result is tf * idf. tf is + # small -- it can't be larger than k1+1 = 2.2. idf is + # formally unbounded, but is less than 14 for a term that + # appears in only 1 of a million documents. So the + # product is probably less than 32, or 5 bits before the + # radix point. If we did the scaled-int business on both + # of them, we'd be up to 25 bits. Add 64 of those and + # we'd be in overflow territory. That's pretty unlikely, + # so we *could* just store scaled_int(tf) in + # result[docid], and use scaled_int(idf) as an invariant + # weight across the whole result. But besides skating + # near the edge, it's not a speed cure, since the + # computation of tf would still be done at Python speed, + # and it's a lot more work than just multiplying by idf. + + # The same function as _search_wids above, but with the inner scoring + # loop written in C (module okascore, function score()). + # Cautions: okascore hardcodes the values of K, B1, and the scaled_int + # function. + def _c_search_wids(self, wids): + if not wids: + return [] + N = float(self.documentCount()) # total # of docs + try: + doclen = self._totaldoclen() + except TypeError: + # _totaldoclen has not yet been upgraded + doclen = self._totaldoclen + meandoclen = doclen / N + #K1 = self.K1 + #B = self.B + #K1_plus1 = K1 + 1.0 + #B_from1 = 1.0 - B + + # f(D, t) * (k1 + 1) + # TF(D, t) = ------------------------------------------- + # f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D))) + + L = [] + docid2len = self._docweight + for t in wids: + d2f = self._wordinfo[t] # map {docid -> f(docid, t)} + idf = inverse_doc_frequency(len(d2f), N) # an unscaled float + result = self.family.IF.Bucket() + items = d2f.items() if PY2 else list(d2f.items()) + score(result, items, docid2len, idf, meandoclen) + L.append((result, 1)) + return L + + _search_wids = _python_search_wids if score is None else _c_search_wids def query_weight(self, terms): # Get the wids. diff --git a/src/zope/index/text/tests/test_okapiindex.py b/src/zope/index/text/tests/test_okapiindex.py index afceda8..87a2b28 100644 --- a/src/zope/index/text/tests/test_okapiindex.py +++ b/src/zope/index/text/tests/test_okapiindex.py @@ -17,6 +17,8 @@ # pylint:disable=protected-access +from zope.index.text.okapiindex import PURE_PYTHON + class OkapiIndexTestMixin(object): def _getBTreesFamily(self): @@ -127,6 +129,10 @@ def test_query_weight_empty_wids(self): index.index_doc(1, 'one two three') self.assertEqual(index.query_weight(()), 0.0) + def test__search_wids_empty_wids(self): + index = self._makeOne() + self.assertEqual([], index._search_wids(())) + def test_query_weight_oov_wids(self): index = self._makeOne() index.index_doc(1, 'one two three') @@ -142,6 +148,12 @@ def test_query_weight_hit_multiple_occurences(self): index.index_doc(1, 'one one two three one') self.assertGreater(index.query_weight(['one']), 0.0) +class OkapiIndexPurePythonTestMixin(OkapiIndexTestMixin): + + def _makeOne(self): + index = super(OkapiIndexPurePythonTestMixin, self)._makeOne() + index._search_wids = index._python_search_wids + return index class OkapiIndexTest32(OkapiIndexTestMixin, unittest.TestCase): @@ -155,9 +167,18 @@ def _getBTreesFamily(self): import BTrees return BTrees.family64 +@unittest.skipIf(PURE_PYTHON, "Already tested") +class OkapiIndexPurePythonTest32(OkapiIndexPurePythonTestMixin, OkapiIndexTest32): + pass + +@unittest.skipIf(PURE_PYTHON, "Already tested") +class OkapiIndexPurePythonTest64(OkapiIndexPurePythonTestMixin, OkapiIndexTest64): + pass + + class TestScore(unittest.TestCase): def test_score_extension(self): - from zope.index.text.okapiindex import PURE_PYTHON, score + from zope.index.text.okapiindex import score assert_score = self.assertIsNone if PURE_PYTHON else self.assertIsNotNone assert_score(score)