100% coverage for okapiindex.py on CPython. Expose and test both the pure-python and C implementations.
zopefoundation · Nov 2, 2017 · 56907aa · 56907aa
1 parent 866b0f7
commit 56907aa
Show file tree

Hide file tree

Showing 2 changed files with 111 additions and 90 deletions.
diff --git a/src/zope/index/text/okapiindex.py b/src/zope/index/text/okapiindex.py
@@ -191,21 +191,20 @@
 """
 import os
 import platform
+from BTrees.Length import Length
 
 from zope.index.text.baseindex import BaseIndex
 from zope.index.text.baseindex import inverse_doc_frequency
+_py_impl = getattr(platform, 'python_implementation', lambda: None)
+_is_pypy = _py_impl() == 'PyPy'
+PURE_PYTHON = os.environ.get('PURE_PYTHON') or _is_pypy
 try:
     from zope.index.text.okascore import score
-except ImportError: #pragma NO COVERAGE
+except ImportError: # pragma: no cover
     score = None
-from BTrees.Length import Length
 
+score = None if PURE_PYTHON else score
 
-_py_impl = getattr(platform, 'python_implementation', lambda: None)
-_is_pypy = _py_impl() == 'PyPy'
-PURE_PYTHON = os.environ.get('PURE_PYTHON') or _is_pypy
-if PURE_PYTHON:
-    score = None
 
 PY2 = str is bytes
 
@@ -216,7 +215,7 @@ class OkapiIndex(BaseIndex):
 
     # BM25 free parameters.
     K1 = 1.2
-    B  = 0.75
+    B = 0.75
     assert K1 >= 0.0
     assert 0.0 <= B <= 1.0
 
@@ -268,87 +267,88 @@ def _change_doc_len(self, delta):
     # D to TF(D,t)*IDF(t) directly, where the product is computed as a float.
     # NOTE:  This may be overridden below, by a function that computes the
     # same thing but with the inner scoring loop in C.
-    if score is None:
-        def _search_wids(self, wids):
-            if not wids:
-                return []
-            N = float(self.documentCount())  # total # of docs
-            try:
-                doclen = self._totaldoclen()
-            except TypeError:
-                # _totaldoclen has not yet been upgraded
-                doclen = self._totaldoclen
-            meandoclen = doclen / N
-            K1 = self.K1
-            B = self.B
-            K1_plus1 = K1 + 1.0
-            B_from1 = 1.0 - B
-
-            #                           f(D, t) * (k1 + 1)
-            #   TF(D, t) =  -------------------------------------------
-            #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
-
-            L = []
-            docid2len = self._docweight
-            for t in wids:
-                d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
-                idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
-                result = self.family.IF.Bucket()
-                for docid, f in d2f.items():
-                    lenweight = B_from1 + B * docid2len[docid] / meandoclen
-                    tf = f * K1_plus1 / (f + K1 * lenweight)
-                    result[docid] = tf * idf
-                L.append((result, 1))
-            return L
-
-            # Note about the above: the result is tf * idf.  tf is
-            # small -- it can't be larger than k1+1 = 2.2.  idf is
-            # formally unbounded, but is less than 14 for a term that
-            # appears in only 1 of a million documents.  So the
-            # product is probably less than 32, or 5 bits before the
-            # radix point.  If we did the scaled-int business on both
-            # of them, we'd be up to 25 bits.  Add 64 of those and
-            # we'd be in overflow territory.  That's pretty unlikely,
-            # so we *could* just store scaled_int(tf) in
-            # result[docid], and use scaled_int(idf) as an invariant
-            # weight across the whole result.  But besides skating
-            # near the edge, it's not a speed cure, since the
-            # computation of tf would still be done at Python speed,
-            # and it's a lot more work than just multiplying by idf.
-    else:
-        # The same function as _search_wids above, but with the inner scoring
-        # loop written in C (module okascore, function score()).
-        # Cautions:  okascore hardcodes the values of K, B1, and the scaled_int
-        # function.
-        def _search_wids(self, wids):
-            if not wids:
-                return []
-            N = float(self.documentCount())  # total # of docs
-            try:
-                doclen = self._totaldoclen()
-            except TypeError:
-                # _totaldoclen has not yet been upgraded
-                doclen = self._totaldoclen
-            meandoclen = doclen / N
-            #K1 = self.K1
-            #B = self.B
-            #K1_plus1 = K1 + 1.0
-            #B_from1 = 1.0 - B
-
-            #                           f(D, t) * (k1 + 1)
-            #   TF(D, t) =  -------------------------------------------
-            #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
-
-            L = []
-            docid2len = self._docweight
-            for t in wids:
-                d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
-                idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
-                result = self.family.IF.Bucket()
-                items = d2f.items() if PY2 else list(d2f.items())
-                score(result, items, docid2len, idf, meandoclen)
-                L.append((result, 1))
-            return L
+    def _python_search_wids(self, wids):
+        if not wids:
+            return []
+        N = float(self.documentCount())  # total # of docs
+        try:
+            doclen = self._totaldoclen()
+        except TypeError:
+            # _totaldoclen has not yet been upgraded
+            doclen = self._totaldoclen
+        meandoclen = doclen / N
+        K1 = self.K1
+        B = self.B
+        K1_plus1 = K1 + 1.0
+        B_from1 = 1.0 - B
+
+        #                           f(D, t) * (k1 + 1)
+        #   TF(D, t) =  -------------------------------------------
+        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
+
+        L = []
+        docid2len = self._docweight
+        for t in wids:
+            d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
+            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
+            result = self.family.IF.Bucket()
+            for docid, f in d2f.items():
+                lenweight = B_from1 + B * docid2len[docid] / meandoclen
+                tf = f * K1_plus1 / (f + K1 * lenweight)
+                result[docid] = tf * idf
+            L.append((result, 1))
+        return L
+
+        # Note about the above: the result is tf * idf.  tf is
+        # small -- it can't be larger than k1+1 = 2.2.  idf is
+        # formally unbounded, but is less than 14 for a term that
+        # appears in only 1 of a million documents.  So the
+        # product is probably less than 32, or 5 bits before the
+        # radix point.  If we did the scaled-int business on both
+        # of them, we'd be up to 25 bits.  Add 64 of those and
+        # we'd be in overflow territory.  That's pretty unlikely,
+        # so we *could* just store scaled_int(tf) in
+        # result[docid], and use scaled_int(idf) as an invariant
+        # weight across the whole result.  But besides skating
+        # near the edge, it's not a speed cure, since the
+        # computation of tf would still be done at Python speed,
+        # and it's a lot more work than just multiplying by idf.
+
+    # The same function as _python_search_wids above, but with the inner
+    # scoring loop written in C (module okascore, function score()).
+    # Caution:  okascore hardcodes the values of K1, B, and the scaled_int
+    # function.
+    def _c_search_wids(self, wids):
+        if not wids:
+            return []
+        N = float(self.documentCount())  # total # of docs
+        try:
+            doclen = self._totaldoclen()
+        except TypeError:
+            # _totaldoclen has not yet been upgraded
+            doclen = self._totaldoclen
+        meandoclen = doclen / N
+        #K1 = self.K1
+        #B = self.B
+        #K1_plus1 = K1 + 1.0
+        #B_from1 = 1.0 - B
+
+        #                           f(D, t) * (k1 + 1)
+        #   TF(D, t) =  -------------------------------------------
+        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
+
+        L = []
+        docid2len = self._docweight
+        for t in wids:
+            d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
+            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
+            result = self.family.IF.Bucket()
+            items = d2f.items() if PY2 else list(d2f.items())
+            score(result, items, docid2len, idf, meandoclen)
+            L.append((result, 1))
+        return L
+
+    _search_wids = _python_search_wids if score is None else _c_search_wids
 
     def query_weight(self, terms):
         # Get the wids.

diff --git a/src/zope/index/text/tests/test_okapiindex.py b/src/zope/index/text/tests/test_okapiindex.py
@@ -17,6 +17,8 @@
 
 # pylint:disable=protected-access
 
+from zope.index.text.okapiindex import PURE_PYTHON
+
 class OkapiIndexTestMixin(object):
 
     def _getBTreesFamily(self):
@@ -127,6 +129,10 @@ def test_query_weight_empty_wids(self):
         index.index_doc(1, 'one two three')
         self.assertEqual(index.query_weight(()), 0.0)
 
+    def test__search_wids_empty_wids(self):
+        index = self._makeOne()
+        self.assertEqual([], index._search_wids(()))
+
     def test_query_weight_oov_wids(self):
         index = self._makeOne()
         index.index_doc(1, 'one two three')
@@ -142,6 +148,12 @@ def test_query_weight_hit_multiple_occurences(self):
         index.index_doc(1, 'one one two three one')
         self.assertGreater(index.query_weight(['one']), 0.0)
 
+class OkapiIndexPurePythonTestMixin(OkapiIndexTestMixin):
+
+    def _makeOne(self):
+        index = super(OkapiIndexPurePythonTestMixin, self)._makeOne()
+        index._search_wids = index._python_search_wids
+        return index
 
 class OkapiIndexTest32(OkapiIndexTestMixin, unittest.TestCase):
 
@@ -155,9 +167,18 @@ def _getBTreesFamily(self):
         import BTrees
         return BTrees.family64
 
+@unittest.skipIf(PURE_PYTHON, "Already tested")
+class OkapiIndexPurePythonTest32(OkapiIndexPurePythonTestMixin, OkapiIndexTest32):
+    pass
+
+@unittest.skipIf(PURE_PYTHON, "Already tested")
+class OkapiIndexPurePythonTest64(OkapiIndexPurePythonTestMixin, OkapiIndexTest64):
+    pass
+
+
 class TestScore(unittest.TestCase):
 
     def test_score_extension(self):
-        from zope.index.text.okapiindex import PURE_PYTHON, score
+        from zope.index.text.okapiindex import score
         assert_score = self.assertIsNone if PURE_PYTHON else self.assertIsNotNone
         assert_score(score)