Skip to content

Commit

Permalink
100% for okapiindex.py on CPython. Expose and test both the pure-pyth…
Browse files Browse the repository at this point in the history
…on and C implementations.
  • Loading branch information
jamadden committed Nov 2, 2017
1 parent 866b0f7 commit 56907aa
Show file tree
Hide file tree
Showing 2 changed files with 111 additions and 90 deletions.
178 changes: 89 additions & 89 deletions src/zope/index/text/okapiindex.py
Expand Up @@ -191,21 +191,20 @@
"""
import os
import platform
from BTrees.Length import Length

from zope.index.text.baseindex import BaseIndex
from zope.index.text.baseindex import inverse_doc_frequency
_py_impl = getattr(platform, 'python_implementation', lambda: None)
_is_pypy = _py_impl() == 'PyPy'
PURE_PYTHON = os.environ.get('PURE_PYTHON') or _is_pypy
try:
from zope.index.text.okascore import score
except ImportError: #pragma NO COVERAGE
except ImportError: # pragma: no cover
score = None
from BTrees.Length import Length

score = None if PURE_PYTHON else score

_py_impl = getattr(platform, 'python_implementation', lambda: None)
_is_pypy = _py_impl() == 'PyPy'
PURE_PYTHON = os.environ.get('PURE_PYTHON') or _is_pypy
if PURE_PYTHON:
score = None

PY2 = str is bytes

Expand All @@ -216,7 +215,7 @@ class OkapiIndex(BaseIndex):

# BM25 free parameters.
K1 = 1.2
B = 0.75
B = 0.75
assert K1 >= 0.0
assert 0.0 <= B <= 1.0

Expand Down Expand Up @@ -268,87 +267,88 @@ def _change_doc_len(self, delta):
# D to TF(D,t)*IDF(t) directly, where the product is computed as a float.
# NOTE: This may be overridden below, by a function that computes the
# same thing but with the inner scoring loop in C.
if score is None:
def _search_wids(self, wids):
if not wids:
return []
N = float(self.documentCount()) # total # of docs
try:
doclen = self._totaldoclen()
except TypeError:
# _totaldoclen has not yet been upgraded
doclen = self._totaldoclen
meandoclen = doclen / N
K1 = self.K1
B = self.B
K1_plus1 = K1 + 1.0
B_from1 = 1.0 - B

# f(D, t) * (k1 + 1)
# TF(D, t) = -------------------------------------------
# f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

L = []
docid2len = self._docweight
for t in wids:
d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
idf = inverse_doc_frequency(len(d2f), N) # an unscaled float
result = self.family.IF.Bucket()
for docid, f in d2f.items():
lenweight = B_from1 + B * docid2len[docid] / meandoclen
tf = f * K1_plus1 / (f + K1 * lenweight)
result[docid] = tf * idf
L.append((result, 1))
return L

# Note about the above: the result is tf * idf. tf is
# small -- it can't be larger than k1+1 = 2.2. idf is
# formally unbounded, but is less than 14 for a term that
# appears in only 1 of a million documents. So the
# product is probably less than 32, or 5 bits before the
# radix point. If we did the scaled-int business on both
# of them, we'd be up to 25 bits. Add 64 of those and
# we'd be in overflow territory. That's pretty unlikely,
# so we *could* just store scaled_int(tf) in
# result[docid], and use scaled_int(idf) as an invariant
# weight across the whole result. But besides skating
# near the edge, it's not a speed cure, since the
# computation of tf would still be done at Python speed,
# and it's a lot more work than just multiplying by idf.
else:
# The same function as _search_wids above, but with the inner scoring
# loop written in C (module okascore, function score()).
# Cautions: okascore hardcodes the values of K, B1, and the scaled_int
# function.
def _search_wids(self, wids):
if not wids:
return []
N = float(self.documentCount()) # total # of docs
try:
doclen = self._totaldoclen()
except TypeError:
# _totaldoclen has not yet been upgraded
doclen = self._totaldoclen
meandoclen = doclen / N
#K1 = self.K1
#B = self.B
#K1_plus1 = K1 + 1.0
#B_from1 = 1.0 - B

# f(D, t) * (k1 + 1)
# TF(D, t) = -------------------------------------------
# f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

L = []
docid2len = self._docweight
for t in wids:
d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
idf = inverse_doc_frequency(len(d2f), N) # an unscaled float
result = self.family.IF.Bucket()
items = d2f.items() if PY2 else list(d2f.items())
score(result, items, docid2len, idf, meandoclen)
L.append((result, 1))
return L
def _python_search_wids(self, wids):
if not wids:
return []
N = float(self.documentCount()) # total # of docs
try:
doclen = self._totaldoclen()
except TypeError:
# _totaldoclen has not yet been upgraded
doclen = self._totaldoclen
meandoclen = doclen / N
K1 = self.K1
B = self.B
K1_plus1 = K1 + 1.0
B_from1 = 1.0 - B

# f(D, t) * (k1 + 1)
# TF(D, t) = -------------------------------------------
# f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

L = []
docid2len = self._docweight
for t in wids:
d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
idf = inverse_doc_frequency(len(d2f), N) # an unscaled float
result = self.family.IF.Bucket()
for docid, f in d2f.items():
lenweight = B_from1 + B * docid2len[docid] / meandoclen
tf = f * K1_plus1 / (f + K1 * lenweight)
result[docid] = tf * idf
L.append((result, 1))
return L

# Note about the above: the result is tf * idf. tf is
# small -- it can't be larger than k1+1 = 2.2. idf is
# formally unbounded, but is less than 14 for a term that
# appears in only 1 of a million documents. So the
# product is probably less than 32, or 5 bits before the
# radix point. If we did the scaled-int business on both
# of them, we'd be up to 25 bits. Add 64 of those and
# we'd be in overflow territory. That's pretty unlikely,
# so we *could* just store scaled_int(tf) in
# result[docid], and use scaled_int(idf) as an invariant
# weight across the whole result. But besides skating
# near the edge, it's not a speed cure, since the
# computation of tf would still be done at Python speed,
# and it's a lot more work than just multiplying by idf.

# The same computation as _python_search_wids above, but with the inner
# scoring loop written in C (module okascore, function score()).
# Caution: okascore hardcodes the values of K1 and B, and the scaled_int
# function.
def _c_search_wids(self, wids):
if not wids:
return []
N = float(self.documentCount()) # total # of docs
try:
doclen = self._totaldoclen()
except TypeError:
# _totaldoclen has not yet been upgraded
doclen = self._totaldoclen
meandoclen = doclen / N
#K1 = self.K1
#B = self.B
#K1_plus1 = K1 + 1.0
#B_from1 = 1.0 - B

# f(D, t) * (k1 + 1)
# TF(D, t) = -------------------------------------------
# f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

L = []
docid2len = self._docweight
for t in wids:
d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
idf = inverse_doc_frequency(len(d2f), N) # an unscaled float
result = self.family.IF.Bucket()
items = d2f.items() if PY2 else list(d2f.items())
score(result, items, docid2len, idf, meandoclen)
L.append((result, 1))
return L

_search_wids = _python_search_wids if score is None else _c_search_wids

def query_weight(self, terms):
# Get the wids.
Expand Down
23 changes: 22 additions & 1 deletion src/zope/index/text/tests/test_okapiindex.py
Expand Up @@ -17,6 +17,8 @@

# pylint:disable=protected-access

from zope.index.text.okapiindex import PURE_PYTHON

class OkapiIndexTestMixin(object):

def _getBTreesFamily(self):
Expand Down Expand Up @@ -127,6 +129,10 @@ def test_query_weight_empty_wids(self):
index.index_doc(1, 'one two three')
self.assertEqual(index.query_weight(()), 0.0)

def test__search_wids_empty_wids(self):
    """An empty sequence of word ids produces an empty result list."""
    empty_result = self._makeOne()._search_wids(())
    self.assertEqual([], empty_result)

def test_query_weight_oov_wids(self):
index = self._makeOne()
index.index_doc(1, 'one two three')
Expand All @@ -142,6 +148,12 @@ def test_query_weight_hit_multiple_occurences(self):
index.index_doc(1, 'one one two three one')
self.assertGreater(index.query_weight(['one']), 0.0)

class OkapiIndexPurePythonTestMixin(OkapiIndexTestMixin):
    """Okapi index tests pinned to the pure-Python search implementation."""

    def _makeOne(self):
        # Build the index exactly as the base mixin does, then shadow
        # _search_wids on the instance so every test goes through
        # _python_search_wids.
        okapi = super(OkapiIndexPurePythonTestMixin, self)._makeOne()
        okapi._search_wids = okapi._python_search_wids
        return okapi

class OkapiIndexTest32(OkapiIndexTestMixin, unittest.TestCase):

Expand All @@ -155,9 +167,18 @@ def _getBTreesFamily(self):
import BTrees
return BTrees.family64

# Runs the OkapiIndexTest32 tests a second time with _search_wids forced to
# the pure-Python implementation.  Skipped when PURE_PYTHON is set, because
# the regular test classes already exercise that implementation then.
@unittest.skipIf(PURE_PYTHON, "Already tested")
class OkapiIndexPurePythonTest32(OkapiIndexPurePythonTestMixin, OkapiIndexTest32):
pass

# Runs the OkapiIndexTest64 tests a second time with _search_wids forced to
# the pure-Python implementation.  Skipped when PURE_PYTHON is set, because
# the regular test classes already exercise that implementation then.
@unittest.skipIf(PURE_PYTHON, "Already tested")
class OkapiIndexPurePythonTest64(OkapiIndexPurePythonTestMixin, OkapiIndexTest64):
pass


class TestScore(unittest.TestCase):
    """Check which scoring implementation ``okapiindex`` exposes."""

    def test_score_extension(self):
        # Asserts that ``score`` is None when PURE_PYTHON is set and is
        # not None otherwise.  PURE_PYTHON comes from the module-level
        # import, so only ``score`` is imported here, and only once.
        from zope.index.text.okapiindex import score
        assert_score = self.assertIsNone if PURE_PYTHON else self.assertIsNotNone
        assert_score(score)

0 comments on commit 56907aa

Please sign in to comment.