diff --git a/OkapiIndex.py b/OkapiIndex.py
index b2de1e6..f6dc43a 100644
--- a/OkapiIndex.py
+++ b/OkapiIndex.py
@@ -70,6 +70,8 @@ def unindex_doc(self, docid):
     # As currently written, the weights are always 1, and the IIBucket maps
     # D to TF(D,t)*IDF(t) directly, where the product is computed as a float
     # but stored as a scaled_int.
+    # NOTE: This is overridden below, by a function that computes the
+    # same thing but with the inner scoring loop in C.
     def _search_wids(self, wids):
         if not wids:
             return []
@@ -87,7 +89,6 @@ def _search_wids(self, wids):
         L = []
         docid2len = self._docweight
         for t in wids:
-            assert self._wordinfo.has_key(t) # caller responsible for OOV
             d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
             idf = inverse_doc_frequency(len(d2f), N) # an unscaled float
             result = IIBucket()
@@ -111,6 +112,33 @@ def _search_wids(self, wids):
     # of tf would still be done at Python speed, and it's a lot more
     # work than just multiplying by idf.
 
+    # The same function as _search_wids above, but with the inner scoring
+    # loop written in C (module okascore, function score()).
+    # Cautions: okascore hardcodes the values of K1, B, and the scaled_int
+    # function.
+    def _search_wids(self, wids):
+        from Products.ZCTextIndex.okascore import score
+        if not wids:
+            return []
+        N = float(len(self._docweight))  # total # of docs
+        meandoclen = self._totaldoclen / N
+        # self.K1 and self.B are deliberately not read here: score() uses
+        # its own hardcoded copies of both constants.
+
+        #                           f(D, t) * (k1 + 1)
+        #   TF(D, t) =  -------------------------------------------
+        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
+
+        L = []
+        docid2len = self._docweight
+        for t in wids:
+            d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
+            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
+            result = IIBucket()
+            score(result, d2f.items(), docid2len, idf, meandoclen)
+            L.append((result, 1))
+        return L
+
     def query_weight(self, terms):
         # This method was inherited from the cosine measure, and doesn't
         # make sense for Okapi measures in the way the cosine measure uses
diff --git a/Setup b/Setup
index 4e17b6c..b80df4e 100644
--- a/Setup
+++ b/Setup
@@ -1,2 +1,3 @@
 *shared*
 stopper stopper.c
+okascore okascore.c
diff --git a/okascore.c b/okascore.c
new file mode 100644
index 0000000..5257d7d
--- /dev/null
+++ b/okascore.c
@@ -0,0 +1,164 @@
+/*****************************************************************************
+
+  Copyright (c) 2002 Zope Corporation and Contributors.
+  All Rights Reserved.
+
+  This software is subject to the provisions of the Zope Public License,
+  Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+  THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+  WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+  WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+  FOR A PARTICULAR PURPOSE
+
+ ****************************************************************************/
+
+/* okascore.c
+ *
+ * The inner scoring loop of OkapiIndex._search_wids() coded in C.
+ *
+ * With the Python scoring loop,
+ *
+ *     query: python
+ *     # results: 10 of 19056 in 534.77 ms
+ *     query: python
+ *     # results: 10 of 19056 in 277.52 ms
+ *
+ * The first timing is cold, the second timing from an immediate repeat of
+ * the same query. With the scoring loop here in C:
+ *
+ *     query: python
+ *     # results: 10 of 19056 in 380.74 ms -- 40% speedup
+ *     query: python
+ *     # results: 10 of 19056 in 118.96 ms -- 133% speedup
+ */
+
+#include "Python.h"
+
+/* Tuning constants of the TF formula. OkapiIndex keeps its own K1 and B
+ * attributes; the values here are hardcoded and must be kept in sync.
+ */
+#define K1 1.2
+#define B 0.75
+
+/* Multiplier used to store a float score as an int; this is the hardcoded
+ * counterpart of the Python scaled_int function.
+ */
+#define SCALE_FACTOR 1024.0
+
+/* score(result, d2fitems, d2len, idf, meandoclen) -> None
+ *
+ * For each (d, f) pair in d2fitems, compute
+ *
+ *     lenweight = (1 - B) + B * d2len[d] / meandoclen
+ *     tf        = f * (K1 + 1) / (f + K1 * lenweight)
+ *
+ * and store tf * idf, scaled by SCALE_FACTOR and rounded, as result[d].
+ * Returns NULL with an exception set if an item is not a 2-tuple, if f or
+ * d2len[d] cannot be converted to a C long, or if any lookup, store or
+ * allocation fails.
+ */
+static PyObject *
+score(PyObject *self, PyObject *args)
+{
+    /* Believe it or not, floating these common subexpressions "by hand"
+       gets better code out of MSVC 6. */
+    const double B_FROM1 = 1.0 - B;
+    const double K1_PLUS1 = K1 + 1.0;
+
+    /* Inputs */
+    PyObject *result;    /* IIBucket result, maps d to score */
+    PyObject *d2fitems;  /* ._wordinfo[t].items(), maps d to f(d, t) */
+    PyObject *d2len;     /* ._docweight, maps d to # words in d */
+    double idf;          /* inverse doc frequency of t */
+    double meandoclen;   /* average number of words in a doc */
+
+    int n, i;
+
+    if (!PyArg_ParseTuple(args, "OOOdd:score", &result, &d2fitems, &d2len,
+                          &idf, &meandoclen))
+        return NULL;
+
+    n = PyObject_Length(d2fitems);
+    if (n < 0)
+        return NULL;    /* no usable length; the exception is already set */
+
+    for (i = 0; i < n; ++i) {
+        PyObject *d_and_f;    /* d2fitems[i], a (d, f) pair */
+        PyObject *d;          /* borrowed from d_and_f */
+        long f_long;          /* f(d, t) before conversion to double */
+        double f;
+        PyObject *doclen;     /* ._docweight[d] */
+        long doclen_long;
+        double lenweight;
+        double tf;
+        double doc_score;
+        PyObject *scaled_int;
+        int status;
+
+        d_and_f = PySequence_GetItem(d2fitems, i);
+        if (d_and_f == NULL)
+            return NULL;
+        if (!(PyTuple_Check(d_and_f) &&
+              PyTuple_Size(d_and_f) == 2)) {
+            PyErr_SetString(PyExc_TypeError,
+                            "d2fitems must produce 2-item tuples");
+            Py_DECREF(d_and_f);
+            return NULL;
+        }
+        d = PyTuple_GET_ITEM(d_and_f, 0);
+
+        /* PyInt_AsLong() signals failure with -1, which is also a legal
+           value, so PyErr_Occurred() has to disambiguate. */
+        f_long = PyInt_AsLong(PyTuple_GET_ITEM(d_and_f, 1));
+        if (f_long == -1 && PyErr_Occurred()) {
+            Py_DECREF(d_and_f);
+            return NULL;
+        }
+        f = (double)f_long;
+
+        doclen = PyObject_GetItem(d2len, d);
+        if (doclen == NULL) {
+            Py_DECREF(d_and_f);
+            return NULL;
+        }
+        doclen_long = PyInt_AsLong(doclen);
+        Py_DECREF(doclen);
+        if (doclen_long == -1 && PyErr_Occurred()) {
+            Py_DECREF(d_and_f);
+            return NULL;
+        }
+        lenweight = B_FROM1 + B * doclen_long / meandoclen;
+
+        tf = f * K1_PLUS1 / (f + K1 * lenweight);
+        doc_score = tf * idf;
+        scaled_int = PyInt_FromLong((long)(doc_score * SCALE_FACTOR + 0.5));
+        if (scaled_int == NULL) {
+            Py_DECREF(d_and_f);
+            return NULL;
+        }
+        status = PyObject_SetItem(result, d, scaled_int);
+        Py_DECREF(d_and_f);    /* also ends the borrowed lifetime of d */
+        Py_DECREF(scaled_int);
+        if (status < 0)
+            return NULL;
+    }
+    Py_INCREF(Py_None);
+    return Py_None;
+}
+
+static char score__doc__[] =
+"score(result, d2fitems, d2len, idf, meandoclen)\n"
+"\n"
+"Do the inner scoring loop for an Okapi index.\n";
+
+static PyMethodDef okascore_functions[] = {
+    {"score", score, METH_VARARGS, score__doc__},
+    {NULL, NULL, 0, NULL}    /* sentinel */
+};
+
+/* Module initialization; Python 2 looks up init<modulename> on import. */
+void
+initokascore(void)
+{
+    Py_InitModule3("okascore", okascore_functions,
+                   "inner scoring loop for Okapi rank");
+}