Since I did the work to write the inner Okapi scoring loop in C, may as

well check it in. This yields an overall 133% speedup on a "hot" search for 'python' in my python-dev archive (a word that appears in all but 2 documents). For those who read the email, turned out it was a significant speedup to iterate over an IIBTree's items rather than to materialize the items into an explicit list first. This is now within 20% of simply doing "IIBucket(the_IIBTree)" (i.e., no arithmetic at all), so there's no significant possibility remaining for speeding the inner score loop.
zopefoundation · May 20, 2002 · bd71f89 · bd71f89
1 parent 05bc9db
commit bd71f89
Show file tree

Hide file tree

Showing 3 changed files with 156 additions and 1 deletion.
diff --git a/OkapiIndex.py b/OkapiIndex.py
@@ -70,6 +70,8 @@ def unindex_doc(self, docid):
     # As currently written, the weights are always 1, and the IIBucket maps
     # D to TF(D,t)*IDF(t) directly, where the product is computed as a float
     # but stored as a scaled_int.
+    # NOTE:  This is overridden below, by a function that computes the
+    # same thing but with the inner scoring loop in C.
     def _search_wids(self, wids):
         if not wids:
             return []
@@ -87,7 +89,6 @@ def _search_wids(self, wids):
         L = []
         docid2len = self._docweight
         for t in wids:
-            assert self._wordinfo.has_key(t)  # caller responsible for OOV
             d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
             idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
             result = IIBucket()
@@ -111,6 +112,35 @@ def _search_wids(self, wids):
         # of tf would still be done at Python speed, and it's a lot more
         # work than just multiplying by idf.
 
+    # The same function as _search_wids above, but with the inner scoring
+    # loop written in C (module okascore, function score()).
+    # Cautions:  okascore hardcodes the values of K, B1, and the scaled_int
+    # function.
+    def _search_wids(self, wids):
+        from Products.ZCTextIndex.okascore import score
+        if not wids:
+            return []
+        N = float(len(self._docweight))  # total # of docs
+        meandoclen = self._totaldoclen / N
+        #K1 = self.K1
+        #B = self.B
+        #K1_plus1 = K1 + 1.0
+        #B_from1 = 1.0 - B
+
+        #                           f(D, t) * (k1 + 1)
+        #   TF(D, t) =  -------------------------------------------
+        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
+
+        L = []
+        docid2len = self._docweight
+        for t in wids:
+            d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
+            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
+            result = IIBucket()
+            score(result, d2f.items(), docid2len, idf, meandoclen)
+            L.append((result, 1))
+        return L
+
     def query_weight(self, terms):
         # This method was inherited from the cosine measure, and doesn't
         # make sense for Okapi measures in the way the cosine measure uses

diff --git a/Setup b/Setup
@@ -1,2 +1,3 @@
 *shared*
 stopper stopper.c
+okascore okascore.c
diff --git a/okascore.c b/okascore.c
@@ -0,0 +1,124 @@
+/*****************************************************************************
+
+  Copyright (c) 2002 Zope Corporation and Contributors.
+  All Rights Reserved.
+
+  This software is subject to the provisions of the Zope Public License,
+  Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+  THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+  WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+  WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+  FOR A PARTICULAR PURPOSE
+
+ ****************************************************************************/
+
+/*	okascore.c
+ *
+ *	The inner scoring loop of OkapiIndex._search_wids() coded in C.
+ *
+ * With the Python scoring loop,
+ *
+ *      query: python
+ *      # results: 10 of 19056 in 534.77 ms
+ *      query: python
+ *      # results: 10 of 19056 in 277.52 ms
+ *
+ * The first timing is cold, the second timing from an immediate repeat of
+ * the same query.  With the timing loop here in C:
+ *
+ *     query: python
+ *     # results: 10 of 19056 in 380.74 ms  -- 40% speedup
+ *     query: python
+ *     # results: 10 of 19056 in 118.96 ms  -- 133% speedup
+ */
+
+#include "Python.h"
+
+#define K1 1.2
+#define B  0.75
+
+static PyObject *
+score(PyObject *self, PyObject *args)
+{
+	/* Believe it or not, floating these common subexpressions "by hand"
+	   gets better code out of MSVC 6. */
+	const double B_FROM1 = 1.0 - B;
+	const double K1_PLUS1 = K1 + 1.0;
+
+	/* Inputs */
+	PyObject *result;	/* IIBucket result, maps d to score */
+	PyObject *d2fitems;	/* ._wordinfo[t].items(), maps d to f(d, t) */
+	PyObject *d2len;	/* ._docweight, maps d to # words in d */
+	double idf;		/* inverse doc frequency of t */
+	double meandoclen;	/* average number of words in a doc */
+
+	int n, i;
+
+	if (!PyArg_ParseTuple(args, "OOOdd:score", &result, &d2fitems, &d2len,
+						   &idf, &meandoclen))
+		return NULL;
+
+	n = PyObject_Length(d2fitems);
+	for (i = 0; i < n; ++i) {
+		PyObject *d_and_f;	/* d2f[i], a (d, f) pair */
+		PyObject *d;
+		double f;
+		PyObject *doclen;	/* ._docweight[d] */
+		double lenweight;
+		double tf;
+		double score;
+		PyObject *scaled_int;
+		int status;
+
+		d_and_f = PySequence_GetItem(d2fitems, i);
+		if (d_and_f == NULL)
+			return NULL;
+		if (!(PyTuple_Check(d_and_f) &&
+		      PyTuple_Size(d_and_f) == 2)) {
+			PyErr_SetString(PyExc_TypeError,
+				"d2fitems must produce 2-item tuples");
+			Py_DECREF(d_and_f);
+			return NULL;
+		}
+		d = PyTuple_GET_ITEM(d_and_f, 0);
+		f = (double)PyInt_AsLong(PyTuple_GET_ITEM(d_and_f, 1));
+
+		doclen = PyObject_GetItem(d2len, d);
+		if (doclen == NULL) {
+			Py_DECREF(d_and_f);
+			return NULL;
+		}
+		lenweight = B_FROM1 + B * PyInt_AsLong(doclen) / meandoclen;
+
+		tf = f * K1_PLUS1 / (f + K1 * lenweight);
+		score = tf * idf;
+		scaled_int = PyInt_FromLong((long)(score * 1024.0 + 0.5));
+		status = PyObject_SetItem(result, d, scaled_int);
+		Py_DECREF(d_and_f);
+		Py_DECREF(doclen);
+		Py_DECREF(scaled_int);
+		if (status < 0)
+			return NULL;
+	}
+	Py_INCREF(Py_None);
+	return Py_None;
+}
+
+static char score__doc__[] =
+"score(result, d2fitems, d2len, idf, meandoclen)\n"
+"\n"
+"Do the inner scoring loop for an Okapi index.\n";
+
+static PyMethodDef okascore_functions[] = {
+	{"score",	   score,	  METH_VARARGS, score__doc__},
+	{NULL}
+};
+
+void
+initokascore(void)
+{
+	PyObject *m;
+
+	m = Py_InitModule3("okascore", okascore_functions,
+			    "inner scoring loop for Okapi rank");
+}