Skip to content
This repository has been archived by the owner on May 13, 2020. It is now read-only.

Commit

Permalink
Since I did the work to write the inner Okapi scoring loop in C, may as
Browse files Browse the repository at this point in the history
well check it in.  This yields an overall 133% speedup on a "hot" search
for 'python' in my python-dev archive (a word that appears in all but
2 documents).  For those who read the email, turned out it was a
significant speedup to iterate over an IIBTree's items rather than to
materialize the items into an explicit list first.

This is now within 20% of simply doing "IIBucket(the_IIBTree)" (i.e.,
no arithmetic at all), so there's no significant possibility remaining
for speeding the inner score loop.
  • Loading branch information
Tim Peters committed May 20, 2002
1 parent 05bc9db commit bd71f89
Show file tree
Hide file tree
Showing 3 changed files with 156 additions and 1 deletion.
32 changes: 31 additions & 1 deletion OkapiIndex.py
Expand Up @@ -70,6 +70,8 @@ def unindex_doc(self, docid):
# As currently written, the weights are always 1, and the IIBucket maps
# D to TF(D,t)*IDF(t) directly, where the product is computed as a float
# but stored as a scaled_int.
# NOTE: This is overridden below, by a function that computes the
# same thing but with the inner scoring loop in C.
def _search_wids(self, wids):
if not wids:
return []
Expand All @@ -87,7 +89,6 @@ def _search_wids(self, wids):
L = []
docid2len = self._docweight
for t in wids:
assert self._wordinfo.has_key(t) # caller responsible for OOV
d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
idf = inverse_doc_frequency(len(d2f), N) # an unscaled float
result = IIBucket()
Expand All @@ -111,6 +112,35 @@ def _search_wids(self, wids):
# of tf would still be done at Python speed, and it's a lot more
# work than just multiplying by idf.

# The same function as _search_wids above, but with the inner scoring
# loop written in C (module okascore, function score()).
# Cautions: okascore hardcodes the values of K, B1, and the scaled_int
# function.
def _search_wids(self, wids):
from Products.ZCTextIndex.okascore import score
if not wids:
return []
N = float(len(self._docweight)) # total # of docs
meandoclen = self._totaldoclen / N
#K1 = self.K1
#B = self.B
#K1_plus1 = K1 + 1.0
#B_from1 = 1.0 - B

# f(D, t) * (k1 + 1)
# TF(D, t) = -------------------------------------------
# f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

L = []
docid2len = self._docweight
for t in wids:
d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
idf = inverse_doc_frequency(len(d2f), N) # an unscaled float
result = IIBucket()
score(result, d2f.items(), docid2len, idf, meandoclen)
L.append((result, 1))
return L

def query_weight(self, terms):
# This method was inherited from the cosine measure, and doesn't
# make sense for Okapi measures in the way the cosine measure uses
Expand Down
1 change: 1 addition & 0 deletions Setup
@@ -1,2 +1,3 @@
*shared*
stopper stopper.c
okascore okascore.c
124 changes: 124 additions & 0 deletions okascore.c
@@ -0,0 +1,124 @@
/*****************************************************************************
Copyright (c) 2002 Zope Corporation and Contributors.
All Rights Reserved.
This software is subject to the provisions of the Zope Public License,
Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
FOR A PARTICULAR PURPOSE
****************************************************************************/

/* okascore.c
*
* The inner scoring loop of OkapiIndex._search_wids() coded in C.
*
* With the Python scoring loop,
*
* query: python
* # results: 10 of 19056 in 534.77 ms
* query: python
* # results: 10 of 19056 in 277.52 ms
*
* The first timing is cold, the second timing from an immediate repeat of
* the same query. With the timing loop here in C:
*
* query: python
* # results: 10 of 19056 in 380.74 ms -- 40% speedup
* query: python
* # results: 10 of 19056 in 118.96 ms -- 133% speedup
*/

#include "Python.h"

#define K1 1.2
#define B 0.75

static PyObject *
score(PyObject *self, PyObject *args)
{
/* Believe it or not, floating these common subexpressions "by hand"
gets better code out of MSVC 6. */
const double B_FROM1 = 1.0 - B;
const double K1_PLUS1 = K1 + 1.0;

/* Inputs */
PyObject *result; /* IIBucket result, maps d to score */
PyObject *d2fitems; /* ._wordinfo[t].items(), maps d to f(d, t) */
PyObject *d2len; /* ._docweight, maps d to # words in d */
double idf; /* inverse doc frequency of t */
double meandoclen; /* average number of words in a doc */

int n, i;

if (!PyArg_ParseTuple(args, "OOOdd:score", &result, &d2fitems, &d2len,
&idf, &meandoclen))
return NULL;

n = PyObject_Length(d2fitems);
for (i = 0; i < n; ++i) {
PyObject *d_and_f; /* d2f[i], a (d, f) pair */
PyObject *d;
double f;
PyObject *doclen; /* ._docweight[d] */
double lenweight;
double tf;
double score;
PyObject *scaled_int;
int status;

d_and_f = PySequence_GetItem(d2fitems, i);
if (d_and_f == NULL)
return NULL;
if (!(PyTuple_Check(d_and_f) &&
PyTuple_Size(d_and_f) == 2)) {
PyErr_SetString(PyExc_TypeError,
"d2fitems must produce 2-item tuples");
Py_DECREF(d_and_f);
return NULL;
}
d = PyTuple_GET_ITEM(d_and_f, 0);
f = (double)PyInt_AsLong(PyTuple_GET_ITEM(d_and_f, 1));

doclen = PyObject_GetItem(d2len, d);
if (doclen == NULL) {
Py_DECREF(d_and_f);
return NULL;
}
lenweight = B_FROM1 + B * PyInt_AsLong(doclen) / meandoclen;

tf = f * K1_PLUS1 / (f + K1 * lenweight);
score = tf * idf;
scaled_int = PyInt_FromLong((long)(score * 1024.0 + 0.5));
status = PyObject_SetItem(result, d, scaled_int);
Py_DECREF(d_and_f);
Py_DECREF(doclen);
Py_DECREF(scaled_int);
if (status < 0)
return NULL;
}
Py_INCREF(Py_None);
return Py_None;
}

static char score__doc__[] =
"score(result, d2fitems, d2len, idf, meandoclen)\n"
"\n"
"Do the inner scoring loop for an Okapi index.\n";

static PyMethodDef okascore_functions[] = {
{"score", score, METH_VARARGS, score__doc__},
{NULL}
};

void
initokascore(void)
{
PyObject *m;

m = Py_InitModule3("okascore", okascore_functions,
"inner scoring loop for Okapi rank");
}

0 comments on commit bd71f89

Please sign in to comment.