Skip to content
This repository has been archived by the owner on May 13, 2020. It is now read-only.

Commit

Permalink
Use the new SetOps for mass union/intersection.
Browse files Browse the repository at this point in the history
  • Loading branch information
Tim Peters committed May 15, 2002
1 parent 6aadf5a commit 81933db
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 56 deletions.
29 changes: 7 additions & 22 deletions Index.py
Expand Up @@ -17,11 +17,12 @@
import math

from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import IIBTree, IIBucket, IISet
from BTrees.IIBTree import weightedIntersection, weightedUnion
from BTrees.IIBTree import IIBTree, IIBucket

from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode
from Products.ZCTextIndex.SetOps import mass_weightedIntersection, \
mass_weightedUnion

import ZODB
from Persistence import Persistent
Expand Down Expand Up @@ -62,7 +63,7 @@ def __init__(self, lexicon):
def length(self):
    """Number of documents currently stored in the index."""
    docid_map = self._docwords
    return len(docid_map)

def get_words(self, docid):
    """Return the decoded list of word ids recorded for *docid*."""
    encoded = self._docwords[docid]
    return WidCode.decode(encoded)
Expand Down Expand Up @@ -114,15 +115,15 @@ def unindex_doc(self, docid):

def search(self, term):
wids = self._lexicon.termToWordIds(term)
return self._union(self._search_wids(wids))
return mass_weightedUnion(self._search_wids(wids))

def search_glob(self, pattern):
wids = self._lexicon.globToWordIds(pattern)
return self._union(self._search_wids(wids))
return mass_weightedUnion(self._search_wids(wids))

def search_phrase(self, phrase):
wids = self._lexicon.termToWordIds(phrase)
hits = self._intersection(self._search_wids(wids))
hits = mass_weightedIntersection(self._search_wids(wids))
if not hits:
return hits
code = WidCode.encode(wids)
Expand All @@ -149,22 +150,6 @@ def _search_wids(self, wids):
L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
return L

def _intersection(self, L):
    """Return the weighted intersection of all (mapping, weight) pairs in L.

    L is a list of (IIBucket-or-IIBTree, weight) pairs, as produced by
    _search_wids().  Returns an empty IIBTree when L is empty.
    """
    if not L:
        return IIBTree()
    # Intersect with the smallest mapping first, so intermediate results
    # stay as small as possible.  This matches the OkapiIndex version of
    # this method and removes the reliance on callers pre-sorting L.
    L = L[:]  # don't mutate the caller's L
    L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
    d2w, weight = L[0]
    # Union against an empty tree just applies the first pair's weight.
    dummy, result = weightedUnion(IIBTree(), d2w, 1, weight)
    for d2w, weight in L[1:]:
        dummy, result = weightedIntersection(result, d2w, 1, weight)
    return result

def _union(self, L):
    """Return the weighted union of all (mapping, weight) pairs in L."""
    # XXX This can be optimized, see OkapiIndex
    acc = IIBTree()
    for mapping, w in L:
        ignored, acc = weightedUnion(acc, mapping, 1, w)
    return acc

def query_weight(self, terms):
wids = []
for term in terms:
Expand Down
41 changes: 7 additions & 34 deletions OkapiIndex.py
Expand Up @@ -20,12 +20,13 @@
import math

from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import IIBTree, IIBucket, IISet
from BTrees.IIBTree import weightedIntersection, weightedUnion
from BTrees.IIBTree import IIBTree, IIBucket

from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode
from Products.ZCTextIndex.NBest import NBest
from Products.ZCTextIndex import WidCode
from Products.ZCTextIndex.SetOps import mass_weightedIntersection, \
mass_weightedUnion

# Instead of storing floats, we generally store scaled ints. Binary pickles
# can store those more efficiently. The default SCALE_FACTOR of 1024
Expand Down Expand Up @@ -98,15 +99,15 @@ def unindex_doc(self, docid):

def search(self, term):
wids = self._lexicon.termToWordIds(term)
return self._union(self._search_wids(wids))
return mass_weightedUnion(self._search_wids(wids))

def search_glob(self, pattern):
wids = self._lexicon.globToWordIds(pattern)
return self._union(self._search_wids(wids))
return mass_weightedUnion(self._search_wids(wids))

def search_phrase(self, phrase):
wids = self._lexicon.termToWordIds(phrase)
hits = self._intersection(self._search_wids(wids))
hits = mass_weightedIntersection(self._search_wids(wids))
if not hits:
return hits
code = WidCode.encode(wids)
Expand Down Expand Up @@ -156,34 +157,6 @@ def _search_wids(self, wids):
# of tf would still done at Python speed, and it's a lot more
# work than just multiplying by idf.

def _intersection(self, L):
    """Return the weighted intersection of all (mapping, weight) pairs in L."""
    if not L:
        return IIBTree()
    # Work on a sorted copy, smallest mapping first, so intermediate
    # results stay small; the caller's list is left untouched.
    pairs = L[:]
    pairs.sort(lambda a, b: cmp(len(a[0]), len(b[0])))
    first_map, first_weight = pairs[0]
    # Seed the accumulator: union with an empty tree applies the first
    # pair's weight.
    ignored, result = weightedUnion(IIBTree(), first_map, 1, first_weight)
    for d2w, weight in pairs[1:]:
        ignored, result = weightedIntersection(result, d2w, 1, weight)
    return result

def _union(self, L):
    """Return the weighted union of all (mapping, weight) pairs in L.

    Unions are balanced by repeatedly merging the two smallest mappings,
    smallest to largest, to keep the total merge work low.
    """
    if not L:
        return IIBTree()
    # Priority queue keyed on mapping size; smallest pops first.
    queue = NBest(len(L))
    for mapping, weight in L:
        queue.add((mapping, weight), len(mapping))
    # Merge the two smallest entries and push the result back until
    # only one mapping remains.
    while len(queue) > 1:
        (a, wa), ignored = queue.pop_smallest()
        (b, wb), ignored = queue.pop_smallest()
        ignored, merged = weightedUnion(a, b, wa, wb)
        queue.add((merged, 1), len(merged))
    (result, final_weight), final_score = queue.pop_smallest()
    return result

def query_weight(self, terms):
# XXX I have no idea what to put here
return 10
Expand Down

0 comments on commit 81933db

Please sign in to comment.