Factor out most of the code for indexing a doc. The cosine index may take longer to construct now; both indexers' _get_frequencies routines were fiddled to return the same kind of stuff again, and I had previously fiddled the cosine indexer's _get_frequencies to do something weirder but (probably) faster than this.
zopefoundation · May 17, 2002 · 94bbe5c · 94bbe5c
1 parent ce70276
commit 94bbe5c
Show file tree

Hide file tree

Showing 3 changed files with 31 additions and 42 deletions.
diff --git a/BaseIndex.py b/BaseIndex.py
@@ -82,8 +82,26 @@ def get_words(self, docid):
         """Returns the wordids for a given docid"""
         return WidCode.decode(self._docwords[docid])
 
-    # Subclass must override.
+    # A subclass may wish to extend or override this.
     def index_doc(self, docid, text):
+        # XXX If docid is already known, do something smart.
+        wids = self._lexicon.sourceToWordIds(text)
+        wid2weight, docweight = self._get_frequencies(wids)
+        for wid, weight in wid2weight.items():
+            self._add_wordinfo(wid, weight, docid)
+        self._docweight[docid] = docweight
+        self._docwords[docid] = WidCode.encode(wids)
+        return len(wids)
+
+    # Subclass must override.
+    def _get_frequencies(self, wids):
+        # Compute term frequencies and a doc weight, whatever those mean
+        # to an indexer.
+        # Return pair:
+        #    {wid0: w(d, wid0), wid1: w(d, wid1), ...},
+        #    docweight
+        # The wid->weight mappings are fed into _add_wordinfo, and docweight
+        # becomes the value of _docweight[docid].
         raise NotImplementedError
 
     # A subclass may wish to extend or override this.

diff --git a/CosineIndex.py b/CosineIndex.py
@@ -69,15 +69,6 @@ def __init__(self, lexicon):
     #    W(q) = sqrt(sum(for t in q: w(q, t) ** 2))
     #        computed by self.query_weight()
 
-    def index_doc(self, docid, text):
-        wids = self._lexicon.sourceToWordIds(text)
-        uniqwids, freqs, docweight = self._get_frequencies(wids)
-        for i in range(len(uniqwids)):
-            self._add_wordinfo(uniqwids[i], freqs[i], docid)
-        self._docweight[docid] = docweight
-        self._docwords[docid] = WidCode.encode(wids)
-        return len(wids)
-
     def _search_wids(self, wids):
         if not wids:
             return []
@@ -111,30 +102,22 @@ def query_weight(self, terms):
         return scaled_int(math.sqrt(sum))
 
     def _get_frequencies(self, wids):
-        """Return individual doc-term weights and docweight."""
-        # Computes w(d, t) for each term, and W(d).
-        # Return triple:
-        #    [wid0, wid1, ...],
-        #    [w(d, wid0)/W(d), w(d, wid1)/W(d), ...],
-        #    W(d)
-        # The second list and W(d) are scaled_ints.
         d = {}
+        dget = d.get
         for wid in wids:
-            d[wid] = d.get(wid, 0) + 1
+            d[wid] = dget(wid, 0) + 1
         Wsquares = 0.0
-        weights = []
-        push = weights.append
-        for count in d.values():
+        for wid, count in d.items():
             w = doc_term_weight(count)
             Wsquares += w * w
-            push(w)
+            d[wid] = w
         W = math.sqrt(Wsquares)
         #print "W = %.3f" % W
-        for i in xrange(len(weights)):
-            #print i, ":", "%.3f" % weights[i],
-            weights[i] = scaled_int(weights[i] / W)
-            #print "->", weights[i]
-        return d.keys(), weights, scaled_int(W)
+        for wid, weight in d.items():
+            #print wid, ":", "%.3f" % weight,
+            d[wid] = scaled_int(weight / W)
+            #print "->", d[wid]
+        return d, scaled_int(W)
 
     # The rest are helper methods to support unit tests
 

diff --git a/OkapiIndex.py b/OkapiIndex.py
@@ -55,16 +55,8 @@ def __init__(self, lexicon):
         self._totaldoclen = 0L
 
     def index_doc(self, docid, text):
-        wids = self._lexicon.sourceToWordIds(text)
-        self._docweight[docid] = len(wids)
-        self._totaldoclen += len(wids)
-
-        wid2count = self._get_frequencies(wids)
-        for wid, count in wid2count.items():
-            self._add_wordinfo(wid, count, docid)
-
-        self._docwords[docid] = WidCode.encode(wids)
-        return len(wids)
+        count = BaseIndex.index_doc(self, docid, text)
+        self._totaldoclen += count
 
     def unindex_doc(self, docid):
         self._totaldoclen -= self._docweight[docid]
@@ -125,15 +117,11 @@ def query_weight(self, terms):
         return 10   # arbitrary
 
     def _get_frequencies(self, wids):
-        """Return individual term frequencies."""
-        # Computes f(d, t) for each term.
-        # Returns a dict mapping wid to the number of times wid appeares
-        # in wids, {t -> f(d, t)}
         d = {}
         dget = d.get
         for wid in wids:
             d[wid] = dget(wid, 0) + 1
-        return d
+        return d, len(wids)
 
 """
 "Okapi" (much like "cosine rule" also) is a large family of scoring gimmicks.