Skip to content
This repository has been archived by the owner on May 13, 2020. It is now read-only.

Commit

Permalink
Factor out most of the code for indexing a doc. The cosine index may
Browse files Browse the repository at this point in the history
take longer to construct now; both indexers' _get_frequencies routines
were fiddled to return the same kind of stuff again, and I had
previously fiddled the cosine indexer's _get_frequencies to do something
weirder but (probably) faster than this.
  • Loading branch information
Tim Peters committed May 17, 2002
1 parent ce70276 commit 94bbe5c
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 42 deletions.
20 changes: 19 additions & 1 deletion BaseIndex.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,26 @@ def get_words(self, docid):
"""Returns the wordids for a given docid"""
return WidCode.decode(self._docwords[docid])

# Subclass must override.
# A subclass may wish to extend or override this.
def index_doc(self, docid, text):
    """Index `text` under `docid` and return the number of word ids in it.

    The lexicon turns the text into word ids; self._get_frequencies()
    turns those into per-word weights plus a document weight.
    """
    # XXX If docid is already known, do something smart.
    word_ids = self._lexicon.sourceToWordIds(text)
    frequencies = self._get_frequencies(word_ids)
    weight_by_wid, doc_weight = frequencies

    # Record one (wid, weight) entry for this document per distinct word.
    for wid, weight in weight_by_wid.items():
        self._add_wordinfo(wid, weight, docid)

    self._docweight[docid] = doc_weight
    # Keep the encoded word-id sequence so it can be decoded again later.
    self._docwords[docid] = WidCode.encode(word_ids)
    return len(word_ids)

# Subclass must override.
def _get_frequencies(self, wids):
    # Compute term frequencies and a doc weight, whatever those mean
    # to an indexer.
    # wids is the sequence of word ids for one document; index_doc passes
    # in the result of self._lexicon.sourceToWordIds(text).
    # Return pair:
    #    {wid0: w(d, wid0), wid1: w(d, wid1), ...},
    #    docweight
    # The wid->weight mappings are fed into _add_wordinfo, and docweight
    # becomes the value of _docweight[docid].
    # This base version only raises NotImplementedError.
    raise NotImplementedError

# A subclass may wish to extend or override this.
Expand Down
35 changes: 9 additions & 26 deletions CosineIndex.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,15 +69,6 @@ def __init__(self, lexicon):
# W(q) = sqrt(sum(for t in q: w(q, t) ** 2))
# computed by self.query_weight()

def index_doc(self, docid, text):
    """Index `text` under `docid` and return the number of word ids in it."""
    wids = self._lexicon.sourceToWordIds(text)
    # _get_frequencies() returns a ({wid: weight}, docweight) pair, so
    # unpack two values and walk the mapping directly rather than indexing
    # two parallel lists.
    wid2weight, docweight = self._get_frequencies(wids)
    for wid, weight in wid2weight.items():
        self._add_wordinfo(wid, weight, docid)
    self._docweight[docid] = docweight
    self._docwords[docid] = WidCode.encode(wids)
    return len(wids)

def _search_wids(self, wids):
if not wids:
return []
Expand Down Expand Up @@ -111,30 +102,22 @@ def query_weight(self, terms):
return scaled_int(math.sqrt(sum))

def _get_frequencies(self, wids):
    """Return individual doc-term weights and docweight.

    Computes w(d, t) for each distinct term t in wids, and
    W(d) = sqrt(sum of w(d, t) ** 2).

    Returns a pair:
        {wid0: w(d, wid0)/W(d), wid1: w(d, wid1)/W(d), ...},
        W(d)
    The dict values and W(d) are passed through scaled_int().
    """
    # Pass 1: raw occurrence count per wid.
    d = {}
    dget = d.get
    for wid in wids:
        d[wid] = dget(wid, 0) + 1

    # Pass 2: replace each count by its term weight, accumulating the sum
    # of squares.  Only values of existing keys are rebound, so the dict
    # never changes size while it is being iterated.
    Wsquares = 0.0
    for wid, count in d.items():
        w = doc_term_weight(count)
        Wsquares += w * w
        d[wid] = w
    W = math.sqrt(Wsquares)

    # Pass 3: normalize each weight by W(d).  The loop body never runs
    # for an empty wids, so W == 0.0 is not divided by in that case.
    for wid, weight in d.items():
        d[wid] = scaled_int(weight / W)
    return d, scaled_int(W)

# The rest are helper methods to support unit tests

Expand Down
18 changes: 3 additions & 15 deletions OkapiIndex.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,16 +55,8 @@ def __init__(self, lexicon):
self._totaldoclen = 0L

def index_doc(self, docid, text):
    """Index `text` under `docid` and return the number of word ids in it.

    The per-document work is delegated to BaseIndex.index_doc(); this
    override only adds the running total of document lengths.
    """
    count = BaseIndex.index_doc(self, docid, text)
    # unindex_doc() subtracts self._docweight[docid] from this total, so
    # it must grow by the same length that was just indexed.
    self._totaldoclen += count
    # Return the count, as BaseIndex.index_doc() and the inline body this
    # replaces both do.
    return count

def unindex_doc(self, docid):
self._totaldoclen -= self._docweight[docid]
Expand Down Expand Up @@ -125,15 +117,11 @@ def query_weight(self, terms):
return 10 # arbitrary

def _get_frequencies(self, wids):
"""Return individual term frequencies."""
# Computes f(d, t) for each term.
# Returns a dict mapping wid to the number of times wid appeares
# in wids, {t -> f(d, t)}
d = {}
dget = d.get
for wid in wids:
d[wid] = dget(wid, 0) + 1
return d
return d, len(wids)

"""
"Okapi" (much like "cosine rule" also) is a large family of scoring gimmicks.
Expand Down

0 comments on commit 94bbe5c

Please sign in to comment.