This repository has been archived by the owner on May 13, 2020. It is now read-only.

Buncha updates:
- Use slightly more portable values for the Data.fs and
  Zope/lib/python.

- Add -t NNN option to specify how often to commit a transaction;
  default 20,000.

- Change -p into -p NNN to specify how often (counted in commits) to
  pack (default 0 -- never pack).

- Reworked the commit and pack logic to maintain the various counters
  across folders (see the sketch after this list).

- Store relative paths (e.g. "inbox/1").

- Store the mtime of indexed messages in doctimes[docid].

- Store the mtime of indexed folders in watchfolders[folder] (unused).

- Refactor updatefolder() to:

  (a) Avoid indexing messages it's already indexed and whose mtime
      hasn't changed.  (This probably needs an override just in case.)

  (b) Unindex messages that no longer exist in the folder.

- Include the folder name and the message header fields from, to, cc,
  bcc, and subject in the text to be indexed.
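
As an editorial illustration (not code from this commit; the class name
CommitPacer and the figures in the sample invocation below are made up),
the new commit/pack cadence works roughly like this: the transaction
counter ticks once per message handled, in whatever folder, and triggers
a commit every -t NNN ticks; the pack counter ticks once per commit and
triggers a pack every -p NNN commits.

    # Illustrative sketch only -- the real logic is in Indexer.maycommit(),
    # Indexer.commit() and Indexer.pack() in tests/mhindex.py below.
    class CommitPacer:

        def __init__(self, trans_limit=20000, pack_limit=0):
            self.trans_limit = trans_limit  # -t NNN: messages per commit (0 = never auto-commit)
            self.pack_limit = pack_limit    # -p NNN: commits per pack (0 = never pack)
            self.trans_count = 0            # messages handled since the last commit
            self.pack_count = 0             # commits made since the last pack

        def maycommit(self):
            # Called once for every message indexed or unindexed, across folders.
            self.trans_count += 1
            if self.trans_count >= self.trans_limit > 0:
                self.commit()

        def commit(self):
            if self.trans_count > 0:
                print "committing..."   # the indexer calls get_transaction().commit() here
                self.trans_count = 0
                self.pack_count += 1
                if self.pack_count >= self.pack_limit > 0:
                    self.pack()

        def pack(self):
            if self.pack_count > 0:
                print "packing..."      # the indexer calls self.database.pack() here
                self.pack_count = 0

Under this scheme a hypothetical bulk run such as
"mhindex.py -b -t 10000 -p 5 +inbox" would commit roughly every 10,000
indexed messages and pack the Data.fs after every fifth commit; the
defaults are -t 20000 and -p 0 (never pack).
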
gvanrossum committed May 23, 2002
1 parent 5a51ea2 commit 21f3d62
Showing 1 changed file with 94 additions and 36 deletions.
130 changes: 94 additions & 36 deletions tests/mhindex.py
@@ -2,24 +2,27 @@

"""MH mail indexer."""

import os
import re
import sys
import time
import mhlib
import getopt
import traceback
from StringIO import StringIO
from stat import ST_MTIME

DATAFS = "/home/guido/.Data.fs"
ZOPECODE = "/home/guido/projects/ds9/lib/python"
DATAFS = "~/.Data.fs"
ZOPECODE = "~/projects/Zope/lib/python"

sys.path.append(ZOPECODE)
sys.path.append(os.path.expanduser(ZOPECODE))

from ZODB import DB
from ZODB.FileStorage import FileStorage
from Persistence import Persistent
from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree
from BTrees.IIBTree import IIBTree

from Products.ZCTextIndex.NBest import NBest
from Products.ZCTextIndex.OkapiIndex import OkapiIndex
@@ -33,7 +36,7 @@

def main():
try:
opts, args = getopt.getopt(sys.argv[1:], "bd:m:n:Opu")
opts, args = getopt.getopt(sys.argv[1:], "bd:m:n:Op:t:u")
except getopt.error, msg:
print msg
sys.exit(2)
@@ -42,8 +45,9 @@ def main():
optimize = 0
nbest = NBEST
maxlines = MAXLINES
datafs = DATAFS
datafs = os.path.expanduser(DATAFS)
pack = 0
trans = 20000
for o, a in opts:
if o == "-b":
bulk = 1
@@ -56,18 +60,18 @@ def main():
if o == "-O":
optimize = 1
if o == "-p":
pack = 1
pack = int(a)
if o == "-t":
trans = int(a)
if o == "-u":
update = 1
ix = Indexer(datafs, update or bulk)
ix = Indexer(datafs, writable=update or bulk, trans=trans, pack=pack)
if bulk:
if optimize:
ix.optimize(args)
ix.bulkupdate(args)
elif update:
ix.update(args)
if pack:
ix.pack()
elif args:
for i in range(len(args)):
a = args[i]
@@ -79,12 +83,18 @@ def main():
ix.query(" ".join(args), nbest, maxlines)
else:
ix.interact(nbest)
if pack:
ix.pack()

class Indexer:

filestorage = database = connection = root = None

def __init__(self, datafs, writable=0):
def __init__(self, datafs, writable=0, trans=0, pack=0):
self.trans_limit = trans
self.pack_limit = pack
self.trans_count = 0
self.pack_count = 0
self.stopdict = get_stopdict()
self.mh = mhlib.MH()
self.filestorage = FileStorage(datafs, read_only=(not writable))
@@ -99,6 +109,14 @@ def __init__(self, datafs, writable=0):
self.docpaths = self.root["docpaths"]
except KeyError:
self.docpaths = self.root["docpaths"] = IOBTree()
try:
self.doctimes = self.root["doctimes"]
except KeyError:
self.doctimes = self.root["doctimes"] = IIBTree()
try:
self.watchfolders = self.root["watchfolders"]
except KeyError:
self.watchfolders = self.root["watchfolders"] = {}
self.path2docid = OIBTree()
for docid in self.docpaths.keys():
path = self.docpaths[docid]
@@ -195,6 +213,7 @@ def formatresults(self, text, results, maxlines=MAXLINES,
path = self.docpaths[docid]
score = min(100, int(score * factor))
print "Rank: %d Score: %d%% File: %s" % (rank, score, path)
path = os.path.join(self.mh.getpath(), path)
fp = open(path)
msg = mhlib.Message("<folder>", 0, fp)
for header in "From", "To", "Cc", "Bcc", "Subject", "Date":
@@ -254,6 +273,7 @@ def update(self, args):
msgs.sort()

self.updatefolder(f, msgs)
self.commit()

def optimize(self, args):
uniqwords = {}
@@ -279,19 +299,14 @@ def prescan(self, f, msgs, uniqwords):
for n in msgs:
print "prescanning", n
m = f.openmessage(n)
text = self.getmessagetext(m)
text = self.getmessagetext(m, f.name)
for p in pipeline:
text = p.process(text)
for word in text:
uniqwords[word] = uniqwords.get(word, 0) + 1

def bulkupdate(self, args):
chunk = 5000
target = len(self.docpaths) + chunk
for folder in args:
if len(self.docpaths) >= target:
self.pack()
target = len(self.docpaths) + chunk
if folder.startswith("+"):
folder = folder[1:]
print "\nFOLDER", folder
@@ -302,46 +317,54 @@ def bulkupdate(self, args):
continue
self.updatefolder(f, f.listmessages())
print "Total", len(self.docpaths)
self.pack()
self.commit()
print "Indexed", self.index.lexicon._nbytes, "bytes and",
print self.index.lexicon._nwords, "words;",
print len(self.index.lexicon._words), "unique words."

def updatefolder(self, f, msgs):
done = 0
new = 0
self.watchfolders[f.name] = self.getmtime(f.name)
for n in msgs:
print "indexing", n
path = "%s/%s" % (f.name, n)
docid = self.path2docid.get(path, 0)
if docid and self.getmtime(path) == self.doctimes.get(docid, 0):
print "unchanged", docid, path
continue
docid = self.newdocid(path)
m = f.openmessage(n)
text = self.getmessagetext(m)
path = f.getmessagefilename(n)
self.unindexpath(path)
text = self.getmessagetext(m, f.name)
if not text:
self.unindexpath(path)
continue
docid = self.newdocid(path)
print "indexing", docid, path
self.index.index_text(docid, text)
done += 1
new = 1
if done%500 == 0:
self.commit()
new = 0
if new:
self.commit()
self.maycommit()
# Remove messages from the folder that no longer exist
for path in self.path2docid.keys(f.name):
if not path.startswith(f.name + "/"):
break
if self.getmtime(path) == 0:
self.unindexpath(path)
print "done."

def unindexpath(self, path):
if self.path2docid.has_key(path):
docid = self.path2docid[path]
print "unindexing", docid, path
del self.docpaths[docid]
del self.doctimes[docid]
del self.path2docid[path]
try:
self.index.unindex(docid)
except KeyError, msg:
print "KeyError", msg
self.maycommit()

def getmessagetext(self, m):
def getmessagetext(self, m, name=None):
L = []
if name:
L.append("_folder " + name) # To restrict search to a folder
self.getheaders(m, L)
try:
self.getmsgparts(m, L, 0)
except:
@@ -361,22 +384,57 @@ def getmsgparts(self, m, L, level):
elif ctype == "message/rfc822":
f = StringIO(m.getbodytext())
m = mhlib.Message("<folder>", 0, f)
self.getheaders(m, L)
self.getmsgparts(m, L, level+1)

def getheaders(self, m, L):
H = []
for key in "from", "to", "cc", "bcc", "subject":
value = m.get(key)
if value:
H.append(value)
if H:
L.append("\n".join(H))

def newdocid(self, path):
docid = self.path2docid.get(path)
if docid is not None:
self.doctimes[docid] = self.getmtime(path)
return docid
docid = self.maxdocid + 1
self.maxdocid = docid
self.docpaths[docid] = path
self.doctimes[docid] = self.getmtime(path)
self.path2docid[path] = docid
return docid

def getmtime(self, path):
path = os.path.join(self.mh.getpath(), path)
try:
st = os.stat(path)
except os.error, msg:
return 0
return st[ST_MTIME]

def maycommit(self):
self.trans_count += 1
if self.trans_count >= self.trans_limit > 0:
self.commit()

def commit(self):
print "committing..."
get_transaction().commit()
if self.trans_count > 0:
print "committing..."
get_transaction().commit()
self.trans_count = 0
self.pack_count += 1
if self.pack_count >= self.pack_limit > 0:
self.pack()

def pack(self):
print "packing..."
self.database.pack()
if self.pack_count > 0:
print "packing..."
self.database.pack()
self.pack_count = 0

class TextIndex(Persistent):
