/
index.py
810 lines (634 loc) · 28.3 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
#===============================================================================
# Copyright 2007 Matt Chaput
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#===============================================================================
"""Contains the main functions/classes for creating, maintaining, and using
an index.
"""
from __future__ import division
import os.path, re
from sys import byteorder
from bisect import bisect_right
import cPickle
from threading import Lock
from array import array
import generalcounter
from whoosh import fields, store
# Index name used when the caller doesn't specify one.
_DEF_INDEX_NAME = "MAIN"
# Pipe-separated list of file extensions used by segment files
# (matches the filenames built in Segment.__init__).
_EXTENSIONS = "dci|dcz|tiz|fvz"
# Version number written to / checked against the TOC file.
_index_version = -101
# Sizes (in bytes) of primitive array types on this architecture; recorded
# in the TOC so an index can't be read on an incompatible architecture.
_int_size = array("i").itemsize
_ulong_size = array("L").itemsize
_float_size = array("f").itemsize
# Exceptions
class OutOfDateError(Exception):
    """Raised when you try to commit changes to an index which is not
    the latest generation.
    """
class EmptyIndexError(Exception):
    """Raised when you try to work with an index that has no indexed terms.
    """
class IndexLockedError(Exception):
    """Raised when you try to write to or lock an already-locked index (or
    one that was accidentally left in a locked state).
    """
class IndexError(Exception):
    """Generic index error.

    NOTE(review): this name shadows the builtin IndexError within this
    module; renaming it would break existing callers, so it is kept as-is.
    """
# Utility functions
def _toc_pattern(indexname):
"""Returns a regular expression object that matches TOC filenames.
name is the name of the index.
"""
return re.compile("_%s_([0-9]+).toc" % indexname)
def _segment_pattern():
    """Returns a compiled regular expression object that matches segment
    filenames, with the segment name (number) captured in group 1 and the
    extension in group 2.
    """
    # The dot separating name and extension must be escaped; an unescaped
    # "." matches any character, so e.g. "12xdcz" would previously have
    # matched.
    return re.compile(r"([0-9]+)\.(%s)" % (_EXTENSIONS))
def getdatastoreindex(name, schema = None, indexname = None, **kwargs):
    """Convenience function to open (or create, if it doesn't exist yet) an
    index backed by a DataStoreStorage object.

    :name: passed through to store.DataStoreStorage.
    :schema: a fields.Schema object describing the index's fields.
    :indexname: the name of the index; you only need to specify this if you
        are keeping multiple indexes within the same storage object.

    If you specify both a schema and keyword arguments, the schema wins.
    Returns an Index object.
    """
    indexname = indexname or _DEF_INDEX_NAME
    storage = store.DataStoreStorage(name)
    try:
        # The index already exists: just open it.
        return Index(storage, indexname = indexname)
    except EmptyIndexError:
        # No index yet -- build a schema from the keyword arguments if one
        # wasn't supplied, then create the index.
        if schema is None:
            if not kwargs:
                raise Exception("You must specify either a schema or keyword arguments.")
            schema = fields.Schema(**kwargs)
        return Index(storage, schema = schema, indexname = indexname, create = True)
def create_in(dirname, schema = None, indexname = None, **kwargs):
    """Convenience function to create an index in a directory. Takes care of
    creating a FileStorage object for you.

    :dirname: the filename of the directory in which to create the index.
    :schema: a fields.Schema object describing the index's fields.
    :indexname: the name of the index to create; you only need to specify
        this if you are creating multiple indexes within the same storage
        object.

    If you specify both a schema and keyword arguments, the schema wins.
    Returns an Index object.
    """
    indexname = indexname or _DEF_INDEX_NAME
    storage = store.FileStorage(dirname)
    # Build a schema from the keyword arguments if one wasn't supplied.
    if schema is None:
        if not kwargs:
            raise Exception("You must specify either a schema or keyword arguments.")
        schema = fields.Schema(**kwargs)
    return Index(storage, schema = schema, indexname = indexname, create = True)
def open_dir(dirname, indexname = None):
    """Convenience function for opening an index in a directory. Takes care
    of creating a FileStorage object for you.

    :dirname: the filename of the directory containing the index.
    :indexname: the name of the index to open; you only need to specify this
        if you have multiple indexes within the same storage object.

    Returns an Index object.
    """
    if indexname is None:
        indexname = _DEF_INDEX_NAME
    storage = store.FileStorage(dirname)
    return Index(storage, indexname = indexname)
def exists_in(dirname, indexname = None):
    """Returns True if dirname contains a Whoosh index.

    :dirname: the path of the directory to check.
    :indexname: the name of the index to look for; defaults to the main
        index name.
    """
    if indexname is None:
        indexname = _DEF_INDEX_NAME
    if os.path.exists(dirname):
        try:
            # Bug fix: the indexname argument was previously ignored here,
            # so this function always checked the default index only.
            ix = open_dir(dirname, indexname = indexname)
            return ix.latest_generation() > -1
        except EmptyIndexError:
            pass
    return False
def exists(storage, indexname):
    """Returns True if the given storage object contains a non-empty index
    with the given name (falling back to the default index name when
    indexname is None).
    """
    if indexname is None:
        indexname = _DEF_INDEX_NAME
    try:
        ix = Index(storage, indexname = indexname)
        return ix.latest_generation() > -1
    except EmptyIndexError:
        return False
# A mix-in that adds methods for deleting
# documents from self.segments. These methods are on IndexWriter as
# well as Index for convenience, so they're broken out here.
class DeletionMixin(object):
    """Mix-in providing document-deletion methods that simply delegate to
    ``self.segments``. Shared by Index and IndexWriter for convenience.
    """

    def delete_document(self, docnum, delete = True):
        """Deletes (or, when delete is False, undeletes) a document by number."""
        self.segments.delete_document(docnum, delete = delete)

    def deleted_count(self):
        """Returns the total number of deleted documents in this index."""
        return self.segments.deleted_count()

    def is_deleted(self, docnum):
        """Returns True if the given document number is deleted but has not
        yet been optimized out of the index.
        """
        return self.segments.is_deleted(docnum)

    def has_deletions(self):
        """Returns True if this index contains documents that are marked
        deleted but haven't been optimized out of the index yet.
        """
        return self.segments.has_deletions()

    def delete_by_term(self, fieldname, text):
        """Deletes any documents containing "term" in the "fieldname"
        field. This is useful when you have an indexed field containing
        a unique ID (such as "pathname") for each document.

        :*returns*: the number of documents deleted.
        """
        from whoosh.query import Term
        return self.delete_by_query(Term(fieldname, text))

    def delete_by_query(self, q):
        """Deletes any documents matching a query object.

        :*returns*: the number of documents deleted.
        """
        deleted = 0
        for docnum in q.docs(self._searcher):
            self.delete_document(docnum)
            deleted += 1
        return deleted
# Index class
class Index(DeletionMixin):
    """Represents an indexed collection of documents.
    """

    def __init__(self, storage, schema = None, create = False, indexname = _DEF_INDEX_NAME):
        """
        :storage: The store.Storage object in which this index resides.
            See the store module for more details.
        :schema: A fields.Schema object defining the fields of this index. If you omit
            this argument for an existing index, the object will load the pickled Schema
            object that was saved with the index. If you are creating a new index
            (create = True), you must supply this argument.
        :create: Whether to create a new index. If this is True, you must supply
            a Schema instance using the schema keyword argument.
        :indexname: An optional name to use for the index. Use this if you need
            to keep multiple indexes in the same storage object.
        """
        self.storage = storage
        self.indexname = indexname

        if schema is not None and not isinstance(schema, fields.Schema):
            raise ValueError("%r is not a Schema object" % schema)

        self.generation = self.latest_generation()

        if create:
            if schema is None:
                raise IndexError("To create an index you must specify a schema")

            self.schema = schema
            self.generation = 0
            self.segment_counter = 0
            self.segments = SegmentSet()

            # Clear any files left over from a previous index with this
            # name so the new index starts from a clean slate.
            self.unlock()
            prefix = "_%s_" % self.indexname
            for filename in self.storage:
                if filename.startswith(prefix):
                    storage.delete_file(filename)

            self._write()
        elif self.generation >= 0:
            self._read(schema)
        else:
            raise EmptyIndexError

        # Open a searcher for this index. This is used by the
        # deletion methods, but mostly it's to keep the underlying
        # files open so they don't get deleted from underneath us.
        self._searcher = self.searcher()

        self.segment_num_lock = Lock()

    def __del__(self):
        # Close the internal searcher if it is still open. The hasattr
        # check guards against __init__ having failed before _searcher
        # was assigned.
        if hasattr(self, "_searcher") and self._searcher and not self._searcher.is_closed:
            self._searcher.close()

    def close(self):
        """Closes this index's internal searcher (and with it the
        underlying files it holds open).
        """
        self._searcher.close()

    def latest_generation(self):
        """Returns the generation number of the latest generation of this
        index, or -1 if no TOC files for this index name are found in the
        storage.
        """
        pattern = _toc_pattern(self.indexname)

        # (Local renamed from "max" to avoid shadowing the builtin.)
        latest = -1
        for filename in self.storage:
            m = pattern.match(filename)
            if m:
                num = int(m.group(1))
                if num > latest:
                    latest = num
        return latest

    def refresh(self):
        """Returns a new Index object representing the latest generation
        of this index (if this object is the latest generation, returns
        self).

        :*returns*: index.Index
        """
        if not self.up_to_date():
            return self.__class__(self.storage, indexname = self.indexname)
        else:
            return self

    def up_to_date(self):
        """Returns True if this object represents the latest generation of
        this index. Returns False if this object is not the latest
        generation (that is, someone else has updated the index since
        you opened this object).
        """
        return self.generation == self.latest_generation()

    def _write(self):
        # Writes the content of this index to the .toc file.
        for field in self.schema:
            field.clean()
        stream = self.storage.create_file(self._toc_filename())

        # Record primitive-type sizes and byte order so the index can be
        # rejected if it is later read on an incompatible architecture.
        stream.write_varint(_int_size)
        stream.write_varint(_ulong_size)
        stream.write_varint(_float_size)
        stream.write_string(byteorder)

        stream.write_int(_index_version)
        stream.write_string(cPickle.dumps(self.schema, -1))
        stream.write_int(self.generation)
        stream.write_int(self.segment_counter)
        stream.write_pickle(self.segments)
        stream.close()

    def _read(self, schema):
        # Reads the content of this index from the .toc file.
        stream = self.storage.open_file(self._toc_filename())

        if stream.read_varint() != _int_size or \
           stream.read_varint() != _ulong_size or \
           stream.read_varint() != _float_size or \
           stream.read_string() != byteorder:
            raise IndexError("Index was created on a different architecture")

        version = stream.read_int()
        if version != _index_version:
            raise IndexError("Don't know how to read index version %s" % version)

        # If the user supplied a schema object with the constructor,
        # don't load the pickled schema from the saved index.
        if schema:
            self.schema = schema
            stream.skip_string()
        else:
            self.schema = cPickle.loads(stream.read_string())

        generation = stream.read_int()
        assert generation == self.generation

        self.segment_counter = stream.read_int()
        self.segments = stream.read_pickle()
        stream.close()

    def _next_segment_name(self):
        # Returns the name of the next segment in sequence, using a shared
        # persistent counter.
        generalcounter.increment("nextsegment")
        return str(generalcounter.get_count("nextsegment"))

    def _toc_filename(self):
        # Returns the computed filename of the TOC for this
        # index name and generation.
        return "_%s_%s.toc" % (self.indexname, self.generation)

    def last_modified(self):
        """Returns the last modified time of the .toc file.
        """
        return self.storage.file_modified(self._toc_filename())

    def __repr__(self):
        return "%s(%r, %r)" % (self.__class__.__name__, self.storage, self.indexname)

    def lock(self):
        """Locks this index for writing, or raises an error if the index
        is already locked. Returns true if the index was successfully
        locked.
        """
        return self.storage.lock("_%s_LOCK" % self.indexname)

    def unlock(self):
        """Unlocks the index. Only call this if you were the one who locked
        it (without getting an exception) in the first place!
        """
        self.storage.unlock("_%s_LOCK" % self.indexname)

    def is_empty(self):
        """Returns True if this index is empty (that is, it has never
        had any documents successfully written to it).
        """
        return len(self.segments) == 0

    def optimize(self):
        """Optimizes this index's segments. This will fail if the index
        is already locked for writing.
        """
        # Nothing to do when there's at most one segment and no pending
        # deletions.
        if len(self.segments) < 2 and not self.segments.has_deletions():
            return

        from whoosh import writing
        w = writing.IndexWriter(self)
        w.commit(writing.OPTIMIZE)

    def commit(self, new_segments = None):
        """Commits pending edits (such as deletions) to this index object.
        Raises OutOfDateError if this index is not the latest generation
        (that is, if someone has updated the index since you opened
        this object).

        :new_segments: a replacement SegmentSet. This is used by
            IndexWriter to update the index after it finishes
            writing.
        """
        # Check for staleness BEFORE closing the searcher, so a failed
        # commit leaves this object in a usable state. (Previously the
        # searcher was closed first, so raising OutOfDateError left the
        # index with a closed searcher.)
        if not self.up_to_date():
            raise OutOfDateError

        self._searcher.close()

        if new_segments:
            self.segments = new_segments

        self.generation += 1
        self._write()
        self.clean_files()

        # Reopen a searcher on the new generation.
        self._searcher = self.searcher()

    def clean_files(self):
        """Attempts to remove unused index files (called when a new generation
        is created). If existing Index and/or reader objects have the files
        open, they may not get deleted immediately (i.e. on Windows)
        but will probably be deleted eventually by a later call to clean_files.
        """
        storage = self.storage
        current_segment_names = set([s.name for s in self.segments])

        tocpattern = _toc_pattern(self.indexname)
        segpattern = _segment_pattern()

        for filename in storage:
            m = tocpattern.match(filename)
            if m:
                # A TOC file from another generation: delete it.
                num = int(m.group(1))
                if num != self.generation:
                    try:
                        storage.delete_file(filename)
                    except OSError:
                        # Another process still has this file open.
                        # (Catching OSError instead of WindowsError covers
                        # Windows -- WindowsError subclasses OSError -- and
                        # avoids a NameError on platforms where WindowsError
                        # doesn't exist.)
                        pass
            else:
                # A segment file not referenced by the current segment set:
                # delete it.
                m = segpattern.match(filename)
                if m:
                    name = m.group(1)
                    if name not in current_segment_names:
                        try:
                            storage.delete_file(filename)
                        except OSError:
                            # Another process still has this file open
                            pass

    def doc_count_all(self):
        """Returns the total number of documents, DELETED OR UNDELETED,
        in this index.
        """
        return self.segments.doc_count_all()

    def doc_count(self):
        """Returns the total number of UNDELETED documents in this index.
        """
        return self.segments.doc_count()

    def max_weight(self):
        """Returns the maximum term weight in this index.
        This is used by some scoring algorithms.
        """
        return self.segments.max_weight()

    def field_length(self, fieldid):
        """Returns the total number of terms in a given field.
        This is used by some scoring algorithms. Note that this
        necessarily includes terms in deleted documents.
        """
        fieldnum = self.schema.to_number(fieldid)
        return sum(s.field_length(fieldnum) for s in self.segments)

    def term_reader(self):
        """Returns a TermReader object for this index.

        :*returns*: reading.TermReader
        """
        from whoosh import reading
        segments = self.segments

        if len(segments) == 1:
            # Single segment: no need for the multi-segment wrapper.
            return reading.TermReader(self.storage, segments[0], self.schema)
        else:
            term_readers = [reading.TermReader(self.storage, s, self.schema)
                            for s in segments]
            doc_offsets = segments.doc_offsets()
            return reading.MultiTermReader(term_readers, doc_offsets, self.schema)

    def doc_reader(self):
        """Returns a DocReader object for this index.

        :*returns*: reading.DocReader
        """
        from whoosh import reading
        schema = self.schema
        segments = self.segments

        if len(segments) == 1:
            return reading.DocReader(self.storage, segments[0], schema)
        else:
            doc_readers = [reading.DocReader(self.storage, s, self.schema)
                           for s in segments]
            doc_offsets = segments.doc_offsets()
            return reading.MultiDocReader(doc_readers, doc_offsets, schema)

    def searcher(self, **kwargs):
        """Returns a Searcher object for this index. Keyword arguments
        are passed to the Searcher object's constructor.

        :*returns*: searching.Searcher
        """
        from whoosh.searching import Searcher
        return Searcher(self, **kwargs)

    def writer(self, **kwargs):
        """Returns an IndexWriter object for this index.

        :*returns*: writing.IndexWriter
        """
        from whoosh.writing import IndexWriter
        return IndexWriter(self, **kwargs)

    def find(self, querystring, parser = None, **kwargs):
        """Parses querystring, runs the query in this index, and returns a
        Result object. Any additional keyword arguments are passed to
        Searcher.search() along with the parsed query.

        :querystring: The query string to parse and search for.
        :parser: A Parser object to use to parse 'querystring'.
            The default is to use a standard qparser.QueryParser.
            This object must implement a parse(str) method which returns a
            query.Query instance.
        :*returns*: searching.Results
        """
        if parser is None:
            from whoosh.qparser import QueryParser
            parser = QueryParser(self.schema)

        return self._searcher.search(parser.parse(querystring), **kwargs)
# SegmentSet object
class SegmentSet(object):
    """Used internally by the Index object to keep track of the segments in
    the index; never instantiated directly by user code.
    """

    def __init__(self, segments = None):
        self.segments = [] if segments is None else segments
        self._doc_offsets = self.doc_offsets()

    def __repr__(self):
        return repr(self.segments)

    def __len__(self):
        """:*returns*: the number of segments in this set."""
        return len(self.segments)

    def __iter__(self):
        return iter(self.segments)

    def __getitem__(self, n):
        return self.segments.__getitem__(n)

    def append(self, segment):
        """Adds a segment to this set."""
        self.segments.append(segment)
        self._doc_offsets = self.doc_offsets()

    def _document_segment(self, docnum):
        """Returns the position in self.segments of the segment containing
        the given (set-wide) document number.
        """
        offsets = self._doc_offsets
        if len(offsets) == 1:
            return 0
        return bisect_right(offsets, docnum) - 1

    def _segment_and_docnum(self, docnum):
        """Returns an (index.Segment, segment_docnum) pair for the
        segment containing the given document number.
        """
        i = self._document_segment(docnum)
        return self.segments[i], docnum - self._doc_offsets[i]

    def copy(self):
        """:*returns*: a deep copy of this set."""
        return self.__class__([seg.copy() for seg in self.segments])

    def doc_offsets(self):
        # Recomputes the document offset list. Must be called whenever
        # self.segments changes.
        offsets = []
        total = 0
        for seg in self.segments:
            offsets.append(total)
            total += seg.doc_count_all()
        return offsets

    def doc_count_all(self):
        """
        :*returns*: the total number of documents, DELETED or
            UNDELETED, in this set.
        """
        return sum(seg.doc_count_all() for seg in self.segments)

    def doc_count(self):
        """
        :*returns*: the number of undeleted documents in this set.
        """
        return sum(seg.doc_count() for seg in self.segments)

    def max_weight(self):
        """
        :*returns*: the maximum frequency of any term in the set.
        """
        if not self.segments:
            return 0
        return max(seg.max_weight for seg in self.segments)

    def has_deletions(self):
        """
        :*returns*: True if this index has documents that are marked
            deleted but haven't been optimized out of the index yet.
            This includes deletions that haven't been written to disk
            with Index.commit() yet.
        """
        return any(seg.has_deletions() for seg in self.segments)

    def delete_document(self, docnum, delete = True):
        """Deletes a document by number.
        You must call Index.commit() for the deletion to be written to disk.
        """
        segment, segdocnum = self._segment_and_docnum(docnum)
        segment.delete_document(segdocnum, delete = delete)

    def deleted_count(self):
        """
        :*returns*: the total number of deleted documents in this index.
        """
        return sum(seg.deleted_count() for seg in self.segments)

    def is_deleted(self, docnum):
        """
        :*returns*: True if a given document number is deleted but not yet
            optimized out of the index.
        """
        segment, segdocnum = self._segment_and_docnum(docnum)
        return segment.is_deleted(segdocnum)
class Segment(object):
    """Do not instantiate this object directly. It is used by the Index
    object to hold information about a segment. A list of objects of this
    class are pickled as part of the TOC file.

    The TOC file stores a minimal amount of information -- mostly a list of
    Segment objects. Segments are the real reverse indexes. Having multiple
    segments allows quick incremental indexing: just create a new segment for
    the new documents, and have the index overlay the new segment over previous
    ones for purposes of reading/search. "Optimizing" the index combines the
    contents of existing segments into one (removing any deleted documents
    along the way).
    """

    def __init__(self, name, max_doc, max_weight, field_length_totals,
                 deleted = None):
        """
        :name: The name of the segment (the Index object computes this from its
            name and the generation).
        :max_doc: The maximum document number in the segment.
        :max_weight: The maximum weight of any term in the segment. This is used
            by some scoring algorithms.
        :field_length_totals: A dictionary mapping field numbers to the total
            number of terms in that field across all documents in the segment.
        :deleted: A collection of deleted document numbers, or None
            if no deleted documents exist in this segment.
        """
        self.name = name
        self.max_doc = max_doc
        self.max_weight = max_weight
        self.field_length_totals = field_length_totals
        self.deleted = deleted

        # Filenames of the on-disk files making up this segment
        # (extensions must agree with the module-level _EXTENSIONS).
        self.doclen_filename = self.name + ".dci"
        self.docs_filename = self.name + ".dcz"
        self.term_filename = self.name + ".tiz"
        self.vector_filename = self.name + ".fvz"

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self.name)

    def copy(self):
        """Returns an independent copy of this segment.

        Bug fix: the mutable members (the field-length dictionary and the
        deleted-document set) are now copied rather than shared, so deleting
        documents in the copy no longer mutates the original (previously
        SegmentSet.copy() was not actually the deep copy it claimed to be).
        """
        if self.deleted is None:
            deleted = None
        else:
            deleted = set(self.deleted)
        return Segment(self.name, self.max_doc, self.max_weight,
                       dict(self.field_length_totals), deleted)

    def doc_count_all(self):
        """
        :*returns*: the total number of documents, DELETED OR UNDELETED,
            in this segment.
        """
        return self.max_doc

    def doc_count(self):
        """:*returns*: the number of (undeleted) documents in this segment."""
        return self.max_doc - self.deleted_count()

    def has_deletions(self):
        """:*returns*: True if any documents in this segment are deleted."""
        return self.deleted_count() > 0

    def deleted_count(self):
        """:*returns*: the total number of deleted documents in this segment."""
        if self.deleted is None: return 0
        return len(self.deleted)

    def field_length(self, fieldnum):
        """
        :fieldnum: the internal number of the field.
        :*returns*: the total number of terms in the given field across all
            documents in this segment.
        """
        return self.field_length_totals.get(fieldnum, 0)

    def delete_document(self, docnum, delete = True):
        """Deletes the given document number. The document is not actually
        removed from the index until it is optimized.

        :docnum: The document number to delete.
        :delete: If False, this undeletes a deleted document.
        :raises KeyError: if deleting an already-deleted document, or
            undeleting a document that isn't deleted.
        """
        if delete:
            if self.deleted is None:
                self.deleted = set()
            elif docnum in self.deleted:
                raise KeyError("Document %s in segment %r is already deleted"
                               % (docnum, self.name))

            self.deleted.add(docnum)
        else:
            if self.deleted is None or docnum not in self.deleted:
                raise KeyError("Document %s is not deleted" % docnum)

            self.deleted.remove(docnum)

    def is_deleted(self, docnum):
        """:*returns*: True if the given document number is deleted."""
        if self.deleted is None: return False
        return docnum in self.deleted
# Debugging functions
if __name__ == '__main__':
    # Placeholder: no command-line behavior is implemented yet.
    pass