Skip to content

Commit

Permalink
[opt](chinese) Chinese tokenizer lowercase interface (apache#201)
Browse files Browse the repository at this point in the history
  • Loading branch information
zzzxl1993 authored Mar 16, 2024
1 parent e9c7f1f commit fe7ecdb
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 5 deletions.
18 changes: 15 additions & 3 deletions src/core/CLucene/index/IndexWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1255,7 +1255,7 @@ void IndexWriter::resetMergeExceptions() {
void IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_dirs,
std::vector<lucene::store::Directory *> dest_dirs,
std::vector<std::vector<std::pair<uint32_t, uint32_t>>> trans_vec,
std::vector<uint32_t> dest_index_docs) {
std::vector<uint32_t> dest_index_docs, bool maybe_skip) {
CND_CONDITION(src_dirs.size() > 0, "Source directory not found.");
CND_CONDITION(dest_dirs.size() > 0, "Destination directory not found.");
this->_trans_vec = std::move(trans_vec);
Expand Down Expand Up @@ -1387,7 +1387,7 @@ void IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
}

/// merge terms
mergeTerms(hasProx);
mergeTerms(hasProx, maybe_skip);

/// merge null_bitmap
mergeNullBitmap(srcNullBitmapValues, nullBitmapIndexOutputList);
Expand Down Expand Up @@ -1613,7 +1613,7 @@ class postingQueue : public CL_NS(util)::PriorityQueue<DestDoc*,CL_NS(util)::Del

};

void IndexWriter::mergeTerms(bool hasProx) {
void IndexWriter::mergeTerms(bool hasProx, bool maybe_skip) {
auto queue = _CLNEW SegmentMergeQueue(readers.size());
auto numSrcIndexes = readers.size();
//std::vector<TermPositions *> postingsList(numSrcIndexes);
Expand Down Expand Up @@ -1664,6 +1664,18 @@ void IndexWriter::mergeTerms(bool hasProx) {
top = queue->top();
}

if (maybe_skip && smallestTerm) {
// Reports whether at least one character of the given wide-character view
// is classified as upper case by std::iswupper. An empty view yields false.
auto containsUpperCase = [](const std::wstring_view& text) {
    for (const wchar_t c : text) {
        // First upper-case character settles the answer; no need to scan on.
        if (std::iswupper(c) != 0) {
            return true;
        }
    }
    return false;
};

std::wstring_view ws_term(smallestTerm->text(), smallestTerm->textLength());
if (containsUpperCase(ws_term)) {
_CLTHROWA(CL_ERR_InvalidState, "need rewrite, skip index compaction");
}
}

std::vector<std::vector<uint32_t>> docDeltaBuffers(numDestIndexes);
std::vector<std::vector<uint32_t>> freqBuffers(numDestIndexes);
auto destPostingQueues = _CLNEW postingQueue(matchSize);
Expand Down
4 changes: 2 additions & 2 deletions src/core/CLucene/index/IndexWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -317,14 +317,14 @@ class CLUCENE_EXPORT IndexWriter:LUCENE_BASE {
void indexCompaction(std::vector<lucene::store::Directory*>& src_dirs,
std::vector<lucene::store::Directory*> dest_dirs,
std::vector<std::vector<std::pair<uint32_t, uint32_t>>> trans_vec,
std::vector<uint32_t> dest_index_docs);
std::vector<uint32_t> dest_index_docs, bool maybe_skip = false);

// create new fields info
void mergeFields(bool hasProx);
// write fields info file
void writeFields(lucene::store::Directory* d, std::string segment);
// merge terms and write files
void mergeTerms(bool hasProx);
void mergeTerms(bool hasProx, bool maybe_skip = false);
// merge null_bitmap
void mergeNullBitmap(std::vector<std::vector<uint32_t>> srcBitmapValues, std::vector<lucene::store::IndexOutput *> nullBitmapIndexOutputList);

Expand Down

0 comments on commit fe7ecdb

Please sign in to comment.