Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix TermGenerator not stopping stemmed terms. #173

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion xapian-applications/omega/index_file.cc
Expand Up @@ -181,7 +181,7 @@ index_add_default_filters()
// pod2text's output character set doesn't seem to be documented, but from
// inspecting the source it looks like it's probably iso-8859-1.
index_command("text/x-perl",
Filter("pod2text", "text/plain", "iso-8859-1", false));
Filter("perl -MPod::Text -e '$p = Pod::Text->new(); $p->no_errata_section(1); $p->parse_file($ARGV[0])'", "text/plain", "iso-8859-1", false));
// FIXME: -e0 means "UTF-8", but that results in "fi", "ff", "ffi", etc
// appearing as single ligatures. For European languages, it's actually
// better to use -e2 (ISO-8859-1) and then convert, so let's do that for
Expand Down
18 changes: 11 additions & 7 deletions xapian-core/queryparser/termgenerator_internal.cc
Expand Up @@ -267,18 +267,22 @@ TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc,
if (strategy == TermGenerator::STEM_NONE ||
!stemmer.internal.get()) return true;

// Note, this uses the lowercased term, but that's OK as we
// only want to avoid stemming terms starting with a digit.
if (strategy == TermGenerator::STEM_SOME && !should_stem(term)) {
return true;
}

// Add stemmed form without positional information.
const string& stem = stemmer(term);

// Don't add the stemmed form if the stem is in the stopword list.
if (strategy == TermGenerator::STEM_SOME) {
if (current_stop_mode == TermGenerator::STOP_STEMMED &&
(*stopper)(term))
(*stopper)(stem))
return true;

// Note, this uses the lowercased term, but that's OK as we
// only want to avoid stemming terms starting with a digit.
if (!should_stem(term)) return true;
}

// Add stemmed form without positional information.
const string& stem = stemmer(term);
if (rare(stem.empty())) return true;
string stemmed_term;
if (strategy != TermGenerator::STEM_ALL) {
Expand Down
23 changes: 23 additions & 0 deletions xapian-core/tests/api_termgen.cc
Expand Up @@ -878,3 +878,26 @@ DEFINE_TESTCASE(tg_max_word_length1, !backend) {

return true;
}

/// Index three plural words using the English stemmer with STEM_SOME and
/// STOP_STEMMED, where the stopword list contains "bowl".  The expected
/// termlist has Z-prefixed stemmed entries for "cups" and "mugs" but none for
/// "bowls", while all three unstemmed words keep their positions.
DEFINE_TESTCASE(stop_stemmed_terms, !backend) {
    // Stopword list handed to the SimpleStopper below.
    array<const char *, 3> stopwords = {{"bowl", "a", "an"}};

    Xapian::TermGenerator indexer;
    Xapian::Document target;

    indexer.set_stemmer(Xapian::Stem("en"));
    indexer.set_document(target);

    // Allocate the stopper and release() it straight into set_stopper().
    indexer.set_stopper(
	(new Xapian::SimpleStopper(stopwords.begin(),
				   stopwords.end()))->release());

    indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME);
    indexer.set_stopper_strategy(Xapian::TermGenerator::STOP_STEMMED);

    indexer.index_text("cups bowls mugs");

    // No "Zbowl" entry is expected; "bowls" itself is still present at
    // position 2.
    TEST_STRINGS_EQUAL(format_doc_termlist(target),
		       "Zcup:1 Zmug:1 bowls[2] cups[1] mugs[3]");

    return true;
}