Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

A partial patch for better tokenisation of mixed Chinese numbers #220

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
59 changes: 57 additions & 2 deletions xapian-core/queryparser/termgenerator_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,47 @@ check_infix(unsigned ch)
return 0;
}

/** Check whether @a ch is a Chinese numeral character.
 *
 *  Accepts the everyday numerals (zero to ten plus the multipliers hundred,
 *  thousand, ten thousand and hundred million) and the formal "financial"
 *  numerals used on cheques and invoices.  Where a numeral has distinct
 *  Simplified and Traditional forms, both code points are accepted.
 *
 *  @param ch	Unicode code point to test.
 *
 *  @return true if @a ch is one of the listed numeral characters, false
 *	    otherwise (including for ASCII digits).
 */
static inline bool
is_Chinese_digit(unsigned ch)
{
    switch (ch) {
	// Everyday numerals.  Unless noted otherwise the character is the
	// same in Simplified and Traditional Chinese.
	case 0x96f6: // ZERO
	case 0x4e00: // ONE
	case 0x4e8c: // TWO
	case 0x4e24: // TWO (alternative form used when counting, Simplified)
	case 0x5169: // TWO (alternative form used when counting, Traditional)
	case 0x4e09: // THREE
	case 0x56db: // FOUR
	case 0x4e94: // FIVE
	case 0x516d: // SIX
	case 0x4e03: // SEVEN
	case 0x516b: // EIGHT
	case 0x4e5d: // NINE
	case 0x5341: // TEN
	case 0x767e: // HUNDRED
	case 0x5343: // THOUSAND
	case 0x4e07: // TEN THOUSAND (Simplified)
	case 0x842c: // TEN THOUSAND (Traditional)
	case 0x4ebf: // HUNDRED MILLION (Simplified)
	case 0x5104: // HUNDRED MILLION (Traditional)

	// Formal ("financial") numerals.
	case 0x58f9: // FINANCIAL ONE
	case 0x8d30: // FINANCIAL TWO (Simplified)
	case 0x8cb3: // FINANCIAL TWO (Traditional)
	case 0x53c1: // FINANCIAL THREE (Simplified)
	case 0x53c4: // FINANCIAL THREE (Traditional)
	case 0x8086: // FINANCIAL FOUR
	case 0x4f0d: // FINANCIAL FIVE
	case 0x9646: // FINANCIAL SIX (Simplified)
	case 0x9678: // FINANCIAL SIX (Traditional)
	case 0x67d2: // FINANCIAL SEVEN
	case 0x634c: // FINANCIAL EIGHT
	case 0x7396: // FINANCIAL NINE
	case 0x62fe: // FINANCIAL TEN
	case 0x4f70: // FINANCIAL HUNDRED
	case 0x4edf: // FINANCIAL THOUSAND
	    return true;
    }
    return false;
}

static inline unsigned
check_infix_digit(unsigned ch)
{
Expand Down Expand Up @@ -229,9 +270,23 @@ parse_terms(Utf8Iterator itor, unsigned cjk_flags, bool with_positions,
do {
Unicode::append_utf8(term, ch);
prevch = ch;
if (++itor == Utf8Iterator() ||
(cjk_flags && CJK::codepoint_is_cjk(*itor)))
if (++itor == Utf8Iterator())
goto endofterm;
else if (cjk_flags) {
// Only handle mixed Chinese numbers which start
ojwb marked this conversation as resolved.
Show resolved Hide resolved
// with an Arabic digit immediately followed by a Chinese digit.
ojwb marked this conversation as resolved.
Show resolved Hide resolved
if (is_digit(prevch) && is_Chinese_digit(*itor)) {
ojwb marked this conversation as resolved.
Show resolved Hide resolved
do {
ch = *itor;
Unicode::append_utf8(term, ch);
} while (++itor != Utf8Iterator() &&
(is_digit(*itor) || is_Chinese_digit(*itor)));
ojwb marked this conversation as resolved.
Show resolved Hide resolved
goto endofterm;
}
else if (CJK::codepoint_is_cjk(*itor)) {
ojwb marked this conversation as resolved.
Show resolved Hide resolved
goto endofterm;
}
}
ch = check_wordchar(*itor);
} while (ch);

Expand Down
5 changes: 5 additions & 0 deletions xapian-core/tests/api_termgen.cc
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,11 @@ static const test test_simple[] = {
{ "", "극지라", "극[1] 극지:1 라[3] 지[2] 지라:1" },
{ "", "ウルス アップ", "ア[4] アッ:1 ウ[1] ウル:1 ス[3] ッ[5] ップ:1 プ[6] ル[2] ルス:1" },

// Test parsing mixed CJK numbers (currently Chinese only)
{ "", "2千3百", "2千3百[1]"},
{ "", "有2千三百", "2千三百[2] 有[1]"},
{ "", "我有2千3百零一块钱", "2千3百零一[3] 块[4] 块钱:1 我[1] 我有:1 有[2] 钱[5]"},

ojwb marked this conversation as resolved.
Show resolved Hide resolved
// Non-CJK in CJK-mode:
{ "", "hello World Test", "hello[1] test[3] world[2]" },

Expand Down