Skip to content

Commit

Permalink
Fix bug in segmenter involving short strings of complex scripts (#3341)
Browse files Browse the repository at this point in the history
  • Loading branch information
sffc committed Apr 18, 2023
1 parent 6af64d2 commit 502c9ff
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 8 deletions.
18 changes: 14 additions & 4 deletions components/segmenter/src/line.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1072,20 +1072,25 @@ where
iter.current_pos_data = start_point;
let breaks = complex_language_segment_str(iter.complex, &s);
iter.result_cache = breaks;
let mut i = iter.get_current_codepoint()?.len_utf8();
let first_pos = *iter.result_cache.first()?;
let mut i = left_codepoint.len_utf8();
loop {
if i == first_pos {
// Re-calculate breaking offset
iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
return iter.get_current_position();
}
debug_assert!(
i < first_pos,
"we should always arrive at first_pos: near index {:?}",
iter.get_current_position()
);
i += T::get_current_position_character_len(iter);
iter.advance_iter();
if iter.is_eof() {
iter.result_cache.clear();
return Some(iter.len);
}
i += T::get_current_position_character_len(iter);
}
}

Expand Down Expand Up @@ -1173,10 +1178,10 @@ impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeUtf16 {
iterator.iter = start_iter;
iterator.current_pos_data = start_point;
let breaks = complex_language_segment_utf16(iterator.complex, &s);
let mut i = 1;
iterator.result_cache = breaks;
// result_cache vector is utf-16 index that is in BMP.
let first_pos = *iterator.result_cache.first()?;
let mut i = 1;
loop {
if i == first_pos {
// Re-calculate breaking offset
Expand All @@ -1188,12 +1193,17 @@ impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeUtf16 {
.collect();
return iterator.get_current_position();
}
debug_assert!(
i < first_pos,
"we should always arrive at first_pos: near index {:?}",
iterator.get_current_position()
);
i += 1;
iterator.advance_iter();
if iterator.is_eof() {
iterator.result_cache.clear();
return Some(iterator.len);
}
i += 1;
}
}
}
Expand Down
18 changes: 14 additions & 4 deletions components/segmenter/src/word.rs
Original file line number Diff line number Diff line change
Expand Up @@ -461,19 +461,24 @@ where
let breaks = complex_language_segment_str(iter.complex.unwrap(), &s);
iter.result_cache = breaks;
let first_pos = *iter.result_cache.first()?;
let mut i = iter.get_current_codepoint()?.len_utf8();
let mut i = left_codepoint.len_utf8();
loop {
if i == first_pos {
// Re-calculate breaking offset
iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
return iter.get_current_position();
}
debug_assert!(
i < first_pos,
"we should always arrive at first_pos: near index {:?}",
iter.get_current_position()
);
i += T::get_current_position_character_len(iter);
iter.advance_iter();
if iter.is_eof() {
iter.result_cache.clear();
return Some(iter.len);
}
i += T::get_current_position_character_len(iter);
}
}

Expand Down Expand Up @@ -519,22 +524,27 @@ impl<'l, 's> RuleBreakType<'l, 's> for WordBreakTypeUtf16 {
iter.current_pos_data = start_point;
#[allow(clippy::unwrap_used)] // iter.complex present for word segmenter
let breaks = complex_language_segment_utf16(iter.complex.unwrap(), &s);
let mut i = 1;
iter.result_cache = breaks;
// result_cache vector is utf-16 index that is in BMP.
let first_pos = *iter.result_cache.first()?;
let mut i = 1;
loop {
if i == first_pos {
// Re-calculate breaking offset
iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
return iter.get_current_position();
}
debug_assert!(
i < first_pos,
"we should always arrive at first_pos: near index {:?}",
iter.get_current_position()
);
i += 1;
iter.advance_iter();
if iter.is_eof() {
iter.result_cache.clear();
return Some(iter.len);
}
i += 1;
}
}
}
Expand Down
58 changes: 58 additions & 0 deletions components/segmenter/tests/complex_word.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,61 @@ fn word_break_mixed_han() {
);
}
}

/// Segments a Thai sentence that also contains Latin and Han text with the
/// "auto" word and line segmenters, and checks the break offsets for both
/// UTF-8 (`segment_str`) and UTF-16 (`segment_utf16`) input.
///
/// All four combinations (word/line x UTF-8/UTF-16) are asserted so that the
/// `str` and UTF-16 complex-script code paths of both segmenters are covered.
#[test]
fn word_line_th_wikipedia_auto() {
    use icu_segmenter::LineSegmenter;

    let text = "แพนด้าแดง (อังกฤษ: Red panda, Shining cat; จีน: 小熊貓; พินอิน: Xiǎo xióngmāo) สัตว์เลี้ยงลูกด้วยนมชนิดหนึ่ง มีชื่อวิทยาศาสตร์ว่า Ailurus fulgens";
    // Sanity-check the fixture so the hard-coded offsets below stay meaningful.
    assert_eq!(text.len(), 297);
    let utf16: Vec<u16> = text.encode_utf16().collect();
    assert_eq!(utf16.len(), 142);

    let segmenter_word_auto =
        WordSegmenter::try_new_auto_unstable(&icu_testdata::unstable()).expect("Data exists");
    let segmenter_line_auto =
        LineSegmenter::try_new_auto_unstable(&icu_testdata::unstable()).expect("Data exists");

    // Word segmentation, UTF-8 byte offsets.
    let breakpoints_word_utf8 = segmenter_word_auto.segment_str(text).collect::<Vec<_>>();
    assert_eq!(
        breakpoints_word_utf8,
        [
            0, 9, 18, 27, 28, 29, 38, 47, 48, 49, 52, 53, 58, 59, 60, 67, 68, 71, 72, 73, 82, 83,
            84, 90, 93, 94, 95, 104, 113, 114, 115, 120, 121, 131, 132, 133, 148, 166, 175, 187,
            193, 205, 220, 221, 227, 239, 272, 281, 282, 289, 290, 297
        ]
    );

    // Line segmentation, UTF-8 byte offsets.
    let breakpoints_line_utf8 = segmenter_line_auto.segment_str(text).collect::<Vec<_>>();
    assert_eq!(
        breakpoints_line_utf8,
        [
            0, 9, 18, 27, 28, 38, 47, 49, 53, 60, 68, 73, 82, 84, 87, 90, 95, 104, 113, 115, 121,
            133, 148, 166, 175, 187, 193, 205, 220, 221, 227, 239, 272, 281, 282, 290, 297
        ]
    );

    // Word segmentation, UTF-16 code-unit offsets.
    let breakpoints_word_utf16 = segmenter_word_auto
        .segment_utf16(&utf16)
        .collect::<Vec<_>>();
    assert_eq!(
        breakpoints_word_utf16,
        [
            0, 3, 6, 9, 10, 11, 14, 17, 18, 19, 22, 23, 28, 29, 30, 37, 38, 41, 42, 43, 46, 47, 48,
            50, 51, 52, 53, 56, 59, 60, 61, 65, 66, 74, 75, 76, 81, 87, 90, 94, 96, 100, 105, 106,
            108, 112, 123, 126, 127, 134, 135, 142
        ]
    );

    // Line segmentation, UTF-16 code-unit offsets. The expected values are the
    // UTF-8 line break offsets above converted to UTF-16 code-unit offsets, so
    // both encodings must agree on where the breaks fall.
    let breakpoints_line_utf16 = segmenter_line_auto
        .segment_utf16(&utf16)
        .collect::<Vec<_>>();
    assert_eq!(
        breakpoints_line_utf16,
        [
            0, 3, 6, 9, 10, 14, 17, 19, 23, 30, 38, 43, 46, 48, 49, 50, 53, 56, 59, 61, 66, 76, 81,
            87, 90, 94, 96, 100, 105, 106, 108, 112, 123, 126, 127, 135, 142
        ]
    );
}

0 comments on commit 502c9ff

Please sign in to comment.