Skip to content

Commit

Permalink
segmenter
Browse files Browse the repository at this point in the history
  • Loading branch information
robertbastian committed Jun 23, 2023
1 parent 9c7be5b commit 1d41bba
Show file tree
Hide file tree
Showing 30 changed files with 593 additions and 379 deletions.
4 changes: 3 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 5 additions & 2 deletions components/segmenter/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ license = "Unicode-DFS-2016"
categories = ["internationalization"]
# Keep this in sync with other crates unless there are exceptions
include = [
"data/*",
"src/**/*",
"examples/**/*",
"benches/**/*",
Expand All @@ -42,9 +41,12 @@ serde = { version = "1.0", default-features = false, features = ["derive", "allo

libm = { version = "0.2", default-features = false, optional = true }

icu_segmenter_data = { path = "data", optional = true }
icu_locid_transform = { path = "../../components/locid_transform", features = ["data"], optional = true }

[dev-dependencies]
criterion = "0.4"
icu_testdata = { path = "../../provider/testdata", default-features = false, features = ["buffer", "icu_segmenter", "icu_properties", "icu_locid_transform"] }
icu_properties = { path = "../properties", features = ["data"] }
serde = { version = "1.0", default-features = false, features = ["derive"] }
serde_json = "1.0"
icu = { path = "../../components/icu" }
Expand All @@ -60,6 +62,7 @@ serde = ["dep:serde", "zerovec/serde", "icu_collections/serde", "icu_provider/se
datagen = ["serde", "dep:databake", "zerovec/databake", "icu_collections/databake"]
lstm = ["dep:libm"]
auto = ["lstm"] # Enables the try_new_auto_unstable constructors
data = ["dep:icu_segmenter_data"]

[lib]
path = "src/lib.rs"
Expand Down
12 changes: 4 additions & 8 deletions components/segmenter/README.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

33 changes: 10 additions & 23 deletions components/segmenter/benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,12 @@ const TEST_STR_TH: &str =
fn line_break_iter_latin1(c: &mut Criterion) {
let mut group = c.benchmark_group("Line Break/Latin1");

let segmenter =
LineSegmenter::try_new_dictionary_unstable(&icu_testdata::unstable()).expect("Data exists");
let segmenter = LineSegmenter::new_dictionary();

let mut options = LineBreakOptions::default();
options.strictness = LineBreakStrictness::Anywhere;
options.word_option = LineBreakWordOption::BreakAll;
let segmenter_css =
LineSegmenter::try_new_dictionary_with_options_unstable(&icu_testdata::unstable(), options)
.expect("Data exists");
let segmenter_css = LineSegmenter::new_dictionary_with_options(options);

group.bench_function("En", |b| {
b.iter(|| {
Expand All @@ -47,19 +44,14 @@ fn line_break_iter_latin1(c: &mut Criterion) {
fn line_break_iter_utf8(c: &mut Criterion) {
let mut group = c.benchmark_group("Line Break/UTF8");

let segmenter_auto =
LineSegmenter::try_new_auto_unstable(&icu_testdata::unstable()).expect("Data exists");
let segmenter_lstm =
LineSegmenter::try_new_lstm_unstable(&icu_testdata::unstable()).expect("Data exists");
let segmenter_dictionary =
LineSegmenter::try_new_dictionary_unstable(&icu_testdata::unstable()).expect("Data exists");
let segmenter_auto = LineSegmenter::new_auto();
let segmenter_lstm = LineSegmenter::new_lstm();
let segmenter_dictionary = LineSegmenter::new_dictionary();

let mut options = LineBreakOptions::default();
options.strictness = LineBreakStrictness::Anywhere;
options.word_option = LineBreakWordOption::BreakAll;
let segmenter_css_dictionary =
LineSegmenter::try_new_dictionary_with_options_unstable(&icu_testdata::unstable(), options)
.expect("Data exists");
let segmenter_css_dictionary = LineSegmenter::new_dictionary_with_options(options);

// No need to test "auto", "lstm", or "dictionary" constructor variants since English uses only
// UAX14 rules for line breaking.
Expand Down Expand Up @@ -101,19 +93,14 @@ fn line_break_iter_utf16(c: &mut Criterion) {
let utf16_en: Vec<u16> = TEST_STR_EN.encode_utf16().collect();
let utf16_th: Vec<u16> = TEST_STR_TH.encode_utf16().collect();

let segmenter_auto =
LineSegmenter::try_new_auto_unstable(&icu_testdata::unstable()).expect("Data exists");
let segmenter_lstm =
LineSegmenter::try_new_lstm_unstable(&icu_testdata::unstable()).expect("Data exists");
let segmenter_dictionary =
LineSegmenter::try_new_dictionary_unstable(&icu_testdata::unstable()).expect("Data exists");
let segmenter_auto = LineSegmenter::new_auto();
let segmenter_lstm = LineSegmenter::new_lstm();
let segmenter_dictionary = LineSegmenter::new_dictionary();

let mut options = LineBreakOptions::default();
options.strictness = LineBreakStrictness::Anywhere;
options.word_option = LineBreakWordOption::BreakAll;
let segmenter_css_dictionary =
LineSegmenter::try_new_dictionary_with_options_unstable(&icu_testdata::unstable(), options)
.expect("Data exists");
let segmenter_css_dictionary = LineSegmenter::new_dictionary_with_options(options);

// No need to test "auto", "lstm", or "dictionary" constructor variants since English uses only
// UAX14 rules for line breaking.
Expand Down
38 changes: 9 additions & 29 deletions components/segmenter/src/complex/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -179,22 +179,10 @@ mod tests {
use super::*;
use crate::{LineSegmenter, WordSegmenter};
use icu_provider::prelude::*;
use icu_provider_adapters::fork::ForkByKeyProvider;
use icu_provider_fs::FsDataProvider;
use std::path::PathBuf;

/// Test helper that builds the buffer provider used by the dictionary
/// segmenter tests in this module.
///
/// It creates an `FsDataProvider` over the `tests/testdata/provider`
/// directory inside this crate's manifest directory (resolved at compile time
/// through `CARGO_MANIFEST_DIR`) and pairs it with `icu_testdata::buffer()`
/// in a `ForkByKeyProvider`.
///
/// # Panics
///
/// Panics (via `unwrap`) if `FsDataProvider::try_new` does not succeed for
/// that directory.
fn get_segmenter_testdata_provider() -> impl BufferProvider {
let segmenter_fs_provider = FsDataProvider::try_new(
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/testdata/provider"),
)
.unwrap();
// NOTE(review): presumably the fork consults the filesystem provider first and
// falls back to `icu_testdata` for keys it does not serve — confirm against
// the `ForkByKeyProvider` documentation.
ForkByKeyProvider::new(segmenter_fs_provider, icu_testdata::buffer())
}

#[test]
fn burmese_dictionary_test() {
let provider = get_segmenter_testdata_provider();
let segmenter = LineSegmenter::try_new_dictionary_with_buffer_provider(&provider).unwrap();
let segmenter = LineSegmenter::new_dictionary();
// From css/css-text/word-break/word-break-normal-my-000.html
let s = "မြန်မာစာမြန်မာစာမြန်မာစာ";
let result: Vec<usize> = segmenter.segment_str(s).collect();
Expand All @@ -207,25 +195,19 @@ mod tests {

#[test]
fn cj_dictionary_test() {
let provider = get_segmenter_testdata_provider();
let dict_payload: DataPayload<DictionaryForWordOnlyAutoV1Marker> = provider
.as_deserializing()
let dict_payload: DataPayload<DictionaryForWordOnlyAutoV1Marker> = crate::provider::Baked
.load(DataRequest {
locale: &icu_locid::locale!("ja").into(),
metadata: Default::default(),
})
.unwrap()
.take_payload()
.unwrap();
let grph_payload: DataPayload<GraphemeClusterBreakDataV1Marker> = provider
.as_deserializing()
.load(Default::default())
.unwrap()
.take_payload()
.unwrap();
let word_segmenter =
WordSegmenter::try_new_dictionary_with_buffer_provider(&provider).unwrap();
let dict_segmenter = DictionarySegmenter::new(dict_payload.get(), grph_payload.get());
let word_segmenter = WordSegmenter::new_dictionary();
let dict_segmenter = DictionarySegmenter::new(
dict_payload.get(),
crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
);

// Match case
let s = "龟山岛龟山岛";
Expand Down Expand Up @@ -262,8 +244,7 @@ mod tests {

#[test]
fn khmer_dictionary_test() {
let provider = get_segmenter_testdata_provider();
let segmenter = LineSegmenter::try_new_dictionary_with_buffer_provider(&provider).unwrap();
let segmenter = LineSegmenter::new_dictionary();
let s = "ភាសាខ្មែរភាសាខ្មែរភាសាខ្មែរ";
let result: Vec<usize> = segmenter.segment_str(s).collect();
assert_eq!(result, vec![0, 27, 54, 81]);
Expand All @@ -275,8 +256,7 @@ mod tests {

#[test]
fn lao_dictionary_test() {
let provider = get_segmenter_testdata_provider();
let segmenter = LineSegmenter::try_new_dictionary_with_buffer_provider(&provider).unwrap();
let segmenter = LineSegmenter::new_dictionary();
let s = "ພາສາລາວພາສາລາວພາສາລາວ";
let r: Vec<usize> = segmenter.segment_str(s).collect();
assert_eq!(r, vec![0, 12, 21, 33, 42, 54, 63]);
Expand Down
14 changes: 5 additions & 9 deletions components/segmenter/src/complex/lstm/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -355,22 +355,18 @@ mod tests {

#[test]
fn segment_file_by_lstm() {
let lstm: DataPayload<LstmForWordLineAutoV1Marker> = icu_testdata::buffer()
.as_deserializing()
let lstm: DataPayload<LstmForWordLineAutoV1Marker> = crate::provider::Baked
.load(DataRequest {
locale: &locale!("th").into(),
metadata: Default::default(),
})
.unwrap()
.take_payload()
.unwrap();
let grapheme: DataPayload<GraphemeClusterBreakDataV1Marker> = icu_testdata::buffer()
.as_deserializing()
.load(Default::default())
.unwrap()
.take_payload()
.unwrap();
let lstm = LstmSegmenter::new(lstm.get(), grapheme.get());
let lstm = LstmSegmenter::new(
lstm.get(),
crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
);

// Importing the test data
let test_text_data = load_test_text(&format!(
Expand Down
Loading

0 comments on commit 1d41bba

Please sign in to comment.