diff --git a/Cargo.lock b/Cargo.lock index 5699aa81344..4a229e58bef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1986,10 +1986,12 @@ dependencies = [ "icu", "icu_collections", "icu_locid", + "icu_locid_transform", + "icu_properties", "icu_provider", "icu_provider_adapters", "icu_provider_fs", - "icu_testdata", + "icu_segmenter_data", "itertools", "libm", "serde", diff --git a/components/segmenter/Cargo.toml b/components/segmenter/Cargo.toml index 9cfe15fd63c..ab298b4b85b 100644 --- a/components/segmenter/Cargo.toml +++ b/components/segmenter/Cargo.toml @@ -16,7 +16,6 @@ license = "Unicode-DFS-2016" categories = ["internationalization"] # Keep this in sync with other crates unless there are exceptions include = [ - "data/*", "src/**/*", "examples/**/*", "benches/**/*", @@ -42,9 +41,12 @@ serde = { version = "1.0", default-features = false, features = ["derive", "allo libm = { version = "0.2", default-features = false, optional = true } +icu_segmenter_data = { path = "data", optional = true } +icu_locid_transform = { path = "../../components/locid_transform", features = ["data"], optional = true } + [dev-dependencies] criterion = "0.4" -icu_testdata = { path = "../../provider/testdata", default-features = false, features = ["buffer", "icu_segmenter", "icu_properties", "icu_locid_transform"] } +icu_properties = { path = "../properties", features = ["data"] } serde = { version = "1.0", default-features = false, features = ["derive"] } serde_json = "1.0" icu = { path = "../../components/icu" } @@ -60,6 +62,7 @@ serde = ["dep:serde", "zerovec/serde", "icu_collections/serde", "icu_provider/se datagen = ["serde", "dep:databake", "zerovec/databake", "icu_collections/databake"] lstm = ["dep:libm"] auto = ["lstm"] # Enabled try_new_auto_unstable constructors +data = ["dep:icu_segmenter_data"] [lib] path = "src/lib.rs" diff --git a/components/segmenter/README.md b/components/segmenter/README.md index 84fe6f710d5..cef16239bda 100644 --- a/components/segmenter/README.md +++ 
b/components/segmenter/README.md @@ -28,8 +28,7 @@ Find line break opportunities: use icu::segmenter::LineSegmenter; let segmenter = - LineSegmenter::try_new_auto_unstable(&icu_testdata::unstable()) - .expect("Data exists"); + LineSegmenter::new_auto(); let breakpoints: Vec = segmenter .segment_str("Hello World. Xin chào thế giới!") @@ -47,8 +46,7 @@ Find all grapheme cluster boundaries: use icu::segmenter::GraphemeClusterSegmenter; let segmenter = - GraphemeClusterSegmenter::try_new_unstable(&icu_testdata::unstable()) - .expect("Data exists"); + GraphemeClusterSegmenter::new(); let breakpoints: Vec = segmenter .segment_str("Hello World. Xin chào thế giới!") @@ -72,8 +70,7 @@ Find all word boundaries: use icu::segmenter::WordSegmenter; let segmenter = - WordSegmenter::try_new_auto_unstable(&icu_testdata::unstable()) - .expect("Data exists"); + WordSegmenter::new_auto(); let breakpoints: Vec = segmenter .segment_str("Hello World. Xin chào thế giới!") @@ -94,8 +91,7 @@ Segment the string into sentences: use icu::segmenter::SentenceSegmenter; let segmenter = - SentenceSegmenter::try_new_unstable(&icu_testdata::unstable()) - .expect("Data exists"); + SentenceSegmenter::new(); let breakpoints: Vec = segmenter .segment_str("Hello World. 
Xin chào thế giới!") diff --git a/components/segmenter/benches/bench.rs b/components/segmenter/benches/bench.rs index 1040b9b08c5..0806dadd52e 100644 --- a/components/segmenter/benches/bench.rs +++ b/components/segmenter/benches/bench.rs @@ -17,15 +17,12 @@ const TEST_STR_TH: &str = fn line_break_iter_latin1(c: &mut Criterion) { let mut group = c.benchmark_group("Line Break/Latin1"); - let segmenter = - LineSegmenter::try_new_dictionary_unstable(&icu_testdata::unstable()).expect("Data exists"); + let segmenter = LineSegmenter::new_dictionary(); let mut options = LineBreakOptions::default(); options.strictness = LineBreakStrictness::Anywhere; options.word_option = LineBreakWordOption::BreakAll; - let segmenter_css = - LineSegmenter::try_new_dictionary_with_options_unstable(&icu_testdata::unstable(), options) - .expect("Data exists"); + let segmenter_css = LineSegmenter::new_dictionary_with_options(options); group.bench_function("En", |b| { b.iter(|| { @@ -47,19 +44,14 @@ fn line_break_iter_latin1(c: &mut Criterion) { fn line_break_iter_utf8(c: &mut Criterion) { let mut group = c.benchmark_group("Line Break/UTF8"); - let segmenter_auto = - LineSegmenter::try_new_auto_unstable(&icu_testdata::unstable()).expect("Data exists"); - let segmenter_lstm = - LineSegmenter::try_new_lstm_unstable(&icu_testdata::unstable()).expect("Data exists"); - let segmenter_dictionary = - LineSegmenter::try_new_dictionary_unstable(&icu_testdata::unstable()).expect("Data exists"); + let segmenter_auto = LineSegmenter::new_auto(); + let segmenter_lstm = LineSegmenter::new_lstm(); + let segmenter_dictionary = LineSegmenter::new_dictionary(); let mut options = LineBreakOptions::default(); options.strictness = LineBreakStrictness::Anywhere; options.word_option = LineBreakWordOption::BreakAll; - let segmenter_css_dictionary = - LineSegmenter::try_new_dictionary_with_options_unstable(&icu_testdata::unstable(), options) - .expect("Data exists"); + let segmenter_css_dictionary = 
LineSegmenter::new_dictionary_with_options(options); // No need to test "auto", "lstm", or "dictionary" constructor variants since English uses only // UAX14 rules for line breaking. @@ -101,19 +93,14 @@ fn line_break_iter_utf16(c: &mut Criterion) { let utf16_en: Vec = TEST_STR_EN.encode_utf16().collect(); let utf16_th: Vec = TEST_STR_TH.encode_utf16().collect(); - let segmenter_auto = - LineSegmenter::try_new_auto_unstable(&icu_testdata::unstable()).expect("Data exists"); - let segmenter_lstm = - LineSegmenter::try_new_lstm_unstable(&icu_testdata::unstable()).expect("Data exists"); - let segmenter_dictionary = - LineSegmenter::try_new_dictionary_unstable(&icu_testdata::unstable()).expect("Data exists"); + let segmenter_auto = LineSegmenter::new_auto(); + let segmenter_lstm = LineSegmenter::new_lstm(); + let segmenter_dictionary = LineSegmenter::new_dictionary(); let mut options = LineBreakOptions::default(); options.strictness = LineBreakStrictness::Anywhere; options.word_option = LineBreakWordOption::BreakAll; - let segmenter_css_dictionary = - LineSegmenter::try_new_dictionary_with_options_unstable(&icu_testdata::unstable(), options) - .expect("Data exists"); + let segmenter_css_dictionary = LineSegmenter::new_dictionary_with_options(options); // No need to test "auto", "lstm", or "dictionary" constructor variants since English uses only // UAX14 rules for line breaking. 
diff --git a/components/segmenter/src/complex/dictionary.rs b/components/segmenter/src/complex/dictionary.rs index 80416c1762a..90360ee2b0f 100644 --- a/components/segmenter/src/complex/dictionary.rs +++ b/components/segmenter/src/complex/dictionary.rs @@ -179,22 +179,10 @@ mod tests { use super::*; use crate::{LineSegmenter, WordSegmenter}; use icu_provider::prelude::*; - use icu_provider_adapters::fork::ForkByKeyProvider; - use icu_provider_fs::FsDataProvider; - use std::path::PathBuf; - - fn get_segmenter_testdata_provider() -> impl BufferProvider { - let segmenter_fs_provider = FsDataProvider::try_new( - PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/testdata/provider"), - ) - .unwrap(); - ForkByKeyProvider::new(segmenter_fs_provider, icu_testdata::buffer()) - } #[test] fn burmese_dictionary_test() { - let provider = get_segmenter_testdata_provider(); - let segmenter = LineSegmenter::try_new_dictionary_with_buffer_provider(&provider).unwrap(); + let segmenter = LineSegmenter::new_dictionary(); // From css/css-text/word-break/word-break-normal-my-000.html let s = "မြန်မာစာမြန်မာစာမြန်မာစာ"; let result: Vec = segmenter.segment_str(s).collect(); @@ -207,9 +195,7 @@ mod tests { #[test] fn cj_dictionary_test() { - let provider = get_segmenter_testdata_provider(); - let dict_payload: DataPayload = provider - .as_deserializing() + let dict_payload: DataPayload = crate::provider::Baked .load(DataRequest { locale: &icu_locid::locale!("ja").into(), metadata: Default::default(), @@ -217,15 +203,11 @@ mod tests { .unwrap() .take_payload() .unwrap(); - let grph_payload: DataPayload = provider - .as_deserializing() - .load(Default::default()) - .unwrap() - .take_payload() - .unwrap(); - let word_segmenter = - WordSegmenter::try_new_dictionary_with_buffer_provider(&provider).unwrap(); - let dict_segmenter = DictionarySegmenter::new(dict_payload.get(), grph_payload.get()); + let word_segmenter = WordSegmenter::new_dictionary(); + let dict_segmenter = 
DictionarySegmenter::new( + dict_payload.get(), + crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1, + ); // Match case let s = "龟山岛龟山岛"; @@ -262,8 +244,7 @@ mod tests { #[test] fn khmer_dictionary_test() { - let provider = get_segmenter_testdata_provider(); - let segmenter = LineSegmenter::try_new_dictionary_with_buffer_provider(&provider).unwrap(); + let segmenter = LineSegmenter::new_dictionary(); let s = "ភាសាខ្មែរភាសាខ្មែរភាសាខ្មែរ"; let result: Vec = segmenter.segment_str(s).collect(); assert_eq!(result, vec![0, 27, 54, 81]); @@ -275,8 +256,7 @@ mod tests { #[test] fn lao_dictionary_test() { - let provider = get_segmenter_testdata_provider(); - let segmenter = LineSegmenter::try_new_dictionary_with_buffer_provider(&provider).unwrap(); + let segmenter = LineSegmenter::new_dictionary(); let s = "ພາສາລາວພາສາລາວພາສາລາວ"; let r: Vec = segmenter.segment_str(s).collect(); assert_eq!(r, vec![0, 12, 21, 33, 42, 54, 63]); diff --git a/components/segmenter/src/complex/lstm/mod.rs b/components/segmenter/src/complex/lstm/mod.rs index c9f72cbd740..2ce5f6e1810 100644 --- a/components/segmenter/src/complex/lstm/mod.rs +++ b/components/segmenter/src/complex/lstm/mod.rs @@ -355,8 +355,7 @@ mod tests { #[test] fn segment_file_by_lstm() { - let lstm: DataPayload = icu_testdata::buffer() - .as_deserializing() + let lstm: DataPayload = crate::provider::Baked .load(DataRequest { locale: &locale!("th").into(), metadata: Default::default(), @@ -364,13 +363,10 @@ mod tests { .unwrap() .take_payload() .unwrap(); - let grapheme: DataPayload = icu_testdata::buffer() - .as_deserializing() - .load(Default::default()) - .unwrap() - .take_payload() - .unwrap(); - let lstm = LstmSegmenter::new(lstm.get(), grapheme.get()); + let lstm = LstmSegmenter::new( + lstm.get(), + crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1, + ); // Importing the test data let test_text_data = load_test_text(&format!( diff --git a/components/segmenter/src/complex/mod.rs 
b/components/segmenter/src/complex/mod.rs index 1d2a63f462a..978e19f61df 100644 --- a/components/segmenter/src/complex/mod.rs +++ b/components/segmenter/src/complex/mod.rs @@ -67,6 +67,35 @@ impl ComplexPayloads { } } + #[cfg(feature = "lstm")] + #[cfg(feature = "data")] + pub(crate) fn new_lstm() -> Self { + #[allow(clippy::unwrap_used)] + // try_load is infallible if the provider only returns `MissingLocale`. + Self { + grapheme: DataPayload::from_static_ref( + crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1, + ), + my: try_load::(&crate::provider::Baked, locale!("my")) + .unwrap() + .map(DataPayload::cast) + .map(Err), + km: try_load::(&crate::provider::Baked, locale!("km")) + .unwrap() + .map(DataPayload::cast) + .map(Err), + lo: try_load::(&crate::provider::Baked, locale!("lo")) + .unwrap() + .map(DataPayload::cast) + .map(Err), + th: try_load::(&crate::provider::Baked, locale!("th")) + .unwrap() + .map(DataPayload::cast) + .map(Err), + ja: None, + } + } + #[cfg(feature = "lstm")] pub(crate) fn try_new_lstm(provider: &D) -> Result where @@ -92,6 +121,51 @@ impl ComplexPayloads { }) } + #[cfg(feature = "data")] + pub(crate) fn new_dict() -> Self { + #[allow(clippy::unwrap_used)] + // try_load is infallible if the provider only returns `MissingLocale`. 
+ Self { + grapheme: DataPayload::from_static_ref( + crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1, + ), + my: try_load::( + &crate::provider::Baked, + locale!("my"), + ) + .unwrap() + .map(DataPayload::cast) + .map(Ok), + km: try_load::( + &crate::provider::Baked, + locale!("km"), + ) + .unwrap() + .map(DataPayload::cast) + .map(Ok), + lo: try_load::( + &crate::provider::Baked, + locale!("lo"), + ) + .unwrap() + .map(DataPayload::cast) + .map(Ok), + th: try_load::( + &crate::provider::Baked, + locale!("th"), + ) + .unwrap() + .map(DataPayload::cast) + .map(Ok), + ja: try_load::( + &crate::provider::Baked, + locale!("ja"), + ) + .unwrap() + .map(DataPayload::cast), + } + } + pub(crate) fn try_new_dict(provider: &D) -> Result where D: DataProvider @@ -118,6 +192,40 @@ impl ComplexPayloads { }) } + #[cfg(feature = "auto")] // Use by WordSegmenter with "auto" enabled. + #[cfg(feature = "data")] + pub(crate) fn new_auto() -> Self { + #[allow(clippy::unwrap_used)] + // try_load is infallible if the provider only returns `MissingLocale`. + Self { + grapheme: DataPayload::from_static_ref( + crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1, + ), + my: try_load::(&crate::provider::Baked, locale!("my")) + .unwrap() + .map(DataPayload::cast) + .map(Err), + km: try_load::(&crate::provider::Baked, locale!("km")) + .unwrap() + .map(DataPayload::cast) + .map(Err), + lo: try_load::(&crate::provider::Baked, locale!("lo")) + .unwrap() + .map(DataPayload::cast) + .map(Err), + th: try_load::(&crate::provider::Baked, locale!("th")) + .unwrap() + .map(DataPayload::cast) + .map(Err), + ja: try_load::( + &crate::provider::Baked, + locale!("ja"), + ) + .unwrap() + .map(DataPayload::cast), + } + } + #[cfg(feature = "auto")] // Use by WordSegmenter with "auto" enabled. 
pub(crate) fn try_new_auto(provider: &D) -> Result where @@ -145,6 +253,46 @@ impl ComplexPayloads { }) } + #[cfg(feature = "data")] + pub(crate) fn new_southeast_asian() -> Self { + #[allow(clippy::unwrap_used)] + // try_load is infallible if the provider only returns `MissingLocale`. + Self { + grapheme: DataPayload::from_static_ref( + crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1, + ), + my: try_load::( + &crate::provider::Baked, + locale!("my"), + ) + .unwrap() + .map(DataPayload::cast) + .map(Ok), + km: try_load::( + &crate::provider::Baked, + locale!("km"), + ) + .unwrap() + .map(DataPayload::cast) + .map(Ok), + lo: try_load::( + &crate::provider::Baked, + locale!("lo"), + ) + .unwrap() + .map(DataPayload::cast) + .map(Ok), + th: try_load::( + &crate::provider::Baked, + locale!("th"), + ) + .unwrap() + .map(DataPayload::cast) + .map(Ok), + ja: None, + } + } + pub(crate) fn try_new_southeast_asian(provider: &D) -> Result where D: DataProvider @@ -268,10 +416,8 @@ mod tests { const TEST_STR: &str = "ภาษาไทยภาษาไทย"; let utf16: Vec = TEST_STR.encode_utf16().collect(); - let lstm = - ComplexPayloads::try_new_lstm(&icu_testdata::buffer().as_deserializing()).unwrap(); - let dict = - ComplexPayloads::try_new_dict(&icu_testdata::buffer().as_deserializing()).unwrap(); + let lstm = ComplexPayloads::new_lstm(); + let dict = ComplexPayloads::new_dict(); assert_eq!( complex_language_segment_str(&lstm, TEST_STR), diff --git a/components/segmenter/src/grapheme.rs b/components/segmenter/src/grapheme.rs index bff29db5800..81b4d3ddaf2 100644 --- a/components/segmenter/src/grapheme.rs +++ b/components/segmenter/src/grapheme.rs @@ -66,8 +66,7 @@ pub type GraphemeClusterBreakIteratorUtf16<'l, 's> = /// ```rust /// use icu_segmenter::GraphemeClusterSegmenter; /// let segmenter = -/// GraphemeClusterSegmenter::try_new_unstable(&icu_testdata::unstable()) -/// .expect("Data exists"); +/// GraphemeClusterSegmenter::new(); /// /// let breakpoints: Vec = 
segmenter.segment_str("Hello 🗺").collect(); /// // World Map (U+1F5FA) is encoded in four bytes in UTF-8. @@ -79,8 +78,7 @@ pub type GraphemeClusterBreakIteratorUtf16<'l, 's> = /// ```rust /// use icu_segmenter::GraphemeClusterSegmenter; /// let segmenter = -/// GraphemeClusterSegmenter::try_new_unstable(&icu_testdata::unstable()) -/// .expect("Data exists"); +/// GraphemeClusterSegmenter::new(); /// /// let breakpoints: Vec = /// segmenter.segment_latin1(b"Hello World").collect(); @@ -94,8 +92,7 @@ pub type GraphemeClusterBreakIteratorUtf16<'l, 's> = /// ```rust /// # use icu_segmenter::GraphemeClusterSegmenter; /// # let segmenter = -/// # GraphemeClusterSegmenter::try_new_unstable(&icu_testdata::unstable()) -/// # .expect("Data exists"); +/// # GraphemeClusterSegmenter::new(); /// use itertools::Itertools; /// let text = "मांजर"; /// let grapheme_clusters: Vec<&str> = segmenter @@ -123,8 +120,7 @@ pub type GraphemeClusterBreakIteratorUtf16<'l, 's> = /// ```rust /// use icu_segmenter::GraphemeClusterSegmenter; /// let segmenter = -/// GraphemeClusterSegmenter::try_new_unstable(&icu_testdata::unstable()) -/// .expect("Data exists"); +/// GraphemeClusterSegmenter::new(); /// /// // நி (TAMIL LETTER NA, TAMIL VOWEL SIGN I) is an extended grapheme cluster, /// // but not a legacy grapheme cluster. @@ -137,8 +133,39 @@ pub struct GraphemeClusterSegmenter { payload: DataPayload, } +#[cfg(feature = "data")] +impl Default for GraphemeClusterSegmenter { + fn default() -> Self { + Self::new() + } +} + impl GraphemeClusterSegmenter { /// Constructs a [`GraphemeClusterSegmenter`] with an invariant locale. 
+ /// + /// ✨ **Enabled with the `"data"` feature.** + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "data")] + pub fn new() -> Self { + Self { + payload: DataPayload::from_static_ref( + crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1, + ), + } + } + + icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: SegmenterError, + #[cfg(skip)] + functions: [ + new, + try_new_with_any_provider, + try_new_with_buffer_provider, + try_new_unstable, + Self, + ]); + + #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::try_new)] pub fn try_new_unstable(provider: &D) -> Result where D: DataProvider + ?Sized, @@ -147,8 +174,6 @@ impl GraphemeClusterSegmenter { Ok(Self { payload }) } - icu_provider::gen_any_buffer_constructors!(locale: skip, options: skip, error: SegmenterError); - /// Creates a grapheme cluster break iterator for an `str` (a UTF-8 string). pub fn segment_str<'l, 's>( &'l self, @@ -239,11 +264,9 @@ impl GraphemeClusterSegmenter { } } -#[cfg(all(test, feature = "serde"))] #[test] fn empty_string() { - let segmenter = - GraphemeClusterSegmenter::try_new_with_buffer_provider(&icu_testdata::buffer()).unwrap(); + let segmenter = GraphemeClusterSegmenter::new(); let breaks: Vec = segmenter.segment_str("").collect(); assert_eq!(breaks, [0]); } diff --git a/components/segmenter/src/lib.rs b/components/segmenter/src/lib.rs index b809ed72dae..d94dd97c635 100644 --- a/components/segmenter/src/lib.rs +++ b/components/segmenter/src/lib.rs @@ -30,8 +30,7 @@ //! use icu::segmenter::LineSegmenter; //! //! let segmenter = -//! LineSegmenter::try_new_auto_unstable(&icu_testdata::unstable()) -//! .expect("Data exists"); +//! LineSegmenter::new_auto(); //! //! let breakpoints: Vec = segmenter //! .segment_str("Hello World. Xin chào thế giới!") @@ -49,8 +48,7 @@ //! use icu::segmenter::GraphemeClusterSegmenter; //! //! let segmenter = -//! 
GraphemeClusterSegmenter::try_new_unstable(&icu_testdata::unstable()) -//! .expect("Data exists"); +//! GraphemeClusterSegmenter::new(); //! //! let breakpoints: Vec = segmenter //! .segment_str("Hello World. Xin chào thế giới!") @@ -74,8 +72,7 @@ //! use icu::segmenter::WordSegmenter; //! //! let segmenter = -//! WordSegmenter::try_new_auto_unstable(&icu_testdata::unstable()) -//! .expect("Data exists"); +//! WordSegmenter::new_auto(); //! //! let breakpoints: Vec = segmenter //! .segment_str("Hello World. Xin chào thế giới!") @@ -96,8 +93,7 @@ //! use icu::segmenter::SentenceSegmenter; //! //! let segmenter = -//! SentenceSegmenter::try_new_unstable(&icu_testdata::unstable()) -//! .expect("Data exists"); +//! SentenceSegmenter::new(); //! //! let breakpoints: Vec = segmenter //! .segment_str("Hello World. Xin chào thế giới!") diff --git a/components/segmenter/src/line.rs b/components/segmenter/src/line.rs index 54eec0c740a..5c11e4de746 100644 --- a/components/segmenter/src/line.rs +++ b/components/segmenter/src/line.rs @@ -145,8 +145,7 @@ pub type LineBreakIteratorUtf16<'l, 's> = LineBreakIterator<'l, 's, LineBreakTyp /// ```rust /// # use icu_segmenter::LineSegmenter; /// # -/// # let segmenter = LineSegmenter::try_new_auto_unstable(&icu_testdata::unstable()) -/// # .expect("Data exists"); +/// # let segmenter = LineSegmenter::new_auto(); /// # /// let text = "Summary\r\nThis annex…"; /// let breakpoints: Vec = segmenter.segment_str(text).collect(); @@ -162,8 +161,7 @@ pub type LineBreakIteratorUtf16<'l, 's> = LineBreakIterator<'l, 's, LineBreakTyp /// use icu_segmenter::LineSegmenter; /// /// let segmenter = -/// LineSegmenter::try_new_auto_unstable(&icu_testdata::unstable()) -/// .expect("Data exists"); +/// LineSegmenter::new_auto(); /// /// let breakpoints: Vec = /// segmenter.segment_str("Hello World").collect(); @@ -182,11 +180,7 @@ pub type LineBreakIteratorUtf16<'l, 's> = LineBreakIterator<'l, 's, LineBreakTyp /// options.strictness = 
LineBreakStrictness::Strict; /// options.word_option = LineBreakWordOption::BreakAll; /// options.ja_zh = false; -/// let segmenter = LineSegmenter::try_new_auto_with_options_unstable( -/// &icu_testdata::unstable(), -/// options, -/// ) -/// .expect("Data exists"); +/// let segmenter = LineSegmenter::new_auto_with_options(options); /// /// let breakpoints: Vec = /// segmenter.segment_str("Hello World").collect(); @@ -199,8 +193,7 @@ pub type LineBreakIteratorUtf16<'l, 's> = LineBreakIterator<'l, 's, LineBreakTyp /// use icu_segmenter::LineSegmenter; /// /// let segmenter = -/// LineSegmenter::try_new_auto_unstable(&icu_testdata::unstable()) -/// .expect("Data exists"); +/// LineSegmenter::new_auto(); /// /// let breakpoints: Vec = /// segmenter.segment_latin1(b"Hello World").collect(); @@ -213,12 +206,8 @@ pub type LineBreakIteratorUtf16<'l, 's> = LineBreakIterator<'l, 's, LineBreakTyp /// use icu::properties::{maps, LineBreak}; /// use icu_segmenter::LineSegmenter; /// -/// # let segmenter = LineSegmenter::try_new_auto_unstable(&icu_testdata::unstable()) -/// # .expect("Data exists"); +/// # let segmenter = LineSegmenter::new_auto(); /// # -/// let data = maps::load_line_break(&icu_testdata::unstable()).expect("The data should be valid!"); -/// let lb = data.as_borrowed(); -/// /// let text = "Summary\r\nThis annex…"; /// /// let mandatory_breaks: Vec = segmenter @@ -227,7 +216,7 @@ pub type LineBreakIteratorUtf16<'l, 's> = LineBreakIterator<'l, 's, LineBreakTyp /// .filter(|&i| { /// text[..i].chars().next_back().map_or(false, |c| { /// matches!( -/// lb.get(c), +/// maps::LINE_BREAK.get(c), /// LineBreak::MandatoryBreak /// | LineBreak::CarriageReturn /// | LineBreak::LineFeed @@ -251,67 +240,119 @@ impl LineSegmenter { /// /// The current behavior, which is subject to change, is to use the LSTM model when available. /// - /// See also [`Self::try_new_auto_with_options_unstable`]. + /// See also [`Self::new_auto_with_options`]. 
+ /// + /// ✨ **Enabled with the `"data"` feature.** + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "data")] #[cfg(feature = "auto")] - pub fn try_new_auto_unstable(provider: &D) -> Result - where - D: DataProvider - + DataProvider - + DataProvider - + ?Sized, - { - Self::try_new_auto_with_options_unstable(provider, Default::default()) + pub fn new_auto() -> Self { + Self::new_auto_with_options(Default::default()) } #[cfg(feature = "auto")] - icu_provider::gen_any_buffer_constructors!( + icu_provider::gen_any_buffer_data_constructors!( locale: skip, options: skip, error: SegmenterError, + #[cfg(skip)] functions: [ - Self::try_new_auto_unstable, + new_auto, try_new_auto_with_any_provider, - try_new_auto_with_buffer_provider + try_new_auto_with_buffer_provider, + try_new_auto_unstable, + Self, ] ); + #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::try_new_auto)] + #[cfg(feature = "auto")] + pub fn try_new_auto_unstable(provider: &D) -> Result + where + D: DataProvider + + DataProvider + + DataProvider + + ?Sized, + { + Self::try_new_auto_with_options_unstable(provider, Default::default()) + } + /// Constructs a [`LineSegmenter`] with an invariant locale and LSTM data for /// complex scripts (Khmer, Lao, Myanmar, and Thai). /// /// The LSTM, or Long Term Short Memory, is a machine learning model. It is smaller than /// the full dictionary but more expensive during segmentation (inference). /// - /// See also [`Self::try_new_lstm_with_options_unstable`]. + /// See also [`Self::new_lstm_with_options`]. 
+ /// + /// ✨ **Enabled with the `"data"` feature.** + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "data")] #[cfg(feature = "lstm")] - pub fn try_new_lstm_unstable(provider: &D) -> Result - where - D: DataProvider - + DataProvider - + DataProvider - + ?Sized, - { - Self::try_new_lstm_with_options_unstable(provider, Default::default()) + pub fn new_lstm() -> Self { + Self::new_lstm_with_options(Default::default()) } #[cfg(feature = "lstm")] - icu_provider::gen_any_buffer_constructors!( + icu_provider::gen_any_buffer_data_constructors!( locale: skip, options: skip, error: SegmenterError, + #[cfg(skip)] functions: [ - Self::try_new_lstm_unstable, + new_lstm, try_new_lstm_with_any_provider, - try_new_lstm_with_buffer_provider + try_new_lstm_with_buffer_provider, + try_new_lstm_unstable, + Self, ] ); + #[cfg(feature = "lstm")] + #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::try_new_lstm)] + pub fn try_new_lstm_unstable(provider: &D) -> Result + where + D: DataProvider + + DataProvider + + DataProvider + + ?Sized, + { + Self::try_new_lstm_with_options_unstable(provider, Default::default()) + } + /// Constructs a [`LineSegmenter`] with an invariant locale and dictionary data for /// complex scripts (Khmer, Lao, Myanmar, and Thai). /// /// The dictionary model uses a list of words to determine appropriate breakpoints. It is /// faster than the LSTM model but requires more data. /// - /// See also [`Self::try_new_dictionary_with_options_unstable`]. + /// See also [`Self::new_dictionary_with_options`]. 
+ /// + /// ✨ **Enabled with the `"data"` feature.** + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "data")] + pub fn new_dictionary() -> Self { + Self::new_dictionary_with_options(Default::default()) + } + + icu_provider::gen_any_buffer_data_constructors!( + locale: skip, + options: skip, + error: SegmenterError, + #[cfg(skip)] + functions: [ + new_dictionary, + try_new_dictionary_with_any_provider, + try_new_dictionary_with_buffer_provider, + try_new_dictionary_unstable, + Self, + ] + ); + + #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::try_new_dictionary)] pub fn try_new_dictionary_unstable(provider: &D) -> Result where D: DataProvider @@ -322,24 +363,39 @@ impl LineSegmenter { Self::try_new_dictionary_with_options_unstable(provider, Default::default()) } - icu_provider::gen_any_buffer_constructors!( + /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and + /// the best available data for complex scripts (Khmer, Lao, Myanmar, and Thai). + /// + /// The current behavior, which is subject to change, is to use the LSTM model when available. + /// + /// See also [`Self::new_auto`]. 
+ /// + /// ✨ **Enabled with the `"data"` feature.** + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "auto")] + #[cfg(feature = "data")] + pub fn new_auto_with_options(options: LineBreakOptions) -> Self { + Self::new_lstm_with_options(options) + } + + #[cfg(feature = "auto")] + icu_provider::gen_any_buffer_data_constructors!( locale: skip, - options: skip, + options: LineBreakOptions, error: SegmenterError, + #[cfg(skip)] functions: [ - Self::try_new_dictionary_unstable, - try_new_dictionary_with_any_provider, - try_new_dictionary_with_buffer_provider + new_auto_with_options, + try_new_auto_with_options_with_any_provider, + try_new_auto_with_options_with_buffer_provider, + try_new_auto_with_options_unstable, + Self, ] ); - /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and - /// the best available data for complex scripts (Khmer, Lao, Myanmar, and Thai). - /// - /// The current behavior, which is subject to change, is to use the LSTM model when available. - /// - /// See also [`Self::try_new_auto_unstable`]. #[cfg(feature = "auto")] + #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::try_new_auto_with_options)] pub fn try_new_auto_with_options_unstable( provider: &D, options: LineBreakOptions, @@ -353,26 +409,46 @@ impl LineSegmenter { Self::try_new_lstm_with_options_unstable(provider, options) } - #[cfg(feature = "auto")] - icu_provider::gen_any_buffer_constructors!( + /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and + /// LSTM data for complex scripts (Khmer, Lao, Myanmar, and Thai). + /// + /// The LSTM, or Long Short-Term Memory, is a machine learning model. It is smaller than + /// the full dictionary but more expensive during segmentation (inference). + /// + /// See also [`Self::new_lstm`]. 
+ /// + /// ✨ **Enabled with the `"data"` feature.** + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "lstm")] + #[cfg(feature = "data")] + pub fn new_lstm_with_options(options: LineBreakOptions) -> Self { + Self { + options, + payload: DataPayload::from_static_ref( + crate::provider::Baked::SINGLETON_SEGMENTER_LINE_V1, + ), + complex: ComplexPayloads::new_lstm(), + } + } + + #[cfg(feature = "lstm")] + icu_provider::gen_any_buffer_data_constructors!( locale: skip, options: LineBreakOptions, error: SegmenterError, + #[cfg(skip)] functions: [ - Self::try_new_auto_with_options_unstable, - try_new_auto_with_options_with_any_provider, - try_new_auto_with_options_with_buffer_provider + new_lstm_with_options, + try_new_lstm_with_options_with_any_provider, + try_new_lstm_with_options_with_buffer_provider, + try_new_lstm_with_options_unstable, + Self, ] ); - /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and - /// LSTM data for complex scripts (Khmer, Lao, Myanmar, and Thai). - /// - /// The LSTM, or Long Term Short Memory, is a machine learning model. It is smaller than - /// the full dictionary but more expensive during segmentation (inference). - /// - /// See also [`Self::try_new_dictionary_unstable`]. #[cfg(feature = "lstm")] + #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::try_new_lstm_with_options)] pub fn try_new_lstm_with_options_unstable( provider: &D, options: LineBreakOptions, @@ -390,25 +466,49 @@ impl LineSegmenter { }) } - #[cfg(feature = "lstm")] - icu_provider::gen_any_buffer_constructors!( + /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and + /// dictionary data for complex scripts (Khmer, Lao, Myanmar, and Thai). + /// + /// The dictionary model uses a list of words to determine appropriate breakpoints. It is + /// faster than the LSTM model but requires more data. 
+ /// + /// See also [`Self::new_dictionary`]. + /// + /// ✨ **Enabled with the `"data"` feature.** + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "data")] + pub fn new_dictionary_with_options(options: LineBreakOptions) -> Self { + Self { + options, + payload: DataPayload::from_static_ref( + crate::provider::Baked::SINGLETON_SEGMENTER_LINE_V1, + ), + // Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handles CJK + // characters [1]. Southeast Asian languages however require complex context analysis + // [2]. + // + // [1]: https://www.unicode.org/reports/tr14/#ID + // [2]: https://www.unicode.org/reports/tr14/#SA + complex: ComplexPayloads::new_southeast_asian(), + } + } + + icu_provider::gen_any_buffer_data_constructors!( locale: skip, options: LineBreakOptions, error: SegmenterError, + #[cfg(skip)] functions: [ - Self::try_new_lstm_with_options_unstable, - try_new_lstm_with_options_with_any_provider, - try_new_lstm_with_options_with_buffer_provider + new_dictionary_with_options, + try_new_dictionary_with_options_with_any_provider, + try_new_dictionary_with_options_with_buffer_provider, + try_new_dictionary_with_options_unstable, + Self, ] ); - /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and - /// dictionary data for complex scripts (Khmer, Lao, Myanmar, and Thai). - /// - /// The dictionary model uses a list of words to determine appropriate breakpoints. It is - /// faster than the LSTM model but requires more data. - /// - /// See also [`Self::try_new_dictionary_unstable`]. 
+ #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary_with_options)] pub fn try_new_dictionary_with_options_unstable( provider: &D, options: LineBreakOptions, @@ -432,17 +532,6 @@ impl LineSegmenter { }) } - icu_provider::gen_any_buffer_constructors!( - locale: skip, - options: LineBreakOptions, - error: SegmenterError, - functions: [ - Self::try_new_dictionary_with_options_unstable, - try_new_dictionary_with_options_with_any_provider, - try_new_dictionary_with_options_with_buffer_provider - ] - ); - /// Creates a line break iterator for an `str` (a UTF-8 string). /// /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. @@ -1213,22 +1302,11 @@ impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeUtf16 { mod tests { use super::*; use crate::LineSegmenter; - use icu_provider_adapters::fork::ForkByKeyProvider; - use icu_provider_fs::FsDataProvider; - use std::path::PathBuf; - - fn get_segmenter_testdata_provider() -> impl BufferProvider { - let segmenter_fs_provider = FsDataProvider::try_new( - PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/testdata/provider"), - ) - .unwrap(); - ForkByKeyProvider::new(segmenter_fs_provider, icu_testdata::buffer()) - } #[test] fn linebreak_property() { let payload = DataProvider::::load( - &get_segmenter_testdata_provider().as_deserializing(), + &crate::provider::Baked, Default::default(), ) .expect("Loading should succeed!") @@ -1264,7 +1342,7 @@ mod tests { #[allow(clippy::bool_assert_comparison)] // clearer when we're testing bools directly fn break_rule() { let payload = DataProvider::::load( - &get_segmenter_testdata_provider().as_deserializing(), + &crate::provider::Baked, Default::default(), ) .expect("Loading should succeed!") @@ -1374,10 +1452,8 @@ mod tests { #[test] fn linebreak() { - let segmenter = LineSegmenter::try_new_dictionary_unstable( - &get_segmenter_testdata_provider().as_deserializing(), - ) - .expect("Data exists"); + let 
segmenter = LineSegmenter::try_new_dictionary_unstable(&crate::provider::Baked) + .expect("Data exists"); let mut iter = segmenter.segment_str("hello world"); assert_eq!(Some(0), iter.next()); @@ -1502,8 +1578,7 @@ mod tests { fn thai_line_break() { const TEST_STR: &str = "ภาษาไทยภาษาไทย"; - let provider = get_segmenter_testdata_provider(); - let segmenter = LineSegmenter::try_new_lstm_with_buffer_provider(&provider).unwrap(); + let segmenter = LineSegmenter::new_lstm(); let breaks: Vec = segmenter.segment_str(TEST_STR).collect(); assert_eq!(breaks, [0, 12, 21, 33, TEST_STR.len()], "Thai test"); @@ -1522,8 +1597,7 @@ mod tests { // "Burmese Language" in Burmese const TEST_STR: &str = "မြန်မာဘာသာစကား"; - let provider = get_segmenter_testdata_provider(); - let segmenter = LineSegmenter::try_new_lstm_with_buffer_provider(&provider).unwrap(); + let segmenter = LineSegmenter::new_lstm(); let breaks: Vec = segmenter.segment_str(TEST_STR).collect(); // LSTM model breaks more characters, but it is better to return [30]. 
assert_eq!(breaks, [0, 12, 18, 30, TEST_STR.len()], "Burmese test"); @@ -1539,8 +1613,7 @@ mod tests { fn khmer_line_break() { const TEST_STR: &str = "សេចក្ដីប្រកាសជាសកលស្ដីពីសិទ្ធិមនុស្ស"; - let provider = get_segmenter_testdata_provider(); - let segmenter = LineSegmenter::try_new_lstm_with_buffer_provider(&provider).unwrap(); + let segmenter = LineSegmenter::new_lstm(); let breaks: Vec = segmenter.segment_str(TEST_STR).collect(); // Note: This small sample matches the ICU dictionary segmenter assert_eq!(breaks, [0, 39, 48, 54, 72, TEST_STR.len()], "Khmer test"); @@ -1559,8 +1632,7 @@ mod tests { fn lao_line_break() { const TEST_STR: &str = "ກ່ຽວກັບສິດຂອງມະນຸດ"; - let provider = get_segmenter_testdata_provider(); - let segmenter = LineSegmenter::try_new_lstm_with_buffer_provider(&provider).unwrap(); + let segmenter = LineSegmenter::new_lstm(); let breaks: Vec = segmenter.segment_str(TEST_STR).collect(); // Note: LSTM finds a break at '12' that the dictionary does not find assert_eq!(breaks, [0, 12, 21, 30, 39, TEST_STR.len()], "Lao test"); @@ -1572,8 +1644,7 @@ mod tests { #[test] fn empty_string() { - let segmenter = - LineSegmenter::try_new_auto_with_buffer_provider(&icu_testdata::buffer()).unwrap(); + let segmenter = LineSegmenter::new_auto(); let breaks: Vec = segmenter.segment_str("").collect(); assert_eq!(breaks, [0]); } diff --git a/components/segmenter/src/provider/mod.rs b/components/segmenter/src/provider/mod.rs index 49f26860855..114da2c8fc0 100644 --- a/components/segmenter/src/provider/mod.rs +++ b/components/segmenter/src/provider/mod.rs @@ -26,6 +26,24 @@ use icu_collections::codepointtrie::CodePointTrie; use icu_provider::prelude::*; use zerovec::ZeroVec; +#[cfg(feature = "data")] +#[derive(Debug)] +/// Baked data +pub struct Baked; + +#[cfg(feature = "data")] +const _: () = { + use crate as icu_segmenter; + icu_segmenter_data::impl_segmenter_dictionary_w_auto_v1!(Baked); + icu_segmenter_data::impl_segmenter_dictionary_wl_ext_v1!(Baked); + 
icu_segmenter_data::impl_segmenter_grapheme_v1!(Baked); + icu_segmenter_data::impl_segmenter_line_v1!(Baked); + #[cfg(feature = "lstm")] + icu_segmenter_data::impl_segmenter_lstm_wl_auto_v1!(Baked); + icu_segmenter_data::impl_segmenter_sentence_v1!(Baked); + icu_segmenter_data::impl_segmenter_word_v1!(Baked); +}; + /// Pre-processed Unicode data in the form of tables to be used for rule-based breaking. /// ///
diff --git a/components/segmenter/src/sentence.rs b/components/segmenter/src/sentence.rs index 71d738ec910..e4eeb60ed40 100644 --- a/components/segmenter/src/sentence.rs +++ b/components/segmenter/src/sentence.rs @@ -60,9 +60,7 @@ pub type SentenceBreakIteratorUtf16<'l, 's> = SentenceBreakIterator<'l, 's, Rule /// /// ```rust /// use icu_segmenter::SentenceSegmenter; -/// let segmenter = -/// SentenceSegmenter::try_new_unstable(&icu_testdata::unstable()) -/// .expect("Data exists"); +/// let segmenter = SentenceSegmenter::new(); /// /// let breakpoints: Vec = /// segmenter.segment_str("Hello World").collect(); @@ -73,9 +71,7 @@ pub type SentenceBreakIteratorUtf16<'l, 's> = SentenceBreakIterator<'l, 's, Rule /// /// ```rust /// use icu_segmenter::SentenceSegmenter; -/// let segmenter = -/// SentenceSegmenter::try_new_unstable(&icu_testdata::unstable()) -/// .expect("Data exists"); +/// let segmenter = SentenceSegmenter::new(); /// /// let breakpoints: Vec = /// segmenter.segment_latin1(b"Hello World").collect(); @@ -88,9 +84,7 @@ pub type SentenceBreakIteratorUtf16<'l, 's> = SentenceBreakIterator<'l, 's, Rule /// /// ```rust /// # use icu_segmenter::SentenceSegmenter; -/// # let segmenter = -/// # SentenceSegmenter::try_new_unstable(&icu_testdata::unstable()) -/// # .expect("Data exists"); +/// # let segmenter = SentenceSegmenter::new(); /// use itertools::Itertools; /// let text = "Ceci tuera cela. Le livre tuera l’édifice."; /// let sentences: Vec<&str> = segmenter @@ -108,8 +102,40 @@ pub struct SentenceSegmenter { payload: DataPayload, } +#[cfg(feature = "data")] +impl Default for SentenceSegmenter { + fn default() -> Self { + Self::new() + } +} + impl SentenceSegmenter { /// Constructs a [`SentenceSegmenter`] with an invariant locale. 
+ /// + /// ✨ **Enabled with the `"data"` feature.** + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "data")] + pub fn new() -> Self { + Self { + payload: DataPayload::from_static_ref( + crate::provider::Baked::SINGLETON_SEGMENTER_SENTENCE_V1, + ), + } + } + + icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: SegmenterError, + #[cfg(skip)] + functions: [ + new, + try_new_with_any_provider, + try_new_with_buffer_provider, + try_new_unstable, + Self, + ] + ); + + #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)] pub fn try_new_unstable(provider: &D) -> Result where D: DataProvider + ?Sized, @@ -118,8 +144,6 @@ impl SentenceSegmenter { Ok(Self { payload }) } - icu_provider::gen_any_buffer_constructors!(locale: skip, options: skip, error: SegmenterError); - /// Creates a sentence break iterator for an `str` (a UTF-8 string). /// /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. 
@@ -190,8 +214,7 @@ impl SentenceSegmenter { #[cfg(all(test, feature = "serde"))] #[test] fn empty_string() { - let segmenter = - SentenceSegmenter::try_new_with_buffer_provider(&icu_testdata::buffer()).unwrap(); + let segmenter = SentenceSegmenter::new(); let breaks: Vec = segmenter.segment_str("").collect(); assert_eq!(breaks, [0]); } diff --git a/components/segmenter/src/word.rs b/components/segmenter/src/word.rs index e9b3234f31b..fd24a17c35e 100644 --- a/components/segmenter/src/word.rs +++ b/components/segmenter/src/word.rs @@ -95,9 +95,7 @@ pub type WordBreakIteratorUtf16<'l, 's> = WordBreakIterator<'l, 's, WordBreakTyp /// /// ```rust /// use icu_segmenter::WordSegmenter; -/// let segmenter = -/// WordSegmenter::try_new_auto_unstable(&icu_testdata::unstable()) -/// .expect("Data exists"); +/// let segmenter = WordSegmenter::new_auto(); /// /// let breakpoints: Vec = /// segmenter.segment_str("Hello World").collect(); @@ -108,9 +106,7 @@ pub type WordBreakIteratorUtf16<'l, 's> = WordBreakIterator<'l, 's, WordBreakTyp /// /// ```rust /// use icu_segmenter::WordSegmenter; -/// let segmenter = -/// WordSegmenter::try_new_auto_unstable(&icu_testdata::unstable()) -/// .expect("Data exists"); +/// let segmenter = WordSegmenter::new_auto(); /// /// let breakpoints: Vec = /// segmenter.segment_latin1(b"Hello World").collect(); @@ -123,8 +119,7 @@ pub type WordBreakIteratorUtf16<'l, 's> = WordBreakIterator<'l, 's, WordBreakTyp /// /// ```rust /// # use icu_segmenter::WordSegmenter; -/// # let segmenter = WordSegmenter::try_new_auto_unstable(&icu_testdata::unstable()) -/// # .expect("Data exists"); +/// # let segmenter = WordSegmenter::new_auto(); /// use itertools::Itertools; /// let text = "Mark’d ye his words?"; /// let segments: Vec<&str> = segmenter @@ -142,8 +137,7 @@ pub type WordBreakIteratorUtf16<'l, 's> = WordBreakIterator<'l, 's, WordBreakTyp /// ```rust /// # use itertools::Itertools; /// # use icu_segmenter::{WordType, WordSegmenter}; -/// # let segmenter 
= WordSegmenter::try_new_auto_unstable(&icu_testdata::unstable()) -/// # .expect("Data exists"); +/// # let segmenter = WordSegmenter::new_auto(); /// # let text = "Mark’d ye his words?"; /// let words: Vec<&str> = { /// let mut it = segmenter.segment_str(text); @@ -178,9 +172,7 @@ impl WordSegmenter { /// let th_str = "ทุกสองสัปดาห์"; /// let ja_str = "こんにちは世界"; /// - /// let segmenter = - /// WordSegmenter::try_new_auto_unstable(&icu_testdata::unstable()) - /// .unwrap(); + /// let segmenter = WordSegmenter::new_auto(); /// /// let th_bps = segmenter.segment_str(th_str).collect::>(); /// let ja_bps = segmenter.segment_str(ja_str).collect::>(); @@ -188,7 +180,38 @@ impl WordSegmenter { /// assert_eq!(th_bps, [0, 9, 18, 39]); /// assert_eq!(ja_bps, [0, 15, 21]); /// ``` + /// + /// ✨ **Enabled with the `"data"` feature.** + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "data")] + #[cfg(feature = "auto")] + pub fn new_auto() -> Self { + Self { + payload: DataPayload::from_static_ref( + crate::provider::Baked::SINGLETON_SEGMENTER_WORD_V1, + ), + complex: ComplexPayloads::new_auto(), + } + } + + #[cfg(feature = "auto")] + icu_provider::gen_any_buffer_data_constructors!( + locale: skip, + options: skip, + error: SegmenterError, + #[cfg(skip)] + functions: [ + try_new_auto, + try_new_auto_with_any_provider, + try_new_auto_with_buffer_provider, + try_new_auto_unstable, + Self + ] + ); + #[cfg(feature = "auto")] + #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::try_new_auto)] pub fn try_new_auto_unstable(provider: &D) -> Result where D: DataProvider @@ -203,18 +226,6 @@ impl WordSegmenter { }) } - #[cfg(feature = "auto")] - icu_provider::gen_any_buffer_constructors!( - locale: skip, - options: skip, - error: SegmenterError, - functions: [ - Self::try_new_auto_unstable, - try_new_auto_with_any_provider, - try_new_auto_with_buffer_provider - ] - ); - /// Constructs a [`WordSegmenter`] with an invariant locale and 
LSTM data for /// complex scripts (Burmese, Khmer, Lao, and Thai). /// @@ -234,9 +245,7 @@ impl WordSegmenter { /// let th_str = "ทุกสองสัปดาห์"; /// let ja_str = "こんにちは世界"; /// - /// let segmenter = - /// WordSegmenter::try_new_lstm_unstable(&icu_testdata::unstable()) - /// .unwrap(); + /// let segmenter = WordSegmenter::new_lstm(); /// /// let th_bps = segmenter.segment_str(th_str).collect::>(); /// let ja_bps = segmenter.segment_str(ja_str).collect::>(); @@ -246,7 +255,38 @@ impl WordSegmenter { /// // Note: We aren't able to find a suitable breakpoint in Chinese/Japanese. /// assert_eq!(ja_bps, [0, 21]); /// ``` + /// + /// ✨ **Enabled with the `"data"` feature.** + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "data")] #[cfg(feature = "lstm")] + pub fn new_lstm() -> Self { + Self { + payload: DataPayload::from_static_ref( + crate::provider::Baked::SINGLETON_SEGMENTER_WORD_V1, + ), + complex: ComplexPayloads::new_lstm(), + } + } + + #[cfg(feature = "lstm")] + icu_provider::gen_any_buffer_data_constructors!( + locale: skip, + options: skip, + error: SegmenterError, + #[cfg(skip)] + functions: [ + new_lstm, + try_new_lstm_with_any_provider, + try_new_lstm_with_buffer_provider, + try_new_lstm_unstable, + Self + ] + ); + + #[cfg(feature = "lstm")] + #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::try_new_lstm)] pub fn try_new_lstm_unstable(provider: &D) -> Result where D: DataProvider @@ -260,18 +300,6 @@ impl WordSegmenter { }) } - #[cfg(feature = "lstm")] - icu_provider::gen_any_buffer_constructors!( - locale: skip, - options: skip, - error: SegmenterError, - functions: [ - Self::try_new_lstm_unstable, - try_new_lstm_with_any_provider, - try_new_lstm_with_buffer_provider - ] - ); - /// Construct a [`WordSegmenter`] with an invariant locale and dictionary data for /// complex scripts (Chinese, Japanese, Khmer, Lao, Myanmar, and Thai). 
/// @@ -288,9 +316,7 @@ impl WordSegmenter { /// let th_str = "ทุกสองสัปดาห์"; /// let ja_str = "こんにちは世界"; /// - /// let segmenter = - /// WordSegmenter::try_new_dictionary_unstable(&icu_testdata::unstable()) - /// .unwrap(); + /// let segmenter = WordSegmenter::new_dictionary(); /// /// let th_bps = segmenter.segment_str(th_str).collect::>(); /// let ja_bps = segmenter.segment_str(ja_str).collect::>(); @@ -298,6 +324,35 @@ impl WordSegmenter { /// assert_eq!(th_bps, [0, 9, 18, 39]); /// assert_eq!(ja_bps, [0, 15, 21]); /// ``` + /// + /// ✨ **Enabled with the `"data"` feature.** + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "data")] + pub fn new_dictionary() -> Self { + Self { + payload: DataPayload::from_static_ref( + crate::provider::Baked::SINGLETON_SEGMENTER_WORD_V1, + ), + complex: ComplexPayloads::new_dict(), + } + } + + icu_provider::gen_any_buffer_data_constructors!( + locale: skip, + options: skip, + error: SegmenterError, + #[cfg(skip)] + functions: [ + new_dictionary, + try_new_dictionary_with_any_provider, + try_new_dictionary_with_buffer_provider, + try_new_dictionary_unstable, + Self + ] + ); + + #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::try_new_dictionary)] pub fn try_new_dictionary_unstable(provider: &D) -> Result where D: DataProvider @@ -312,17 +367,6 @@ impl WordSegmenter { }) } - icu_provider::gen_any_buffer_constructors!( - locale: skip, - options: skip, - error: SegmenterError, - functions: [ - Self::try_new_dictionary_unstable, - try_new_dictionary_with_any_provider, - try_new_dictionary_with_buffer_provider - ] - ); - /// Creates a word break iterator for an `str` (a UTF-8 string). /// /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. 
@@ -552,8 +596,7 @@ impl<'l, 's> RuleBreakType<'l, 's> for WordBreakTypeUtf16 { #[cfg(all(test, feature = "serde"))] #[test] fn empty_string() { - let segmenter = - WordSegmenter::try_new_auto_with_buffer_provider(&icu_testdata::buffer()).unwrap(); + let segmenter = WordSegmenter::new_auto(); let breaks: Vec = segmenter.segment_str("").collect(); assert_eq!(breaks, [0]); } diff --git a/components/segmenter/tests/complex_word.rs b/components/segmenter/tests/complex_word.rs index c1f4e11c5bc..c518f91ba66 100644 --- a/components/segmenter/tests/complex_word.rs +++ b/components/segmenter/tests/complex_word.rs @@ -2,30 +2,13 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). -use icu_provider::prelude::*; -use icu_provider_adapters::fork::ForkByKeyProvider; -use icu_provider_fs::FsDataProvider; use icu_segmenter::WordSegmenter; -use std::path::PathBuf; // Additional word segmenter tests with complex string. -fn get_segmenter_testdata_provider() -> impl BufferProvider { - let segmenter_fs_provider = FsDataProvider::try_new( - PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/testdata/provider"), - ) - .unwrap(); - ForkByKeyProvider::new(segmenter_fs_provider, icu_testdata::buffer()) -} - #[test] fn word_break_th() { - let segmenter_auto = - WordSegmenter::try_new_auto_unstable(&icu_testdata::unstable()).expect("Data exists"); - let segmenter_lstm = - WordSegmenter::try_new_lstm_unstable(&icu_testdata::unstable()).expect("Data exists"); - - for segmenter in [segmenter_auto, segmenter_lstm] { + for segmenter in [WordSegmenter::new_auto(), WordSegmenter::new_lstm()] { // http://wpt.live/css/css-text/word-break/word-break-normal-th-000.html let s = "ภาษาไทยภาษาไทย"; let utf16: Vec = s.encode_utf16().collect(); @@ -56,9 +39,7 @@ fn word_break_th() { #[test] fn word_break_my() { - let segmenter = - WordSegmenter::try_new_auto_with_buffer_provider(&get_segmenter_testdata_provider()) - 
.expect("Data exists"); + let segmenter = WordSegmenter::new_auto(); let s = "မြန်မာစာမြန်မာစာမြန်မာစာ"; let utf16: Vec = s.encode_utf16().collect(); @@ -72,12 +53,7 @@ fn word_break_my() { #[test] fn word_break_hiragana() { - let segmenter_auto = - WordSegmenter::try_new_auto_unstable(&icu_testdata::unstable()).expect("Data exists"); - let segmenter_dictionary = - WordSegmenter::try_new_dictionary_unstable(&icu_testdata::unstable()).expect("Data exists"); - - for segmenter in [segmenter_auto, segmenter_dictionary] { + for segmenter in [WordSegmenter::new_auto(), WordSegmenter::new_dictionary()] { let s = "うなぎうなじ"; let iter = segmenter.segment_str(s); assert_eq!( @@ -90,12 +66,7 @@ fn word_break_hiragana() { #[test] fn word_break_mixed_han() { - let segmenter_auto = - WordSegmenter::try_new_auto_unstable(&icu_testdata::unstable()).expect("Data exists"); - let segmenter_dictionary = - WordSegmenter::try_new_dictionary_unstable(&icu_testdata::unstable()).expect("Data exists"); - - for segmenter in [segmenter_auto, segmenter_dictionary] { + for segmenter in [WordSegmenter::new_auto(), WordSegmenter::new_dictionary()] { let s = "Welcome龟山岛龟山岛Welcome"; let iter = segmenter.segment_str(s); assert_eq!( @@ -115,10 +86,8 @@ fn word_line_th_wikipedia_auto() { let utf16: Vec = text.encode_utf16().collect(); assert_eq!(utf16.len(), 142); - let segmenter_word_auto = - WordSegmenter::try_new_auto_unstable(&icu_testdata::unstable()).expect("Data exists"); - let segmenter_line_auto = - LineSegmenter::try_new_auto_unstable(&icu_testdata::unstable()).expect("Data exists"); + let segmenter_word_auto = WordSegmenter::new_auto(); + let segmenter_line_auto = LineSegmenter::new_auto(); let breakpoints_word_utf8 = segmenter_word_auto.segment_str(text).collect::>(); assert_eq!( diff --git a/components/segmenter/tests/css_line_break.rs b/components/segmenter/tests/css_line_break.rs index 5c153a7358b..2a81c352108 100644 --- a/components/segmenter/tests/css_line_break.rs +++ 
b/components/segmenter/tests/css_line_break.rs @@ -13,9 +13,7 @@ fn check_with_options( mut expect_utf16: Vec, options: LineBreakOptions, ) { - let segmenter = - LineSegmenter::try_new_dictionary_with_options_unstable(&icu_testdata::unstable(), options) - .expect("Data exists"); + let segmenter = LineSegmenter::new_dictionary_with_options(options); let iter = segmenter.segment_str(s); let result: Vec = iter.collect(); diff --git a/components/segmenter/tests/css_word_break.rs b/components/segmenter/tests/css_word_break.rs index 89d54e3dc8a..7137d10b62c 100644 --- a/components/segmenter/tests/css_word_break.rs +++ b/components/segmenter/tests/css_word_break.rs @@ -2,22 +2,10 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). -use icu_provider::prelude::*; -use icu_provider_adapters::fork::ForkByKeyProvider; -use icu_provider_fs::FsDataProvider; use icu_segmenter::LineBreakOptions; use icu_segmenter::LineBreakStrictness; use icu_segmenter::LineBreakWordOption; use icu_segmenter::LineSegmenter; -use std::path::PathBuf; - -fn get_segmenter_testdata_provider() -> impl BufferProvider { - let segmenter_fs_provider = FsDataProvider::try_new( - PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/testdata/provider"), - ) - .unwrap(); - ForkByKeyProvider::new(segmenter_fs_provider, icu_testdata::buffer()) -} fn check_with_options( s: &str, @@ -25,11 +13,7 @@ fn check_with_options( mut expect_utf16: Vec, options: LineBreakOptions, ) { - let segmenter = LineSegmenter::try_new_dictionary_with_options_with_buffer_provider( - &get_segmenter_testdata_provider(), - options, - ) - .expect("Data exists"); + let segmenter = LineSegmenter::new_dictionary_with_options(options); let iter = segmenter.segment_str(s); let result: Vec = iter.collect(); diff --git a/components/segmenter/tests/spec_test.rs b/components/segmenter/tests/spec_test.rs index 0b240f7e546..8cb96e6a6ad 100644 --- 
a/components/segmenter/tests/spec_test.rs +++ b/components/segmenter/tests/spec_test.rs @@ -105,8 +105,7 @@ impl Iterator for TestContentIterator { fn line_break_test(filename: &str) { let test_iter = TestContentIterator::new(filename); - let segmenter = - LineSegmenter::try_new_dictionary_unstable(&icu_testdata::unstable()).expect("Data exists"); + let segmenter = LineSegmenter::new_dictionary(); for mut test in test_iter { let s: String = test.utf8_vec.into_iter().collect(); let iter = segmenter.segment_str(&s); @@ -152,8 +151,7 @@ fn run_line_break_extra_test() { #[test] fn run_word_break_test() { let test_iter = TestContentIterator::new("./tests/testdata/WordBreakTest.txt"); - let segmenter = - WordSegmenter::try_new_dictionary_unstable(&icu_testdata::unstable()).expect("Data exists"); + let segmenter = WordSegmenter::new_dictionary(); for test in test_iter { let s: String = test.utf8_vec.into_iter().collect(); let iter = segmenter.segment_str(&s); @@ -184,8 +182,7 @@ fn run_word_break_test() { #[test] fn run_grapheme_break_test() { let test_iter = TestContentIterator::new("./tests/testdata/GraphemeBreakTest.txt"); - let segmenter = - GraphemeClusterSegmenter::try_new_unstable(&icu_testdata::unstable()).expect("Data exists"); + let segmenter = GraphemeClusterSegmenter::new(); for test in test_iter { let s: String = test.utf8_vec.into_iter().collect(); let iter = segmenter.segment_str(&s); @@ -215,8 +212,7 @@ fn run_grapheme_break_test() { fn sentence_break_test(filename: &str) { let test_iter = TestContentIterator::new(filename); - let segmenter = - SentenceSegmenter::try_new_unstable(&icu_testdata::unstable()).expect("Data exists"); + let segmenter = SentenceSegmenter::new(); for test in test_iter { let s: String = test.utf8_vec.into_iter().collect(); let iter = segmenter.segment_str(&s); diff --git a/components/segmenter/tests/testdata/provider/manifest.json b/components/segmenter/tests/testdata/provider/manifest.json deleted file mode 100644 index 
d758f9bdb8e..00000000000 --- a/components/segmenter/tests/testdata/provider/manifest.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "syntax": "Postcard1" -} diff --git a/components/segmenter/tests/testdata/provider/segmenter/dictionary/w_auto@1/ja.postcard b/components/segmenter/tests/testdata/provider/segmenter/dictionary/w_auto@1/ja.postcard deleted file mode 100644 index d9942cab71a..00000000000 Binary files a/components/segmenter/tests/testdata/provider/segmenter/dictionary/w_auto@1/ja.postcard and /dev/null differ diff --git a/components/segmenter/tests/testdata/provider/segmenter/dictionary/wl_ext@1/km.postcard b/components/segmenter/tests/testdata/provider/segmenter/dictionary/wl_ext@1/km.postcard deleted file mode 100644 index bec06af86ce..00000000000 Binary files a/components/segmenter/tests/testdata/provider/segmenter/dictionary/wl_ext@1/km.postcard and /dev/null differ diff --git a/components/segmenter/tests/testdata/provider/segmenter/dictionary/wl_ext@1/lo.postcard b/components/segmenter/tests/testdata/provider/segmenter/dictionary/wl_ext@1/lo.postcard deleted file mode 100644 index ba549d12158..00000000000 Binary files a/components/segmenter/tests/testdata/provider/segmenter/dictionary/wl_ext@1/lo.postcard and /dev/null differ diff --git a/components/segmenter/tests/testdata/provider/segmenter/dictionary/wl_ext@1/my.postcard b/components/segmenter/tests/testdata/provider/segmenter/dictionary/wl_ext@1/my.postcard deleted file mode 100644 index 3f3f6051f0d..00000000000 Binary files a/components/segmenter/tests/testdata/provider/segmenter/dictionary/wl_ext@1/my.postcard and /dev/null differ diff --git a/components/segmenter/tests/testdata/provider/segmenter/dictionary/wl_ext@1/th.postcard b/components/segmenter/tests/testdata/provider/segmenter/dictionary/wl_ext@1/th.postcard deleted file mode 100644 index 3c59b64b40e..00000000000 Binary files a/components/segmenter/tests/testdata/provider/segmenter/dictionary/wl_ext@1/th.postcard and /dev/null differ diff 
--git a/components/segmenter/tests/testdata/provider/segmenter/lstm/wl_auto@1/km.postcard b/components/segmenter/tests/testdata/provider/segmenter/lstm/wl_auto@1/km.postcard deleted file mode 100644 index 89469c9081b..00000000000 Binary files a/components/segmenter/tests/testdata/provider/segmenter/lstm/wl_auto@1/km.postcard and /dev/null differ diff --git a/components/segmenter/tests/testdata/provider/segmenter/lstm/wl_auto@1/lo.postcard b/components/segmenter/tests/testdata/provider/segmenter/lstm/wl_auto@1/lo.postcard deleted file mode 100644 index 126965e95ef..00000000000 Binary files a/components/segmenter/tests/testdata/provider/segmenter/lstm/wl_auto@1/lo.postcard and /dev/null differ diff --git a/components/segmenter/tests/testdata/provider/segmenter/lstm/wl_auto@1/my.postcard b/components/segmenter/tests/testdata/provider/segmenter/lstm/wl_auto@1/my.postcard deleted file mode 100644 index 1609b475f18..00000000000 Binary files a/components/segmenter/tests/testdata/provider/segmenter/lstm/wl_auto@1/my.postcard and /dev/null differ diff --git a/components/segmenter/tests/testdata/provider/segmenter/lstm/wl_auto@1/th.postcard b/components/segmenter/tests/testdata/provider/segmenter/lstm/wl_auto@1/th.postcard deleted file mode 100644 index 0cca3b68369..00000000000 Binary files a/components/segmenter/tests/testdata/provider/segmenter/lstm/wl_auto@1/th.postcard and /dev/null differ diff --git a/components/segmenter/tests/word_rule_status.rs b/components/segmenter/tests/word_rule_status.rs index 4ae64552b70..d0638b047be 100644 --- a/components/segmenter/tests/word_rule_status.rs +++ b/components/segmenter/tests/word_rule_status.rs @@ -7,8 +7,7 @@ use icu_segmenter::WordType; #[test] fn rule_status() { - let segmenter = - WordSegmenter::try_new_auto_unstable(&icu_testdata::unstable()).expect("Data exists"); + let segmenter = WordSegmenter::new_auto(); let mut iter = segmenter.segment_str("hello world 123"); assert_eq!(iter.next(), Some(0), "SOT"); @@ -38,8 +37,7 @@ 
fn rule_status() { #[test] fn rule_status_letter_eof() { - let segmenter = - WordSegmenter::try_new_auto_unstable(&icu_testdata::unstable()).expect("Data exists"); + let segmenter = WordSegmenter::new_auto(); let mut iter = segmenter.segment_str("one."); assert_eq!(iter.next(), Some(0), "SOT"); @@ -57,8 +55,7 @@ fn rule_status_letter_eof() { #[test] fn rule_status_numeric_eof() { - let segmenter = - WordSegmenter::try_new_auto_unstable(&icu_testdata::unstable()).expect("Data exists"); + let segmenter = WordSegmenter::new_auto(); let mut iter = segmenter.segment_str("42."); assert_eq!(iter.next(), Some(0), "SOT"); @@ -76,8 +73,7 @@ fn rule_status_numeric_eof() { #[test] fn rule_status_th() { - let segmenter = - WordSegmenter::try_new_auto_unstable(&icu_testdata::unstable()).expect("Data exists"); + let segmenter = WordSegmenter::new_auto(); let mut iter = segmenter.segment_str("ภาษาไทยภาษาไทย"); assert_eq!(iter.next(), Some(0), "SOT"); @@ -97,7 +93,7 @@ fn rule_status_th() { #[test] fn rule_status_no_word() { let segmenter = - SentenceSegmenter::try_new_unstable(&icu_testdata::unstable()).expect("Data exists"); + SentenceSegmenter::new(); let mut iter = segmenter.segment_str("hello"); assert_eq!(iter.next(), Some(0), "SOT"); diff --git a/tools/make/data.toml b/tools/make/data.toml index f91cf8a61dd..229c4773ba3 100644 --- a/tools/make/data.toml +++ b/tools/make/data.toml @@ -21,7 +21,6 @@ category = "ICU4X Data" dependencies = [ "testdata", "testdata-hello-world", - "testdata-segmenter", "download-repo-sources", ] script_runner = "@duckscript" @@ -112,26 +111,6 @@ dependencies = [ "testdata-hello-world-blob", ] -[tasks.testdata-segmenter] -description = "Build JSON testdata for the segmenter crate." 
-category = "ICU4X Data" -command = "cargo" -args = [ - "run", - "--bin=icu4x-datagen", - "--", - "--keys", - "segmenter/lstm/wl_auto@1", - "segmenter/dictionary/w_auto@1", - "segmenter/dictionary/wl_ext@1", - "--locales=full", - "--format=dir", - "--syntax=postcard", - "--icuexport-root=provider/repodata/data/icuexport", - "--out=components/segmenter/tests/testdata/provider", - "--overwrite", -] - [tasks.bakeddata] description = "Builds full baked data" category = "ICU4X Data"