Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixing datagen features #3339

Merged
merged 3 commits into from
Apr 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions provider/datagen/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
[package]
name = "icu_datagen"
description = "Generate data for ICU4X DataProvider"
version = "1.2.0"
version = "1.2.1"
authors = ["The ICU4X Project Developers"]
edition = "2021"
readme = "README.md"
Expand Down Expand Up @@ -105,7 +105,7 @@ use_wasm = ["icu_codepointtrie_builder/wasm"]
# If neither `use_wasm` nor `use_icu4c` is enabled,
# rule-based segmenter data will not be generated.
use_icu4c = ["icu_codepointtrie_builder/icu4c"]
networking = ["cached-path"]
networking = ["dep:cached-path"]

[[bin]]
name = "icu4x-datagen"
Expand Down
84 changes: 39 additions & 45 deletions provider/datagen/src/source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ use std::collections::HashSet;
use std::fmt::Debug;
use std::io::Cursor;
use std::io::Read;
use std::ops::Deref;
use std::path::Path;
use std::path::PathBuf;
use std::sync::Arc;
Expand Down Expand Up @@ -61,6 +60,9 @@ impl SourceData {
/// The latest `SourceData` that has been verified to work with this version of `icu_datagen`.
///
/// See [`SourceData::LATEST_TESTED_CLDR_TAG`] and [`SourceData::LATEST_TESTED_ICUEXPORT_TAG`].
///
/// Requires the `networking` Cargo feature.
#[cfg(feature = "networking")]
pub fn latest_tested() -> Self {
Self::default()
.with_cldr_for_tag(Self::LATEST_TESTED_CLDR_TAG, Default::default())
Expand Down Expand Up @@ -110,6 +112,9 @@ impl SourceData {
/// using the given tag (see [GitHub releases](https://github.com/unicode-org/cldr-json/releases)).
///
/// Also see: [`LATEST_TESTED_CLDR_TAG`](Self::LATEST_TESTED_CLDR_TAG)
///
/// Requires the `networking` Cargo feature.
#[cfg(feature = "networking")]
pub fn with_cldr_for_tag(
self,
tag: &str,
Expand All @@ -128,6 +133,9 @@ impl SourceData {
/// using the given tag. (see [GitHub releases](https://github.com/unicode-org/icu/releases)).
///
/// Also see: [`LATEST_TESTED_ICUEXPORT_TAG`](Self::LATEST_TESTED_ICUEXPORT_TAG)
///
/// Requires the `networking` Cargo feature.
#[cfg(feature = "networking")]
pub fn with_icuexport_for_tag(self, mut tag: &str) -> Result<Self, DataError> {
if tag == "release-71-1" {
tag = "icu4x/2022-08-17/71.x";
Expand All @@ -147,6 +155,7 @@ impl SourceData {
since = "1.1.0",
note = "Use `with_cldr_for_tag(SourceData::LATEST_TESTED_CLDR_TAG)`"
)]
#[cfg(feature = "networking")]
/// Deprecated
pub fn with_cldr_latest(
self,
Expand All @@ -159,6 +168,7 @@ impl SourceData {
since = "1.1.0",
note = "Use `with_icuexport_for_tag(SourceData::LATEST_TESTED_ICUEXPORT_TAG)`"
)]
#[cfg(feature = "networking")]
/// Deprecated
pub fn with_icuexport_latest(self) -> Result<Self, DataError> {
self.with_icuexport_for_tag(Self::LATEST_TESTED_ICUEXPORT_TAG)
Expand Down Expand Up @@ -239,15 +249,6 @@ pub(crate) enum IcuTrieType {
Small,
}

impl IcuTrieType {
/// Converts this datagen-local trie type into the
/// `icu_collections::codepointtrie::TrieType` variant of the same name
/// (`Fast` maps to `Fast`, `Small` maps to `Small`).
pub(crate) fn to_internal(self) -> icu_collections::codepointtrie::TrieType {
match self {
IcuTrieType::Fast => icu_collections::codepointtrie::TrieType::Fast,
IcuTrieType::Small => icu_collections::codepointtrie::TrieType::Small,
}
}
}

impl std::fmt::Display for IcuTrieType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
match self {
Expand Down Expand Up @@ -378,48 +379,41 @@ impl AbstractFs {
}
}

/// Creates a `Zip`-backed `AbstractFs` whose lock initially holds `Err(path)`.
///
/// Nothing is opened here; only `path` is stored.
/// NOTE(review): `path` is presumably a URL, given the function name — confirm against callers.
///
/// Only compiled when the `networking` Cargo feature is enabled.
#[cfg(feature = "networking")]
fn new_from_url(path: String) -> Self {
Self::Zip(RwLock::new(Err(path)))
}

fn init(&self) -> Result<(), DataError> {
match self {
Self::Zip(lock) => {
if lock.read().expect("poison").is_ok() {
return Ok(());
}
let mut lock = lock.write().expect("poison");
let resource = if let Err(resource) = lock.deref() {
resource
} else {
return Ok(());
};

let root: PathBuf = {
#[cfg(not(feature = "networking"))]
unreachable!("AbstractFs URL mode only possible when using CLDR/ICU tags, which cannot be set without the `networking` feature");

#[cfg(feature = "networking")]
{
lazy_static::lazy_static! {
static ref CACHE: cached_path::Cache = cached_path::CacheBuilder::new()
.freshness_lifetime(u64::MAX)
.progress_bar(None)
.build()
.unwrap();
}

CACHE
.cached_path(resource)
.map_err(|e| DataError::custom("Download").with_display_context(&e))?
}
};
*lock = Ok(ZipArchive::new(Cursor::new(std::fs::read(root)?))
.map_err(|e| DataError::custom("Zip").with_display_context(&e))?);
Ok(())
#[cfg(feature = "networking")]
if let Self::Zip(lock) = self {
if lock.read().expect("poison").is_ok() {
return Ok(());
}
_ => Ok(()),
let mut lock = lock.write().expect("poison");
let resource = if let Err(resource) = &*lock {
resource
} else {
return Ok(());
};

let root = {
lazy_static::lazy_static! {
static ref CACHE: cached_path::Cache = cached_path::CacheBuilder::new()
.freshness_lifetime(u64::MAX)
.progress_bar(None)
.build()
.unwrap();
}

CACHE
.cached_path(resource)
.map_err(|e| DataError::custom("Download").with_display_context(&e))?
};
*lock = Ok(ZipArchive::new(Cursor::new(std::fs::read(root)?))
.map_err(|e| DataError::custom("Zip").with_display_context(&e))?);
}
Ok(())
}

fn read_to_buf(&self, path: &str) -> Result<Vec<u8>, DataError> {
Expand Down
96 changes: 61 additions & 35 deletions provider/datagen/src/transform/segmenter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@

//! This module contains provider implementations backed by built-in segmentation data.

// Some code is gated on `icu_codepointtrie_builder` features, so items and imports may be unused in some builds.
#![allow(dead_code)]
#![allow(unused_imports)]

use icu_codepointtrie_builder::{CodePointTrieBuilder, CodePointTrieBuilderData};
use icu_collections::codepointtrie::CodePointTrie;
use icu_locid::{langid, locale};
Expand Down Expand Up @@ -222,6 +226,7 @@ fn is_cjk_fullwidth(eaw: maps::CodePointMapDataBorrowed<EastAsianWidth>, codepoi
}

impl crate::DatagenProvider {
#[cfg(any(feature = "use_wasm", feature = "use_icu4c"))]
fn generate_rule_break_data(
&self,
key: DataKey,
Expand Down Expand Up @@ -583,21 +588,18 @@ impl crate::DatagenProvider {
let complex_property = get_index_from_name(&properties_names, "SA").unwrap_or(127);

// Generate a CodePointTrie from properties_map
let property_trie: CodePointTrie<u8> = {
#[cfg(not(any(feature = "use_wasm", feature = "use_icu4c")))]
return Err(DataError::custom(
"icu_datagen must be built with use_icu4c or use_wasm to build segmenter data",
));

#[cfg(any(feature = "use_wasm", feature = "use_icu4c"))]
CodePointTrieBuilder {
data: CodePointTrieBuilderData::ValuesByCodePoint(&properties_map),
default_value: 0,
error_value: 0,
trie_type: self.source.trie_type().to_internal(),
}
.build()
};
let property_trie: CodePointTrie<u8> = CodePointTrieBuilder {
data: CodePointTrieBuilderData::ValuesByCodePoint(&properties_map),
default_value: 0,
error_value: 0,
trie_type: match self.source.trie_type() {
crate::source::IcuTrieType::Fast => icu_collections::codepointtrie::TrieType::Fast,
crate::source::IcuTrieType::Small => {
icu_collections::codepointtrie::TrieType::Small
}
},
}
.build();

if segmenter.segmenter_type == "line" {
// Note: The following match statement had been used in line.rs:
Expand Down Expand Up @@ -648,12 +650,18 @@ impl crate::DatagenProvider {

impl DataProvider<LineBreakDataV1Marker> for crate::DatagenProvider {
fn load(&self, _req: DataRequest) -> Result<DataResponse<LineBreakDataV1Marker>, DataError> {
let break_data = self.generate_rule_break_data(LineBreakDataV1Marker::KEY)?;

Ok(DataResponse {
#[cfg(not(any(feature = "use_wasm", feature = "use_icu4c")))]
return Err(DataError::custom(
"icu_datagen must be built with use_icu4c or use_wasm to build segmenter/line@1",
)
.with_req(LineBreakDataV1Marker::KEY, _req));
#[cfg(any(feature = "use_wasm", feature = "use_icu4c"))]
return Ok(DataResponse {
metadata: DataResponseMetadata::default(),
payload: Some(DataPayload::from_owned(break_data)),
})
payload: Some(DataPayload::from_owned(
self.generate_rule_break_data(LineBreakDataV1Marker::KEY)?,
)),
});
}
}

Expand All @@ -662,23 +670,35 @@ impl DataProvider<GraphemeClusterBreakDataV1Marker> for crate::DatagenProvider {
&self,
_req: DataRequest,
) -> Result<DataResponse<GraphemeClusterBreakDataV1Marker>, DataError> {
let break_data = self.generate_rule_break_data(GraphemeClusterBreakDataV1Marker::KEY)?;

Ok(DataResponse {
#[cfg(not(any(feature = "use_wasm", feature = "use_icu4c")))]
return Err(DataError::custom(
"icu_datagen must be built with use_icu4c or use_wasm to build segmenter/grapheme@1",
)
.with_req(GraphemeClusterBreakDataV1Marker::KEY, _req));
#[cfg(any(feature = "use_wasm", feature = "use_icu4c"))]
return Ok(DataResponse {
metadata: DataResponseMetadata::default(),
payload: Some(DataPayload::from_owned(break_data)),
})
payload: Some(DataPayload::from_owned(
self.generate_rule_break_data(GraphemeClusterBreakDataV1Marker::KEY)?,
)),
});
}
}

impl DataProvider<WordBreakDataV1Marker> for crate::DatagenProvider {
fn load(&self, _req: DataRequest) -> Result<DataResponse<WordBreakDataV1Marker>, DataError> {
let break_data = self.generate_rule_break_data(WordBreakDataV1Marker::KEY)?;

Ok(DataResponse {
#[cfg(not(any(feature = "use_wasm", feature = "use_icu4c")))]
return Err(DataError::custom(
"icu_datagen must be built with use_icu4c or use_wasm to build segmenter/word@1",
)
.with_req(WordBreakDataV1Marker::KEY, _req));
#[cfg(any(feature = "use_wasm", feature = "use_icu4c"))]
return Ok(DataResponse {
metadata: DataResponseMetadata::default(),
payload: Some(DataPayload::from_owned(break_data)),
})
payload: Some(DataPayload::from_owned(
self.generate_rule_break_data(WordBreakDataV1Marker::KEY)?,
)),
});
}
}

Expand All @@ -687,12 +707,18 @@ impl DataProvider<SentenceBreakDataV1Marker> for crate::DatagenProvider {
&self,
_req: DataRequest,
) -> Result<DataResponse<SentenceBreakDataV1Marker>, DataError> {
let break_data = self.generate_rule_break_data(SentenceBreakDataV1Marker::KEY)?;

Ok(DataResponse {
#[cfg(not(any(feature = "use_wasm", feature = "use_icu4c")))]
return Err(DataError::custom(
"icu_datagen must be built with use_icu4c or use_wasm to build segmenter/sentence@1",
)
.with_req(SentenceBreakDataV1Marker::KEY, _req));
#[cfg(any(feature = "use_wasm", feature = "use_icu4c"))]
return Ok(DataResponse {
metadata: DataResponseMetadata::default(),
payload: Some(DataPayload::from_owned(break_data)),
})
payload: Some(DataPayload::from_owned(
self.generate_rule_break_data(SentenceBreakDataV1Marker::KEY)?,
)),
});
}
}

Expand Down