Skip to content

Commit

Permalink
Putting DatagenProvider behind a feature (#4800)
Browse files Browse the repository at this point in the history
  • Loading branch information
robertbastian committed Apr 17, 2024
1 parent 6df291f commit 2ac5c2b
Show file tree
Hide file tree
Showing 104 changed files with 1,990 additions and 1,855 deletions.
26 changes: 13 additions & 13 deletions ffi/dart/tools/datagen/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,21 +29,21 @@ serde = "1"
simple_logger = { version = "4.1.0", default-features = false }

# These are required to make the make_exportable_provider macro work
icu_calendar.workspace = true
icu_casemap.workspace = true
icu_collator.workspace = true
icu_datetime.workspace = true
icu_decimal.workspace = true
icu_list.workspace = true
icu_locid_transform.workspace = true
icu_normalizer.workspace = true
icu_plurals.workspace = true
icu_properties.workspace = true
icu_segmenter.workspace = true
icu_timezone.workspace = true
icu_calendar = { workspace = true, features = ["datagen"] }
icu_casemap = { workspace = true, features = ["datagen"] }
icu_collator = { workspace = true, features = ["datagen"] }
icu_datetime = { workspace = true, features = ["datagen"] }
icu_decimal = { workspace = true, features = ["datagen"] }
icu_list = { workspace = true, features = ["datagen"] }
icu_locid_transform = { workspace = true, features = ["datagen"] }
icu_normalizer = { workspace = true, features = ["datagen"] }
icu_plurals = { workspace = true, features = ["datagen"] }
icu_properties = { workspace = true, features = ["datagen"] }
icu_segmenter = { workspace = true, features = ["datagen"] }
icu_timezone = { workspace = true, features = ["datagen"] }

[build-dependencies]
icu_datagen = { workspace = true, features = ["networking", "use_wasm"] }
icu_datagen = { workspace = true, features = ["networking", "use_wasm", "provider"] }
icu_provider_blob = { workspace = true, features = ["export"] }
log = "0.4"
simple_logger = { version = "4.1.0", default-features = false }
169 changes: 117 additions & 52 deletions provider/datagen/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,36 +32,16 @@ version = "1.4.1"
all-features = true

[dependencies]
# ICU components
icu_calendar = { workspace = true, features = ["datagen"] }
icu_casemap = { workspace = true, features = ["datagen"] }
icu_collator = { workspace = true, features = ["datagen"] }
icu_datetime = { workspace = true, features = ["datagen"] }
icu_decimal = { workspace = true, features = ["datagen"] }
icu_list = { workspace = true, features = ["datagen"]}
icu_locid_transform = { workspace = true, features = ["datagen"] }
icu_normalizer = { workspace = true, features = ["datagen"] }
icu_pattern = { workspace = true, features = ["alloc"] }
icu_plurals = { workspace = true, features = ["datagen"] }
icu_properties = { workspace = true, features = ["datagen"]}
icu_segmenter = { workspace = true, features = ["datagen", "lstm"] }
icu_timezone = { workspace = true, features = ["datagen"] }
icu_experimental = { workspace = true, features = ["datagen"], optional = true }

num-bigint = { version = "0.4.4", default-features = false, optional = true }
num-rational = { version = "0.4", default-features = false, optional = true }

# ICU infrastructure
calendrical_calculations = { workspace = true }
icu_codepointtrie_builder = { workspace = true }
icu_collections = { workspace = true, features = ["serde"] }
icu_locid = { workspace = true, features = ["std", "serde"] }
# DatagenDriver
displaydoc = { version = "0.2.3", default-features = false }
icu_locid = { workspace = true, features = ["std"] }
icu_provider = { workspace = true, features = ["std", "logging", "datagen", "experimental"]}
icu_provider_adapters = { workspace = true }
tinystr = { workspace = true, features = ["alloc", "serde", "zerovec"] }
log = { version = "0.4" }
memchr = "2.5.0"
once_cell = "1"
rayon = { version = "1.5", optional = true }
writeable = { workspace = true }
zerotrie = { workspace = true, features = ["alloc"] }
zerovec = { workspace = true, features = ["serde", "yoke"] }

# Exporters
icu_provider_blob = { workspace = true, features = ["export"], optional = true }
Expand All @@ -70,54 +50,139 @@ crlify = { workspace = true, optional = true }
databake = { workspace = true, optional = true}
proc-macro2 = {version = "1", optional = true }

# Other external dependencies
displaydoc = { version = "0.2.3", default-features = false }
either = { workspace = true }
elsa = "1.10"
itertools = "0.10"
log = "0.4"
memchr = "2.5.0"
once_cell = "1"
ndarray = { version = "0.15.5", default-features = false }
serde = { version = "1.0", default-features = false, features = ["derive", "alloc"] }
serde_json = { version = "1.0", default-features = false, features = ["alloc"] }
serde-aux = { version = "4.1.2", default-features = false }
toml = "0.5"
twox-hash = { version = "1.6", default-features = false }
zip = { version = ">=0.5, <0.7", default-features = false, features = ["deflate"] }
# The components are needed by the registry, the provider, and the baked exporter, each requiring different features
icu_calendar = { workspace = true }
icu_casemap = { workspace = true }
icu_collator = { workspace = true }
icu_datetime = { workspace = true }
icu_decimal = { workspace = true }
icu_list = { workspace = true }
icu_locid_transform = { workspace = true }
icu_normalizer = { workspace = true }
icu_plurals = { workspace = true }
icu_properties = { workspace = true }
icu_segmenter = { workspace = true }
icu_timezone = { workspace = true }
icu_experimental = { workspace = true, optional = true }

rayon = { version = "1.5", optional = true }
ureq = { version = "2", optional = true }
# DatagenProvider

## ICU infrastructure
calendrical_calculations = { workspace = true, optional = true }
icu_codepointtrie_builder = { workspace = true, optional = true }
icu_collections = { workspace = true, features = ["serde"], optional = true }
icu_pattern = { workspace = true, features = ["alloc"], optional = true }
icu_provider_adapters = { workspace = true, optional = true }
tinystr = { workspace = true, features = ["alloc", "serde", "zerovec"], optional = true }
zerotrie = { workspace = true, features = ["alloc"], optional = true }
zerovec = { workspace = true, features = ["serde", "yoke"], optional = true }

# Dependencies for "bin" feature
## External dependencies
either = { workspace = true, optional = true }
elsa = { version = "1.10", optional = true }
itertools = { version = "0.10", optional = true }
ndarray = { version = "0.15.5", default-features = false, optional = true }
num-bigint = { version = "0.4.4", default-features = false, optional = true }
num-rational = { version = "0.4", default-features = false, optional = true }
serde = { version = "1.0", default-features = false, features = ["derive", "alloc"], optional = true }
serde_json = { version = "1.0", default-features = false, features = ["alloc"], optional = true }
serde-aux = { version = "4.1.2", default-features = false, optional = true }
toml = { version = "0.5", optional = true }
twox-hash = { version = "1.6", default-features = false, optional = true }
ureq = { version = "2", optional = true }
zip = { version = ">=0.5, <0.7", default-features = false, features = ["deflate"], optional = true }

# "bin" feature
clap = { version = "4", optional = true, features = ["derive"] }
eyre = { version = "0.6", optional = true }
simple_logger = { version = "4.1.0", default-features = false, optional = true }

[dev-dependencies]
crlify = { path = "../../utils/crlify" }
elsa = { version = "1.10" }
icu = { path = "../../components/icu" }
postcard = "1"
simple_logger = { version = "4.1.0", default-features = false }

[features]
default = ["bin", "use_wasm", "networking", "legacy_api", "rayon", "fs_exporter", "blob_exporter", "baked_exporter"]
baked_exporter = ["dep:crlify", "dep:databake", "dep:proc-macro2"]
default = ["bin", "use_wasm", "networking", "legacy_api", "rayon", "fs_exporter", "blob_exporter", "baked_exporter", "provider"]
provider = [
"icu_calendar/datagen",
"icu_casemap/datagen",
"icu_collator/datagen",
"icu_datetime/datagen",
"icu_decimal/datagen",
"icu_experimental?/datagen",
"icu_list/datagen",
"icu_locid_transform/datagen",
"icu_locid/serde",
"icu_normalizer/datagen",
"icu_plurals/datagen",
"icu_properties/datagen",
"icu_segmenter/datagen",
"icu_segmenter/lstm",
"icu_timezone/datagen",
"dep:calendrical_calculations",
"dep:icu_codepointtrie_builder",
"dep:icu_collections",
"dep:icu_pattern",
"dep:icu_provider_adapters",
"dep:tinystr",
"dep:zerotrie",
"dep:zerovec",
"dep:either",
"dep:elsa",
"dep:itertools",
"dep:ndarray",
"dep:serde",
"dep:serde_json",
"dep:serde-aux",
"dep:toml",
"dep:twox-hash",
"dep:zip",
]
baked_exporter = [
"dep:crlify",
"dep:databake",
"dep:proc-macro2",
"dep:itertools",
"icu_calendar/datagen",
"icu_casemap/datagen",
"icu_collator/datagen",
"icu_datetime/datagen",
"icu_decimal/datagen",
"icu_list/datagen",
"icu_locid_transform/datagen",
"icu_normalizer/datagen",
"icu_plurals/datagen",
"icu_properties/datagen",
"icu_segmenter/datagen",
"icu_timezone/datagen",
"icu_experimental?/datagen",
]
blob_exporter = ["dep:icu_provider_blob"]
fs_exporter = ["dep:icu_provider_fs"]
legacy_api = ["fs_exporter", "blob_exporter", "baked_exporter"]
bin = ["dep:clap", "dep:eyre", "dep:simple_logger"]
legacy_api = ["fs_exporter", "blob_exporter", "baked_exporter", "provider"]
bin = ["dep:clap", "dep:eyre", "dep:simple_logger", "provider"]
rayon = ["dep:rayon"]
# Use wasm for building codepointtries
use_wasm = ["icu_codepointtrie_builder/wasm"]
use_wasm = ["icu_codepointtrie_builder?/wasm"]
# Use local ICU4C libraries for building codepointtries
# (will do nothing if used with `use_wasm`)
# If neither `use_wasm` nor `use_icu4c` is enabled,
# rule-based segmenter data will not be generated.
use_icu4c = ["icu_codepointtrie_builder/icu4c"]
use_icu4c = ["icu_codepointtrie_builder?/icu4c"]
networking = ["dep:ureq"]
experimental_components = ["dep:icu_experimental", "dep:num-bigint", "dep:num-rational"]
experimental_components = [
"dep:icu_experimental",
# For registry
"icu_datetime/experimental",
# For registry
"icu_plurals/experimental",
# Only required when both `provider` and `experimental_components` are enabled, but Cargo features cannot express that conjunction
"dep:num-bigint",
"dep:num-rational"
]

[[bin]]
name = "icu4x-datagen"
Expand Down
53 changes: 9 additions & 44 deletions provider/datagen/src/driver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ pub struct NoFallbackOptions {}
/// the locale fallback algorithm. If internal fallback is requested for an exporter that does
/// not support it, an error will occur.
#[non_exhaustive]
#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize, Hash)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum RuntimeFallbackLocation {
/// Include fallbacking code in the exported data provider.
Internal,
Expand Down Expand Up @@ -64,7 +64,7 @@ pub enum RuntimeFallbackLocation {
/// [`Maximal`]: DeduplicationStrategy::Maximal
/// [`None`]: DeduplicationStrategy::None
#[non_exhaustive]
#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize, Hash)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum DeduplicationStrategy {
/// Removes from the lookup table any locale whose parent maps to the same data.
Maximal,
Expand Down Expand Up @@ -239,27 +239,6 @@ impl FromStr for LocaleFamily {
}
}

impl serde::Serialize for LocaleFamily {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
self.write_to_string().serialize(serializer)
}
}

impl<'de> serde::Deserialize<'de> for LocaleFamily {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
use serde::de::Error;
<&str>::deserialize(deserializer)?
.parse()
.map_err(D::Error::custom)
}
}

#[test]
fn test_locale_family_parsing() {
let valid_families = ["und", "de-CH", "^es", "@pt-BR", "full"];
Expand All @@ -268,9 +247,6 @@ fn test_locale_family_parsing() {
let family = family_str.parse::<LocaleFamily>().unwrap();
let family_to_str = family.to_string();
assert_eq!(family_str, family_to_str);
let family_json = serde_json::to_string(&family).unwrap();
let family_from_json = serde_json::from_str(&family_json).unwrap();
assert_eq!(family, family_from_json);
}
for family_str in invalid_families {
assert!(family_str.parse::<LocaleFamily>().is_err());
Expand Down Expand Up @@ -972,36 +948,27 @@ fn select_locales_for_key(
};
}

if key == icu_segmenter::provider::DictionaryForWordOnlyAutoV1Marker::KEY
|| key == icu_segmenter::provider::DictionaryForWordLineExtendedV1Marker::KEY
{
if key.path().get().starts_with("segmenter/dictionary/") {
supported_map.retain(|_, locales| {
locales.retain(|locale| {
let model =
crate::transform::segmenter::dictionary::data_locale_to_model_name(locale);
let model = crate::dictionary_data_locale_to_model_name(locale);
segmenter_models.iter().any(|m| Some(m.as_ref()) == model)
});
!locales.is_empty()
});
// Don't perform additional locale filtering
return Ok(supported_map.into_values().flatten().collect());
} else if key == icu_segmenter::provider::LstmForWordLineAutoV1Marker::KEY {
} else if key.path().get().starts_with("segmenter/lstm/") {
supported_map.retain(|_, locales| {
locales.retain(|locale| {
let model = crate::transform::segmenter::lstm::data_locale_to_model_name(locale);
let model = crate::lstm_data_locale_to_model_name(locale);
segmenter_models.iter().any(|m| Some(m.as_ref()) == model)
});
!locales.is_empty()
});
// Don't perform additional locale filtering
return Ok(supported_map.into_values().flatten().collect());
} else if key == icu_collator::provider::CollationDataV1Marker::KEY
|| key == icu_collator::provider::CollationDiacriticsV1Marker::KEY
|| key == icu_collator::provider::CollationJamoV1Marker::KEY
|| key == icu_collator::provider::CollationMetadataV1Marker::KEY
|| key == icu_collator::provider::CollationReorderingV1Marker::KEY
|| key == icu_collator::provider::CollationSpecialPrimariesV1Marker::KEY
{
} else if key.path().get().starts_with("collator/") {
supported_map.retain(|_, locales| {
locales.retain(|locale| {
let Some(collation) = locale
Expand Down Expand Up @@ -1083,9 +1050,7 @@ fn select_locales_for_key(
}
// Special case: skeletons *require* the -u-ca keyword, so don't export locales that don't have it
// This would get caught later on, but it makes datagen faster and quieter to catch it here
if key == icu_datetime::provider::calendar::DateSkeletonPatternsV1Marker::KEY
&& !locale.has_unicode_ext()
{
if key.path().get() == "datetime/skeletons@1" && !locale.has_unicode_ext() {
return false;
}
let mut iter = fallbacker_with_config.fallback_for(locale);
Expand Down Expand Up @@ -1198,7 +1163,7 @@ fn test_collation_filtering() {
];
for cas in cases {
let resolved_locales = select_locales_for_key(
&crate::DatagenProvider::new_testing(),
&crate::provider::DatagenProvider::new_testing(),
icu_collator::provider::CollationDataV1Marker::KEY,
&LocalesWithOrWithoutFallback::WithoutFallback {
locales: [cas.language.clone()].into_iter().collect(),
Expand Down
Loading

0 comments on commit 2ac5c2b

Please sign in to comment.