Skip to content

Commit

Permalink
Fixing datagen features (#3339)
Browse files Browse the repository at this point in the history
* fix

* patch

* clippy
  • Loading branch information
robertbastian committed Apr 17, 2023
1 parent 4aff8bc commit fbed6df
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 83 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions provider/datagen/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
[package]
name = "icu_datagen"
description = "Generate data for ICU4X DataProvider"
version = "1.2.0"
version = "1.2.1"
authors = ["The ICU4X Project Developers"]
edition = "2021"
readme = "README.md"
Expand Down Expand Up @@ -105,7 +105,7 @@ use_wasm = ["icu_codepointtrie_builder/wasm"]
# If neither `use_wasm` nor `use_icu4c` is enabled,
# rule-based segmenter data will not be generated.
use_icu4c = ["icu_codepointtrie_builder/icu4c"]
networking = ["cached-path"]
networking = ["dep:cached-path"]

[[bin]]
name = "icu4x-datagen"
Expand Down
84 changes: 39 additions & 45 deletions provider/datagen/src/source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ use std::collections::HashSet;
use std::fmt::Debug;
use std::io::Cursor;
use std::io::Read;
use std::ops::Deref;
use std::path::Path;
use std::path::PathBuf;
use std::sync::Arc;
Expand Down Expand Up @@ -61,6 +60,9 @@ impl SourceData {
/// The latest `SourceData` that has been verified to work with this version of `icu_datagen`.
///
/// See [`SourceData::LATEST_TESTED_CLDR_TAG`] and [`SourceData::LATEST_TESTED_ICUEXPORT_TAG`].
///
/// Requires the `networking` Cargo feature.
#[cfg(feature = "networking")]
pub fn latest_tested() -> Self {
Self::default()
.with_cldr_for_tag(Self::LATEST_TESTED_CLDR_TAG, Default::default())
Expand Down Expand Up @@ -110,6 +112,9 @@ impl SourceData {
/// using the given tag (see [GitHub releases](https://github.com/unicode-org/cldr-json/releases)).
///
/// Also see: [`LATEST_TESTED_CLDR_TAG`](Self::LATEST_TESTED_CLDR_TAG)
///
/// Requires the `networking` Cargo feature.
#[cfg(feature = "networking")]
pub fn with_cldr_for_tag(
self,
tag: &str,
Expand All @@ -128,6 +133,9 @@ impl SourceData {
/// using the given tag. (see [GitHub releases](https://github.com/unicode-org/icu/releases)).
///
/// Also see: [`LATEST_TESTED_ICUEXPORT_TAG`](Self::LATEST_TESTED_ICUEXPORT_TAG)
///
/// Requires the `networking` Cargo feature.
#[cfg(feature = "networking")]
pub fn with_icuexport_for_tag(self, mut tag: &str) -> Result<Self, DataError> {
if tag == "release-71-1" {
tag = "icu4x/2022-08-17/71.x";
Expand All @@ -147,6 +155,7 @@ impl SourceData {
since = "1.1.0",
note = "Use `with_cldr_for_tag(SourceData::LATEST_TESTED_CLDR_TAG)`"
)]
#[cfg(feature = "networking")]
/// Deprecated
pub fn with_cldr_latest(
self,
Expand All @@ -159,6 +168,7 @@ impl SourceData {
since = "1.1.0",
note = "Use `with_icuexport_for_tag(SourceData::LATEST_TESTED_ICUEXPORT_TAG)`"
)]
#[cfg(feature = "networking")]
/// Deprecated
pub fn with_icuexport_latest(self) -> Result<Self, DataError> {
self.with_icuexport_for_tag(Self::LATEST_TESTED_ICUEXPORT_TAG)
Expand Down Expand Up @@ -239,15 +249,6 @@ pub(crate) enum IcuTrieType {
Small,
}

impl IcuTrieType {
    /// Converts this datagen-side trie type into the
    /// `icu_collections::codepointtrie::TrieType` variant of the same name.
    pub(crate) fn to_internal(self) -> icu_collections::codepointtrie::TrieType {
        use icu_collections::codepointtrie::TrieType;

        // One-to-one mapping; kept exhaustive so a new variant fails to compile here.
        match self {
            Self::Fast => TrieType::Fast,
            Self::Small => TrieType::Small,
        }
    }
}

impl std::fmt::Display for IcuTrieType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
match self {
Expand Down Expand Up @@ -378,48 +379,41 @@ impl AbstractFs {
}
}

/// Builds a `Zip`-backed instance that has not been loaded yet.
///
/// The given string is stored as the `Err` state behind the lock; `init`
/// later hands it to the download cache and replaces it with the opened
/// archive.
#[cfg(feature = "networking")]
fn new_from_url(path: String) -> Self {
    let pending = Err(path);
    Self::Zip(RwLock::new(pending))
}

fn init(&self) -> Result<(), DataError> {
match self {
Self::Zip(lock) => {
if lock.read().expect("poison").is_ok() {
return Ok(());
}
let mut lock = lock.write().expect("poison");
let resource = if let Err(resource) = lock.deref() {
resource
} else {
return Ok(());
};

let root: PathBuf = {
#[cfg(not(feature = "networking"))]
unreachable!("AbstractFs URL mode only possible when using CLDR/ICU tags, which cannot be set without the `networking` feature");

#[cfg(feature = "networking")]
{
lazy_static::lazy_static! {
static ref CACHE: cached_path::Cache = cached_path::CacheBuilder::new()
.freshness_lifetime(u64::MAX)
.progress_bar(None)
.build()
.unwrap();
}

CACHE
.cached_path(resource)
.map_err(|e| DataError::custom("Download").with_display_context(&e))?
}
};
*lock = Ok(ZipArchive::new(Cursor::new(std::fs::read(root)?))
.map_err(|e| DataError::custom("Zip").with_display_context(&e))?);
Ok(())
#[cfg(feature = "networking")]
if let Self::Zip(lock) = self {
if lock.read().expect("poison").is_ok() {
return Ok(());
}
_ => Ok(()),
let mut lock = lock.write().expect("poison");
let resource = if let Err(resource) = &*lock {
resource
} else {
return Ok(());
};

let root = {
lazy_static::lazy_static! {
static ref CACHE: cached_path::Cache = cached_path::CacheBuilder::new()
.freshness_lifetime(u64::MAX)
.progress_bar(None)
.build()
.unwrap();
}

CACHE
.cached_path(resource)
.map_err(|e| DataError::custom("Download").with_display_context(&e))?
};
*lock = Ok(ZipArchive::new(Cursor::new(std::fs::read(root)?))
.map_err(|e| DataError::custom("Zip").with_display_context(&e))?);
}
Ok(())
}

fn read_to_buf(&self, path: &str) -> Result<Vec<u8>, DataError> {
Expand Down
96 changes: 61 additions & 35 deletions provider/datagen/src/transform/segmenter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@

//! This module contains provider implementations backed by built-in segmentation data.

// Some code is gated on icu_codepointtrie_builder features and is unused when they are disabled
#![allow(dead_code)]
#![allow(unused_imports)]

use icu_codepointtrie_builder::{CodePointTrieBuilder, CodePointTrieBuilderData};
use icu_collections::codepointtrie::CodePointTrie;
use icu_locid::{langid, locale};
Expand Down Expand Up @@ -222,6 +226,7 @@ fn is_cjk_fullwidth(eaw: maps::CodePointMapDataBorrowed<EastAsianWidth>, codepoi
}

impl crate::DatagenProvider {
#[cfg(any(feature = "use_wasm", feature = "use_icu4c"))]
fn generate_rule_break_data(
&self,
key: DataKey,
Expand Down Expand Up @@ -583,21 +588,18 @@ impl crate::DatagenProvider {
let complex_property = get_index_from_name(&properties_names, "SA").unwrap_or(127);

// Generate a CodePointTrie from properties_map
let property_trie: CodePointTrie<u8> = {
#[cfg(not(any(feature = "use_wasm", feature = "use_icu4c")))]
return Err(DataError::custom(
"icu_datagen must be built with use_icu4c or use_wasm to build segmenter data",
));

#[cfg(any(feature = "use_wasm", feature = "use_icu4c"))]
CodePointTrieBuilder {
data: CodePointTrieBuilderData::ValuesByCodePoint(&properties_map),
default_value: 0,
error_value: 0,
trie_type: self.source.trie_type().to_internal(),
}
.build()
};
let property_trie: CodePointTrie<u8> = CodePointTrieBuilder {
data: CodePointTrieBuilderData::ValuesByCodePoint(&properties_map),
default_value: 0,
error_value: 0,
trie_type: match self.source.trie_type() {
crate::source::IcuTrieType::Fast => icu_collections::codepointtrie::TrieType::Fast,
crate::source::IcuTrieType::Small => {
icu_collections::codepointtrie::TrieType::Small
}
},
}
.build();

if segmenter.segmenter_type == "line" {
// Note: The following match statement had been used in line.rs:
Expand Down Expand Up @@ -648,12 +650,18 @@ impl crate::DatagenProvider {

impl DataProvider<LineBreakDataV1Marker> for crate::DatagenProvider {
fn load(&self, _req: DataRequest) -> Result<DataResponse<LineBreakDataV1Marker>, DataError> {
let break_data = self.generate_rule_break_data(LineBreakDataV1Marker::KEY)?;

Ok(DataResponse {
#[cfg(not(any(feature = "use_wasm", feature = "use_icu4c")))]
return Err(DataError::custom(
"icu_datagen must be built with use_icu4c or use_wasm to build segmenter/line@1",
)
.with_req(LineBreakDataV1Marker::KEY, _req));
#[cfg(any(feature = "use_wasm", feature = "use_icu4c"))]
return Ok(DataResponse {
metadata: DataResponseMetadata::default(),
payload: Some(DataPayload::from_owned(break_data)),
})
payload: Some(DataPayload::from_owned(
self.generate_rule_break_data(LineBreakDataV1Marker::KEY)?,
)),
});
}
}

Expand All @@ -662,23 +670,35 @@ impl DataProvider<GraphemeClusterBreakDataV1Marker> for crate::DatagenProvider {
&self,
_req: DataRequest,
) -> Result<DataResponse<GraphemeClusterBreakDataV1Marker>, DataError> {
let break_data = self.generate_rule_break_data(GraphemeClusterBreakDataV1Marker::KEY)?;

Ok(DataResponse {
#[cfg(not(any(feature = "use_wasm", feature = "use_icu4c")))]
return Err(DataError::custom(
"icu_datagen must be built with use_icu4c or use_wasm to build segmenter/grapheme@1",
)
.with_req(GraphemeClusterBreakDataV1Marker::KEY, _req));
#[cfg(any(feature = "use_wasm", feature = "use_icu4c"))]
return Ok(DataResponse {
metadata: DataResponseMetadata::default(),
payload: Some(DataPayload::from_owned(break_data)),
})
payload: Some(DataPayload::from_owned(
self.generate_rule_break_data(GraphemeClusterBreakDataV1Marker::KEY)?,
)),
});
}
}

impl DataProvider<WordBreakDataV1Marker> for crate::DatagenProvider {
fn load(&self, _req: DataRequest) -> Result<DataResponse<WordBreakDataV1Marker>, DataError> {
let break_data = self.generate_rule_break_data(WordBreakDataV1Marker::KEY)?;

Ok(DataResponse {
#[cfg(not(any(feature = "use_wasm", feature = "use_icu4c")))]
return Err(DataError::custom(
"icu_datagen must be built with use_icu4c or use_wasm to build segmenter/word@1",
)
.with_req(WordBreakDataV1Marker::KEY, _req));
#[cfg(any(feature = "use_wasm", feature = "use_icu4c"))]
return Ok(DataResponse {
metadata: DataResponseMetadata::default(),
payload: Some(DataPayload::from_owned(break_data)),
})
payload: Some(DataPayload::from_owned(
self.generate_rule_break_data(WordBreakDataV1Marker::KEY)?,
)),
});
}
}

Expand All @@ -687,12 +707,18 @@ impl DataProvider<SentenceBreakDataV1Marker> for crate::DatagenProvider {
&self,
_req: DataRequest,
) -> Result<DataResponse<SentenceBreakDataV1Marker>, DataError> {
let break_data = self.generate_rule_break_data(SentenceBreakDataV1Marker::KEY)?;

Ok(DataResponse {
#[cfg(not(any(feature = "use_wasm", feature = "use_icu4c")))]
return Err(DataError::custom(
"icu_datagen must be built with use_icu4c or use_wasm to build segmenter/sentence@1",
)
.with_req(SentenceBreakDataV1Marker::KEY, _req));
#[cfg(any(feature = "use_wasm", feature = "use_icu4c"))]
return Ok(DataResponse {
metadata: DataResponseMetadata::default(),
payload: Some(DataPayload::from_owned(break_data)),
})
payload: Some(DataPayload::from_owned(
self.generate_rule_break_data(SentenceBreakDataV1Marker::KEY)?,
)),
});
}
}

Expand Down

0 comments on commit fbed6df

Please sign in to comment.