Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Datagen: Consume CLDR-JSON resources keyed with default script #3772

Merged
merged 24 commits into from
Aug 4, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions provider/datagen/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ itertools = "0.10"
lazy_static = "1"
log = "0.4"
memchr = "2.5.0"
once_cell = "1"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Manishearth to approve the new dependency. (We now have multiple places where it is useful)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd prefer not to have two lazy-initialization dependencies; we should use either lazy_static everywhere or once_cell everywhere.

Slight preference for the latter, since Rust 1.70 has it in std.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Approved, with #3777 blocking the 1.3 release (I'm fine with this landing first).

ndarray = { version = "0.15.5", default-features = false }
serde = { version = "1.0", default-features = false, features = ["derive", "alloc"] }
serde_json = { version = "1.0", default-features = false, features = ["alloc"] }
Expand Down
8 changes: 3 additions & 5 deletions provider/datagen/src/source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,7 @@ impl SourceData {
) -> Result<Self, DataError> {
let root = AbstractFs::new(root)?;
Ok(Self {
cldr_paths: Some(Arc::new(CldrCache::try_from_serde_cache(SerdeCache::new(
root,
))?)),
cldr_paths: Some(Arc::new(CldrCache::from_serde_cache(SerdeCache::new(root)))),
..self
})
}
Expand Down Expand Up @@ -132,9 +130,9 @@ impl SourceData {
_use_default_here: crate::CldrLocaleSubset,
) -> Result<Self, DataError> {
Ok(Self {
cldr_paths: Some(Arc::new(CldrCache::try_from_serde_cache(SerdeCache::new(AbstractFs::new_from_url(format!(
cldr_paths: Some(Arc::new(CldrCache::from_serde_cache(SerdeCache::new(AbstractFs::new_from_url(format!(
"https://github.com/unicode-org/cldr-json/releases/download/{tag}/cldr-{tag}-json-full.zip",
))))?
))))
)),
..self
})
Expand Down
134 changes: 72 additions & 62 deletions provider/datagen/src/transform/cldr/source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

#![allow(dead_code)] // features

use super::cldr_serde;
use super::locale_canonicalizer::likely_subtags::LikelySubtagsResources;
use crate::source::SerdeCache;
use icu_locid::LanguageIdentifier;
Expand All @@ -15,9 +14,9 @@ use icu_provider::prelude::*;
use icu_provider::DataError;
use icu_provider_adapters::any_payload::AnyPayloadProvider;
use icu_provider_adapters::fork::ForkByKeyProvider;
use once_cell::sync::OnceCell;
use std::fmt::Debug;
use std::str::FromStr;
use std::sync::RwLock;

/// A language's CLDR coverage level.
#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize, Hash)]
Expand All @@ -43,31 +42,17 @@ pub enum CoverageLevel {
#[derive(Debug)]
pub(crate) struct CldrCache {
serde_cache: SerdeCache,
is_full: RwLock<Option<bool>>,
locale_expander: LocaleExpander,
dir_suffix: OnceCell<&'static str>,
locale_expander: OnceCell<LocaleExpander>,
}

impl CldrCache {
pub fn try_from_serde_cache(serde_cache: SerdeCache) -> Result<Self, DataError> {
let likely_subtags: &cldr_serde::likely_subtags::Resource =
serde_cache.read_and_parse_json("cldr-core/supplemental/likelySubtags.json")?;
let coverage_levels: &cldr_serde::coverage_levels::Resource =
serde_cache.read_and_parse_json("cldr-core/coverageLevels.json")?;
let resources = LikelySubtagsResources::from_resources(likely_subtags, coverage_levels);
let data = super::locale_canonicalizer::likely_subtags::transform(resources.get_common());
let provider = ForkByKeyProvider::new(
AnyPayloadProvider::from_owned::<LikelySubtagsForLanguageV1Marker>(data.clone().into()),
AnyPayloadProvider::from_owned::<LikelySubtagsForScriptRegionV1Marker>(data.into()),
);
let locale_expander =
LocaleExpander::try_new_with_any_provider(&provider).map_err(|e| {
DataError::custom("creating LocaleExpander in CldrCache").with_display_context(&e)
})?;
Ok(CldrCache {
pub fn from_serde_cache(serde_cache: SerdeCache) -> Self {
CldrCache {
serde_cache,
is_full: Default::default(),
locale_expander,
})
dir_suffix: Default::default(),
locale_expander: Default::default(),
}
}

pub fn core(&self) -> CldrDirNoLang<'_> {
Expand Down Expand Up @@ -118,51 +103,74 @@ impl CldrCache {
}

fn dir_suffix(&self) -> Result<&'static str, DataError> {
let maybe_is_full = *self.is_full.read().expect("poison");
let is_full = match maybe_is_full {
Some(x) => x,
None => {
let is_full = self.serde_cache.list("cldr-misc-full")?.next().is_some();
let _ = self.is_full.write().expect("poison").insert(is_full);
is_full
}
};
if is_full {
Ok("full")
} else {
Ok("modern")
}
self.dir_suffix
.get_or_try_init(|| {
if self.serde_cache.list("cldr-misc-full")?.next().is_some() {
Ok("full")
} else {
Ok("modern")
}
})
.copied()
}

fn locale_expander(&self) -> Result<&LocaleExpander, DataError> {
self.locale_expander.get_or_try_init(|| {
let resources = LikelySubtagsResources::from_resources(
self.serde_cache
.read_and_parse_json("cldr-core/supplemental/likelySubtags.json")?,
self.serde_cache
.read_and_parse_json("cldr-core/coverageLevels.json")?,
);
let data =
super::locale_canonicalizer::likely_subtags::transform(resources.get_common());
let provider = ForkByKeyProvider::new(
AnyPayloadProvider::from_owned::<LikelySubtagsForLanguageV1Marker>(
data.clone().into(),
),
AnyPayloadProvider::from_owned::<LikelySubtagsForScriptRegionV1Marker>(data.into()),
);
LocaleExpander::try_new_with_any_provider(&provider).map_err(|e| {
DataError::custom("creating LocaleExpander in CldrCache").with_display_context(&e)
})
})
}

/// CLDR sometimes stores regional variants with their script.
/// Add in the likely subtags here to make that data reachable.
fn add_script(&self, langid: &LanguageIdentifier) -> Option<LanguageIdentifier> {
fn add_script(
&self,
langid: &LanguageIdentifier,
) -> Result<Option<LanguageIdentifier>, DataError> {
if langid.language.is_empty() || langid.script.is_some() || langid.region.is_none() {
return None;
return Ok(None);
}
let mut langid = langid.clone();
self.locale_expander.maximize(&mut langid);
self.locale_expander()?.maximize(&mut langid);
debug_assert!(langid.script.is_some());
Some(langid)
Ok(Some(langid))
}

/// ICU4X does not store regional variants with their script
/// if the script is the default for the language.
/// Perform that normalization mapping here.
fn remove_script(&self, langid: &LanguageIdentifier) -> Option<LanguageIdentifier> {
fn remove_script(
&self,
langid: &LanguageIdentifier,
) -> Result<Option<LanguageIdentifier>, DataError> {
if langid.language.is_empty() || langid.script.is_none() || langid.region.is_none() {
return None;
return Ok(None);
}
let region = langid.region;
let mut langid = langid.clone();
self.locale_expander.minimize(&mut langid);
self.locale_expander()?.minimize(&mut langid);
if langid.script.is_some() {
// Wasn't able to minimize the script
return None;
return Ok(None);
}
// Restore the region
langid.region = region;
Some(langid)
Ok(Some(langid))
}
}

Expand Down Expand Up @@ -194,24 +202,28 @@ impl<'a> CldrDirLang<'a> {
let path = format!("{}-{dir_suffix}/main/{lang}/{file_name}", self.1);
if self.0.serde_cache.file_exists(&path)? {
self.0.serde_cache.read_and_parse_json(&path)
} else if let Some(new_langid) = self.0.add_script(lang)? {
self.read_and_parse(&new_langid, file_name)
} else {
if let Some(new_langid) = self.0.add_script(lang) {
self.read_and_parse(&new_langid, file_name)
} else {
Err(DataErrorKind::Io(std::io::ErrorKind::NotFound)
.into_error()
.with_display_context(&path))
}
Err(DataErrorKind::Io(std::io::ErrorKind::NotFound)
.into_error()
.with_display_context(&path))
}
}

pub fn list_langs(&self) -> Result<impl Iterator<Item = LanguageIdentifier> + '_, DataError> {
let dir_suffix = self.0.dir_suffix()?;
let path = format!("{}-{dir_suffix}/main", self.1);
Ok(self.0.serde_cache.list(&path)?.map(|path| {
let langid = LanguageIdentifier::from_str(&path).unwrap();
self.0.remove_script(&langid).unwrap_or(langid)
}))
Ok(self
.0
.serde_cache
.list(&path)?
.map(|path| -> Result<LanguageIdentifier, DataError> {
let langid = LanguageIdentifier::from_str(&path).unwrap();
Ok(self.0.remove_script(&langid)?.unwrap_or(langid))
})
.collect::<Result<Vec<_>, _>>()?
.into_iter())
}

pub fn file_exists(
Expand All @@ -223,12 +235,10 @@ impl<'a> CldrDirLang<'a> {
let path = format!("{}-{dir_suffix}/main/{lang}/{file_name}", self.1);
if self.0.serde_cache.file_exists(&path)? {
Ok(true)
} else if let Some(new_langid) = self.0.add_script(lang)? {
self.file_exists(&new_langid, file_name)
} else {
if let Some(new_langid) = self.0.add_script(lang) {
self.file_exists(&new_langid, file_name)
} else {
Ok(false)
}
Ok(false)
}
}
}
Loading