Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify Language representation #1695

Merged
merged 7 commits into from
Mar 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 16 additions & 6 deletions components/locale_canonicalizer/src/locale_canonicalizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -165,8 +165,8 @@ fn uts35_check_language_rules(
locale: &mut Locale,
alias_data: &DataPayload<AliasesV1Marker>,
) -> CanonicalizationResult {
let maybe_lang: Option<TinyAsciiStr<3>> = locale.id.language.into();
if let Some(lang) = maybe_lang {
if !locale.id.language.is_empty() {
let lang: TinyAsciiStr<3> = locale.id.language.into();
let replacement = if lang.len() == 2 {
alias_data
.get()
Expand Down Expand Up @@ -557,14 +557,24 @@ impl LocaleCanonicalizer {
return CanonicalizationResult::Unmodified;
}

if let Some(language) = langid.language.into() {
if !langid.language.is_empty() {
if let Some(region) = langid.region {
maximize_locale!(langid, data.language_region, language, region.into());
maximize_locale!(
langid,
data.language_region,
langid.language.into(),
region.into()
);
}
if let Some(script) = langid.script {
maximize_locale!(langid, data.language_script, language, script.into());
maximize_locale!(
langid,
data.language_script,
langid.language.into(),
script.into()
);
}
maximize_locale!(langid, data.language, language);
maximize_locale!(langid, data.language, langid.language.into());
}
if let Some(script) = langid.script {
if let Some(region) = langid.region {
Expand Down
54 changes: 29 additions & 25 deletions components/locid/src/subtags/language.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use crate::parser::errors::ParserError;
use core::fmt;
use core::ops::RangeInclusive;
use core::str::FromStr;
use tinystr::{tinystr, TinyAsciiStr};
use tinystr::TinyAsciiStr;

/// A language subtag (examples: `"en"`, `"csb"`, `"zh"`, `"und"`, etc.)
///
Expand Down Expand Up @@ -38,11 +38,13 @@ use tinystr::{tinystr, TinyAsciiStr};
/// but that form has not been used and ICU4X does not support it right now.
///
/// [`unicode_language_id`]: https://unicode.org/reports/tr35/#unicode_language_id
#[derive(Default, Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord, Copy)]
pub struct Language(Option<TinyAsciiStr<{ *LANGUAGE_LENGTH.end() }>>);
#[derive(Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord, Copy)]
pub struct Language(TinyAsciiStr<{ *LANGUAGE_LENGTH.end() }>);

const LANGUAGE_LENGTH: RangeInclusive<usize> = 2..=3;
const UND_VALUE: TinyAsciiStr<3> = tinystr!(3, "und");
// TODO(#348): Change this to invoke a const function.
// Safe because "und" is a valid language subtag
const UND: Language = Language(unsafe { TinyAsciiStr::from_bytes_unchecked(*b"und") });
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could be `lang!("und")` after #1631. @zbraniecki please take a look at that.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll add a TODO(#348) here, and you can fix it in your PR (which it looks like you need to rebase anyway).


impl Language {
/// A constructor which takes a utf8 slice, parses it and
Expand Down Expand Up @@ -73,11 +75,7 @@ impl Language {

let value = s.to_ascii_lowercase();

if value == UND_VALUE {
Ok(Self(None))
} else {
Ok(Self(Some(value)))
}
Ok(Self(value))
}

/// Deconstructs the [`Language`] into raw format to be consumed
Expand All @@ -95,8 +93,8 @@ impl Language {
/// let lang = unsafe { Language::from_raw_unchecked(raw) };
/// assert_eq!(lang, "en");
/// ```
pub fn into_raw(self) -> Option<[u8; 3]> {
self.0.as_ref().map(TinyAsciiStr::all_bytes).copied()
pub fn into_raw(self) -> [u8; 3] {
*self.0.all_bytes()
}

/// Constructor which takes a raw value returned by
Expand All @@ -119,11 +117,8 @@ impl Language {
///
/// This function accepts a [`[u8; 3]`] that is expected to be a valid [`TinyAsciiStr<3>`]
/// representing a [`Language`] subtag in canonical syntax.
pub const unsafe fn from_raw_unchecked(v: Option<[u8; 3]>) -> Self {
Self(match v {
Some(v) => Some(TinyAsciiStr::from_bytes_unchecked(v)),
None => None,
})
pub const unsafe fn from_raw_unchecked(v: [u8; 3]) -> Self {
Self(TinyAsciiStr::from_bytes_unchecked(v))
}

/// Returns the default undefined language "und". Same as [`default()`](Default::default()), but is `const`.
Expand All @@ -139,7 +134,7 @@ impl Language {
/// ```
#[inline]
pub const fn und() -> Self {
Self(None)
UND
}

/// A helper function for displaying
Expand All @@ -158,11 +153,12 @@ impl Language {
///
/// `Notice`: For many use cases, such as comparison,
/// [`Language`] implements [`PartialEq`]`<&`[`str`]`>` which allows for direct comparisons.
#[inline]
pub fn as_str(&self) -> &str {
self.0.as_deref().unwrap_or("und")
self.0.as_str()
}

/// Resets the [`Language`] subtag to an empty one.
/// Resets the [`Language`] subtag to an empty one (equal to `"und"`).
///
/// # Examples
///
Expand All @@ -178,11 +174,12 @@ impl Language {
///
/// assert_eq!(lang.as_str(), "und");
/// ```
#[inline]
pub fn clear(&mut self) {
self.0.take();
*self = UND
}

/// Tests if the [`Language`] subtag is empty.
/// Tests if the [`Language`] subtag is empty (equal to `"und"`).
///
/// # Examples
///
Expand All @@ -198,8 +195,9 @@ impl Language {
///
/// assert_eq!(lang.is_empty(), true);
/// ```
#[inline]
pub fn is_empty(self) -> bool {
self.0.is_none()
self == UND
}
}

Expand All @@ -224,7 +222,7 @@ impl writeable::Writeable for Language {

#[inline]
fn write_len(&self) -> writeable::LengthHint {
writeable::LengthHint::exact(self.0.map_or(3, |t| t.len()))
writeable::LengthHint::exact(self.0.len())
}
}

Expand All @@ -247,8 +245,14 @@ impl<'l> From<&'l Language> for &'l str {
}
}

impl From<Language> for Option<TinyAsciiStr<3>> {
impl From<Language> for TinyAsciiStr<3> {
fn from(input: Language) -> Self {
input.0.map(Into::into)
input.0
}
}

impl Default for Language {
fn default() -> Language {
Language::und()
}
}
10 changes: 5 additions & 5 deletions provider/cldr/src/transform/locale_canonicalizer/aliases.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,8 @@ impl From<&cldr_serde::aliases::Resource> for AliasesV1 {
continue;
}

let maybe_lang: Option<TinyAsciiStr<3>> = langid.language.into();
if let Some(lang) = maybe_lang {
if !langid.language.is_empty() {
let lang: TinyAsciiStr<3> = langid.language.into();
if langid.region.is_none() && langid.variants.is_empty() {
// Relatively few aliases exist for two character language identifiers,
// so we store them separately to not slow down canonicalization of
Expand Down Expand Up @@ -275,9 +275,9 @@ fn test_rules_cmp() {
assert_eq!(union_size(&rules[3]), 2);

rules.sort_unstable_by(rules_cmp);
assert_eq!(rules[0], "und-hepburn-heploc");
assert_eq!(rules[1], "en-GB");
assert_eq!(rules[2], "fr-CA");
assert_eq!(rules[0], "en-GB");
assert_eq!(rules[1], "fr-CA");
assert_eq!(rules[2], "und-hepburn-heploc");
assert_eq!(rules[3], "CA");
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ impl From<&cldr_serde::likely_subtags::Resource> for LikelySubtagsV1 {
};

for entry in other.supplemental.likely_subtags.iter() {
if let Some(lang) = entry.0.language.into() {
if !entry.0.language.is_empty() {
let lang = entry.0.language.into();
if let Some(script) = entry.0.script {
language_script.insert((lang, script.into()), extract_result(entry));
} else if let Some(region) = entry.0.region {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
{
"language": [],
"language_variants": [
[
"und-hepburn-heploc",
"und-alalc97"
],
[
"aa-saaho",
"ssy"
Expand All @@ -29,6 +25,10 @@
"no-nynorsk",
"nn"
],
[
"und-hepburn-heploc",
"und-alalc97"
],
[
"zh-guoyu",
"zh"
Expand Down
Binary file modified provider/testdata/data/testdata.postcard
Binary file not shown.