Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

First draft for implementing Local Displayname Algorithm #3587

Merged
merged 14 commits into from
Aug 23, 2023
261 changes: 194 additions & 67 deletions experimental/displaynames/src/displaynames.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,14 @@
use crate::options::*;
use crate::provider::*;
use alloc::borrow::Cow;
use icu_locid::{subtags::Language, subtags::Region, subtags::Script, subtags::Variant, Locale};
use alloc::string::String;
use alloc::vec;
use alloc::vec::Vec;
use icu_locid::{
subtags::Language, subtags::Region, subtags::Script, subtags::Variant, LanguageIdentifier,
Locale,
};
use icu_provider::prelude::*;

/// Lookup of the locale-specific display names by region code.
///
/// # Example
Expand Down Expand Up @@ -110,7 +115,6 @@ pub struct ScriptDisplayNames {
script_data: DataPayload<ScriptDisplayNamesV1Marker>,
}

#[allow(dead_code)] // not public at the moment
impl ScriptDisplayNames {
icu_provider::gen_any_buffer_data_constructors!(
locale: include,
Expand Down Expand Up @@ -188,7 +192,6 @@ pub struct VariantDisplayNames {
variant_data: DataPayload<VariantDisplayNamesV1Marker>,
}

#[allow(dead_code)] // not public at the moment
impl VariantDisplayNames {
icu_provider::gen_any_buffer_data_constructors!(
locale: include,
Expand Down Expand Up @@ -333,9 +336,9 @@ impl LanguageDisplayNames {
/// )
/// .expect("Data should load successfully");
///
/// assert_eq!(display_name.of(&locale!("de-CH")), "Swiss High German");
/// assert_eq!(display_name.of(&locale!("de")), "German");
/// assert_eq!(display_name.of(&locale!("de-MX")), "German (Mexico)");
/// assert_eq!(display_name.of(&locale!("en-GB")), "British English");
/// assert_eq!(display_name.of(&locale!("en")), "English");
/// assert_eq!(display_name.of(&locale!("en-MX")), "English (Mexico)");
/// assert_eq!(display_name.of(&locale!("xx-YY")), "xx (YY)");
/// assert_eq!(display_name.of(&locale!("xx")), "xx");
/// ```
Expand All @@ -345,13 +348,11 @@ pub struct LocaleDisplayNamesFormatter {
locale_data: DataPayload<LocaleDisplayNamesV1Marker>,

language_data: DataPayload<LanguageDisplayNamesV1Marker>,
#[allow(dead_code)] // TODO use this
script_data: DataPayload<ScriptDisplayNamesV1Marker>,
region_data: DataPayload<RegionDisplayNamesV1Marker>,
#[allow(dead_code)] // TODO add support for variants
variant_data: DataPayload<VariantDisplayNamesV1Marker>,
// key_data: DataPayload<KeyDisplayNamesV1Marker>,
// measuerment_data: DataPayload<MeasurementSystemsDisplayNamesV1Marker>,
// measurement_data: DataPayload<MeasurementSystemsDisplayNamesV1Marker>,
// subdivisions_data: DataPayload<SubdivisionsDisplayNamesV1Marker>,
// transforms_data: DataPayload<TransformsDisplayNamesV1Marker>,
}
Expand Down Expand Up @@ -404,90 +405,216 @@ impl LocaleDisplayNamesFormatter {
}

/// Returns the display name of a locale.
/// This implementation is based on the algorithm described in
/// <https://www.unicode.org/reports/tr35/tr35-general.html#locale_display_name_algorithm>
///
// TODO: Make this return a writeable instead of using alloc
pub fn of<'a, 'b: 'a, 'c: 'a>(&'b self, locale: &'c Locale) -> Cow<'a, str> {
// https://www.unicode.org/reports/tr35/tr35-general.html#Display_Name_Elements
let longest_matching_identifier = self.find_longest_matching_subtag(locale);

// Step - 1: Construct a locale display name string (LDN).
// Find the displayname for the longest_matching_subtag which was derived above.
let ldn = self.get_locale_display_name(locale, &longest_matching_identifier);

// Step - 2: Construct a vector of longest qualifying substrings (LQS).
// Find the displayname for the remaining locale if exists.
let lqs = self.get_longest_qualifying_substrings(locale, &longest_matching_identifier);

// Step - 3: Return the displayname based on the size of LQS.
let mut result = Cow::Borrowed(ldn);
#[allow(clippy::indexing_slicing)] // indexes in range
if !lqs.is_empty() {
Comment on lines +425 to +426
Copy link
Member

@sffc sffc Aug 23, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion (optional): you can use split_first to avoid the need to unwrap. split_first returns an Option<&T> with the first and an Option<&[T]> with the remainder.

let mut output = String::with_capacity(
result.len() + " (".len() + lqs.iter().map(|s| ", ".len() + s.len()).sum::<usize>()
- ", ".len()
+ ")".len(),
Comment on lines +428 to +430
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These strings should come from locale data. Could be a follow-up but we definitely need this before this goes stable.

https://github.com/unicode-org/cldr-json/blob/80a94b0f6c3a34d6e2dc0dca8639a54babc87f94/cldr-json/cldr-localenames-full/main/zh/localeDisplayNames.json#L12C1-L14C44

Copy link
Member Author

@snktd snktd Aug 22, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Acknowledged

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

);
output.push_str(&result);
output.push_str(" (");
output.push_str(lqs[0]);
for lqs in &lqs[1..] {
output.push_str(", ");
output.push_str(lqs);
}
output.push(')');
result = Cow::Owned(output);
}
result
}

// TODO: This binary search needs to return the longest matching found prefix
// instead of just perfect matches
if let Some(displayname) = match self.options.style {
Some(Style::Short) => self
/// For a given locale and the data, find the longest prefix of the string that exists as a key in the CLDR locale data.
pub fn find_longest_matching_subtag(&self, locale: &Locale) -> LanguageIdentifier {
Comment on lines +445 to +446
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Observation: this only ever returns one of the following:

  • language-script
  • language-region
  • language

But does the spec allow us to return language-script-region, etc? Consider this in #3913

// NOTE: The subtag ordering of the canonical locale is `language_script_region + variants + extensions`.
// The logic to find the longest matching subtag is based on this ordering.
if let Some(script) = locale.id.script {
Copy link
Member

@robertbastian robertbastian Aug 22, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't you technically have to try all these combinations according to the algorithm?

  • LSRV
  • LSR
  • LSV
  • LRV
  • LS (x)
  • LR (x)
  • LV
  • SR
  • SV
  • RV
  • L (x)
  • S
  • R
  • V

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no explicit mention in the algorithm for what is covered under LocaleDisplayName (LDN). But because we are looking for the longest matching string in the locale and language data, I couldn't find any example in the data which has any other combinations then what is already covered in this implementation. However, we do need to technically support the other combinations as the data may change in future. I think a better way to implement the support for this is to use the subtag iterator.
Sketching the algorithm:

  1. Try matching the entire locale first and return the languageIdentifier if found in the locale data.
  2. If not, remove the last subtag and lookup for the remaining locale string in the locale data, if found then construct the languageIdentifier and return.
  3. Continue step-2 until all the subtags are removed.
  4. Fallback if no match is found.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If what we have is more efficient than the general solution, I'm okay landing this and fixing the algorithm later if these types of cases come up. The way you've written this, I think old-code-new-data should just ignore the new entries, which is fine.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Chopping off subtags will only cover LSRV, LSR, LS, L.

let lang_script_identifier: LanguageIdentifier =
(locale.id.language, Some(script), None).into();
if self
.locale_data
.get()
.short_names
.get_by(|bytes| locale.strict_cmp(bytes).reverse()),
Some(Style::Long) => self
.locale_data
.get()
.long_names
.get_by(|bytes| locale.strict_cmp(bytes).reverse()),
Some(Style::Menu) => self
.locale_data
.get()
.menu_names
.get_by(|bytes| locale.strict_cmp(bytes).reverse()),
_ => None,
}
.or_else(|| {
self.locale_data
.get()
.names
.get_by(|bytes| locale.strict_cmp(bytes).reverse())
}) {
return Cow::Borrowed(displayname);
.get_by(|uvstr| lang_script_identifier.strict_cmp(uvstr).reverse())
.is_some()
{
return lang_script_identifier;
}
}
if let Some(region) = locale.id.region {
if locale.id.script.is_none() {
let lang_region_identifier: LanguageIdentifier =
(locale.id.language, None, Some(region)).into();
if self
.locale_data
.get()
.names
.get_by(|uvstr| lang_region_identifier.strict_cmp(uvstr).reverse())
.is_some()
{
return lang_region_identifier;
}
}
}
(locale.id.language, None, None).into()
}

// TODO: This is a dummy implementation which does not adhere to UTS35. It only uses
// the language and region code, and uses a hardcoded pattern to combine them.
fn get_locale_display_name<'a>(
&'a self,
locale: &'a Locale,
longest_matching_identifier: &LanguageIdentifier,
) -> &'a str {
let LocaleDisplayNamesFormatter {
options,
locale_data,
language_data,
..
} = self;

let langdisplay = match self.options.style {
Some(Style::Short) => self
.language_data
// Check if the key exists in the locale_data first.
// Example: "en_GB", "nl_BE".
let mut ldn = match options.style {
Some(Style::Short) => locale_data
.get()
.short_names
.get(&locale.id.language.into_tinystr().to_unvalidated()),
Some(Style::Long) => self
.language_data
.get_by(|uvstr| longest_matching_identifier.strict_cmp(uvstr).reverse()),
Some(Style::Long) => locale_data
.get()
.long_names
.get(&locale.id.language.into_tinystr().to_unvalidated()),
Some(Style::Menu) => self
.language_data
.get_by(|uvstr| longest_matching_identifier.strict_cmp(uvstr).reverse()),
Some(Style::Menu) => locale_data
.get()
.menu_names
.get(&locale.id.language.into_tinystr().to_unvalidated()),
.get_by(|uvstr| longest_matching_identifier.strict_cmp(uvstr).reverse()),
_ => None,
}
.or_else(|| {
self.language_data
locale_data
.get()
.names
.get(&locale.id.language.into_tinystr().to_unvalidated())
.get_by(|uvstr| longest_matching_identifier.strict_cmp(uvstr).reverse())
});

if let Some(region) = locale.id.region {
let regiondisplay = match self.options.style {
Some(Style::Short) => self
.region_data
.get()
.short_names
.get(&region.into_tinystr().to_unvalidated()),
// At this point the key should exist in the language_data.
// Example: "en", "nl", "zh".
if ldn.is_none() {
ldn = match options.style {
Some(Style::Short) => language_data.get().short_names.get(
&longest_matching_identifier
.language
.into_tinystr()
.to_unvalidated(),
),
Some(Style::Long) => language_data.get().long_names.get(
&longest_matching_identifier
.language
.into_tinystr()
.to_unvalidated(),
),
Some(Style::Menu) => language_data.get().menu_names.get(
&longest_matching_identifier
.language
.into_tinystr()
.to_unvalidated(),
),
_ => None,
}
.or_else(|| {
self.region_data
language_data.get().names.get(
&longest_matching_identifier
.language
.into_tinystr()
.to_unvalidated(),
)
});
}
// Fallback on language subtag in LanguageIdentifier id the key is not found in CLDR data.
return ldn.unwrap_or(locale.id.language.as_str());
}

fn get_longest_qualifying_substrings<'a>(
&'a self,
locale: &'a Locale,
longest_matching_identifier: &'a LanguageIdentifier,
) -> Vec<&'a str> {
let LocaleDisplayNamesFormatter {
options,
region_data,
script_data,
variant_data,
..
} = self;

let mut lqs: Vec<&'a str> = vec![];

if let Some(script) = &locale.id.script {
// Ignore if the script was used to derive LDN.
if longest_matching_identifier.script.is_none() {
let scriptdisplay = match options.style {
Some(Style::Short) => script_data
.get()
.short_names
.get(&script.into_tinystr().to_unvalidated()),
_ => None,
}
.or_else(|| {
script_data
.get()
.names
.get(&script.into_tinystr().to_unvalidated())
});
lqs.push(scriptdisplay.unwrap_or(script.as_str()));
}
}

if let Some(region) = &locale.id.region {
// Ignore if the region was used to derive LDN.
if longest_matching_identifier.region.is_none() {
let regiondisplay = match options.style {
Some(Style::Short) => region_data
.get()
.short_names
.get(&region.into_tinystr().to_unvalidated()),
_ => None,
}
.or_else(|| {
region_data
.get()
.names
.get(&region.into_tinystr().to_unvalidated())
});

lqs.push(regiondisplay.unwrap_or(region.as_str()));
}
}

for variant_key in locale.id.variants.iter() {
lqs.push(
variant_data
.get()
.names
.get(&region.into_tinystr().to_unvalidated())
});
// TODO: Use data patterns
Cow::Owned(alloc::format!(
"{} ({})",
langdisplay.unwrap_or(locale.id.language.as_str()),
regiondisplay.unwrap_or(region.as_str())
))
} else {
Cow::Borrowed(langdisplay.unwrap_or(locale.id.language.as_str()))
.get(&variant_key.into_tinystr().to_unvalidated())
.unwrap_or(variant_key.as_str()),
);
}
lqs
}
}
Loading
Loading