Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

First draft for implementing Local Displayname Algorithm #3587

Merged
merged 14 commits into from
Aug 23, 2023
5 changes: 1 addition & 4 deletions experimental/displaynames/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,4 @@ default = ["compiled_data"]
std = ["icu_collections/std", "icu_locid/std", "icu_provider/std"]
serde = ["dep:serde", "zerovec/serde", "icu_collections/serde", "tinystr/serde", "icu_provider/serde"]
datagen = ["serde", "std", "dep:databake", "zerovec/databake", "icu_collections/databake", "tinystr/databake"]

[[test]]
name = "tests"
path = "tests/tests.rs"
compiled_data = ["dep:icu_displaynames_data", "dep:icu_locid_transform"]
329 changes: 155 additions & 174 deletions experimental/displaynames/src/displaynames.rs
Original file line number Diff line number Diff line change
Expand Up @@ -355,61 +355,6 @@ pub struct LocaleDisplayNamesFormatter {
// transforms_data: DataPayload<TransformsDisplayNamesV1Marker>,
}

// `LongestMatchingSubtag` names the shape of the longest subtag prefix of a locale
// that exists as a key in the CLDR locale data.
// It is used when implementing the Locale Display Name Algorithm: the variant records
// which subtags were consumed to derive the base display name.
#[derive(PartialEq, Clone, Copy)]
enum LongestMatchingSubtag {
// The longest match has the form `${lang}-${region}`.
// Examples: "de-ET", "en-GB".
LangRegion,
// The longest match has the form `${lang}-${script}`.
// Examples: "hi-Latn", "zh-Hans".
LangScript,
// The longest match is the bare `${lang}` subtag.
// Examples: "en", "hi".
Lang,
}

impl LongestMatchingSubtag {
/// For a given locale and the data, find the longest prefix of the string that exists as a key in the CLDR locale data.
pub fn find_longest_matching_subtag<'a>(
locale: &Locale,
locale_dn_formatter: &'a LocaleDisplayNamesFormatter,
) -> Self {
let LocaleDisplayNamesFormatter { locale_data, .. } = locale_dn_formatter;

// NOTE: The subtag ordering of the canonical locale is `language_script_region + variants + extensions`.
// The logic to find the longest matching subtag is based on this ordering.
if let Some(script) = locale.id.script {
let lang_script_identifier: LanguageIdentifier =
(locale.id.language, Some(script), None).into();
if locale_data
.get()
.names
.get_by(|uvstr| lang_script_identifier.strict_cmp(uvstr).reverse())
.is_some()
{
return LongestMatchingSubtag::LangScript;
}
}
if let Some(region) = locale.id.region {
if locale.id.script.is_none() {
let lang_region_identifier: LanguageIdentifier =
(locale.id.language, None, Some(region)).into();
if locale_data
.get()
.names
.get_by(|uvstr| lang_region_identifier.strict_cmp(uvstr).reverse())
.is_some()
{
return LongestMatchingSubtag::LangRegion;
}
}
}
return LongestMatchingSubtag::Lang;
}
}

impl LocaleDisplayNamesFormatter {
icu_provider::gen_any_buffer_data_constructors!(
locale: include,
Expand Down Expand Up @@ -463,16 +408,15 @@ impl LocaleDisplayNamesFormatter {
///
// TODO: Make this return a writeable instead of using alloc
pub fn of<'a, 'b: 'a, 'c: 'a>(&'b self, locale: &'c Locale) -> Cow<'a, str> {
let longest_matching_subtag =
LongestMatchingSubtag::find_longest_matching_subtag(&locale, &self);
let longest_matching_identifier = self.find_longest_matching_subtag(&locale);

// Step - 1: Construct a locale display name string (LDN).
// Find the displayname for the longest_matching_subtag which was derived above.
let ldn = get_locale_display_name(&locale, longest_matching_subtag, &self);
let ldn = self.get_locale_display_name(&locale, &longest_matching_identifier);

// Step - 2: Construct a vector of longest qualifying substrings (LQS).
// Find the displayname for the remaining locale if exists.
let lqs = get_longest_qualifying_substrings(&locale, longest_matching_subtag, &self);
let lqs = self.get_longest_qualifying_substrings(&locale, &longest_matching_identifier);

// Step - 3: Return the displayname based on the size of LQS.
let mut result = Cow::Borrowed(ldn);
Expand All @@ -494,148 +438,185 @@ impl LocaleDisplayNamesFormatter {
}
result
}
}

fn get_locale_display_name<'a>(
locale: &Locale,
longest_matching_subtag: LongestMatchingSubtag,
locale_dn_formatter: &'a LocaleDisplayNamesFormatter,
) -> &'a str {
let LocaleDisplayNamesFormatter {
options,
locale_data,
language_data,
..
} = locale_dn_formatter;

let lang_id: LanguageIdentifier = match longest_matching_subtag {
LongestMatchingSubtag::LangRegion => (locale.id.language, None, locale.id.region).into(),
LongestMatchingSubtag::LangScript => (locale.id.language, locale.id.script, None).into(),
LongestMatchingSubtag::Lang => locale.id.language.into(),
};

// Check if the key exists in the locale_data first.
// Example: "en_GB", "nl_BE".
let mut ldn = match options.style {
Some(Style::Short) => locale_data
.get()
.short_names
.get_by(|uvstr| lang_id.strict_cmp(uvstr).reverse()),
Some(Style::Long) => locale_data
.get()
.long_names
.get_by(|uvstr| lang_id.strict_cmp(uvstr).reverse()),
Some(Style::Menu) => locale_data
.get()
.menu_names
.get_by(|uvstr| lang_id.strict_cmp(uvstr).reverse()),
_ => None,
/// For a given locale and the data, find the longest prefix of the string that exists as a key in the CLDR locale data.
pub fn find_longest_matching_subtag(&self, locale: &Locale) -> LanguageIdentifier {
Comment on lines +445 to +446
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Observation: this only ever returns one of the following:

  • language-script
  • language-region
  • language

But does the spec allow us to return language-script-region, etc? Consider this in #3913

let LocaleDisplayNamesFormatter { locale_data, .. } = self;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: I personally would just use self.locale_data everywhere instead of doing this, but up to you

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done


// NOTE: The subtag ordering of the canonical locale is `language_script_region + variants + extensions`.
// The logic to find the longest matching subtag is based on this ordering.
if let Some(script) = locale.id.script {
Copy link
Member

@robertbastian robertbastian Aug 22, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't you technically have to try all these combinations according to the algorithm?

  • LSRV
  • LSR
  • LSV
  • LRV
  • LS (x)
  • LR (x)
  • LV
  • SR
  • SV
  • RV
  • L (x)
  • S
  • R
  • V

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The algorithm does not explicitly state what is covered by LocaleDisplayName (LDN). Because we are looking for the longest matching string in the locale and language data, and I couldn't find any example in the data that has combinations other than those already covered by this implementation, the current approach is sufficient for now. However, we do technically need to support the other combinations, as the data may change in the future. I think a better way to implement support for this is to use the subtag iterator.
Sketching the algorithm:

  1. Try matching the entire locale first and return the languageIdentifier if found in the locale data.
  2. If not, remove the last subtag and look up the remaining locale string in the locale data; if found, construct the languageIdentifier and return it.
  3. Continue step-2 until all the subtags are removed.
  4. Fallback if no match is found.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If what we have is more efficient than the general solution, I'm okay landing this and fixing the algorithm later if these types of cases come up. The way you've written this, I think old-code-new-data should just ignore the new entries, which is fine.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Chopping off subtags will only cover LSRV, LSR, LS, L.

let lang_script_identifier: LanguageIdentifier =
(locale.id.language, Some(script), None).into();
if locale_data
.get()
.names
.get_by(|uvstr| lang_script_identifier.strict_cmp(uvstr).reverse())
.is_some()
{
return lang_script_identifier;
}
}
if let Some(region) = locale.id.region {
if locale.id.script.is_none() {
let lang_region_identifier: LanguageIdentifier =
(locale.id.language, None, Some(region)).into();
if locale_data
.get()
.names
.get_by(|uvstr| lang_region_identifier.strict_cmp(uvstr).reverse())
.is_some()
{
return lang_region_identifier;
}
}
}
return (locale.id.language, None, None).into();
}
.or_else(|| {
locale_data
.get()
.names
.get_by(|uvstr| lang_id.strict_cmp(uvstr).reverse())
});

// At this point the key should exist in the language_data.
// Example: "en", "nl", "zh".
if ldn.is_none() {
ldn = match options.style {
Some(Style::Short) => language_data

fn get_locale_display_name<'a>(
&'a self,
locale: &'a Locale,
longest_matching_identifier: &LanguageIdentifier,
) -> &'a str {
let LocaleDisplayNamesFormatter {
options,
locale_data,
language_data,
..
} = self;

// Check if the key exists in the locale_data first.
// Example: "en_GB", "nl_BE".
let mut ldn = match options.style {
Some(Style::Short) => locale_data
.get()
.short_names
.get(&lang_id.language.into_tinystr().to_unvalidated()),
Some(Style::Long) => language_data
.get_by(|uvstr| longest_matching_identifier.strict_cmp(uvstr).reverse()),
Some(Style::Long) => locale_data
.get()
.long_names
.get(&lang_id.language.into_tinystr().to_unvalidated()),
Some(Style::Menu) => language_data
.get_by(|uvstr| longest_matching_identifier.strict_cmp(uvstr).reverse()),
Some(Style::Menu) => locale_data
.get()
.menu_names
.get(&lang_id.language.into_tinystr().to_unvalidated()),
.get_by(|uvstr| longest_matching_identifier.strict_cmp(uvstr).reverse()),
_ => None,
}
.or_else(|| {
language_data
locale_data
.get()
.names
.get(&lang_id.language.into_tinystr().to_unvalidated())
.get_by(|uvstr| longest_matching_identifier.strict_cmp(uvstr).reverse())
});
}
// Throw an error if the LDN is none as it is not possible to have a locale string without the language.
return ldn.expect("cannot parse locale displayname.");
}

fn get_longest_qualifying_substrings<'a>(
locale: &Locale,
longest_matching_subtag: LongestMatchingSubtag,
locale_dn_formatter: &'a LocaleDisplayNamesFormatter,
) -> Vec<&'a str> {
let LocaleDisplayNamesFormatter {
options,
region_data,
script_data,
variant_data,
..
} = locale_dn_formatter;

let mut lqs: Vec<&str> = vec![];

if let Some(script) = locale.id.script {
// Ignore if the script was used to derive LDN.
if longest_matching_subtag != LongestMatchingSubtag::LangScript {
let scriptdisplay = match options.style {
Some(Style::Short) => script_data
.get()
.short_names
.get(&script.into_tinystr().to_unvalidated()),
// At this point the key should exist in the language_data.
// Example: "en", "nl", "zh".
if ldn.is_none() {
ldn = match options.style {
Some(Style::Short) => language_data.get().short_names.get(
&longest_matching_identifier
.language
.into_tinystr()
.to_unvalidated(),
),
Some(Style::Long) => language_data.get().long_names.get(
&longest_matching_identifier
.language
.into_tinystr()
.to_unvalidated(),
),
Some(Style::Menu) => language_data.get().menu_names.get(
&longest_matching_identifier
.language
.into_tinystr()
.to_unvalidated(),
),
_ => None,
}
.or_else(|| {
script_data
.get()
.names
.get(&script.into_tinystr().to_unvalidated())
language_data.get().names.get(
&longest_matching_identifier
.language
.into_tinystr()
.to_unvalidated(),
)
});
if let Some(scriptdn) = scriptdisplay {
lqs.push(scriptdn);
}
}
// Throw an error if the LDN is none as it is not possible to have a locale string without the language.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: update comment

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

return ldn.unwrap_or(locale.id.language.as_str());
}

if let Some(region) = locale.id.region {
// Ignore if the region was used to derive LDN.
if longest_matching_subtag != LongestMatchingSubtag::LangRegion {
let regiondisplay = match options.style {
Some(Style::Short) => region_data
.get()
.short_names
.get(&region.into_tinystr().to_unvalidated()),
_ => None,
fn get_longest_qualifying_substrings<'a>(
&'a self,
locale: &Locale,
longest_matching_identifier: &LanguageIdentifier,
) -> Vec<&'a str> {
let LocaleDisplayNamesFormatter {
options,
region_data,
script_data,
variant_data,
..
} = self;

let mut lqs: Vec<&str> = vec![];

if let Some(script) = locale.id.script {
// Ignore if the script was used to derive LDN.
if longest_matching_identifier.script.is_none() {
let scriptdisplay = match options.style {
Some(Style::Short) => script_data
.get()
.short_names
.get(&script.into_tinystr().to_unvalidated()),
_ => None,
}
.or_else(|| {
script_data
.get()
.names
.get(&script.into_tinystr().to_unvalidated())
});
if let Some(scriptdn) = scriptdisplay {
lqs.push(scriptdn);
}
}
.or_else(|| {
region_data
.get()
.names
.get(&region.into_tinystr().to_unvalidated())
});
}

if let Some(regiondn) = regiondisplay {
lqs.push(regiondn);
if let Some(region) = locale.id.region {
// Ignore if the region was used to derive LDN.
if longest_matching_identifier.region.is_none() {
let regiondisplay = match options.style {
Some(Style::Short) => region_data
.get()
.short_names
.get(&region.into_tinystr().to_unvalidated()),
_ => None,
}
.or_else(|| {
region_data
.get()
.names
.get(&region.into_tinystr().to_unvalidated())
});

if let Some(regiondn) = regiondisplay {
lqs.push(regiondn);
}
}
}
}

for &variant_key in locale.id.variants.iter() {
if let Some(variant_dn) = variant_data
.get()
.names
.get(&variant_key.into_tinystr().to_unvalidated())
{
lqs.push(variant_dn);
for &variant_key in locale.id.variants.iter() {
if let Some(variant_dn) = variant_data
.get()
.names
.get(&variant_key.into_tinystr().to_unvalidated())
{
lqs.push(variant_dn);
}
}
}

return lqs;
return lqs;
}
}
Loading
Loading