Skip to content

Commit

Permalink
Add is_normalized_up_tos
Browse files Browse the repository at this point in the history
Closes #4256. No UTF16 tests or fuzzing yet.
  • Loading branch information
CanadaHonk committed Nov 20, 2023
1 parent 9bf2525 commit 553413e
Show file tree
Hide file tree
Showing 2 changed files with 163 additions and 0 deletions.
30 changes: 30 additions & 0 deletions components/normalizer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1457,6 +1457,13 @@ macro_rules! normalizer_methods {
ret
}

/// Returns the index up to which the string slice `text` is normalized.
///
/// `text` is run through `normalize_to` into an `IsNormalizedSinkStr`
/// constructed from that same `text`. The returned value is `text.len()`
/// (a length in bytes) minus whatever the sink reports through `leftover()`
/// afterwards, so a result equal to `text.len()` means nothing was left over.
pub fn is_normalized_up_to(&self, text: &str) -> usize {
    let mut checker = IsNormalizedSinkStr::new(text);
    // The write result is deliberately discarded: only the amount the sink
    // still has left over is used to compute the answer.
    let _ = self.normalize_to(text, &mut checker);
    let unconsumed = checker.leftover();
    text.len() - unconsumed
}

/// Check whether a string slice is normalized.
pub fn is_normalized(&self, text: &str) -> bool {
let mut sink = IsNormalizedSinkStr::new(text);
Expand All @@ -1476,6 +1483,13 @@ macro_rules! normalizer_methods {
ret
}

/// Returns the index up to which the UTF-16 slice `text` is normalized.
///
/// `text` is run through `normalize_utf16_to` into an `IsNormalizedSinkUtf16`
/// constructed from that same `text`. The returned value is `text.len()`
/// (a count of `u16` code units) minus whatever the sink reports through
/// `leftover()` afterwards.
pub fn is_normalized_utf16_up_to(&self, text: &[u16]) -> usize {
    let mut checker = IsNormalizedSinkUtf16::new(text);
    // The write result is deliberately discarded: only the amount the sink
    // still has left over is used to compute the answer.
    let _ = self.normalize_utf16_to(text, &mut checker);
    let unconsumed = checker.leftover();
    text.len() - unconsumed
}

/// Checks whether a slice of potentially-invalid UTF-16 is normalized.
///
/// Unpaired surrogates are treated as the REPLACEMENT CHARACTER.
Expand All @@ -1498,6 +1512,13 @@ macro_rules! normalizer_methods {
ret
}

/// Returns the index up to which the UTF-8 byte slice `text` is normalized.
///
/// `text` is run through `normalize_utf8_to` into an `IsNormalizedSinkUtf8`
/// constructed from that same `text`. The returned value is `text.len()`
/// (a count of bytes) minus whatever the sink reports through `leftover()`
/// afterwards.
pub fn is_normalized_utf8_up_to(&self, text: &[u8]) -> usize {
    let mut checker = IsNormalizedSinkUtf8::new(text);
    // The write result is deliberately discarded: only the amount the sink
    // still has left over is used to compute the answer.
    let _ = self.normalize_utf8_to(text, &mut checker);
    let unconsumed = checker.leftover();
    text.len() - unconsumed
}

/// Check if a slice of potentially-invalid UTF-8 is normalized.
///
/// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
Expand Down Expand Up @@ -2638,6 +2659,9 @@ impl<'a> IsNormalizedSinkUtf16<'a> {
/// Returns `true` when the sink's `expect` field is empty.
// NOTE(review): `expect` presumably holds the part of the input that has
// not been matched yet - confirm against the `Write16` impl.
pub fn finished(&self) -> bool {
    let pending = &self.expect;
    pending.is_empty()
}
/// Returns the current length of the sink's `expect` field.
// NOTE(review): presumably the number of `u16` code units of the input
// that have not been matched yet - confirm against the `Write16` impl.
pub fn leftover(&self) -> usize {
    let pending = &self.expect;
    pending.len()
}
}

impl<'a> Write16 for IsNormalizedSinkUtf16<'a> {
Expand Down Expand Up @@ -2677,6 +2701,9 @@ impl<'a> IsNormalizedSinkUtf8<'a> {
/// Returns `true` when the sink's `expect` field is empty.
// NOTE(review): `expect` presumably holds the part of the input that has
// not been matched yet - confirm against the `core::fmt::Write` impl.
pub fn finished(&self) -> bool {
    let pending = &self.expect;
    pending.is_empty()
}
/// Returns the current length of the sink's `expect` field.
// NOTE(review): presumably the number of bytes of the input that have not
// been matched yet - confirm against the `core::fmt::Write` impl.
pub fn leftover(&self) -> usize {
    let pending = &self.expect;
    pending.len()
}
}

impl<'a> core::fmt::Write for IsNormalizedSinkUtf8<'a> {
Expand Down Expand Up @@ -2716,6 +2743,9 @@ impl<'a> IsNormalizedSinkStr<'a> {
/// Returns `true` when the sink's `expect` field is empty.
// NOTE(review): `expect` presumably holds the part of the input that has
// not been matched yet - confirm against the `core::fmt::Write` impl.
pub fn finished(&self) -> bool {
    let pending = &self.expect;
    pending.is_empty()
}
/// Returns the current length of the sink's `expect` field.
// NOTE(review): presumably the number of bytes of the input string that
// have not been matched yet - confirm against the `core::fmt::Write` impl.
pub fn leftover(&self) -> usize {
    let pending = &self.expect;
    pending.len()
}
}

impl<'a> core::fmt::Write for IsNormalizedSinkStr<'a> {
Expand Down
133 changes: 133 additions & 0 deletions components/normalizer/tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1398,3 +1398,136 @@ fn test_is_normalized() {
assert!(nfc.is_normalized_utf16(fraction16));
assert!(!nfkc.is_normalized_utf16(fraction16));
}

#[test]
fn test_is_normalized_up_to() {
    let nfd: DecomposingNormalizer = DecomposingNormalizer::new_nfd();
    let nfkd: DecomposingNormalizer = DecomposingNormalizer::new_nfkd();
    let nfc: ComposingNormalizer = ComposingNormalizer::new_nfc();
    let nfkc: ComposingNormalizer = ComposingNormalizer::new_nfkc();

    // For each normalizer: split the input at the index reported by
    // `is_normalized_up_to`, keep the head untouched, normalize only the tail
    // onto it, and require the combined string to be normalized.
    let check_str = |input: &str| {
        // Check nfd
        let up_to = nfd.is_normalized_up_to(input);
        let (head, tail) = input.split_at(up_to);
        let mut normalized = String::from(head);
        let _ = nfd.normalize_to(tail, &mut normalized);
        assert!(nfd.is_normalized(&normalized));

        // Check nfkd
        let up_to = nfkd.is_normalized_up_to(input);
        let (head, tail) = input.split_at(up_to);
        let mut normalized = String::from(head);
        let _ = nfkd.normalize_to(tail, &mut normalized);
        assert!(nfkd.is_normalized(&normalized));

        // Check nfc
        let up_to = nfc.is_normalized_up_to(input);
        let (head, tail) = input.split_at(up_to);
        let mut normalized = String::from(head);
        let _ = nfc.normalize_to(tail, &mut normalized);
        assert!(nfc.is_normalized(&normalized));

        // Check nfkc
        let up_to = nfkc.is_normalized_up_to(input);
        let (head, tail) = input.split_at(up_to);
        let mut normalized = String::from(head);
        let _ = nfkc.normalize_to(tail, &mut normalized);
        assert!(nfkc.is_normalized(&normalized));
    };

    // Same check for UTF-8 byte slices via `is_normalized_utf8_up_to`.
    // Note: the head is converted with `String::from_utf8`, so this helper
    // panics if the reported prefix is not valid UTF-8.
    let check_utf8 = |input: &[u8]| {
        // Check nfd
        let up_to = nfd.is_normalized_utf8_up_to(input);
        let (head, tail) = input.split_at(up_to);
        let mut normalized =
            String::from_utf8(head.to_vec()).expect("normalized prefix must be valid UTF-8");
        let _ = nfd.normalize_utf8_to(tail, &mut normalized);
        assert!(nfd.is_normalized(&normalized));

        // Check nfkd
        let up_to = nfkd.is_normalized_utf8_up_to(input);
        let (head, tail) = input.split_at(up_to);
        let mut normalized =
            String::from_utf8(head.to_vec()).expect("normalized prefix must be valid UTF-8");
        let _ = nfkd.normalize_utf8_to(tail, &mut normalized);
        assert!(nfkd.is_normalized(&normalized));

        // Check nfc
        let up_to = nfc.is_normalized_utf8_up_to(input);
        let (head, tail) = input.split_at(up_to);
        let mut normalized =
            String::from_utf8(head.to_vec()).expect("normalized prefix must be valid UTF-8");
        let _ = nfc.normalize_utf8_to(tail, &mut normalized);
        assert!(nfc.is_normalized(&normalized));

        // Check nfkc
        let up_to = nfkc.is_normalized_utf8_up_to(input);
        let (head, tail) = input.split_at(up_to);
        let mut normalized =
            String::from_utf8(head.to_vec()).expect("normalized prefix must be valid UTF-8");
        let _ = nfkc.normalize_utf8_to(tail, &mut normalized);
        assert!(nfkc.is_normalized(&normalized));
    };

    // todo: UTF16 tests?

    // Plain ASCII: the whole input is expected to be reported as normalized.
    let aaa = "aaa";
    check_str(aaa);

    let aaa_utf8 = aaa.as_bytes();
    check_utf8(aaa_utf8);

    assert_eq!(nfd.is_normalized_up_to(aaa), aaa.len());
    assert_eq!(nfkd.is_normalized_up_to(aaa), aaa.len());
    assert_eq!(nfc.is_normalized_up_to(aaa), aaa.len());
    assert_eq!(nfkc.is_normalized_up_to(aaa), aaa.len());
    assert_eq!(nfd.is_normalized_utf8_up_to(aaa_utf8), aaa_utf8.len());
    assert_eq!(nfkd.is_normalized_utf8_up_to(aaa_utf8), aaa_utf8.len());
    assert_eq!(nfc.is_normalized_utf8_up_to(aaa_utf8), aaa_utf8.len());
    assert_eq!(nfkc.is_normalized_utf8_up_to(aaa_utf8), aaa_utf8.len());

    // Musical-symbol sequence: expected to be fully normalized in all four forms.
    let note = "a𝅗\u{1D165}a";
    check_str(note);

    let note_utf8 = note.as_bytes();
    check_utf8(note_utf8);

    assert_eq!(nfd.is_normalized_up_to(note), note.len());
    assert_eq!(nfkd.is_normalized_up_to(note), note.len());
    assert_eq!(nfc.is_normalized_up_to(note), note.len());
    assert_eq!(nfkc.is_normalized_up_to(note), note.len());
    assert_eq!(nfd.is_normalized_utf8_up_to(note_utf8), note_utf8.len());
    assert_eq!(nfkd.is_normalized_utf8_up_to(note_utf8), note_utf8.len());
    assert_eq!(nfc.is_normalized_utf8_up_to(note_utf8), note_utf8.len());
    assert_eq!(nfkc.is_normalized_utf8_up_to(note_utf8), note_utf8.len());

    // Umlaut: the decomposing forms are expected to stop after the leading
    // "a" (index 1), the composing forms to accept all 4 bytes.
    let umlaut = "aäa";
    check_str(umlaut);

    let umlaut_utf8 = umlaut.as_bytes();
    check_utf8(umlaut_utf8);

    assert_eq!(nfd.is_normalized_up_to(umlaut), 1);
    assert_eq!(nfkd.is_normalized_up_to(umlaut), 1);
    assert_eq!(nfc.is_normalized_up_to(umlaut), 4);
    assert_eq!(nfkc.is_normalized_up_to(umlaut), 4);
    assert_eq!(nfd.is_normalized_utf8_up_to(umlaut_utf8), 1);
    assert_eq!(nfkd.is_normalized_utf8_up_to(umlaut_utf8), 1);
    assert_eq!(nfc.is_normalized_utf8_up_to(umlaut_utf8), 4);
    assert_eq!(nfkc.is_normalized_utf8_up_to(umlaut_utf8), 4);

    // Vulgar fraction: the compatibility forms (NFKD/NFKC) are expected to
    // stop after the leading "a" (index 1), the canonical forms to accept
    // all 4 bytes.
    let fraction = "a½a";
    check_str(fraction);

    let fraction_utf8 = fraction.as_bytes();
    check_utf8(fraction_utf8);

    assert_eq!(nfd.is_normalized_up_to(fraction), 4);
    assert_eq!(nfkd.is_normalized_up_to(fraction), 1);
    assert_eq!(nfc.is_normalized_up_to(fraction), 4);
    assert_eq!(nfkc.is_normalized_up_to(fraction), 1);
    assert_eq!(nfd.is_normalized_utf8_up_to(fraction_utf8), 4);
    assert_eq!(nfkd.is_normalized_utf8_up_to(fraction_utf8), 1);
    assert_eq!(nfc.is_normalized_utf8_up_to(fraction_utf8), 4);
    assert_eq!(nfkc.is_normalized_utf8_up_to(fraction_utf8), 1);
}

0 comments on commit 553413e

Please sign in to comment.