Add is_normalized_up_to methods

Closes #4256. No UTF16 tests or fuzzing yet.
unicode-org · Nov 20, 2023 · 553413e · 553413e
1 parent 9bf2525
commit 553413e
Show file tree

Hide file tree

Showing 2 changed files with 163 additions and 0 deletions.
diff --git a/components/normalizer/src/lib.rs b/components/normalizer/src/lib.rs
@@ -1457,6 +1457,13 @@ macro_rules! normalizer_methods {
             ret
         }
 
+        /// Returns the byte index up to which `text` is already normalized.
+        ///
+        /// The value is `text.len()` minus whatever the checking sink still had
+        /// left to match once normalization stopped, so a return value equal to
+        /// `text.len()` means nothing was left over.
+        pub fn is_normalized_up_to(&self, text: &str) -> usize {
+            let mut checker = IsNormalizedSinkStr::new(text);
+            // The outcome of the write is intentionally discarded: only the amount
+            // the sink has left over is needed to compute the index.
+            // NOTE(review): presumably the sink stops consuming at the first
+            // divergence from `text` -- confirm against its `Write` impl.
+            let _ = self.normalize_to(text, &mut checker);
+            let unmatched = checker.leftover();
+            text.len() - unmatched
+        }
+
         /// Check whether a string slice is normalized.
         pub fn is_normalized(&self, text: &str) -> bool {
             let mut sink = IsNormalizedSinkStr::new(text);
@@ -1476,6 +1483,13 @@ macro_rules! normalizer_methods {
             ret
         }
 
+        /// Returns the index into `text` up to which the UTF-16 input is already
+        /// normalized.
+        ///
+        /// The value is `text.len()` (counted in `u16` elements) minus whatever the
+        /// checking sink still had left to match once normalization stopped.
+        pub fn is_normalized_utf16_up_to(&self, text: &[u16]) -> usize {
+            let mut checker = IsNormalizedSinkUtf16::new(text);
+            // The outcome of the write is intentionally discarded: only the amount
+            // the sink has left over is needed to compute the index.
+            // NOTE(review): presumably the sink stops consuming at the first
+            // divergence from `text` -- confirm against its `Write16` impl.
+            let _ = self.normalize_utf16_to(text, &mut checker);
+            let unmatched = checker.leftover();
+            text.len() - unmatched
+        }
+
         /// Checks whether a slice of potentially-invalid UTF-16 is normalized.
         ///
         /// Unpaired surrogates are treated as the REPLACEMENT CHARACTER.
@@ -1498,6 +1512,13 @@ macro_rules! normalizer_methods {
             ret
         }
 
+        /// Returns the byte index up to which the UTF-8 input `text` is already
+        /// normalized.
+        ///
+        /// The value is `text.len()` minus whatever the checking sink still had
+        /// left to match once normalization stopped.
+        pub fn is_normalized_utf8_up_to(&self, text: &[u8]) -> usize {
+            let mut checker = IsNormalizedSinkUtf8::new(text);
+            // The outcome of the write is intentionally discarded: only the amount
+            // the sink has left over is needed to compute the index.
+            // NOTE(review): presumably the sink stops consuming at the first
+            // divergence from `text` -- confirm against its `Write` impl.
+            let _ = self.normalize_utf8_to(text, &mut checker);
+            let unmatched = checker.leftover();
+            text.len() - unmatched
+        }
+
         /// Check if a slice of potentially-invalid UTF-8 is normalized.
         ///
         /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
@@ -2638,6 +2659,9 @@ impl<'a> IsNormalizedSinkUtf16<'a> {
     pub fn finished(&self) -> bool {
         self.expect.is_empty()
     }
+    /// Returns the length of the still-unmatched remainder held in `expect`.
+    ///
+    /// This is zero exactly when `finished()` returns `true`.
+    pub fn leftover(&self) -> usize {
+        let unmatched = &self.expect;
+        unmatched.len()
+    }
 }
 
 impl<'a> Write16 for IsNormalizedSinkUtf16<'a> {
@@ -2677,6 +2701,9 @@ impl<'a> IsNormalizedSinkUtf8<'a> {
     pub fn finished(&self) -> bool {
         self.expect.is_empty()
     }
+    /// Returns the length of the still-unmatched remainder held in `expect`.
+    ///
+    /// This is zero exactly when `finished()` returns `true`.
+    pub fn leftover(&self) -> usize {
+        let unmatched = &self.expect;
+        unmatched.len()
+    }
 }
 
 impl<'a> core::fmt::Write for IsNormalizedSinkUtf8<'a> {
@@ -2716,6 +2743,9 @@ impl<'a> IsNormalizedSinkStr<'a> {
     pub fn finished(&self) -> bool {
         self.expect.is_empty()
     }
+    /// Returns the length of the still-unmatched remainder held in `expect`.
+    ///
+    /// This is zero exactly when `finished()` returns `true`.
+    pub fn leftover(&self) -> usize {
+        let unmatched = &self.expect;
+        unmatched.len()
+    }
 }
 
 impl<'a> core::fmt::Write for IsNormalizedSinkStr<'a> {

diff --git a/components/normalizer/tests/tests.rs b/components/normalizer/tests/tests.rs
@@ -1398,3 +1398,136 @@ fn test_is_normalized() {
     assert!(nfc.is_normalized_utf16(fraction16));
     assert!(!nfkc.is_normalized_utf16(fraction16));
 }
+
+/// Exercises the `is_normalized_up_to` / `is_normalized_utf8_up_to` methods of
+/// the NFD, NFKD, NFC and NFKC normalizers.
+///
+/// Two things are checked for each sample string:
+/// 1. Keeping the prefix before the reported index as-is and normalizing only
+///    the tail yields text that the same normalizer accepts as normalized.
+/// 2. The reported index equals the expected value for each form.
+#[test]
+fn test_is_normalized_up_to() {
+    let nfd: DecomposingNormalizer = DecomposingNormalizer::new_nfd();
+    let nfkd: DecomposingNormalizer = DecomposingNormalizer::new_nfkd();
+    let nfc: ComposingNormalizer = ComposingNormalizer::new_nfc();
+    let nfkc: ComposingNormalizer = ComposingNormalizer::new_nfkc();
+
+    // Check a string slice is normalized up to where is_normalized_up_to reports
+    let check_str = |input: &str| {
+        // Check nfd
+        let up_to = nfd.is_normalized_up_to(input);
+        let (head, tail) = input.split_at(up_to);
+        let mut normalized = String::from(head);
+        let _ = nfd.normalize_to(tail, &mut normalized);
+        assert!(nfd.is_normalized(&normalized));
+
+        // Check nfkd
+        let up_to = nfkd.is_normalized_up_to(input);
+        let (head, tail) = input.split_at(up_to);
+        let mut normalized = String::from(head);
+        let _ = nfkd.normalize_to(tail, &mut normalized);
+        assert!(nfkd.is_normalized(&normalized));
+
+        // Check nfc
+        let up_to = nfc.is_normalized_up_to(input);
+        let (head, tail) = input.split_at(up_to);
+        let mut normalized = String::from(head);
+        let _ = nfc.normalize_to(tail, &mut normalized);
+        assert!(nfc.is_normalized(&normalized));
+
+        // Check nfkc
+        let up_to = nfkc.is_normalized_up_to(input);
+        let (head, tail) = input.split_at(up_to);
+        let mut normalized = String::from(head);
+        let _ = nfkc.normalize_to(tail, &mut normalized);
+        assert!(nfkc.is_normalized(&normalized));
+    };
+
+    // Check a string of UTF8 bytes is normalized up to where is_normalized_up_to reports
+    // note: from_utf8 fails on invalid UTF8, so this helper is only fed well-formed input
+    let check_utf8 = |input: &[u8]| {
+        // Check nfd
+        let up_to = nfd.is_normalized_utf8_up_to(input);
+        let (head, tail) = input.split_at(up_to);
+        let mut normalized =
+            String::from_utf8(head.to_vec()).expect("prefix of well-formed input must be UTF-8");
+        let _ = nfd.normalize_utf8_to(tail, &mut normalized);
+        assert!(nfd.is_normalized(&normalized));
+
+        // Check nfkd
+        let up_to = nfkd.is_normalized_utf8_up_to(input);
+        let (head, tail) = input.split_at(up_to);
+        let mut normalized =
+            String::from_utf8(head.to_vec()).expect("prefix of well-formed input must be UTF-8");
+        let _ = nfkd.normalize_utf8_to(tail, &mut normalized);
+        assert!(nfkd.is_normalized(&normalized));
+
+        // Check nfc
+        let up_to = nfc.is_normalized_utf8_up_to(input);
+        let (head, tail) = input.split_at(up_to);
+        let mut normalized =
+            String::from_utf8(head.to_vec()).expect("prefix of well-formed input must be UTF-8");
+        let _ = nfc.normalize_utf8_to(tail, &mut normalized);
+        assert!(nfc.is_normalized(&normalized));
+
+        // Check nfkc
+        let up_to = nfkc.is_normalized_utf8_up_to(input);
+        let (head, tail) = input.split_at(up_to);
+        let mut normalized =
+            String::from_utf8(head.to_vec()).expect("prefix of well-formed input must be UTF-8");
+        let _ = nfkc.normalize_utf8_to(tail, &mut normalized);
+        assert!(nfkc.is_normalized(&normalized));
+    };
+
+    // Runs both round-trip checks, then pins the expected index for every form
+    // through the `&str` entry point and the UTF-8 byte-slice entry point.
+    // `assert_eq!` is used so a failure prints the actual and expected index.
+    let assert_up_to =
+        |text: &str, nfd_idx: usize, nfkd_idx: usize, nfc_idx: usize, nfkc_idx: usize| {
+            check_str(text);
+
+            let bytes = text.as_bytes();
+            check_utf8(bytes);
+
+            assert_eq!(nfd.is_normalized_up_to(text), nfd_idx, "NFD (str) on {:?}", text);
+            assert_eq!(nfkd.is_normalized_up_to(text), nfkd_idx, "NFKD (str) on {:?}", text);
+            assert_eq!(nfc.is_normalized_up_to(text), nfc_idx, "NFC (str) on {:?}", text);
+            assert_eq!(nfkc.is_normalized_up_to(text), nfkc_idx, "NFKC (str) on {:?}", text);
+            assert_eq!(nfd.is_normalized_utf8_up_to(bytes), nfd_idx, "NFD (UTF-8) on {:?}", text);
+            assert_eq!(nfkd.is_normalized_utf8_up_to(bytes), nfkd_idx, "NFKD (UTF-8) on {:?}", text);
+            assert_eq!(nfc.is_normalized_utf8_up_to(bytes), nfc_idx, "NFC (UTF-8) on {:?}", text);
+            assert_eq!(nfkc.is_normalized_utf8_up_to(bytes), nfkc_idx, "NFKC (UTF-8) on {:?}", text);
+        };
+
+    // TODO: add UTF-16 coverage for is_normalized_utf16_up_to
+
+    // Already normalized in every form: the index is the full length.
+    let aaa = "aaa";
+    assert_up_to(aaa, aaa.len(), aaa.len(), aaa.len(), aaa.len());
+
+    // Expected to be reported as fully normalized in every form.
+    let note = "a𝅗\u{1D165}a";
+    assert_up_to(note, note.len(), note.len(), note.len(), note.len());
+
+    // Precomposed umlaut: the decomposing forms stop after the leading "a",
+    // the composing forms accept the whole 4-byte string.
+    let umlaut = "aäa";
+    assert_up_to(umlaut, 1, 1, 4, 4);
+
+    // Vulgar fraction: only the compatibility forms (NFKD/NFKC) stop after the
+    // leading "a"; the canonical forms accept the whole 4-byte string.
+    let fraction = "a½a";
+    assert_up_to(fraction, 4, 1, 4, 1);
+}