Fix TextCodecUTF8's error handling in EOF and across buffer boundaries

When TextCodecUTF8 found a truncated sequence at EOF, it used to emit one replacement character per byte in the sequence, even when it was a prefix of a valid sequence. Additionally, in streaming mode, if it found a lead byte for which a valid sequence would span longer than the current available bytes, any processing of that sequence was deferred until all such bytes were available, even if errors could be detected earlier. Both issues are solved by always checking the validity of partial sequences. The approach used in this patch uses `DecodeNonASCIISequence` to find the length of the maximal subpart of a partial sequence, and if the length is equal to the partial sequence size and we're not at EOF, we don't emit the error. However, this does not work when a byte in the 0x80 to 0xC1 range is found in a lead position, since `NonASCIISequenceLength` wrongly returns 2 and `DecodeNonASCIISequence` isn't enough to determine whether the partial sequence is invalid. This is fixed by having `NonASCIISequenceLength` to return 0 in those cases. Another issue with this approach is that, since the outer do-while loops in the `Decode` method take `do_flush && partial_sequence_size` as a condition, if a non-ASCII lead byte is found whose valid sequences would span longer than the bytes we have, those bytes would not be processed until the next call to `Decode` if `do_flush` is false. But as it turns out, the `do_flush` condition is not in fact needed, and removing it fixes this issue. Fixed: 796697 Fixed: 978522 Change-Id: Ic5a78e4eca356fdc2ad4038eba9ffe455fddf3ee Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/3263938 Reviewed-by: Jeremy Roman <jbroman@chromium.org> Reviewed-by: Joshua Bell <jsbell@chromium.org> Reviewed-by: Kentaro Hara <haraken@chromium.org> Commit-Queue: Jeremy Roman <jbroman@chromium.org> Cr-Commit-Position: refs/heads/main@{#944572}
web-platform-tests · Nov 23, 2021 · 8e786e0 · 8e786e0
1 parent 480b9ac
commit 8e786e0
Show file tree

Hide file tree

Showing 2 changed files with 72 additions and 3 deletions.
diff --git a/encoding/textdecoder-eof.any.js b/encoding/textdecoder-eof.any.js
@@ -1,7 +1,14 @@
 test(() => {
+  // Truncated sequences
   assert_equals(new TextDecoder().decode(new Uint8Array([0xF0])), "\uFFFD");
   assert_equals(new TextDecoder().decode(new Uint8Array([0xF0, 0x9F])), "\uFFFD");
   assert_equals(new TextDecoder().decode(new Uint8Array([0xF0, 0x9F, 0x92])), "\uFFFD");
+
+  // Errors near end-of-queue
+  assert_equals(new TextDecoder().decode(new Uint8Array([0xF0, 0x9F, 0x41])), "\uFFFDA");
+  assert_equals(new TextDecoder().decode(new Uint8Array([0xF0, 0x41, 0x42])), "\uFFFDAB");
+  assert_equals(new TextDecoder().decode(new Uint8Array([0xF0, 0x41, 0xF0])), "\uFFFDA\uFFFD");
+  assert_equals(new TextDecoder().decode(new Uint8Array([0xF0, 0x8F, 0x92])), "\uFFFD\uFFFD\uFFFD");
 }, "TextDecoder end-of-queue handling");
 
 test(() => {
@@ -15,4 +22,19 @@ test(() => {
 
   decoder.decode(new Uint8Array([0xF0, 0x9F]), { stream: true });
   assert_equals(decoder.decode(new Uint8Array([0x92])), "\uFFFD");
+
+  assert_equals(decoder.decode(new Uint8Array([0xF0, 0x9F]), { stream: true }), "");
+  assert_equals(decoder.decode(new Uint8Array([0x41]), { stream: true }), "\uFFFDA");
+  assert_equals(decoder.decode(), "");
+
+  assert_equals(decoder.decode(new Uint8Array([0xF0, 0x41, 0x42]), { stream: true }), "\uFFFDAB");
+  assert_equals(decoder.decode(), "");
+
+  assert_equals(decoder.decode(new Uint8Array([0xF0, 0x41, 0xF0]), { stream: true }), "\uFFFDA");
+  assert_equals(decoder.decode(), "\uFFFD");
+
+  assert_equals(decoder.decode(new Uint8Array([0xF0]), { stream: true }), "");
+  assert_equals(decoder.decode(new Uint8Array([0x8F]), { stream: true }), "\uFFFD\uFFFD");
+  assert_equals(decoder.decode(new Uint8Array([0x92]), { stream: true }), "\uFFFD");
+  assert_equals(decoder.decode(), "");
 }, "TextDecoder end-of-queue handling using stream: true");
diff --git a/encoding/textdecoder-streaming.any.js b/encoding/textdecoder-streaming.any.js
@@ -28,15 +28,62 @@ var octets = {
                 var decoder = new TextDecoder(encoding);
                 for (var i = 0; i < encoded.length; i += len) {
                     var sub = [];
-                    for (var j = i; j < encoded.length && j < i + len; ++j)
+                    for (var j = i; j < encoded.length && j < i + len; ++j) {
                         sub.push(encoded[j]);
-                        var uintArray = new Uint8Array(createBuffer(arrayBufferOrSharedArrayBuffer, sub.length));
-                        uintArray.set(sub);
+                    }
+                    var uintArray = new Uint8Array(createBuffer(arrayBufferOrSharedArrayBuffer, sub.length));
+                    uintArray.set(sub);
                     out += decoder.decode(uintArray, {stream: true});
                 }
                 out += decoder.decode();
                 assert_equals(out, string);
             }, 'Streaming decode: ' + encoding + ', ' + len + ' byte window (' + arrayBufferOrSharedArrayBuffer + ')');
         }
     });
+
+    test(() => {
+        function bytes(byteArray) {
+            const view = new Uint8Array(createBuffer(arrayBufferOrSharedArrayBuffer, byteArray.length));
+            view.set(byteArray);
+            return view;
+        }
+
+        const decoder = new TextDecoder();
+
+        assert_equals(decoder.decode(bytes([0xC1]), {stream: true}), "\uFFFD");
+        assert_equals(decoder.decode(), "");
+
+        assert_equals(decoder.decode(bytes([0xF5]), {stream: true}), "\uFFFD");
+        assert_equals(decoder.decode(), "");
+
+        assert_equals(decoder.decode(bytes([0xE0, 0x41]), {stream: true}), "\uFFFDA");
+        assert_equals(decoder.decode(bytes([0x42])), "B");
+
+        assert_equals(decoder.decode(bytes([0xE0, 0x80]), {stream: true}), "\uFFFD\uFFFD");
+        assert_equals(decoder.decode(bytes([0x80])), "\uFFFD");
+
+        assert_equals(decoder.decode(bytes([0xED, 0xA0]), {stream: true}), "\uFFFD\uFFFD");
+        assert_equals(decoder.decode(bytes([0x80])), "\uFFFD");
+
+        assert_equals(decoder.decode(bytes([0xF0, 0x41]), {stream: true}), "\uFFFDA");
+        assert_equals(decoder.decode(bytes([0x42]), {stream: true}), "B");
+        assert_equals(decoder.decode(bytes([0x43])), "C");
+
+        assert_equals(decoder.decode(bytes([0xF0, 0x80]), {stream: true}), "\uFFFD\uFFFD");
+        assert_equals(decoder.decode(bytes([0x80]), {stream: true}), "\uFFFD");
+        assert_equals(decoder.decode(bytes([0x80])), "\uFFFD");
+
+        assert_equals(decoder.decode(bytes([0xF4, 0xA0]), {stream: true}), "\uFFFD\uFFFD");
+        assert_equals(decoder.decode(bytes([0x80]), {stream: true}), "\uFFFD");
+        assert_equals(decoder.decode(bytes([0x80])), "\uFFFD");
+
+        assert_equals(decoder.decode(bytes([0xF0, 0x90, 0x41]), {stream: true}), "\uFFFDA");
+        assert_equals(decoder.decode(bytes([0x42])), "B");
+
+        // 4-byte UTF-8 sequences always correspond to non-BMP characters. Here
+        // we make sure that, although the first 3 bytes are enough to emit the
+        // lead surrogate, it only gets emitted when the fourth byte is read.
+        assert_equals(decoder.decode(bytes([0xF0, 0x9F, 0x92]), {stream: true}), "");
+        assert_equals(decoder.decode(bytes([0xA9])), "\u{1F4A9}");
+    }, `Streaming decode: UTF-8 chunk tests (${arrayBufferOrSharedArrayBuffer})`);
 })