Consolidated fixes to character encoders/decoders (#11707)

* Fix EUC-JP reference decoder: The relevant step fixed by this pull request says "Return error."; thus, processing should continue with the next iteration, rather than run the rest of the handler for the given byte.
* Fix ISO-2022-JP reference implementation: The relevant step in the Encoding Standard says "Prepend _lead_ and _byte_ to _stream_." However, the two bytes are prepended in the wrong order in the reference implementation. Note that under the Encoding Standard, "[w]hen one or more tokens are prepended to a stream, those tokens must be inserted, _in given order_, before the first token in the stream." (Note that the code, at the time of this request, moves _lead_ to the front of the array, then moves _byte_ to the front of the array, which leaves _byte_ ahead of _lead_.) There may be other issues like this elsewhere in the multiple-byte encoder reference implementations.
* Fix bug in EUC-KR reference implementation: Makes one "if" statement conditional rather than unconditional (the range check used `||`, which is always true, instead of `&&`).
* Fix bug in Shift_JIS encoder.
* Fix bug in Big5 reference implementation.
* Fix Shift_JIS reference decoder: To conform to the most recent Encoding Standard.
web-platform-tests · Jul 24, 2018 · 0589266 · 0589266
1 parent bb001a9
commit 0589266
Show file tree

Hide file tree

Showing 6 changed files with 19 additions and 18 deletions.
diff --git a/encoding/legacy-mb-japanese/euc-jp/eucjp-decoder.js b/encoding/legacy-mb-japanese/euc-jp/eucjp-decoder.js
@@ -16,8 +16,9 @@ function eucjpDecoder(stream) {
     stream = stream.replace(/%/g, " ");
     stream = stream.replace(/[\s]+/g, " ").trim();
     var bytes = stream.split(" ");
-    for (i = 0; i < bytes.length; i++) bytes[i] = parseInt(bytes[i], 16);
+    for (var i = 0; i < bytes.length; i++) bytes[i] = parseInt(bytes[i], 16);
     var out = "";
+
     var lead, byte, offset, ptr, cp;
     var jis0212flag = false;
     var eucjpLead = 0x00;
@@ -68,6 +69,7 @@ function eucjpDecoder(stream) {
             }
             if (byte >= 0x00 && byte <= 0x7f) bytes.unshift(byte);
             out += "�";
+            continue;
         }
         if (byte >= 0x00 && byte <= 0x7f) {
             out += dec2char(byte);

diff --git a/encoding/legacy-mb-japanese/iso-2022-jp/iso2022jp-decoder.js b/encoding/legacy-mb-japanese/iso-2022-jp/iso2022jp-decoder.js
@@ -25,7 +25,7 @@ function iso2022jpDecoder(stream) {
     stream = stream.replace(/%/g, " ");
     stream = stream.replace(/[\s]+/g, " ").trim();
     var bytes = stream.split(" ");
-    for (i = 0; i < bytes.length; i++) bytes[i] = parseInt(bytes[i], 16);
+    for (var i = 0; i < bytes.length; i++) bytes[i] = parseInt(bytes[i], 16);
     var endofstream = 2000000;
     //bytes.push(endofstream)
     var out = "";
@@ -193,8 +193,8 @@ function iso2022jpDecoder(stream) {
                         continue;
                     }
                 }
-                bytes.unshift(lead);
-                bytes.unshift(byte);
+                // Prepend the sequence (lead, byte) to the stream
+                bytes.unshift(lead, byte);
                 outFlag = false;
                 decState = outState;
                 out += "�";

diff --git a/encoding/legacy-mb-japanese/shift_jis/sjis-decoder.js b/encoding/legacy-mb-japanese/shift_jis/sjis-decoder.js
@@ -16,7 +16,7 @@ function sjisDecoder(stream) {
 	stream = stream.replace(/%/g, " ");
 	stream = stream.replace(/[\s]+/g, " ").trim();
 	var bytes = stream.split(" ");
-	for (i = 0; i < bytes.length; i++) bytes[i] = parseInt(bytes[i], 16);
+	for (var i = 0; i < bytes.length; i++) bytes[i] = parseInt(bytes[i], 16);
 	var out = "";
 	var lead, byte, leadoffset, offset, ptr, cp;
 	var sjisLead = 0x00;
@@ -46,7 +46,7 @@ function sjisDecoder(stream) {
 			else leadoffset = 0xc1;
 			if ((byte >= 0x40 && byte <= 0x7e) || (byte >= 0x80 && byte <= 0xfc))
 				ptr = (lead - leadoffset) * 188 + byte - offset;
-			if (cp == null && ptr >= 8836 && ptr <= 10528) {
+			if (ptr != null && ptr >= 8836 && ptr <= 10715) {
 				out += dec2char(0xe000 + ptr - 8836);
 				continue;
 			}

diff --git a/encoding/legacy-mb-japanese/shift_jis/sjis-encoder.js b/encoding/legacy-mb-japanese/shift_jis/sjis-encoder.js
@@ -50,7 +50,7 @@ function sjisEncoder(stream) {
 	var cp;
 	var finished = false;
 	var endofstream = 2000000;
-
+	var temp, offset, leadoffset, first, second;
 	while (!finished) {
 		if (cps.length == 0) cp = endofstream;
 		else cp = cps.shift();
@@ -73,7 +73,7 @@ function sjisEncoder(stream) {
 		}
 		if (cp >= 0xff61 && cp <= 0xff9f) {
 			temp = cp - 0xff61 + 0xa1;
-			out += temp.toString(16).toUpperCase();
+			out += " " + temp.toString(16).toUpperCase();
 			continue;
 		}
 		if (cp == 0x2212) {

diff --git a/encoding/legacy-mb-korean/euc-kr/euckr-decoder.js b/encoding/legacy-mb-korean/euc-kr/euckr-decoder.js
@@ -16,7 +16,7 @@ function euckrDecoder(stream) {
 	stream = stream.replace(/%/g, " ");
 	stream = stream.replace(/[\s]+/g, " ").trim();
 	var bytes = stream.split(" ");
-	for (i = 0; i < bytes.length; i++) bytes[i] = parseInt(bytes[i], 16);
+	for (var i = 0; i < bytes.length; i++) bytes[i] = parseInt(bytes[i], 16);
 	var out = "";
 	var lead, byte, offset, ptr, cp;
 	var euckrLead = 0x00;
@@ -41,7 +41,7 @@ function euckrDecoder(stream) {
 			lead = euckrLead;
 			ptr = null;
 			euckrLead = 0x00;
-			if (byte >= 0x41 || byte <= 0xfe)
+			if (byte >= 0x41 && byte <= 0xfe)
 				ptr = (lead - 0x81) * 190 + (byte - 0x41);
 			if (ptr == null) cp = null;
 			else cp = euckr[ptr];

diff --git a/encoding/legacy-mb-tchinese/big5/big5-decoder.js b/encoding/legacy-mb-tchinese/big5/big5-decoder.js
@@ -16,7 +16,7 @@ function big5Decoder(stream) {
 	stream = stream.replace(/%/g, " ");
 	stream = stream.replace(/[\s]+/g, " ").trim();
 	var bytes = stream.split(" ");
-	for (i = 0; i < bytes.length; i++) bytes[i] = parseInt(bytes[i], 16);
+	for (var i = 0; i < bytes.length; i++) bytes[i] = parseInt(bytes[i], 16);
 	var out = "";
 	var lead, byte, offset, ptr, cp;
 	var big5lead = 0x00;
@@ -47,24 +47,23 @@ function big5Decoder(stream) {
 				ptr = (lead - 0x81) * 157 + (byte - offset);
 			// "If there is a row in the table below whose first column is pointer, return the two code points listed in its second column"
 			switch (ptr) {
-				case "1133":
+				case 1133:
 					out += "Ê̄";
 					continue;
-				case "1135":
+				case 1135:
 					out += "Ê̌";
 					continue;
-				case "1164":
+				case 1164:
 					out += "ê̄";
 					continue;
-				case "1166":
+				case 1166:
 					out += "ê̌";
 					continue;
 			}
 			if (ptr == null) cp = null;
 			else cp = big5[ptr];
-			if (cp == null && byte >= 0x00 && byte < 0x7f) {
+			if (cp == null && byte >= 0x00 && byte <= 0x7f) {
 				bytes.unshift(byte);
-				continue;
 			}
 			if (cp == null) {
 				out += "�";
@@ -73,7 +72,7 @@ function big5Decoder(stream) {
 			out += dec2char(cp);
 			continue;
 		}
-		if (byte >= 0x00 && byte < 0x7f) {
+		if (byte >= 0x00 && byte <= 0x7f) {
 			out += dec2char(byte);
 			continue;
 		}