From e7c0d68ce1b0744272a3504f466b9c7af74d228b Mon Sep 17 00:00:00 2001 From: Ivan Medina Date: Sun, 13 Oct 2024 21:42:04 +0200 Subject: [PATCH 1/5] feat(string): add normalize method --- nova_vm/Cargo.toml | 1 + .../string_objects/string_prototype.rs | 83 ++++++++++++++++++- 2 files changed, 81 insertions(+), 3 deletions(-) diff --git a/nova_vm/Cargo.toml b/nova_vm/Cargo.toml index c2abba55b..1f3f5780f 100644 --- a/nova_vm/Cargo.toml +++ b/nova_vm/Cargo.toml @@ -20,6 +20,7 @@ rand = { workspace = true } ryu-js = { workspace = true } small_string = { path = "../small_string" } sonic-rs = { workspace = true, optional = true } +unicode-normalization = "0.1.24" wtf8 = { workspace = true } [features] diff --git a/nova_vm/src/ecmascript/builtins/text_processing/string_objects/string_prototype.rs b/nova_vm/src/ecmascript/builtins/text_processing/string_objects/string_prototype.rs index 10cc83931..bd9e77141 100644 --- a/nova_vm/src/ecmascript/builtins/text_processing/string_objects/string_prototype.rs +++ b/nova_vm/src/ecmascript/builtins/text_processing/string_objects/string_prototype.rs @@ -2,9 +2,10 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use std::{cmp::max, collections::VecDeque, iter::repeat}; +use std::{cmp::max, collections::VecDeque, iter::repeat, str::FromStr}; use small_string::SmallString; +use unicode_normalization::UnicodeNormalization; use crate::{ ecmascript::{ @@ -567,8 +568,42 @@ impl StringPrototype { todo!() } - fn normalize(_agent: &mut Agent, _this_value: Value, _: ArgumentsList) -> JsResult { - todo!() + /// ### [22.1.3.15 String.prototype.normalize ( \[ form \] )](https://tc39.es/ecma262/#sec-string.prototype.normalize) + fn normalize( + agent: &mut Agent, + this_value: Value, + arguments: ArgumentsList, + ) -> JsResult { + // 1. Let O be ? RequireObjectCoercible(this value). + let o = require_object_coercible(agent, this_value)?; + + // 2. Let S be ? ToString(O). + let s = to_string(agent, o)?; + + // 3. If form is undefined, let f be "NFC". + let form = arguments.get(0); + let f = if form.is_undefined() { + NormalizeForm::NFC + } else { + // 4. Else, let f be ? ToString(form). + let form_result = NormalizeForm::from_str(form.to_string(agent).unwrap().as_str(agent)); + let form = match form_result { + Ok(form) => form, + // 5. If f is not one of "NFC", "NFD", "NFKC", or "NFKD", throw a RangeError exception. + Err(()) => { + return Err(agent.throw_exception_with_static_message( + ExceptionType::RangeError, + "The normalization form should be one of NFC, NFD, NFKC, NFKD.", + )) + } + }; + form + }; + + // 6. Let ns be the String value that is the result of normalizing S into the normalization form named by f as specified in the latest Unicode Standard, Normalization Forms. + let ns = unicode_normalize(s.as_str(agent), f); + // 7. Return ns. + Ok(Value::from_string(agent, ns).into_value()) } /// ### [22.1.3.16 String.prototype.padEnd ( maxLength \[ , fillString \] )](https://tc39.es/ecma262/#sec-string.prototype.padend) @@ -1478,3 +1513,45 @@ enum TrimWhere { End, StartAndEnd, } + +#[derive(Debug)] +enum NormalizeForm { + NFC, + NFD, + NFKC, + NFKD, +} + +impl NormalizeForm { + fn as_str(&self) -> &'static str { + match self { + NormalizeForm::NFC => "NFC", + NormalizeForm::NFD => "NFD", + NormalizeForm::NFKC => "NFKC", + NormalizeForm::NFKD => "NFKD", + } + } +} + +impl FromStr for NormalizeForm { + type Err = (); + + fn from_str(input: &str) -> Result { + match input { + "NFC" => Ok(NormalizeForm::NFC), + "NFD" => Ok(NormalizeForm::NFD), + "NFKC" => Ok(NormalizeForm::NFKC), + "NFKD" => Ok(NormalizeForm::NFKD), + _ => Err(()), + } + } +} + +fn unicode_normalize(s: &str, f: NormalizeForm) -> std::string::String { + match f { + NormalizeForm::NFC => s.nfc().collect::(), + NormalizeForm::NFD => s.nfd().collect::(), + NormalizeForm::NFKC => s.nfkc().collect::(), + NormalizeForm::NFKD => s.nfkd().collect::(), + } +} From c6b8e1a79ad80b0ea10d7317cae108e3e8fd9027 Mon Sep 17 00:00:00 2001 From: Ivan Medina Date: Sun, 13 Oct 2024 22:05:40 +0200 Subject: [PATCH 2/5] chore: remove derive attribute --- .../builtins/text_processing/string_objects/string_prototype.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/nova_vm/src/ecmascript/builtins/text_processing/string_objects/string_prototype.rs b/nova_vm/src/ecmascript/builtins/text_processing/string_objects/string_prototype.rs index bd9e77141..cd557bda1 100644 --- a/nova_vm/src/ecmascript/builtins/text_processing/string_objects/string_prototype.rs +++ b/nova_vm/src/ecmascript/builtins/text_processing/string_objects/string_prototype.rs @@ -1514,7 +1514,6 @@ enum TrimWhere { StartAndEnd, } -#[derive(Debug)] enum NormalizeForm { NFC, NFD, From 3ef66bd5bf28d83a78a4500f671891779382df45 Mon Sep 17 00:00:00 2001 From: Ivan Medina Date: Mon, 14 Oct 2024 17:08:42 +0200 Subject: [PATCH 3/5] fix: quick check if normalized first --- Cargo.toml | 1 + nova_vm/Cargo.toml | 2 +- .../string_objects/string_prototype.rs | 61 ++++++++++--------- 3 files changed, 34 insertions(+), 30 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 64b1a367e..f08ab5371 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,4 +23,5 @@ oxc_syntax = "0.30.3" rand = "0.8.5" ryu-js = "1.0.1" sonic-rs = "0.3.13" +unicode-normalization = "0.1.24" wtf8 = "0.1" diff --git a/nova_vm/Cargo.toml b/nova_vm/Cargo.toml index 1f3f5780f..f6e29845d 100644 --- a/nova_vm/Cargo.toml +++ b/nova_vm/Cargo.toml @@ -20,7 +20,7 @@ rand = { workspace = true } ryu-js = { workspace = true } small_string = { path = "../small_string" } sonic-rs = { workspace = true, optional = true } -unicode-normalization = "0.1.24" +unicode-normalization = { workspace = true } wtf8 = { workspace = true } [features] diff --git a/nova_vm/src/ecmascript/builtins/text_processing/string_objects/string_prototype.rs b/nova_vm/src/ecmascript/builtins/text_processing/string_objects/string_prototype.rs index cd557bda1..0c0b3cbb7 100644 --- a/nova_vm/src/ecmascript/builtins/text_processing/string_objects/string_prototype.rs +++ b/nova_vm/src/ecmascript/builtins/text_processing/string_objects/string_prototype.rs @@ -5,7 +5,9 @@ use std::{cmp::max, collections::VecDeque, iter::repeat, str::FromStr}; use small_string::SmallString; -use unicode_normalization::UnicodeNormalization; +use unicode_normalization::{ + is_nfc_quick, is_nfd_quick, is_nfkc_quick, is_nfkd_quick, IsNormalized, UnicodeNormalization, +}; use crate::{ ecmascript::{ @@ -583,11 +585,12 @@ impl StringPrototype { // 3. If form is undefined, let f be "NFC". let form = arguments.get(0); let f = if form.is_undefined() { - NormalizeForm::NFC + NormalizeForm::Nfc } else { // 4. Else, let f be ? ToString(form). - let form_result = NormalizeForm::from_str(form.to_string(agent).unwrap().as_str(agent)); - let form = match form_result { + let f = to_string(agent, form)?; + let form_result = NormalizeForm::from_str(f.as_str(agent)); + match form_result { Ok(form) => form, // 5. If f is not one of "NFC", "NFD", "NFKC", or "NFKD", throw a RangeError exception. Err(()) => { @@ -596,8 +599,7 @@ impl StringPrototype { "The normalization form should be one of NFC, NFD, NFKC, NFKD.", )) } - }; - form + } }; // 6. Let ns be the String value that is the result of normalizing S into the normalization form named by f as specified in the latest Unicode Standard, Normalization Forms. @@ -1515,21 +1517,10 @@ enum TrimWhere { } enum NormalizeForm { - NFC, - NFD, - NFKC, - NFKD, -} - -impl NormalizeForm { - fn as_str(&self) -> &'static str { - match self { - NormalizeForm::NFC => "NFC", - NormalizeForm::NFD => "NFD", - NormalizeForm::NFKC => "NFKC", - NormalizeForm::NFKD => "NFKD", - } - } + Nfc, + Nfd, + Nfkc, + Nfkd, } impl FromStr for NormalizeForm { @@ -1537,10 +1528,10 @@ impl FromStr for NormalizeForm { fn from_str(input: &str) -> Result { match input { - "NFC" => Ok(NormalizeForm::NFC), - "NFD" => Ok(NormalizeForm::NFD), - "NFKC" => Ok(NormalizeForm::NFKC), - "NFKD" => Ok(NormalizeForm::NFKD), + "NFC" => Ok(NormalizeForm::Nfc), + "NFD" => Ok(NormalizeForm::Nfd), + "NFKC" => Ok(NormalizeForm::Nfkc), + "NFKD" => Ok(NormalizeForm::Nfkd), _ => Err(()), } } @@ -1548,9 +1539,21 @@ impl FromStr for NormalizeForm { fn unicode_normalize(s: &str, f: NormalizeForm) -> std::string::String { match f { - NormalizeForm::NFC => s.nfc().collect::(), - NormalizeForm::NFD => s.nfd().collect::(), - NormalizeForm::NFKC => s.nfkc().collect::(), - NormalizeForm::NFKD => s.nfkd().collect::(), + NormalizeForm::Nfc => match is_nfc_quick(s.chars()) { + IsNormalized::Yes => s.to_string(), + _ => s.nfc().collect::(), + }, + NormalizeForm::Nfd => match is_nfd_quick(s.chars()) { + IsNormalized::Yes => s.to_string(), + _ => s.nfd().collect::(), + }, + NormalizeForm::Nfkc => match is_nfkc_quick(s.chars()) { + IsNormalized::Yes => s.to_string(), + _ => s.nfkc().collect::(), + }, + NormalizeForm::Nfkd => match is_nfkd_quick(s.chars()) { + IsNormalized::Yes => s.to_string(), + _ => s.nfkd().collect::(), + }, } } From f4b2ebee924164f6532aaaa2b4211f6c6f947858 Mon Sep 17 00:00:00 2001 From: Ivan Medina Date: Tue, 15 Oct 2024 07:20:10 +0200 Subject: [PATCH 4/5] fix(test262): update expectations --- tests/expectations.json | 10 ---------- tests/metrics.json | 4 ++-- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/tests/expectations.json b/tests/expectations.json index 736549b11..7eb1f5498 100644 --- a/tests/expectations.json +++ b/tests/expectations.json @@ -6207,16 +6207,6 @@ "built-ins/String/prototype/matchAll/regexp-prototype-matchAll-v-u-flag.js": "CRASH", "built-ins/String/prototype/matchAll/this-val-non-obj-coercible.js": "CRASH", "built-ins/String/prototype/matchAll/toString-this-val.js": "CRASH", - "built-ins/String/prototype/normalize/form-is-not-valid-throws.js": "CRASH", - "built-ins/String/prototype/normalize/return-abrupt-from-form-as-symbol.js": "CRASH", - "built-ins/String/prototype/normalize/return-abrupt-from-form.js": "CRASH", - "built-ins/String/prototype/normalize/return-abrupt-from-this-as-symbol.js": "CRASH", - "built-ins/String/prototype/normalize/return-abrupt-from-this.js": "CRASH", - "built-ins/String/prototype/normalize/return-normalized-string-from-coerced-form.js": "CRASH", - "built-ins/String/prototype/normalize/return-normalized-string-using-default-parameter.js": "CRASH", - "built-ins/String/prototype/normalize/return-normalized-string.js": "CRASH", - "built-ins/String/prototype/normalize/this-is-null-throws.js": "CRASH", - "built-ins/String/prototype/normalize/this-is-undefined-throws.js": "CRASH", "built-ins/String/prototype/padEnd/normal-operation.js": "CRASH", "built-ins/String/prototype/padStart/normal-operation.js": "CRASH", "built-ins/String/prototype/repeat/repeat-string-n-times.js": "TIMEOUT", diff --git a/tests/metrics.json b/tests/metrics.json index 6bbd81a36..19f5dd95d 100644 --- a/tests/metrics.json +++ b/tests/metrics.json @@ -1,8 +1,8 @@ { "results": { - "crash": 16307, + "crash": 16297, "fail": 8261, - "pass": 20680, + "pass": 20690, "skip": 40, "timeout": 3, "unresolved": 0 From 0929bb0d19782b54f6f7ed50fd3412c0dad7f4ea Mon Sep 17 00:00:00 2001 From: Ivan Medina Date: Tue, 15 Oct 2024 10:43:28 +0200 Subject: [PATCH 5/5] fix(normalize): avoid extra heap data clone --- .../string_objects/string_prototype.rs | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/nova_vm/src/ecmascript/builtins/text_processing/string_objects/string_prototype.rs b/nova_vm/src/ecmascript/builtins/text_processing/string_objects/string_prototype.rs index 0c0b3cbb7..6ee4d6b95 100644 --- a/nova_vm/src/ecmascript/builtins/text_processing/string_objects/string_prototype.rs +++ b/nova_vm/src/ecmascript/builtins/text_processing/string_objects/string_prototype.rs @@ -603,9 +603,11 @@ impl StringPrototype { }; // 6. Let ns be the String value that is the result of normalizing S into the normalization form named by f as specified in the latest Unicode Standard, Normalization Forms. - let ns = unicode_normalize(s.as_str(agent), f); - // 7. Return ns. - Ok(Value::from_string(agent, ns).into_value()) + match unicode_normalize(s.as_str(agent), f) { + // 7. Return ns. + None => Ok(s.into_value()), + Some(ns) => Ok(Value::from_string(agent, ns).into_value()), + } } /// ### [22.1.3.16 String.prototype.padEnd ( maxLength \[ , fillString \] )](https://tc39.es/ecma262/#sec-string.prototype.padend) @@ -1537,23 +1539,23 @@ impl FromStr for NormalizeForm { } } -fn unicode_normalize(s: &str, f: NormalizeForm) -> std::string::String { +fn unicode_normalize(s: &str, f: NormalizeForm) -> Option { match f { NormalizeForm::Nfc => match is_nfc_quick(s.chars()) { - IsNormalized::Yes => s.to_string(), - _ => s.nfc().collect::(), + IsNormalized::Yes => None, + _ => Some(s.nfc().collect::()), }, NormalizeForm::Nfd => match is_nfd_quick(s.chars()) { - IsNormalized::Yes => s.to_string(), - _ => s.nfd().collect::(), + IsNormalized::Yes => None, + _ => Some(s.nfd().collect::()), }, NormalizeForm::Nfkc => match is_nfkc_quick(s.chars()) { - IsNormalized::Yes => s.to_string(), - _ => s.nfkc().collect::(), + IsNormalized::Yes => None, + _ => Some(s.nfkc().collect::()), }, NormalizeForm::Nfkd => match is_nfkd_quick(s.chars()) { - IsNormalized::Yes => s.to_string(), - _ => s.nfkd().collect::(), + IsNormalized::Yes => None, + _ => Some(s.nfkd().collect::()), }, } }