141 changes: 90 additions & 51 deletions src/regexp/special-case.h
Expand Up @@ -6,70 +6,109 @@
#define V8_REGEXP_SPECIAL_CASE_H_

#ifdef V8_INTL_SUPPORT
#include "unicode/uversion.h"
namespace U_ICU_NAMESPACE {
class UnicodeSet;
} // namespace U_ICU_NAMESPACE
#include "src/base/logging.h"
#include "src/common/globals.h"

#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"

namespace v8 {
namespace internal {

// Functions to build special sets of Unicode characters that need special
// handling under "i" mode that cannot use closeOver(USET_CASE_INSENSITIVE).
// Sets of Unicode characters that need special handling under "i" mode

// For non-unicode ignoreCase matches (aka "i", not "iu"), ECMA 262
// defines slightly different case-folding rules than Unicode. An
// input character should match a pattern character if the result of
// the Canonicalize algorithm is the same for both characters.
//
// For the characters in the "ignore set", the process should not treat other
// characters in the result of closeOver(USET_CASE_INSENSITIVE) as case
// equivlant under the ECMA262 RegExp "i" mode because these characters are
// uppercase themselves that no other characters in the set uppercase to.
// Roughly speaking, for "i" regexps, Canonicalize(c) is the same as
// c.toUpperCase(), unless a) c.toUpperCase() is a multi-character
// string, or b) c is non-ASCII, and c.toUpperCase() is ASCII. See
// https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch for
// the precise definition.
//
// For the characters in the "special add set", the proecess should add only
// those characters in the result of closeOver(USET_CASE_INSENSITIVE) which is
// not uppercase characters as case equivlant under the ECMA262 RegExp "i" mode
// and also that ONE uppercase character that other non uppercase character
// uppercase into to the set. Other uppercase characters in the result of
// closeOver(USET_CASE_INSENSITIVE) should not be considered because ECMA262
// RegExp "i" mode consider two characters as "case equivlant" if both
// characters uppercase to the same character.
// While compiling such regular expressions, we need to compute the
// set of characters that should match a given input character. (See
// GetCaseIndependentLetters and CharacterRange::AddCaseEquivalents.)
// For almost all characters, this can be efficiently computed using
// UnicodeSet::closeOver(USET_CASE_INSENSITIVE). These sets represent
// the remaining special cases.
//
// For example, consider the following case equivalent set defined by Unicode
// standard. Notice there are more than one uppercase characters in this set:
// U+212B Å Angstrom Sign - an uppercase character.
// U+00C5 Å Latin Capital Letter A with Ring Above - an uppercase character.
// U+00E5 å Latin Small Letter A with Ring Above - a lowercase character which
// uppercase to U+00C5.
// In this case equivlant set is a special set and need special handling while
// considering "case equivlant" under the ECMA262 RegExp "i" mode which is
// different than Unicode Standard:
// * U+212B should be included into the "ignore" set because there are no other
// characters, under the ECMA262 "i" mode, are considered as "case equivlant"
// to it because U+212B is itself an uppercase but neither U+00C5 nor U+00E5
// uppercase to U+212B.
// * U+00C5 and U+00E5 will both be included into the "special add" set. While
// calculate the "equivlant set" under ECMA262 "i" mode, the process will
// add U+00E5, because it is not an uppercase character in the set. The
// process will also add U+00C5, because it is the uppercase character which
// other non uppercase character, U+00C5, uppercase into.
// For a character c, the rules are as follows:
//
// For characters not included in "ignore set" and "special add set", the
// process will just use closeOver(USET_CASE_INSENSITIVE) to calcualte, which is
// much faster.
// 1. If c is in neither IgnoreSet nor SpecialAddSet, then calling
// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) on a UnicodeSet
// containing c will produce the set of characters that should
// match /c/i (or /[c]/i), and only those characters.
//
// Under Unicode 12.0, there are only 7 characters in the "special add set" and
// 4 characters in "ignore set" so even the special add process is slower, it is
// limited to a small set of cases only.
// 2. If c is in IgnoreSet, then the only character it should match is
// itself. However, closeOver will add additional incorrect
// matches. For example, consider SHARP S: 'ß' (U+00DF) and 'ẞ'
// (U+1E9E). Although closeOver('ß') = "ßẞ", uppercase('ß') is
// "SS". Step 3.e therefore requires that 'ß' canonicalizes to
// itself, and should not match 'ẞ'. In these cases, we can skip
// the closeOver entirely, because it will never add an equivalent
// character.
//
// The implementation of these two function will be generated by calling ICU
// icu::UnicodeSet during the build time into gen/src/regexp/special-case.cc by
// the code in src/regexp/gen-regexp-special-case.cc.
// 3. If c is in SpecialAddSet, then it should match at least one
// character other than itself. However, closeOver will add at
// least one additional incorrect match. For example, consider the
// letter 'k'. Closing over 'k' gives "kKK" (lowercase k, uppercase
// K, U+212A KELVIN SIGN). However, because of step 3.g, KELVIN
// SIGN should not match either of the other two characters. As a
// result, "k" and "K" are in SpecialAddSet (and KELVIN SIGN is in
// IgnoreSet). To find the correct matches for characters in
// SpecialAddSet, we closeOver the original character, but filter
// out the results that do not have the same canonical value.
//
// These two function will be used with LazyInstance<> template to generate
// global sharable set to reduce memory usage and speed up performance.
// The contents of these sets are calculated at build time by
// src/regexp/gen-regexp-special-case.cc, which generates
// gen/src/regexp/special-case.cc. This is done by iterating over the
// result of closeOver for each BMP character, and finding sets for
// which at least one character has a different canonical value than
// another character. Characters that match no other characters in
// their equivalence class are added to IgnoreSet. Characters that
// match at least one other character are added to SpecialAddSet.

class RegExpCaseFolding final : public AllStatic {
public:
static const icu::UnicodeSet& IgnoreSet();
static const icu::UnicodeSet& SpecialAddSet();

// This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics:
// Canonicalize) step 3, which is used to determine whether
// characters match when ignoreCase is true and unicode is false.
static UChar32 Canonicalize(UChar32 ch) {
// a. Assert: ch is a UTF-16 code unit.
CHECK_LE(ch, 0xffff);

// b. Let s be the String value consisting of the single code unit ch.
icu::UnicodeString s(ch);

// c. Let u be the same result produced as if by performing the algorithm
// for String.prototype.toUpperCase using s as the this value.
// d. Assert: Type(u) is String.
icu::UnicodeString& u = s.toUpper();

// e. If u does not consist of a single code unit, return ch.
if (u.length() != 1) {
return ch;
}

// f. Let cu be u's single code unit element.
UChar32 cu = u.char32At(0);

// Function to build and return the Ignore set.
icu::UnicodeSet BuildIgnoreSet();
// g. If the value of ch >= 128 and the value of cu < 128, return ch.
if (ch >= 128 && cu < 128) {
return ch;
}

// Function to build and return the Special Add set.
icu::UnicodeSet BuildSpecialAddSet();
// h. Return cu.
return cu;
}
};

} // namespace internal
} // namespace v8
Expand Down
70 changes: 70 additions & 0 deletions test/intl/regress-10248.js
@@ -0,0 +1,70 @@
// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// See https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch
function Canonicalize(ch) {
var u = ch.toUpperCase();
if (u.length > 1) return ch;
var cu = u.charCodeAt(0);
if (ch.charCodeAt(0) >= 128 && cu < 128) return ch;
return cu;
}

function TestEquivalenceClass(eclass) {
for (var i = 0; i < eclass.length; i++) {
for (var j = 0; j < eclass.length; j++) {
if (i == j) continue;
var c1 = eclass[i];
var c2 = eclass[j];
var shouldMatch = Canonicalize(c1) === Canonicalize(c2);

var re1 = new RegExp(c1, 'i');
var re2 = new RegExp('[' + c1 + ']', 'i');

assertEquals(re1.test(c2), shouldMatch);
assertEquals(re2.test(c2), shouldMatch);
}
}
}

function TestAll() {
for (var eclass of equivalence_classes) {
TestEquivalenceClass(eclass);
}
}

// Interesting case-folding equivalence classes (as determined by
// ICU's UnicodeSet::closeOver). A class is interesting if it contains
// more than two characters, or if it contains any characters in
// IgnoreSet or SpecialAddSet as defined in src/regexp/special-case.h.
var equivalence_classes = [
'\u0041\u0061', // Aa (sanity check)
'\u004b\u006b\u212a', // KkK
'\u0053\u0073\u017f', // Ssſ
'\u00b5\u039c\u03bc', // µΜμ
'\u00c5\u00e5\u212b', // ÅåÅ
'\u00df\u1e9e', // ßẞ
'\u03a9\u03c9\u2126', // ΩωΩ
'\u0390\u1fd3', // ΐΐ
'\u0398\u03b8\u03d1\u03f4', // Θθϑϴ
'\u03b0\u1fe3', // ΰΰ
'\u1f80\u1f88', // ᾀᾈ
'\u1fb3\u1fbc', // ᾳᾼ
'\u1fc3\u1fcc', // ῃῌ
'\u1ff3\u1ffc', // ῳῼ
'\ufb05\ufb06', // ſtst

// Everything below this line is a well-behaved case-folding
// equivalence class with more than two characters but only one
// canonical case-folded character
'\u01c4\u01c5\u01c6', '\u01c7\u01c8\u01c9', '\u01ca\u01cb\u01cc',
'\u01f1\u01f2\u01f3', '\u0345\u0399\u03b9\u1fbe', '\u0392\u03b2\u03d0',
'\u0395\u03b5\u03f5', '\u039a\u03ba\u03f0', '\u03a0\u03c0\u03d6',
'\u03a1\u03c1\u03f1', '\u03a3\u03c2\u03c3', '\u03a6\u03c6\u03d5',
'\u0412\u0432\u1c80', '\u0414\u0434\u1c81', '\u041e\u043e\u1c82',
'\u0421\u0441\u1c83', '\u0422\u0442\u1c84\u1c85', '\u042a\u044a\u1c86',
'\u0462\u0463\u1c87', '\u1c88\ua64a\ua64b', '\u1e60\u1e61\u1e9b'
];

TestAll();