-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcreate-char-set.ts
160 lines (141 loc) · 4.33 KB
/
create-char-set.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import { Char } from "../char-types";
import { CharRange, CharSet } from "../char-set";
import { assertNever } from "../util";
import { Flags } from "./flags";
import { CharEnv, getCharEnv } from "./char-env";
import { getCharacterProperty } from "./property";
/**
 * Union of every predefined character-set descriptor declared in this module.
 *
 * Members are discriminated by their `kind` property; the two `"property"` variants are
 * additionally told apart by their `strings` property.
 */
export type PredefinedCharacterSet =
| AnyCharacterSet
| DigitCharacterSet
| PropertyCharacterSet
| SpaceCharacterSet
| WordCharacterSet;
/**
 * Descriptor tagged `kind: "any"`.
 *
 * `getPredefinedSet` resolves it to `env.all` when the `dotAll` flag is set and to
 * `env.nonLineTerminator` otherwise.
 */
export interface AnyCharacterSet {
kind: "any";
}
/**
 * Descriptor tagged `kind: "digit"`.
 *
 * `getPredefinedSet` resolves it to `env.nonDigit` when `negate` is true and to `env.digit`
 * otherwise.
 */
export interface DigitCharacterSet {
kind: "digit";
// Selects the `non…` variant of the set when true.
negate: boolean;
}
/**
 * Either kind of `"property"` descriptor: the character variant (`strings: false`) or the
 * string variant (`strings: true`).
 */
export type PropertyCharacterSet = CharacterPropertyCharacterSet | StringPropertyCharacterSet;
/**
 * `"property"` descriptor with `strings: false`.
 *
 * `getPredefinedSet` forwards `key`, `value` and `negate` to `getCharacterProperty`, and throws
 * if the character environment is not in Unicode mode.
 */
export interface CharacterPropertyCharacterSet {
kind: "property";
// Property key, passed through to `getCharacterProperty`.
key: string;
// Property value, or `null` (presumably when only a key was given - confirm against the parser).
value: string | null;
// Discriminator separating this variant from `StringPropertyCharacterSet`.
strings: false;
negate: boolean;
}
/**
 * `"property"` descriptor with `strings: true`.
 *
 * In this variant `value` is always `null` and `negate` is always `false`. The parameter types of
 * `createCharSet` and `getPredefinedSet` exclude it, so it is never converted by this module.
 * NOTE(review): presumably this models a Unicode property of strings - confirm against the parser.
 */
export interface StringPropertyCharacterSet {
kind: "property";
key: string;
value: null;
// Discriminator separating this variant from `CharacterPropertyCharacterSet`.
strings: true;
negate: false;
}
/**
 * Descriptor tagged `kind: "space"`.
 *
 * `getPredefinedSet` resolves it to `env.nonSpace` when `negate` is true and to `env.space`
 * otherwise.
 */
export interface SpaceCharacterSet {
kind: "space";
// Selects the `non…` variant of the set when true.
negate: boolean;
}
/**
 * Descriptor tagged `kind: "word"`.
 *
 * `getPredefinedSet` resolves it to `env.nonWord` when `negate` is true and to `env.word`
 * otherwise.
 */
export interface WordCharacterSet {
kind: "word";
// Selects the `non…` variant of the set when true.
negate: boolean;
}
/**
 * Builds the character set matched by a JavaScript regular expression character set that consists
 * of the given elements.
 *
 * All elements are combined by union. If a predefined set resolves to the set of all characters,
 * that set is returned immediately because no further element can change the result.
 *
 * @param chars The elements of the set: single characters, character ranges, and predefined sets.
 * @param flags The flags of the pattern.
 * @returns The union of all elements, including case variations where the flags require them.
 */
export function createCharSet(
    chars: Iterable<Char | CharRange | Readonly<Exclude<PredefinedCharacterSet, StringPropertyCharacterSet>>>,
    flags: Readonly<Flags>
): CharSet {
    // https://tc39.es/ecma262/#sec-runtime-semantics-charactersetmatcher-abstract-operation

    // Strategy: every element is collected into one flat list of ranges. While collecting, we
    // remember whether any multi-character range may contain case-varying characters. Only in
    // that case (and only with ignoreCase) is the whole result expanded by case variations at the
    // end.

    const env = getCharEnv(flags);
    const collected: CharRange[] = [];
    let needsFullCaseCheck = false;

    const pushChar = (c: Char): void => {
        // Case variations of a single character are added eagerly only if the pattern ignores
        // case and no full case check is pending (the full check would add them anyway).
        const variations = env.ignoreCase && !needsFullCaseCheck ? env.caseFolding[c] : undefined;

        if (variations) {
            // The variations include the character itself, so nothing else has to be added.
            for (let idx = 0; idx < variations.length; idx++) {
                const v = variations[idx];
                collected.push({ min: v, max: v });
            }
        } else {
            collected.push({ min: c, max: c });
        }
    };

    const pushRange = (range: CharRange): void => {
        if (range.min === range.max) {
            // A range of one character is handled like a plain character.
            pushChar(range.min);
            return;
        }

        const mayVaryInCase =
            env.ignoreCase && !needsFullCaseCheck && !env.caseVarying.isDisjointWith(range);
        if (mayVaryInCase) {
            needsFullCaseCheck = true;
        }
        collected.push(range);
    };

    for (const item of chars) {
        if (isChar(item)) {
            pushChar(item);
            continue;
        }
        if (!("kind" in item)) {
            pushRange(item);
            continue;
        }

        const predefined = getPredefinedSet(item, flags, env);
        if (predefined.isAll) {
            // Everything is combined by union, so the result cannot grow beyond this.
            return predefined;
        }
        collected.push(...predefined.ranges);
    }

    const merged = env.empty.union(collected);

    // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
    if (env.ignoreCase && needsFullCaseCheck) {
        return env.withCaseVaryingCharacters(merged);
    }

    // No full case check was requested, so the union is already the final result.
    return merged;
}
/**
 * Type guard that accepts exactly the values whose `typeof` is `"number"`.
 *
 * `createCharSet` uses it to tell plain characters apart from the other kinds of elements.
 */
function isChar(value: unknown): value is Char {
    const isNumber = typeof value === "number";
    return isNumber;
}
/**
 * Resolves a predefined character-set descriptor to a concrete character set.
 *
 * @param set The descriptor to resolve.
 * @param flags The flags of the pattern; `dotAll` and `unicodeSets` are read.
 * @param env The character environment that supplies the ready-made sets.
 * @returns The character set that the descriptor stands for.
 * @throws If a `"property"` descriptor is used while `env.unicode` is falsy.
 */
function getPredefinedSet(
    set: Readonly<Exclude<PredefinedCharacterSet, StringPropertyCharacterSet>>,
    flags: Readonly<Flags>,
    env: CharEnv
): CharSet {
    if (set.kind === "any") {
        if (flags.dotAll) {
            return env.all;
        }
        return env.nonLineTerminator;
    }

    if (set.kind === "digit") {
        return set.negate ? env.nonDigit : env.digit;
    }

    if (set.kind === "space") {
        return set.negate ? env.nonSpace : env.space;
    }

    if (set.kind === "word") {
        return set.negate ? env.nonWord : env.word;
    }

    if (set.kind === "property") {
        if (!env.unicode) {
            throw new Error("Unicode property escapes cannot be used without the u flag.");
        }

        // A missing `unicodeSets` flag is treated as `false`.
        return getCharacterProperty(set.key, set.value, set.negate, env, flags.unicodeSets ?? false);
    }

    // Every known kind has been handled above; reaching this point means the descriptor is invalid.
    throw assertNever(set, "Invalid predefined character set type");
}