-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathparser.ts
1244 lines (1099 loc) · 35.6 KB
/
parser.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import { Char } from "../char-types";
import { ReadonlyWordSet, WordSet } from "../word-set";
import { CharSet } from "../char-set";
import {
Alternation,
Assertion,
CharacterClass,
Concatenation,
Element,
Expression,
NoParent,
Node,
Parent,
Quantifier,
SourceLocation,
Unknown,
setParent,
setSource,
} from "../ast";
import { AST, RegExpParser, visitRegExpAST } from "@eslint-community/regexpp";
import {
UnionIterable,
assertNever,
concatSequences,
flatConcatSequences,
repeatSequences,
unionSequences,
} from "../util";
import { createAssertion } from "./create-assertion";
import { Literal } from "./literal";
import { TooManyNodesError } from "../errors";
import { MatchingDirection, isPotentiallyEmpty } from "../ast-analysis";
import {
backreferenceAlwaysAfterGroup,
hasSomeAncestor,
inheritedMatchingDirection,
somePathToBackreference,
} from "./regexpp-util";
import { getCharEnv } from "./char-env";
import { Flags, isFlags } from "./flags";
import { parseUnicodeSet } from "./parse-unicode-set";
import { UnicodeSet } from "./unicode-set";
import { Maximum } from "./maximum";
const DEFAULT_MAX_NODES = 10_000;
const DEFAULT_BACK_REF_MAX_WORDS = 100;
export interface ParseOptions {
/**
* The maximum number of words a backreference can be replaced by.
*
* Set this to 0 to disable resolving backreferences.
*
* @default 100
*/
maxBackreferenceWords?: number;
/**
* How to the parser will handle unresolved backreferences.
*
* - `"disable"`
*
* The parser will replace all backreferences with an empty character class. This will cause all paths containing
* a backreference to be (effectively) removed.
*
* E.g. `(a*)(\1|b)` will be parsed as `(a*)(([])|b)` which is equivalent to `a*b`.
*
* - `"throw"`
*
* The parser will throw an error when encountering a backreference that cannot be removed.
*
* E.g. `(a*)b\1` will throw but `(a*)[^\s\S]\1` will not because the backreference will be removed anyway because
* of the empty character class.
*
* - `"unknown"`
*
* The parser will create a `Unknown` node for each backreference that cannot be removed. The id of the node will
* be raw string of the backreference.
*
* Backreferences that have been resolved are not affected by this option.
*
* @default "throw"
*/
backreferences?: "disable" | "throw" | "unknown";
/**
* How the parser will handle assertions.
*
* - `"parse"`
*
* The parser will translate every assertion literally to an equivalent RE AST representation. Builtin assertions
* (e.g. `\b`, `$`) will be transformed into equivalent assertions.
*
* - `"disable"`
*
* The parser will disable all assertion by replacing them with an empty character class. This will cause all
* paths containing an assertion to be (effectively) removed.
*
* - `"ignore"`
*
* The parser will ignore all assertion by replacing them with an empty group.
*
* - `"throw"`
*
* The parser will throw an error when encountering a assertion that cannot be removed.
*
* E.g. `a\B` will throw but `a([]\b)(\b){0}` will not because none of the `\b`s can be reached.
*
* - `"unknown"`
*
* The parser will create a `Unknown` node for each assertion. The id of the node will be raw string of the
* assertion.
*
* @default "parse"
*/
assertions?: "parse" | "disable" | "ignore" | "throw" | "unknown";
/**
* By default, the parser will try to simplify the generated RE as much as possible.
*
* If set to `false`, all trivial simplifications will be disabled. This includes:
*
* - Removing alternatives where all paths go through an empty character class, an alternation with 0 alternatives,
* or a disabled backreference/assertion.
* - Removing constant 0 and constant 1 quantifiers.
* - Inlining single-alternative groups.
*
* These simplifications might prevent certain backreferences or assertions from throwing an error. It's usually
* good to have them enabled since parsing is usually faster and the produced RE AST is smaller.
*
* If the produced RE AST is supposed to be a literal translation, then simplifications have to be disabled.
*
* @default true
*/
simplify?: boolean;
/**
* The maximum number of nodes the parser is allowed to create.
*
* If the regexes requires more nodes, a {@link TooManyNodesError} will be thrown.
*
* @default 10000
*/
maxNodes?: number;
/**
* `Unknown` nodes have an `id` property that can be used to identify the element that created the unknown. This
* function can be used to control the `id` value.
*
* By default, the raw of the element will be used as its id.
*/
getUnknownId?: (element: AST.Backreference | AST.Assertion) => string;
}
export interface RegexppAst {
readonly pattern: AST.Pattern;
readonly flags: AST.Flags;
}
export type ParsableElement = AST.Element | AST.Pattern | AST.Alternative;
export interface ParseResult {
expression: Expression;
maxCharacter: Char;
}
type LogicalChar = Char | CharSet;
type LogicalWord = LogicalChar[];
type ReadonlyLogicalWord = readonly LogicalChar[];
interface ParserContext {
readonly maxBackreferenceWords: number;
readonly backreferences: NonNullable<ParseOptions["backreferences"]>;
readonly assertions: NonNullable<ParseOptions["assertions"]>;
readonly disableSimplification: boolean;
readonly getUnknownId: NonNullable<ParseOptions["getUnknownId"]>;
readonly nc: NodeCreator;
readonly matchingDir: MatchingDirection;
readonly variableResolved: ReadonlyMap<AST.CapturingGroup, ReadonlyLogicalWord>;
}
// Some helper constants and types to make the parser implementation more readable
const EMPTY_SET = 1;
const EMPTY_CONCAT = 2;
type EmptySet = typeof EMPTY_SET;
type EmptyConcat = typeof EMPTY_CONCAT;
type Empty = EmptyConcat | EmptySet;
/**
* Converts JS RegExp to refa's RE AST format.
*/
export class Parser {
/**
* The literal of the parser instance.
*/
readonly literal: Literal;
/**
* The parsed AST of the literal this parser works on.
*
* While not explicitly typed that way, the parser will assume that the AST is readonly and makes optimizations
* based on that assumption. It is not safe to change the AST in any way.
*/
readonly ast: RegexppAst;
/**
* This contains the same flags as `ast.flags` but with a better type.
*/
readonly flags: Required<Flags>;
/**
* The maximum character of all character sets in the parsed AST.
*
* This value will also be returned as part of the {@link ParseResult}.
*/
readonly maxCharacter: Char;
private readonly _charCache = new Map<string, UnicodeSet>();
private readonly _simpleCharCache = new Map<Char, CharSet>();
private readonly _backRefCanReachGroupCache = new Map<AST.Backreference, boolean>();
private readonly _backRefAlwaysAfterGroupCache = new Map<AST.Backreference, boolean>();
private readonly _constantResolveCache = new Map<AST.CapturingGroup, ReadonlyLogicalWord | null>();
private readonly _groupReferencesCache = new Map<AST.CapturingGroup, AST.Backreference[]>();
private readonly _charSetToCharFn: CharSetToCharsFn;
private constructor(ast: RegexppAst) {
this.literal = { source: ast.pattern.raw, flags: ast.flags.raw };
this.ast = ast;
this.maxCharacter = this.ast.flags.unicode ? Maximum.UNICODE : Maximum.UTF16;
if (!isFlags(ast.flags)) {
throw new Error("The flags in the given AST are not a valid set of flags.");
}
this.flags = ast.flags;
this._charSetToCharFn = createCharSetToCharsFn(this.flags);
}
/**
* Creates a new parser from the given literal.
*
* This function will throw a `SyntaxError` if the given literal is not a valid RegExp literal according to the
* given RegExp parser options.
*
* If a string is given as the literal, it must be of the form `/pattern/flags`. If possible, use the
* object form with {@link Literal} instead.
*
* @param literal
* @param parserOptions
*/
static fromLiteral(literal: Literal | string, parserOptions?: RegExpParser.Options): Parser {
const parser = new RegExpParser(parserOptions);
let ast;
if (typeof literal === "string") {
ast = parser.parseLiteral(literal);
} else {
const flags = parser.parseFlags(literal.flags);
const pattern = parser.parsePattern(literal.source, undefined, undefined, flags);
ast = { pattern, flags };
}
return new Parser(ast);
}
/**
* Creates a new parser from the given [regexpp](https://github.com/mysticatea/regexpp) AST.
*
* When the JS RegExp has already been parsed using regexpp, this method can be used to avoid parsing the regex
* again.
*
* The given AST is not allowed to be changed during the lifetime of the returned parser.
*
* @param ast
*/
static fromAst(ast: RegexppAst): Parser {
return new Parser(ast);
}
/**
* Parsed the entire literal.
*
* For more information on parsing, see {@link parseElement}.
*
* @param options
*/
parse(options?: Readonly<ParseOptions>): ParseResult {
return this.parseElement(this.ast.pattern, options);
}
/**
* Parses a specific element of the literal.
*
* Use {@link ParseOptions} to control how the element is parsed.
*
* @param element
* @param options
*/
parseElement(element: ParsableElement, options?: Readonly<ParseOptions>): ParseResult {
const context: ParserContext = {
maxBackreferenceWords: Math.round(options?.maxBackreferenceWords ?? DEFAULT_BACK_REF_MAX_WORDS),
backreferences: options?.backreferences ?? "throw",
assertions: options?.assertions ?? "parse",
disableSimplification: !(options?.simplify ?? true),
getUnknownId: options?.getUnknownId ?? (e => e.raw),
nc: new NodeCreator(options?.maxNodes ?? DEFAULT_MAX_NODES),
matchingDir: inheritedMatchingDirection(element),
variableResolved: new Map(),
};
const expression = this._parseElement(element, context);
setParent(expression, null);
return { expression, maxCharacter: this.maxCharacter };
}
private _parseElement(element: ParsableElement, context: ParserContext): NoParent<Expression> {
const expression: NoParent<Expression> = {
type: "Expression",
alternatives: [],
source: copySource(element),
};
switch (element.type) {
case "Alternative": {
this._addAlternatives([element], expression, context);
break;
}
case "Pattern": {
this._addAlternatives(element.alternatives, expression, context);
break;
}
default: {
const e = this._createElement(element, context);
if (e === EMPTY_SET) {
// do nothing
} else if (e === EMPTY_CONCAT) {
expression.alternatives.push(context.nc.newConcat(element));
} else {
if (!context.disableSimplification && e.type === "Alternation") {
// just inline the alternatives
expression.alternatives = e.alternatives;
} else {
const concat = context.nc.newConcat(element);
expression.alternatives.push(concat);
concat.elements.push(e);
}
}
break;
}
}
return expression;
}
private _addAlternatives(
alternatives: readonly AST.Alternative[],
parent: NoParent<Parent>,
context: ParserContext
): void {
for (const alternative of alternatives) {
const concat = this._createConcatenation(alternative, context);
if (concat !== EMPTY_SET) {
if (context.disableSimplification) {
parent.alternatives.push(concat);
} else {
if (concat.elements.length === 1 && concat.elements[0].type === "Alternation") {
// add all alternatives
parent.alternatives.push(...concat.elements[0].alternatives);
} else {
parent.alternatives.push(concat);
}
}
}
}
}
private _createConcatenation(
alternative: AST.Alternative,
context: ParserContext
): NoParent<Concatenation> | EmptySet {
const elements = context.matchingDir === "ltr" ? alternative.elements : [...alternative.elements].reverse();
const result = this._createElements(elements, context);
if (result === EMPTY_SET) {
return EMPTY_SET;
}
if (context.matchingDir === "rtl") {
result.reverse();
}
const concat = context.nc.newConcat(alternative);
this._setConcatenationElements(concat, result, context);
return concat;
}
private _setConcatenationElements(
concat: NoParent<Concatenation>,
elements: NoParent<Element>[],
context: ParserContext
): void {
if (context.disableSimplification) {
concat.elements = elements;
} else {
if (concat.elements.length > 0) {
concat.elements = [];
}
for (const e of elements) {
if (e.type === "Alternation" && e.alternatives.length === 1) {
concat.elements.push(...e.alternatives[0].elements);
} else {
concat.elements.push(e);
}
}
}
}
/**
* The order of elements depends on the current matching direction. If the current matching direction is "ltr", then
* the elements will be in order as they appear in AST. If the current matching direction is "rtl", then the order
* will be reversed.
*
* The function expects the given elements to be in that order and will returns the parsed elements in that order.
*
* @param inputElements
* @param context
*/
private _createElements(
inputElements: readonly AST.Element[],
context: ParserContext
): NoParent<Element>[] | EmptySet {
const outputElements: NoParent<Element>[] = [];
let error: { error: unknown } | undefined = undefined;
for (let i = 0; i < inputElements.length; i++) {
const currentElement = inputElements[i];
let result;
try {
result = this._createElement(currentElement, context);
} catch (e: unknown) {
if (context.disableSimplification) {
throw e;
} else {
// we catch the error and only rethrow it if the alternative did not get removed
// the only errors which can be thrown are not-supported errors, so if the alternative gets
// removed anyway, we shouldn't throw the error
// Note: For multiple errors, only the first one will be re-thrown
error ??= { error: e };
continue;
}
}
if (result === EMPTY_SET) {
return EMPTY_SET;
} else if (result === EMPTY_CONCAT) {
// do nothing
} else {
// an actual element
let resolved = false;
if (currentElement.type === "CapturingGroup" && context.maxBackreferenceWords > 0) {
try {
// try to resolve the backreferences of this capturing group
const resolveResult = this._variableResolveGroup(
currentElement,
result,
inputElements.slice(i + 1),
context
);
resolved = true;
outputElements.push(resolveResult.resolved);
i += resolveResult.skip;
} catch (e) {
// could not resolve
}
}
if (!resolved) {
outputElements.push(result);
}
}
}
if (error !== undefined) {
// rethrow the error
throw error.error;
}
return outputElements;
}
private _variableResolveGroup(
group: AST.CapturingGroup,
groupElement: NoParent<Element>,
afterGroup: readonly AST.Element[],
context: ParserContext
): { resolved: NoParent<Element>; skip: number } {
// try to resolve all backreferences of this capturing group
if (this._getResolvableGroupReferencesUnder(group, group.parent).length === 0) {
throw new Error("No backreferences that resolve this capturing group");
}
const words = atMostK(iterateLogicalWords(groupElement, this._charSetToCharFn), context.maxBackreferenceWords);
if (words.length === 0) {
throw new Error("Cannot resolve dead capturing group");
}
// variable resolved
const affectedSlice = [...afterGroup];
/**
* Resolving the capturing group might not affect all elements after the group.
*
* E.g. `(a|b)c\1d`: Only the elements between `(a|b)` and `\1` are affected, so we don't have to copy `c`.
*
* This is a minor optimization that can drastically reduce the number of created RE AST nodes in some cases.
* E.g. `/(a)\1(b)\2(c)\3/i` can be converted to `/(?:AA|aa)(?:BB|bb)(?:CC|cc)/` instead of
* `/AA(?:BB(?:CC|cc)|bb(?:CC|cc))|aa(?:BB(?:CC|cc)|bb(?:CC|cc))/`.
*
* However, there is one thing that makes it somewhat difficult to determine which elements are affected:
* other capturing groups. If another capturing group is affected by the current one, then the elements affected
* the other capturing group also have to be accounted for.
*
* Examples:
* - `/(a)(b)\1\2/i`: `\2` is affected because its capturing group is affected by `(a)`.
* - `/(a)(b\1)\1\2/i`: `(b\1)` is obviously affected by `(a)`.
*/
this._trimAffectedSlice(group, affectedSlice);
const alternatives: NoParent<Concatenation>[] = [];
for (const word of words) {
const result = this._createElements(affectedSlice, withResolved(context, group, word));
if (result === EMPTY_SET) {
// skip this alternative
continue;
}
const concatElements: NoParent<Element>[] = [];
const wordElement = this._wordToElement(group, word, context);
if (wordElement !== EMPTY_CONCAT) {
concatElements.push(wordElement);
}
concatElements.push(...result);
if (context.matchingDir === "rtl") {
// we have to reverse the order because `afterGroup` is reversed.
concatElements.reverse();
}
const wordConcat = context.nc.newConcat(group);
this._setConcatenationElements(wordConcat, concatElements, context);
alternatives.push(wordConcat);
}
const alternation = context.nc.newAlt(group);
alternation.alternatives = alternatives;
return { resolved: alternation, skip: affectedSlice.length };
}
private _trimAffectedSlice(group: AST.CapturingGroup, slice: AST.Element[]): void {
const length = this._affectedSliceLength(group, slice);
if (slice.length > length) {
slice.splice(length, slice.length - length);
}
}
private _affectedSliceLength(group: AST.CapturingGroup, slice: readonly AST.Element[]): number {
function withParent(element: AST.Element): AST.Element {
let p = element as AST.Element | AST.BranchNode | null;
while (p) {
if (p.parent === group.parent) {
return p;
}
p = p.parent;
}
throw new Error();
}
function rightMostIndex(elements: ReadonlySet<AST.Element>): number {
for (let i = slice.length - 1; i >= 0; i--) {
if (elements.has(slice[i])) {
return i;
}
}
throw -1;
}
const end = rightMostIndex(
new Set(this._getResolvableGroupReferencesUnder(group, group.parent).map(withParent))
);
const rights = [end];
for (let i = 0; i <= end; i++) {
const e = slice[i];
if (e.type === "CapturingGroup") {
rights.push(i + 1 + this._affectedSliceLength(e, slice.slice(i + 1)));
}
}
return Math.max(...rights) + 1;
}
private _createElement(element: AST.Element, context: ParserContext): NoParent<Element> | Empty {
switch (element.type) {
case "Assertion":
return this._createAssertion(element, context);
case "CapturingGroup":
case "Group":
return this._createGroup(element, context);
case "Character":
case "CharacterClass":
case "CharacterSet":
case "ExpressionCharacterClass":
return this._createCharacterClass(element, context);
case "Quantifier":
return this._createQuantifier(element, context);
case "Backreference":
return this._createBackreference(element, context);
default:
throw assertNever(element, "Unsupported element");
}
}
private _createAssertion(element: AST.Assertion, context: ParserContext): NoParent<Element> | Empty {
switch (context.assertions) {
case "throw":
throw new Error("Assertions are not supported.");
case "disable":
return this._createDisabledElement(element, context);
case "ignore":
if (context.disableSimplification) {
const alternation = context.nc.newAlt(element);
alternation.alternatives.push(context.nc.newConcat(element));
return alternation;
} else {
return EMPTY_CONCAT;
}
case "unknown":
return this._createUnknownElement(element, context);
case "parse":
return this._parseAssertion(element, context);
default:
assertNever(context.assertions);
}
}
private _parseAssertion(element: AST.Assertion, context: ParserContext): NoParent<Element> | Empty {
switch (element.kind) {
case "lookahead":
case "lookbehind": {
const assertion = context.nc.newAssertion(
element,
element.kind === "lookahead" ? "ahead" : "behind",
element.negate
);
const matchingDir: MatchingDirection = element.kind === "lookahead" ? "ltr" : "rtl";
this._addAlternatives(
element.alternatives,
assertion,
matchingDir === context.matchingDir ? context : { ...context, matchingDir }
);
if (!context.disableSimplification) {
// check for trivially accepting/rejecting assertions
const enum TrivialResult {
REJECT,
ACCEPT,
UNKNOWN,
}
let result = TrivialResult.UNKNOWN;
if (assertion.alternatives.length === 0) {
result = assertion.negate ? TrivialResult.ACCEPT : TrivialResult.REJECT;
} else if (isPotentiallyEmpty(assertion.alternatives)) {
result = assertion.negate ? TrivialResult.REJECT : TrivialResult.ACCEPT;
}
if (result === TrivialResult.ACCEPT) {
return EMPTY_CONCAT;
} else if (result === TrivialResult.REJECT) {
return EMPTY_SET;
}
}
return assertion;
}
case "end":
case "start":
case "word": {
const assertion = createAssertion(element, this.flags);
setSource(assertion, copySource(element));
return assertion;
}
default:
throw assertNever(element, "Unsupported element");
}
}
private _createCharacterClass(
element: AST.Character | AST.CharacterClass | AST.CharacterSet | AST.ExpressionCharacterClass,
context: ParserContext
): NoParent<Element> | EmptySet {
let cacheKey: string;
if (element.type === "Character") {
cacheKey = element.value.toString(16);
} else {
cacheKey = element.raw;
}
let chars = this._charCache.get(cacheKey);
if (chars === undefined) {
// cache miss
chars = parseUnicodeSet(element, this.flags);
this._charCache.set(cacheKey, chars);
}
if (chars.accept.isEmpty) {
// simple case where we only have a single char
if (chars.chars.isEmpty && !context.disableSimplification) {
return EMPTY_SET;
}
return context.nc.newCharClass(element, chars.chars);
}
const alternation = context.nc.newAlt(element);
for (const word of chars.wordSets) {
const alternative = context.nc.newConcat(element);
alternation.alternatives.push(alternative);
for (const char of word) {
alternative.elements.push(context.nc.newCharClass(element, char));
}
}
return alternation;
}
private _createGroup(element: AST.Group | AST.CapturingGroup, context: ParserContext): NoParent<Element> | Empty {
const alternation = context.nc.newAlt(element);
this._addAlternatives(element.alternatives, alternation, context);
if (!context.disableSimplification) {
if (alternation.alternatives.length === 0) {
return EMPTY_SET;
} else if (alternation.alternatives.length === 1 && alternation.alternatives[0].elements.length === 0) {
return EMPTY_CONCAT;
}
}
return alternation;
}
private _createQuantifier(element: AST.Quantifier, context: ParserContext): NoParent<Element> | Empty {
const min: number = element.min;
const max: number = element.max;
if (!context.disableSimplification) {
if (max === 0) {
return EMPTY_CONCAT;
}
if (min === 1 && max === 1) {
return this._createElement(element.element, context);
}
}
const quant = context.nc.newQuant(element, min, max, !element.greedy);
const qElement = element.element;
if (qElement.type === "CapturingGroup" || qElement.type === "Group") {
this._addAlternatives(qElement.alternatives, quant, context);
} else {
const newElement = this._createElement(qElement, context);
if (newElement !== EMPTY_SET) {
const concat = context.nc.newConcat(qElement);
quant.alternatives.push(concat);
if (newElement !== EMPTY_CONCAT) {
concat.elements.push(newElement);
}
}
}
if (!context.disableSimplification) {
if (quant.alternatives.length === 0) {
return min === 0 ? EMPTY_CONCAT : EMPTY_SET;
} else if (quant.alternatives.length === 1 && quant.alternatives[0].elements.length === 0) {
return EMPTY_CONCAT;
}
}
return quant;
}
private _createUnknownElement(
element: AST.Assertion | AST.Backreference,
context: ParserContext
): NoParent<Element> | EmptySet {
return context.nc.newUnknown(element, context.getUnknownId(element));
}
private _createDisabledElement(node: SourceLocation, context: ParserContext): NoParent<Element> | EmptySet {
if (context.disableSimplification) {
return context.nc.newAlt(node);
} else {
return EMPTY_SET;
}
}
private _createBackreference(element: AST.Backreference, context: ParserContext): NoParent<Element> | Empty {
if (context.maxBackreferenceWords > 0) {
// try resolve
if (!this._backRefCanReachGroup(element)) {
return EMPTY_CONCAT;
}
const group = element.resolved;
const resolved = context.variableResolved.get(group) ?? this._constantResolveGroup(group, context);
if (resolved !== null) {
if (resolved.length === 0) {
return EMPTY_CONCAT;
}
// since there might be some path with which we could reach the backreference without matching the
// group, we have to make sure that we always match the group before the backreference.
if (this._backRefAlwaysAfterGroup(element)) {
return this._wordToElement(element, resolved, context);
}
}
}
switch (context.backreferences) {
case "throw":
throw new Error("Backreferences are not supported.");
case "unknown":
return this._createUnknownElement(element, context);
case "disable":
return this._createDisabledElement(element, context);
default:
assertNever(context.backreferences);
}
}
private _constantResolveGroup(element: AST.CapturingGroup, context: ParserContext): ReadonlyLogicalWord | null {
const cached = this._constantResolveCache.get(element);
if (cached !== undefined) {
return cached;
}
const expression = this._parseElement(element, {
...context,
backreferences: "unknown",
disableSimplification: false,
});
// TODO: Use transformers to do that
removeLeadingLookbehinds(expression);
removeTrailingLookaheads(expression);
// if the group is constant, then all that's left will be a single alternative of only single-character
// character classes
let words = undefined;
try {
words = atMostK(iterateLogicalWords(expression, this._charSetToCharFn), 1);
} catch (e) {
// noop
}
let result: LogicalWord | null = null;
if (words) {
if (words.length === 0) {
// since the capturing can never be matched, all backreferences to it will always be replaced with the
// empty string
result = [];
} else if (words.length === 1) {
result = words[0];
} else {
throw new Error("More than one words were returned.");
}
}
this._constantResolveCache.set(element, result);
return result;
}
private _backRefCanReachGroup(element: AST.Backreference): boolean {
let cached = this._backRefCanReachGroupCache.get(element);
if (cached === undefined) {
cached = somePathToBackreference(element);
this._backRefCanReachGroupCache.set(element, cached);
}
return cached;
}
private _backRefAlwaysAfterGroup(element: AST.Backreference): boolean {
let cached = this._backRefAlwaysAfterGroupCache.get(element);
if (cached === undefined) {
cached = backreferenceAlwaysAfterGroup(element);
this._backRefAlwaysAfterGroupCache.set(element, cached);
}
return cached;
}
private _wordToElement(
source: Readonly<SourceLocation>,
word: ReadonlyLogicalWord,
context: ParserContext
): NoParent<Element> | EmptyConcat {
if (word.length === 0) {
return EMPTY_CONCAT;
} else if (word.length === 1) {
const char = word[0];
const characters = char instanceof CharSet ? char : this._charToCharSet(char);
return context.nc.newCharClass(source, characters);
} else {
const concat = context.nc.newConcat(source);
for (const char of word) {
const characters = char instanceof CharSet ? char : this._charToCharSet(char);
concat.elements.push(context.nc.newCharClass(source, characters));
}
const alt = context.nc.newAlt(source);
alt.alternatives.push(concat);
return alt;
}
}
private _charToCharSet(char: Char): CharSet {
let cached = this._simpleCharCache.get(char);
if (cached === undefined) {
cached = CharSet.fromCharacter(this.maxCharacter, char);
this._simpleCharCache.set(char, cached);
}
return cached;
}
private _getGroupReferences(group: AST.CapturingGroup): readonly AST.Backreference[] {
if (this._groupReferencesCache.size === 0) {
const getList = (capGroup: AST.CapturingGroup): AST.Backreference[] => {
let list = this._groupReferencesCache.get(capGroup);
if (list === undefined) {
list = [];
this._groupReferencesCache.set(capGroup, list);
}
return list;
};
visitRegExpAST(this.ast.pattern, {
onBackreferenceEnter(backRef) {
getList(backRef.resolved).push(backRef);
},
onCapturingGroupEnter(capGroup) {
getList(capGroup);
},
});
}
const references = this._groupReferencesCache.get(group);
if (!references) {
throw new Error("Unknown capturing group. The capturing group is not part of the AST of this parser.");
}
return references;
}
private _getResolvableGroupReferences(group: AST.CapturingGroup): readonly AST.Backreference[] {
return this._getGroupReferences(group).filter(
ref => this._backRefCanReachGroup(ref) && this._backRefAlwaysAfterGroup(ref)
);
}
private _getResolvableGroupReferencesUnder(
group: AST.CapturingGroup,
under: AST.BranchNode
): readonly AST.Backreference[] {
return this._getResolvableGroupReferences(group).filter(ref => hasSomeAncestor(ref, a => a === under));
}
}
class NodeCreator {
private _nodeCounter = 0;
private readonly _nodeLimit: number;
constructor(nodeLimit: number) {
this._nodeLimit = nodeLimit;
}
private _checkLimit(): void {
TooManyNodesError.assert(++this._nodeCounter, this._nodeLimit, "JS.Parser");
}
newAlt(source: Readonly<SourceLocation>): NoParent<Alternation> {
this._checkLimit();
return {
type: "Alternation",
alternatives: [],
source: copySource(source),