Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JS: Add ECMAScript 2024 v Flag Operators for Regex Parsing #18899

Open
wants to merge 19 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add support for '\q{}' escape sequence in regular expressions.
  • Loading branch information
Napalys committed Mar 3, 2025
commit ed418be97a070a7c9b51fb23ba6f21e5aaf99a21
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package com.semmle.js.ast.regexp;

import com.semmle.js.ast.SourceLocation;

/**
 * A {@code \q{...}} escape sequence in a regular expression.
 *
 * <p>The content between the braces is represented by a single child term, available through
 * {@link #getTerm()}. The node registers itself with the term kind name
 * {@code "CharacterClassQuotedString"}.
 */
public class CharacterClassQuotedString extends RegExpTerm {
  /** The term wrapped by this escape sequence. */
  private final RegExpTerm quotedTerm;

  /**
   * Creates a new quoted-string node.
   *
   * @param loc the source location of this node
   * @param term the term wrapped by this escape sequence
   */
  public CharacterClassQuotedString(SourceLocation loc, RegExpTerm term) {
    super(loc, "CharacterClassQuotedString");
    this.quotedTerm = term;
  }

  /** Returns the term wrapped by this escape sequence. */
  public RegExpTerm getTerm() {
    return this.quotedTerm;
  }

  /** Dispatches to the visitor overload for this node type. */
  @Override
  public void accept(Visitor v) {
    v.visit(this);
  }
}
Original file line number Diff line number Diff line change
@@ -61,4 +61,6 @@ public interface Visitor {
public void visit(ZeroWidthNegativeLookbehind nd);

public void visit(UnicodePropertyEscape nd);

public void visit(CharacterClassQuotedString nd);
}
Original file line number Diff line number Diff line change
@@ -10,6 +10,7 @@
import com.semmle.js.ast.regexp.Caret;
import com.semmle.js.ast.regexp.CharacterClass;
import com.semmle.js.ast.regexp.CharacterClassEscape;
import com.semmle.js.ast.regexp.CharacterClassQuotedString;
import com.semmle.js.ast.regexp.CharacterClassRange;
import com.semmle.js.ast.regexp.Constant;
import com.semmle.js.ast.regexp.ControlEscape;
@@ -92,6 +93,7 @@ public RegExpExtractor(TrapWriter trapwriter, LocationManager locationManager) {
termkinds.put("ZeroWidthPositiveLookbehind", 25);
termkinds.put("ZeroWidthNegativeLookbehind", 26);
termkinds.put("UnicodePropertyEscape", 27);
termkinds.put("CharacterClassQuotedString", 28);
}

private static final String[] errmsgs =
@@ -344,6 +346,12 @@ public void visit(CharacterClassRange nd) {
visit(nd.getLeft(), lbl, 0);
visit(nd.getRight(), lbl, 1);
}

/**
 * Extracts a {@code \q{...}} node: the node itself is extracted under the current
 * {@code parent} at position {@code idx}, and its wrapped term is then visited as the
 * node's child at index 0.
 */
@Override
public void visit(CharacterClassQuotedString nd) {
  // Extract the quoted-string node first so that its label can serve as the child's parent.
  Label quotedStringLabel = extractTerm(nd, parent, idx);
  visit(nd.getTerm(), quotedStringLabel, 0);
}
}

public void extract(String src, SourceMap sourceMap, Node parent, boolean isSpeculativeParsing, String flags) {
46 changes: 46 additions & 0 deletions javascript/extractor/src/com/semmle/js/parser/RegExpParser.java
Original file line number Diff line number Diff line change
@@ -6,6 +6,7 @@
import com.semmle.js.ast.regexp.Caret;
import com.semmle.js.ast.regexp.CharacterClass;
import com.semmle.js.ast.regexp.CharacterClassEscape;
import com.semmle.js.ast.regexp.CharacterClassQuotedString;
import com.semmle.js.ast.regexp.CharacterClassRange;
import com.semmle.js.ast.regexp.Constant;
import com.semmle.js.ast.regexp.ControlEscape;
@@ -283,6 +284,45 @@ private RegExpTerm parseTerm() {
return this.finishTerm(this.parseQuantifierOpt(loc, this.parseAtom()));
}

/**
 * Parses the content of a {@code \q{...}} escape: one or more alternatives separated by
 * {@code |}.
 *
 * @return the single alternative when there is exactly one; otherwise a {@link Disjunction}
 *     over all parsed alternatives
 */
private RegExpTerm parseDisjunctionInsideQuotedString() {
  SourceLocation start = new SourceLocation(pos());
  List<RegExpTerm> alternatives = new ArrayList<>();

  // There is always at least one alternative; each `|` consumed introduces another.
  do {
    alternatives.add(this.parseAlternativeInsideQuotedString());
  } while (this.match("|"));

  if (alternatives.size() > 1) {
    return this.finishTerm(new Disjunction(start, alternatives));
  }
  // A lone alternative is returned as-is rather than wrapped in a disjunction.
  return alternatives.get(0);
}

/**
 * Parses a single alternative inside a {@code \q{...}} escape.
 *
 * <p>Characters are consumed verbatim (backslashes included) until an unescaped {@code |} or
 * {@code }} is next in the input; that terminator is left unconsumed for the caller. If the
 * input ends first, {@code Error.UNEXPECTED_EOS} is reported and whatever was read so far is
 * still returned.
 *
 * @return a {@link Constant} whose value and source text are the raw characters consumed
 */
private RegExpTerm parseAlternativeInsideQuotedString() {
  SourceLocation loc = new SourceLocation(pos());
  StringBuilder raw = new StringBuilder();
  // True when the previously consumed character was a backslash that was not itself escaped.
  boolean afterBackslash = false;
  boolean sawTerminator = false;

  while (!this.atEOS()) {
    // An unescaped `|` starts the next alternative and an unescaped `}` closes the quoted
    // string; either one ends this alternative.
    if (!afterBackslash && this.lookahead(null, "|", "}")) {
      sawTerminator = true;
      break;
    }
    char next = this.nextChar();
    raw.append(next);
    // A backslash escapes the following character unless it was itself escaped.
    afterBackslash = (next == '\\') && !afterBackslash;
  }

  // Leaving the loop without seeing a terminator means the input ran out.
  if (!sawTerminator) {
    this.error(Error.UNEXPECTED_EOS);
  }

  String literal = raw.toString();
  loc.setEnd(pos());
  loc.setSource(literal);
  return new Constant(loc, literal);
}

private RegExpTerm parseQuantifierOpt(SourceLocation loc, RegExpTerm atom) {
if (this.match("*")) return this.finishTerm(new Star(loc, atom, !this.match("?")));
if (this.match("+")) return this.finishTerm(new Plus(loc, atom, !this.match("?")));
@@ -427,6 +467,12 @@ private RegExpTerm parseAtomEscape(SourceLocation loc, boolean inCharClass) {
return this.finishTerm(new NamedBackReference(loc, name, "\\k<" + name + ">"));
}

if (this.match("q{")) {
RegExpTerm term = parseDisjunctionInsideQuotedString();
this.expectRBrace();
return this.finishTerm(new CharacterClassQuotedString(loc, term));
}

if (this.match("p{", "P{")) {
String name = this.readIdentifier();
if (this.match("=")) {
Loading
Oops, something went wrong.