Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JS: Add ECMAScript 2024 v Flag Operators for Regex Parsing #18899

Open
wants to merge 19 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Added ability to parse nested character classes while using v flag.
  • Loading branch information
Napalys committed Mar 3, 2025
commit 2333c538d91f2c2832bc864c650e5e483a0e6109
23 changes: 23 additions & 0 deletions javascript/extractor/src/com/semmle/js/parser/RegExpParser.java
Original file line number Diff line number Diff line change
@@ -545,6 +545,7 @@ private RegExpTerm parseAtomEscape(SourceLocation loc, boolean inCharClass) {
}

private RegExpTerm parseCharacterClass() {
if (flags != null && flags.contains("v")) return parseNestedCharacterClass();
SourceLocation loc = new SourceLocation(pos());
List<RegExpTerm> elements = new ArrayList<>();

@@ -560,6 +561,28 @@ private RegExpTerm parseCharacterClass() {
return this.finishTerm(new CharacterClass(loc, elements, inverted));
}

// Parses a character class whose elements may themselves be character classes,
// as permitted by the ECMA 2024 `v` flag.
//
// Matches the opening `[` and an optional `^` (inversion marker), then collects
// elements until a closing `]` is matched. If the end of the input is reached
// first, an EXPECTED_RBRACKET error is reported and the elements gathered so
// far are still returned as a CharacterClass term.
private RegExpTerm parseNestedCharacterClass() {
  SourceLocation start = new SourceLocation(pos());
  List<RegExpTerm> members = new ArrayList<>();

  this.match("[");
  boolean negated = this.match("^");

  for (;;) {
    // A closing bracket terminates this (possibly nested) class.
    if (this.match("]")) {
      break;
    }
    // Ran out of input before seeing the closing bracket.
    if (this.atEOS()) {
      this.error(Error.EXPECTED_RBRACKET);
      break;
    }
    // A `[` in element position opens a nested class; anything else is parsed
    // as an ordinary character class element.
    RegExpTerm member =
        lookahead("[") ? parseNestedCharacterClass() : this.parseCharacterClassElement();
    members.add(member);
  }

  return this.finishTerm(new CharacterClass(start, members, negated));
}

// Single-letter names "d", "D", "s", "S", "w", "W".
// NOTE(review): presumably the letters of the `\d`, `\D`, `\s`, `\S`, `\w`, `\W`
// character class escapes; the code that consults this list is outside this view — confirm.
private static final List<String> escapeClasses = Arrays.asList("d", "D", "s", "S", "w", "W");

private RegExpTerm parseCharacterClassElement() {
Original file line number Diff line number Diff line change
@@ -86,181 +86,126 @@ enclosing_stmt(#20027,#20025)
expr_containers(#20027,#20001)
literals("/[[]]/v","/[[]]/v",#20027)
#20028=*
regexpterm(#20028,1,#20027,0,"[[]]")
regexpterm(#20028,23,#20027,0,"[[]]")
#20029=@"loc,{#10000},1,2,1,5"
locations_default(#20029,#10000,1,2,1,5)
hasLocation(#20028,#20029)
#20030=*
regexpterm(#20030,23,#20028,0,"[[]")
#20031=@"loc,{#10000},1,2,1,4"
locations_default(#20031,#10000,1,2,1,4)
regexpterm(#20030,23,#20028,0,"[]")
#20031=@"loc,{#10000},1,3,1,4"
locations_default(#20031,#10000,1,3,1,4)
hasLocation(#20030,#20031)
#20032=*
regexpterm(#20032,14,#20030,0,"[")
#20033=@"loc,{#10000},1,3,1,3"
locations_default(#20033,#10000,1,3,1,3)
hasLocation(#20032,#20033)
regexp_const_value(#20032,"[")
stmts(#20032,2,#20001,1,"/[[a]]/v;")
hasLocation(#20032,#20007)
stmt_containers(#20032,#20001)
#20033=*
exprs(#20033,5,#20032,0,"/[[a]]/v")
hasLocation(#20033,#20015)
enclosing_stmt(#20033,#20032)
expr_containers(#20033,#20001)
literals("/[[a]]/v","/[[a]]/v",#20033)
#20034=*
regexpterm(#20034,14,#20028,1,"]")
#20035=@"loc,{#10000},1,5,1,5"
locations_default(#20035,#10000,1,5,1,5)
regexpterm(#20034,23,#20033,0,"[[a]]")
#20035=@"loc,{#10000},2,2,2,6"
locations_default(#20035,#10000,2,2,2,6)
hasLocation(#20034,#20035)
regexp_const_value(#20034,"]")
#20036=*
regexp_parse_errors(#20036,#20028,"unexpected character")
hasLocation(#20036,#20035)
#20037=*
stmts(#20037,2,#20001,1,"/[[a]]/v;")
hasLocation(#20037,#20007)
stmt_containers(#20037,#20001)
regexpterm(#20036,23,#20034,0,"[a]")
#20037=@"loc,{#10000},2,3,2,5"
locations_default(#20037,#10000,2,3,2,5)
hasLocation(#20036,#20037)
#20038=*
exprs(#20038,5,#20037,0,"/[[a]]/v")
hasLocation(#20038,#20015)
enclosing_stmt(#20038,#20037)
expr_containers(#20038,#20001)
literals("/[[a]]/v","/[[a]]/v",#20038)
#20039=*
regexpterm(#20039,1,#20038,0,"[[a]]")
#20040=@"loc,{#10000},2,2,2,6"
locations_default(#20040,#10000,2,2,2,6)
hasLocation(#20039,#20040)
regexpterm(#20038,14,#20036,0,"a")
#20039=@"loc,{#10000},2,4,2,4"
locations_default(#20039,#10000,2,4,2,4)
hasLocation(#20038,#20039)
regexp_const_value(#20038,"a")
#20040=*
stmts(#20040,2,#20001,2,"/[ [] [ [] [] ] ]/v;")
hasLocation(#20040,#20009)
stmt_containers(#20040,#20001)
#20041=*
regexpterm(#20041,23,#20039,0,"[[a]")
#20042=@"loc,{#10000},2,2,2,5"
locations_default(#20042,#10000,2,2,2,5)
hasLocation(#20041,#20042)
#20043=*
regexpterm(#20043,14,#20041,0,"[")
#20044=@"loc,{#10000},2,3,2,3"
locations_default(#20044,#10000,2,3,2,3)
hasLocation(#20043,#20044)
regexp_const_value(#20043,"[")
#20045=*
regexpterm(#20045,14,#20041,1,"a")
#20046=@"loc,{#10000},2,4,2,4"
locations_default(#20046,#10000,2,4,2,4)
hasLocation(#20045,#20046)
regexp_const_value(#20045,"a")
#20047=*
regexpterm(#20047,14,#20039,1,"]")
#20048=@"loc,{#10000},2,6,2,6"
locations_default(#20048,#10000,2,6,2,6)
hasLocation(#20047,#20048)
regexp_const_value(#20047,"]")
#20049=*
regexp_parse_errors(#20049,#20039,"unexpected character")
hasLocation(#20049,#20048)
exprs(#20041,5,#20040,0,"/[ [] [ [] [] ] ]/v")
hasLocation(#20041,#20019)
enclosing_stmt(#20041,#20040)
expr_containers(#20041,#20001)
literals("/[ [] [ [] [] ] ]/v","/[ [] [ [] [] ] ]/v",#20041)
#20042=*
regexpterm(#20042,23,#20041,0,"[ [] [ [] [] ] ]")
#20043=@"loc,{#10000},3,2,3,17"
locations_default(#20043,#10000,3,2,3,17)
hasLocation(#20042,#20043)
#20044=*
regexpterm(#20044,14,#20042,0," ")
#20045=@"loc,{#10000},3,3,3,3"
locations_default(#20045,#10000,3,3,3,3)
hasLocation(#20044,#20045)
regexp_const_value(#20044," ")
#20046=*
regexpterm(#20046,23,#20042,1,"[]")
#20047=@"loc,{#10000},3,4,3,5"
locations_default(#20047,#10000,3,4,3,5)
hasLocation(#20046,#20047)
#20048=*
regexpterm(#20048,14,#20042,2," ")
#20049=@"loc,{#10000},3,6,3,6"
locations_default(#20049,#10000,3,6,3,6)
hasLocation(#20048,#20049)
regexp_const_value(#20048," ")
#20050=*
stmts(#20050,2,#20001,2,"/[ [] [ [] [] ] ]/v;")
hasLocation(#20050,#20009)
stmt_containers(#20050,#20001)
#20051=*
exprs(#20051,5,#20050,0,"/[ [] [ [] [] ] ]/v")
hasLocation(#20051,#20019)
enclosing_stmt(#20051,#20050)
expr_containers(#20051,#20001)
literals("/[ [] [ [] [] ] ]/v","/[ [] [ [] [] ] ]/v",#20051)
regexpterm(#20050,23,#20042,3,"[ [] [] ]")
#20051=@"loc,{#10000},3,7,3,15"
locations_default(#20051,#10000,3,7,3,15)
hasLocation(#20050,#20051)
#20052=*
regexpterm(#20052,1,#20051,0,"[ [] [ [] [] ] ]")
#20053=@"loc,{#10000},3,2,3,17"
locations_default(#20053,#10000,3,2,3,17)
regexpterm(#20052,14,#20050,0," ")
#20053=@"loc,{#10000},3,8,3,8"
locations_default(#20053,#10000,3,8,3,8)
hasLocation(#20052,#20053)
regexp_const_value(#20052," ")
#20054=*
regexpterm(#20054,23,#20052,0,"[ []")
#20055=@"loc,{#10000},3,2,3,5"
locations_default(#20055,#10000,3,2,3,5)
regexpterm(#20054,23,#20050,1,"[]")
#20055=@"loc,{#10000},3,9,3,10"
locations_default(#20055,#10000,3,9,3,10)
hasLocation(#20054,#20055)
#20056=*
regexpterm(#20056,14,#20054,0," ")
#20057=@"loc,{#10000},3,3,3,3"
locations_default(#20057,#10000,3,3,3,3)
regexpterm(#20056,14,#20050,2," ")
#20057=@"loc,{#10000},3,11,3,11"
locations_default(#20057,#10000,3,11,3,11)
hasLocation(#20056,#20057)
regexp_const_value(#20056," ")
#20058=*
regexpterm(#20058,14,#20054,1,"[")
#20059=@"loc,{#10000},3,4,3,4"
locations_default(#20059,#10000,3,4,3,4)
regexpterm(#20058,23,#20050,3,"[]")
#20059=@"loc,{#10000},3,12,3,13"
locations_default(#20059,#10000,3,12,3,13)
hasLocation(#20058,#20059)
regexp_const_value(#20058,"[")
#20060=*
regexpterm(#20060,14,#20052,1," ")
#20061=@"loc,{#10000},3,6,3,6"
locations_default(#20061,#10000,3,6,3,6)
regexpterm(#20060,14,#20050,4," ")
#20061=@"loc,{#10000},3,14,3,14"
locations_default(#20061,#10000,3,14,3,14)
hasLocation(#20060,#20061)
regexp_const_value(#20060," ")
#20062=*
regexpterm(#20062,23,#20052,2,"[ []")
#20063=@"loc,{#10000},3,7,3,10"
locations_default(#20063,#10000,3,7,3,10)
regexpterm(#20062,14,#20042,4," ")
#20063=@"loc,{#10000},3,16,3,16"
locations_default(#20063,#10000,3,16,3,16)
hasLocation(#20062,#20063)
regexp_const_value(#20062," ")
#20064=*
regexpterm(#20064,14,#20062,0," ")
#20065=@"loc,{#10000},3,8,3,8"
locations_default(#20065,#10000,3,8,3,8)
entry_cfg_node(#20064,#20001)
#20065=@"loc,{#10000},1,1,1,0"
locations_default(#20065,#10000,1,1,1,0)
hasLocation(#20064,#20065)
regexp_const_value(#20064," ")
#20066=*
regexpterm(#20066,14,#20062,1,"[")
#20067=@"loc,{#10000},3,9,3,9"
locations_default(#20067,#10000,3,9,3,9)
hasLocation(#20066,#20067)
regexp_const_value(#20066,"[")
#20068=*
regexpterm(#20068,14,#20052,3," ")
#20069=@"loc,{#10000},3,11,3,11"
locations_default(#20069,#10000,3,11,3,11)
hasLocation(#20068,#20069)
regexp_const_value(#20068," ")
#20070=*
regexpterm(#20070,23,#20052,4,"[]")
#20071=@"loc,{#10000},3,12,3,13"
locations_default(#20071,#10000,3,12,3,13)
hasLocation(#20070,#20071)
#20072=*
regexpterm(#20072,14,#20052,5," ")
#20073=@"loc,{#10000},3,14,3,14"
locations_default(#20073,#10000,3,14,3,14)
hasLocation(#20072,#20073)
regexp_const_value(#20072," ")
#20074=*
regexpterm(#20074,14,#20052,6,"]")
#20075=@"loc,{#10000},3,15,3,15"
locations_default(#20075,#10000,3,15,3,15)
hasLocation(#20074,#20075)
regexp_const_value(#20074,"]")
#20076=*
regexpterm(#20076,14,#20052,7," ")
#20077=@"loc,{#10000},3,16,3,16"
locations_default(#20077,#10000,3,16,3,16)
hasLocation(#20076,#20077)
regexp_const_value(#20076," ")
#20078=*
regexpterm(#20078,14,#20052,8,"]")
#20079=@"loc,{#10000},3,17,3,17"
locations_default(#20079,#10000,3,17,3,17)
hasLocation(#20078,#20079)
regexp_const_value(#20078,"]")
#20080=*
regexp_parse_errors(#20080,#20052,"unexpected character")
hasLocation(#20080,#20075)
#20081=*
regexp_parse_errors(#20081,#20052,"unexpected character")
hasLocation(#20081,#20079)
#20082=*
entry_cfg_node(#20082,#20001)
#20083=@"loc,{#10000},1,1,1,0"
locations_default(#20083,#10000,1,1,1,0)
hasLocation(#20082,#20083)
#20084=*
exit_cfg_node(#20084,#20001)
hasLocation(#20084,#20023)
successor(#20050,#20051)
successor(#20051,#20084)
successor(#20037,#20038)
successor(#20038,#20050)
exit_cfg_node(#20066,#20001)
hasLocation(#20066,#20023)
successor(#20040,#20041)
successor(#20041,#20066)
successor(#20032,#20033)
successor(#20033,#20040)
successor(#20025,#20027)
successor(#20027,#20037)
successor(#20082,#20025)
successor(#20027,#20032)
successor(#20064,#20025)
numlines(#10000,3,3,1)
filetype(#10000,"javascript")