diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index 1c9ee57bd..618ae2412 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -45,6 +45,7 @@ enum ParseError: Error, Hashable { case confusableCharacter(Character) case quoteMayNotSpanMultipleLines + case unsetExtendedSyntaxMayNotSpanMultipleLines case cannotReferToWholePattern @@ -81,6 +82,7 @@ enum ParseError: Error, Hashable { case cannotRemoveTextSegmentOptions case cannotRemoveSemanticsOptions case cannotRemoveExtendedSyntaxInMultilineMode + case cannotResetExtendedSyntaxInMultilineMode case expectedCalloutArgument @@ -143,6 +145,8 @@ extension ParseError: CustomStringConvertible { return "'\(c)' is confusable for a metacharacter; use '\\u{...}' instead" case .quoteMayNotSpanMultipleLines: return "quoted sequence may not span multiple lines in multi-line literal" + case .unsetExtendedSyntaxMayNotSpanMultipleLines: + return "group that unsets extended syntax may not span multiple lines in multi-line literal" case .cannotReferToWholePattern: return "cannot refer to whole pattern here" case .quantifierRequiresOperand(let q): @@ -194,6 +198,8 @@ extension ParseError: CustomStringConvertible { return "semantic level cannot be unset, only changed" case .cannotRemoveExtendedSyntaxInMultilineMode: return "extended syntax may not be disabled in multi-line mode" + case .cannotResetExtendedSyntaxInMultilineMode: + return "extended syntax may not be disabled in multi-line mode; use '(?^x)' instead" case .expectedCalloutArgument: return "expected argument to callout" case .unrecognizedScript(let value): diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index f63cd435e..41b744234 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ 
-597,7 +597,7 @@ extension Source { }.value // In multi-line literals, the quote may not span multiple lines. - if context.syntax.contains(.multilineExtendedSyntax), + if context.syntax.contains(.multilineCompilerLiteral), contents.spansMultipleLinesInRegexLiteral { throw ParseError.quoteMayNotSpanMultipleLines } @@ -840,11 +840,6 @@ extension Source { if opt.isSemanticMatchingLevel { throw ParseError.cannotRemoveSemanticsOptions } - // Extended syntax may not be removed if in multi-line mode. - if context.syntax.contains(.multilineExtendedSyntax) && - opt.isAnyExtended { - throw ParseError.cannotRemoveExtendedSyntaxInMultilineMode - } removing.append(opt) } return .init(caretLoc: nil, adding: adding, minusLoc: ateMinus.location, diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index e540e6c1d..3e20ae8c0 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -289,8 +289,8 @@ extension Parser { /// Apply the syntax options of a given matching option sequence to the /// current set of options. private mutating func applySyntaxOptions( - of opts: AST.MatchingOptionSequence - ) { + of opts: AST.MatchingOptionSequence, isScoped: Bool + ) throws { func mapOption(_ option: SyntaxOptions, _ pred: (AST.MatchingOption) -> Bool) { if opts.resetsCurrentOptions { @@ -311,22 +311,41 @@ extension Parser { mapOption(.namedCapturesOnly, .namedCapturesOnly) // (?x), (?xx) - // We skip this for multi-line, as extended syntax is always enabled there. + // This cannot be unset in a multi-line literal, unless in a scoped group + // e.g (?-x:...). We later enforce that such a group does not span multiple + // lines. // TODO: PCRE differentiates between (?x) and (?xx) where only the latter // handles non-semantic whitespace in a custom character class. Other // engines such as Oniguruma, Java, and ICU do this under (?x). 
Therefore, // treat (?x) and (?xx) as the same option here. If we ever get a strict // PCRE mode, we will need to change this to handle that. - if !context.syntax.contains(.multilineExtendedSyntax) { + if !isScoped && context.syntax.contains(.multilineCompilerLiteral) { + // An unscoped removal of extended syntax is not allowed in a multi-line + // literal. + if let opt = opts.removing.first(where: \.isAnyExtended) { + throw Source.LocatedError( + ParseError.cannotRemoveExtendedSyntaxInMultilineMode, opt.location) + } + if opts.resetsCurrentOptions { + throw Source.LocatedError( + ParseError.cannotResetExtendedSyntaxInMultilineMode, opts.caretLoc!) + } + // The only remaining case is an unscoped addition of extended syntax, + // which is a no-op. + } else { + // We either have a scoped change of extended syntax, or this is a + // single-line literal. mapOption(.extendedSyntax, \.isAnyExtended) } } /// Apply the syntax options of a matching option changing group to the /// current set of options. - private mutating func applySyntaxOptions(of group: AST.Group.Kind) { + private mutating func applySyntaxOptions( + of group: AST.Group.Kind, isScoped: Bool + ) throws { if case .changeMatchingOptions(let seq) = group { - applySyntaxOptions(of: seq) + try applySyntaxOptions(of: seq, isScoped: isScoped) } } @@ -337,14 +356,25 @@ extension Parser { context.recordGroup(kind.value) let currentSyntax = context.syntax - applySyntaxOptions(of: kind.value) + try applySyntaxOptions(of: kind.value, isScoped: true) defer { context.syntax = currentSyntax } - + let unsetsExtendedSyntax = currentSyntax.contains(.extendedSyntax) && + !context.syntax.contains(.extendedSyntax) let child = try parseNode() try source.expect(")") - return .init(kind, child, loc(start)) + let groupLoc = loc(start) + + // In multi-line literals, the body of a group that unsets extended syntax + // may not span multiple lines.
+ if unsetsExtendedSyntax && + context.syntax.contains(.multilineCompilerLiteral) && + source[child.location.range].spansMultipleLinesInRegexLiteral { + throw Source.LocatedError( + ParseError.unsetExtendedSyntaxMayNotSpanMultipleLines, groupLoc) + } + return .init(kind, child, groupLoc) } /// Consume the body of an absent function. @@ -438,7 +468,7 @@ extension Parser { // If we have a change matching options atom, apply the syntax options. We // already take care of scoping syntax options within a group. if case .changeMatchingOptions(let opts) = atom.kind { - applySyntaxOptions(of: opts) + try applySyntaxOptions(of: opts, isScoped: false) } // TODO: track source locations return .atom(atom) @@ -592,7 +622,7 @@ public func parse( return ast } -extension String { +extension StringProtocol { /// Whether the given string is considered multi-line for a regex literal. var spansMultipleLinesInRegexLiteral: Bool { unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" }) @@ -609,7 +639,7 @@ fileprivate func defaultSyntaxOptions( // For an extended syntax forward slash e.g #/.../#, extended syntax is // permitted if it spans multiple lines. if delim.poundCount > 0 && contents.spansMultipleLinesInRegexLiteral { - return .multilineExtendedSyntax + return [.multilineCompilerLiteral, .extendedSyntax] } return .traditional case .reSingleQuote: diff --git a/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift b/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift index dbfe5f2d6..302032fd3 100644 --- a/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift +++ b/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift @@ -58,10 +58,10 @@ public struct SyntaxOptions: OptionSet { /// `(_: .*)` == `(?:.*)` public static var experimentalCaptures: Self { Self(1 << 5) } - /// The default syntax for a multi-line regex literal. - public static var multilineExtendedSyntax: Self { - return [Self(1 << 6), .extendedSyntax] - } + /// The syntax kind of a multi-line literal. 
This will always be set when + /// parsing a multi-line `#/.../#` literal. Note this does not imply extended + /// syntax, as that may be temporarily disabled while parsing. + public static var multilineCompilerLiteral: Self { Self(1 << 6) } /// `(?n)` public static var namedCapturesOnly: Self { Self(1 << 7) } @@ -76,8 +76,8 @@ public struct SyntaxOptions: OptionSet { public static var traditional: Self { Self(0) } public static var experimental: Self { - // Experimental syntax enables everything except end-of-line comments. - Self(~0).subtracting(.endOfLineComments) + [.nonSemanticWhitespace, .experimentalQuotes, .experimentalComments, + .experimentalRanges, .experimentalCaptures] } // TODO: Probably want to model strict-PCRE etc. options too. diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 425f44f48..5315f24c4 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -772,6 +772,9 @@ extension RegexTests { syntax: .experimental) parseTest(#""\"""#, quote("\""), syntax: .experimental) + parseTest(#"(abc)"#, capture(concat("a", "b", "c")), + syntax: .experimental, captures: [.cap]) + // Quotes in character classes. 
parseTest(#"[\Q-\E]"#, charClass(quote_m("-"))) parseTest(#"[\Qa-b[[*+\\E]"#, charClass(quote_m("a-b[[*+\\"))) @@ -1777,6 +1780,13 @@ extension RegexTests { " ", "b" ) ) + parseTest( + "(?x) a (?^: b)", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", + changeMatchingOptions(unsetMatchingOptions(), concat(" ", "b")) + ) + ) parseTest("[ # abc]", charClass(" ", "#", " ", "a", "b", "c")) parseTest("[#]", charClass("#")) @@ -2099,6 +2109,17 @@ extension RegexTests { throwsError: .unsupported, syntax: .extendedSyntax ) + parseWithDelimitersTest( + #""" + #/ + a\ + b\ + c + /# + """#, + concat("a", "\n", "b", "\n", "c") + ) + // MARK: Parse with delimiters parseWithDelimitersTest("/a b/", concat("a", " ", "b")) @@ -2174,22 +2195,40 @@ extension RegexTests { /# """, concat("a", "b")) - // Make sure (?^) is ignored. + // (?x) has no effect. parseWithDelimitersTest(""" #/ - (?^) + (?x) # comment /# - """, changeMatchingOptions(unsetMatchingOptions()) + """, changeMatchingOptions(matchingOptions(adding: .extended)) ) - // (?x) has no effect. + // Scoped removal of extended syntax is allowed as long as it does not span + // multiple lines. 
parseWithDelimitersTest(""" #/ - (?x) - # comment + (?-x:a b) /# - """, changeMatchingOptions(matchingOptions(adding: .extended)) + """, changeMatchingOptions( + matchingOptions(removing: .extended), + concat("a", " ", "b") + ) + ) + parseWithDelimitersTest(""" + #/ + (?-xx:a b) + /# + """, changeMatchingOptions( + matchingOptions(removing: .extraExtended), + concat("a", " ", "b") + ) + ) + parseWithDelimitersTest(""" + #/ + (?^: a b ) # comment + /# + """, changeMatchingOptions(unsetMatchingOptions(), concat(" ", "a", " ", "b", " ")) ) parseWithDelimitersTest(#""" @@ -2773,17 +2812,50 @@ extension RegexTests { /# """, .cannotRemoveExtendedSyntaxInMultilineMode ) + + // Scoped removal of extended syntax may not span multiple lines diagnosticWithDelimitersTest(""" #/ - (?-x:a b) + (?-x:a b + ) /# - """, .cannotRemoveExtendedSyntaxInMultilineMode + """, .unsetExtendedSyntaxMayNotSpanMultipleLines ) diagnosticWithDelimitersTest(""" #/ - (?-xx:a b) + (?-x:a + b) /# - """, .cannotRemoveExtendedSyntaxInMultilineMode + """, .unsetExtendedSyntaxMayNotSpanMultipleLines + ) + diagnosticWithDelimitersTest(""" + #/ + (?-xx: + a b) + /# + """, .unsetExtendedSyntaxMayNotSpanMultipleLines + ) + diagnosticWithDelimitersTest(""" + #/ + (?x-x: + a b) + /# + """, .unsetExtendedSyntaxMayNotSpanMultipleLines + ) + diagnosticWithDelimitersTest(""" + #/ + (?^) + # comment + /# + """, .cannotResetExtendedSyntaxInMultilineMode + ) + diagnosticWithDelimitersTest(""" + #/ + (?^: + # comment + ) + /# + """, .unsetExtendedSyntaxMayNotSpanMultipleLines ) diagnosticWithDelimitersTest(#"""