diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index 479604582..ee17f209f 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -44,6 +44,8 @@ enum ParseError: Error, Hashable { case invalidEscape(Character) case confusableCharacter(Character) + case quoteMayNotSpanMultipleLines + case cannotReferToWholePattern case quantifierRequiresOperand(String) @@ -138,6 +140,8 @@ extension ParseError: CustomStringConvertible { return "invalid escape sequence '\\\(c)'" case .confusableCharacter(let c): return "'\(c)' is confusable for a metacharacter; use '\\u{...}' instead" + case .quoteMayNotSpanMultipleLines: + return "quoted sequence may not span multiple lines in multi-line literal" case .cannotReferToWholePattern: return "cannot refer to whole pattern here" case .quantifierRequiresOperand(let q): diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index a6dfa0ce9..bdf076e68 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -579,7 +579,7 @@ extension Source { /// Try to consume quoted content /// - /// Quote -> '\Q' (!'\E' .)* '\E' + /// Quote -> '\Q' (!'\E' .)* '\E'? /// /// With `SyntaxOptions.experimentalQuotes`, also accepts /// @@ -592,9 +592,24 @@ extension Source { mutating func lexQuote(context: ParsingContext) throws -> AST.Quote? { let str = try recordLoc { src -> String? in if src.tryEat(sequence: #"\Q"#) { - return try src.expectQuoted(endingWith: #"\E"#).value + let contents = src.lexUntil { src in + src.isEmpty || src.tryEat(sequence: #"\E"#) + }.value + + // In multi-line literals, the quote may not span multiple lines. + if context.syntax.contains(.multilineExtendedSyntax), + contents.spansMultipleLinesInRegexLiteral { + throw ParseError.quoteMayNotSpanMultipleLines + } + + // The sequence must not be empty in a custom character class. + if context.isInCustomCharacterClass && contents.isEmpty { + throw ParseError.expectedNonEmptyContents + } + return contents } if context.experimentalQuotes, src.tryEat("\"") { + // TODO: Can experimental quotes be empty? return try src.expectQuoted(endingWith: "\"", ignoreEscaped: true).value } return nil diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 84957220c..e540e6c1d 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -592,6 +592,13 @@ public func parse( return ast } +extension String { + /// Whether the given string is considered multi-line for a regex literal. + var spansMultipleLinesInRegexLiteral: Bool { + unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" }) + } +} + /// Retrieve the default set of syntax options that a delimiter and literal /// contents indicates. fileprivate func defaultSyntaxOptions( @@ -601,8 +608,7 @@ fileprivate func defaultSyntaxOptions( case .forwardSlash: // For an extended syntax forward slash e.g #/.../#, extended syntax is // permitted if it spans multiple lines. - if delim.poundCount > 0 && - contents.unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" }) { + if delim.poundCount > 0 && contents.spansMultipleLinesInRegexLiteral { return .multilineExtendedSyntax } return .traditional diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index dbdacb0c2..fbca83128 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -754,6 +754,14 @@ extension RegexTests { // This follows the PCRE behavior. parseTest(#"\Q\\E"#, quote("\\")) + // ICU allows quotes to be empty outside of custom character classes. + parseTest(#"\Q\E"#, quote("")) + + // Quotes may be unterminated. + parseTest(#"\Qab"#, quote("ab")) + parseTest(#"\Q"#, quote("")) + parseTest("\\Qab\\", quote("ab\\")) + parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"), syntax: .experimental) parseTest(#"a" .""b""#, concat("a", quote(" ."), quote("b")), @@ -2539,8 +2547,6 @@ extension RegexTests { diagnosticTest(#"(?P"#, .expected(")")) diagnosticTest(#"(?R"#, .expected(")")) - diagnosticTest(#"\Qab"#, .expected("\\E")) - diagnosticTest("\\Qab\\", .expected("\\E")) diagnosticTest(#""ab"#, .expected("\""), syntax: .experimental) diagnosticTest(#""ab\""#, .expected("\""), syntax: .experimental) diagnosticTest("\"ab\\", .expectedEscape, syntax: .experimental) @@ -2619,6 +2625,9 @@ extension RegexTests { // TODO: Custom diagnostic for missing '\Q' diagnosticTest(#"\E"#, .invalidEscape("E")) + diagnosticTest(#"[\Q\E]"#, .expectedNonEmptyContents) + diagnosticTest(#"[\Q]"#, .expected("]")) + // PCRE treats these as octal, but we require a `0` prefix. diagnosticTest(#"[\1]"#, .invalidEscape("1")) diagnosticTest(#"[\123]"#, .invalidEscape("1")) @@ -2711,6 +2720,26 @@ extension RegexTests { """, .cannotRemoveExtendedSyntaxInMultilineMode ) + diagnosticWithDelimitersTest(#""" + #/ + \Q + \E + /# + """#, .quoteMayNotSpanMultipleLines) + + diagnosticWithDelimitersTest(#""" + #/ + \Qabc + \E + /# + """#, .quoteMayNotSpanMultipleLines) + + diagnosticWithDelimitersTest(#""" + #/ + \Q + /# + """#, .quoteMayNotSpanMultipleLines) + // MARK: Group specifiers diagnosticTest(#"(*"#, .unknownGroupKind("*"))