diff --git a/Sources/_StringProcessing/ByteCodeGen+DSLList.swift b/Sources/_StringProcessing/ByteCodeGen+DSLList.swift new file mode 100644 index 00000000..c61c37fd --- /dev/null +++ b/Sources/_StringProcessing/ByteCodeGen+DSLList.swift @@ -0,0 +1,855 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2025 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +internal import _RegexParser + +extension Compiler.ByteCodeGen { + mutating func emitRoot(_ root: DSLList) throws -> MEProgram { + // If the whole regex is a matcher, then the whole-match value + // is the constructed value. Denote that the current value + // register is the processor's value output. + switch root.nodes.first { + case .matcher: + builder.denoteCurrentValueIsWholeMatchValue() + default: + break + } + + var list = root.nodes[...] + try emitNode(&list) + + builder.canOnlyMatchAtStart = canOnlyMatchAtStart(in: root) + builder.buildAccept() + return try builder.assemble() + } +} + +fileprivate extension Compiler.ByteCodeGen { + /// Implementation for `canOnlyMatchAtStart`, which maintains the option + /// state. + /// + /// For a given specific node, this method can return one of three values: + /// + /// - `true`: This node is guaranteed to match only at the start of a subject. + /// - `false`: This node can match anywhere in the subject. + /// - `nil`: This node is inconclusive about where it can match. + /// + /// In particular, non-required groups and option-setting groups are + /// inconclusive about where they can match. + private mutating func _canOnlyMatchAtStartImpl( + _ list: inout ArraySlice + ) -> Bool? { + guard let node = list.popFirst() else { return false } + switch node { + // Defining cases + case .atom(.assertion(.startOfSubject)): + return true + case .atom(.assertion(.caretAnchor)): + return !options.anchorsMatchNewlines + + // Changing options doesn't determine `true`/`false`. + case .atom(.changeMatchingOptions(let sequence)): + options.apply(sequence.ast) + return nil + + // Any other atom or consuming node returns `false`. + case .atom, .customCharacterClass, .quotedLiteral: + return false + + // Trivia/empty have no effect. + case .trivia, .empty: + return nil + + // In an alternation, all of its children must match only at start. + case .orderedChoice(let children): + for _ in 0.. Bool { + let currentOptions = options + options = MatchingOptions() + defer { options = currentOptions } + + var list = list.nodes[...] + return _canOnlyMatchAtStartImpl(&list) ?? false + } + + mutating func emitAlternationGen( + _ elements: inout ArraySlice, + alternationCount: Int, + withBacktracking: Bool, + _ body: (inout Compiler.ByteCodeGen, inout ArraySlice) throws -> Void + ) rethrows { + // Alternation: p0 | p1 | ... | pn + // save next_p1 + // + // branch done + // next_p1: + // save next_p2 + // + // branch done + // next_p2: + // save next_p... + // + // branch done + // ... + // next_pn: + // + // done: + let done = builder.makeAddress() + for _ in 1.., + alternationCount count: Int + ) throws { + try emitAlternationGen(&list, alternationCount: count, withBacktracking: true) { + try $0.emitNode(&$1) + } + } + + mutating func emitPositiveLookahead(_ list: inout ArraySlice) throws { + /* + save(restoringAt: success) + save(restoringAt: intercept) + // failure restores at intercept + clearThrough(intercept) // remove intercept and any leftovers from + fail(preservingCaptures: true) // ->success + intercept: + clearSavePoint // remove success + fail // propagate failure + success: + ... + */ + let intercept = builder.makeAddress() + let success = builder.makeAddress() + + builder.buildSave(success) + builder.buildSave(intercept) + try emitNode(&list) + builder.buildClearThrough(intercept) + builder.buildFail(preservingCaptures: true) // Lookahead succeeds here + + builder.label(intercept) + builder.buildClear() + builder.buildFail() + + builder.label(success) + } + + mutating func emitNegativeLookahead(_ list: inout ArraySlice) throws { + /* + save(restoringAt: success) + save(restoringAt: intercept) + // failure restores at intercept + clearThrough(intercept) // remove intercept and any leftovers from + clearSavePoint // remove success + fail // propagate failure + intercept: + fail // ->success + success: + ... + */ + let intercept = builder.makeAddress() + let success = builder.makeAddress() + + builder.buildSave(success) + builder.buildSave(intercept) + try emitNode(&list) + builder.buildClearThrough(intercept) + builder.buildClear() + builder.buildFail() + + builder.label(intercept) + builder.buildFail() + + builder.label(success) + } + + mutating func emitLookaround( + _ kind: (forwards: Bool, positive: Bool), + _ list: inout ArraySlice + ) throws { + guard kind.forwards else { + throw Unsupported("backwards assertions") + } + if kind.positive { + try emitPositiveLookahead(&list) + } else { + try emitNegativeLookahead(&list) + } + } + + mutating func emitAtomicNoncapturingGroup( + _ list: inout ArraySlice + ) throws { + /* + save(continuingAt: success) + save(restoringAt: intercept) + // failure restores at intercept + clearThrough(intercept) // remove intercept and any leftovers from + fail(preservingCaptures: true) // ->success + intercept: + clearSavePoint // remove success + fail // propagate failure + success: + ... + */ + + let intercept = builder.makeAddress() + let success = builder.makeAddress() + + builder.buildSaveAddress(success) + builder.buildSave(intercept) + try emitNode(&list) + builder.buildClearThrough(intercept) + builder.buildFail(preservingCaptures: true) // Atomic group succeeds here + + builder.label(intercept) + builder.buildClear() + builder.buildFail() + + builder.label(success) + } + + mutating func emitNoncapturingGroup( + _ kind: AST.Group.Kind, + _ list: inout ArraySlice + ) throws { + assert(!kind.isCapturing) + + options.beginScope() + defer { options.endScope() } + + if let lookaround = kind.lookaroundKind { + try emitLookaround(lookaround, &list) + return + } + + switch kind { + case .lookahead, .negativeLookahead, + .lookbehind, .negativeLookbehind: + throw Unreachable("TODO: reason") + + case .capture, .namedCapture, .balancedCapture: + throw Unreachable("These should produce a capture node") + + case .changeMatchingOptions(let optionSequence): + if !hasEmittedFirstMatchableAtom { + builder.initialOptions.apply(optionSequence) + } + options.apply(optionSequence) + try emitNode(&list) + + case .atomicNonCapturing: + try emitAtomicNoncapturingGroup(&list) + + default: + // FIXME: Other kinds... + try emitNode(&list) + } + } + + mutating func emitQuantification( + _ amount: AST.Quantification.Amount, + _ kind: DSLTree.QuantificationKind, + _ list: inout ArraySlice + ) throws { + let updatedKind: AST.Quantification.Kind + switch kind { + case .explicit(let kind): + updatedKind = kind.ast + case .syntax(let kind): + updatedKind = kind.ast.applying(options) + case .default: + updatedKind = options.defaultQuantificationKind + } + + let (low, high) = amount.bounds + guard let low = low else { + throw Unreachable("Must have a lower bound") + } + switch (low, high) { + case (_, 0): + try skipNode(&list) + return + case let (n, m?) where n > m: + // TODO: Should error out earlier, maybe DSL and parser + // has validation logic? + return + + case let (n, m) where m == nil || n <= m!: + // Ok + break + default: + throw Unreachable("TODO: reason") + } + + // Compiler and/or parser should enforce these invariants + // before we are called + assert(high != 0) + assert((0...(high ?? Int.max)).contains(low)) + + let maxExtraTrips: Int? + if let h = high { + maxExtraTrips = h - low + } else { + maxExtraTrips = nil + } + let minTrips = low + assert((maxExtraTrips ?? 1) >= 0) + + var tmp = list + if tryEmitFastQuant(&tmp, updatedKind, minTrips, maxExtraTrips) { + list = tmp + return + } + + // The below is a general algorithm for bounded and unbounded + // quantification. It can be specialized when the min + // is 0 or 1, or when extra trips is 1 or unbounded. + // + // Stuff inside `<` and `>` are decided at compile time, + // while run-time values stored in registers start with a `%` + _ = """ + min-trip-count control block: + if %minTrips is zero: + goto exit-policy control block + else: + decrement %minTrips and fallthrough + + loop-body: + : + mov currentPosition %pos + evaluate the subexpression + : + if %pos is currentPosition: + goto exit + goto min-trip-count control block + + exit-policy control block: + if %maxExtraTrips is zero: + goto exit + else: + decrement %maxExtraTrips and fallthrough + + : + save exit and goto loop-body + : + ratchet and goto loop + : + save loop-body and fallthrough (i.e. goto exit) + + exit + ... the rest of the program ... + """ + + // Specialization based on `minTrips` for 0 or 1: + _ = """ + min-trip-count control block: + : + goto exit-policy + : + /* fallthrough */ + + loop-body: + evaluate the subexpression + + /* fallthrough */ + """ + + // Specialization based on `maxExtraTrips` for 0 or unbounded + _ = """ + exit-policy control block: + : + goto exit + : + /* fallthrough */ + """ + + /* + NOTE: These specializations don't emit the optimal + code layout (e.g. fallthrough vs goto), but that's better + done later (not prematurely) and certainly better + done by an optimizing compiler. + + NOTE: We're intentionally emitting essentially the same + algorithm for all quantifications for now, for better + testing and surfacing difficult bugs. We can specialize + for other things, like `.*`, later. + + When it comes time for optimizing, we can also look into + quantification instructions (e.g. reduce save-point traffic) + */ + + let minTripsControl = builder.makeAddress() + let loopBody = builder.makeAddress() + let exitPolicy = builder.makeAddress() + let exit = builder.makeAddress() + + // We'll need registers if we're (non-trivially) bounded + let minTripsReg: IntRegister? + if minTrips > 1 { + minTripsReg = builder.makeIntRegister( + initialValue: minTrips) + } else { + minTripsReg = nil + } + + let maxExtraTripsReg: IntRegister? + if (maxExtraTrips ?? 0) > 0 { + maxExtraTripsReg = builder.makeIntRegister( + initialValue: maxExtraTrips!) + } else { + maxExtraTripsReg = nil + } + + // Set up a dummy save point for possessive to update + if updatedKind == .possessive { + builder.pushEmptySavePoint() + } + + // min-trip-count: + // condBranch(to: exitPolicy, ifZeroElseDecrement: %min) + builder.label(minTripsControl) + switch minTrips { + case 0: builder.buildBranch(to: exitPolicy) + case 1: break + default: + assert(minTripsReg != nil, "logic inconsistency") + builder.buildCondBranch( + to: exitPolicy, ifZeroElseDecrement: minTripsReg!) + } + + // FIXME: Possessive needs a "dummy" save point to ratchet + + // loop: + // + // branch min-trip-count + builder.label(loopBody) + + // if we aren't sure if the child node will have forward progress and + // we have an unbounded quantification + let startPosition: PositionRegister? + // FIXME: forward progress check?! + let emitPositionChecking = + (!optimizationsEnabled || (list.first?.guaranteesForwardProgress != true)) && + maxExtraTrips == nil + + if emitPositionChecking { + startPosition = builder.makePositionRegister() + builder.buildMoveCurrentPosition(into: startPosition!) + } else { + startPosition = nil + } + try emitNode(&list) + if emitPositionChecking { + // in all quantifier cases, no matter what minTrips or maxExtraTrips is, + // if we have a successful non-advancing match, branch to exit because it + // can match an arbitrary number of times + builder.buildCondBranch(to: exit, ifSamePositionAs: startPosition!) + } + + if minTrips <= 1 { + // fallthrough + } else { + builder.buildBranch(to: minTripsControl) + } + + // exit-policy: + // condBranch(to: exit, ifZeroElseDecrement: %maxExtraTrips) + // + // + // , + _ kind: AST.Quantification.Kind, + _ minTrips: Int, + _ maxExtraTrips: Int? + ) -> Bool { + let isScalarSemantics = options.semanticLevel == .unicodeScalar + guard optimizationsEnabled + && minTrips <= QuantifyPayload.maxStorableTrips + && maxExtraTrips ?? 0 <= QuantifyPayload.maxStorableTrips + && kind != .reluctant else { + return false + } + guard let child = list.popFirst() else { return false } + + switch child { + case .customCharacterClass(let ccc): + // ascii only custom character class + guard let bitset = ccc.asAsciiBitset(options) else { + return false + } + builder.buildQuantify(bitset: bitset, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + + case .atom(let atom): + switch atom { + case .char(let c): + if options.isCaseInsensitive && c.isCased { + // Cased character with case-insensitive matching; match only as an ASCII bitset + guard let bitset = DSLTree.CustomCharacterClass(members: [.atom(atom)]).asAsciiBitset(options) else { + return false + } + builder.buildQuantify(bitset: bitset, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + } else { + // Uncased character OR case-sensitive matching; match as a single scalar ascii value character + guard let val = c._singleScalarAsciiValue else { + return false + } + builder.buildQuantify(asciiChar: val, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + } + + case .any: + builder.buildQuantifyAny( + matchesNewlines: true, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + case .anyNonNewline: + builder.buildQuantifyAny( + matchesNewlines: false, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + case .dot: + builder.buildQuantifyAny( + matchesNewlines: options.dotMatchesNewline, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + + case .characterClass(let cc): + // Custom character class that consumes a single grapheme + let model = cc.asRuntimeModel(options) + builder.buildQuantify( + model: model, + kind, + minTrips, + maxExtraTrips, + isScalarSemantics: isScalarSemantics) + default: + return false + } + case .limitCaptureNesting(let node): + if tryEmitFastQuant(&list, kind, minTrips, maxExtraTrips) { + return true + } else { + return false + } + case .nonCapturingGroup(let groupKind, let node): + // .nonCapture nonCapturingGroups are ignored during compilation + guard groupKind.ast == .nonCapture else { + return false + } + if tryEmitFastQuant(&list, kind, minTrips, maxExtraTrips) { + return true + } else { + return false + } + default: + return false + } + return true + } + + mutating func emitConcatenation( + _ list: inout ArraySlice, + componentCount: Int + ) throws { + // Unlike the tree-based bytecode generator, in a DSLList concatenations + // have already been flattened. + for _ in 0..) throws -> ValueRegister? { + guard let node = list.popFirst() else { return nil } + switch node { + + case let .orderedChoice(children): + let n = children.count + try emitAlternation(&list, alternationCount: n) + + case let .concatenation(children): + let n = children.count + try emitConcatenation(&list, componentCount: n) + + case let .capture(name, refId, _, transform): + options.beginScope() + defer { options.endScope() } + + let cap = builder.makeCapture(id: refId, name: name) + builder.buildBeginCapture(cap) + let value = try emitNode(&list) + builder.buildEndCapture(cap) + // If the child node produced a custom capture value, e.g. the result of + // a matcher, this should override the captured substring. + if let value { + builder.buildMove(value, into: cap) + } + // If there's a capture transform, apply it now. + if let transform = transform { + let fn = builder.makeTransformFunction { input, cap in + // If it's a substring capture with no custom value, apply the + // transform directly to the substring to avoid existential traffic. + // + // FIXME: separate out this code path. This is fragile, + // slow, and these are clearly different constructs + if let range = cap.range, cap.value == nil { + return try transform(input[range]) + } + + let value = constructExistentialOutputComponent( + from: input, + component: cap.deconstructed, + optionalCount: 0) + return try transform(value) + } + builder.buildTransformCapture(cap, fn) + } + + case let .nonCapturingGroup(kind, _): + try emitNoncapturingGroup(kind.ast, &list) + + case let .ignoreCapturesInTypedOutput(_): + try emitNode(&list) + + case let .limitCaptureNesting(_): + return try emitNode(&list) + + case .conditional: + throw Unsupported("Conditionals") + + case let .quantification(amt, kind, _): + try emitQuantification(amt.ast, kind, &list) + + case let .customCharacterClass(ccc): + if ccc.containsDot { + if !ccc.isInverted { + try emitDot() + } else { + throw Unsupported("Inverted any") + } + } else { + try emitCustomCharacterClass(ccc) + } + + case let .atom(a): + try emitAtom(a) + + case let .quotedLiteral(s): + emitQuotedLiteral(s) + + case .absentFunction: + throw Unsupported("absent function") + case .consumer: + throw Unsupported("consumer") + + case let .matcher(_, f): + return emitMatcher(f) + + case .characterPredicate: + throw Unsupported("character predicates") + + case .trivia, .empty: + return nil + } + return nil + } +} + +// MARK: Skip node + +extension Compiler.ByteCodeGen { + mutating func skipNode( + _ list: inout ArraySlice, + preservingCaptures: Bool = true + ) throws { + guard let node = list.popFirst() else { return } + switch node { + case let .orderedChoice(children): + let n = children.count + for _ in 0.. 0 && child.guaranteesForwardProgress + case .limitCaptureNesting(let node), .ignoreCapturesInTypedOutput(let node): + return node.guaranteesForwardProgress default: return false } } diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 33cffaf2..e2fd2a28 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -32,6 +32,10 @@ class Compiler { } __consuming func emit() throws -> MEProgram { + try emitViaList() + } + + __consuming func emitViaTree() throws -> MEProgram { // TODO: Handle global options var codegen = ByteCodeGen( options: options, @@ -40,6 +44,17 @@ class Compiler { captureList: tree.captureList) return try codegen.emitRoot(tree.root) } + + __consuming func emitViaList() throws -> MEProgram { + // TODO: Handle global options + let dslList = DSLList(tree: tree) + var codegen = ByteCodeGen( + options: options, + compileOptions: + compileOptions, + captureList: tree.captureList) + return try codegen.emitRoot(dslList) + } } /// Hashable wrapper for `Any.Type`. diff --git a/Sources/_StringProcessing/LiteralPrinter.swift b/Sources/_StringProcessing/LiteralPrinter.swift index 5c136827..fa80f032 100644 --- a/Sources/_StringProcessing/LiteralPrinter.swift +++ b/Sources/_StringProcessing/LiteralPrinter.swift @@ -116,11 +116,9 @@ extension LiteralPrinter { outputNode(child) output(")") - case let .ignoreCapturesInTypedOutput(child): + case let .ignoreCapturesInTypedOutput(child), + let .limitCaptureNesting(child): outputNode(child) - case .convertedRegexLiteral(let node, _): - outputNode(node) - case let .quantification(amount, kind, node): outputQuantification(amount, kind, node) case let .customCharacterClass(charClass): diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 34ca44f0..2f6ebab6 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -179,6 +179,9 @@ extension PrettyPrinter { case let .ignoreCapturesInTypedOutput(child): printAsPattern(convertedFromAST: child, isTopLevel: isTopLevel) + case let .limitCaptureNesting(child): + printAsPattern(convertedFromAST: child, isTopLevel: isTopLevel) + case .conditional: print("/* TODO: conditional */") @@ -258,20 +261,6 @@ extension PrettyPrinter { break - case let .convertedRegexLiteral(.atom(a), _): - if let pattern = a._patternBase(&self), pattern.canBeWrapped { - printAtom(pattern.0) - return - } - - break - case let .convertedRegexLiteral(.customCharacterClass(ccc), _): - if ccc.isSimplePrint { - printSimpleCCC(ccc) - return - } - - break default: break } @@ -305,13 +294,6 @@ extension PrettyPrinter { case let .quotedLiteral(v): print(v._quoted) - case let .convertedRegexLiteral(n, _): - // FIXME: This recursion coordinates with back-off - // check above, so it should work out. Need a - // cleaner way to do this. This means the argument - // label is a lie. - printAsPattern(convertedFromAST: n, isTopLevel: isTopLevel) - case let .customCharacterClass(ccc): printAsPattern(ccc) @@ -1431,9 +1413,6 @@ extension DSLTree.Node { result += node.getNamedCaptures() } - case .convertedRegexLiteral(let node, _): - result += node.getNamedCaptures() - case .quantification(_, _, let node): result += node.getNamedCaptures() diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 49094d4f..fbb18955 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -13,28 +13,13 @@ internal import _RegexParser extension AST { var dslTree: DSLTree { - return DSLTree(root.dslTreeNode) + return DSLTree(.limitCaptureNesting(root.dslTreeNode)) } } extension AST.Node { /// Converts an AST node to a `convertedRegexLiteral` node. var dslTreeNode: DSLTree.Node { - func wrap(_ node: DSLTree.Node) -> DSLTree.Node { - switch node { - case .convertedRegexLiteral: - // FIXME: DSL can have one item concats -// assertionFailure("Double wrapping?") - return node - default: - break - } - // TODO: Should we do this for the - // single-concatenation child too, or should? - // we wrap _that_? - return .convertedRegexLiteral(node, .init(ast: self)) - } - // Convert the top-level node without wrapping func convert() throws -> DSLTree.Node { switch self { @@ -105,9 +90,8 @@ extension AST.Node { } } - // FIXME: make total function again let converted = try! convert() - return wrap(converted) + return converted } } diff --git a/Sources/_StringProcessing/Regex/DSLList.swift b/Sources/_StringProcessing/Regex/DSLList.swift new file mode 100644 index 00000000..1bbb0c9c --- /dev/null +++ b/Sources/_StringProcessing/Regex/DSLList.swift @@ -0,0 +1,96 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2025 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +struct DSLList { + var nodes: [DSLTree.Node] + + init(_ initial: DSLTree.Node) { + self.nodes = [initial] + } + + init(_ nodes: [DSLTree.Node]) { + self.nodes = nodes + } + + init(tree: DSLTree) { + self.nodes = Array(tree.depthFirst) + } +} + +extension DSLTree.Node { + var directChildren: Int { + switch self { + case .trivia, .empty, .quotedLiteral, + .consumer, .matcher, .characterPredicate, + .customCharacterClass, .atom: + return 0 + + case .orderedChoice(let c), .concatenation(let c): + return c.count + + case .capture, .nonCapturingGroup, + .quantification, .ignoreCapturesInTypedOutput, + .limitCaptureNesting, .conditional: + return 1 + + case .absentFunction: + return 0 + } + } +} + +extension DSLTree { + struct DepthFirst: Sequence, IteratorProtocol { + typealias Element = DSLTree.Node + private var stack: [Frame] + private let getChildren: (Element) -> [Element] + + private struct Frame { + let node: Element + let children: [Element] + var nextIndex: Int = 0 + } + + fileprivate init( + root: Element, + getChildren: @escaping (Element) -> [Element] + ) { + self.getChildren = getChildren + self.stack = [Frame(node: root, children: getChildren(root))] + } + + mutating func next() -> Element? { + guard let top = stack.popLast() else { return nil } + // Push children in reverse so leftmost comes out first. + for child in top.children.reversed() { + stack.append(Frame(node: child, children: getChildren(child))) + } + + // Since we coalesce the children before adding them to the stack, + // we need an exact matching number of children in the list's + // concatenation node, so that it can provide the correct component + // count. This will go away/change when .concatenation only stores + // a count. + return switch top.node { + case .concatenation: + .concatenation(top.node.coalescedChildren) + default: + top.node + } + } + } + + var depthFirst: DepthFirst { + DepthFirst(root: root, getChildren: { + $0.coalescedChildren + }) + } +} diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 5971cd93..03a56397 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -44,7 +44,8 @@ extension DSLTree { /// Marks all captures in a subpattern as ignored in strongly-typed output. case ignoreCapturesInTypedOutput(Node) - + case limitCaptureNesting(Node) + // TODO: Consider splitting off grouped conditions, or have // our own kind @@ -79,13 +80,6 @@ extension DSLTree { /// TODO: Consider splitting off expression functions, or have our own kind case absentFunction(_AST.AbsentFunction) - // MARK: - Tree conversions - - /// The target of AST conversion. - /// - /// Keeps original AST around for rich syntactic and source information - case convertedRegexLiteral(Node, _AST.ASTNode) - // MARK: - Extensibility points case consumer(_ConsumerInterface) @@ -384,8 +378,9 @@ extension DSLTree.Node { case .orderedChoice(let c), .concatenation(let c): return !c.isEmpty - case .convertedRegexLiteral, .capture, .nonCapturingGroup, - .quantification, .ignoreCapturesInTypedOutput, .conditional: + case .capture, .nonCapturingGroup, + .quantification, .ignoreCapturesInTypedOutput, .limitCaptureNesting, + .conditional: return true case .absentFunction(let abs): @@ -398,16 +393,72 @@ extension DSLTree.Node { switch self { case let .orderedChoice(v): return v - case let .concatenation(v): return v + case let .concatenation(v): return v + + case let .capture(_, _, n, _): return [n] + case let .nonCapturingGroup(_, n): return [n] + case let .quantification(_, _, n): return [n] + case let .ignoreCapturesInTypedOutput(n): return [n] + case let .limitCaptureNesting(n): return [n] + + case let .conditional(_, t, f): return [t,f] + + case .trivia, .empty, .quotedLiteral, + .consumer, .matcher, .characterPredicate, + .customCharacterClass, .atom: + return [] + + case let .absentFunction(abs): + return abs.ast.children.map(\.dslTreeNode) + } + } + + public var coalescedChildren: [DSLTree.Node] { + // Before converting a concatenation in a tree to list form, we need to + // flatten out any nested concatenations, and coalesce any adjacent + // characters and scalars, forming quoted literals of their contents, + // over which we can perform grapheme breaking. - case let .convertedRegexLiteral(n, _): - // Treat this transparently - return n.children + func flatten(_ node: DSLTree.Node) -> [DSLTree.Node] { + switch node { + case .concatenation(let ch): + return ch.flatMap(flatten) + case .ignoreCapturesInTypedOutput(let n), .limitCaptureNesting(let n): + return flatten(n) + default: + return [node] + } + } + + switch self { + case let .orderedChoice(v): return v + case let .concatenation(v): + let children = v + .flatMap(flatten) + .coalescing(with: "", into: DSLTree.Node.quotedLiteral) { str, node in + switch node { + case .atom(let a): + guard let c = a.literalCharacterValue else { return false } + str.append(c) + return true + case .quotedLiteral(let q): + str += q + return true + case .trivia: + // Trivia can be completely ignored if we've already coalesced + // something. + return !str.isEmpty + default: + return false + } + } + return children case let .capture(_, _, n, _): return [n] case let .nonCapturingGroup(_, n): return [n] case let .quantification(_, _, n): return [n] - case let .ignoreCapturesInTypedOutput(n): return [n] + case let .ignoreCapturesInTypedOutput(n): return [n] + case let .limitCaptureNesting(n): return [n] case let .conditional(_, t, f): return [t,f] @@ -424,18 +475,12 @@ extension DSLTree.Node { extension DSLTree.Node { var astNode: AST.Node? { - switch self { - case let .convertedRegexLiteral(_, literal): return literal.ast - default: return nil - } + nil } /// If this node is for a converted literal, look through it. var lookingThroughConvertedLiteral: Self { - switch self { - case let .convertedRegexLiteral(n, _): return n - default: return self - } + self } } @@ -468,10 +513,6 @@ extension DSLTree.Node { switch self { case .capture: return true - case let .convertedRegexLiteral(n, re): - assert(n.hasCapture == re.ast.hasCapture) - return n.hasCapture - default: return self.children.any(\.hasCapture) } @@ -655,6 +696,9 @@ extension CaptureList.Builder { case let .ignoreCapturesInTypedOutput(child): addCaptures(of: child, optionalNesting: nesting, visibleInTypedOutput: false) + case let .limitCaptureNesting(child): + addCaptures(of: child, optionalNesting: nesting.disablingNesting, visibleInTypedOutput: visibleInTypedOutput) + case let .conditional(cond, trueBranch, falseBranch): switch cond.ast { case .group(let g): @@ -685,11 +729,11 @@ extension CaptureList.Builder { #endif } - case let .convertedRegexLiteral(n, _): - // We disable nesting for converted AST trees, as literals do not nest - // captures. This includes literals nested in a DSL. - return addCaptures(of: n, optionalNesting: nesting.disablingNesting, visibleInTypedOutput: visibleInTypedOutput) - +// case let .convertedRegexLiteral(n, _): +// // We disable nesting for converted AST trees, as literals do not nest +// // captures. This includes literals nested in a DSL. +// return addCaptures(of: n, optionalNesting: nesting.disablingNesting, visibleInTypedOutput: visibleInTypedOutput) +// case .matcher: break @@ -717,8 +761,8 @@ extension DSLTree.Node { return true case .orderedChoice, .concatenation, .capture, .conditional, .quantification, .customCharacterClass, .atom, - .trivia, .empty, .quotedLiteral, .absentFunction, - .convertedRegexLiteral, .consumer, + .trivia, .empty, .quotedLiteral, .limitCaptureNesting, + .consumer, .absentFunction, .characterPredicate, .matcher: return false } @@ -805,8 +849,7 @@ extension DSLTree.Node { options.beginScope() defer { options.endScope() } return child._canOnlyMatchAtStartImpl(&options) - case .ignoreCapturesInTypedOutput(let child), - .convertedRegexLiteral(let child, _): + case .ignoreCapturesInTypedOutput(let child), .limitCaptureNesting(let child): return child._canOnlyMatchAtStartImpl(&options) // A quantification that doesn't require its child to exist can still @@ -869,14 +912,13 @@ extension DSLTree { case let .orderedChoice(v): return v.map(_Tree.init) case let .concatenation(v): return v.map(_Tree.init) - case let .convertedRegexLiteral(n, _): - // Treat this transparently - return _Tree(n).children - case let .capture(_, _, n, _): return [_Tree(n)] case let .nonCapturingGroup(_, n): return [_Tree(n)] case let .quantification(_, _, n): return [_Tree(n)] - case let .ignoreCapturesInTypedOutput(n): return [_Tree(n)] + case let .ignoreCapturesInTypedOutput(n): return [_Tree(n)] + case let .limitCaptureNesting(n): + // This is a transparent wrapper + return _Tree(n).children case let .conditional(_, t, f): return [_Tree(t), _Tree(f)] diff --git a/Tests/RegexTests/DSLListTests.swift b/Tests/RegexTests/DSLListTests.swift new file mode 100644 index 00000000..d8acec73 --- /dev/null +++ b/Tests/RegexTests/DSLListTests.swift @@ -0,0 +1,37 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2025 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +import Testing +@testable import _StringProcessing + +@Suite +struct DSLListTests { + @available(macOS 9999, *) + @Test(arguments: [ + (#/a/#, 2), // literal, a + (#/abcd+/#, 5), // literal, concat, abc, quant, d + (#/a(?:b+)c*/#, 8), // literal, concat, a, noncap grp, quant, b, quant, c + ]) + func convertedNodeCount(regex: Regex, nodeCount: Int) { + let dslList = DSLList(tree: regex.program.tree) + #expect(dslList.nodes.count == nodeCount) + } + + @Test(arguments: [#/a|b/#, #/a+b?c/#, #/abc/#, #/a(?:b+)c*/#, #/;[\r\n]/#, #/(?=(?:[1-9]|(?:a|b)))/#]) + func compilationComparison(regex: Regex) throws { + let listCompiler = Compiler(tree: regex.program.tree) + let listProgram = try listCompiler.emitViaList() + let treeCompiler = Compiler(tree: regex.program.tree) + let treeProgram = try treeCompiler.emit() + + #expect(treeProgram.instructions == listProgram.instructions) + } +} diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index e20beeaf..e36285ae 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -37,16 +37,34 @@ func _roundTripLiteral( return remadeRegex } +func _validateListCompilation( + _ regex: Regex +) throws -> Bool { + let treeCompiler = Compiler(tree: regex.program.tree) + let treeProgram = try treeCompiler.emitViaTree() + let listCompiler = Compiler(tree: regex.program.tree) + let listProgram = try listCompiler.emitViaList() + return treeProgram.instructions == listProgram.instructions +} + func _firstMatch( _ regexStr: String, input: String, validateOptimizations: Bool, semanticLevel: RegexSemanticLevel = .graphemeCluster, - syntax: SyntaxOptions = .traditional + syntax: SyntaxOptions = .traditional, + file: StaticString = #file, + line: UInt = #line ) throws -> (String, [String?])? { var regex = try Regex(regexStr, syntax: syntax).matchingSemantics(semanticLevel) let result = try regex.firstMatch(in: input) - + + if try !_validateListCompilation(regex) { + XCTFail( + "List compilation failed for '\(regexStr)'", + file: file, line: line) + } + func validateSubstring(_ substringInput: Substring) throws { // Sometimes the characters we add to a substring merge with existing // string members. This messes up cross-validation, so skip the test. @@ -105,14 +123,18 @@ func _firstMatch( For input '\(input)' Original: '\(regexStr)' _literalPattern: '\(roundTripRegex?._literalPattern ?? "")' - """) + """, + file: file, + line: line) case let (_, rtMatch?): XCTFail(""" Incorrectly matched as '\(rtMatch)' For input '\(input)' Original: '\(regexStr)' _literalPattern: '\(roundTripRegex!._literalPattern!)' - """) + """, + file: file, + line: line) } } @@ -184,7 +206,8 @@ func flatCaptureTest( input: test, validateOptimizations: validateOptimizations, semanticLevel: semanticLevel, - syntax: syntax + syntax: syntax, + file: file, line: line ) else { if expect == nil { continue @@ -303,7 +326,8 @@ func firstMatchTest( input: input, validateOptimizations: validateOptimizations, semanticLevel: semanticLevel, - syntax: syntax)?.0 + syntax: syntax, + file: file, line: line)?.0 if xfail { XCTAssertNotEqual(found, match, file: file, line: line)