From 97e7b8a8bd97f027733611b259ce035ce36db3ef Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 25 Sep 2025 13:03:19 -0500 Subject: [PATCH 1/6] Begin work on flatted DSLTree --- Sources/_StringProcessing/Regex/DSLList.swift | 113 ++++++++++++++++++ Tests/RegexTests/DSLListTests.swift | 21 ++++ 2 files changed, 134 insertions(+) create mode 100644 Sources/_StringProcessing/Regex/DSLList.swift create mode 100644 Tests/RegexTests/DSLListTests.swift diff --git a/Sources/_StringProcessing/Regex/DSLList.swift b/Sources/_StringProcessing/Regex/DSLList.swift new file mode 100644 index 00000000..af068fce --- /dev/null +++ b/Sources/_StringProcessing/Regex/DSLList.swift @@ -0,0 +1,113 @@ +// +// DSLList.swift +// swift-experimental-string-processing +// +// Created by Nate Cook on 9/25/25. +// + +struct DSLList { + var nodes: [DSLTree.Node] + + init(_ initial: DSLTree.Node) { + self.nodes = [initial] + } + + init(_ nodes: [DSLTree.Node]) { + self.nodes = nodes + } + + init(root: DSLTree.Node) { + self.nodes = Array(root) + } +} + +extension DSLList { + struct Children: Sequence { + var nodes: [DSLTree.Node] + var firstChildIndex: Int + + struct Iterator: IteratorProtocol { + var nodes: [DSLTree.Node] + var currentIndex: Int + var remainingCount: Int + + mutating func next() -> DSLTree.Node? { + guard remainingCount > 0 else { return nil } + guard currentIndex < nodes.count else { + // FIXME: assert? 
+ print("ERROR: index out of bounds") + return nil + } + remainingCount -= 1 + var nextIndex = currentIndex + var inc = nodes[currentIndex].directChildren + 1 + while inc > 0 { + nextIndex += 1 + inc += nodes[nextIndex].directChildren - 1 + } + + return nodes[currentIndex] + } + } + + func makeIterator() -> Iterator { + Iterator(nodes: nodes, currentIndex: firstChildIndex, remainingCount: nodes[firstChildIndex].directChildren) + } + } +} + +extension DSLTree.Node { + var directChildren: Int { + switch self { + case .trivia, .empty, .quotedLiteral, + .consumer, .matcher, .characterPredicate, + .customCharacterClass, .atom: + return 0 + + case .orderedChoice(let c), .concatenation(let c): + return c.count + + case .convertedRegexLiteral, .capture, .nonCapturingGroup, + .quantification, .ignoreCapturesInTypedOutput, .conditional: + return 1 + + case .absentFunction: + return 0 + } + } +} + +extension DSLTree.Node: Sequence { + struct Iterator: Sequence, IteratorProtocol { + typealias Element = DSLTree.Node + private var stack: [Frame] + private let getChildren: (Element) -> [Element] + + private struct Frame { + let node: Element + let children: [Element] + var nextIndex: Int = 0 + } + + fileprivate init( + root: Element, + getChildren: @escaping (Element) -> [Element] + ) { + self.getChildren = getChildren + self.stack = [Frame(node: root, children: getChildren(root))] + } + + mutating func next() -> Element? { + guard let top = stack.popLast() else { return nil } + // Push children in reverse so leftmost comes out first. 
+ for child in top.children.reversed() { + stack.append(Frame(node: child, children: getChildren(child))) + } + return top.node + } + } + + func makeIterator() -> Iterator { + Iterator(root: self, getChildren: { $0.children }) + } +} diff --git a/Tests/RegexTests/DSLListTests.swift b/Tests/RegexTests/DSLListTests.swift new file mode 100644 index 00000000..feffa6bc --- /dev/null +++ b/Tests/RegexTests/DSLListTests.swift @@ -0,0 +1,21 @@ +// +// DSLListTests.swift +// swift-experimental-string-processing +// +// Created by Nate Cook on 9/25/25. +// + +import Testing +@testable import _StringProcessing + +@Suite +struct DSLListTests { + @Test(arguments: [(#/abc/#, 4), (#/a(?:b+)c*/#, 7)]) + func simple(regex: Regex, nodeCount: Int) { + let dslList = DSLList(root: regex.root) + #expect(dslList.nodes.count == nodeCount) + for (i, node) in dslList.nodes.enumerated() { + print(i, node) + } + } +} From fa993aa5b98c3af798961267113f55fc07f98775 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 25 Sep 2025 20:29:36 -0500 Subject: [PATCH 2/6] Eliminate convertedRegexLiteral wrapper node --- Sources/_StringProcessing/ByteCodeGen.swift | 14 ++-- .../_StringProcessing/LiteralPrinter.swift | 6 +- .../_StringProcessing/PrintAsPattern.swift | 27 +------- .../Regex/ASTConversion.swift | 33 ++++----- Sources/_StringProcessing/Regex/DSLList.swift | 5 +- Sources/_StringProcessing/Regex/DSLTree.swift | 68 ++++++++----------- 6 files changed, 59 insertions(+), 94 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 90a47bdf..a3f449c4 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -809,7 +809,7 @@ fileprivate extension Compiler.ByteCodeGen { default: return false } - case .convertedRegexLiteral(let node, _): + case .limitCaptureNesting(let node): return tryEmitFastQuant(node, kind, minTrips, maxExtraTrips) case .nonCapturingGroup(let groupKind, let node): // 
.nonCapture nonCapturingGroups are ignored during compilation @@ -1203,7 +1203,7 @@ fileprivate extension Compiler.ByteCodeGen { switch node { case .concatenation(let ch): return ch.flatMap(flatten) - case .convertedRegexLiteral(let n, _), .ignoreCapturesInTypedOutput(let n): + case .ignoreCapturesInTypedOutput(let n), .limitCaptureNesting(let n): return flatten(n) default: return [node] @@ -1283,6 +1283,9 @@ fileprivate extension Compiler.ByteCodeGen { case let .ignoreCapturesInTypedOutput(child): try emitNode(child) + case let .limitCaptureNesting(child): + return try emitNode(child) + case .conditional: throw Unsupported("Conditionals") @@ -1306,9 +1309,6 @@ fileprivate extension Compiler.ByteCodeGen { case let .quotedLiteral(s): emitQuotedLiteral(s) - case let .convertedRegexLiteral(n, _): - return try emitNode(n) - case .absentFunction: throw Unsupported("absent function") case .consumer: @@ -1359,8 +1359,6 @@ extension DSLTree.Node { return false case .quotedLiteral(let string): return !string.isEmpty - case .convertedRegexLiteral(let node, _): - return node.guaranteesForwardProgress case .consumer, .matcher: // Allow zero width consumers and matchers return false @@ -1369,6 +1367,8 @@ extension DSLTree.Node { case .quantification(let amount, _, let child): let (atLeast, _) = amount.ast.bounds return atLeast ?? 
0 > 0 && child.guaranteesForwardProgress + case .limitCaptureNesting(let node), .ignoreCapturesInTypedOutput(let node): + return node.guaranteesForwardProgress default: return false } } diff --git a/Sources/_StringProcessing/LiteralPrinter.swift b/Sources/_StringProcessing/LiteralPrinter.swift index 5c136827..fa80f032 100644 --- a/Sources/_StringProcessing/LiteralPrinter.swift +++ b/Sources/_StringProcessing/LiteralPrinter.swift @@ -116,11 +116,9 @@ extension LiteralPrinter { outputNode(child) output(")") - case let .ignoreCapturesInTypedOutput(child): + case let .ignoreCapturesInTypedOutput(child), + let .limitCaptureNesting(child): outputNode(child) - case .convertedRegexLiteral(let node, _): - outputNode(node) - case let .quantification(amount, kind, node): outputQuantification(amount, kind, node) case let .customCharacterClass(charClass): diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 34ca44f0..2f6ebab6 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -179,6 +179,9 @@ extension PrettyPrinter { case let .ignoreCapturesInTypedOutput(child): printAsPattern(convertedFromAST: child, isTopLevel: isTopLevel) + case let .limitCaptureNesting(child): + printAsPattern(convertedFromAST: child, isTopLevel: isTopLevel) + case .conditional: print("/* TODO: conditional */") @@ -258,20 +261,6 @@ extension PrettyPrinter { break - case let .convertedRegexLiteral(.atom(a), _): - if let pattern = a._patternBase(&self), pattern.canBeWrapped { - printAtom(pattern.0) - return - } - - break - case let .convertedRegexLiteral(.customCharacterClass(ccc), _): - if ccc.isSimplePrint { - printSimpleCCC(ccc) - return - } - - break default: break } @@ -305,13 +294,6 @@ extension PrettyPrinter { case let .quotedLiteral(v): print(v._quoted) - case let .convertedRegexLiteral(n, _): - // FIXME: This recursion coordinates with back-off - // check above, so it should 
work out. Need a - // cleaner way to do this. This means the argument - // label is a lie. - printAsPattern(convertedFromAST: n, isTopLevel: isTopLevel) - case let .customCharacterClass(ccc): printAsPattern(ccc) @@ -1431,9 +1413,6 @@ extension DSLTree.Node { result += node.getNamedCaptures() } - case .convertedRegexLiteral(let node, _): - result += node.getNamedCaptures() - case .quantification(_, _, let node): result += node.getNamedCaptures() diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 49094d4f..c230e761 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -13,27 +13,27 @@ internal import _RegexParser extension AST { var dslTree: DSLTree { - return DSLTree(root.dslTreeNode) + return DSLTree(.limitCaptureNesting(root.dslTreeNode)) } } extension AST.Node { /// Converts an AST node to a `convertedRegexLiteral` node. var dslTreeNode: DSLTree.Node { - func wrap(_ node: DSLTree.Node) -> DSLTree.Node { - switch node { - case .convertedRegexLiteral: - // FIXME: DSL can have one item concats -// assertionFailure("Double wrapping?") - return node - default: - break - } - // TODO: Should we do this for the - // single-concatenation child too, or should? - // we wrap _that_? - return .convertedRegexLiteral(node, .init(ast: self)) - } +// func wrap(_ node: DSLTree.Node) -> DSLTree.Node { +// switch node { +// case .convertedRegexLiteral(let child, _): +// // FIXME: DSL can have one item concats +//// assertionFailure("Double wrapping?") +// return child +// default: +// break +// } +// // TODO: Should we do this for the +// // single-concatenation child too, or should? +// // we wrap _that_? +// return node +// } // Convert the top-level node without wrapping func convert() throws -> DSLTree.Node { @@ -107,7 +107,8 @@ extension AST.Node { // FIXME: make total function again let converted = try! 
convert() - return wrap(converted) +// return wrap(converted) + return converted } } diff --git a/Sources/_StringProcessing/Regex/DSLList.swift b/Sources/_StringProcessing/Regex/DSLList.swift index af068fce..9c657d97 100644 --- a/Sources/_StringProcessing/Regex/DSLList.swift +++ b/Sources/_StringProcessing/Regex/DSLList.swift @@ -67,8 +67,9 @@ extension DSLTree.Node { case .orderedChoice(let c), .concatenation(let c): return c.count - case .convertedRegexLiteral, .capture, .nonCapturingGroup, - .quantification, .ignoreCapturesInTypedOutput, .conditional: + case .capture, .nonCapturingGroup, + .quantification, .ignoreCapturesInTypedOutput, + .limitCaptureNesting, .conditional: return 1 case .absentFunction: diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 5971cd93..5d5ad075 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -44,7 +44,8 @@ extension DSLTree { /// Marks all captures in a subpattern as ignored in strongly-typed output. case ignoreCapturesInTypedOutput(Node) - + case limitCaptureNesting(Node) + // TODO: Consider splitting off grouped conditions, or have // our own kind @@ -79,13 +80,6 @@ extension DSLTree { /// TODO: Consider splitting off expression functions, or have our own kind case absentFunction(_AST.AbsentFunction) - // MARK: - Tree conversions - - /// The target of AST conversion. 
- /// - /// Keeps original AST around for rich syntactic and source information - case convertedRegexLiteral(Node, _AST.ASTNode) - // MARK: - Extensibility points case consumer(_ConsumerInterface) @@ -384,8 +378,9 @@ extension DSLTree.Node { case .orderedChoice(let c), .concatenation(let c): return !c.isEmpty - case .convertedRegexLiteral, .capture, .nonCapturingGroup, - .quantification, .ignoreCapturesInTypedOutput, .conditional: + case .capture, .nonCapturingGroup, + .quantification, .ignoreCapturesInTypedOutput, .limitCaptureNesting, + .conditional: return true case .absentFunction(let abs): @@ -400,14 +395,14 @@ extension DSLTree.Node { case let .orderedChoice(v): return v case let .concatenation(v): return v - case let .convertedRegexLiteral(n, _): - // Treat this transparently - return n.children - case let .capture(_, _, n, _): return [n] case let .nonCapturingGroup(_, n): return [n] case let .quantification(_, _, n): return [n] - case let .ignoreCapturesInTypedOutput(n): return [n] + case let .ignoreCapturesInTypedOutput(n): return [n] + + case let .limitCaptureNesting(n): + // This is a transparent wrapper + return n.children case let .conditional(_, t, f): return [t,f] @@ -424,18 +419,12 @@ extension DSLTree.Node { extension DSLTree.Node { var astNode: AST.Node? { - switch self { - case let .convertedRegexLiteral(_, literal): return literal.ast - default: return nil - } + nil } /// If this node is for a converted literal, look through it. 
var lookingThroughConvertedLiteral: Self { - switch self { - case let .convertedRegexLiteral(n, _): return n - default: return self - } + self } } @@ -468,10 +457,6 @@ extension DSLTree.Node { switch self { case .capture: return true - case let .convertedRegexLiteral(n, re): - assert(n.hasCapture == re.ast.hasCapture) - return n.hasCapture - default: return self.children.any(\.hasCapture) } @@ -655,6 +640,9 @@ extension CaptureList.Builder { case let .ignoreCapturesInTypedOutput(child): addCaptures(of: child, optionalNesting: nesting, visibleInTypedOutput: false) + case let .limitCaptureNesting(child): + addCaptures(of: child, optionalNesting: nesting.disablingNesting, visibleInTypedOutput: visibleInTypedOutput) + case let .conditional(cond, trueBranch, falseBranch): switch cond.ast { case .group(let g): @@ -685,11 +673,11 @@ extension CaptureList.Builder { #endif } - case let .convertedRegexLiteral(n, _): - // We disable nesting for converted AST trees, as literals do not nest - // captures. This includes literals nested in a DSL. - return addCaptures(of: n, optionalNesting: nesting.disablingNesting, visibleInTypedOutput: visibleInTypedOutput) - +// case let .convertedRegexLiteral(n, _): +// // We disable nesting for converted AST trees, as literals do not nest +// // captures. This includes literals nested in a DSL. 
+// return addCaptures(of: n, optionalNesting: nesting.disablingNesting, visibleInTypedOutput: visibleInTypedOutput) +// case .matcher: break @@ -717,8 +705,8 @@ extension DSLTree.Node { return true case .orderedChoice, .concatenation, .capture, .conditional, .quantification, .customCharacterClass, .atom, - .trivia, .empty, .quotedLiteral, .absentFunction, - .convertedRegexLiteral, .consumer, + .trivia, .empty, .quotedLiteral, .limitCaptureNesting, + .consumer, .absentFunction, .characterPredicate, .matcher: return false } @@ -805,8 +793,7 @@ extension DSLTree.Node { options.beginScope() defer { options.endScope() } return child._canOnlyMatchAtStartImpl(&options) - case .ignoreCapturesInTypedOutput(let child), - .convertedRegexLiteral(let child, _): + case .ignoreCapturesInTypedOutput(let child), .limitCaptureNesting(let child): return child._canOnlyMatchAtStartImpl(&options) // A quantification that doesn't require its child to exist can still @@ -869,14 +856,13 @@ extension DSLTree { case let .orderedChoice(v): return v.map(_Tree.init) case let .concatenation(v): return v.map(_Tree.init) - case let .convertedRegexLiteral(n, _): - // Treat this transparently - return _Tree(n).children - case let .capture(_, _, n, _): return [_Tree(n)] case let .nonCapturingGroup(_, n): return [_Tree(n)] case let .quantification(_, _, n): return [_Tree(n)] - case let .ignoreCapturesInTypedOutput(n): return [_Tree(n)] + case let .ignoreCapturesInTypedOutput(n): return [_Tree(n)] + case let .limitCaptureNesting(n): + // This is a transparent wrapper + return _Tree(n).children case let .conditional(_, t, f): return [_Tree(t), _Tree(f)] From cdff26b4cba9a9f0dfe91350bcbdd986e57433d0 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 3 Oct 2025 22:32:03 -0500 Subject: [PATCH 3/6] Add list-based compilation This implements bytecode generation from a DSLList instead of a DSLTree. 
The change includes tests that all regex patterns in the `MatchTests` file produce the exact same bytecode from a list as from a tree. --- .../ByteCodeGen+DSLList.swift | 723 ++++++++++++++++++ Sources/_StringProcessing/ByteCodeGen.swift | 4 +- Sources/_StringProcessing/Compiler.swift | 14 + .../Regex/ASTConversion.swift | 17 - Sources/_StringProcessing/Regex/DSLList.swift | 37 +- Sources/_StringProcessing/Regex/DSLTree.swift | 66 +- Tests/RegexTests/DSLListTests.swift | 34 +- Tests/RegexTests/MatchTests.swift | 36 +- 8 files changed, 882 insertions(+), 49 deletions(-) create mode 100644 Sources/_StringProcessing/ByteCodeGen+DSLList.swift diff --git a/Sources/_StringProcessing/ByteCodeGen+DSLList.swift b/Sources/_StringProcessing/ByteCodeGen+DSLList.swift new file mode 100644 index 00000000..970f6004 --- /dev/null +++ b/Sources/_StringProcessing/ByteCodeGen+DSLList.swift @@ -0,0 +1,723 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2025 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +internal import _RegexParser + +extension Compiler.ByteCodeGen { + mutating func emitRoot(_ root: DSLList) throws -> MEProgram { + // If the whole regex is a matcher, then the whole-match value + // is the constructed value. Denote that the current value + // register is the processor's value output. + switch root.nodes.first { + case .matcher: + builder.denoteCurrentValueIsWholeMatchValue() + default: + break + } + + var list = root.nodes[...] 
+ try emitNode(&list) + + // FIXME: Restore this canOnlyMatchAtStart + // builder.canOnlyMatchAtStart = root.canOnlyMatchAtStart() + builder.buildAccept() + return try builder.assemble() + } +} + +fileprivate extension Compiler.ByteCodeGen { + mutating func emitAlternationGen( + _ elements: inout ArraySlice, + alternationCount: Int, + withBacktracking: Bool, + _ body: (inout Compiler.ByteCodeGen, inout ArraySlice) throws -> Void + ) rethrows { + // Alternation: p0 | p1 | ... | pn + // save next_p1 + // + // branch done + // next_p1: + // save next_p2 + // + // branch done + // next_p2: + // save next_p... + // + // branch done + // ... + // next_pn: + // + // done: + let done = builder.makeAddress() + for _ in 1.., + alternationCount count: Int + ) throws { + try emitAlternationGen(&list, alternationCount: count, withBacktracking: true) { + try $0.emitNode(&$1) + } + } + + mutating func emitPositiveLookahead(_ list: inout ArraySlice) throws { + /* + save(restoringAt: success) + save(restoringAt: intercept) + // failure restores at intercept + clearThrough(intercept) // remove intercept and any leftovers from + fail(preservingCaptures: true) // ->success + intercept: + clearSavePoint // remove success + fail // propagate failure + success: + ... 
+ */ + let intercept = builder.makeAddress() + let success = builder.makeAddress() + + builder.buildSave(success) + builder.buildSave(intercept) + try emitNode(&list) + builder.buildClearThrough(intercept) + builder.buildFail(preservingCaptures: true) // Lookahead succeeds here + + builder.label(intercept) + builder.buildClear() + builder.buildFail() + + builder.label(success) + } + + mutating func emitNegativeLookahead(_ list: inout ArraySlice) throws { + /* + save(restoringAt: success) + save(restoringAt: intercept) + // failure restores at intercept + clearThrough(intercept) // remove intercept and any leftovers from + clearSavePoint // remove success + fail // propagate failure + intercept: + fail // ->success + success: + ... + */ + let intercept = builder.makeAddress() + let success = builder.makeAddress() + + builder.buildSave(success) + builder.buildSave(intercept) + try emitNode(&list) + builder.buildClearThrough(intercept) + builder.buildClear() + builder.buildFail() + + builder.label(intercept) + builder.buildFail() + + builder.label(success) + } + + mutating func emitLookaround( + _ kind: (forwards: Bool, positive: Bool), + _ list: inout ArraySlice + ) throws { + guard kind.forwards else { + throw Unsupported("backwards assertions") + } + if kind.positive { + try emitPositiveLookahead(&list) + } else { + try emitNegativeLookahead(&list) + } + } + + mutating func emitAtomicNoncapturingGroup( + _ list: inout ArraySlice + ) throws { + /* + save(continuingAt: success) + save(restoringAt: intercept) + // failure restores at intercept + clearThrough(intercept) // remove intercept and any leftovers from + fail(preservingCaptures: true) // ->success + intercept: + clearSavePoint // remove success + fail // propagate failure + success: + ... 
+ */ + + let intercept = builder.makeAddress() + let success = builder.makeAddress() + + builder.buildSaveAddress(success) + builder.buildSave(intercept) + try emitNode(&list) + builder.buildClearThrough(intercept) + builder.buildFail(preservingCaptures: true) // Atomic group succeeds here + + builder.label(intercept) + builder.buildClear() + builder.buildFail() + + builder.label(success) + } + + mutating func emitNoncapturingGroup( + _ kind: AST.Group.Kind, + _ list: inout ArraySlice + ) throws { + assert(!kind.isCapturing) + + options.beginScope() + defer { options.endScope() } + + if let lookaround = kind.lookaroundKind { + try emitLookaround(lookaround, &list) + return + } + + switch kind { + case .lookahead, .negativeLookahead, + .lookbehind, .negativeLookbehind: + throw Unreachable("TODO: reason") + + case .capture, .namedCapture, .balancedCapture: + throw Unreachable("These should produce a capture node") + + case .changeMatchingOptions(let optionSequence): + if !hasEmittedFirstMatchableAtom { + builder.initialOptions.apply(optionSequence) + } + options.apply(optionSequence) + try emitNode(&list) + + case .atomicNonCapturing: + try emitAtomicNoncapturingGroup(&list) + + default: + // FIXME: Other kinds... + try emitNode(&list) + } + } + + mutating func emitQuantification( + _ amount: AST.Quantification.Amount, + _ kind: DSLTree.QuantificationKind, + _ list: inout ArraySlice + ) throws { + let updatedKind: AST.Quantification.Kind + switch kind { + case .explicit(let kind): + updatedKind = kind.ast + case .syntax(let kind): + updatedKind = kind.ast.applying(options) + case .default: + updatedKind = options.defaultQuantificationKind + } + + let (low, high) = amount.bounds + guard let low = low else { + throw Unreachable("Must have a lower bound") + } + switch (low, high) { + case (_, 0): + try skipNode(&list) + return + case let (n, m?) where n > m: + // TODO: Should error out earlier, maybe DSL and parser + // has validation logic? 
+ return + + case let (n, m) where m == nil || n <= m!: + // Ok + break + default: + throw Unreachable("TODO: reason") + } + + // Compiler and/or parser should enforce these invariants + // before we are called + assert(high != 0) + assert((0...(high ?? Int.max)).contains(low)) + + let maxExtraTrips: Int? + if let h = high { + maxExtraTrips = h - low + } else { + maxExtraTrips = nil + } + let minTrips = low + assert((maxExtraTrips ?? 1) >= 0) + + var tmp = list + if tryEmitFastQuant(&tmp, updatedKind, minTrips, maxExtraTrips) { + list = tmp + return + } + + // The below is a general algorithm for bounded and unbounded + // quantification. It can be specialized when the min + // is 0 or 1, or when extra trips is 1 or unbounded. + // + // Stuff inside `<` and `>` are decided at compile time, + // while run-time values stored in registers start with a `%` + _ = """ + min-trip-count control block: + if %minTrips is zero: + goto exit-policy control block + else: + decrement %minTrips and fallthrough + + loop-body: + : + mov currentPosition %pos + evaluate the subexpression + : + if %pos is currentPosition: + goto exit + goto min-trip-count control block + + exit-policy control block: + if %maxExtraTrips is zero: + goto exit + else: + decrement %maxExtraTrips and fallthrough + + : + save exit and goto loop-body + : + ratchet and goto loop + : + save loop-body and fallthrough (i.e. goto exit) + + exit + ... the rest of the program ... + """ + + // Specialization based on `minTrips` for 0 or 1: + _ = """ + min-trip-count control block: + : + goto exit-policy + : + /* fallthrough */ + + loop-body: + evaluate the subexpression + + /* fallthrough */ + """ + + // Specialization based on `maxExtraTrips` for 0 or unbounded + _ = """ + exit-policy control block: + : + goto exit + : + /* fallthrough */ + """ + + /* + NOTE: These specializations don't emit the optimal + code layout (e.g. 
fallthrough vs goto), but that's better + done later (not prematurely) and certainly better + done by an optimizing compiler. + + NOTE: We're intentionally emitting essentially the same + algorithm for all quantifications for now, for better + testing and surfacing difficult bugs. We can specialize + for other things, like `.*`, later. + + When it comes time for optimizing, we can also look into + quantification instructions (e.g. reduce save-point traffic) + */ + + let minTripsControl = builder.makeAddress() + let loopBody = builder.makeAddress() + let exitPolicy = builder.makeAddress() + let exit = builder.makeAddress() + + // We'll need registers if we're (non-trivially) bounded + let minTripsReg: IntRegister? + if minTrips > 1 { + minTripsReg = builder.makeIntRegister( + initialValue: minTrips) + } else { + minTripsReg = nil + } + + let maxExtraTripsReg: IntRegister? + if (maxExtraTrips ?? 0) > 0 { + maxExtraTripsReg = builder.makeIntRegister( + initialValue: maxExtraTrips!) + } else { + maxExtraTripsReg = nil + } + + // Set up a dummy save point for possessive to update + if updatedKind == .possessive { + builder.pushEmptySavePoint() + } + + // min-trip-count: + // condBranch(to: exitPolicy, ifZeroElseDecrement: %min) + builder.label(minTripsControl) + switch minTrips { + case 0: builder.buildBranch(to: exitPolicy) + case 1: break + default: + assert(minTripsReg != nil, "logic inconsistency") + builder.buildCondBranch( + to: exitPolicy, ifZeroElseDecrement: minTripsReg!) + } + + // FIXME: Possessive needs a "dummy" save point to ratchet + + // loop: + // + // branch min-trip-count + builder.label(loopBody) + + // if we aren't sure if the child node will have forward progress and + // we have an unbounded quantification + let startPosition: PositionRegister? + // FIXME: forward progress check?! 
+ let emitPositionChecking = + (!optimizationsEnabled || (list.first?.guaranteesForwardProgress != true)) && + maxExtraTrips == nil + + if emitPositionChecking { + startPosition = builder.makePositionRegister() + builder.buildMoveCurrentPosition(into: startPosition!) + } else { + startPosition = nil + } + try emitNode(&list) + if emitPositionChecking { + // in all quantifier cases, no matter what minTrips or maxExtraTrips is, + // if we have a successful non-advancing match, branch to exit because it + // can match an arbitrary number of times + builder.buildCondBranch(to: exit, ifSamePositionAs: startPosition!) + } + + if minTrips <= 1 { + // fallthrough + } else { + builder.buildBranch(to: minTripsControl) + } + + // exit-policy: + // condBranch(to: exit, ifZeroElseDecrement: %maxExtraTrips) + // + // + // , + _ kind: AST.Quantification.Kind, + _ minTrips: Int, + _ maxExtraTrips: Int? + ) -> Bool { + let isScalarSemantics = options.semanticLevel == .unicodeScalar + guard optimizationsEnabled + && minTrips <= QuantifyPayload.maxStorableTrips + && maxExtraTrips ?? 
0 <= QuantifyPayload.maxStorableTrips + && kind != .reluctant else { + return false + } + guard let child = list.popFirst() else { return false } + + switch child { + case .customCharacterClass(let ccc): + // ascii only custom character class + guard let bitset = ccc.asAsciiBitset(options) else { + return false + } + builder.buildQuantify(bitset: bitset, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + + case .atom(let atom): + switch atom { + case .char(let c): + if options.isCaseInsensitive && c.isCased { + // Cased character with case-insensitive matching; match only as an ASCII bitset + guard let bitset = DSLTree.CustomCharacterClass(members: [.atom(atom)]).asAsciiBitset(options) else { + return false + } + builder.buildQuantify(bitset: bitset, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + } else { + // Uncased character OR case-sensitive matching; match as a single scalar ascii value character + guard let val = c._singleScalarAsciiValue else { + return false + } + builder.buildQuantify(asciiChar: val, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + } + + case .any: + builder.buildQuantifyAny( + matchesNewlines: true, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + case .anyNonNewline: + builder.buildQuantifyAny( + matchesNewlines: false, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + case .dot: + builder.buildQuantifyAny( + matchesNewlines: options.dotMatchesNewline, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + + case .characterClass(let cc): + // Custom character class that consumes a single grapheme + let model = cc.asRuntimeModel(options) + builder.buildQuantify( + model: model, + kind, + minTrips, + maxExtraTrips, + isScalarSemantics: isScalarSemantics) + default: + return false + } + case .limitCaptureNesting(let node): + if tryEmitFastQuant(&list, kind, minTrips, maxExtraTrips) { + return true + } else { + return 
false + } + case .nonCapturingGroup(let groupKind, let node): + // .nonCapture nonCapturingGroups are ignored during compilation + guard groupKind.ast == .nonCapture else { + return false + } + if tryEmitFastQuant(&list, kind, minTrips, maxExtraTrips) { + return true + } else { + return false + } + default: + return false + } + return true + } + + mutating func emitConcatenation( + _ list: inout ArraySlice, + componentCount: Int + ) throws { + // Unlike the tree-based bytecode generator, in a DSLList concatenations + // have already been flattened. + for _ in 0..) throws -> ValueRegister? { + guard let node = list.popFirst() else { return nil } + switch node { + + case let .orderedChoice(children): + let n = children.count + try emitAlternation(&list, alternationCount: n) + + case let .concatenation(children): + let n = children.count + try emitConcatenation(&list, componentCount: n) + + case let .capture(name, refId, _, transform): + options.beginScope() + defer { options.endScope() } + + let cap = builder.makeCapture(id: refId, name: name) + builder.buildBeginCapture(cap) + let value = try emitNode(&list) + builder.buildEndCapture(cap) + // If the child node produced a custom capture value, e.g. the result of + // a matcher, this should override the captured substring. + if let value { + builder.buildMove(value, into: cap) + } + // If there's a capture transform, apply it now. + if let transform = transform { + let fn = builder.makeTransformFunction { input, cap in + // If it's a substring capture with no custom value, apply the + // transform directly to the substring to avoid existential traffic. + // + // FIXME: separate out this code path. 
This is fragile, + // slow, and these are clearly different constructs + if let range = cap.range, cap.value == nil { + return try transform(input[range]) + } + + let value = constructExistentialOutputComponent( + from: input, + component: cap.deconstructed, + optionalCount: 0) + return try transform(value) + } + builder.buildTransformCapture(cap, fn) + } + + case let .nonCapturingGroup(kind, _): + try emitNoncapturingGroup(kind.ast, &list) + + case let .ignoreCapturesInTypedOutput(_): + try emitNode(&list) + + case let .limitCaptureNesting(_): + return try emitNode(&list) + + case .conditional: + throw Unsupported("Conditionals") + + case let .quantification(amt, kind, _): + try emitQuantification(amt.ast, kind, &list) + + case let .customCharacterClass(ccc): + if ccc.containsDot { + if !ccc.isInverted { + try emitDot() + } else { + throw Unsupported("Inverted any") + } + } else { + try emitCustomCharacterClass(ccc) + } + + case let .atom(a): + try emitAtom(a) + + case let .quotedLiteral(s): + emitQuotedLiteral(s) + + case .absentFunction: + throw Unsupported("absent function") + case .consumer: + throw Unsupported("consumer") + + case let .matcher(_, f): + return emitMatcher(f) + + case .characterPredicate: + throw Unsupported("character predicates") + + case .trivia, .empty: + return nil + } + return nil + } +} + +// MARK: Skip node + +extension Compiler.ByteCodeGen { + mutating func skipNode(_ list: inout ArraySlice) throws { + guard let node = list.popFirst() else { return } + switch node { + case let .orderedChoice(children): + let n = children.count + for _ in 0.. MEProgram { + // TODO: Handle global options + let dslList = DSLList(tree: tree) + for (n, el) in dslList.nodes.enumerated() { + print("\(n): \(el)") + } + var codegen = ByteCodeGen( + options: options, + compileOptions: + compileOptions, + captureList: tree.captureList) + return try codegen.emitRoot(dslList) + } } /// Hashable wrapper for `Any.Type`. 
diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index c230e761..fbb18955 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -20,21 +20,6 @@ extension AST { extension AST.Node { /// Converts an AST node to a `convertedRegexLiteral` node. var dslTreeNode: DSLTree.Node { -// func wrap(_ node: DSLTree.Node) -> DSLTree.Node { -// switch node { -// case .convertedRegexLiteral(let child, _): -// // FIXME: DSL can have one item concats -//// assertionFailure("Double wrapping?") -// return child -// default: -// break -// } -// // TODO: Should we do this for the -// // single-concatenation child too, or should? -// // we wrap _that_? -// return node -// } - // Convert the top-level node without wrapping func convert() throws -> DSLTree.Node { switch self { @@ -105,9 +90,7 @@ extension AST.Node { } } - // FIXME: make total function again let converted = try! convert() -// return wrap(converted) return converted } } diff --git a/Sources/_StringProcessing/Regex/DSLList.swift b/Sources/_StringProcessing/Regex/DSLList.swift index 9c657d97..2eb73087 100644 --- a/Sources/_StringProcessing/Regex/DSLList.swift +++ b/Sources/_StringProcessing/Regex/DSLList.swift @@ -1,9 +1,13 @@ +//===----------------------------------------------------------------------===// // -// DSLList.swift -// swift-experimental-string-processing +// This source file is part of the Swift.org open source project // -// Created by Nate Cook on 9/25/25. +// Copyright (c) 2025 Apple Inc. 
and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception // +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// struct DSLList { var nodes: [DSLTree.Node] @@ -16,8 +20,8 @@ struct DSLList { self.nodes = nodes } - init(root: DSLTree.Node) { - self.nodes = Array(root) + init(tree: DSLTree) { + self.nodes = Array(tree.depthFirst) } } @@ -78,8 +82,8 @@ extension DSLTree.Node { } } -extension DSLTree.Node: Sequence { - struct Iterator: Sequence, IteratorProtocol { +extension DSLTree { + struct DepthFirst: Sequence, IteratorProtocol { typealias Element = DSLTree.Node private var stack: [Frame] private let getChildren: (Element) -> [Element] @@ -104,11 +108,24 @@ extension DSLTree.Node: Sequence { for child in top.children.reversed() { stack.append(Frame(node: child, children: getChildren(child))) } - return top.node + + // Since we coalesce the children before adding them to the stack, + // we need an exact matching number of children in the list's + // concatenation node, so that it can provide the correct component + // count. This will go away/change when .concatenation only stores + // a count. 
+ return switch top.node { + case .concatenation: + .concatenation(top.node.coalescedChildren) + default: + top.node + } } } - func makeIterator() -> Iterator { - Iterator(root: self, getChildren: { $0.children }) + var depthFirst: DepthFirst { + DepthFirst(root: root, getChildren: { + $0.coalescedChildren + }) } } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 5d5ad075..03a56397 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -393,16 +393,72 @@ extension DSLTree.Node { switch self { case let .orderedChoice(v): return v - case let .concatenation(v): return v - + case let .concatenation(v): return v + case let .capture(_, _, n, _): return [n] case let .nonCapturingGroup(_, n): return [n] case let .quantification(_, _, n): return [n] case let .ignoreCapturesInTypedOutput(n): return [n] + case let .limitCaptureNesting(n): return [n] + + case let .conditional(_, t, f): return [t,f] + + case .trivia, .empty, .quotedLiteral, + .consumer, .matcher, .characterPredicate, + .customCharacterClass, .atom: + return [] + + case let .absentFunction(abs): + return abs.ast.children.map(\.dslTreeNode) + } + } + + public var coalescedChildren: [DSLTree.Node] { + // Before converting a concatenation in a tree to list form, we need to + // flatten out any nested concatenations, and coalesce any adjacent + // characters and scalars, forming quoted literals of their contents, + // over which we can perform grapheme breaking. 
+ + func flatten(_ node: DSLTree.Node) -> [DSLTree.Node] { + switch node { + case .concatenation(let ch): + return ch.flatMap(flatten) + case .ignoreCapturesInTypedOutput(let n), .limitCaptureNesting(let n): + return flatten(n) + default: + return [node] + } + } - case let .limitCaptureNesting(n): - // This is a transparent wrapper - return n.children + switch self { + case let .orderedChoice(v): return v + case let .concatenation(v): + let children = v + .flatMap(flatten) + .coalescing(with: "", into: DSLTree.Node.quotedLiteral) { str, node in + switch node { + case .atom(let a): + guard let c = a.literalCharacterValue else { return false } + str.append(c) + return true + case .quotedLiteral(let q): + str += q + return true + case .trivia: + // Trivia can be completely ignored if we've already coalesced + // something. + return !str.isEmpty + default: + return false + } + } + return children + + case let .capture(_, _, n, _): return [n] + case let .nonCapturingGroup(_, n): return [n] + case let .quantification(_, _, n): return [n] + case let .ignoreCapturesInTypedOutput(n): return [n] + case let .limitCaptureNesting(n): return [n] case let .conditional(_, t, f): return [t,f] diff --git a/Tests/RegexTests/DSLListTests.swift b/Tests/RegexTests/DSLListTests.swift index feffa6bc..d8acec73 100644 --- a/Tests/RegexTests/DSLListTests.swift +++ b/Tests/RegexTests/DSLListTests.swift @@ -1,21 +1,37 @@ +//===----------------------------------------------------------------------===// // -// DSLListTests.swift -// swift-experimental-string-processing +// This source file is part of the Swift.org open source project // -// Created by Nate Cook on 9/25/25. +// Copyright (c) 2025 Apple Inc. 
and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception // +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// import Testing @testable import _StringProcessing @Suite struct DSLListTests { - @Test(arguments: [(#/abc/#, 4), (#/a(?:b+)c*/#, 7)]) - func simple(regex: Regex, nodeCount: Int) { - let dslList = DSLList(root: regex.root) + @available(macOS 9999, *) + @Test(arguments: [ + (#/a/#, 2), // literal, a + (#/abcd+/#, 5), // literal, concat, abc, quant, d + (#/a(?:b+)c*/#, 8), // literal, concat, a, noncap grp, quant, b, quant, c + ]) + func convertedNodeCount(regex: Regex, nodeCount: Int) { + let dslList = DSLList(tree: regex.program.tree) #expect(dslList.nodes.count == nodeCount) - for (i, node) in dslList.nodes.enumerated() { - print(i, node) - } + } + + @Test(arguments: [#/a|b/#, #/a+b?c/#, #/abc/#, #/a(?:b+)c*/#, #/;[\r\n]/#, #/(?=(?:[1-9]|(?:a|b)))/#]) + func compilationComparison(regex: Regex) throws { + let listCompiler = Compiler(tree: regex.program.tree) + let listProgram = try listCompiler.emitViaList() + let treeCompiler = Compiler(tree: regex.program.tree) + let treeProgram = try treeCompiler.emit() + + #expect(treeProgram.instructions == listProgram.instructions) } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index e20beeaf..ded6c213 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -37,16 +37,34 @@ func _roundTripLiteral( return remadeRegex } +func _validateListCompilation( + _ regex: Regex +) throws -> Bool { + let treeCompiler = Compiler(tree: regex.program.tree) + let treeProgram = try treeCompiler.emit() + let listCompiler = Compiler(tree: regex.program.tree) + let listProgram = try listCompiler.emitViaList() + return treeProgram.instructions == listProgram.instructions +} + func _firstMatch( _ regexStr: String, input: String, 
validateOptimizations: Bool, semanticLevel: RegexSemanticLevel = .graphemeCluster, - syntax: SyntaxOptions = .traditional + syntax: SyntaxOptions = .traditional, + file: StaticString = #file, + line: UInt = #line ) throws -> (String, [String?])? { var regex = try Regex(regexStr, syntax: syntax).matchingSemantics(semanticLevel) let result = try regex.firstMatch(in: input) - + + if try !_validateListCompilation(regex) { + XCTFail( + "List compilation failed for '\(regexStr)'", + file: file, line: line) + } + func validateSubstring(_ substringInput: Substring) throws { // Sometimes the characters we add to a substring merge with existing // string members. This messes up cross-validation, so skip the test. @@ -105,14 +123,18 @@ func _firstMatch( For input '\(input)' Original: '\(regexStr)' _literalPattern: '\(roundTripRegex?._literalPattern ?? "")' - """) + """, + file: file, + line: line) case let (_, rtMatch?): XCTFail(""" Incorrectly matched as '\(rtMatch)' For input '\(input)' Original: '\(regexStr)' _literalPattern: '\(roundTripRegex!._literalPattern!)' - """) + """, + file: file, + line: line) } } @@ -184,7 +206,8 @@ func flatCaptureTest( input: test, validateOptimizations: validateOptimizations, semanticLevel: semanticLevel, - syntax: syntax + syntax: syntax, + file: file, line: line ) else { if expect == nil { continue @@ -303,7 +326,8 @@ func firstMatchTest( input: input, validateOptimizations: validateOptimizations, semanticLevel: semanticLevel, - syntax: syntax)?.0 + syntax: syntax, + file: file, line: line)?.0 if xfail { XCTAssertNotEqual(found, match, file: file, line: line) From 4bd8a5d58aa6494223d442861837f4b40278f70c Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Sat, 4 Oct 2025 12:37:59 -0500 Subject: [PATCH 4/6] ... 
--- Sources/_StringProcessing/Compiler.swift | 3 --- 1 file changed, 3 deletions(-) diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index e5e09ed0..dc7a498d 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -44,9 +44,6 @@ class Compiler { __consuming func emitViaList() throws -> MEProgram { // TODO: Handle global options let dslList = DSLList(tree: tree) - for (n, el) in dslList.nodes.enumerated() { - print("\(n): \(el)") - } var codegen = ByteCodeGen( options: options, compileOptions: From 467e885bade8da5e101afeb38fc12d8c9951a808 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Sat, 4 Oct 2025 15:43:10 -0500 Subject: [PATCH 5/6] Handle `canOnlyMatchAtStart` for DSLList --- .../ByteCodeGen+DSLList.swift | 158 ++++++++++++++++-- Sources/_StringProcessing/Regex/DSLList.swift | 35 ---- 2 files changed, 145 insertions(+), 48 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen+DSLList.swift b/Sources/_StringProcessing/ByteCodeGen+DSLList.swift index 970f6004..c61c37fd 100644 --- a/Sources/_StringProcessing/ByteCodeGen+DSLList.swift +++ b/Sources/_StringProcessing/ByteCodeGen+DSLList.swift @@ -26,14 +26,139 @@ extension Compiler.ByteCodeGen { var list = root.nodes[...] try emitNode(&list) - // FIXME: Restore this canOnlyMatchAtStart - // builder.canOnlyMatchAtStart = root.canOnlyMatchAtStart() + builder.canOnlyMatchAtStart = canOnlyMatchAtStart(in: root) builder.buildAccept() return try builder.assemble() } } fileprivate extension Compiler.ByteCodeGen { + /// Implementation for `canOnlyMatchAtStart`, which maintains the option + /// state. + /// + /// For a given specific node, this method can return one of three values: + /// + /// - `true`: This node is guaranteed to match only at the start of a subject. + /// - `false`: This node can match anywhere in the subject. + /// - `nil`: This node is inconclusive about where it can match. 
+ /// + /// In particular, non-required groups and option-setting groups are + /// inconclusive about where they can match. + private mutating func _canOnlyMatchAtStartImpl( + _ list: inout ArraySlice + ) -> Bool? { + guard let node = list.popFirst() else { return false } + switch node { + // Defining cases + case .atom(.assertion(.startOfSubject)): + return true + case .atom(.assertion(.caretAnchor)): + return !options.anchorsMatchNewlines + + // Changing options doesn't determine `true`/`false`. + case .atom(.changeMatchingOptions(let sequence)): + options.apply(sequence.ast) + return nil + + // Any other atom or consuming node returns `false`. + case .atom, .customCharacterClass, .quotedLiteral: + return false + + // Trivia/empty have no effect. + case .trivia, .empty: + return nil + + // In an alternation, all of its children must match only at start. + case .orderedChoice(let children): + for _ in 0.. Bool { + let currentOptions = options + options = MatchingOptions() + defer { options = currentOptions } + + var list = list.nodes[...] + return _canOnlyMatchAtStartImpl(&list) ?? false + } + mutating func emitAlternationGen( _ elements: inout ArraySlice, alternationCount: Int, @@ -667,41 +792,48 @@ fileprivate extension Compiler.ByteCodeGen { // MARK: Skip node extension Compiler.ByteCodeGen { - mutating func skipNode(_ list: inout ArraySlice) throws { + mutating func skipNode( + _ list: inout ArraySlice, + preservingCaptures: Bool = true + ) throws { guard let node = list.popFirst() else { return } switch node { case let .orderedChoice(children): let n = children.count for _ in 0.. DSLTree.Node? { - guard remainingCount > 0 else { return nil } - guard currentIndex < nodes.count else { - // FIXME: assert? 
- print("ERROR: index out of bounds") - return nil - } - remainingCount -= 1 - var nextIndex = currentIndex - var inc = nodes[currentIndex].directChildren + 1 - while inc > 0 { - nextIndex += 1 - inc += nodes[nextIndex].directChildren - 1 - } - - return nodes[currentIndex] - } - } - - func makeIterator() -> Iterator { - Iterator(nodes: nodes, currentIndex: firstChildIndex, remainingCount: nodes[firstChildIndex].directChildren) - } - } -} - extension DSLTree.Node { var directChildren: Int { switch self { From 409f69accf4856624d2b51fa7b57aecec1979090 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Sat, 4 Oct 2025 15:43:56 -0500 Subject: [PATCH 6/6] Enable DSLList as default This likely involves an increase in compilation time, since the list needs to be generated from the tree before compiling. --- Sources/_StringProcessing/Compiler.swift | 4 ++++ Tests/RegexTests/MatchTests.swift | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index dc7a498d..e2fd2a28 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -32,6 +32,10 @@ class Compiler { } __consuming func emit() throws -> MEProgram { + try emitViaList() + } + + __consuming func emitViaTree() throws -> MEProgram { // TODO: Handle global options var codegen = ByteCodeGen( options: options, diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index ded6c213..e36285ae 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -41,7 +41,7 @@ func _validateListCompilation( _ regex: Regex ) throws -> Bool { let treeCompiler = Compiler(tree: regex.program.tree) - let treeProgram = try treeCompiler.emit() + let treeProgram = try treeCompiler.emitViaTree() let listCompiler = Compiler(tree: regex.program.tree) let listProgram = try listCompiler.emitViaList() return treeProgram.instructions == listProgram.instructions