From 683b9fa021a0879f604fc428fe5a3b57d7663c43 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Tue, 1 Mar 2022 19:39:18 -0800 Subject: [PATCH 01/83] =?UTF-8?q?[stdlib]=20Adjust/fix=20String=E2=80=99s?= =?UTF-8?q?=20indexing=20operations=20to=20deal=20with=20the=20consequence?= =?UTF-8?q?s=20of=20SE-0180?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- stdlib/public/core/StringCharacterView.swift | 66 +++- .../public/core/StringGraphemeBreaking.swift | 219 ++++++++++---- stdlib/public/core/StringIndex.swift | 20 ++ .../public/core/StringUnicodeScalarView.swift | 1 + stdlib/public/core/Substring.swift | 211 +++++++++++-- test/stdlib/StringIndex.swift | 284 +++++++++++++++++- 6 files changed, 699 insertions(+), 102 deletions(-) diff --git a/stdlib/public/core/StringCharacterView.swift b/stdlib/public/core/StringCharacterView.swift index 131ae5d91df48..9443e024e3337 100644 --- a/stdlib/public/core/StringCharacterView.swift +++ b/stdlib/public/core/StringCharacterView.swift @@ -51,6 +51,10 @@ extension String: BidirectionalCollection { public func index(after i: Index) -> Index { _precondition(i < endIndex, "String index is out of bounds") + // FIXME: Unlike `index(before:)`, this function may return incorrect + // results if `i` isn't on a grapheme cluster boundary. (The grapheme + // breaking algorithm assumes we start on a break when we go forward.) + // TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc. let i = _guts.scalarAlign(i) let stride = _characterStride(startingAt: i) @@ -149,12 +153,37 @@ extension String: BidirectionalCollection { /// case, the method returns `nil`. /// /// - Complexity: O(*n*), where *n* is the absolute value of `distance`. - @inlinable @inline(__always) public func index( _ i: Index, offsetBy distance: Int, limitedBy limit: Index ) -> Index? 
{ + // Note: In Swift 5.6 and below, this function used to be inlinable, + // forwarding to `BidirectionalCollection._index(_:offsetBy:limitedBy:)`. + // TODO: known-ASCII and single-scalar-grapheme fast path, etc. - return _index(i, offsetBy: distance, limitedBy: limit) + + // Per SE-0180, `i` and `limit` are allowed to fall in between grapheme + // breaks, in which case this function must still terminate without trapping + // and return a result that makes sense. + + // Note: `limit` is intentionally not scalar aligned to ensure our behavior + // exactly matches the documentation above. + + let start = _guts.scalarAlign(i) + var i = start + if distance >= 0 { + for _ in stride(from: 0, to: distance, by: 1) { + guard limit < start || i < limit else { return nil } + formIndex(after: &i) + } + guard limit < start || i <= limit else { return nil } + } else { + for _ in stride(from: 0, to: distance, by: -1) { + guard limit > start || i > limit else { return nil } + formIndex(before: &i) + } + guard limit > start || i >= limit else { return nil } + } + return i } /// Returns the distance between two indices. @@ -166,10 +195,39 @@ extension String: BidirectionalCollection { /// - Returns: The distance between `start` and `end`. /// /// - Complexity: O(*n*), where *n* is the resulting distance. - @inlinable @inline(__always) public func distance(from start: Index, to end: Index) -> Int { + // Note: In Swift 5.6 and below, this function used to be inlinable, + // forwarding to `BidirectionalCollection._distance(from:to:)`. + // TODO: known-ASCII and single-scalar-grapheme fast path, etc. - return _distance(from: _guts.scalarAlign(start), to: _guts.scalarAlign(end)) + let start = _guts.scalarAlign(start) + let end = _guts.scalarAlign(end) + + // Per SE-0180, `start` and `end` are allowed to fall in between grapheme + // breaks, in which case this function must still terminate without trapping + // and return a result that makes sense. 
+ + // FIXME: Due to the `index(after:)` problem above, this function doesn't + // always return consistent results when the given indices fall between + // grapheme breaks -- swapping `start` and `end` may change the magnitude of + // the result. + + var i = start + var count = 0 + + if i < end { + while i < end { // Note `<` instead of `==` + count += 1 + formIndex(after: &i) + } + } + else if i > end { + while i > end { // Note `>` instead of `==` + count -= 1 + formIndex(before: &i) + } + } + return count } /// Accesses the character at the given position. diff --git a/stdlib/public/core/StringGraphemeBreaking.swift b/stdlib/public/core/StringGraphemeBreaking.swift index 2abfa59866597..f25667bfb2994 100644 --- a/stdlib/public/core/StringGraphemeBreaking.swift +++ b/stdlib/public/core/StringGraphemeBreaking.swift @@ -106,9 +106,32 @@ extension _StringGuts { } let nextIdx = withFastUTF8 { utf8 in - nextBoundary(startingAt: i) { - let (scalar, len) = _decodeScalar(utf8, startingAt: $0) - return (scalar, $0 &+ len) + nextBoundary(startingAt: i, startIndex: 0) { j in + guard j < utf8.count else { return nil } + let (scalar, len) = _decodeScalar(utf8, startingAt: j) + return (scalar, j &+ len) + } + } + + return nextIdx &- i + } + + @_effects(releasenone) + internal func _opaqueCharacterStride( + startingAt i: Int, + in bounds: Range + ) -> Int { + _internalInvariant(bounds.contains(i)) + if _slowPath(isForeign) { + return _foreignOpaqueCharacterStride(startingAt: i, in: bounds) + } + + let nextIdx = withFastUTF8 { utf8 in + nextBoundary(startingAt: i, startIndex: bounds.lowerBound) { j in + _internalInvariant(j >= bounds.lowerBound) + guard j < bounds.upperBound else { return nil } + let (scalar, len) = _decodeScalar(utf8, startingAt: j) + return (scalar, j &+ len) } } @@ -123,9 +146,32 @@ extension _StringGuts { } let previousIdx = withFastUTF8 { utf8 in - previousBoundary(endingAt: i) { - let (scalar, len) = _decodeScalar(utf8, endingAt: $0) - return (scalar, $0 
&- len) + previousBoundary(endingAt: i, startIndex: 0) { j in + guard j > 0 else { return nil } + let (scalar, len) = _decodeScalar(utf8, endingAt: j) + return (scalar, j &- len) + } + } + + return i &- previousIdx + } + + @_effects(releasenone) + internal func _opaqueCharacterStride( + endingAt i: Int, + in bounds: Range + ) -> Int { + _internalInvariant(i > bounds.lowerBound && i <= bounds.upperBound) + if _slowPath(isForeign) { + return _foreignOpaqueCharacterStride(endingAt: i, in: bounds) + } + + let previousIdx = withFastUTF8 { utf8 in + previousBoundary(endingAt: i, startIndex: bounds.lowerBound) { j in + _internalInvariant(j <= bounds.upperBound) + guard j > bounds.lowerBound else { return nil } + let (scalar, len) = _decodeScalar(utf8, endingAt: j) + return (scalar, j &- len) } } @@ -138,9 +184,39 @@ extension _StringGuts { #if _runtime(_ObjC) _internalInvariant(isForeign) - let nextIdx = nextBoundary(startingAt: i) { + let nextIdx = nextBoundary(startingAt: i, startIndex: 0) { j in + guard j < count else { return nil } let scalars = String.UnicodeScalarView(self) - let idx = String.Index(_encodedOffset: $0) + let idx = String.Index(_encodedOffset: j) + + let scalar = scalars[idx] + let nextIdx = scalars.index(after: idx) + + return (scalar, nextIdx._encodedOffset) + } + + return nextIdx &- i +#else + fatalError("No foreign strings on Linux in this version of Swift") +#endif + } + + @inline(never) + @_effects(releasenone) + private func _foreignOpaqueCharacterStride( + startingAt i: Int, + in bounds: Range + ) -> Int { +#if _runtime(_ObjC) + _internalInvariant(isForeign) + _internalInvariant(bounds.contains(i)) + + let nextIdx = nextBoundary( + startingAt: i, startIndex: bounds.lowerBound + ) { j in + guard j < bounds.upperBound else { return nil } + let scalars = String.UnicodeScalarView(self) + let idx = String.Index(_encodedOffset: j) let scalar = scalars[idx] let nextIdx = scalars.index(after: idx) @@ -160,9 +236,10 @@ extension _StringGuts { #if 
_runtime(_ObjC) _internalInvariant(isForeign) - let previousIdx = previousBoundary(endingAt: i) { + let previousIdx = previousBoundary(endingAt: i, startIndex: 0) { j in + guard j > 0 else { return nil } let scalars = String.UnicodeScalarView(self) - let idx = String.Index(_encodedOffset: $0) + let idx = String.Index(_encodedOffset: j) let previousIdx = scalars.index(before: idx) let scalar = scalars[previousIdx] @@ -173,6 +250,35 @@ extension _StringGuts { return i &- previousIdx #else fatalError("No foreign strings on Linux in this version of Swift") +#endif + } + + @inline(never) + @_effects(releasenone) + private func _foreignOpaqueCharacterStride( + endingAt i: Int, + in bounds: Range + ) -> Int { +#if _runtime(_ObjC) + _internalInvariant(isForeign) + _internalInvariant(i > bounds.lowerBound && i <= bounds.upperBound) + + let previousIdx = previousBoundary( + endingAt: i, startIndex: bounds.lowerBound + ) { j in + guard j > bounds.lowerBound else { return nil } + let scalars = String.UnicodeScalarView(self) + let idx = String.Index(_encodedOffset: j) + + let previousIdx = scalars.index(before: idx) + + let scalar = scalars[previousIdx] + return (scalar, previousIdx._encodedOffset) + } + + return i &- previousIdx +#else + fatalError("No foreign strings on Linux in this version of Swift") #endif } } @@ -239,63 +345,54 @@ internal struct _GraphemeBreakingState { } extension _StringGuts { - // Returns the stride of the next grapheme cluster at the previous boundary - // offset. + // Returns the end offset of the grapheme cluster starting at offset `index`. internal func nextBoundary( startingAt index: Int, - nextScalar: (Int) -> (Unicode.Scalar, end: Int) + startIndex: Int, + nextScalar: (Int) -> (Unicode.Scalar, end: Int)? ) -> Int { - _internalInvariant(index != endIndex._encodedOffset) + _internalInvariant(index < endIndex._encodedOffset) var state = _GraphemeBreakingState() - var index = index + var (scalar, index) = nextScalar(index)! 
while true { - let (scalar1, nextIdx) = nextScalar(index) - index = nextIdx - - guard index != endIndex._encodedOffset else { - break - } - - let (scalar2, _) = nextScalar(index) - - if shouldBreak(scalar1, between: scalar2, &state, index) { + guard let (scalar2, nextIndex) = nextScalar(index) else { break } + if shouldBreak( + scalar, between: scalar2, &state, index, startIndex: startIndex + ) { break } + index = nextIndex + scalar = scalar2 } return index } - // Returns the stride of the previous grapheme cluster at the current boundary - // offset. + // Returns the start offset of the grapheme cluster ending at offset `index`. internal func previousBoundary( endingAt index: Int, - previousScalar: (Int) -> (Unicode.Scalar, start: Int) + startIndex: Int, + previousScalar: (Int) -> (Unicode.Scalar, start: Int)? ) -> Int { - _internalInvariant(index != startIndex._encodedOffset) + _internalInvariant(index > startIndex) var state = _GraphemeBreakingState() - var index = index + var (scalar2, index) = previousScalar(index)! 
while true { - let (scalar2, previousIdx) = previousScalar(index) - index = previousIdx - - guard index != startIndex._encodedOffset else { - break - } - - let (scalar1, _) = previousScalar(index) - + guard let (scalar1, previousIndex) = previousScalar(index) else { break } if shouldBreak( scalar1, between: scalar2, &state, index, + startIndex: startIndex, isBackwards: true ) { break } + index = previousIndex + scalar2 = scalar1 } return index @@ -313,6 +410,7 @@ extension _StringGuts { between scalar2: Unicode.Scalar, _ state: inout _GraphemeBreakingState, _ index: Int, + startIndex: Int = 0, isBackwards: Bool = false ) -> Bool { // GB3 @@ -421,7 +519,7 @@ extension _StringGuts { // GB11 case (.zwj, .extendedPictographic): if isBackwards { - return !checkIfInEmojiSequence(index) + return !checkIfInEmojiSequence(index, startIndex: startIndex) } return !state.isInEmojiSequence @@ -429,7 +527,7 @@ extension _StringGuts { // GB12 & GB13 case (.regionalIndicator, .regionalIndicator): if isBackwards { - return countRIs(index) + return countRIs(index, startIndex: startIndex) } defer { @@ -457,10 +555,10 @@ extension _StringGuts { return true } - return !checkIfInIndicSequence(index) + return !checkIfInIndicSequence(index, startIndex: startIndex) case (.zwj, true): - return !checkIfInIndicSequence(index) + return !checkIfInIndicSequence(index, startIndex: startIndex) default: return true @@ -514,17 +612,14 @@ extension _StringGuts { // | = We found our starting .extendedPictographic letting us // know that we are in an emoji sequence so our initial // break question is answered as NO. 
- internal func checkIfInEmojiSequence(_ index: Int) -> Bool { - var emojiIdx = String.Index(_encodedOffset: index) - - guard emojiIdx != startIndex else { - return false - } + internal func checkIfInEmojiSequence(_ index: Int, startIndex: Int) -> Bool { + guard index > startIndex else { return false } + var emojiIdx = String.Index(_encodedOffset: index) let scalars = String.UnicodeScalarView(self) scalars.formIndex(before: &emojiIdx) - while emojiIdx != startIndex { + while emojiIdx._encodedOffset > startIndex { scalars.formIndex(before: &emojiIdx) let scalar = scalars[emojiIdx] @@ -571,13 +666,10 @@ extension _StringGuts { // ^ // | = Is a linking consonant and we've seen a virama, so this is a // legitimate indic sequence, so do NOT break the initial question. - internal func checkIfInIndicSequence(_ index: Int) -> Bool { - var indicIdx = String.Index(_encodedOffset: index) - - guard indicIdx != startIndex else { - return false - } + internal func checkIfInIndicSequence(_ index: Int, startIndex: Int) -> Bool { + guard index > startIndex else { return false } + var indicIdx = String.Index(_encodedOffset: index) let scalars = String.UnicodeScalarView(self) scalars.formIndex(before: &indicIdx) @@ -590,7 +682,7 @@ extension _StringGuts { hasSeenVirama = true } - while indicIdx != startIndex { + while indicIdx._encodedOffset > startIndex { scalars.formIndex(before: &indicIdx) let scalar = scalars[indicIdx] @@ -657,21 +749,16 @@ extension _StringGuts { // ^ // | = Not a .regionalIndicator. riCount = 1 which is odd, so break // the last two .regionalIndicators. 
- internal func countRIs( - _ index: Int - ) -> Bool { - var riIdx = String.Index(_encodedOffset: index) - - guard riIdx != startIndex else { - return false - } + internal func countRIs(_ index: Int, startIndex: Int) -> Bool { + guard index > startIndex else { return false } + var riIdx = String.Index(_encodedOffset: index) var riCount = 0 let scalars = String.UnicodeScalarView(self) scalars.formIndex(before: &riIdx) - while riIdx != startIndex { + while riIdx._encodedOffset > startIndex { scalars.formIndex(before: &riIdx) let scalar = scalars[riIdx] diff --git a/stdlib/public/core/StringIndex.swift b/stdlib/public/core/StringIndex.swift index 74be5f4b37983..0b518b45e4913 100644 --- a/stdlib/public/core/StringIndex.swift +++ b/stdlib/public/core/StringIndex.swift @@ -297,3 +297,23 @@ extension String.Index: Hashable { hasher.combine(orderingValue) } } + +// FIXME: This is for debugging only; remove before merging. +extension String.Index: CustomStringConvertible { + @_alwaysEmitIntoClient + public var description: String { + var d = "Index(" + d += "offset: \(_encodedOffset)" + if transcodedOffset != 0 { + d += "+\(transcodedOffset)" + } + if let stride = characterStride { + d += ", stride: \(stride)" + } + if _isScalarAligned { + d += ", scalarAligned" + } + d += ")" + return d + } +} diff --git a/stdlib/public/core/StringUnicodeScalarView.swift b/stdlib/public/core/StringUnicodeScalarView.swift index f5b901916cb00..b98b180d34188 100644 --- a/stdlib/public/core/StringUnicodeScalarView.swift +++ b/stdlib/public/core/StringUnicodeScalarView.swift @@ -414,6 +414,7 @@ extension String.UnicodeScalarView { @available(swift, introduced: 4) public subscript(r: Range) -> String.UnicodeScalarView.SubSequence { + _failEarlyRangeCheck(r, bounds: startIndex.. 
Index { - _precondition(i < endIndex, "Cannot increment beyond endIndex") - _precondition(i >= startIndex, "Cannot increment an invalid index") - return _slice.index(after: i) + // Note: in Swift 5.6 and below, this method used to be inlinable, + // forwarding to `_slice.base.index(after:)`. Unfortunately, that approach + // isn't compatible with SE-0180, as it allows Unicode scalars outside the + // substring to affect grapheme breaking results within the substring. This + // leads to Collection conformance issues when the `Substring`'s bounds do + // not fall on grapheme boundaries in `base`. + + // FIXME: Unlike `index(before:)`, this function may return incorrect + // results if `i` isn't on a grapheme cluster boundary. (The grapheme + // breaking algorithm assumes we start on a break when we go forward.) + + let i = _slice.base._guts.scalarAlign(i) + + _precondition(i < endIndex && i >= startIndex, + "Substring index is out of bounds") + + let stride = _characterStride(startingAt: i) + let nextOffset = i._encodedOffset &+ stride + + let nextIndex = Index(_encodedOffset: nextOffset)._scalarAligned + guard _knownToStartOnGraphemeBreak else { + // Don't cache character strides in indices of exotic substrings whose + // startIndex isn't aligned on a grapheme cluster boundary. (Their + // grapheme breaks may not match with those in `base`.) + return nextIndex + } + guard nextIndex < endIndex || _knownToEndOnGraphemeBreak else { + // Don't cache the stride if we end on a partial grapheme cluster. 
+ return nextIndex + } + let nextStride = _characterStride(startingAt: nextIndex) + return Index( + encodedOffset: nextOffset, characterStride: nextStride)._scalarAligned } - @inlinable @inline(__always) public func index(before i: Index) -> Index { - _precondition(i <= endIndex, "Cannot decrement an invalid index") - _precondition(i > startIndex, "Cannot decrement beyond startIndex") - return _slice.index(before: i) + // Note: in Swift 5.6 and below, this method used to be inlinable, + // forwarding to `_slice.base.index(before:)`. Unfortunately, that approach + // isn't compatible with SE-0180, as it allows Unicode scalars outside the + // substring to affect grapheme breaking results within the substring. This + // leads to Collection conformance issues when the `Substring`'s bounds do + // not fall on grapheme boundaries in `base`. + + _precondition(i <= endIndex && i > startIndex, + "Substring index is out of bounds") + + // TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc. + let i = _slice.base._guts.scalarAlign(i) + let stride = _characterStride(endingAt: i) + let priorOffset = i._encodedOffset &- stride + _internalInvariant(priorOffset >= startIndex._encodedOffset) + + guard _knownToStartOnGraphemeBreak else { + // Don't cache character strides in indices of exotic substrings whose + // startIndex isn't aligned on a grapheme cluster boundary. (Their + // grapheme breaks may not match with those in `base`.) + return Index(_encodedOffset: priorOffset)._scalarAligned + } + + return Index( + encodedOffset: priorOffset, characterStride: stride)._scalarAligned } - @inlinable @inline(__always) - public func index(_ i: Index, offsetBy n: Int) -> Index { - let result = _slice.index(i, offsetBy: n) - _precondition( - (_slice._startIndex ... 
_slice.endIndex).contains(result), - "Operation results in an invalid index") - return result + public func index(_ i: Index, offsetBy distance: Int) -> Index { + // Note: in Swift 5.6 and below, this method used to be inlinable, + // forwarding to `_slice.base.index(_:offsetBy:)`. Unfortunately, that + // approach isn't compatible with SE-0180, as it allows Unicode scalars + // outside the substring to affect grapheme breaking results within the + // substring. This leads to Collection conformance issues when the + // `Substring`'s bounds do not fall on grapheme boundaries in `base`. + return _index(i, offsetBy: distance) } - @inlinable @inline(__always) public func index( - _ i: Index, offsetBy n: Int, limitedBy limit: Index + _ i: Index, offsetBy distance: Int, limitedBy limit: Index ) -> Index? { - let result = _slice.index(i, offsetBy: n, limitedBy: limit) - _precondition(result.map { - (_slice._startIndex ... _slice.endIndex).contains($0) - } ?? true, - "Operation results in an invalid index") - return result + // Note: in Swift 5.6 and below, this method used to be inlinable, + // forwarding to `_slice.base.index(_:offsetBy:limitedBy:)`. Unfortunately, + // that approach isn't compatible with SE-0180, as it allows Unicode scalars + // outside the substring to affect grapheme breaking results within the + // substring. This leads to Collection conformance issues when the + // `Substring`'s bounds do not fall on grapheme boundaries in `base`. + + // Per SE-0180, `i` and `limit` are allowed to fall in between grapheme + // breaks, in which case this function must still terminate without trapping + // and return a result that makes sense. + + // Note: `limit` is intentionally not scalar aligned to ensure our behavior + // exactly matches the documentation. 
+ + let start = _slice.base._guts.scalarAlign(i) + var i = start + if distance >= 0 { + for _ in stride(from: 0, to: distance, by: 1) { + guard limit < start || i < limit else { return nil } + formIndex(after: &i) + } + guard limit < start || i <= limit else { return nil } + } else { + for _ in stride(from: 0, to: distance, by: -1) { + guard limit > start || i > limit else { return nil } + formIndex(before: &i) + } + guard limit > start || i >= limit else { return nil } + } + return i } - @inlinable @inline(__always) public func distance(from start: Index, to end: Index) -> Int { - return _slice.distance(from: start, to: end) + // Note: in Swift 5.6 and below, this method used to be inlinable, + // forwarding to `_slice.base.distance(from:to:)`. Unfortunately, that + // approach isn't compatible with SE-0180, as it allows Unicode scalars + // outside the substring to affect grapheme breaking results within the + // substring. This leads to Collection conformance issues when the + // `Substring`'s bounds do not fall on grapheme boundaries in `base`. + + // TODO: known-ASCII and single-scalar-grapheme fast path, etc. + + // Per SE-0180, `start` and `end` are allowed to fall in between grapheme + // breaks, in which case this function must still terminate without trapping + // and return a result that makes sense. 
+ var i = _slice.base._guts.scalarAlign(start) + let end = _slice.base._guts.scalarAlign(end) + var count = 0 + + if i < end { + while i < end { // Note `<` instead of `==` + count += 1 + formIndex(after: &i) + } + } + else if i > end { + while i > end { // Note `>` instead of `==` + count -= 1 + formIndex(before: &i) + } + } + return count } public subscript(i: Index) -> Character { @@ -208,13 +309,20 @@ extension Substring: StringProtocol { _ bounds: Range, with newElements: C ) where C: Collection, C.Iterator.Element == Iterator.Element { - _slice.replaceSubrange(bounds, with: newElements) + _replaceSubrange(bounds, with: newElements) } public mutating func replaceSubrange( _ bounds: Range, with newElements: Substring ) { - replaceSubrange(bounds, with: newElements._slice) + _replaceSubrange(bounds, with: newElements) + } + + @inline(__always) + internal mutating func _replaceSubrange( + _ bounds: Range, with newElements: C + ) where C.Element == Element { + _slice.replaceSubrange(bounds, with: newElements) } /// Creates a string from the given Unicode code units in the specified @@ -307,6 +415,50 @@ extension Substring: StringProtocol { } } +extension Substring { + internal var _knownToStartOnGraphemeBreak: Bool { + startIndex._encodedOffset == 0 || startIndex.characterStride != nil + } + + internal var _knownToEndOnGraphemeBreak: Bool { + endIndex == _slice.base.endIndex || endIndex.characterStride != nil + } + + internal var _encodedOffsetRange: Range { + Range(_uncheckedBounds: ( + _slice._startIndex._encodedOffset, _slice._endIndex._encodedOffset)) + } + + internal func _characterStride(startingAt i: Index) -> Int { + _internalInvariant(i._isScalarAligned) + + // Fast path if the index already has its stride cached. Substrings that + // don't start on a grapheme cluster boundary may have different grapheme + // break positions than their base string, so we must ignore the cache in + // that case. 
+ if let d = i.characterStride, _knownToStartOnGraphemeBreak { + // Make sure a cached stride cannot lead us beyond the substring's end + // index. This can happen if `self` ends between grapheme cluster + // boundaries. + return Swift.min(d, endIndex._encodedOffset &- i._encodedOffset) + } + + if i == endIndex { return 0 } + + return _slice.base._guts._opaqueCharacterStride( + startingAt: i._encodedOffset, in: _encodedOffsetRange) + } + + internal func _characterStride(endingAt i: Index) -> Int { + _internalInvariant(i._isScalarAligned) + + if i == startIndex { return 0 } + + return _slice.base._guts._opaqueCharacterStride( + endingAt: i._encodedOffset, in: _encodedOffsetRange) + } +} + #if SWIFT_ENABLE_REFLECTION extension Substring: CustomReflectable { public var customMirror: Mirror { return String(self).customMirror } @@ -601,9 +753,11 @@ extension Substring { /// Creates an instance that slices `base` at `_bounds`. @inlinable internal init(_ base: String.UnicodeScalarView, _bounds: Range) { + let start = base._guts.scalarAlign(_bounds.lowerBound) + let end = base._guts.scalarAlign(_bounds.upperBound) _slice = Slice( base: String(base._guts).unicodeScalars, - bounds: _bounds) + bounds: Range(_uncheckedBounds: (start, end))) } } } @@ -676,6 +830,7 @@ extension Substring.UnicodeScalarView: BidirectionalCollection { @inlinable public subscript(r: Range) -> Substring.UnicodeScalarView { + _failEarlyRangeCheck(r, bounds: startIndex.. + + + s.unicodeScalars.indices.forEach { + print("\($0) -> U+\(String(s.unicodeScalars[$0].value, radix: 16, uppercase: true)) \(s.unicodeScalars[$0].properties.name ?? 
"\(s.unicodeScalars[$0].debugDescription)")") + } + + let i = s.unicodeScalars.index(s.unicodeScalars.startIndex, offsetBy: 1) // S + let j = s.unicodeScalars.index(s.unicodeScalars.startIndex, offsetBy: 3) // A + // Per SE-0180, `s[i..(_:in:)") { guard #available(SwiftStdlib 5.1, *) else { return @@ -301,8 +331,6 @@ StringIndexTests.test("Misaligned") { doIt(string) } -#endif // _runtime(_ObjC) - StringIndexTests.test("Exhaustive Index Interchange") { // Exhaustively test aspects of string index interchange func testInterchange( @@ -464,5 +492,253 @@ StringIndexTests.test("Exhaustive Index Interchange") { testInterchange(("ab\r\ncдe\u{301}日🧟‍♀️x🧟x🏳️‍🌈🇺🇸🇨🇦" as NSString) as String) #endif // _runtime(_ObjC) } +#endif + +extension Collection { + // Assuming both `self` and `other` are sorted, call `body` for each element + // `a` in `other` together with the slice in `self` that starts with the first + // element in `self` that is greater than or equal to `a`, up to the first + // element that is greater than or equal to the next value in `other`. + // + // `other` must start with an item that is less than or equal to the first + // item in `self`. + func forEachIndexGroup( + by other: G, + body: (G.Index, Self.SubSequence) throws -> Void + ) rethrows +where G.Index == Self.Index + { + if other.isEmpty { + assert(self.isEmpty) + return + } + var i = other.startIndex + var j = self.startIndex + while i != other.endIndex { + let current = i + other.formIndex(after: &i) + let start = j + while j < i, j < self.endIndex { + self.formIndex(after: &j) + } + let end = j + try body(current, self[start ..< end]) + } + } +} + +extension String { + /// Returns a dictionary mapping each valid index to the index that lies on + /// the nearest scalar boundary, rounding down. 
+ func scalarMap() -> [String.Index: String.Index] { + var map: [String.Index: String.Index] = [:] + self.utf8.forEachIndexGroup(by: self.unicodeScalars) { scalar, slice in + for i in slice.indices { map[i] = scalar } + } + self.utf16.forEachIndexGroup(by: self.unicodeScalars) { scalar, slice in + for i in slice.indices { map[i] = scalar } + } + self.forEachIndexGroup(by: self.unicodeScalars) { scalar, slice in + for i in slice.indices { map[i] = scalar } + } + map[endIndex] = endIndex + return map + } + + /// Returns a dictionary mapping each valid index to the index that lies on + /// the nearest character boundary, rounding down. + func characterMap() -> [String.Index: String.Index] { + var map: [String.Index: String.Index] = [:] + self.utf8.forEachIndexGroup(by: self) { scalar, slice in + for i in slice.indices { map[i] = scalar } + } + self.utf16.forEachIndexGroup(by: self) { scalar, slice in + for i in slice.indices { map[i] = scalar } + } + self.unicodeScalars.forEachIndexGroup(by: self) { scalar, slice in + for i in slice.indices { map[i] = scalar } + } + map[endIndex] = endIndex + return map + } +} + +StringIndexTests.test("Extra Exhaustive Index Interchange") { + func check( + _ string: String, + stackTrace: SourceLocStack = SourceLocStack(), + showFrame: Bool = true, + file: String = #file, + line: UInt = #line + ) { + let scalarMap = string.scalarMap() + let characterMap = string.characterMap() + + // This is a list of every valid index in every string view, including end + // indices. We keep equal indices because they may have different grapheme + // size caches or flags etc. 
+ var allIndices = Array(string.indices) + [string.endIndex] + allIndices += Array(string.unicodeScalars.indices) + [string.unicodeScalars.endIndex] + allIndices += Array(string.utf8.indices) + [string.utf8.endIndex] + allIndices += Array(string.utf16.indices) + [string.utf16.endIndex] + + func referenceCharacterDistance( + from i: String.Index, to j: String.Index + ) -> Int { + let ci = characterMap[i]! + let cj = characterMap[j]! + let si = scalarMap[i]! + let sj = scalarMap[j]! + var d = string.distance(from: ci, to: cj) + if si < sj { + if ci == cj { d = 1 } + else if cj < sj { d += 1 } + } else if si > sj { + if ci == cj { d = -1 } + else if ci < si { d -= 1 } + } + return d + } + + for i in allIndices { + for j in allIndices { + let si = scalarMap[i]! + let sj = scalarMap[j]! + + let characterDistance = referenceCharacterDistance(from: i, to: j) + let scalarDistance = string.unicodeScalars.distance(from: si, to: sj) + + // Check distance calculations. + if #available(SwiftStdlib 5.7, *) { + expectEqual( + string.distance(from: i, to: j), + characterDistance, + """ + string: \(string.debugDescription) + i: \(i) + j: \(j) + """) + if i <= j { + expectEqual(string[i ..< j].count, characterDistance, + """ + string: \(string.debugDescription) + i: \(i) + j: \(j) + """) + } + } + + expectEqual( + string.unicodeScalars.distance(from: i, to: j), + scalarDistance, + """ + string: \(string.debugDescription) + i: \(i) + j: \(j) + """) + if i <= j { + expectEqual(string.unicodeScalars[i ..< j].count, scalarDistance, + """ + string: \(string.debugDescription) + i: \(i) + j: \(j) + """) + } + + // Check reachability of substring bounds. 
+ if i <= j { + if #available(SwiftStdlib 5.7, *) { + let substring = string[i ..< j] + expectEqual( + substring.index(substring.startIndex, offsetBy: characterDistance), + substring.endIndex, + """ + string: \(string.debugDescription) + i: \(i) + j: \(j) + distance: \(characterDistance) + """) + expectEqual( + substring.index(substring.endIndex, offsetBy: -characterDistance), + substring.startIndex, + """ + string: \(string.debugDescription) + i: \(i) + j: \(j) + distance: \(-characterDistance) + """) + } + let subscalars = string.unicodeScalars[i ..< j] + expectEqual( + subscalars.index(subscalars.startIndex, offsetBy: scalarDistance), + subscalars.endIndex, + """ + string: \(string.debugDescription) + i: \(i) + j: \(j) + distance: \(scalarDistance) + """) + expectEqual( + subscalars.index(subscalars.endIndex, offsetBy: -scalarDistance), + subscalars.startIndex, + """ + string: \(string.debugDescription) + i: \(i) + j: \(j) + distance: \(-scalarDistance) + """) + } + } + } + + // Check `String.index(_:offsetBy:limitedBy:)`. + if #available(SwiftStdlib 5.7, *) { + for i in allIndices { + for j in string.indices + [string.endIndex] { // End on a char boundary + let distance = referenceCharacterDistance(from: i, to: j) + for limit in allIndices { + let expectHit = ( + distance > 0 && i <= limit && j > limit ? true + : distance < 0 && i >= limit && j < limit ? true + : false) + expectEqual( + string.index(i, offsetBy: distance, limitedBy: limit), + expectHit ? 
nil : j, + """ + string: \(string.debugDescription) + i: \(i) + j: \(j) (distance: \(distance)) + limit: \(limit) + """) + } + } + } + } + } + + let strings: [StaticString] = [ + "abc\r\ndefg", + "ab\r\ncдe\u{301}日🧟‍♀️x🧟x🏳️‍🌈🇺🇸🇨🇦", + ] + + for s in strings { + let str = "\(s)" + print("-------------------------------------------------------------------") + str.unicodeScalars.indices.forEach { i in + let scalar = str.unicodeScalars[i] + let value = String(scalar.value, radix: 16, uppercase: true) + let name = scalar.properties.name ?? "\(scalar.debugDescription)" + print("\(i) -> U+\(value) \(name)") + } + + check(str) + + #if _runtime(_ObjC) + let nsstr = NSString(utf8String: s.utf8Start)! + check(nsstr as String) + #endif + } +} + runAllTests() From 6e18955f901ae4328991311b8e8696ad74dc485c Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Tue, 1 Mar 2022 19:47:16 -0800 Subject: [PATCH 02/83] [stdlib] Add bookkeeping to keep track of the encoding of strings and indices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Assign some previously reserved bits in String.Index and _StringObject to keep track of their associated storage encoding (either UTF-8 or UTF-16). None of these bits will be reliably set in processes that load binaries compiled with older stdlib releases, but when they do end up getting set, we can use them opportunistically to more reliably detect cases where an index is applied on a string with a mismatching encoding. As more and more code gets recompiled with 5.7+, the stdlib will gradually become able to detect such issues with complete accuracy. Code that misuses indices this way was always considered broken; however, String wasn’t able to reliably detect these runtime errors before. Therefore, I expect there is a large amount of broken code out there that keeps using bridged Cocoa String indices (UTF-16) after a mutation turns them into native UTF-8 strings. 
Therefore, instead of trapping, this commit silently corrects the issue, transcoding the offsets into the correct encoding. It would probably be a good idea to also emit a runtime warning in addition to recovering from the error. This would generate some noise that would gently nudge folks to fix their code. rdar://89369680 --- stdlib/public/core/StringBridge.swift | 7 +- stdlib/public/core/StringCharacterView.swift | 132 +++++++++++---- stdlib/public/core/StringGuts.swift | 100 +++++++++++- stdlib/public/core/StringGutsSlice.swift | 10 +- stdlib/public/core/StringIndex.swift | 131 +++++++++++++-- stdlib/public/core/StringObject.swift | 162 +++++++++++++++---- stdlib/public/core/Substring.swift | 145 +++++++++++++---- stdlib/public/core/UnicodeHelpers.swift | 8 +- 8 files changed, 587 insertions(+), 108 deletions(-) diff --git a/stdlib/public/core/StringBridge.swift b/stdlib/public/core/StringBridge.swift index a62dbeec5365a..195d702aacd38 100644 --- a/stdlib/public/core/StringBridge.swift +++ b/stdlib/public/core/StringBridge.swift @@ -614,10 +614,13 @@ extension String { // TODO: We'd rather emit a valid ObjC object statically than create a // shared string class instance. let gutsCountAndFlags = _guts._object._countAndFlags + let countAndFlags = _StringObject.CountAndFlags( + sharedCount: _guts.count, + isASCII: gutsCountAndFlags.isASCII, + isUTF16: false) return __SharedStringStorage( immortal: _guts._object.fastUTF8.baseAddress!, - countAndFlags: _StringObject.CountAndFlags( - sharedCount: _guts.count, isASCII: gutsCountAndFlags.isASCII)) + countAndFlags: countAndFlags) } _internalInvariant(_guts._object.hasObjCBridgeableObject, diff --git a/stdlib/public/core/StringCharacterView.swift b/stdlib/public/core/StringCharacterView.swift index 9443e024e3337..e53eed2a6aee0 100644 --- a/stdlib/public/core/StringCharacterView.swift +++ b/stdlib/public/core/StringCharacterView.swift @@ -49,21 +49,34 @@ extension String: BidirectionalCollection { /// `endIndex`. 
/// - Returns: The index value immediately after `i`. public func index(after i: Index) -> Index { + let i = _guts.ensureMatchingEncoding(i) _precondition(i < endIndex, "String index is out of bounds") + let r = _uncheckedIndex(after: _guts.scalarAlign(i)) + return _guts.markEncoding(r) + } + /// A version of `index(after:)` that assumes that the given index: + /// + /// - has the right encoding, + /// - is within bounds, and + /// - is scalar aligned. + /// + /// It does not mark the encoding of the returned index. + internal func _uncheckedIndex(after i: Index) -> Index { // FIXME: Unlike `index(before:)`, this function may return incorrect // results if `i` isn't on a grapheme cluster boundary. (The grapheme // breaking algorithm assumes we start on a break when we go forward.) + _internalInvariant(_guts.hasMatchingEncoding(i)) + _internalInvariant(i < endIndex) + _internalInvariant(i._isScalarAligned) // TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc. - let i = _guts.scalarAlign(i) let stride = _characterStride(startingAt: i) let nextOffset = i._encodedOffset &+ stride - let nextStride = _characterStride( - startingAt: Index(_encodedOffset: nextOffset)._scalarAligned) - - return Index( - encodedOffset: nextOffset, characterStride: nextStride)._scalarAligned + let nextIndex = Index(_encodedOffset: nextOffset)._scalarAligned + let nextStride = _characterStride(startingAt: nextIndex) + let r = Index(encodedOffset: nextOffset, characterStride: nextStride) + return r._scalarAligned } /// Returns the position immediately before the given index. @@ -72,7 +85,7 @@ extension String: BidirectionalCollection { /// `startIndex`. /// - Returns: The index value immediately before `i`. public func index(before i: Index) -> Index { - // TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc. 
+ let i = _guts.ensureMatchingEncoding(i) // Note: bounds checking in `index(before:)` is tricky as scalar aligning an // index may need to access storage, but it may also move it closer towards @@ -82,11 +95,30 @@ extension String: BidirectionalCollection { let i = _guts.scalarAlign(i) _precondition(i > startIndex, "String index is out of bounds") + let r = _uncheckedIndex(before: _guts.scalarAlign(i)) + return _guts.markEncoding(r) + } + + /// A version of `index(before:)` that assumes that the given index: + /// + /// - has the right encoding, + /// - is within bounds, and + /// - is scalar aligned. + /// + /// It does not mark the encoding of the returned index. + internal func _uncheckedIndex(before i: Index) -> Index { + _internalInvariant(_guts.hasMatchingEncoding(i)) + _internalInvariant(i > startIndex && i <= endIndex) + _internalInvariant(i._isScalarAligned) + + // TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc. let stride = _characterStride(endingAt: i) let priorOffset = i._encodedOffset &- stride - return Index( - encodedOffset: priorOffset, characterStride: stride)._scalarAligned + + let r = Index(encodedOffset: priorOffset, characterStride: stride) + return r._scalarAligned } + /// Returns an index that is the specified distance from the given index. /// /// The following example obtains an index advanced four positions from a @@ -109,10 +141,29 @@ extension String: BidirectionalCollection { /// is the same value as the result of `abs(distance)` calls to /// `index(before:)`. /// - Complexity: O(*n*), where *n* is the absolute value of `distance`. - @inlinable @inline(__always) public func index(_ i: Index, offsetBy distance: Int) -> Index { + // Note: in Swift 5.6 and below, this method used to be inlinable, + // forwarding to `_index(_:offsetBy:)`. + // TODO: known-ASCII and single-scalar-grapheme fast path, etc. 
- return _index(i, offsetBy: distance) + + var i = _guts.ensureMatchingEncoding(i) + _precondition(i >= startIndex && i <= endIndex, + "String index is out of bounds") + i = _guts.scalarAlign(i) + + if distance >= 0 { + for _ in stride(from: 0, to: distance, by: 1) { + _precondition(i < endIndex, "String index is out of bounds") + i = _uncheckedIndex(after: i) + } + } else { + for _ in stride(from: 0, to: distance, by: -1) { + _precondition(i > startIndex, "String index is out of bounds") + i = _uncheckedIndex(before: i) + } + } + return _guts.markEncoding(i) } /// Returns an index that is the specified distance from the given index, @@ -158,6 +209,8 @@ extension String: BidirectionalCollection { ) -> Index? { // Note: In Swift 5.6 and below, this function used to be inlinable, // forwarding to `BidirectionalCollection._index(_:offsetBy:limitedBy:)`. + // Unfortunately, that approach isn't compatible with SE-0180, as it doesn't + // support cases where `i` or `limit` aren't character aligned. // TODO: known-ASCII and single-scalar-grapheme fast path, etc. @@ -167,23 +220,30 @@ extension String: BidirectionalCollection { // Note: `limit` is intentionally not scalar aligned to ensure our behavior // exactly matches the documentation above. 
+ let limit = _guts.ensureMatchingEncoding(limit) - let start = _guts.scalarAlign(i) - var i = start + var i = _guts.ensureMatchingEncoding(i) + _precondition(i >= startIndex && i <= endIndex, + "String index is out of bounds") + i = _guts.scalarAlign(i) + + let start = i if distance >= 0 { for _ in stride(from: 0, to: distance, by: 1) { guard limit < start || i < limit else { return nil } - formIndex(after: &i) + _precondition(i < endIndex, "String index is out of bounds") + i = _uncheckedIndex(after: i) } guard limit < start || i <= limit else { return nil } } else { for _ in stride(from: 0, to: distance, by: -1) { guard limit > start || i > limit else { return nil } - formIndex(before: &i) + _precondition(i > startIndex, "String index is out of bounds") + i = _uncheckedIndex(before: i) } guard limit > start || i >= limit else { return nil } } - return i + return _guts.markEncoding(i) } /// Returns the distance between two indices. @@ -199,32 +259,40 @@ extension String: BidirectionalCollection { // Note: In Swift 5.6 and below, this function used to be inlinable, // forwarding to `BidirectionalCollection._distance(from:to:)`. + // FIXME: Due to the `index(after:)` problem above, this function doesn't + // always return consistent results when the given indices fall between + // grapheme breaks -- swapping `start` and `end` may change the magnitude of + // the result. + + var start = _guts.ensureMatchingEncoding(start) + var end = _guts.ensureMatchingEncoding(end) + + _precondition( + start >= startIndex && start <= endIndex && + end >= startIndex && end <= endIndex, + "String index is out of bounds") + + start = _guts.scalarAlign(start) + end = _guts.scalarAlign(end) + // TODO: known-ASCII and single-scalar-grapheme fast path, etc. 
- let start = _guts.scalarAlign(start) - let end = _guts.scalarAlign(end) // Per SE-0180, `start` and `end` are allowed to fall in between grapheme // breaks, in which case this function must still terminate without trapping // and return a result that makes sense. - // FIXME: Due to the `index(after:)` problem above, this function doesn't - // always return consistent results when the given indices fall between - // grapheme breaks -- swapping `start` and `end` may change the magnitude of - // the result. - var i = start var count = 0 - if i < end { while i < end { // Note `<` instead of `==` count += 1 - formIndex(after: &i) + i = _uncheckedIndex(after: i) } } else if i > end { while i > end { // Note `<` instead of `==` count -= 1 - formIndex(before: &i) + i = _uncheckedIndex(before: i) } } return count @@ -245,11 +313,17 @@ extension String: BidirectionalCollection { /// /// - Parameter i: A valid index of the string. `i` must be less than the /// string's end index. - @inlinable @inline(__always) + @inlinable @inline(__always) // FIXME(lorentey): Consider removing these. If + // `index(after:)` isn't inlinable, does it + // really matter if this one is? (Potential + // _guts-related optimizations notwithstanding.) + // `subscript` being inlinable forces a bunch of + // new additions to be _aEIC, even though they + // ought to be internal. 
+ public subscript(i: Index) -> Character { + var i = _guts.ensureMatchingEncoding(i) _boundsCheck(i) - - let i = _guts.scalarAlign(i) + i = _guts.scalarAlign(i) let distance = _characterStride(startingAt: i) return _guts.errorCorrectedCharacter( diff --git a/stdlib/public/core/StringGuts.swift b/stdlib/public/core/StringGuts.swift index 163b06d3b626c..887f058aeff27 100644 --- a/stdlib/public/core/StringGuts.swift +++ b/stdlib/public/core/StringGuts.swift @@ -288,11 +288,107 @@ extension _StringGuts { @inlinable @inline(__always) internal var startIndex: String.Index { - return Index(_encodedOffset: 0)._scalarAligned + Index(_encodedOffset: 0)._scalarAligned._encodingIndependent } + @inlinable @inline(__always) internal var endIndex: String.Index { - return Index(_encodedOffset: self.count)._scalarAligned + markEncoding(Index(_encodedOffset: self.count)._scalarAligned) + } + + @inlinable @inline(__always) + internal func index(atOffset offset: Int) -> String.Index { + markEncoding(Index(_encodedOffset: offset)._scalarAligned) + } +} + +// Encoding +extension _StringGuts { + @_alwaysEmitIntoClient // Swift 5.7 + internal func markEncoding(_ i: String.Index) -> String.Index { + if _slowPath(isForeign) { + // FIXME: Instead of having an opaque path here, we should define the same + // encoding flags in StringObject and pick them up from there. The flags + // can be initialized at the time the foreign string is created. + guard + #available(macOS 9999, iOS 9999, watchOS 9999, tvOS 9999, *) // SwiftStdlib 5.7 + else { + // We know all foreign strings were UTF-16 in releases < 5.7 + return i._knownUTF16 + } + return _foreignMarkEncoding(i) + } + return i._knownUTF8 + } + + @_effects(readnone) + @available(SwiftStdlib 5.7, *) + @usableFromInline + internal func _foreignMarkEncoding(_ i: String.Index) -> String.Index { + // Currently foreign indices always have UTF-16 offsets.
+ i._knownUTF16 + } + + internal func hasMatchingEncoding(_ i: String.Index) -> Bool { + (isForeign && i._canBeUTF16) || (!isForeign && i._canBeUTF8) + } + + /// Return an index whose encoding can be assumed to match that of `self`. + /// + /// Detecting an encoding mismatch isn't always possible -- older binaries did + /// not set the flags that this method relies on. However, false positives + /// cannot happen: if this method detects a mismatch, then it is guaranteed to + /// be a real one. + @_alwaysEmitIntoClient + @inline(__always) + internal func ensureMatchingEncoding(_ i: String.Index) -> String.Index { + if _fastPath(!isForeign && i._canBeUTF8) { return i } + return _slowEnsureMatchingEncoding(i) + } + + @_alwaysEmitIntoClient + internal func _slowEnsureMatchingEncoding(_ i: String.Index) -> String.Index { + _internalInvariant(isForeign || !i._canBeUTF8) + if isForeign { + // Opportunistically detect attempts to use an UTF-8 index on a UTF-16 + // string. Strings don't usually get converted to UTF-16 storage, so it + // seems okay to trap in this case -- the index most likely comes from an + // unrelated string. (Trapping here may still turn out to affect binary + // compatibility with broken code in existing binaries running with new + // stdlibs. If so, we can replace this with the same transcoding hack as + // in the UTF-16->8 case below.) + // + // Note that this trap is not guaranteed to trigger when the process + // includes client binaries compiled with a previous Swift release. + // (`i._canBeUTF16` can sometimes return true in that case even if the + // index actually came from an UTF-8 string.) However, the trap will still + // often trigger in this case, as long as the index was initialized by + // code that was compiled with 5.7+. + // + // This trap can never trigger on OSes that have stdlibs <= 5.6, because + // those versions never set the `isKnownUTF16` flag in `_StringObject`. 
+ // + _precondition(!_object.isKnownUTF16 || i._canBeUTF16, + "Invalid string index") + return i + } + // If we get here, then we know for sure that this is an attempt to use an + // UTF-16 index on a UTF-8 string. + // + // This can happen if `self` was originally verbatim-bridged, and someone + // mistakenly attempts to keep using an old index after a mutation. This is + // technically an error, but trapping here would trigger a lot of broken + // code that previously happened to work "fine" on e.g. ASCII strings. + // Instead, attempt to convert the offset to UTF-8 code units by transcoding + // the string. This can be slow, but it often results in a usable index, + // even if non-ASCII characters are present. (UTF-16 breadcrumbs help reduce + // the severity of the slowdown.) + + // FIXME: Consider emitting a runtime warning here. + // FIXME: Consider performing a linked-on-or-after check & trapping if the + // client executable was built on some particular future Swift release. + let utf16 = String(self).utf16 + return utf16.index(utf16.startIndex, offsetBy: i._encodedOffset) } } diff --git a/stdlib/public/core/StringGutsSlice.swift b/stdlib/public/core/StringGutsSlice.swift index e3f5b4e175271..dbd0e54744315 100644 --- a/stdlib/public/core/StringGutsSlice.swift +++ b/stdlib/public/core/StringGutsSlice.swift @@ -29,6 +29,9 @@ internal struct _StringGutsSlice { @inline(__always) internal init(_ guts: _StringGuts, _ offsetRange: Range) { + _internalInvariant( + guts.isOnUnicodeScalarBoundary(offsetRange.lowerBound) + && guts.isOnUnicodeScalarBoundary(offsetRange.upperBound)) self._guts = guts self._offsetRange = offsetRange } @@ -74,8 +77,11 @@ internal struct _StringGutsSlice { @inlinable internal var range: Range { @inline(__always) get { - return String.Index(_encodedOffset: _offsetRange.lowerBound) - ..< String.Index(_encodedOffset: _offsetRange.upperBound) + let lower = String.Index(_encodedOffset: _offsetRange.lowerBound) + ._scalarAligned + let higher = 
String.Index(_encodedOffset: _offsetRange.upperBound) + ._scalarAligned + return Range(_uncheckedBounds: (lower, higher)) } } diff --git a/stdlib/public/core/StringIndex.swift b/stdlib/public/core/StringIndex.swift index 0b518b45e4913..76af50f251556 100644 --- a/stdlib/public/core/StringIndex.swift +++ b/stdlib/public/core/StringIndex.swift @@ -16,20 +16,20 @@ import SwiftShims String's Index has the following layout: - ┌──────────┬───────────────────╥────────────────┬──────────╥────────────────┐ - │ b63:b16 │ b15:b14 ║ b13:b8 │ b7:b1 ║ b0 │ - ├──────────┼───────────────────╫────────────────┼──────────╫────────────────┤ - │ position │ transcoded offset ║ grapheme cache │ reserved ║ scalar aligned │ - └──────────┴───────────────────╨────────────────┴──────────╨────────────────┘ - └──────── resilient ────────┘ - -Position, transcoded offset, and scalar aligned are fully exposed in the ABI. -Grapheme cache and reserved are partially resilient: the fact that there are 13 -bits with a default value of `0` is ABI, but not the layout, construction, or + ┌──────────┬────────────────╥────────────────┬───────╥───────┐ + │ b63:b16 │ b15:b14 ║ b13:b8 │ b7:b3 ║ b2:b0 │ + ├──────────┼────────────────╫────────────────┼───────╫───────┤ + │ position │ transc. offset ║ grapheme cache │ rsvd ║ flags │ + └──────────┴────────────────╨────────────────┴───────╨───────┘ + └────── resilient ───────┘ + +Position, transcoded offset, and flags are fully exposed in the ABI. Grapheme +cache and reserved bits are partially resilient: the fact that there are 11 bits +with a default value of `0` is ABI, but not the layout, construction, or interpretation of those bits. All use of grapheme cache should be behind non-inlinable function calls. Inlinable code should not set a non-zero value to -grapheme cache bits: doing so breaks back deployment as they will be interpreted -as a set cache. +resilient bits: doing so breaks future evolution as the meaning of those bits +isn't frozen. 
- position aka `encodedOffset`: A 48-bit offset into the string's code units @@ -40,12 +40,18 @@ as a set cache. - grapheme cache: A 6-bit value remembering the distance to the next grapheme boundary. -- reserved: 7-bit for future use. +- reserved: 5 unused bits available for future flags etc. The meaning of each + bit may change between stdlib versions. These must be set to zero if + constructing an index in inlinable code. -- scalar aligned, whether this index is known to be scalar-aligned (see below) + b2: UTF-16. If set, position is known to be in UTF-16 code units [Swift 5.7+] + b1: UTF-8. If set, position is known to be in UTF-8 code units [Swift 5.7+] + b0: Scalar alignment. If set, index is known to be scalar-aligned (see below) +Before Swift 5.7, bits b1 and b2 used to be part of the resilient slice. +See the note on Index Encoding below to see how this works. */ extension String { @@ -72,7 +78,7 @@ extension String.Index { @inlinable @inline(__always) internal var isZeroPosition: Bool { return orderingValue == 0 } - /// The UTF-16 code unit offset corresponding to this Index + /// The UTF-16 code unit offset corresponding to this index. public func utf16Offset(in s: S) -> Int { return s.utf16.distance(from: s.utf16.startIndex, to: self) } @@ -272,6 +278,95 @@ extension String.Index { } } +/* + Index Encoding + + Swift 5.7 introduced bookkeeping to keep track of the Unicode encoding + associated with the position value in String indices. Indices whose position + is an offset into UTF-8 storage come with the corresponding flag set, and a + separate flag is set for UTF-16 indices. (Only foreign strings can be UTF-16 + encoded. As of 5.7, all foreign strings are UTF-16; but this is subject to + change later if we ever decide to implement additional foreign forms.) + + In releases before 5.7, the bits corresponding to these flags were considered + reserved, and they were both set to zero in inlinable code.
This means that + (on ABI stable platforms at least) we cannot assume that either of these bits + will be reliably set. If they are both clear, then we must fall back to + assuming that the index has the right encoding for whatever string it is used + on. However, if any of these bits are set, then the other bit's value is also + reliable -- whether it's set or cleared. + + The indices of ASCII strings are encoding-independent, i.e. transcoding such + strings from UTF-8 to UTF-16 (or vice versa) does not change the position + value of any of their indices. Therefore it isn't an error for an index to + have both of these flags set. (The start index of every string also behaves + this way: position zero is the same no matter how the rest of the string is + stored.) + + These two bits (along with the isKnownUTF16 flag in StringObject) allow newer + versions of the Standard Library to more reliably catch runtime errors where + client code is applying an index from a UTF-16 string to a UTF-8 one, or vice + versa. This typically happens when indices from a UTF-16 Cocoa string that was + verbatim bridged into Swift are accidentally applied to a mutated version of + the same string. (The mutation turns it into a UTF-8 native string, where the + same numerical offsets might correspond to wildly different logical + positions.) + + Such code has always been broken, as the old indices are documented to be no + longer valid after the mutation; however, in previous releases this bug wasn't + reliably detected, and if the code was only ever tested on ASCII strings, then + the bug could lie dormant for a long time. (Until the code encounters a + non-ASCII character and someone gets surprised that the results no longer make + sense.) + + As more code gets rebuilt with Swift 5.7+, the stdlib will gradually become + able to reliably catch and correct all such issues. The error cases are + handled in `_StringGuts.ensureMatchingEncoding(_:)`; see there for the sordid + details.
+ +*/ +extension String.Index { + /// Returns true if the position in this index is okay to interpret as offset + /// into UTF-8-encoded string storage. + /// + /// (This returns true if either we know for sure that this is an UTF-8 index, + /// or if we don't have enough information to determine its encoding.) + @_alwaysEmitIntoClient // Swift 5.7 + @inline(__always) + internal var _canBeUTF8: Bool { + // An index cannot be UTF-8 only if it has just the UTF-16 flag set. + _rawBits & 0x6 != 0x04 + } + + /// Returns true if the position in this index is okay to interpret as offset + /// into UTF-16-encoded string storage. + /// + /// (This returns true if either we know for sure that this is an UTF-16 + /// index, or if we don't have enough information to determine its + /// encoding.) + @_alwaysEmitIntoClient // Swift 5.7 + @inline(__always) + internal var _canBeUTF16: Bool { + // An index cannot be UTF-16 only if it has just the UTF-8 flag set. + _rawBits & 0x6 != 0x02 + } + + /// Returns the same index with the UTF-8 bit set. + @_alwaysEmitIntoClient // Swift 5.7 + @inline(__always) + internal var _knownUTF8: Self { Self(_rawBits | 0x2) } + + /// Returns the same index with the UTF-16 bit set. + @_alwaysEmitIntoClient // Swift 5.7 + @inline(__always) + internal var _knownUTF16: Self { Self(_rawBits | 0x4) } + + /// Returns the same index with both UTF-8 & UTF-16 bits set.
+ @_alwaysEmitIntoClient // Swift 5.7 + @inline(__always) + internal var _encodingIndependent: Self { Self(_rawBits | 0x6) } +} + extension String.Index: Equatable { @inlinable @inline(__always) public static func == (lhs: String.Index, rhs: String.Index) -> Bool { @@ -313,6 +408,12 @@ extension String.Index: CustomStringConvertible { if _isScalarAligned { d += ", scalarAligned" } + if _rawBits & 0x2 != 0 { + d += ", utf8" + } + if _rawBits & 0x4 != 0 { + d += ", utf16" + } d += ")" return d } diff --git a/stdlib/public/core/StringObject.swift b/stdlib/public/core/StringObject.swift index b087e87f51eb7..21f7de961d806 100644 --- a/stdlib/public/core/StringObject.swift +++ b/stdlib/public/core/StringObject.swift @@ -580,40 +580,79 @@ extension _StringObject { All non-small forms share the same structure for the other half of the bits (i.e. non-object bits) as a word containing code unit count and various - performance flags. The top 16 bits are for performance flags, which are not - semantically relevant but communicate that some operations can be done more - efficiently on this particular string, and the lower 48 are the code unit - count (aka endIndex). - -┌─────────┬───────┬──────────────────┬─────────────────┬────────┬───────┐ -│ b63 │ b62 │ b61 │ b60 │ b59:48 │ b47:0 │ -├─────────┼───────┼──────────────────┼─────────────────┼────────┼───────┤ -│ isASCII │ isNFC │ isNativelyStored │ isTailAllocated │ TBD │ count │ -└─────────┴───────┴──────────────────┴─────────────────┴────────┴───────┘ - - isASCII: set when all code units are known to be ASCII, enabling: + performance flags. The top 16 bits are nonessential flags; these aren't + critical for correct operation, but they may provide additional guarantees that + allow more efficient operation or more reliable detection of runtime errors. + The lower 48 bits contain the code unit count (aka endIndex). 
+ +┌──────┬──────┬──────┬──────┬──────┬──────────┬───────────────────────────────┐ +│ b63 │ b62 │ b61 │ b60 │ b59 │ b58:48 │ b47:0 │ +├──────┼──────┼──────┼──────┼──────┼──────────┼───────────────────────────────┤ +│ ASCII│ NFC │native│ tail │ UTF16│ reserved │ count │ +└──────┴──────┴──────┴──────┴──────┴──────────┴───────────────────────────────┘ + + b63: isASCII. set when all code units are known to be ASCII, enabling: - Trivial Unicode scalars, they're just the code units - Trivial UTF-16 transcoding (just bit-extend) - Also, isASCII always implies isNFC - isNFC: set when the contents are in normal form C + + b62: isNFC. set when the contents are in normal form C - Enables trivial lexicographical comparisons: just memcmp - `isASCII` always implies `isNFC`, but not vice versa - isNativelyStored: set for native stored strings + + b61: isNativelyStored. set for native stored strings - `largeAddressBits` holds an instance of `_StringStorage`. - I.e. the start of the code units is at the stored address + `nativeBias` - isTailAllocated: contiguous UTF-8 code units starts at address + `nativeBias` + - NOTE: isNativelyStored is *specifically* allocated to b61 to align with the + bit-position of isSmall on the BridgeObject. This allows us to check for + native storage without an extra branch guarding against smallness. See + `_StringObject.hasNativeStorage` for this usage. + + b60: isTailAllocated. contiguous UTF-8 code units starts at address + `nativeBias` - `isNativelyStored` always implies `isTailAllocated`, but not vice versa (e.g. literals) - `isTailAllocated` always implies `isFastUTF8` - TBD: Reserved for future usage - - Setting a TBD bit to 1 must be semantically equivalent to 0 - - I.e. it can only be used to "cache" fast-path information in the future - count: stores the number of code units, corresponds to `endIndex`. - NOTE: isNativelyStored is *specifically* allocated to b61 to align with the - bit-position of isSmall on the BridgeObject. 
This allows us to check for - native storage without an extra branch guarding against smallness. See - `_StringObject.hasNativeStorage` for this usage. + b59: isKnownUTF16. This bit is set if index positions in the string are known + to be measured in UTF-16 code units, rather than the default UTF-8. + - This is only ever set on UTF-16 foreign strings created in noninlinable + code in stdlib versions >= 5.7. On stdlibs <= 5.6, this bit is always set + to zero. + - Note that while as of 5.7 all foreign strings are UTF-16, this isn't + guaranteed to remain this way -- future versions of the stdlib may + introduce new foreign forms that use a different encoding. (Likely UTF-8.) + - Foreign strings are only created in non-inlinable code, so on stdlib + versions >=5.7, this bit always correctly reflects the correct encoding + for the string's offset values. + - This bit along with the two related bits in String.Index allow us to + opportunistically catch cases where an UTF-16 index is used on an UTF-8 + string (or vice versa), and to provide better error reporting & recovery. + As more code gets rebuilt with Swift 5.7+, the stdlib will gradually become + able to reliably catch all such issues. + - It is okay for isASCII strings to not set this flag, even if they are + UTF-16 encoded -- the offsets in that case can work in either encoding. + (This is not currently exercised, as foreign bridged strings never set + the isASCII flag.) + + b48-58: Reserved for future usage. + - Because Swift is ABI stable (on some platforms at least), these bits can + only be assigned semantics that don't affect interoperability with code + built with previous releases of the Standard Library, from 5.0 onward. + - Older binaries will not look at newly assigned bits, and they will not + set them, either (unless by side effect of calling into newly built code). + Such code must continue working. 
+ - Code in new versions of the stdlib must continue to work correctly even if + some of these newly assigned bits are never set -- as may be the case when + the initialization of a string was emitted entirely into an older client + binary. + - This typically means that these bits can only be used as optional + performance shortcuts, e.g. to signal the availability of a potential fast + path. (However, it is also possible to store information here that allows + more reliable detection & handling of runtime errors, like the + `isKnownUTF16` bit above.) + + b0-47: count. Stores the number of code units. Corresponds to the position of + the `endIndex`. */ extension _StringObject.CountAndFlags { @@ -639,6 +678,12 @@ extension _StringObject.CountAndFlags { return 0x1000_0000_0000_0000 } + @_alwaysEmitIntoClient // Swift 5.7 + @inline(__always) + internal static var isKnownUTF16Mask: UInt64 { + return 0x0800_0000_0000_0000 + } + // General purpose bottom initializer @inlinable @inline(__always) internal init( @@ -677,10 +722,53 @@ extension _StringObject.CountAndFlags { _internalInvariant(isTailAllocated == self.isTailAllocated) } + @inline(__always) + internal init( + count: Int, + isASCII: Bool, + isNFC: Bool, + isNativelyStored: Bool, + isTailAllocated: Bool, + isKnownUTF16: Bool + ) { + var rawBits = UInt64(truncatingIfNeeded: count) + _internalInvariant(rawBits <= _StringObject.CountAndFlags.countMask) + + if isASCII { + _internalInvariant(isNFC) + rawBits |= _StringObject.CountAndFlags.isASCIIMask + } + + if isNFC { + rawBits |= _StringObject.CountAndFlags.isNFCMask + } + + if isNativelyStored { + _internalInvariant(isTailAllocated) + rawBits |= _StringObject.CountAndFlags.isNativelyStoredMask + } + + if isTailAllocated { + rawBits |= _StringObject.CountAndFlags.isTailAllocatedMask + } + + if isKnownUTF16 { + rawBits |= _StringObject.CountAndFlags.isKnownUTF16Mask + } + + self.init(raw: rawBits) + _internalInvariant(count == self.count) + _internalInvariant(isASCII
== self.isASCII) + _internalInvariant(isNFC == self.isNFC) + _internalInvariant(isNativelyStored == self.isNativelyStored) + _internalInvariant(isTailAllocated == self.isTailAllocated) + _internalInvariant(isKnownUTF16 == self.isKnownUTF16) + } + @inlinable @inline(__always) internal init(count: Int, flags: UInt16) { - // Currently, we only use top 4 flags - _internalInvariant(flags & 0xF000 == flags) + // Currently, we only use top 5 flags + _internalInvariant(flags & 0xF800 == flags) let rawBits = UInt64(truncatingIfNeeded: flags) &<< 48 | UInt64(truncatingIfNeeded: count) @@ -710,13 +798,14 @@ extension _StringObject.CountAndFlags { isTailAllocated: true) } @inline(__always) - internal init(sharedCount: Int, isASCII: Bool) { + internal init(sharedCount: Int, isASCII: Bool, isUTF16: Bool) { self.init( count: sharedCount, isASCII: isASCII, isNFC: isASCII, isNativelyStored: false, - isTailAllocated: false) + isTailAllocated: false, + isKnownUTF16: isUTF16) } // @@ -750,6 +839,11 @@ extension _StringObject.CountAndFlags { internal var isTailAllocated: Bool { return 0 != _storage & _StringObject.CountAndFlags.isTailAllocatedMask } + @_alwaysEmitIntoClient + @inline(__always) // Swift 5.7 + internal var isKnownUTF16: Bool { + return 0 != _storage & _StringObject.CountAndFlags.isKnownUTF16Mask + } #if !INTERNAL_CHECKS_ENABLED @inlinable @inline(__always) internal func _invariantCheck() {} @@ -762,6 +856,10 @@ extension _StringObject.CountAndFlags { if isNativelyStored { _internalInvariant(isTailAllocated) } + if isKnownUTF16 { + _internalInvariant(!isNativelyStored) + _internalInvariant(!isTailAllocated) + } } #endif // INTERNAL_CHECKS_ENABLED } @@ -895,6 +993,13 @@ extension _StringObject { return _countAndFlags.isNFC } + @_alwaysEmitIntoClient // Swift 5.7 + @inline(__always) + internal var isKnownUTF16: Bool { + if isSmall { return false } + return _countAndFlags.isKnownUTF16 + } + // Get access to fast UTF-8 contents for large strings which provide it. 
@inlinable @inline(__always) internal var fastUTF8: UnsafeBufferPointer { @@ -994,7 +1099,8 @@ extension _StringObject { internal init( cocoa: AnyObject, providesFastUTF8: Bool, isASCII: Bool, length: Int ) { - let countAndFlags = CountAndFlags(sharedCount: length, isASCII: isASCII) + let countAndFlags = CountAndFlags( + sharedCount: length, isASCII: isASCII, isUTF16: !providesFastUTF8) let discriminator = Nibbles.largeCocoa(providesFastUTF8: providesFastUTF8) #if arch(i386) || arch(arm) || arch(arm64_32) || arch(wasm32) self.init( diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index e547f0b592fef..bd66639723e17 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -97,11 +97,17 @@ public struct Substring: Sendable { @usableFromInline internal var _slice: Slice - @inlinable + @usableFromInline internal init(_ slice: Slice) { - let _guts = slice.base._guts - let start = _guts.scalarAlign(slice.startIndex) - let end = _guts.scalarAlign(slice.endIndex) + let _guts = slice._base._guts + _internalInvariant( + _guts.hasMatchingEncoding(slice.startIndex) && + _guts.hasMatchingEncoding(slice.endIndex)) + _internalInvariant( + slice.startIndex >= _guts.startIndex && slice.endIndex <= _guts.endIndex) + + let start = slice.base._guts.scalarAlign(slice.startIndex) + let end = slice.base._guts.scalarAlign(slice.endIndex) self._slice = Slice( base: slice.base, @@ -168,32 +174,44 @@ extension Substring: StringProtocol { // leads to Collection conformance issues when the `Substring`'s bounds do // not fall on grapheme boundaries in `base`. 
+ let i = _slice.base._guts.ensureMatchingEncoding(i) + _precondition(i < endIndex && i >= startIndex, + "Substring index is out of bounds") + let r = _uncheckedIndex(after: _slice.base._guts.scalarAlign(i)) + return _slice.base._guts.markEncoding(r) + } + + /// A version of `index(after:)` that assumes that the given index: + /// + /// - has the right encoding, + /// - is within bounds, and + /// - is scalar aligned. + /// + /// It does not mark the encoding of the returned index. + internal func _uncheckedIndex(after i: Index) -> Index { // FIXME: Unlike `index(before:)`, this function may return incorrect // results if `i` isn't on a grapheme cluster boundary. (The grapheme // breaking algorithm assumes we start on a break when we go forward.) - - let i = _slice.base._guts.scalarAlign(i) - - _precondition(i < endIndex && i >= startIndex, - "Substring index is out of bounds") + _internalInvariant(_slice.base._guts.hasMatchingEncoding(i)) + _internalInvariant(i < endIndex) + _internalInvariant(i._isScalarAligned) let stride = _characterStride(startingAt: i) let nextOffset = i._encodedOffset &+ stride - let nextIndex = Index(_encodedOffset: nextOffset)._scalarAligned - guard _knownToStartOnGraphemeBreak else { + guard // Don't cache character strides in indices of exotic substrings whose // startIndex isn't aligned on a grapheme cluster boundary. (Their // grapheme breaks may not match with those in `base`.) - return nextIndex - } - guard nextIndex < endIndex || _knownToEndOnGraphemeBreak else { + _knownToStartOnGraphemeBreak, // Don't cache the stride if we end on a partial grapheme cluster. 
+ nextIndex < endIndex || _knownToEndOnGraphemeBreak + else { return nextIndex } let nextStride = _characterStride(startingAt: nextIndex) - return Index( - encodedOffset: nextOffset, characterStride: nextStride)._scalarAligned + let r = Index(encodedOffset: nextOffset, characterStride: nextStride) + return r._scalarAligned } public func index(before i: Index) -> Index { @@ -204,8 +222,24 @@ extension Substring: StringProtocol { // leads to Collection conformance issues when the `Substring`'s bounds do // not fall on grapheme boundaries in `base`. + let i = _slice.base._guts.ensureMatchingEncoding(i) _precondition(i <= endIndex && i > startIndex, "Substring index is out of bounds") + let r = _uncheckedIndex(before: _slice.base._guts.scalarAlign(i)) + return _slice.base._guts.markEncoding(r) + } + + /// A version of `index(before:)` that assumes that the given index: + /// + /// - has the right encoding, + /// - is within bounds, and + /// - is scalar aligned. + /// + /// It does not mark the encoding of the returned index. + internal func _uncheckedIndex(before i: Index) -> Index { + _internalInvariant(_slice.base._guts.hasMatchingEncoding(i)) + _internalInvariant(i < endIndex) + _internalInvariant(i._isScalarAligned) // TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc. let i = _slice.base._guts.scalarAlign(i) @@ -231,7 +265,24 @@ extension Substring: StringProtocol { // outside the substring to affect grapheme breaking results within the // substring. This leads to Collection conformance issues when the // `Substring`'s bounds do not fall on grapheme boundaries in `base`. - return _index(i, offsetBy: distance) + + var i = _slice.base._guts.ensureMatchingEncoding(i) + _precondition(i >= startIndex && i <= endIndex, + "String index is out of bounds") + i = _slice.base._guts.scalarAlign(i) + // TODO: known-ASCII and single-scalar-grapheme fast path, etc. 
+ if distance >= 0 { + for _ in stride(from: 0, to: distance, by: 1) { + _precondition(i < endIndex, "String index is out of bounds") + i = _uncheckedIndex(after: i) + } + } else { + for _ in stride(from: 0, to: distance, by: -1) { + _precondition(i > startIndex, "String index is out of bounds") + i = _uncheckedIndex(before: i) + } + } + return _slice.base._guts.markEncoding(i) } public func index( @@ -250,23 +301,30 @@ extension Substring: StringProtocol { // Note: `limit` is intentionally not scalar aligned to ensure our behavior // exactly matches the documentation. + let limit = _slice.base._guts.ensureMatchingEncoding(limit) - let start = _slice.base._guts.scalarAlign(i) - var i = start + var i = _slice.base._guts.ensureMatchingEncoding(i) + _precondition(i >= startIndex && i <= endIndex, + "String index is out of bounds") + i = _slice.base._guts.scalarAlign(i) + + let start = i if distance >= 0 { for _ in stride(from: 0, to: distance, by: 1) { guard limit < start || i < limit else { return nil } - formIndex(after: &i) + _precondition(i < endIndex, "String index is out of bounds") + i = _uncheckedIndex(after: i) } guard limit < start || i <= limit else { return nil } } else { for _ in stride(from: 0, to: distance, by: -1) { guard limit > start || i > limit else { return nil } - formIndex(before: &i) + _precondition(i > startIndex, "String index is out of bounds") + i = _uncheckedIndex(before: i) } guard limit > start || i >= limit else { return nil } } - return i + return _slice.base._guts.markEncoding(i) } public func distance(from start: Index, to end: Index) -> Int { @@ -277,32 +335,53 @@ extension Substring: StringProtocol { // substring. This leads to Collection conformance issues when the // `Substring`'s bounds do not fall on grapheme boundaries in `base`. 
+ // FIXME: Due to the `index(after:)` problem above, this function doesn't + // always return consistent results when the given indices fall between + // grapheme breaks -- swapping `start` and `end` may change the magnitude of + // the result. + + var start = _slice.base._guts.ensureMatchingEncoding(start) + var end = _slice.base._guts.ensureMatchingEncoding(end) + + _precondition( + start >= startIndex && start <= endIndex && + end >= startIndex && end <= endIndex, + "String index is out of bounds") + + start = _slice.base._guts.scalarAlign(start) + end = _slice.base._guts.scalarAlign(end) + // TODO: known-ASCII and single-scalar-grapheme fast path, etc. // Per SE-0180, `start` and `end` are allowed to fall in between grapheme // breaks, in which case this function must still terminate without trapping // and return a result that makes sense. - var i = _slice.base._guts.scalarAlign(start) - let end = _slice.base._guts.scalarAlign(end) - var count = 0 + var i = start + var count = 0 if i < end { while i < end { // Note `<` instead of `==` count += 1 - formIndex(after: &i) + i = _uncheckedIndex(after: i) } } else if i > end { while i > end { // Note `<` instead of `==` count -= 1 - formIndex(before: &i) + i = _uncheckedIndex(before: i) } } return count } public subscript(i: Index) -> Character { - get { return _slice[i] } + var i = _slice.base._guts.ensureMatchingEncoding(i) + _precondition(i >= startIndex && i < endIndex, + "Substring index is out of bounds") + i = _slice.base._guts.scalarAlign(i) + let distance = _characterStride(startingAt: i) + return _slice.base._guts.errorCorrectedCharacter( + startingAt: i._encodedOffset, endingAt: i._encodedOffset &+ distance) } public mutating func replaceSubrange( @@ -958,6 +1037,9 @@ extension Substring: ExpressibleByStringLiteral { extension String { @available(swift, introduced: 4) public subscript(r: Range) -> Substring { + let r = Range(_uncheckedBounds: ( + _guts.ensureMatchingEncoding(r.lowerBound), + 
_guts.ensureMatchingEncoding(r.upperBound))) _boundsCheck(r) return Substring(Slice(base: self, bounds: r)) } @@ -966,6 +1048,11 @@ extension String { extension Substring { @available(swift, introduced: 4) public subscript(r: Range) -> Substring { - return Substring(_slice[r]) + let r = Range(_uncheckedBounds: ( + _slice.base._guts.ensureMatchingEncoding(r.lowerBound), + _slice.base._guts.ensureMatchingEncoding(r.upperBound))) + _precondition(r.lowerBound >= startIndex && r.upperBound <= endIndex, + "Substring index range is out of bounds") + return Substring(Slice(base: _slice.base, bounds: r)) } } diff --git a/stdlib/public/core/UnicodeHelpers.swift b/stdlib/public/core/UnicodeHelpers.swift index 13341cb542f6a..76e64309b8fdc 100644 --- a/stdlib/public/core/UnicodeHelpers.swift +++ b/stdlib/public/core/UnicodeHelpers.swift @@ -233,6 +233,12 @@ extension _StringGuts { return self.withFastUTF8 { _decodeScalar($0, startingAt: i).0 } } + @_alwaysEmitIntoClient + @inline(__always) + internal func isOnUnicodeScalarBoundary(_ offset: Int) -> Bool { + isOnUnicodeScalarBoundary(String.Index(_encodedOffset: offset)) + } + @usableFromInline @_effects(releasenone) internal func isOnUnicodeScalarBoundary(_ i: String.Index) -> Bool { @@ -244,7 +250,7 @@ extension _StringGuts { if _fastPath(isFastUTF8) { return self.withFastUTF8 { - return !UTF8.isContinuation($0[i._encodedOffset]) + return !UTF8.isContinuation($0[_unchecked: i._encodedOffset]) } } From 15c7721caf1c362fc96a7f54529bfd92f2cdc799 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Tue, 1 Mar 2022 19:36:20 -0800 Subject: [PATCH 03/83] [stdlib] Use the new index encoding flags when marking the encoding of indices This removes an unnecessary opaque call from the inlinable path, but it preserves a runtime version check. 
--- stdlib/public/core/StringCharacterView.swift | 8 +-- stdlib/public/core/StringGuts.swift | 61 ++++++++++++-------- stdlib/public/core/Substring.swift | 8 +-- 3 files changed, 46 insertions(+), 31 deletions(-) diff --git a/stdlib/public/core/StringCharacterView.swift b/stdlib/public/core/StringCharacterView.swift index e53eed2a6aee0..8e14a8c3d83e7 100644 --- a/stdlib/public/core/StringCharacterView.swift +++ b/stdlib/public/core/StringCharacterView.swift @@ -52,7 +52,7 @@ extension String: BidirectionalCollection { let i = _guts.ensureMatchingEncoding(i) _precondition(i < endIndex, "String index is out of bounds") let r = _uncheckedIndex(after: _guts.scalarAlign(i)) - return _guts.markEncoding(r) + return _guts.internalMarkEncoding(r) } /// A version of `index(after:)` that assumes that the given index: @@ -96,7 +96,7 @@ extension String: BidirectionalCollection { _precondition(i > startIndex, "String index is out of bounds") let r = _uncheckedIndex(before: _guts.scalarAlign(i)) - return _guts.markEncoding(r) + return _guts.internalMarkEncoding(r) } /// A version of `index(before:)` that assumes that the given index: @@ -163,7 +163,7 @@ extension String: BidirectionalCollection { i = _uncheckedIndex(before: i) } } - return _guts.markEncoding(i) + return _guts.internalMarkEncoding(i) } /// Returns an index that is the specified distance from the given index, @@ -243,7 +243,7 @@ extension String: BidirectionalCollection { } guard limit > start || i >= limit else { return nil } } - return _guts.markEncoding(i) + return _guts.internalMarkEncoding(i) } /// Returns the distance between two indices. 
diff --git a/stdlib/public/core/StringGuts.swift b/stdlib/public/core/StringGuts.swift index 887f058aeff27..fc5dcc7bc3166 100644 --- a/stdlib/public/core/StringGuts.swift +++ b/stdlib/public/core/StringGuts.swift @@ -295,10 +295,15 @@ extension _StringGuts { internal var endIndex: String.Index { markEncoding(Index(_encodedOffset: self.count)._scalarAligned) } +} - @inlinable @inline(__always) - internal func index(atOffset offset: Int) -> String.Index { - markEncoding(Index(_encodedOffset: self.count)._scalarAligned) +@_alwaysEmitIntoClient +@inline(__always) +func _isSwiftStdlib_5_7() -> Bool { + if #available(macOS 9999, iOS 9999, watchOS 9999, tvOS 9999, *) { // SwiftStdlib 5.7 + return true + } else { + return false } } @@ -306,29 +311,39 @@ extension _StringGuts { extension _StringGuts { @_alwaysEmitIntoClient // Swift 5.7 internal func markEncoding(_ i: String.Index) -> String.Index { - if _slowPath(isForeign) { - // FIXME: Instead of having an opaque path here, we should define the same - // encoding flags in StringObject and pick them up from there. The flags - // can be initialized at the time the foreign string is created. - guard - #available(macOS 9999, iOS 9999, watchOS 9999, tvOS 9999, *) // SwiftStdlib 5.7 - else { - // We know all foreign strings were UTF-16 in releases < 5.7 - return i._knownUTF16 - } - return _foreignMarkEncoding(i) + // In this inlinable function, we cannot assume that all foreign strings are + // UTF-16 encoded, as this code may run on a future stdlib that may have + // introduced other foreign forms. + if #available(macOS 9999, iOS 9999, watchOS 9999, tvOS 9999, *) { // SwiftStdlib 5.7 + // With a >=5.7 stdlib, we can rely on `isKnownUTF16` to contain the truth. + return _object.isKnownUTF16 ? i._knownUTF16 : i._knownUTF8 } - return i._knownUTF8 + // We know that in stdlibs 5.0..<5.7, all foreign strings were UTF-16, + // so we can use `isForeign` to determine the encoding. + return isForeign ? 
i._knownUTF16 : i._knownUTF8 } - @_effects(readnone) - @available(SwiftStdlib 5.7, *) - @usableFromInline - internal func _foreignMarkEncoding(_ i: String.Index) -> String.Index { - // Currently foreign indices always have UTF-16 offsets. - i._knownUTF16 + @inline(__always) + internal func internalMarkEncoding(_ i: String.Index) -> String.Index { + // This code is behind a resilience boundary, so it always runs on a >=5.7 + // stdlib. Note though that it doesn't match the 5.7+ case in the inlinable + // version above! + // + // We know that in this version of the stdlib, foreign strings happen to + // always be UTF-16 encoded (like they were between 5.0 and 5.6), and + // looking at `isForeign` instead of `isKnownUTF16` may allow the stdlib's + // internal code to be better optimized -- so let's do that. + isForeign ? i._knownUTF16 : i._knownUTF8 } + /// Returns true if the encoding of the given index isn't known to be in + /// conflict with this string's encoding. + /// + /// If the index or the string was created by code that was built on stdlibs + /// below 5.7, then this check may incorrectly return true on a mismatching + /// index, but it is guaranteed to never incorrectly return false. If all + /// loaded binaries were built in 5.7+, then this method is guaranteed to + /// always return the correct value. internal func hasMatchingEncoding(_ i: String.Index) -> Bool { (isForeign && i._canBeUTF16) || (!isForeign && i._canBeUTF8) } @@ -339,14 +354,14 @@ extension _StringGuts { /// not set the flags that this method relies on. However, false positives /// cannot happen: if this method detects a mismatch, then it is guaranteed to /// be a real one. - @_alwaysEmitIntoClient + @_alwaysEmitIntoClient // FIXME(lorentey): Should this remain internal? 
@inline(__always) internal func ensureMatchingEncoding(_ i: String.Index) -> String.Index { if _fastPath(!isForeign && i._canBeUTF8) { return i } return _slowEnsureMatchingEncoding(i) } - @_alwaysEmitIntoClient + @_alwaysEmitIntoClient // FIXME(lorentey): Should this remain internal? internal func _slowEnsureMatchingEncoding(_ i: String.Index) -> String.Index { _internalInvariant(isForeign || !i._canBeUTF8) if isForeign { diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index bd66639723e17..92134a76b6ab2 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -178,7 +178,7 @@ extension Substring: StringProtocol { _precondition(i < endIndex && i >= startIndex, "Substring index is out of bounds") let r = _uncheckedIndex(after: _slice.base._guts.scalarAlign(i)) - return _slice.base._guts.markEncoding(r) + return _slice.base._guts.internalMarkEncoding(r) } /// A version of `index(after:)` that assumes that the given index: @@ -226,7 +226,7 @@ extension Substring: StringProtocol { _precondition(i <= endIndex && i > startIndex, "Substring index is out of bounds") let r = _uncheckedIndex(before: _slice.base._guts.scalarAlign(i)) - return _slice.base._guts.markEncoding(r) + return _slice.base._guts.internalMarkEncoding(r) } /// A version of `index(before:)` that assumes that the given index: @@ -282,7 +282,7 @@ extension Substring: StringProtocol { i = _uncheckedIndex(before: i) } } - return _slice.base._guts.markEncoding(i) + return _slice.base._guts.internalMarkEncoding(i) } public func index( @@ -324,7 +324,7 @@ extension Substring: StringProtocol { } guard limit > start || i >= limit else { return nil } } - return _slice.base._guts.markEncoding(i) + return _slice.base._guts.internalMarkEncoding(i) } public func distance(from start: Index, to end: Index) -> Int { From a44997eeea6a75e5c5ed6dae8ae00d8f3ab09a01 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Tue, 1 Mar 2022 23:27:40 -0800 Subject: 
[PATCH 04/83] [stdlib] Factor scalar-aligned String index validation out into a set of common routines There are three flavors, corresponding to i < endIndex, i <= endIndex, and range containment checks. Additionally, we have separate variants for index validation in substrings. --- stdlib/public/core/StringCharacterView.swift | 52 ++-- stdlib/public/core/StringGuts.swift | 148 +++++++++++- stdlib/public/core/StringObject.swift | 8 + .../StringRangeReplaceableCollection.swift | 23 +- .../public/core/StringUnicodeScalarView.swift | 8 + stdlib/public/core/Substring.swift | 223 +++++++++++------- 6 files changed, 336 insertions(+), 126 deletions(-) diff --git a/stdlib/public/core/StringCharacterView.swift b/stdlib/public/core/StringCharacterView.swift index 8e14a8c3d83e7..63dc9a0e69846 100644 --- a/stdlib/public/core/StringCharacterView.swift +++ b/stdlib/public/core/StringCharacterView.swift @@ -49,9 +49,8 @@ extension String: BidirectionalCollection { /// `endIndex`. /// - Returns: The index value immediately after `i`. public func index(after i: Index) -> Index { - let i = _guts.ensureMatchingEncoding(i) - _precondition(i < endIndex, "String index is out of bounds") - let r = _uncheckedIndex(after: _guts.scalarAlign(i)) + let i = _guts.validateScalarIndex(i) + let r = _uncheckedIndex(after: i) return _guts.internalMarkEncoding(r) } @@ -85,14 +84,10 @@ extension String: BidirectionalCollection { /// `startIndex`. /// - Returns: The index value immediately before `i`. public func index(before i: Index) -> Index { - let i = _guts.ensureMatchingEncoding(i) - - // Note: bounds checking in `index(before:)` is tricky as scalar aligning an - // index may need to access storage, but it may also move it closer towards - // the `startIndex`. Therefore, we must check against the `endIndex` before - // aligning, but we need to delay the `i > startIndex` check until after. 
- _precondition(i <= endIndex, "String index is out of bounds") - let i = _guts.scalarAlign(i) + let i = _guts.validateInclusiveScalarIndex(i) + // Note: Scalar aligning an index may move it closer towards the + // `startIndex`, so the `i > startIndex` check needs to come after the + // `validateInclusiveScalarIndex` call. _precondition(i > startIndex, "String index is out of bounds") let r = _uncheckedIndex(before: _guts.scalarAlign(i)) @@ -147,10 +142,7 @@ extension String: BidirectionalCollection { // TODO: known-ASCII and single-scalar-grapheme fast path, etc. - var i = _guts.ensureMatchingEncoding(i) - _precondition(i >= startIndex && i <= endIndex, - "String index is out of bounds") - i = _guts.scalarAlign(i) + var i = _guts.validateInclusiveScalarIndex(i) if distance >= 0 { for _ in stride(from: 0, to: distance, by: 1) { @@ -222,10 +214,7 @@ extension String: BidirectionalCollection { // exactly matches the documentation above. let limit = _guts.ensureMatchingEncoding(limit) - var i = _guts.ensureMatchingEncoding(i) - _precondition(i >= startIndex && i <= endIndex, - "String index is out of bounds") - i = _guts.scalarAlign(i) + var i = _guts.validateInclusiveScalarIndex(i) let start = i if distance >= 0 { @@ -264,22 +253,14 @@ extension String: BidirectionalCollection { // grapheme breaks -- swapping `start` and `end` may change the magnitude of // the result. - var start = _guts.ensureMatchingEncoding(start) - var end = _guts.ensureMatchingEncoding(end) - - _precondition( - start >= startIndex && start <= endIndex && - end >= startIndex && end <= endIndex, - "String index is out of bounds") - - start = _guts.scalarAlign(start) - end = _guts.scalarAlign(end) + let start = _guts.validateInclusiveScalarIndex(start) + let end = _guts.validateInclusiveScalarIndex(end) // TODO: known-ASCII and single-scalar-grapheme fast path, etc. 
- // Per SE-0180, `start` and `end` are allowed to fall in between grapheme - // breaks, in which case this function must still terminate without trapping - // and return a result that makes sense. + // Per SE-0180, `start` and `end` are allowed to fall in between Character + // boundaries, in which case this function must still terminate without + // trapping and return a result that makes sense. var i = start var count = 0 @@ -313,7 +294,7 @@ extension String: BidirectionalCollection { /// /// - Parameter i: A valid index of the string. `i` must be less than the /// string's end index. - @inlinable @inline(__always) // FIXME(lorentey): Consider removing these. If + @inlinable @inline(__always) // TODO(lorentey): Consider removing these. If // `index(after:)` isn't inlinable, does it // really matter if this one is? (Potential // _guts-related optimizations notwithstanding.) @@ -321,11 +302,8 @@ extension String: BidirectionalCollection { // new additions to be _aEIC, even though they // ought to be internal. public subscript(i: Index) -> Character { - var i = _guts.ensureMatchingEncoding(i) - _boundsCheck(i) - i = _guts.scalarAlign(i) + let i = _guts.validateScalarIndex(i) let distance = _characterStride(startingAt: i) - return _guts.errorCorrectedCharacter( startingAt: i._encodedOffset, endingAt: i._encodedOffset &+ distance) } diff --git a/stdlib/public/core/StringGuts.swift b/stdlib/public/core/StringGuts.swift index fc5dcc7bc3166..18b28864613df 100644 --- a/stdlib/public/core/StringGuts.swift +++ b/stdlib/public/core/StringGuts.swift @@ -309,6 +309,17 @@ func _isSwiftStdlib_5_7() -> Bool { // Encoding extension _StringGuts { + /// Returns whether this string is known to use UTF-16 code units. + /// + /// This always returns a value corresponding to the string's actual encoding + /// on stdlib versions >=5.7. + /// + /// Standard Library versions <=5.6 did not set the corresponding flag, so + /// this property always returns false. 
+ @_alwaysEmitIntoClient + @inline(__always) + internal var isKnownUTF16: Bool { _object.isKnownUTF16 } + @_alwaysEmitIntoClient // Swift 5.7 internal func markEncoding(_ i: String.Index) -> String.Index { // In this inlinable function, we cannot assume that all foreign strings are @@ -316,7 +327,7 @@ extension _StringGuts { // introduced other foreign forms. if #available(macOS 9999, iOS 9999, watchOS 9999, tvOS 9999, *) { // SwiftStdlib 5.7 // With a >=5.7 stdlib, we can rely on `isKnownUTF16` to contain the truth. - return _object.isKnownUTF16 ? i._knownUTF16 : i._knownUTF8 + return isKnownUTF16 ? i._knownUTF16 : i._knownUTF8 } // We know that in stdlibs 5.0..<5.7, all foreign strings were UTF-16, // so we can use `isForeign` to determine the encoding. @@ -354,14 +365,14 @@ extension _StringGuts { /// not set the flags that this method relies on. However, false positives /// cannot happen: if this method detects a mismatch, then it is guaranteed to /// be a real one. - @_alwaysEmitIntoClient // FIXME(lorentey): Should this remain internal? + @_alwaysEmitIntoClient // TODO(lorentey): Should this remain internal? @inline(__always) internal func ensureMatchingEncoding(_ i: String.Index) -> String.Index { if _fastPath(!isForeign && i._canBeUTF8) { return i } return _slowEnsureMatchingEncoding(i) } - @_alwaysEmitIntoClient // FIXME(lorentey): Should this remain internal? + @_alwaysEmitIntoClient // TODO(lorentey): Should this remain internal? internal func _slowEnsureMatchingEncoding(_ i: String.Index) -> String.Index { _internalInvariant(isForeign || !i._canBeUTF8) if isForeign { @@ -383,7 +394,7 @@ extension _StringGuts { // This trap can never trigger on OSes that have stdlibs <= 5.6, because // those versions never set the `isKnownUTF16` flag in `_StringObject`. 
// - _precondition(!_object.isKnownUTF16 || i._canBeUTF16, + _precondition(!isKnownUTF16 || i._canBeUTF16, "Invalid string index") return i } @@ -407,6 +418,135 @@ extension _StringGuts { } } +// Index validation +extension _StringGuts { + /// Validate `i` and adjust its position toward the start, returning the + /// resulting index or trapping as appropriate. If this function returns, then + /// the returned value + /// + /// - has an encoding that matches this string, + /// - is within the bounds of this string, and + /// - is aligned on a scalar boundary. + @_alwaysEmitIntoClient + internal func validateScalarIndex(_ i: String.Index) -> String.Index { + let i = ensureMatchingEncoding(i) + _precondition(i._encodedOffset < count, "String index is out of bounds") + return scalarAlign(i) + } + + /// Validate `i` and adjust its position toward the start, returning the + /// resulting index or trapping as appropriate. If this function returns, then + /// the returned value + /// + /// - has an encoding that matches this string, + /// - is within `start ..< end`, and + /// - is aligned on a scalar boundary. + @_alwaysEmitIntoClient + internal func validateScalarIndex( + _ i: String.Index, + from start: String.Index, + to end: String.Index + ) -> String.Index { + _internalInvariant(start <= end && end <= endIndex) + + let i = ensureMatchingEncoding(i) + _precondition(i >= start && i < end, "Substring index is out of bounds") + return scalarAlign(i) + } + + /// Validate `i` and adjust its position toward the start, returning the + /// resulting index or trapping as appropriate. If this function returns, then + /// the returned value + /// + /// - has an encoding that matches this string, + /// - is within the bounds of this string (including the `endIndex`), and + /// - is aligned on a scalar boundary. 
+ @_alwaysEmitIntoClient + internal func validateInclusiveScalarIndex( + _ i: String.Index + ) -> String.Index { + let i = ensureMatchingEncoding(i) + _precondition(i._encodedOffset <= count, "String index is out of bounds") + return scalarAlign(i) + } + + /// Validate `i` and adjust its position toward the start, returning the + /// resulting index or trapping as appropriate. If this function returns, then + /// the returned value + /// + /// - has an encoding that matches this string, + /// - is within `start ... end`, and + /// - is aligned on a scalar boundary. + @_alwaysEmitIntoClient + internal func validateInclusiveScalarIndex( + _ i: String.Index, + from start: String.Index, + to end: String.Index + ) -> String.Index { + _internalInvariant(start <= end && end <= endIndex) + + let i = ensureMatchingEncoding(i) + _precondition(i >= start && i <= end, "Substring index is out of bounds") + return scalarAlign(i) + } + + /// Validate `range` and adjust the position of its bounds, returning the + /// resulting range or trapping as appropriate. If this function returns, then + /// the bounds of the returned value + /// + /// - have an encoding that matches this string, + /// - are within the bounds of this string, and + /// - are aligned on a scalar boundary. + @_alwaysEmitIntoClient + internal func validateScalarRange( + _ range: Range + ) -> Range { + var upper = ensureMatchingEncoding(range.upperBound) + var lower = ensureMatchingEncoding(range.lowerBound) + + // Note: if only `lower` was miscoded, then the range invariant `lower <= + // upper` may no longer hold after the above conversions, so we need to + // re-check it here. 
+ _precondition(upper._encodedOffset <= count && lower <= upper, + "String index range is out of bounds") + + upper = scalarAlign(upper) + lower = scalarAlign(lower) + + return Range(_uncheckedBounds: (lower, upper)) + } + + /// Validate `range` and adjust the position of its bounds, returning the + /// resulting range or trapping as appropriate. If this function returns, then + /// the bounds of the returned value + /// + /// - have an encoding that matches this string, + /// - are within `start ..< end`, and + /// - are aligned on a scalar boundary. + @_alwaysEmitIntoClient + internal func validateScalarRange( + _ range: Range, + from start: String.Index, + to end: String.Index + ) -> Range { + _internalInvariant(start <= end && end <= endIndex) + + var upper = ensureMatchingEncoding(range.upperBound) + var lower = ensureMatchingEncoding(range.lowerBound) + + // Note: if only `lower` was miscoded, then the range invariant `lower <= + // upper` may no longer hold after the above conversions, so we need to + // re-check it here. + _precondition(upper <= end && lower >= start && lower <= upper, + "Substring index range is out of bounds") + + upper = scalarAlign(upper) + lower = scalarAlign(lower) + + return Range(_uncheckedBounds: (lower, upper)) + } +} + // Old SPI(corelibs-foundation) extension _StringGuts { @available(*, deprecated) diff --git a/stdlib/public/core/StringObject.swift b/stdlib/public/core/StringObject.swift index 21f7de961d806..e49cbb4a3bbc2 100644 --- a/stdlib/public/core/StringObject.swift +++ b/stdlib/public/core/StringObject.swift @@ -839,6 +839,14 @@ extension _StringObject.CountAndFlags { internal var isTailAllocated: Bool { return 0 != _storage & _StringObject.CountAndFlags.isTailAllocatedMask } + + /// Returns whether this string is known to use UTF-16 code units. + /// + /// This always returns a value corresponding to the string's actual encoding + /// on stdlib versions >=5.7. 
+ /// + /// Standard Library versions <=5.6 did not set the corresponding flag, so + /// this property always returns false. @_alwaysEmitIntoClient @inline(__always) // Swift 5.7 internal var isKnownUTF16: Bool { diff --git a/stdlib/public/core/StringRangeReplaceableCollection.swift b/stdlib/public/core/StringRangeReplaceableCollection.swift index 65196daaf2507..604571bdb902a 100644 --- a/stdlib/public/core/StringRangeReplaceableCollection.swift +++ b/stdlib/public/core/StringRangeReplaceableCollection.swift @@ -195,10 +195,11 @@ extension String: RangeReplaceableCollection { @_specialize(where C == Substring) @_specialize(where C == Array) public mutating func replaceSubrange( - _ bounds: Range, + _ subrange: Range, with newElements: C ) where C: Collection, C.Iterator.Element == Character { - _guts.replaceSubrange(bounds, with: newElements) + let subrange = _guts.validateScalarRange(subrange) + _guts.replaceSubrange(subrange, with: newElements) } /// Inserts a new character at the specified position. @@ -213,7 +214,9 @@ extension String: RangeReplaceableCollection { /// /// - Complexity: O(*n*), where *n* is the length of the string. public mutating func insert(_ newElement: Character, at i: Index) { - self.replaceSubrange(i..( contentsOf newElements: S, at i: Index ) where S.Element == Character { - self.replaceSubrange(i.. Character { - let result = self[i] - _guts.remove(from: i, to: self.index(after: i)) + let i = _guts.validateScalarIndex(i) + let stride = _characterStride(startingAt: i) + let j = Index(_encodedOffset: i._encodedOffset &+ stride)._scalarAligned + + let result = _guts.errorCorrectedCharacter( + startingAt: i._encodedOffset, endingAt: j._encodedOffset) + _guts.remove(from: i, to: j) return result } @@ -275,6 +285,7 @@ extension String: RangeReplaceableCollection { /// - Parameter bounds: The range of the elements to remove. The upper and /// lower bounds of `bounds` must be valid indices of the string. 
public mutating func removeSubrange(_ bounds: Range) { + let bounds = _guts.validateScalarRange(bounds) _guts.remove(from: bounds.lowerBound, to: bounds.upperBound) } diff --git a/stdlib/public/core/StringUnicodeScalarView.swift b/stdlib/public/core/StringUnicodeScalarView.swift index b98b180d34188..980976bad73c8 100644 --- a/stdlib/public/core/StringUnicodeScalarView.swift +++ b/stdlib/public/core/StringUnicodeScalarView.swift @@ -108,6 +108,7 @@ extension String.UnicodeScalarView: BidirectionalCollection { public func index(after i: Index) -> Index { // TODO(String performance): isASCII fast-path + // TODO(lorentey): Review index validation _precondition(i < endIndex, "String index is out of bounds") let i = _guts.scalarAlign(i) @@ -121,6 +122,7 @@ extension String.UnicodeScalarView: BidirectionalCollection { @_alwaysEmitIntoClient // Swift 5.1 bug fix public func distance(from start: Index, to end: Index) -> Int { + // TODO(lorentey): Review index validation return _distance(from: _guts.scalarAlign(start), to: _guts.scalarAlign(end)) } @@ -129,6 +131,7 @@ extension String.UnicodeScalarView: BidirectionalCollection { /// - Precondition: The previous location exists. @inlinable @inline(__always) public func index(before i: Index) -> Index { + // TODO(lorentey): Review index validation // TODO(String performance): isASCII fast-path // Note: bounds checking in `index(before:)` is tricky as scalar aligning an @@ -168,6 +171,7 @@ extension String.UnicodeScalarView: BidirectionalCollection { /// must be less than the view's end index. 
@inlinable @inline(__always) public subscript(position: Index) -> Unicode.Scalar { + // TODO(lorentey): Review index validation String(_guts)._boundsCheck(position) let i = _guts.scalarAlign(position) return _guts.errorCorrectedScalar(startingAt: i._encodedOffset).0 @@ -314,6 +318,7 @@ extension String.UnicodeScalarView: RangeReplaceableCollection { _ bounds: Range, with newElements: C ) where C: Collection, C.Element == Unicode.Scalar { + // TODO(lorentey): Review index validation // TODO(String performance): Skip extra String and Array allocation let utf8Replacement = newElements.flatMap { String($0).utf8 } @@ -358,6 +363,7 @@ extension String.UnicodeScalarIndex { _ sourcePosition: String.Index, within unicodeScalars: String.UnicodeScalarView ) { + // TODO(lorentey): Review index validation guard unicodeScalars._guts.isOnUnicodeScalarBoundary(sourcePosition) else { return nil } @@ -385,6 +391,7 @@ extension String.UnicodeScalarIndex { /// an attempt to convert the position of a UTF-8 continuation byte /// returns `nil`. public func samePosition(in characters: String) -> String.Index? { + // TODO(lorentey): Review index validation return String.Index(self, within: characters) } } @@ -414,6 +421,7 @@ extension String.UnicodeScalarView { @available(swift, introduced: 4) public subscript(r: Range) -> String.UnicodeScalarView.SubSequence { + // TODO(lorentey): Review index validation _failEarlyRangeCheck(r, bounds: startIndex.. 
- @usableFromInline - internal init(_ slice: Slice) { - let _guts = slice._base._guts + @inline(__always) + internal init(_unchecked slice: Slice) { + _internalInvariant(slice.endIndex <= slice._base._guts.endIndex) _internalInvariant( - _guts.hasMatchingEncoding(slice.startIndex) && - _guts.hasMatchingEncoding(slice.endIndex)) + slice._base._guts.hasMatchingEncoding(slice.startIndex) && + slice._base._guts.hasMatchingEncoding(slice.endIndex)) _internalInvariant( - slice.startIndex >= _guts.startIndex && slice.endIndex <= _guts.endIndex) - - let start = slice.base._guts.scalarAlign(slice.startIndex) - let end = slice.base._guts.scalarAlign(slice.endIndex) + slice.startIndex._isScalarAligned && slice.endIndex._isScalarAligned) + self._slice = slice + _invariantCheck() + } - self._slice = Slice( - base: slice.base, - bounds: Range(_uncheckedBounds: (start, end))) + @usableFromInline + @available(*, deprecated) // Use `init(_unchecked:)` in new code. + internal init(_ slice: Slice) { + let r = slice.base._guts.validateScalarRange( + slice.startIndex ..< slice.endIndex) + self._slice = Slice(base: slice.base, bounds: r) _invariantCheck() } @@ -123,7 +126,7 @@ public struct Substring: Sendable { /// Creates an empty substring. 
@inlinable @inline(__always) public init() { - self.init(Slice()) + self._slice = Slice() } } @@ -146,6 +149,10 @@ extension Substring { #else @usableFromInline @inline(never) @_effects(releasenone) internal func _invariantCheck() { + _internalInvariant(_slice.endIndex <= base._guts.endIndex) + _internalInvariant( + base._guts.hasMatchingEncoding(_slice.startIndex) && + base._guts.hasMatchingEncoding(_slice.endIndex)) // Indices are always scalar aligned _internalInvariant( _slice.startIndex == base._guts.scalarAlign(_slice.startIndex) && @@ -156,6 +163,29 @@ extension Substring { #endif // INTERNAL_CHECKS_ENABLED } +extension Substring { + @inline(__always) + internal func _validateScalarIndex(_ i: String.Index) -> String.Index { + _slice._base._guts.validateScalarIndex(i, from: startIndex, to: endIndex) + } + + @inline(__always) + internal func _validateInclusiveScalarIndex( + _ i: String.Index + ) -> String.Index { + _slice._base._guts.validateInclusiveScalarIndex( + i, from: startIndex, to: endIndex) + } + + @inline(__always) + internal func _validateScalarRange( + _ range: Range + ) -> Range { + _slice._base._guts.validateScalarRange( + range, from: startIndex, to: endIndex) + } +} + extension Substring: StringProtocol { public typealias Index = String.Index public typealias SubSequence = Substring @@ -174,10 +204,8 @@ extension Substring: StringProtocol { // leads to Collection conformance issues when the `Substring`'s bounds do // not fall on grapheme boundaries in `base`. 
- let i = _slice.base._guts.ensureMatchingEncoding(i) - _precondition(i < endIndex && i >= startIndex, - "Substring index is out of bounds") - let r = _uncheckedIndex(after: _slice.base._guts.scalarAlign(i)) + let i = _validateScalarIndex(i) + let r = _uncheckedIndex(after: i) return _slice.base._guts.internalMarkEncoding(r) } @@ -222,10 +250,13 @@ extension Substring: StringProtocol { // leads to Collection conformance issues when the `Substring`'s bounds do // not fall on grapheme boundaries in `base`. - let i = _slice.base._guts.ensureMatchingEncoding(i) - _precondition(i <= endIndex && i > startIndex, - "Substring index is out of bounds") - let r = _uncheckedIndex(before: _slice.base._guts.scalarAlign(i)) + let i = _validateInclusiveScalarIndex(i) + // Note: Scalar aligning an index may move it closer towards the + // `startIndex`, so the `i > startIndex` check needs to come after the + // `validateScalarIndex` call. + _precondition(i > startIndex, "Substring index is out of bounds") + + let r = _uncheckedIndex(before: i) return _slice.base._guts.internalMarkEncoding(r) } @@ -242,7 +273,6 @@ extension Substring: StringProtocol { _internalInvariant(i._isScalarAligned) // TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc. - let i = _slice.base._guts.scalarAlign(i) let stride = _characterStride(endingAt: i) let priorOffset = i._encodedOffset &- stride _internalInvariant(priorOffset >= startIndex._encodedOffset) @@ -266,11 +296,8 @@ extension Substring: StringProtocol { // substring. This leads to Collection conformance issues when the // `Substring`'s bounds do not fall on grapheme boundaries in `base`. - var i = _slice.base._guts.ensureMatchingEncoding(i) - _precondition(i >= startIndex && i <= endIndex, - "String index is out of bounds") - i = _slice.base._guts.scalarAlign(i) // TODO: known-ASCII and single-scalar-grapheme fast path, etc. 
+ var i = _validateInclusiveScalarIndex(i) if distance >= 0 { for _ in stride(from: 0, to: distance, by: 1) { _precondition(i < endIndex, "String index is out of bounds") @@ -303,11 +330,7 @@ extension Substring: StringProtocol { // exactly matches the documentation. let limit = _slice.base._guts.ensureMatchingEncoding(limit) - var i = _slice.base._guts.ensureMatchingEncoding(i) - _precondition(i >= startIndex && i <= endIndex, - "String index is out of bounds") - i = _slice.base._guts.scalarAlign(i) - + var i = _validateInclusiveScalarIndex(i) let start = i if distance >= 0 { for _ in stride(from: 0, to: distance, by: 1) { @@ -340,22 +363,14 @@ extension Substring: StringProtocol { // grapheme breaks -- swapping `start` and `end` may change the magnitude of // the result. - var start = _slice.base._guts.ensureMatchingEncoding(start) - var end = _slice.base._guts.ensureMatchingEncoding(end) - - _precondition( - start >= startIndex && start <= endIndex && - end >= startIndex && end <= endIndex, - "String index is out of bounds") - - start = _slice.base._guts.scalarAlign(start) - end = _slice.base._guts.scalarAlign(end) + let start = _validateInclusiveScalarIndex(start) + let end = _validateInclusiveScalarIndex(end) // TODO: known-ASCII and single-scalar-grapheme fast path, etc. - // Per SE-0180, `start` and `end` are allowed to fall in between grapheme - // breaks, in which case this function must still terminate without trapping - // and return a result that makes sense. + // Per SE-0180, `start` and `end` are allowed to fall in between Character + // boundaries, in which case this function must still terminate without + // trapping and return a result that makes sense. 
var i = start var count = 0 @@ -375,33 +390,34 @@ extension Substring: StringProtocol { } public subscript(i: Index) -> Character { - var i = _slice.base._guts.ensureMatchingEncoding(i) - _precondition(i >= startIndex && i < endIndex, - "Substring index is out of bounds") - i = _slice.base._guts.scalarAlign(i) + let i = _validateScalarIndex(i) let distance = _characterStride(startingAt: i) return _slice.base._guts.errorCorrectedCharacter( startingAt: i._encodedOffset, endingAt: i._encodedOffset &+ distance) } public mutating func replaceSubrange( - _ bounds: Range, + _ subrange: Range, with newElements: C ) where C: Collection, C.Iterator.Element == Iterator.Element { - _replaceSubrange(bounds, with: newElements) + _replaceSubrange(subrange, with: newElements) } public mutating func replaceSubrange( - _ bounds: Range, with newElements: Substring + _ subrange: Range, with newElements: Substring ) { - _replaceSubrange(bounds, with: newElements) + _replaceSubrange(subrange, with: newElements) } - @inline(__always) internal mutating func _replaceSubrange( - _ bounds: Range, with newElements: C + _ subrange: Range, with newElements: C ) where C.Element == Element { - _slice.replaceSubrange(bounds, with: newElements) + let subrange = _validateScalarRange(subrange) + // TODO(lorentey): We can't delegate to Slice here; it doesn't handle + // subscalar indices or the case where `newElements` changes character + // breaks in the surrounding context. The substring's + // `startIndex`/`endIndex` may get broken. 
+ _slice.replaceSubrange(subrange, with: newElements) } /// Creates a string from the given Unicode code units in the specified @@ -495,17 +511,20 @@ extension Substring: StringProtocol { } extension Substring { + // TODO(lorentey): Rename to proper terminology internal var _knownToStartOnGraphemeBreak: Bool { startIndex._encodedOffset == 0 || startIndex.characterStride != nil } + // TODO(lorentey): Rename to proper terminology internal var _knownToEndOnGraphemeBreak: Bool { endIndex == _slice.base.endIndex || endIndex.characterStride != nil } internal var _encodedOffsetRange: Range { - Range(_uncheckedBounds: ( - _slice._startIndex._encodedOffset, _slice._endIndex._encodedOffset)) + let lower = _slice._startIndex._encodedOffset + let upper = _slice._endIndex._encodedOffset + return Range(_uncheckedBounds: (lower, upper)) } internal func _characterStride(startingAt i: Index) -> Int { @@ -556,7 +575,7 @@ extension Substring: CustomDebugStringConvertible { extension Substring: LosslessStringConvertible { public init(_ content: String) { let range = Range(_uncheckedBounds: (content.startIndex, content.endIndex)) - self.init(Slice(base: content, bounds: range)) + self.init(_unchecked: Slice(base: content, bounds: range)) } } @@ -602,11 +621,13 @@ extension Substring.UTF8View: BidirectionalCollection { @inlinable public func formIndex(after i: inout Index) { + // TODO(lorentey): Review index validation _slice.formIndex(after: &i) } @inlinable public func index(_ i: Index, offsetBy n: Int) -> Index { + // TODO(lorentey): Review index validation return _slice.index(i, offsetBy: n) } @@ -614,11 +635,13 @@ extension Substring.UTF8View: BidirectionalCollection { public func index( _ i: Index, offsetBy n: Int, limitedBy limit: Index ) -> Index? 
{ + // TODO(lorentey): Review index validation return _slice.index(i, offsetBy: n, limitedBy: limit) } @inlinable public func distance(from start: Index, to end: Index) -> Int { + // TODO(lorentey): Review index validation return _slice.distance(from: start, to: end) } @@ -632,6 +655,7 @@ extension Substring.UTF8View: BidirectionalCollection { @inlinable public func _failEarlyRangeCheck(_ index: Index, bounds: Range) { + // TODO(lorentey): Review index validation _slice._failEarlyRangeCheck(index, bounds: bounds) } @@ -639,19 +663,25 @@ extension Substring.UTF8View: BidirectionalCollection { public func _failEarlyRangeCheck( _ range: Range, bounds: Range ) { + // TODO(lorentey): Review index validation _slice._failEarlyRangeCheck(range, bounds: bounds) } @inlinable - public func index(before i: Index) -> Index { return _slice.index(before: i) } + public func index(before i: Index) -> Index { + // TODO(lorentey): Review index validation + return _slice.index(before: i) + } @inlinable public func formIndex(before i: inout Index) { + // TODO(lorentey): Review index validation _slice.formIndex(before: &i) } @inlinable public subscript(r: Range) -> Substring.UTF8View { + // TODO(lorentey): Review index validation // FIXME(strings): tests. _precondition(r.lowerBound >= startIndex && r.upperBound <= endIndex, "UTF8View index range out of bounds") @@ -697,6 +727,7 @@ extension String { self = String(Substring(codeUnits)) } } + extension Substring { @frozen public struct UTF16View: Sendable { @@ -706,6 +737,7 @@ extension Substring { /// Creates an instance that slices `base` at `_bounds`. 
@inlinable internal init(_ base: String.UTF16View, _bounds: Range) { + // TODO(lorentey): Review index validation _slice = Slice( base: String(base._guts).utf16, bounds: _bounds) @@ -729,21 +761,29 @@ extension Substring.UTF16View: BidirectionalCollection { public var endIndex: Index { return _slice.endIndex } @inlinable - public subscript(index: Index) -> Element { return _slice[index] } + public subscript(index: Index) -> Element { + // TODO(lorentey): Review index validation + return _slice[index] + } @inlinable public var indices: Indices { return _slice.indices } @inlinable - public func index(after i: Index) -> Index { return _slice.index(after: i) } + public func index(after i: Index) -> Index { + // TODO(lorentey): Review index validation + return _slice.index(after: i) + } @inlinable public func formIndex(after i: inout Index) { + // TODO(lorentey): Review index validation _slice.formIndex(after: &i) } @inlinable public func index(_ i: Index, offsetBy n: Int) -> Index { + // TODO(lorentey): Review index validation return _slice.index(i, offsetBy: n) } @@ -751,16 +791,19 @@ extension Substring.UTF16View: BidirectionalCollection { public func index( _ i: Index, offsetBy n: Int, limitedBy limit: Index ) -> Index? 
{ + // TODO(lorentey): Review index validation return _slice.index(i, offsetBy: n, limitedBy: limit) } @inlinable public func distance(from start: Index, to end: Index) -> Int { + // TODO(lorentey): Review index validation return _slice.distance(from: start, to: end) } @inlinable public func _failEarlyRangeCheck(_ index: Index, bounds: Range) { + // TODO(lorentey): Review index validation _slice._failEarlyRangeCheck(index, bounds: bounds) } @@ -768,19 +811,25 @@ extension Substring.UTF16View: BidirectionalCollection { public func _failEarlyRangeCheck( _ range: Range, bounds: Range ) { + // TODO(lorentey): Review index validation _slice._failEarlyRangeCheck(range, bounds: bounds) } @inlinable - public func index(before i: Index) -> Index { return _slice.index(before: i) } + public func index(before i: Index) -> Index { + // TODO(lorentey): Review index validation + return _slice.index(before: i) + } @inlinable public func formIndex(before i: inout Index) { + // TODO(lorentey): Review index validation _slice.formIndex(before: &i) } @inlinable public subscript(r: Range) -> Substring.UTF16View { + // TODO(lorentey): Review index validation return Substring.UTF16View(_slice.base, _bounds: r) } } @@ -857,21 +906,32 @@ extension Substring.UnicodeScalarView: BidirectionalCollection { public var endIndex: Index { return _slice.endIndex } @inlinable - public subscript(index: Index) -> Element { return _slice[index] } + public subscript(index: Index) -> Element { + // TODO(lorentey): Review index validation + return _slice[index] + } @inlinable - public var indices: Indices { return _slice.indices } + public var indices: Indices { + // TODO(lorentey): Review index validation + return _slice.indices + } @inlinable - public func index(after i: Index) -> Index { return _slice.index(after: i) } + public func index(after i: Index) -> Index { + // TODO(lorentey): Review index validation + return _slice.index(after: i) + } @inlinable public func formIndex(after i: inout Index) { + 
// TODO(lorentey): Review index validation _slice.formIndex(after: &i) } @inlinable public func index(_ i: Index, offsetBy n: Int) -> Index { + // TODO(lorentey): Review index validation return _slice.index(i, offsetBy: n) } @@ -879,16 +939,19 @@ extension Substring.UnicodeScalarView: BidirectionalCollection { public func index( _ i: Index, offsetBy n: Int, limitedBy limit: Index ) -> Index? { + // TODO(lorentey): Review index validation return _slice.index(i, offsetBy: n, limitedBy: limit) } @inlinable public func distance(from start: Index, to end: Index) -> Int { + // TODO(lorentey): Review index validation return _slice.distance(from: start, to: end) } @inlinable public func _failEarlyRangeCheck(_ index: Index, bounds: Range) { + // TODO(lorentey): Review index validation _slice._failEarlyRangeCheck(index, bounds: bounds) } @@ -896,19 +959,25 @@ extension Substring.UnicodeScalarView: BidirectionalCollection { public func _failEarlyRangeCheck( _ range: Range, bounds: Range ) { + // TODO(lorentey): Review index validation _slice._failEarlyRangeCheck(range, bounds: bounds) } @inlinable - public func index(before i: Index) -> Index { return _slice.index(before: i) } + public func index(before i: Index) -> Index { + // TODO(lorentey): Review index validation + return _slice.index(before: i) + } @inlinable public func formIndex(before i: inout Index) { + // TODO(lorentey): Review index validation _slice.formIndex(before: &i) } @inlinable public subscript(r: Range) -> Substring.UnicodeScalarView { + // TODO(lorentey): Review index validation _failEarlyRangeCheck(r, bounds: startIndex..( - _ target: Range, with replacement: C + _ subrange: Range, with replacement: C ) where C.Element == Element { - _slice.replaceSubrange(target, with: replacement) + // TODO(lorentey): Review index validation + let subrange = _slice.base._guts.validateScalarRange( + subrange, from: startIndex, to: endIndex) + _slice.replaceSubrange(subrange, with: replacement) } } @@ -1037,22 +1109,15 
@@ extension Substring: ExpressibleByStringLiteral { extension String { @available(swift, introduced: 4) public subscript(r: Range) -> Substring { - let r = Range(_uncheckedBounds: ( - _guts.ensureMatchingEncoding(r.lowerBound), - _guts.ensureMatchingEncoding(r.upperBound))) - _boundsCheck(r) - return Substring(Slice(base: self, bounds: r)) + let r = _guts.validateScalarRange(r) + return Substring(_unchecked: Slice(base: self, bounds: r)) } } extension Substring { @available(swift, introduced: 4) public subscript(r: Range) -> Substring { - let r = Range(_uncheckedBounds: ( - _slice.base._guts.ensureMatchingEncoding(r.lowerBound), - _slice.base._guts.ensureMatchingEncoding(r.upperBound))) - _precondition(r.lowerBound >= startIndex && r.upperBound <= endIndex, - "Substring index range is out of bounds") - return Substring(Slice(base: _slice.base, bounds: r)) + let r = _validateScalarRange(r) + return Substring(_unchecked: Slice(base: base, bounds: r)) } } From 87073f2af839a3ec78e5c21137d7f85114fac6b2 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Wed, 2 Mar 2022 21:15:11 -0800 Subject: [PATCH 05/83] [stdlib] Substring.replaceSubrange: fix startIndex/endIndex adjustment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This used to forward to `Slice.replaceSubrange`, but that’s a generic algorithm that isn’t aware of the peculiarities of Unicode extended grapheme clusters, and it can be misled by unusual cases, like a substring or subrange whose bounds aren’t `Character`-aligned, or a replacement string that starts with a continuation scalar. 
--- .../core/StringGutsRangeReplaceable.swift | 37 ++++--- stdlib/public/core/Substring.swift | 100 +++++++++++++++++- 2 files changed, 117 insertions(+), 20 deletions(-) diff --git a/stdlib/public/core/StringGutsRangeReplaceable.swift b/stdlib/public/core/StringGutsRangeReplaceable.swift index 4b77b471f8ba1..1983c45f1ad51 100644 --- a/stdlib/public/core/StringGutsRangeReplaceable.swift +++ b/stdlib/public/core/StringGutsRangeReplaceable.swift @@ -299,21 +299,22 @@ extension _StringGuts { self = result._guts } + // - Returns: The encoded offset range of the replaced contents in the result. + @discardableResult internal mutating func replaceSubrange( _ bounds: Range, with newElements: C - ) where C: Collection, C.Iterator.Element == Character { + ) -> Range + where C: Collection, C.Iterator.Element == Character { if isUniqueNative { if let replStr = newElements as? String, replStr._guts.isFastUTF8 { - replStr._guts.withFastUTF8 { + return replStr._guts.withFastUTF8 { uniqueNativeReplaceSubrange( bounds, with: $0, isASCII: replStr._guts.isASCII) } - return } - uniqueNativeReplaceSubrange( + return uniqueNativeReplaceSubrange( bounds, with: newElements.lazy.flatMap { $0.utf8 }) - return } var result = String() @@ -324,16 +325,20 @@ extension _StringGuts { } let selfStr = String(self) result.append(contentsOf: selfStr[.., with codeUnits: UnsafeBufferPointer, isASCII: Bool - ) { + ) -> Range { let neededCapacity = bounds.lowerBound._encodedOffset + codeUnits.count + (self.count - bounds.upperBound._encodedOffset) @@ -342,17 +347,19 @@ extension _StringGuts { _internalInvariant(bounds.lowerBound.transcodedOffset == 0) _internalInvariant(bounds.upperBound.transcodedOffset == 0) - _object.nativeStorage.replace( - from: bounds.lowerBound._encodedOffset, - to: bounds.upperBound._encodedOffset, - with: codeUnits) + let start = bounds.lowerBound._encodedOffset + let end = bounds.upperBound._encodedOffset + _object.nativeStorage.replace(from: start, to: end, with: codeUnits) 
self = _StringGuts(_object.nativeStorage) + return Range(_uncheckedBounds: (start, start + codeUnits.count)) } + // - Returns: The encoded offset range of the replaced contents in the result. internal mutating func uniqueNativeReplaceSubrange( _ bounds: Range, with codeUnits: C - ) where C.Element == UInt8 { + ) -> Range + where C.Element == UInt8 { let replCount = codeUnits.count let neededCapacity = @@ -363,12 +370,12 @@ extension _StringGuts { _internalInvariant(bounds.lowerBound.transcodedOffset == 0) _internalInvariant(bounds.upperBound.transcodedOffset == 0) + let start = bounds.lowerBound._encodedOffset + let end = bounds.upperBound._encodedOffset _object.nativeStorage.replace( - from: bounds.lowerBound._encodedOffset, - to: bounds.upperBound._encodedOffset, - with: codeUnits, - replacementCount: replCount) + from: start, to: end, with: codeUnits, replacementCount: replCount) self = _StringGuts(_object.nativeStorage) + return Range(_uncheckedBounds: (start, start + replCount)) } } diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index 8406db6f22aa6..13380ca7d93a5 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -412,12 +412,102 @@ extension Substring: StringProtocol { internal mutating func _replaceSubrange( _ subrange: Range, with newElements: C ) where C.Element == Element { + defer { _invariantCheck() } let subrange = _validateScalarRange(subrange) - // TODO(lorentey): We can't delegate to Slice here; it doesn't handle - // subscalar indices or the case where `newElements` changes character - // breaks in the surrounding context. The substring's - // `startIndex`/`endIndex` may get broken. - _slice.replaceSubrange(subrange, with: newElements) + + // Replacing the range is easy -- we can just reuse `String`'s + // implementation. However, we must also update `startIndex` and `endIndex` + // to keep them valid & pointing to the same positions, which is somewhat + // tricky. 
+ // + // In Swift <=5.6, this used to forward to `Slice.replaceSubrange`, which + // does it by counting elements, i.e., `Character`s. Unfortunately, that is + // prone to return incorrect results in unusual cases, e.g. + // + // - when the substring or the given subrange doesn't start/end on a + // character boundary, or + // - when the beginning/end of the replacement string ends up getting + // merged with the Character preceding/following the replaced range. + // + // The best way to avoid problems in these cases is to lower index + // calculations to Unicode scalars (or below) -- in this implementation, we + // are measuring things in UTF-8 code units, for efficiency. + + if _slowPath(_slice._base._guts.isKnownUTF16) { + // UTF-16 (i.e., foreign) string. The mutation will convert this to the + // native UTF-8 encoding, so we need to do some extra work to preserve our + // bounds. + let utf8StartOffset = _slice._base.utf8.distance( + from: _slice._base.startIndex, + to: _slice._startIndex) + let oldUTF8Count = self.utf8.count + + let oldSubrangeCount = self.utf8.distance( + from: subrange.lowerBound, to: subrange.upperBound) + + let newUTF8Subrange = _slice._base._guts.replaceSubrange( + subrange, with: newElements) + _internalInvariant(!_slice._base._guts.isKnownUTF16) + + let newUTF8Count = oldUTF8Count + newUTF8Subrange.count - oldSubrangeCount + + // Get the character stride in the entire string, not just the substring. + // (Characters in a substring may end beyond the bounds of it.) + let newStride = _slice.base._guts._opaqueCharacterStride( + startingAt: utf8StartOffset, + in: utf8StartOffset ..< _slice._base._guts.count) + + _slice._startIndex = String.Index( + encodedOffset: utf8StartOffset, + transcodedOffset: 0, + characterStride: newStride)._scalarAligned._knownUTF8 + _slice._endIndex = String.Index( + encodedOffset: utf8StartOffset + newUTF8Count, + transcodedOffset: 0)._scalarAligned._knownUTF8 + return + } + + // UTF-8 string. 
+ + let oldRange = Range(_uncheckedBounds: ( + subrange.lowerBound._encodedOffset, subrange.upperBound._encodedOffset)) + + let newRange = _slice._base._guts.replaceSubrange( + subrange, with: newElements) + + let newOffsetBounds = Range(_uncheckedBounds: ( + startIndex._encodedOffset, + endIndex._encodedOffset &+ newRange.count &- oldRange.count)) + + // Update `startIndex` if necessary. The replacement may have invalidated + // its cached character stride, but not its stored offset. + // + // We are exploiting the fact that mutating the string _after_ the scalar + // following the end of the character at `startIndex` cannot possibly change + // the length of that character. (This is true because `index(after:)` never + // needs to look ahead by more than one Unicode scalar.) + if + let stride = startIndex.characterStride, + oldRange.lowerBound <= startIndex._encodedOffset &+ stride + { + // Get the character stride in the entire string, not just the substring. + // (Characters in a substring may end beyond the bounds of it.) + let newStride = _slice.base._guts._opaqueCharacterStride( + startingAt: newOffsetBounds.lowerBound, + in: newOffsetBounds.lowerBound ..< _slice._base._guts.count) + _slice._startIndex = String.Index( + encodedOffset: startIndex._encodedOffset, + transcodedOffset: 0, + characterStride: newStride)._scalarAligned._knownUTF8 + } + + // Update endIndex. 
+ if newOffsetBounds.upperBound != endIndex._encodedOffset { + _slice._endIndex = Index( + encodedOffset: newOffsetBounds.upperBound, + transcodedOffset: 0 + )._scalarAligned._knownUTF8 + } } /// Creates a string from the given Unicode code units in the specified From 8ab2379946707301f78bd655b3baba173c685a3f Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Wed, 16 Mar 2022 20:14:24 -0700 Subject: [PATCH 06/83] =?UTF-8?q?[stdlib]=20Round=20indices=20down=20to=20?= =?UTF-8?q?nearest=20Character=20in=20String=E2=80=99s=20index=20algorithm?= =?UTF-8?q?s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To prevent unaligned indices from breaking well-defined index distance and index offset calculations, round every index down to the nearest whole Character. For the horrific details, see the forum discussion below. https://forums.swift.org/t/string-index-unification-vs-bidirectionalcollection-requirements/55946 To avoid rounding from regressing String performance in the regular case (when indices aren’t being passed across string views), introduce a new String.Index flag bit that indicates that the index is already Character aligned. --- stdlib/public/core/StringCharacterView.swift | 66 ++++-- .../public/core/StringGraphemeBreaking.swift | 70 +++++- stdlib/public/core/StringGuts.swift | 16 +- stdlib/public/core/StringIndex.swift | 224 ++++++++++++------ .../StringRangeReplaceableCollection.swift | 3 + stdlib/public/core/Substring.swift | 137 +++++++---- 6 files changed, 364 insertions(+), 152 deletions(-) diff --git a/stdlib/public/core/StringCharacterView.swift b/stdlib/public/core/StringCharacterView.swift index 63dc9a0e69846..875723173b405 100644 --- a/stdlib/public/core/StringCharacterView.swift +++ b/stdlib/public/core/StringCharacterView.swift @@ -49,7 +49,7 @@ extension String: BidirectionalCollection { /// `endIndex`. /// - Returns: The index value immediately after `i`. 
public func index(after i: Index) -> Index { - let i = _guts.validateScalarIndex(i) + let i = _guts.roundDownToNearestCharacter(_guts.validateScalarIndex(i)) let r = _uncheckedIndex(after: i) return _guts.internalMarkEncoding(r) } @@ -62,20 +62,17 @@ extension String: BidirectionalCollection { /// /// It does not mark the encoding of the returned index. internal func _uncheckedIndex(after i: Index) -> Index { - // FIXME: Unlike `index(before:)`, this function may return incorrect - // results if `i` isn't on a grapheme cluster boundary. (The grapheme - // breaking algorithm assumes we start on a break when we go forward.) _internalInvariant(_guts.hasMatchingEncoding(i)) _internalInvariant(i < endIndex) - _internalInvariant(i._isScalarAligned) + _internalInvariant(i._isCharacterAligned) // TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc. let stride = _characterStride(startingAt: i) let nextOffset = i._encodedOffset &+ stride - let nextIndex = Index(_encodedOffset: nextOffset)._scalarAligned + let nextIndex = Index(_encodedOffset: nextOffset)._characterAligned let nextStride = _characterStride(startingAt: nextIndex) let r = Index(encodedOffset: nextOffset, characterStride: nextStride) - return r._scalarAligned + return _guts.internalMarkEncoding(r._characterAligned) } /// Returns the position immediately before the given index. @@ -84,13 +81,13 @@ extension String: BidirectionalCollection { /// `startIndex`. /// - Returns: The index value immediately before `i`. public func index(before i: Index) -> Index { - let i = _guts.validateInclusiveScalarIndex(i) - // Note: Scalar aligning an index may move it closer towards the - // `startIndex`, so the `i > startIndex` check needs to come after the - // `validateScalarIndex` call. + let i = _guts.roundDownToNearestCharacter( + _guts.validateInclusiveScalarIndex(i)) + // Note: Aligning an index may move it closer towards the `startIndex`, so + // the `i > startIndex` check needs to come after rounding. 
_precondition(i > startIndex, "String index is out of bounds") - let r = _uncheckedIndex(before: _guts.scalarAlign(i)) + let r = _uncheckedIndex(before: i) return _guts.internalMarkEncoding(r) } @@ -98,20 +95,20 @@ extension String: BidirectionalCollection { /// /// - has the right encoding, /// - is within bounds, and - /// - is scalar aligned. + /// - is character aligned. /// /// It does not mark the encoding of the returned index. internal func _uncheckedIndex(before i: Index) -> Index { _internalInvariant(_guts.hasMatchingEncoding(i)) _internalInvariant(i > startIndex && i <= endIndex) - _internalInvariant(i._isScalarAligned) + _internalInvariant(i._isCharacterAligned) // TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc. let stride = _characterStride(endingAt: i) let priorOffset = i._encodedOffset &- stride let r = Index(encodedOffset: priorOffset, characterStride: stride) - return r._scalarAligned + return r._characterAligned } /// Returns an index that is the specified distance from the given index. @@ -142,7 +139,8 @@ extension String: BidirectionalCollection { // TODO: known-ASCII and single-scalar-grapheme fast path, etc. - var i = _guts.validateInclusiveScalarIndex(i) + var i = _guts.roundDownToNearestCharacter( + _guts.validateInclusiveScalarIndex(i)) if distance >= 0 { for _ in stride(from: 0, to: distance, by: 1) { @@ -210,11 +208,12 @@ extension String: BidirectionalCollection { // breaks, in which case this function must still terminate without trapping // and return a result that makes sense. - // Note: `limit` is intentionally not scalar aligned to ensure our behavior - // exactly matches the documentation above. + // Note: `limit` is intentionally not scalar (or character-) aligned to + // ensure our behavior exactly matches the documentation above. 
let limit = _guts.ensureMatchingEncoding(limit) - var i = _guts.validateInclusiveScalarIndex(i) + var i = _guts.roundDownToNearestCharacter( + _guts.validateInclusiveScalarIndex(i)) let start = i if distance >= 0 { @@ -253,8 +252,10 @@ extension String: BidirectionalCollection { // grapheme breaks -- swapping `start` and `end` may change the magnitude of // the result. - let start = _guts.validateInclusiveScalarIndex(start) - let end = _guts.validateInclusiveScalarIndex(end) + let start = _guts.roundDownToNearestCharacter( + _guts.validateInclusiveScalarIndex(start)) + let end = _guts.roundDownToNearestCharacter( + _guts.validateInclusiveScalarIndex(end)) // TODO: known-ASCII and single-scalar-grapheme fast path, etc. @@ -297,17 +298,32 @@ extension String: BidirectionalCollection { @inlinable @inline(__always) // TODO(lorentey): Consider removing these. If // `index(after:)` isn't inlinable, does it // really matter if this one is? (Potential - // _guts-related optimizations notwithstanding.) - // `subscript` being inlinable forces a bunch of - // new additions to be _aEIC, even though they - // ought to be internal. + // optimizations notwithstanding.) `subscript` + // being inlinable forces a bunch of new + // additions to be _aEIC, even though they ought + // to be internal. public subscript(i: Index) -> Character { + // Note: SE-0180 requires us not to round `i` down to the nearest whole + // `Character` boundary. let i = _guts.validateScalarIndex(i) let distance = _characterStride(startingAt: i) return _guts.errorCorrectedCharacter( startingAt: i._encodedOffset, endingAt: i._encodedOffset &+ distance) } + /// Return the length of the `Character` starting at the given index, measured + /// in encoded code units, and without looking back at any scalar that + /// precedes `i`. 
+ /// + /// Note: if `i` isn't `Character`-aligned, then this operation must still + /// finish successfully and return the length of the grapheme cluster starting + /// at `i` _as if the string started on that scalar_. (This can be different + /// from the length of the whole character when the preceding scalars are + /// present!) + /// + /// This method is called from inlinable `subscript` implementations in + /// current and previous versions of the stdlib, which require this contract + /// not to be violated. @inlinable @inline(__always) internal func _characterStride(startingAt i: Index) -> Int { _internalInvariant_5_1(i._isScalarAligned) diff --git a/stdlib/public/core/StringGraphemeBreaking.swift b/stdlib/public/core/StringGraphemeBreaking.swift index f25667bfb2994..2102e1b66be01 100644 --- a/stdlib/public/core/StringGraphemeBreaking.swift +++ b/stdlib/public/core/StringGraphemeBreaking.swift @@ -83,10 +83,74 @@ internal func _hasGraphemeBreakBetween( return hasBreakWhenPaired(lhs) && hasBreakWhenPaired(rhs) } +extension _StringGuts { + internal func roundDownToNearestCharacter( + _ i: String.Index + ) -> String.Index { + _internalInvariant(i._isScalarAligned) + _internalInvariant(hasMatchingEncoding(i)) + _internalInvariant(i._encodedOffset <= count) + + let offset = i._encodedOffset + if i._isCharacterAligned { return i } + if offset == 0 || offset == count { return i._characterAligned } + + let start = offset - _opaqueCharacterStride(endingAt: offset) + let stride = _opaqueCharacterStride(startingAt: start) + _internalInvariant(offset <= start + stride, + "Grapheme breaking inconsistency") + if offset >= start + stride { + // Already aligned, or grapheme breaking returned an unexpected result. 
+ return i._characterAligned + } + let r = String.Index(encodedOffset: start, characterStride: stride) + return markEncoding(r._characterAligned) + } + + internal func roundDownToNearestCharacter( + _ i: String.Index, + from start: String.Index, + to end: String.Index + ) -> String.Index { + _internalInvariant(start._isScalarAligned && end._isScalarAligned) + _internalInvariant(hasMatchingEncoding(start) && hasMatchingEncoding(end)) + _internalInvariant(start <= end && end <= endIndex) + + _internalInvariant(i._isScalarAligned) + _internalInvariant(hasMatchingEncoding(i)) + _internalInvariant(i >= start && i <= end) + + // We can only use the `_isCharacterAligned` bit if the start index is also + // character-aligned. + if start._isCharacterAligned && i._isCharacterAligned { return i } + + if i == start || i == end { return i } + + let offset = i._encodedOffset + let prior = offset - _opaqueCharacterStride(endingAt: offset) + let stride = _opaqueCharacterStride(startingAt: prior) + _internalInvariant(offset <= prior + stride, + "Grapheme breaking inconsistency") + if offset >= prior + stride { + // Already aligned, or grapheme breaking returned an unexpected result. 
+ return i + } + var r = String.Index(encodedOffset: prior, characterStride: stride) + if start._isCharacterAligned { + r = r._characterAligned + } else { + r = r._scalarAligned + } + return markEncoding(r) + } +} + extension _StringGuts { @usableFromInline @inline(never) @_effects(releasenone) internal func isOnGraphemeClusterBoundary(_ i: String.Index) -> Bool { + if i._isCharacterAligned { return true } + guard i.transcodedOffset == 0 else { return false } let offset = i._encodedOffset @@ -94,10 +158,12 @@ extension _StringGuts { guard isOnUnicodeScalarBoundary(i) else { return false } - let str = String(self) - return i == str.index(before: str.index(after: i)) + let nearest = roundDownToNearestCharacter(i) + return i == nearest } +} +extension _StringGuts { @usableFromInline @inline(never) @_effects(releasenone) internal func _opaqueCharacterStride(startingAt i: Int) -> Int { diff --git a/stdlib/public/core/StringGuts.swift b/stdlib/public/core/StringGuts.swift index 18b28864613df..318f38d636974 100644 --- a/stdlib/public/core/StringGuts.swift +++ b/stdlib/public/core/StringGuts.swift @@ -288,12 +288,14 @@ extension _StringGuts { @inlinable @inline(__always) internal var startIndex: String.Index { - Index(_encodedOffset: 0)._scalarAligned._encodingIndependent + // The start index is always `Character` aligned. + Index(_encodedOffset: 0)._characterAligned._encodingIndependent } @inlinable @inline(__always) internal var endIndex: String.Index { - markEncoding(Index(_encodedOffset: self.count)._scalarAligned) + // The end index is always `Character` aligned. + markEncoding(Index(_encodedOffset: self.count)._characterAligned) } } @@ -373,6 +375,7 @@ extension _StringGuts { } @_alwaysEmitIntoClient // TODO(lorentey): Should this remain internal? 
+ @inline(never) internal func _slowEnsureMatchingEncoding(_ i: String.Index) -> String.Index { _internalInvariant(isForeign || !i._canBeUTF8) if isForeign { @@ -441,7 +444,6 @@ extension _StringGuts { /// - has an encoding that matches this string, /// - is within `start ..< end`, and /// - is aligned on a scalar boundary. - @_alwaysEmitIntoClient internal func validateScalarIndex( _ i: String.Index, from start: String.Index, @@ -513,6 +515,14 @@ extension _StringGuts { upper = scalarAlign(upper) lower = scalarAlign(lower) + // Older binaries may generate `startIndex` without the + // `_isCharacterAligned` flag. Compensate for that here so that substrings + // that start at the beginning will never get the sad path in + // `index(after:)`. Note that we don't need to do this for `upper` and we + // don't need to compare against the `endIndex` -- those aren't nearly as + // critical. + if lower._encodedOffset == 0 { lower = lower._characterAligned } + return Range(_uncheckedBounds: (lower, upper)) } diff --git a/stdlib/public/core/StringIndex.swift b/stdlib/public/core/StringIndex.swift index 76af50f251556..a48077ed49313 100644 --- a/stdlib/public/core/StringIndex.swift +++ b/stdlib/public/core/StringIndex.swift @@ -17,7 +17,7 @@ import SwiftShims String's Index has the following layout: ┌──────────┬────────────────╥────────────────┬───────╥───────┐ - │ b63:b16 │ b15:b14 ║ b13:b8 │ b7:b3 ║ b2:b0 │ + │ b63:b16 │ b15:b14 ║ b13:b8 │ b7:b4 ║ b3:b0 │ ├──────────┼────────────────╫────────────────┼───────╫───────┤ │ position │ transc. offset ║ grapheme cache │ rsvd ║ flags │ └──────────┴────────────────╨────────────────┴───────╨───────┘ @@ -40,18 +40,35 @@ isn't frozen. - grapheme cache: A 6-bit value remembering the distance to the next grapheme boundary. -- reserved: 5 unused bits available for future flags etc. The meaning of each +- reserved: 4 unused bits available for future flags etc. The meaning of each bit may change between stdlib versions. 
These must be set to zero if constructing an index in inlinable code. - b2: UTF-16. If set, position is in known to be UTF-16 code units [Swift 5.7+] - b1: UTF-8. If set, position is in known to be UTF-8 code units [Swift 5.7+] - b0: Scalar alignment. If set, index is known to be scalar-aligned (see below) + * b3: UTF-16 encoding -Before Swift 5.7, bits b1 and b2 used to be part of the resilient slice. -See the note on Index Encoding below to see how this works. + If set, the position is known to be expressed in UTF-16 code units. + (Introduced in Swift 5.7) + + * b2: UTF-8 encoding + + If set, the position is known to be expressed in UTF-8 code units. + (Introduced in Swift 5.7) + + * b1: `_isCharacterAligned` + + If set, the index is known to be on an extended grapheme cluster + boundary (i.e., on a Swift `Character`.) + (Introduced in Swift 5.7) + + * b0: `_isScalarAligned` + + If set, index is known to be on a Unicode scalar boundary (see below). + (Introduced in Swift 5.1) + +Before Swift 5.7, bits b1, b2 and b3 used to be part of the resilient slice. See +the notes on Character Alignment and Index Encoding below to see how this works. */ extension String { @@ -169,6 +186,9 @@ extension String.Index { @usableFromInline @inline(never) @_effects(releasenone) internal func _invariantCheck() { _internalInvariant(_encodedOffset >= 0) + if self._isCharacterAligned { + _internalInvariant(_isScalarAligned) + } if self._isScalarAligned { _internalInvariant_5_1(transcodedOffset == 0) } @@ -278,55 +298,91 @@ extension String.Index { } } -/* - Index Encoding - - Swift 5.7 introduced bookkeeping to keep track of the Unicode encoding - associated with the position value in String indices. Indices whose position - is an offset into UTF-8 storage come with the corresponding flag set, and a - separate flag is set for UTF-16 indices. (Only foreign strings can be UTF-16 - encoded. 
As of 5.7, all foreign strings are UTF-16; but this is subject to - change later if we ever decide to implement additional foreign forms.) - - In releases before 5.7, the bits corresponding to these flags were considered - reserved, and they were both set to zero in inlinable code. This means that - (on ABI stable platforms at least) we cannot assume that either of these bits - will be reliably set. If they are both clear, then we must fall back to - assuming that the index has the right encoding for whatever string it is used - on. However, if any of these bits are set, then the other bit's value is also - reliable -- whether it's set or cleared. - - The indices of ASCII strings are encoding-independent, i.e. transcoding such - strings from UTF-8 to UTF-16 (or vice versa) does not change the position - value of any of their indices. Therefore it isn't an error for an index to - have both of these flags set. (The start index of every string also behaves - this way: position zero is the same no matter how the rest of string is - stored.) - - These two bits (along with the isKnownUTF16 flag in StringObject) allows newer - versions of the Standard Library to more reliably catch runtime errors where - client code is applying an index from a UTF-16 string to a UTF-8 one, or vice - versa. This typically happens when indices from a UTF-16 Cocoa string that was - verbatim bridged into Swift are accidentally applied to a mutated version of - the same string. (The mutation turns it into a UTF-8 native string, where the - same numerical offsets might correspond to wildly different logical - positions.) - - Such code has always been broken, as the old indices are documented to be no - longer valid after the mutation; however, in previous releases this bug wasn't - reliably detected, and if the code was only ever tested on ASCII strings, then - the bug could lie dormant for a long time. 
(Until the code encounters a - non-ASCII character and someone gets surprised that the results no longer make - sense.) - - As more code gets rebuilt with Swift 5.7+, the stdlib will gradually become - able to reliably catch and correct all such issues. The error cases are - handled in `_StringGuts.ensureMatchingEncoding(_:)`; see there for the sordid - details. +// ### Character (a.k.a. Extended Grapheme Cluster) Alignment +// +// Swift 5.7 assigned a new bit denoting that an index is known to be +// `Character`-aligned. This is used to enable more reliable detection & +// handling of extended grapheme cluster boundaries in indexing edge cases +// introduced by SE-0180, without slowing down the usual case, when code isn't +// interchanging indices between views. +// +// Beware! In substrings whose bounds aren't `Character`-aligned, extended +// grapheme breaks are sometimes in different places than in their base string. +// (The sequence of characters in a substring depends only on the Unicode scalars +// that make up its contents, not on their surrounding context.) Therefore, such +// substrings must not look at or set this bit: indices must be reliably +// interchangeable between strings and their associated substrings, even if the +// latter are irregular. +// +// Note that `startIndex` and `endIndex` have fully inlinable implementations. +// This means that when code built on older releases runs on 5.7, this bit may +// not be set on these, even though they are always `Character`-aligned. This is +// fine -- `index(after:)` and `index(before:)` do the right thing with +// minimal/no performance loss. +extension String.Index { + @_alwaysEmitIntoClient // Swift 5.7 + @inline(__always) + internal var _isCharacterAligned: Bool { return 0 != _rawBits & 0x2 } -*/ + /// Return the same index with both the scalar- and `Character`-aligned bits + /// set. + /// + /// (`Character` alignment implies scalar alignment.) 
+ @_alwaysEmitIntoClient // Swift 5.7 + @inline(__always) + internal var _characterAligned: String.Index { + let idx = Self(_rawBits | 0x3) + idx._invariantCheck() + return idx + } +} + +// ### Index Encoding +// +// Swift 5.7 introduced bookkeeping to keep track of the Unicode encoding +// associated with the position value in String indices. Indices whose position +// is an offset into UTF-8 storage come with the corresponding flag set, and a +// separate flag is set for UTF-16 indices. (Only foreign strings can be UTF-16 +// encoded. As of 5.7, all foreign strings are UTF-16; but this is subject to +// change later if we ever decide to implement additional foreign forms.) +// +// In releases before 5.7, the bits corresponding to these flags were considered +// reserved, and they were both set to zero in inlinable code. This means that +// (on ABI stable platforms at least) we cannot assume that either of these bits +// will be reliably set. If they are both clear, then we must fall back to +// assuming that the index has the right encoding for whatever string it is used +// on. However, if either of these bits are set, then the other bit's value is +// also reliable -- whether it's set or cleared. +// +// The indices of ASCII strings are encoding-independent, i.e. transcoding such +// strings from UTF-8 to UTF-16 (or vice versa) does not change the position +// value of any of their indices. Therefore it isn't an error for an index to +// have both of these flags set. (The start index of every string also behaves +// this way: position zero is the same no matter what encoding is used for +// the rest of the string.) +// +// These two bits (along with the isKnownUTF16 flag in StringObject) allow newer +// versions of the Standard Library to more reliably catch runtime errors where +// client code is applying an index from a UTF-16 string to a UTF-8 one, or vice +// versa. 
This typically happens when indices from a UTF-16 Cocoa string that +// was verbatim bridged into Swift are accidentally applied to a mutated version +// of the same string. (The mutation turns it into a UTF-8 native string, where +// the same numerical offsets might correspond to wildly different logical +// positions.) +// +// Such code has always been broken, as the old indices are documented to be no +// longer valid after the mutation; however, in previous releases such cases +// weren't reliably detected, and if the code was only ever tested on ASCII +// strings, then the bug could lie dormant for a long time. (Until the code +// encounters a non-ASCII character and someone gets surprised that the results +// no longer make sense.) +// +// As more code gets rebuilt with Swift 5.7+, the stdlib will gradually become +// able to reliably catch and correct all such issues. The error cases are +// handled in `_StringGuts.ensureMatchingEncoding(_:)`; see there for the sordid +// details. extension String.Index { - /// Returns true if the position in this index is okay to interpret as offset + /// Returns true if the position in this index can be interpreted as an offset /// into UTF-8-encoded string storage. /// /// (This returns true if either we know for sure that this is an UTF-8 index, @@ -335,10 +391,10 @@ extension String.Index { @inline(__always) internal var _canBeUTF8: Bool { // The only way an index cannot be UTF-8 is it has only the UTF-16 flag set. - _rawBits & 0x6 != 0x04 + _rawBits & 0xC != 0x08 } - /// Returns true if the position in this index is okay to interpret as offset + /// Returns true if the position in this index can be interpreted as offset /// into UTF-16-encoded string storage. /// /// (This returns true if either we know for sure that this is an UTF-16 @@ -348,23 +404,41 @@ extension String.Index { @inline(__always) internal var _canBeUTF16: Bool { // The only way an index cannot be UTF-16 is it has only the UTF-8 flag set. 
- _rawBits & 0x6 != 0x02 + _rawBits & 0xC != 0x04 } /// Returns the same index with the UTF-8 bit set. @_alwaysEmitIntoClient // Swift 5.7 @inline(__always) - internal var _knownUTF8: Self { Self(_rawBits | 0x2) } + internal var _knownUTF8: Self { Self(_rawBits | 0x4) } /// Returns the same index with the UTF-16 bit set. @_alwaysEmitIntoClient // Swift 5.7 @inline(__always) - internal var _knownUTF16: Self { Self(_rawBits | 0x4) } + internal var _knownUTF16: Self { Self(_rawBits | 0x8) } /// Returns the same index with both UTF-8 & UTF-16 bits set. @_alwaysEmitIntoClient // Swift 5.7 @inline(__always) - internal var _encodingIndependent: Self { Self(_rawBits | 0x6) } + internal var _encodingIndependent: Self { Self(_rawBits | 0xC) } + + /// Returns true if the UTF-8 flag is set. + /// + /// This is for debugging purposes only. Do not use this property to determine + /// whether an index is compatible with UTF-8 storage; instead, use + /// `_canBeUTF8`. + @_alwaysEmitIntoClient // Swift 5.7 + @inline(__always) + internal var __isUTF8: Bool { _rawBits & 0x4 != 0 } + + /// Returns true if the UTF-16 flag is set. + /// + /// This is for debugging purposes only. Do not use this property to determine + /// whether an index is compatible with UTF-16 storage; instead, use + /// `_canBeUTF16`. + @_alwaysEmitIntoClient // Swift 5.7 + @inline(__always) + internal var __isUTF16: Bool { _rawBits & 0x8 != 0 } } extension String.Index: Equatable { @@ -394,27 +468,33 @@ extension String.Index: Hashable { } // FIXME: This is for debugging only; remove before merging. 
-extension String.Index: CustomStringConvertible { +extension String.Index { @_alwaysEmitIntoClient + @inline(never) public var description: String { - var d = "Index(" - d += "offset: \(_encodedOffset)" + var d = "String.Index(" + switch (__isUTF8, __isUTF16) { + case (false, false): d += "unknown" + case (true, false): d += "UTF-8" + case (false, true): d += "UTF-16" + case (true, true): d += "universal" + } + d += " offset: \(_encodedOffset)" if transcodedOffset != 0 { - d += "+\(transcodedOffset)" + d += "(+\(transcodedOffset))" } if let stride = characterStride { d += ", stride: \(stride)" } - if _isScalarAligned { - d += ", scalarAligned" - } - if _rawBits & 0x2 != 0 { - d += ", utf8" - } - if _rawBits & 0x4 != 0 { - d += ", utf16" + if _isCharacterAligned { + d += ", character aligned" + } else if _isScalarAligned { + d += ", scalar aligned" } d += ")" return d } } + +@available(SwiftStdlib 5.7, *) +extension String.Index: CustomStringConvertible {} diff --git a/stdlib/public/core/StringRangeReplaceableCollection.swift b/stdlib/public/core/StringRangeReplaceableCollection.swift index 604571bdb902a..eca06fa2f4251 100644 --- a/stdlib/public/core/StringRangeReplaceableCollection.swift +++ b/stdlib/public/core/StringRangeReplaceableCollection.swift @@ -198,6 +198,9 @@ extension String: RangeReplaceableCollection { _ subrange: Range, with newElements: C ) where C: Collection, C.Iterator.Element == Character { + // Note: SE-0180 requires us to use `subrange` bounds even if they aren't + // `Character` aligned. (We still have to round things down to the nearest + // scalar boundary, though, or we may generate ill-formed encodings.) 
let subrange = _guts.validateScalarRange(subrange) _guts.replaceSubrange(subrange, with: newElements) } diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index 13380ca7d93a5..3bf6930e5e19b 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -184,6 +184,13 @@ extension Substring { _slice._base._guts.validateScalarRange( range, from: startIndex, to: endIndex) } + + @inline(__always) + internal func _roundDownToNearestCharacter( + _ i: String.Index + ) -> String.Index { + _wholeGuts.roundDownToNearestCharacter(i, from: startIndex, to: endIndex) + } } extension Substring: StringProtocol { @@ -204,7 +211,7 @@ extension Substring: StringProtocol { // leads to Collection conformance issues when the `Substring`'s bounds do // not fall on grapheme boundaries in `base`. - let i = _validateScalarIndex(i) + let i = _roundDownToNearestCharacter(_validateScalarIndex(i)) let r = _uncheckedIndex(after: i) return _slice.base._guts.internalMarkEncoding(r) } @@ -217,29 +224,36 @@ extension Substring: StringProtocol { /// /// It does not mark the encoding of the returned index. internal func _uncheckedIndex(after i: Index) -> Index { - // FIXME: Unlike `index(before:)`, this function may return incorrect - // results if `i` isn't on a grapheme cluster boundary. (The grapheme - // breaking algorithm assumes we start on a break when we go forward.) - _internalInvariant(_slice.base._guts.hasMatchingEncoding(i)) - _internalInvariant(i < endIndex) + _internalInvariant(_wholeGuts.hasMatchingEncoding(i)) _internalInvariant(i._isScalarAligned) + _internalInvariant(i >= startIndex && i < endIndex) + // Implicit precondition: `i` must be `Character`-aligned within this + // substring, even if it doesn't have the corresponding flag set. + + // TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc. 
let stride = _characterStride(startingAt: i) let nextOffset = i._encodedOffset &+ stride + _internalInvariant(nextOffset <= endIndex._encodedOffset) let nextIndex = Index(_encodedOffset: nextOffset)._scalarAligned - guard - // Don't cache character strides in indices of exotic substrings whose - // startIndex isn't aligned on a grapheme cluster boundary. (Their + let nextStride = _characterStride(startingAt: nextIndex) + + var r = Index( + encodedOffset: nextOffset, characterStride: nextStride)._scalarAligned + + if + // Don't set the `_isCharacterAligned` bit in indices of exotic substrings + // whose startIndex isn't aligned on a grapheme cluster boundary. (Their // grapheme breaks may not match with those in `base`.) - _knownToStartOnGraphemeBreak, - // Don't cache the stride if we end on a partial grapheme cluster. - nextIndex < endIndex || _knownToEndOnGraphemeBreak - else { - return nextIndex + _startIsCharacterAligned, + // Likewise if this is the last character in a substring ending on a + // partial grapheme cluster. + _endIsCharacterAligned || nextOffset + nextStride < endIndex._encodedOffset + { + r = r._characterAligned } - let nextStride = _characterStride(startingAt: nextIndex) - let r = Index(encodedOffset: nextOffset, characterStride: nextStride) - return r._scalarAligned + + return r } public func index(before i: Index) -> Index { @@ -250,10 +264,10 @@ extension Substring: StringProtocol { // leads to Collection conformance issues when the `Substring`'s bounds do // not fall on grapheme boundaries in `base`. - let i = _validateInclusiveScalarIndex(i) - // Note: Scalar aligning an index may move it closer towards the - // `startIndex`, so the `i > startIndex` check needs to come after the - // `validateScalarIndex` call. 
+ let i = _roundDownToNearestCharacter(_validateInclusiveScalarIndex(i)) + // Note: Aligning an index may move it closer towards the `startIndex`, so + // this `i > startIndex` check needs to come after all the + // alignment/validation work. _precondition(i > startIndex, "Substring index is out of bounds") let r = _uncheckedIndex(before: i) @@ -268,24 +282,34 @@ extension Substring: StringProtocol { /// /// It does not mark the encoding of the returned index. internal func _uncheckedIndex(before i: Index) -> Index { - _internalInvariant(_slice.base._guts.hasMatchingEncoding(i)) - _internalInvariant(i < endIndex) + _internalInvariant(_wholeGuts.hasMatchingEncoding(i)) _internalInvariant(i._isScalarAligned) + _internalInvariant(i > startIndex && i <= endIndex) + + // Implicit precondition: `i` must be `Character`-aligned within this + // substring, even if it doesn't have the corresponding flag set. // TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc. - let stride = _characterStride(endingAt: i) - let priorOffset = i._encodedOffset &- stride + let priorStride = _characterStride(endingAt: i) + let priorOffset = i._encodedOffset &- priorStride _internalInvariant(priorOffset >= startIndex._encodedOffset) - guard _knownToStartOnGraphemeBreak else { - // Don't cache character strides in indices of exotic substrings whose - // startIndex isn't aligned on a grapheme cluster boundary. (Their + var r = Index( + encodedOffset: priorOffset, characterStride: priorStride)._scalarAligned + + if + // Don't set the `_isCharacterAligned` bit in indices of exotic substrings + // whose startIndex isn't aligned on a grapheme cluster boundary. (Their // grapheme breaks may not match with those in `base`.) - return Index(_encodedOffset: priorOffset)._scalarAligned + _startIsCharacterAligned, + // Likewise if this is the last character in a substring ending on a + // partial grapheme cluster. 
+ _endIsCharacterAligned || i < endIndex + { + r = r._characterAligned } - return Index( - encodedOffset: priorOffset, characterStride: stride)._scalarAligned + return r } public func index(_ i: Index, offsetBy distance: Int) -> Index { @@ -390,6 +414,8 @@ extension Substring: StringProtocol { } public subscript(i: Index) -> Character { + // Note: SE-0180 requires us not to round `i` down to the nearest whole + // `Character` boundary. let i = _validateScalarIndex(i) let distance = _characterStride(startingAt: i) return _slice.base._guts.errorCorrectedCharacter( @@ -412,6 +438,9 @@ extension Substring: StringProtocol { internal mutating func _replaceSubrange( _ subrange: Range, with newElements: C ) where C.Element == Element { + // Note: SE-0180 requires us to use `subrange` bounds even if they aren't + // `Character` aligned. (We still have to round things down to the nearest + // scalar boundary, though, or we may generate ill-formed encodings.) defer { _invariantCheck() } let subrange = _validateScalarRange(subrange) @@ -430,7 +459,7 @@ extension Substring: StringProtocol { // merged with the Character preceding/following the replaced range. // // The best way to avoid problems in these cases is to lower index - // calculations to Unicode scalars (or below) -- in this implementation, we + // calculations to Unicode scalars (or below). In this implementation, we // are measuring things in UTF-8 code units, for efficiency. 
if _slowPath(_slice._base._guts.isKnownUTF16) { @@ -601,14 +630,12 @@ extension Substring: StringProtocol { } extension Substring { - // TODO(lorentey): Rename to proper terminology - internal var _knownToStartOnGraphemeBreak: Bool { - startIndex._encodedOffset == 0 || startIndex.characterStride != nil + internal var _startIsCharacterAligned: Bool { + startIndex._isCharacterAligned } - // TODO(lorentey): Rename to proper terminology - internal var _knownToEndOnGraphemeBreak: Bool { - endIndex == _slice.base.endIndex || endIndex.characterStride != nil + internal var _endIsCharacterAligned: Bool { + endIndex._isCharacterAligned } internal var _encodedOffsetRange: Range { @@ -616,29 +643,39 @@ extension Substring { let upper = _slice._endIndex._encodedOffset return Range(_uncheckedBounds: (lower, upper)) } +} +extension Substring { internal func _characterStride(startingAt i: Index) -> Int { _internalInvariant(i._isScalarAligned) - - // Fast path if the index already has its stride cached. Substrings that - // don't start on a grapheme cluster boundary may have different grapheme - // break positions than their base string, so we must ignore the cache in - // that case. - if let d = i.characterStride, _knownToStartOnGraphemeBreak { - // Make sure a cached stride cannot lead us beyond the substring's end - // index. This can happen if `self` ends between grapheme cluster - // boundaries. - return Swift.min(d, endIndex._encodedOffset &- i._encodedOffset) + _internalInvariant(i._encodedOffset <= _wholeGuts.count) + + // Implicit precondition: `i` must be `Character`-aligned within this + // substring, even if it doesn't have the corresponding flag set. + + // If the index has a character stride, we are therefore free to use it. + if let d = i.characterStride { + // However, make sure a cached stride cannot lead us beyond the + // substring's end index. 
This can happen if the substring's end isn't + // also `Character` aligned, and someone passes us an index that comes + // from the base string. + return Swift.min(d, _wholeGuts.count &- i._encodedOffset) } - if i == endIndex { return 0 } + if i._encodedOffset == endIndex._encodedOffset { return 0 } - return _slice.base._guts._opaqueCharacterStride( + // I we don't have cached information, we can simply invoke the forward-only + // grapheme breaking algorithm. + return _wholeGuts._opaqueCharacterStride( startingAt: i._encodedOffset, in: _encodedOffsetRange) } internal func _characterStride(endingAt i: Index) -> Int { + // Implicit precondition: `i` must be `Character`-aligned within this + // substring, even if it doesn't have the corresponding flag set. + _internalInvariant(i._isScalarAligned) + _internalInvariant(i._encodedOffset <= _wholeGuts.count) if i == startIndex { return 0 } From 836bf9ad730fc2a9545e43743ea2013508133fc9 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Wed, 16 Mar 2022 20:16:03 -0700 Subject: [PATCH 07/83] [stdlib] Mark index encodings in String.UTF8View & UTF16View --- stdlib/public/core/StringUTF16View.swift | 34 +++++++++++++------ stdlib/public/core/StringUTF8View.swift | 14 ++++---- .../public/core/StringUnicodeScalarView.swift | 14 ++++---- 3 files changed, 38 insertions(+), 24 deletions(-) diff --git a/stdlib/public/core/StringUTF16View.swift b/stdlib/public/core/StringUTF16View.swift index a5cec6b6b9ff7..fb31351714e8a 100644 --- a/stdlib/public/core/StringUTF16View.swift +++ b/stdlib/public/core/StringUTF16View.swift @@ -141,7 +141,7 @@ extension String.UTF16View: BidirectionalCollection { @inlinable @inline(__always) public func index(after idx: Index) -> Index { if _slowPath(_guts.isForeign) { return _foreignIndex(after: idx) } - if _guts.isASCII { return idx.nextEncoded } + if _guts.isASCII { return idx.nextEncoded._knownUTF8._knownUTF16 } // For a BMP scalar (1-3 UTF-8 code units), advance past it. 
For a non-BMP // scalar, use a transcoded offset first. @@ -152,20 +152,24 @@ extension String.UTF16View: BidirectionalCollection { let len = _guts.fastUTF8ScalarLength(startingAt: idx._encodedOffset) if len == 4 && idx.transcodedOffset == 0 { - return idx.nextTranscoded + return idx.nextTranscoded._knownUTF8 } - return idx.strippingTranscoding.encoded(offsetBy: len)._scalarAligned + return idx + .strippingTranscoding + .encoded(offsetBy: len) + ._scalarAligned + ._knownUTF8 } @inlinable @inline(__always) public func index(before idx: Index) -> Index { _precondition(!idx.isZeroPosition) if _slowPath(_guts.isForeign) { return _foreignIndex(before: idx) } - if _guts.isASCII { return idx.priorEncoded } + if _guts.isASCII { return idx.priorEncoded._knownUTF8._knownUTF16 } if idx.transcodedOffset != 0 { _internalInvariant(idx.transcodedOffset == 1) - return idx.strippingTranscoding + return idx.strippingTranscoding._knownUTF8 } let idx = _utf16AlignNativeIndex(idx) @@ -173,12 +177,12 @@ extension String.UTF16View: BidirectionalCollection { if len == 4 { // 2 UTF-16 code units comprise this scalar; advance to the beginning and // start mid-scalar transcoding - return idx.encoded(offsetBy: -len).nextTranscoded + return idx.encoded(offsetBy: -len).nextTranscoded._knownUTF8 } // Single UTF-16 code unit _internalInvariant((1...3) ~= len) - return idx.encoded(offsetBy: -len)._scalarAligned + return idx.encoded(offsetBy: -len)._scalarAligned._knownUTF8 } public func index(_ i: Index, offsetBy n: Int) -> Index { @@ -660,10 +664,14 @@ extension String.UTF16View { // Trivial and common: start if offset == 0 { return startIndex } - if _guts.isASCII { return Index(_encodedOffset: offset) } + if _guts.isASCII { + return Index( + _encodedOffset: offset + )._scalarAligned._knownUTF8._knownUTF16 + } guard _guts._useBreadcrumbs(forEncodedOffset: offset) else { - return _index(startIndex, offsetBy: offset) + return _index(startIndex, offsetBy: offset)._knownUTF8 } // Simple and common: 
endIndex aka `length`. @@ -699,9 +707,13 @@ extension String.UTF16View { // Uncommon: final sub-scalar transcoded offset if _slowPath(utf16I > utf16End) { _internalInvariant(utf16Len == 2) - return Index(encodedOffset: readIdx, transcodedOffset: 1) + return Index( + encodedOffset: readIdx, transcodedOffset: 1 + )._knownUTF8 } - return Index(_encodedOffset: readIdx &+ len)._scalarAligned + return Index( + _encodedOffset: readIdx &+ len + )._scalarAligned._knownUTF8 } readIdx &+= len diff --git a/stdlib/public/core/StringUTF8View.swift b/stdlib/public/core/StringUTF8View.swift index 8348f1b21778b..0c0f5f16da51e 100644 --- a/stdlib/public/core/StringUTF8View.swift +++ b/stdlib/public/core/StringUTF8View.swift @@ -137,7 +137,7 @@ extension String.UTF8View: BidirectionalCollection { @inlinable @inline(__always) public func index(after i: Index) -> Index { if _fastPath(_guts.isFastUTF8) { - return i.strippingTranscoding.nextEncoded + return i.strippingTranscoding.nextEncoded._knownUTF8 } return _foreignIndex(after: i) @@ -147,7 +147,7 @@ extension String.UTF8View: BidirectionalCollection { public func index(before i: Index) -> Index { _precondition(!i.isZeroPosition) if _fastPath(_guts.isFastUTF8) { - return i.strippingTranscoding.priorEncoded + return i.strippingTranscoding.priorEncoded._knownUTF8 } return _foreignIndex(before: i) @@ -428,17 +428,17 @@ extension String.UTF8View { if utf8Len == 1 { _internalInvariant(idx.transcodedOffset == 0) - return idx.nextEncoded._scalarAligned + return idx.nextEncoded._scalarAligned._knownUTF16 } // Check if we're still transcoding sub-scalar if idx.transcodedOffset < utf8Len - 1 { - return idx.nextTranscoded + return idx.nextTranscoded._knownUTF16 } // Skip to the next scalar _internalInvariant(idx.transcodedOffset == utf8Len - 1) - return idx.encoded(offsetBy: scalarLen)._scalarAligned + return idx.encoded(offsetBy: scalarLen)._scalarAligned._knownUTF16 } @usableFromInline @inline(never) @@ -450,7 +450,7 @@ extension 
String.UTF8View { if idx.transcodedOffset != 0 { _internalInvariant((1...3) ~= idx.transcodedOffset) - return idx.priorTranscoded + return idx.priorTranscoded._knownUTF16 } let (scalar, scalarLen) = _guts.foreignErrorCorrectedScalar( @@ -458,7 +458,7 @@ extension String.UTF8View { let utf8Len = UTF8.width(scalar) return idx.encoded( offsetBy: -scalarLen - ).transcoded(withOffset: utf8Len &- 1) + ).transcoded(withOffset: utf8Len &- 1)._knownUTF16 } @usableFromInline @inline(never) diff --git a/stdlib/public/core/StringUnicodeScalarView.swift b/stdlib/public/core/StringUnicodeScalarView.swift index 980976bad73c8..b588e5ac7924c 100644 --- a/stdlib/public/core/StringUnicodeScalarView.swift +++ b/stdlib/public/core/StringUnicodeScalarView.swift @@ -114,7 +114,7 @@ extension String.UnicodeScalarView: BidirectionalCollection { if _fastPath(_guts.isFastUTF8) { let len = _guts.fastUTF8ScalarLength(startingAt: i._encodedOffset) - return i.encoded(offsetBy: len)._scalarAligned + return i.encoded(offsetBy: len)._scalarAligned._knownUTF8 } return _foreignIndex(after: i) @@ -143,11 +143,11 @@ extension String.UnicodeScalarView: BidirectionalCollection { _precondition(i > startIndex, "String index is out of bounds") if _fastPath(_guts.isFastUTF8) { - let len = _guts.withFastUTF8 { utf8 -> Int in - return _utf8ScalarLength(utf8, endingAt: i._encodedOffset) + let len = _guts.withFastUTF8 { utf8 in + _utf8ScalarLength(utf8, endingAt: i._encodedOffset) } _internalInvariant(len <= 4, "invalid UTF8") - return i.encoded(offsetBy: -len)._scalarAligned + return i.encoded(offsetBy: -len)._scalarAligned._knownUTF8 } return _foreignIndex(before: i) @@ -436,7 +436,8 @@ extension String.UnicodeScalarView { let cu = _guts.foreignErrorCorrectedUTF16CodeUnit(at: i) let len = UTF16.isLeadSurrogate(cu) ? 
2 : 1 - return i.encoded(offsetBy: len)._scalarAligned + let r = i.encoded(offsetBy: len)._scalarAligned + return _guts.internalMarkEncoding(r) } @usableFromInline @inline(never) @@ -447,6 +448,7 @@ extension String.UnicodeScalarView { let cu = _guts.foreignErrorCorrectedUTF16CodeUnit(at: priorIdx) let len = UTF16.isTrailSurrogate(cu) ? 2 : 1 - return i.encoded(offsetBy: -len)._scalarAligned + let r = i.encoded(offsetBy: -len)._scalarAligned + return _guts.internalMarkEncoding(r) } } From 6245da245711d7804ece0dd390d9612a34ee196c Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Wed, 16 Mar 2022 20:20:19 -0700 Subject: [PATCH 08/83] [stdlib] Substring: Be consistent about how we refer to the underlying string MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prefer direct stored properties to computed ones — there is no reason to risk inlining issues, esp. since things like `Slice.base` aren’t even force-inlined. Prefer using `_wholeGuts` to spelling out the full incantation. --- stdlib/public/core/Substring.swift | 68 +++++++++++++++--------------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index 3bf6930e5e19b..6684d64cea3e5 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -109,7 +109,7 @@ public struct Substring: Sendable { _invariantCheck() } - @usableFromInline + @usableFromInline // This used to be @inlinable before 5.7 @available(*, deprecated) // Use `init(_unchecked:)` in new code. internal init(_ slice: Slice) { let r = slice.base._guts.validateScalarRange( @@ -133,10 +133,10 @@ public struct Substring: Sendable { extension Substring { /// Returns the underlying string from which this substring was derived. 
@_alwaysEmitIntoClient - public var base: String { return _slice.base } + public var base: String { return _slice._base } @inlinable @inline(__always) - internal var _wholeGuts: _StringGuts { return base._guts } + internal var _wholeGuts: _StringGuts { return _slice._base._guts } @inlinable @inline(__always) internal var _offsetRange: Range { @@ -149,14 +149,14 @@ extension Substring { #else @usableFromInline @inline(never) @_effects(releasenone) internal func _invariantCheck() { - _internalInvariant(_slice.endIndex <= base._guts.endIndex) + _internalInvariant(_slice.endIndex <= _wholeGuts.endIndex) _internalInvariant( - base._guts.hasMatchingEncoding(_slice.startIndex) && - base._guts.hasMatchingEncoding(_slice.endIndex)) + _wholeGuts.hasMatchingEncoding(_slice.startIndex) && + _wholeGuts.hasMatchingEncoding(_slice.endIndex)) // Indices are always scalar aligned _internalInvariant( - _slice.startIndex == base._guts.scalarAlign(_slice.startIndex) && - _slice.endIndex == base._guts.scalarAlign(_slice.endIndex)) + _slice.startIndex == _wholeGuts.scalarAlign(_slice.startIndex) && + _slice.endIndex == _wholeGuts.scalarAlign(_slice.endIndex)) self.base._invariantCheck() } @@ -166,23 +166,21 @@ extension Substring { extension Substring { @inline(__always) internal func _validateScalarIndex(_ i: String.Index) -> String.Index { - _slice._base._guts.validateScalarIndex(i, from: startIndex, to: endIndex) + _wholeGuts.validateScalarIndex(i, from: startIndex, to: endIndex) } @inline(__always) internal func _validateInclusiveScalarIndex( _ i: String.Index ) -> String.Index { - _slice._base._guts.validateInclusiveScalarIndex( - i, from: startIndex, to: endIndex) + _wholeGuts.validateInclusiveScalarIndex(i, from: startIndex, to: endIndex) } @inline(__always) internal func _validateScalarRange( _ range: Range ) -> Range { - _slice._base._guts.validateScalarRange( - range, from: startIndex, to: endIndex) + _wholeGuts.validateScalarRange(range, from: startIndex, to: endIndex) } 
@inline(__always) @@ -198,10 +196,10 @@ extension Substring: StringProtocol { public typealias SubSequence = Substring @inlinable @inline(__always) - public var startIndex: Index { return _slice.startIndex } + public var startIndex: Index { _slice._startIndex } @inlinable @inline(__always) - public var endIndex: Index { return _slice.endIndex } + public var endIndex: Index { _slice._endIndex } public func index(after i: Index) -> Index { // Note: in Swift 5.6 and below, this method used to be inlinable, @@ -213,7 +211,7 @@ extension Substring: StringProtocol { let i = _roundDownToNearestCharacter(_validateScalarIndex(i)) let r = _uncheckedIndex(after: i) - return _slice.base._guts.internalMarkEncoding(r) + return _wholeGuts.internalMarkEncoding(r) } /// A version of `index(after:)` that assumes that the given index: @@ -271,7 +269,7 @@ extension Substring: StringProtocol { _precondition(i > startIndex, "Substring index is out of bounds") let r = _uncheckedIndex(before: i) - return _slice.base._guts.internalMarkEncoding(r) + return _wholeGuts.internalMarkEncoding(r) } /// A version of `index(before:)` that assumes that the given index: @@ -333,7 +331,7 @@ extension Substring: StringProtocol { i = _uncheckedIndex(before: i) } } - return _slice.base._guts.internalMarkEncoding(i) + return _wholeGuts.internalMarkEncoding(i) } public func index( @@ -352,7 +350,7 @@ extension Substring: StringProtocol { // Note: `limit` is intentionally not scalar aligned to ensure our behavior // exactly matches the documentation. 
- let limit = _slice.base._guts.ensureMatchingEncoding(limit) + let limit = _wholeGuts.ensureMatchingEncoding(limit) var i = _validateInclusiveScalarIndex(i) let start = i @@ -371,7 +369,7 @@ extension Substring: StringProtocol { } guard limit > start || i >= limit else { return nil } } - return _slice.base._guts.internalMarkEncoding(i) + return _wholeGuts.internalMarkEncoding(i) } public func distance(from start: Index, to end: Index) -> Int { @@ -418,7 +416,7 @@ extension Substring: StringProtocol { // `Character` boundary. let i = _validateScalarIndex(i) let distance = _characterStride(startingAt: i) - return _slice.base._guts.errorCorrectedCharacter( + return _wholeGuts.errorCorrectedCharacter( startingAt: i._encodedOffset, endingAt: i._encodedOffset &+ distance) } @@ -462,7 +460,7 @@ extension Substring: StringProtocol { // calculations to Unicode scalars (or below). In this implementation, we // are measuring things in UTF-8 code units, for efficiency. - if _slowPath(_slice._base._guts.isKnownUTF16) { + if _slowPath(_wholeGuts.isKnownUTF16) { // UTF-16 (i.e., foreign) string. The mutation will convert this to the // native UTF-8 encoding, so we need to do some extra work to preserve our // bounds. @@ -476,15 +474,15 @@ extension Substring: StringProtocol { let newUTF8Subrange = _slice._base._guts.replaceSubrange( subrange, with: newElements) - _internalInvariant(!_slice._base._guts.isKnownUTF16) + _internalInvariant(!_wholeGuts.isKnownUTF16) let newUTF8Count = oldUTF8Count + newUTF8Subrange.count - oldSubrangeCount // Get the character stride in the entire string, not just the substring. // (Characters in a substring may end beyond the bounds of it.) 
- let newStride = _slice.base._guts._opaqueCharacterStride( + let newStride = _wholeGuts._opaqueCharacterStride( startingAt: utf8StartOffset, - in: utf8StartOffset ..< _slice._base._guts.count) + in: utf8StartOffset ..< _wholeGuts.count) _slice._startIndex = String.Index( encodedOffset: utf8StartOffset, @@ -521,9 +519,9 @@ extension Substring: StringProtocol { { // Get the character stride in the entire string, not just the substring. // (Characters in a substring may end beyond the bounds of it.) - let newStride = _slice.base._guts._opaqueCharacterStride( + let newStride = _wholeGuts._opaqueCharacterStride( startingAt: newOffsetBounds.lowerBound, - in: newOffsetBounds.lowerBound ..< _slice._base._guts.count) + in: newOffsetBounds.lowerBound ..< _wholeGuts.count) _slice._startIndex = String.Index( encodedOffset: startIndex._encodedOffset, transcodedOffset: 0, @@ -679,7 +677,7 @@ extension Substring { if i == startIndex { return 0 } - return _slice.base._guts._opaqueCharacterStride( + return _wholeGuts._opaqueCharacterStride( endingAt: i._encodedOffset, in: _encodedOffsetRange) } } @@ -832,7 +830,7 @@ extension Substring { /// - Complexity: O(1) public init(_ content: UTF8View) { self = String( - content._slice.base._guts + content._slice._base._guts )[content.startIndex..) 
-> Substring.UnicodeScalarView { // TODO(lorentey): Review index validation _failEarlyRangeCheck(r, bounds: startIndex.., with replacement: C ) where C.Element == Element { // TODO(lorentey): Review index validation - let subrange = _slice.base._guts.validateScalarRange( + let subrange = _slice._base._guts.validateScalarRange( subrange, from: startIndex, to: endIndex) _slice.replaceSubrange(subrange, with: replacement) } From 0c0cbe290d6053bf18d27374c0a064c2cf3625c1 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Thu, 17 Mar 2022 21:25:53 -0700 Subject: [PATCH 09/83] =?UTF-8?q?[stdlib]=20=5FStringGutsSlice:=20Don?= =?UTF-8?q?=E2=80=99t=20mark=20methods=20on=20non-@usableFromInline=20inte?= =?UTF-8?q?rnal=20types=20@inlinable?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (This really ought to be diagnosed by the compiler.) --- stdlib/public/core/StringGutsSlice.swift | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/stdlib/public/core/StringGutsSlice.swift b/stdlib/public/core/StringGutsSlice.swift index dbd0e54744315..308b117846bab 100644 --- a/stdlib/public/core/StringGutsSlice.swift +++ b/stdlib/public/core/StringGutsSlice.swift @@ -36,31 +36,26 @@ internal struct _StringGutsSlice { self._offsetRange = offsetRange } - @inlinable internal var start: Int { @inline(__always) get { return _offsetRange.lowerBound } } - @inlinable + internal var end: Int { @inline(__always) get { return _offsetRange.upperBound } } - @inlinable internal var count: Int { @inline(__always) get { return _offsetRange.count } } - @inlinable internal var isNFCFastUTF8: Bool { @inline(__always) get { return _guts.isNFCFastUTF8 } } - @inlinable internal var isASCII: Bool { @inline(__always) get { return _guts.isASCII } } - @inlinable internal var isFastUTF8: Bool { @inline(__always) get { return _guts.isFastUTF8 } } @@ -74,7 +69,6 @@ internal struct _StringGutsSlice { } } - @inlinable internal var range: Range { 
@inline(__always) get { let lower = String.Index(_encodedOffset: _offsetRange.lowerBound) From 321284e9a9e182f4d82213cd3521726520009f1f Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Fri, 18 Mar 2022 00:23:32 -0700 Subject: [PATCH 10/83] [stdlib] Review & fix index validation during String index conversions - Validate that the index has the same encoding as the string - Validate that the index is within bounds --- stdlib/public/core/StringCharacterView.swift | 9 +++++++ .../public/core/StringIndexConversions.swift | 26 +++++++++---------- stdlib/public/core/StringUTF16View.swift | 2 ++ stdlib/public/core/StringUTF8View.swift | 9 ++++--- .../public/core/StringUnicodeScalarView.swift | 8 +++--- stdlib/public/core/Substring.swift | 17 ++++++++++++ stdlib/public/core/UnicodeHelpers.swift | 1 + 7 files changed, 52 insertions(+), 20 deletions(-) diff --git a/stdlib/public/core/StringCharacterView.swift b/stdlib/public/core/StringCharacterView.swift index 875723173b405..621dfb64ec905 100644 --- a/stdlib/public/core/StringCharacterView.swift +++ b/stdlib/public/core/StringCharacterView.swift @@ -43,6 +43,15 @@ extension String: BidirectionalCollection { return distance(from: startIndex, to: endIndex) } + /// Return true if and only if `i` is a valid index in this substring, + /// that is to say, it exactly addresses one of the `Character`s in it. + internal func _isValidIndex(_ i: Index) -> Bool { + return ( + _guts.hasMatchingEncoding(i) + && i._encodedOffset <= _guts.count + && _guts.isOnGraphemeClusterBoundary(i)) + } + /// Returns the position immediately after the given index. /// /// - Parameter i: A valid index of the collection. 
`i` must be less than diff --git a/stdlib/public/core/StringIndexConversions.swift b/stdlib/public/core/StringIndexConversions.swift index 97fad8d8f21ac..a0a63d8215cac 100644 --- a/stdlib/public/core/StringIndexConversions.swift +++ b/stdlib/public/core/StringIndexConversions.swift @@ -11,18 +11,6 @@ //===----------------------------------------------------------------------===// extension String.Index { - private init?( - _ idx: String.Index, _genericWithin target: S - ) { - guard target._wholeGuts.isOnGraphemeClusterBoundary(idx), - idx >= target.startIndex && idx <= target.endIndex - else { - return nil - } - - self = idx - } - /// Creates an index in the given string that corresponds exactly to the /// specified position. /// @@ -62,7 +50,8 @@ extension String.Index { /// of `target`. /// - target: The string referenced by the resulting index. public init?(_ sourcePosition: String.Index, within target: String) { - self.init(sourcePosition, _genericWithin: target) + guard target._isValidIndex(sourcePosition) else { return nil } + self = sourcePosition._characterAligned } /// Creates an index in the given string that corresponds exactly to the @@ -107,7 +96,16 @@ extension String.Index { public init?( _ sourcePosition: String.Index, within target: S ) { - self.init(sourcePosition, _genericWithin: target) + if let str = target as? String { + self.init(sourcePosition, within: str) + return + } + if let str = target as? 
Substring { + guard str._isValidIndex(sourcePosition) else { return nil } + self = sourcePosition + return + } + self.init(sourcePosition, within: String(target)) } /// Returns the position in the given UTF-8 view that corresponds exactly to diff --git a/stdlib/public/core/StringUTF16View.swift b/stdlib/public/core/StringUTF16View.swift index fb31351714e8a..0f7125adb0254 100644 --- a/stdlib/public/core/StringUTF16View.swift +++ b/stdlib/public/core/StringUTF16View.swift @@ -371,6 +371,8 @@ extension String.UTF16View.Index { public init?( _ idx: String.Index, within target: String.UTF16View ) { + guard target._guts.hasMatchingEncoding(idx) else { return nil } + guard idx._encodedOffset <= target._guts.count else { return nil } if _slowPath(target._guts.isForeign) { guard idx._foreignIsWithin(target) else { return nil } } else { diff --git a/stdlib/public/core/StringUTF8View.swift b/stdlib/public/core/StringUTF8View.swift index 0c0f5f16da51e..0270d038ff1ff 100644 --- a/stdlib/public/core/StringUTF8View.swift +++ b/stdlib/public/core/StringUTF8View.swift @@ -332,13 +332,16 @@ extension String.UTF8View.Index { /// - Parameters: /// - sourcePosition: A position in a `String` or one of its views. /// - target: The `UTF8View` in which to find the new position. - @inlinable public init?(_ idx: String.Index, within target: String.UTF8View) { + // Note: This method used to be inlinable until Swift 5.7. + + guard target._guts.hasMatchingEncoding(idx) else { return nil } + guard idx._encodedOffset <= target._guts.count else { return nil } if _slowPath(target._guts.isForeign) { guard idx._foreignIsWithin(target) else { return nil } } else { - // All indices, except sub-scalar UTF-16 indices pointing at trailing - // surrogates, are valid. + // All indices that are in range are valid, except sub-scalar UTF-16 + // indices pointing at trailing surrogates. 
guard idx.transcodedOffset == 0 else { return nil } } diff --git a/stdlib/public/core/StringUnicodeScalarView.swift b/stdlib/public/core/StringUnicodeScalarView.swift index b588e5ac7924c..9c2182fb127c2 100644 --- a/stdlib/public/core/StringUnicodeScalarView.swift +++ b/stdlib/public/core/StringUnicodeScalarView.swift @@ -363,8 +363,11 @@ extension String.UnicodeScalarIndex { _ sourcePosition: String.Index, within unicodeScalars: String.UnicodeScalarView ) { - // TODO(lorentey): Review index validation - guard unicodeScalars._guts.isOnUnicodeScalarBoundary(sourcePosition) else { + guard + unicodeScalars._guts.hasMatchingEncoding(sourcePosition), + sourcePosition._encodedOffset <= unicodeScalars._guts.count, + unicodeScalars._guts.isOnUnicodeScalarBoundary(sourcePosition) + else { return nil } self = sourcePosition @@ -391,7 +394,6 @@ extension String.UnicodeScalarIndex { /// an attempt to convert the position of a UTF-8 continuation byte /// returns `nil`. public func samePosition(in characters: String) -> String.Index? { - // TODO(lorentey): Review index validation return String.Index(self, within: characters) } } diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index 6684d64cea3e5..a249f36c7af39 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -189,6 +189,23 @@ extension Substring { ) -> String.Index { _wholeGuts.roundDownToNearestCharacter(i, from: startIndex, to: endIndex) } + + /// Return true if and only if `i` is a valid index in this substring, + /// that is to say, it exactly addresses one of the `Character`s in it. + /// + /// Note that if the start of the substring isn't `Character`-aligned in its + /// base string, then the substring and the base may not share valid indices. 
+ internal func _isValidIndex(_ i: Index) -> Bool { + guard + _wholeGuts.hasMatchingEncoding(i), + i >= startIndex, + i <= endIndex, + _wholeGuts.isOnUnicodeScalarBoundary(i) + else { + return false + } + return i == _roundDownToNearestCharacter(i) + } } extension Substring: StringProtocol { diff --git a/stdlib/public/core/UnicodeHelpers.swift b/stdlib/public/core/UnicodeHelpers.swift index 76e64309b8fdc..63ae5e9c8f4e3 100644 --- a/stdlib/public/core/UnicodeHelpers.swift +++ b/stdlib/public/core/UnicodeHelpers.swift @@ -242,6 +242,7 @@ extension _StringGuts { @usableFromInline @_effects(releasenone) internal func isOnUnicodeScalarBoundary(_ i: String.Index) -> Bool { + _internalInvariant(i._encodedOffset <= count) // TODO(String micro-performance): check isASCII // Beginning and end are always scalar aligned; mid-scalar never is From a3435704f0d20139330c6656b242e3cfe7494192 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Fri, 18 Mar 2022 00:37:37 -0700 Subject: [PATCH 11/83] =?UTF-8?q?[stdlib][NFC]=20String=20normalization:?= =?UTF-8?q?=20fix=20terminology=20(index=20=E2=9F=B9=20offset)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- stdlib/public/core/StringNormalization.swift | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/stdlib/public/core/StringNormalization.swift b/stdlib/public/core/StringNormalization.swift index 4f4dabe259b90..df5d8d3f336d6 100644 --- a/stdlib/public/core/StringNormalization.swift +++ b/stdlib/public/core/StringNormalization.swift @@ -42,25 +42,24 @@ extension Unicode.Scalar { } extension UnsafeBufferPointer where Element == UInt8 { - internal func hasNormalizationBoundary(before index: Int) -> Bool { - if index == 0 || index == count { + internal func hasNormalizationBoundary(before offset: Int) -> Bool { + if offset == 0 || offset == count { return true } - _internalInvariant(!UTF8.isContinuation(self[_unchecked: index])) + 
_internalInvariant(!UTF8.isContinuation(self[_unchecked: offset])) // Sub-300 latiny fast-path - if self[_unchecked: index] < 0xCC { return true } + if self[_unchecked: offset] < 0xCC { return true } - let cu = _decodeScalar(self, startingAt: index).0 + let cu = _decodeScalar(self, startingAt: offset).0 return cu._isNFCStarter } - internal func isOnUnicodeScalarBoundary(_ index: Int) -> Bool { - guard index < count else { - _internalInvariant(index == count) + internal func isOnUnicodeScalarBoundary(_ offset: Int) -> Bool { + guard offset < count else { + _internalInvariant(offset == count) return true } - return !UTF8.isContinuation(self[index]) + return !UTF8.isContinuation(self[offset]) } - } From 5a22ceb72b8264bf92d8dbf1c78726f7e762da6a Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Fri, 18 Mar 2022 00:46:04 -0700 Subject: [PATCH 12/83] [stdlib] _StringGutsSlice: Small adjustments --- stdlib/public/core/StringGutsSlice.swift | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/stdlib/public/core/StringGutsSlice.swift b/stdlib/public/core/StringGutsSlice.swift index 308b117846bab..a09cb6c96fb83 100644 --- a/stdlib/public/core/StringGutsSlice.swift +++ b/stdlib/public/core/StringGutsSlice.swift @@ -24,11 +24,13 @@ internal struct _StringGutsSlice { @inline(__always) internal init(_ guts: _StringGuts) { self._guts = guts - self._offsetRange = 0..) 
{ + _internalInvariant( + offsetRange.lowerBound >= 0 && offsetRange.upperBound <= guts.count) _internalInvariant( guts.isOnUnicodeScalarBoundary(offsetRange.lowerBound) && guts.isOnUnicodeScalarBoundary(offsetRange.upperBound)) From 2464aa681e697bcc0f9cd40c732ab94e59cfdfc0 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Tue, 22 Mar 2022 14:27:35 -0700 Subject: [PATCH 13/83] [stdlib] String: Ensure indices are marked scalar aligned before rounding down to Character --- stdlib/public/core/StringGraphemeBreaking.swift | 2 +- stdlib/public/core/Substring.swift | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/stdlib/public/core/StringGraphemeBreaking.swift b/stdlib/public/core/StringGraphemeBreaking.swift index 2102e1b66be01..77670f4eab3bd 100644 --- a/stdlib/public/core/StringGraphemeBreaking.swift +++ b/stdlib/public/core/StringGraphemeBreaking.swift @@ -158,7 +158,7 @@ extension _StringGuts { guard isOnUnicodeScalarBoundary(i) else { return false } - let nearest = roundDownToNearestCharacter(i) + let nearest = roundDownToNearestCharacter(i._scalarAligned) return i == nearest } } diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index a249f36c7af39..3840021467c81 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -204,7 +204,7 @@ extension Substring { else { return false } - return i == _roundDownToNearestCharacter(i) + return i == _roundDownToNearestCharacter(i._scalarAligned) } } From 6d400c81a27f4cbea2a52186b9b83636b144fd3c Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Tue, 22 Mar 2022 14:28:22 -0700 Subject: [PATCH 14/83] [stdlib] Substring: remove _encodedOffsetRange in favor of existing _offsetRange --- stdlib/public/core/Substring.swift | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index 3840021467c81..f12a9384ea57c 100644 --- 
a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -140,8 +140,9 @@ extension Substring { @inlinable @inline(__always) internal var _offsetRange: Range { - return Range( - _uncheckedBounds: (startIndex._encodedOffset, endIndex._encodedOffset)) + let lower = _slice._startIndex._encodedOffset + let upper = _slice._endIndex._encodedOffset + return Range(_uncheckedBounds: (lower, upper)) } #if !INTERNAL_CHECKS_ENABLED @@ -652,12 +653,6 @@ extension Substring { internal var _endIsCharacterAligned: Bool { endIndex._isCharacterAligned } - - internal var _encodedOffsetRange: Range { - let lower = _slice._startIndex._encodedOffset - let upper = _slice._endIndex._encodedOffset - return Range(_uncheckedBounds: (lower, upper)) - } } extension Substring { @@ -682,7 +677,7 @@ extension Substring { // I we don't have cached information, we can simply invoke the forward-only // grapheme breaking algorithm. return _wholeGuts._opaqueCharacterStride( - startingAt: i._encodedOffset, in: _encodedOffsetRange) + startingAt: i._encodedOffset, in: _offsetRange) } internal func _characterStride(endingAt i: Index) -> Int { @@ -695,7 +690,7 @@ extension Substring { if i == startIndex { return 0 } return _wholeGuts._opaqueCharacterStride( - endingAt: i._encodedOffset, in: _encodedOffsetRange) + endingAt: i._encodedOffset, in: _offsetRange) } } From c436654b6170d6422ed5ed55e83d5803f7841829 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Tue, 22 Mar 2022 14:36:56 -0700 Subject: [PATCH 15/83] [stdlib] Substring._characterStride(startingAt:): Limit stride to the correct bounds --- stdlib/public/core/Substring.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index f12a9384ea57c..fc9b9be1c5a8b 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -669,7 +669,7 @@ extension Substring { // substring's end index. 
This can happen if the substring's end isn't // also `Character` aligned, and someone passes us an index that comes // from the base string. - return Swift.min(d, _wholeGuts.count &- i._encodedOffset) + return Swift.min(d, endIndex._encodedOffset &- i._encodedOffset) } if i._encodedOffset == endIndex._encodedOffset { return 0 } From 0523b67e1f273a33ef7642369877fcc4866c3ce7 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Thu, 24 Mar 2022 16:31:18 -0700 Subject: [PATCH 16/83] [stdlib] String.index(_:offsetBy:limitedBy:): compare limit against original index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Whether the limit actually applies depends on how it’s ordered relative to the original index `i`, not the one we round down to the nearest Character. --- stdlib/public/core/StringCharacterView.swift | 6 ++++-- stdlib/public/core/Substring.swift | 10 ++++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/stdlib/public/core/StringCharacterView.swift b/stdlib/public/core/StringCharacterView.swift index 621dfb64ec905..c246c83e05c11 100644 --- a/stdlib/public/core/StringCharacterView.swift +++ b/stdlib/public/core/StringCharacterView.swift @@ -218,13 +218,15 @@ extension String: BidirectionalCollection { // and return a result that makes sense. // Note: `limit` is intentionally not scalar (or character-) aligned to - // ensure our behavior exactly matches the documentation above. + // ensure our behavior exactly matches the documentation above. We do need + // to ensure it has a matching encoding, though. The same goes for `start`, + // which is used to determine whether the limit applies at all. 
let limit = _guts.ensureMatchingEncoding(limit) + let start = _guts.ensureMatchingEncoding(i) var i = _guts.roundDownToNearestCharacter( _guts.validateInclusiveScalarIndex(i)) - let start = i if distance >= 0 { for _ in stride(from: 0, to: distance, by: 1) { guard limit < start || i < limit else { return nil } diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index fc9b9be1c5a8b..29db24950b176 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -366,12 +366,14 @@ extension Substring: StringProtocol { // breaks, in which case this function must still terminate without trapping // and return a result that makes sense. - // Note: `limit` is intentionally not scalar aligned to ensure our behavior - // exactly matches the documentation. + // Note: `limit` is intentionally not scalar (or character-) aligned to + // ensure our behavior exactly matches the documentation above. We do need + // to ensure it has a matching encoding, though. The same goes for `start`, + // which is used to determine whether the limit applies at all. let limit = _wholeGuts.ensureMatchingEncoding(limit) + let start = _wholeGuts.ensureMatchingEncoding(i) var i = _validateInclusiveScalarIndex(i) - let start = i if distance >= 0 { for _ in stride(from: 0, to: distance, by: 1) { guard limit < start || i < limit else { return nil } @@ -674,7 +676,7 @@ extension Substring { if i._encodedOffset == endIndex._encodedOffset { return 0 } - // I we don't have cached information, we can simply invoke the forward-only + // If we don't have cached information, we can simply invoke the forward-only // grapheme breaking algorithm. 
return _wholeGuts._opaqueCharacterStride( startingAt: i._encodedOffset, in: _offsetRange) From 99f693e4bae15de0ce2ca1d29c62abb74ca47bcd Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Thu, 24 Mar 2022 16:33:14 -0700 Subject: [PATCH 17/83] [test] stdlib/StringIndex: Review & extend with more cases --- test/stdlib/StringIndex.swift | 272 ++++++++++++++++++++++++---------- 1 file changed, 195 insertions(+), 77 deletions(-) diff --git a/test/stdlib/StringIndex.swift b/test/stdlib/StringIndex.swift index d45aa95fc3a98..6f72768089035 100644 --- a/test/stdlib/StringIndex.swift +++ b/test/stdlib/StringIndex.swift @@ -18,40 +18,44 @@ enum SimpleString: String { } let simpleStrings: [String] = [ - SimpleString.smallASCII.rawValue, - SimpleString.smallUnicode.rawValue, - SimpleString.largeASCII.rawValue, - SimpleString.largeUnicode.rawValue, - SimpleString.emoji.rawValue, - "", + SimpleString.smallASCII.rawValue, + SimpleString.smallUnicode.rawValue, + SimpleString.largeASCII.rawValue, + SimpleString.largeUnicode.rawValue, + SimpleString.emoji.rawValue, + "", ] -StringIndexTests.test("Wat") { - let s = "\u{1F1FA}\u{1F1F8}\u{1F1E8}\u{1F1E6}" // Regional indicators + - - s.unicodeScalars.indices.forEach { - print("\($0) -> U+\(String(s.unicodeScalars[$0].value, radix: 16, uppercase: true)) \(s.unicodeScalars[$0].properties.name ?? "\(s.unicodeScalars[$0].debugDescription)")") +func dumpIndices(_ string: String) { + print("-------------------------------------------------------------------") + print("String: \(String(reflecting: string))") + print("Characters:") + string.indices.forEach { i in + let char = string[i] + print(" \(i) -> \(String(reflecting: char))") + } + print("Unicode Scalars:") + string.unicodeScalars.indices.forEach { i in + let scalar = string.unicodeScalars[i] + let value = String(scalar.value, radix: 16, uppercase: true) + let padding = String(repeating: "0", count: max(0, 4 - value.count)) + let name = scalar.properties.name ?? 
"\(scalar.debugDescription)" + print(" \(i) -> U+\(padding)\(value) \(name)") + } + print("UTF-8:") + string.utf8.indices.forEach { i in + let code = string.utf8[i] + let value = String(code, radix: 16, uppercase: true) + let padding = value.count < 2 ? "0" : "" + print(" \(i) -> \(padding)\(value)") + } + print("UTF-16:") + string.utf16.indices.forEach { i in + let code = string.utf16[i] + let value = String(code, radix: 16, uppercase: true) + let padding = String(repeating: "0", count: 4 - value.count) + print(" \(i) -> \(padding)\(value)") } - - let i = s.unicodeScalars.index(s.unicodeScalars.startIndex, offsetBy: 1) // S - let j = s.unicodeScalars.index(s.unicodeScalars.startIndex, offsetBy: 3) // A - // Per SE-0180, `s[i..( by other: G, - body: (G.Index, Self.SubSequence) throws -> Void + body: (G.Index, Self.SubSequence, Int) throws -> Void ) rethrows -where G.Index == Self.Index + where G.Index == Self.Index { if other.isEmpty { assert(self.isEmpty) @@ -514,6 +518,7 @@ where G.Index == Self.Index } var i = other.startIndex var j = self.startIndex + var offset = 0 while i != other.endIndex { let current = i other.formIndex(after: &i) @@ -522,7 +527,8 @@ where G.Index == Self.Index self.formIndex(after: &j) } let end = j - try body(current, self[start ..< end]) + try body(current, self[start ..< end], offset) + offset += 1 } } } @@ -530,40 +536,50 @@ where G.Index == Self.Index extension String { /// Returns a dictionary mapping each valid index to the index that lies on /// the nearest scalar boundary, rounding down. 
- func scalarMap() -> [String.Index: String.Index] { - var map: [String.Index: String.Index] = [:] - self.utf8.forEachIndexGroup(by: self.unicodeScalars) { scalar, slice in - for i in slice.indices { map[i] = scalar } + func scalarMap() -> [String.Index: (index: String.Index, offset: Int)] { + var map: [String.Index: (index: String.Index, offset: Int)] = [:] + + self.utf8.forEachIndexGroup(by: self.unicodeScalars) { scalar, slice, offset in + for i in slice.indices { map[i] = (scalar, offset) } } - self.utf16.forEachIndexGroup(by: self.unicodeScalars) { scalar, slice in - for i in slice.indices { map[i] = scalar } + self.utf16.forEachIndexGroup(by: self.unicodeScalars) { scalar, slice, offset in + for i in slice.indices { map[i] = (scalar, offset) } } - self.forEachIndexGroup(by: self.unicodeScalars) { scalar, slice in - for i in slice.indices { map[i] = scalar } + self.forEachIndexGroup(by: self.unicodeScalars) { scalar, slice, offset in + for i in slice.indices { map[i] = (scalar, offset) } } - map[endIndex] = endIndex + map[endIndex] = (endIndex, self.unicodeScalars.count) return map } /// Returns a dictionary mapping each valid index to the index that lies on /// the nearest character boundary, rounding down. 
- func characterMap() -> [String.Index: String.Index] { - var map: [String.Index: String.Index] = [:] - self.utf8.forEachIndexGroup(by: self) { scalar, slice in - for i in slice.indices { map[i] = scalar } + func characterMap() -> [String.Index: (index: String.Index, offset: Int)] { + var map: [String.Index: (index: String.Index, offset: Int)] = [:] + self.utf8.forEachIndexGroup(by: self) { char, slice, offset in + for i in slice.indices { map[i] = (char, offset) } } - self.utf16.forEachIndexGroup(by: self) { scalar, slice in - for i in slice.indices { map[i] = scalar } + self.utf16.forEachIndexGroup(by: self) { char, slice, offset in + for i in slice.indices { map[i] = (char, offset) } } - self.unicodeScalars.forEachIndexGroup(by: self) { scalar, slice in - for i in slice.indices { map[i] = scalar } + self.unicodeScalars.forEachIndexGroup(by: self) { char, slice, offset in + for i in slice.indices { map[i] = (char, offset) } } - map[endIndex] = endIndex + map[endIndex] = (endIndex, count) return map } } StringIndexTests.test("Extra Exhaustive Index Interchange") { + guard #available(SwiftStdlib 5.7, *) else { + // Index navigation in 5.7 always rounds input indices down to the nearest + // Character, so that we always have a well-defined distance between + // indices, even if they aren't valid. + // + // 5.6 and below did not behave consistently in this case. + return + } + func check( _ string: String, stackTrace: SourceLocStack = SourceLocStack(), @@ -571,6 +587,8 @@ StringIndexTests.test("Extra Exhaustive Index Interchange") { file: String = #file, line: UInt = #line ) { + dumpIndices(string) + let scalarMap = string.scalarMap() let characterMap = string.characterMap() @@ -587,26 +605,32 @@ StringIndexTests.test("Extra Exhaustive Index Interchange") { ) -> Int { let ci = characterMap[i]! let cj = characterMap[j]! + return cj.offset - ci.offset + } + + func referenceScalarDistance( + from i: String.Index, to j: String.Index + ) -> Int { let si = scalarMap[i]! 
let sj = scalarMap[j]! - var d = string.distance(from: ci, to: cj) - if si < sj { - if ci == cj { d = 1 } - else if cj < sj { d += 1 } - } else if si > sj { - if ci == cj { d = -1 } - else if ci < si { d -= 1 } - } - return d + return sj.offset - si.offset } for i in allIndices { for j in allIndices { - let si = scalarMap[i]! - let sj = scalarMap[j]! - let characterDistance = referenceCharacterDistance(from: i, to: j) - let scalarDistance = string.unicodeScalars.distance(from: si, to: sj) + let scalarDistance = referenceScalarDistance(from: i, to: j) + + let substringDistance: Int + if i <= j { + // The substring `string[i.. U+\(value) \(name)") - } - check(str) #if _runtime(_ObjC) - let nsstr = NSString(utf8String: s.utf8Start)! + let unichars = Array(str.utf16) + let nsstr = NSString(characters: unichars, length: unichars.count) check(nsstr as String) #endif } } +StringIndexTests.test("Global vs local grapheme cluster boundaries") { + guard #available(SwiftStdlib 5.7, *) else { + // Index navigation in 5.7 always rounds input indices down to the nearest + // Character, so that we always have a well-defined distance between + // indices, even if they aren't valid. + // + // 5.6 and below did not behave consistently in this case. + return + } + + let str = "a🇺🇸🇨🇦b" + // U+0061 LATIN SMALL LETTER A + // U+1F1FA REGIONAL INDICATOR SYMBOL LETTER U + // U+1F1F8 REGIONAL INDICATOR SYMBOL LETTER S + // U+1F1E8 REGIONAL INDICATOR SYMBOL LETTER C + // U+1F1E6 REGIONAL INDICATOR SYMBOL LETTER A + // U+0062 LATIN SMALL LETTER B + + let c = Array(str.indices) + [str.endIndex] + let s = Array(str.unicodeScalars.indices) + [str.unicodeScalars.endIndex] + let u8 = Array(str.utf8.indices) + [str.utf8.endIndex] + let u16 = Array(str.utf16.indices) + [str.utf16.endIndex] + + // Index navigation must always round the input index down to the nearest + // Character. 
+ + expectEqual(str.count, 4) + expectEqual(str.index(after: c[0]), c[1]) + expectEqual(str.index(after: c[1]), c[2]) + expectEqual(str.index(after: c[2]), c[3]) + expectEqual(str.index(after: c[3]), c[4]) + + expectEqual(str.index(before: c[4]), c[3]) + expectEqual(str.index(before: c[3]), c[2]) + expectEqual(str.index(before: c[2]), c[1]) + expectEqual(str.index(before: c[1]), c[0]) + + // Scalars + expectEqual(str.unicodeScalars.count, 6) + expectEqual(str.index(after: s[0]), s[1]) + expectEqual(str.index(after: s[1]), s[3]) + expectEqual(str.index(after: s[2]), s[3]) // s[2] ≅ s[1] + expectEqual(str.index(after: s[3]), s[5]) + expectEqual(str.index(after: s[4]), s[5]) // s[4] ≅ s[3] + expectEqual(str.index(after: s[5]), s[6]) + + expectEqual(str.index(before: s[6]), s[5]) + expectEqual(str.index(before: s[5]), s[3]) + expectEqual(str.index(before: s[4]), s[1]) // s[4] ≅ s[3] + expectEqual(str.index(before: s[3]), s[1]) + expectEqual(str.index(before: s[2]), s[0]) // s[2] ≅ s[1] + expectEqual(str.index(before: s[1]), s[0]) + + dumpIndices(str) + // UTF-8 + expectEqual(str.utf8.count, 18) + expectEqual(str.index(after: u8[0]), u8[1]) + for i in 1 ..< 9 { // u8[i] ≅ u8[1] + expectEqual(str.index(after: u8[i]), u8[9]) + } + for i in 9 ..< 17 { // u8[i] ≅ u8[9] + expectEqual(str.index(after: u8[i]), u8[17]) + } + expectEqual(str.index(after: u8[17]), u8[18]) + + // UTF-16 + expectEqual(str.utf16.count, 10) + expectEqual(str.index(after: u16[0]), u16[1]) + expectEqual(str.index(after: u16[1]), u16[5]) + expectEqual(str.index(after: u16[2]), u16[5]) // u16[2] ≅ u16[1] + expectEqual(str.index(after: u16[3]), u16[5]) // u16[3] ≅ u16[1] + expectEqual(str.index(after: u16[4]), u16[5]) // u16[4] ≅ u16[1] + expectEqual(str.index(after: u16[5]), u16[9]) + expectEqual(str.index(after: u16[6]), u16[9]) // u16[6] ≅ u16[5] + expectEqual(str.index(after: u16[7]), u16[9]) // u16[7] ≅ u16[5] + expectEqual(str.index(after: u16[8]), u16[9]) // u16[8] ≅ u16[5] + expectEqual(str.index(after: u16[9]), u16[10]) + +
let i = s[2] // second scalar of US flag + let j = s[4] // second scalar of CA flag + // However, subscripting should only round down to the nearest scalar. + // Per SE-0180, `s[i.. Date: Thu, 24 Mar 2022 16:35:27 -0700 Subject: [PATCH 18/83] [stdlib][NFC] Update some outdated comments --- stdlib/public/core/StringCharacterView.swift | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/stdlib/public/core/StringCharacterView.swift b/stdlib/public/core/StringCharacterView.swift index c246c83e05c11..851d0c4dbbedc 100644 --- a/stdlib/public/core/StringCharacterView.swift +++ b/stdlib/public/core/StringCharacterView.swift @@ -10,14 +10,10 @@ // //===----------------------------------------------------------------------===// // -// String is-not-a Sequence or Collection, but it exposes a -// collection of characters. +// String is a collection of characters. // //===----------------------------------------------------------------------===// -// FIXME(ABI)#70 : The character string view should have a custom iterator type -// to allow performance optimizations of linear traversals. 
- import SwiftShims extension String: BidirectionalCollection { From 98d595947824ac7904453e17a98bdd40cf6eee90 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Thu, 24 Mar 2022 16:36:31 -0700 Subject: [PATCH 19/83] [stdlib] String.Index: Adjust printing --- stdlib/public/core/StringIndex.swift | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/stdlib/public/core/StringIndex.swift b/stdlib/public/core/StringIndex.swift index a48077ed49313..869ef54c94fed 100644 --- a/stdlib/public/core/StringIndex.swift +++ b/stdlib/public/core/StringIndex.swift @@ -473,24 +473,26 @@ extension String.Index { @inline(never) public var description: String { var d = "String.Index(" + d += "offset: \(_encodedOffset)" + if transcodedOffset != 0 { + d += "+\(transcodedOffset)" + } + + d += ", encoding: " switch (__isUTF8, __isUTF16) { case (false, false): d += "unknown" - case (true, false): d += "UTF-8" - case (false, true): d += "UTF-16" - case (true, true): d += "universal" + case (true, false): d += "utf8" + case (false, true): d += "utf16" + case (true, true): d += "any" } - d += " offset: \(_encodedOffset)" - if transcodedOffset != 0 { - d += "(+\(transcodedOffset))" + if _isCharacterAligned { + d += ", aligned: character" + } else if _isScalarAligned { + d += ", aligned: scalar" } if let stride = characterStride { d += ", stride: \(stride)" } - if _isCharacterAligned { - d += ", character aligned" - } else if _isScalarAligned { - d += ", scalar aligned" - } d += ")" return d } From 90fee621b61108e2cb06e58e270490ceec4331bd Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Thu, 24 Mar 2022 16:36:52 -0700 Subject: [PATCH 20/83] [stdlib] String.UTF16View: Mark foreign indices as UTF-16 encoded --- stdlib/public/core/StringUTF16View.swift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stdlib/public/core/StringUTF16View.swift b/stdlib/public/core/StringUTF16View.swift index 0f7125adb0254..31621b45949b9 100644 --- 
a/stdlib/public/core/StringUTF16View.swift +++ b/stdlib/public/core/StringUTF16View.swift @@ -437,14 +437,14 @@ extension String.UTF16View { @_effects(releasenone) internal func _foreignIndex(after i: Index) -> Index { _internalInvariant(_guts.isForeign) - return i.strippingTranscoding.nextEncoded + return i.strippingTranscoding.nextEncoded._knownUTF16 } @usableFromInline @inline(never) @_effects(releasenone) internal func _foreignIndex(before i: Index) -> Index { _internalInvariant(_guts.isForeign) - return i.strippingTranscoding.priorEncoded + return i.strippingTranscoding.priorEncoded._knownUTF16 } @usableFromInline @inline(never) From 298899264d1b9b712c72104e6f7e9ddc10748343 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Thu, 24 Mar 2022 16:38:58 -0700 Subject: [PATCH 21/83] [stdlib] String: Add some extra invariant checks --- stdlib/public/core/StringGraphemeBreaking.swift | 3 +++ 1 file changed, 3 insertions(+) diff --git a/stdlib/public/core/StringGraphemeBreaking.swift b/stdlib/public/core/StringGraphemeBreaking.swift index 77670f4eab3bd..d3e0f0d674428 100644 --- a/stdlib/public/core/StringGraphemeBreaking.swift +++ b/stdlib/public/core/StringGraphemeBreaking.swift @@ -187,6 +187,7 @@ extension _StringGuts { startingAt i: Int, in bounds: Range ) -> Int { + _internalInvariant(bounds.lowerBound >= 0 && bounds.upperBound <= count) _internalInvariant(bounds.contains(i)) if _slowPath(isForeign) { return _foreignOpaqueCharacterStride(startingAt: i, in: bounds) @@ -201,6 +202,7 @@ extension _StringGuts { } } + _internalInvariant(nextIdx >= i && nextIdx <= bounds.upperBound) return nextIdx &- i } @@ -241,6 +243,7 @@ extension _StringGuts { } } + _internalInvariant(bounds.contains(previousIdx)) return i &- previousIdx } From 67f01a11599a6a3eb0fd5f5eb79701cbb1005c2e Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Thu, 24 Mar 2022 18:33:46 -0700 Subject: [PATCH 22/83] [stdlib] Stop inlining String.subscript MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit `index(after:)`/`index(before:)` aren’t inlinable, so I don’t expect force-inlining the subscript has much benefit. --- stdlib/public/core/StringCharacterView.swift | 19 +++++++++---------- stdlib/public/core/StringGuts.swift | 7 ------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/stdlib/public/core/StringCharacterView.swift b/stdlib/public/core/StringCharacterView.swift index 851d0c4dbbedc..be27f67629b89 100644 --- a/stdlib/public/core/StringCharacterView.swift +++ b/stdlib/public/core/StringCharacterView.swift @@ -302,14 +302,9 @@ extension String: BidirectionalCollection { /// /// - Parameter i: A valid index of the string. `i` must be less than the /// string's end index. - @inlinable @inline(__always) // TODO(lorentey): Consider removing these. If - // `index(after:)` isn't inlinable, does it - // really matter if this one is? (Potential - // optimizations notwithstanding.) `subscript` - // being inlinable forces a bunch of new - // additions to be _aEIC, even though they ought - // to be internal. public subscript(i: Index) -> Character { + // Prior to Swift 5.7, this function used to be inlinable. + // Note: SE-0180 requires us not to round `i` down to the nearest whole // `Character` boundary. let i = _guts.validateScalarIndex(i) @@ -331,8 +326,10 @@ extension String: BidirectionalCollection { /// This method is called from inlinable `subscript` implementations in /// current and previous versions of the stdlib, wich require this contract /// not to be violated. - @inlinable @inline(__always) + @usableFromInline + @inline(__always) internal func _characterStride(startingAt i: Index) -> Int { + // Prior to Swift 5.7, this function used to be inlinable. 
_internalInvariant_5_1(i._isScalarAligned) // Fast check if it's already been measured, otherwise check resiliently @@ -343,8 +340,10 @@ extension String: BidirectionalCollection { return _guts._opaqueCharacterStride(startingAt: i._encodedOffset) } - @inlinable @inline(__always) + @usableFromInline + @inline(__always) internal func _characterStride(endingAt i: Index) -> Int { + // Prior to Swift 5.7, this function used to be inlinable. _internalInvariant_5_1(i._isScalarAligned) if i == startIndex { return 0 } @@ -371,8 +370,8 @@ extension String { self._guts = guts } - @inlinable public mutating func next() -> Character? { + // Prior to Swift 5.7, this function used to be inlinable. guard _fastPath(_position < _end) else { return nil } let len = _guts._opaqueCharacterStride(startingAt: _position) diff --git a/stdlib/public/core/StringGuts.swift b/stdlib/public/core/StringGuts.swift index 318f38d636974..d7ac51671ea80 100644 --- a/stdlib/public/core/StringGuts.swift +++ b/stdlib/public/core/StringGuts.swift @@ -367,14 +367,12 @@ extension _StringGuts { /// not set the flags that this method relies on. However, false positives /// cannot happen: if this method detects a mismatch, then it is guaranteed to /// be a real one. - @_alwaysEmitIntoClient // TODO(lorentey): Should this remain internal? @inline(__always) internal func ensureMatchingEncoding(_ i: String.Index) -> String.Index { if _fastPath(!isForeign && i._canBeUTF8) { return i } return _slowEnsureMatchingEncoding(i) } - @_alwaysEmitIntoClient // TODO(lorentey): Should this remain internal? @inline(never) internal func _slowEnsureMatchingEncoding(_ i: String.Index) -> String.Index { _internalInvariant(isForeign || !i._canBeUTF8) @@ -430,7 +428,6 @@ extension _StringGuts { /// - has an encoding that matches this string, /// - is within the bounds of this string, and /// - is aligned on a scalar boundary. 
- @_alwaysEmitIntoClient internal func validateScalarIndex(_ i: String.Index) -> String.Index { let i = ensureMatchingEncoding(i) _precondition(i._encodedOffset < count, "String index is out of bounds") @@ -463,7 +460,6 @@ extension _StringGuts { /// - has an encoding that matches this string, /// - is within the bounds of this string (including the `endIndex`), and /// - is aligned on a scalar boundary. - @_alwaysEmitIntoClient internal func validateInclusiveScalarIndex( _ i: String.Index ) -> String.Index { @@ -479,7 +475,6 @@ extension _StringGuts { /// - has an encoding that matches this string, /// - is within the bounds of this string (including the `endIndex`), and /// - is aligned on a scalar boundary. - @_alwaysEmitIntoClient internal func validateInclusiveScalarIndex( _ i: String.Index, from start: String.Index, @@ -499,7 +494,6 @@ extension _StringGuts { /// - have an encoding that matches this string, /// - are within the bounds of this string, and /// - are aligned on a scalar boundary. - @_alwaysEmitIntoClient internal func validateScalarRange( _ range: Range ) -> Range { @@ -533,7 +527,6 @@ extension _StringGuts { /// - have an encoding that matches this string, /// - are within `start ..< end`, and /// - are aligned on a scalar boundary. 
- @_alwaysEmitIntoClient internal func validateScalarRange( _ range: Range, from start: String.Index, From d58811262d08ff651320f1c74deff51ce536b928 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Thu, 24 Mar 2022 20:58:53 -0700 Subject: [PATCH 23/83] [stdlib] String.UnicodeScalarView: Review index validation --- stdlib/public/core/StringGuts.swift | 5 + .../public/core/StringUnicodeScalarView.swift | 121 +++++++++++++----- stdlib/public/core/Substring.swift | 66 +++++----- 3 files changed, 130 insertions(+), 62 deletions(-) diff --git a/stdlib/public/core/StringGuts.swift b/stdlib/public/core/StringGuts.swift index d7ac51671ea80..e92658902c67b 100644 --- a/stdlib/public/core/StringGuts.swift +++ b/stdlib/public/core/StringGuts.swift @@ -367,12 +367,14 @@ extension _StringGuts { /// not set the flags that this method relies on. However, false positives /// cannot happen: if this method detects a mismatch, then it is guaranteed to /// be a real one. + @_alwaysEmitIntoClient @inline(__always) internal func ensureMatchingEncoding(_ i: String.Index) -> String.Index { if _fastPath(!isForeign && i._canBeUTF8) { return i } return _slowEnsureMatchingEncoding(i) } + @_alwaysEmitIntoClient @inline(never) internal func _slowEnsureMatchingEncoding(_ i: String.Index) -> String.Index { _internalInvariant(isForeign || !i._canBeUTF8) @@ -428,6 +430,7 @@ extension _StringGuts { /// - has an encoding that matches this string, /// - is within the bounds of this string, and /// - is aligned on a scalar boundary. + @_alwaysEmitIntoClient internal func validateScalarIndex(_ i: String.Index) -> String.Index { let i = ensureMatchingEncoding(i) _precondition(i._encodedOffset < count, "String index is out of bounds") @@ -441,6 +444,7 @@ extension _StringGuts { /// - has an encoding that matches this string, /// - is within `start ..< end`, and /// - is aligned on a scalar boundary. 
+ @_alwaysEmitIntoClient internal func validateScalarIndex( _ i: String.Index, from start: String.Index, @@ -460,6 +464,7 @@ extension _StringGuts { /// - has an encoding that matches this string, /// - is within the bounds of this string (including the `endIndex`), and /// - is aligned on a scalar boundary. + @_alwaysEmitIntoClient internal func validateInclusiveScalarIndex( _ i: String.Index ) -> String.Index { diff --git a/stdlib/public/core/StringUnicodeScalarView.swift b/stdlib/public/core/StringUnicodeScalarView.swift index 9c2182fb127c2..424358cfdd09d 100644 --- a/stdlib/public/core/StringUnicodeScalarView.swift +++ b/stdlib/public/core/StringUnicodeScalarView.swift @@ -106,42 +106,38 @@ extension String.UnicodeScalarView: BidirectionalCollection { /// - Precondition: The next location exists. @inlinable @inline(__always) public func index(after i: Index) -> Index { - // TODO(String performance): isASCII fast-path - - // TODO(lorentey): Review index validation - _precondition(i < endIndex, "String index is out of bounds") - let i = _guts.scalarAlign(i) + let i = _guts.validateScalarIndex(i) + return _uncheckedIndex(after: i) + } + @_alwaysEmitIntoClient + @inline(__always) + internal func _uncheckedIndex(after i: Index) -> Index { + // TODO(String performance): isASCII fast-path if _fastPath(_guts.isFastUTF8) { let len = _guts.fastUTF8ScalarLength(startingAt: i._encodedOffset) return i.encoded(offsetBy: len)._scalarAligned._knownUTF8 } - return _foreignIndex(after: i) } - @_alwaysEmitIntoClient // Swift 5.1 bug fix - public func distance(from start: Index, to end: Index) -> Int { - // TODO(lorentey): Review index validation - return _distance(from: _guts.scalarAlign(start), to: _guts.scalarAlign(end)) - } - /// Returns the previous consecutive location before `i`. /// /// - Precondition: The previous location exists. 
@inlinable @inline(__always) public func index(before i: Index) -> Index { - // TODO(lorentey): Review index validation - // TODO(String performance): isASCII fast-path - - // Note: bounds checking in `index(before:)` is tricky as scalar aligning an - // index may need to access storage, but it may also move it closer towards - // the `startIndex`. Therefore, we must check against the `endIndex` before - // aligning, but we need to delay the `i > startIndex` check until after. - _precondition(i <= endIndex, "String index is out of bounds") - let i = _guts.scalarAlign(i) + let i = _guts.validateInclusiveScalarIndex(i) + // Note: Aligning an index may move it closer towards the `startIndex`, so + // the `i > startIndex` check needs to come after rounding. _precondition(i > startIndex, "String index is out of bounds") + return _uncheckedIndex(before: i) + } + + @_alwaysEmitIntoClient + @inline(__always) + internal func _uncheckedIndex(before i: Index) -> Index { + // TODO(String performance): isASCII fast-path if _fastPath(_guts.isFastUTF8) { let len = _guts.withFastUTF8 { utf8 in _utf8ScalarLength(utf8, endingAt: i._encodedOffset) @@ -171,11 +167,80 @@ extension String.UnicodeScalarView: BidirectionalCollection { /// must be less than the view's end index. 
@inlinable @inline(__always) public subscript(position: Index) -> Unicode.Scalar { - // TODO(lorentey): Review index validation - String(_guts)._boundsCheck(position) - let i = _guts.scalarAlign(position) + let i = _guts.validateScalarIndex(position) return _guts.errorCorrectedScalar(startingAt: i._encodedOffset).0 } + + @_alwaysEmitIntoClient // Swift 5.1 bug fix + public func distance(from start: Index, to end: Index) -> Int { + let start = _guts.validateInclusiveScalarIndex(start) + let end = _guts.validateInclusiveScalarIndex(end) + + var i = start + var count = 0 + if i < end { + while i < end { + count += 1 + i = _uncheckedIndex(after: i) + } + } + else if i > end { + while i > end { + count -= 1 + i = _uncheckedIndex(before: i) + } + } + return count + } + + @_alwaysEmitIntoClient + public func index(_ i: Index, offsetBy distance: Int) -> Index { + var i = _guts.validateInclusiveScalarIndex(i) + + if distance >= 0 { + for _ in stride(from: 0, to: distance, by: 1) { + _precondition(i._encodedOffset < _guts.count, "String index is out of bounds") + i = _uncheckedIndex(after: i) + } + } else { + for _ in stride(from: 0, to: distance, by: -1) { + _precondition(i._encodedOffset > 0, "String index is out of bounds") + i = _uncheckedIndex(before: i) + } + } + return _guts.markEncoding(i) + } + + @_alwaysEmitIntoClient + public func index( + _ i: Index, offsetBy distance: Int, limitedBy limit: Index + ) -> Index? { + // Note: `limit` is intentionally not scalar aligned to ensure our behavior + // exactly matches the documentation above. We do need to ensure it has a + // matching encoding, though. The same goes for `start`, which is used to + // determine whether the limit applies at all. 
+ let limit = _guts.ensureMatchingEncoding(limit) + let start = _guts.ensureMatchingEncoding(i) + + var i = _guts.validateInclusiveScalarIndex(i) + + if distance >= 0 { + for _ in stride(from: 0, to: distance, by: 1) { + guard limit < start || i < limit else { return nil } + _precondition(i._encodedOffset < _guts.count, "String index is out of bounds") + i = _uncheckedIndex(after: i) + } + guard limit < start || i <= limit else { return nil } + } else { + for _ in stride(from: 0, to: distance, by: -1) { + guard limit > start || i > limit else { return nil } + _precondition(i._encodedOffset > 0, "String index is out of bounds") + i = _uncheckedIndex(before: i) + } + guard limit > start || i >= limit else { return nil } + } + return _guts.markEncoding(i) + } } extension String.UnicodeScalarView { @@ -318,9 +383,8 @@ extension String.UnicodeScalarView: RangeReplaceableCollection { _ bounds: Range, with newElements: C ) where C: Collection, C.Element == Unicode.Scalar { - // TODO(lorentey): Review index validation // TODO(String performance): Skip extra String and Array allocation - + let bounds = _guts.validateScalarRange(bounds) let utf8Replacement = newElements.flatMap { String($0).utf8 } let replacement = utf8Replacement.withUnsafeBufferPointer { return String._uncheckedFromUTF8($0) @@ -423,9 +487,8 @@ extension String.UnicodeScalarView { @available(swift, introduced: 4) public subscript(r: Range) -> String.UnicodeScalarView.SubSequence { - // TODO(lorentey): Review index validation - _failEarlyRangeCheck(r, bounds: startIndex.. /// Creates an instance that slices `base` at `_bounds`. - @inlinable + internal init( + _unchecked base: String.UnicodeScalarView, bounds: Range + ) { + _slice = Slice(base: base, bounds: bounds) + } + + /// Creates an instance that slices `base` at `_bounds`. + @usableFromInline // This used to be inlinable before 5.7 + @available(*, deprecated) // Use `init(_unchecked:)` in new code. 
internal init(_ base: String.UnicodeScalarView, _bounds: Range) { let start = base._guts.scalarAlign(_bounds.lowerBound) let end = base._guts.scalarAlign(_bounds.upperBound) - _slice = Slice( - base: String(base._guts).unicodeScalars, - bounds: Range(_uncheckedBounds: (start, end))) + _slice = Slice(base: base, bounds: Range(_uncheckedBounds: (start, end))) } + + @_alwaysEmitIntoClient + @inline(__always) + internal var _wholeGuts: _StringGuts { _slice._base._guts } } } @@ -1038,87 +1048,77 @@ extension Substring.UnicodeScalarView: BidirectionalCollection { // // Plumb slice operations through // - @inlinable - public var startIndex: Index { return _slice.startIndex } + @inlinable @inline(__always) + public var startIndex: Index { _slice._startIndex } - @inlinable - public var endIndex: Index { return _slice.endIndex } + @inlinable @inline(__always) + public var endIndex: Index { _slice._endIndex } @inlinable public subscript(index: Index) -> Element { - // TODO(lorentey): Review index validation - return _slice[index] + let index = _wholeGuts.validateScalarIndex( + index, from: startIndex, to: endIndex) + return _wholeGuts.errorCorrectedScalar(startingAt: index._encodedOffset).0 } @inlinable public var indices: Indices { - // TODO(lorentey): Review index validation return _slice.indices } @inlinable public func index(after i: Index) -> Index { - // TODO(lorentey): Review index validation - return _slice.index(after: i) + _slice._base.index(after: i) } @inlinable public func formIndex(after i: inout Index) { - // TODO(lorentey): Review index validation - _slice.formIndex(after: &i) + _slice._base.formIndex(after: &i) } @inlinable public func index(_ i: Index, offsetBy n: Int) -> Index { - // TODO(lorentey): Review index validation - return _slice.index(i, offsetBy: n) + _slice._base.index(i, offsetBy: n) } @inlinable public func index( _ i: Index, offsetBy n: Int, limitedBy limit: Index ) -> Index? 
{ - // TODO(lorentey): Review index validation - return _slice.index(i, offsetBy: n, limitedBy: limit) + _slice._base.index(i, offsetBy: n, limitedBy: limit) } @inlinable public func distance(from start: Index, to end: Index) -> Int { - // TODO(lorentey): Review index validation - return _slice.distance(from: start, to: end) + _slice._base.distance(from: start, to: end) } @inlinable public func _failEarlyRangeCheck(_ index: Index, bounds: Range) { - // TODO(lorentey): Review index validation - _slice._failEarlyRangeCheck(index, bounds: bounds) + _slice._base._failEarlyRangeCheck(index, bounds: bounds) } @inlinable public func _failEarlyRangeCheck( _ range: Range, bounds: Range ) { - // TODO(lorentey): Review index validation - _slice._failEarlyRangeCheck(range, bounds: bounds) + _slice._base._failEarlyRangeCheck(range, bounds: bounds) } @inlinable public func index(before i: Index) -> Index { - // TODO(lorentey): Review index validation - return _slice.index(before: i) + _slice._base.index(before: i) } @inlinable public func formIndex(before i: inout Index) { - // TODO(lorentey): Review index validation - _slice.formIndex(before: &i) + _slice._base.formIndex(before: &i) } - @inlinable public subscript(r: Range) -> Substring.UnicodeScalarView { - // TODO(lorentey): Review index validation - _failEarlyRangeCheck(r, bounds: startIndex.. Date: Mon, 28 Mar 2022 19:04:14 -0700 Subject: [PATCH 24/83] [stdlib] String.unicodeScalars: Add a _modify accessor This will eliminate unnecessary CoW copies when calling mutating Unicode scalar view methods directly through this property. 
--- stdlib/public/core/StringUnicodeScalarView.swift | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/stdlib/public/core/StringUnicodeScalarView.swift b/stdlib/public/core/StringUnicodeScalarView.swift index 424358cfdd09d..1b9abea121c78 100644 --- a/stdlib/public/core/StringUnicodeScalarView.swift +++ b/stdlib/public/core/StringUnicodeScalarView.swift @@ -319,6 +319,14 @@ extension String { public var unicodeScalars: UnicodeScalarView { @inline(__always) get { return UnicodeScalarView(_guts) } @inline(__always) set { _guts = newValue._guts } + + @_alwaysEmitIntoClient @inline(__always) // 5.7 + _modify { + var view = self.unicodeScalars + self = "" + defer { self._guts = view._guts } + yield &view + } } } From 5f6c300adb9d39dc5425db428c6c5bb39757a758 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Mon, 28 Mar 2022 20:14:21 -0700 Subject: [PATCH 25/83] [stdlib] String.UTF8View: Review/fix index validation Also, in UTF-8 slices, forward collection methods to the base view instead of `Slice`, to make behavior a bit easier to understand. (There is no need to force readers to page in `Slice` implementations _in addition to_ whatever the base view is doing.) --- stdlib/public/core/StringGuts.swift | 42 +++++++++++++++++ stdlib/public/core/StringUTF8View.swift | 37 +++++++++++++-- stdlib/public/core/Substring.swift | 62 ++++++++++++++----------- 3 files changed, 110 insertions(+), 31 deletions(-) diff --git a/stdlib/public/core/StringGuts.swift b/stdlib/public/core/StringGuts.swift index e92658902c67b..9d13b48542f92 100644 --- a/stdlib/public/core/StringGuts.swift +++ b/stdlib/public/core/StringGuts.swift @@ -456,7 +456,9 @@ extension _StringGuts { _precondition(i >= start && i < end, "Substring index is out of bounds") return scalarAlign(i) } +} +extension _StringGuts { /// Validate `i` and adjust its position toward the start, returning the /// resulting index or trapping as appropriate. 
If this function returns, then /// the returned value @@ -491,7 +493,47 @@ extension _StringGuts { _precondition(i >= start && i <= end, "Substring index is out of bounds") return scalarAlign(i) } +} + +extension _StringGuts { + @_alwaysEmitIntoClient + internal func validateSubscalarRange( + _ range: Range + ) -> Range { + let upper = ensureMatchingEncoding(range.upperBound) + let lower = ensureMatchingEncoding(range.lowerBound) + + // Note: if only `lower` was miscoded, then the range invariant `lower <= + // upper` may no longer hold after the above conversions, so we need to + // re-check it here. + _precondition(upper._encodedOffset <= count && lower <= upper, + "String index range is out of bounds") + + return Range(_uncheckedBounds: (lower, upper)) + } + + @_alwaysEmitIntoClient + internal func validateSubscalarRange( + _ range: Range, + from start: String.Index, + to end: String.Index + ) -> Range { + _internalInvariant(start <= end && end <= endIndex) + + let upper = ensureMatchingEncoding(range.upperBound) + let lower = ensureMatchingEncoding(range.lowerBound) + + // Note: if only `lower` was miscoded, then the range invariant `lower <= + // upper` may no longer hold after the above conversions, so we need to + // re-check it here. + _precondition(upper <= end && lower >= start && lower <= upper, + "Substring index range is out of bounds") + + return Range(_uncheckedBounds: (lower, upper)) + } +} +extension _StringGuts { /// Validate `range` and adjust the position of its bounds, returning the /// resulting range or trapping as appropriate. 
If this function returns, then /// the bounds of the returned value diff --git a/stdlib/public/core/StringUTF8View.swift b/stdlib/public/core/StringUTF8View.swift index 0270d038ff1ff..d3cd7a442cc99 100644 --- a/stdlib/public/core/StringUTF8View.swift +++ b/stdlib/public/core/StringUTF8View.swift @@ -136,28 +136,37 @@ extension String.UTF8View: BidirectionalCollection { /// - Precondition: The next position is representable. @inlinable @inline(__always) public func index(after i: Index) -> Index { + let i = _guts.ensureMatchingEncoding(i) if _fastPath(_guts.isFastUTF8) { + // Note: deferred bounds check return i.strippingTranscoding.nextEncoded._knownUTF8 } - + _precondition(i._encodedOffset < _guts.count, + "String index is out of bounds") return _foreignIndex(after: i) } @inlinable @inline(__always) public func index(before i: Index) -> Index { - _precondition(!i.isZeroPosition) + let i = _guts.ensureMatchingEncoding(i) + _precondition(!i.isZeroPosition, "String index is out of bounds") if _fastPath(_guts.isFastUTF8) { return i.strippingTranscoding.priorEncoded._knownUTF8 } + _precondition(i._encodedOffset <= _guts.count, + "String index is out of bounds") return _foreignIndex(before: i) } @inlinable @inline(__always) public func index(_ i: Index, offsetBy n: Int) -> Index { + let i = _guts.ensureMatchingEncoding(i) if _fastPath(_guts.isFastUTF8) { - _precondition(n + i._encodedOffset <= _guts.count) - return i.strippingTranscoding.encoded(offsetBy: n) + let offset = n + i._encodedOffset + _precondition(offset >= 0 && offset <= _guts.count, + "String index is out of bounds") + return Index(_encodedOffset: offset)._knownUTF8 } return _foreignIndex(i, offsetBy: n) @@ -167,6 +176,7 @@ extension String.UTF8View: BidirectionalCollection { public func index( _ i: Index, offsetBy n: Int, limitedBy limit: Index ) -> Index? 
{ + let i = _guts.ensureMatchingEncoding(i) if _fastPath(_guts.isFastUTF8) { // Check the limit: ignore limit if it precedes `i` (in the correct // direction), otherwise must not be beyond limit (in the correct @@ -179,6 +189,8 @@ extension String.UTF8View: BidirectionalCollection { } else { guard limitOffset > iOffset || result >= limitOffset else { return nil } } + _precondition(result >= 0 && result <= _guts.count, + "String index is out of bounds") return Index(_encodedOffset: result) } @@ -187,9 +199,14 @@ extension String.UTF8View: BidirectionalCollection { @inlinable @inline(__always) public func distance(from i: Index, to j: Index) -> Int { + let i = _guts.ensureMatchingEncoding(i) + let j = _guts.ensureMatchingEncoding(j) if _fastPath(_guts.isFastUTF8) { return j._encodedOffset &- i._encodedOffset } + _precondition( + i._encodedOffset <= _guts.count && j._encodedOffset <= _guts.count, + "String index is out of bounds") return _foreignDistance(from: i, to: j) } @@ -207,7 +224,14 @@ extension String.UTF8View: BidirectionalCollection { /// must be less than the view's end index. 
@inlinable @inline(__always) public subscript(i: Index) -> UTF8.CodeUnit { - String(_guts)._boundsCheck(i) + let i = _guts.ensureMatchingEncoding(i) + _precondition(i._encodedOffset < _guts.count, + "String index is out of bounds") + return self[_unchecked: i] + } + + @_alwaysEmitIntoClient @inline(__always) + internal subscript(_unchecked i: Index) -> UTF8.CodeUnit { if _fastPath(_guts.isFastUTF8) { return _guts.withFastUTF8 { utf8 in utf8[_unchecked: i._encodedOffset] } } @@ -373,6 +397,7 @@ extension String.UTF8View { @inlinable @available(swift, introduced: 4) public subscript(r: Range) -> String.UTF8View.SubSequence { + let r = _guts.validateSubscalarRange(r) return Substring.UTF8View(self, _bounds: r) } } @@ -422,6 +447,7 @@ extension String.UTF8View { @_effects(releasenone) internal func _foreignIndex(after idx: Index) -> Index { _internalInvariant(_guts.isForeign) + _internalInvariant(idx._encodedOffset < _guts.count) let idx = _utf8AlignForeignIndex(idx) @@ -448,6 +474,7 @@ extension String.UTF8View { @_effects(releasenone) internal func _foreignIndex(before idx: Index) -> Index { _internalInvariant(_guts.isForeign) + _internalInvariant(idx._encodedOffset <= _guts.count) let idx = _utf8AlignForeignIndex(idx) diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index 465d1061ba659..b2285a602b252 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -731,6 +731,12 @@ extension Substring { base: String(base._guts).utf8, bounds: _bounds) } + + @_alwaysEmitIntoClient @inline(__always) + internal var _wholeGuts: _StringGuts { _slice._base._guts } + + @_alwaysEmitIntoClient @inline(__always) + internal var _base: String.UTF8View { _slice._base } } } @@ -740,48 +746,52 @@ extension Substring.UTF8View: BidirectionalCollection { public typealias Element = String.UTF8View.Element public typealias SubSequence = Substring.UTF8View - // - // Plumb slice operations through - // @inlinable - public var 
startIndex: Index { return _slice.startIndex } + public var startIndex: Index { _slice._startIndex } @inlinable - public var endIndex: Index { return _slice.endIndex } + public var endIndex: Index { _slice._endIndex } @inlinable - public subscript(index: Index) -> Element { return _slice[index] } + public subscript(index: Index) -> Element { + let index = _wholeGuts.ensureMatchingEncoding(index) + _precondition(index >= startIndex && index < endIndex, + "String index is out of bounds") + return _base[_unchecked: index] + } @inlinable public var indices: Indices { return _slice.indices } @inlinable - public func index(after i: Index) -> Index { return _slice.index(after: i) } + public func index(after i: Index) -> Index { + // Note: deferred bounds check + return _base.index(after: i) + } @inlinable public func formIndex(after i: inout Index) { - // TODO(lorentey): Review index validation - _slice.formIndex(after: &i) + // Note: deferred bounds check + _base.formIndex(after: &i) } @inlinable public func index(_ i: Index, offsetBy n: Int) -> Index { - // TODO(lorentey): Review index validation - return _slice.index(i, offsetBy: n) + // Note: deferred bounds check + return _base.index(i, offsetBy: n) } @inlinable public func index( _ i: Index, offsetBy n: Int, limitedBy limit: Index ) -> Index? 
{ - // TODO(lorentey): Review index validation - return _slice.index(i, offsetBy: n, limitedBy: limit) + // Note: deferred bounds check + return _base.index(i, offsetBy: n, limitedBy: limit) } @inlinable public func distance(from start: Index, to end: Index) -> Int { - // TODO(lorentey): Review index validation - return _slice.distance(from: start, to: end) + return _base.distance(from: start, to: end) } @_alwaysEmitIntoClient @@ -794,36 +804,36 @@ extension Substring.UTF8View: BidirectionalCollection { @inlinable public func _failEarlyRangeCheck(_ index: Index, bounds: Range) { - // TODO(lorentey): Review index validation - _slice._failEarlyRangeCheck(index, bounds: bounds) + // FIXME: This probably ought to ensure that all three indices have matching + // encodings. + _base._failEarlyRangeCheck(index, bounds: bounds) } @inlinable public func _failEarlyRangeCheck( _ range: Range, bounds: Range ) { - // TODO(lorentey): Review index validation - _slice._failEarlyRangeCheck(range, bounds: bounds) + // FIXME: This probably ought to ensure that all three indices have matching + // encodings. + _base._failEarlyRangeCheck(range, bounds: bounds) } @inlinable public func index(before i: Index) -> Index { - // TODO(lorentey): Review index validation - return _slice.index(before: i) + // Note: deferred bounds check + return _base.index(before: i) } @inlinable public func formIndex(before i: inout Index) { - // TODO(lorentey): Review index validation - _slice.formIndex(before: &i) + // Note: deferred bounds check + _base.formIndex(before: &i) } @inlinable public subscript(r: Range) -> Substring.UTF8View { - // TODO(lorentey): Review index validation // FIXME(strings): tests. 
- _precondition(r.lowerBound >= startIndex && r.upperBound <= endIndex, - "UTF8View index range out of bounds") + let r = _wholeGuts.validateSubscalarRange(r, from: startIndex, to: endIndex) return Substring.UTF8View(_slice.base, _bounds: r) } } From 4ad8b26ab3645fa58bfea7369dffc9e47e1ea4b8 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Mon, 28 Mar 2022 20:14:55 -0700 Subject: [PATCH 26/83] [stdlib] String.UTF16View: Review/fix index validation Also, in UTF-16 slices, forward collection methods to the base view instead of `Slice`, to make behavior a bit easier to understand. (There is no need to force readers to page in `Slice` implementations _in addition to_ whatever the base view is doing.) --- stdlib/public/core/StringUTF16View.swift | 65 +++++++++++++++++++----- stdlib/public/core/Substring.swift | 63 ++++++++++++----------- 2 files changed, 86 insertions(+), 42 deletions(-) diff --git a/stdlib/public/core/StringUTF16View.swift b/stdlib/public/core/StringUTF16View.swift index 31621b45949b9..e5fc9fa74f2e3 100644 --- a/stdlib/public/core/StringUTF16View.swift +++ b/stdlib/public/core/StringUTF16View.swift @@ -137,18 +137,23 @@ extension String.UTF16View: BidirectionalCollection { /// In an empty UTF-16 view, `endIndex` is equal to `startIndex`. @inlinable @inline(__always) public var endIndex: Index { return _guts.endIndex } - + @inlinable @inline(__always) public func index(after idx: Index) -> Index { + var idx = _guts.ensureMatchingEncoding(idx) + _precondition(idx._encodedOffset < _guts.count, + "String index is out of bounds") if _slowPath(_guts.isForeign) { return _foreignIndex(after: idx) } - if _guts.isASCII { return idx.nextEncoded._knownUTF8._knownUTF16 } + if _guts.isASCII { + return idx.nextEncoded._scalarAligned._knownUTF8._knownUTF16 + } // For a BMP scalar (1-3 UTF-8 code units), advance past it. For a non-BMP // scalar, use a transcoded offset first. // TODO: If transcoded is 1, can we just skip ahead 4? 
- let idx = _utf16AlignNativeIndex(idx) + idx = _utf16AlignNativeIndex(idx) let len = _guts.fastUTF8ScalarLength(startingAt: idx._encodedOffset) if len == 4 && idx.transcodedOffset == 0 { @@ -163,16 +168,20 @@ extension String.UTF16View: BidirectionalCollection { @inlinable @inline(__always) public func index(before idx: Index) -> Index { - _precondition(!idx.isZeroPosition) + var idx = _guts.ensureMatchingEncoding(idx) + _precondition(!idx.isZeroPosition && idx <= endIndex, + "String index is out of bounds") if _slowPath(_guts.isForeign) { return _foreignIndex(before: idx) } - if _guts.isASCII { return idx.priorEncoded._knownUTF8._knownUTF16 } + if _guts.isASCII { + return idx.priorEncoded._scalarAligned._knownUTF8._knownUTF16 + } if idx.transcodedOffset != 0 { _internalInvariant(idx.transcodedOffset == 1) - return idx.strippingTranscoding._knownUTF8 + return idx.strippingTranscoding._scalarAligned._knownUTF8 } - let idx = _utf16AlignNativeIndex(idx) + idx = _utf16AlignNativeIndex(idx) let len = _guts.fastUTF8ScalarLength(endingAt: idx._encodedOffset) if len == 4 { // 2 UTF-16 code units comprise this scalar; advance to the beginning and @@ -186,6 +195,8 @@ extension String.UTF16View: BidirectionalCollection { } public func index(_ i: Index, offsetBy n: Int) -> Index { + let i = _guts.ensureMatchingEncoding(i) + _precondition(i <= endIndex, "String index is out of bounds") if _slowPath(_guts.isForeign) { return _foreignIndex(i, offsetBy: n) } @@ -198,6 +209,12 @@ extension String.UTF16View: BidirectionalCollection { public func index( _ i: Index, offsetBy n: Int, limitedBy limit: Index ) -> Index? 
{ + let limit = _guts.ensureMatchingEncoding(limit) + guard _fastPath(limit <= endIndex) else { return index(i, offsetBy: n) } + + let i = _guts.ensureMatchingEncoding(i) + _precondition(i <= endIndex, "String index is out of bounds") + if _slowPath(_guts.isForeign) { return _foreignIndex(i, offsetBy: n, limitedBy: limit) } @@ -219,6 +236,14 @@ extension String.UTF16View: BidirectionalCollection { } public func distance(from start: Index, to end: Index) -> Int { + let start = _guts.ensureMatchingEncoding(start) + let end = _guts.ensureMatchingEncoding(end) + + _precondition(start._encodedOffset <= _guts.count, + "String index is out of bounds") + _precondition(end._encodedOffset <= _guts.count, + "String index is out of bounds") + if _slowPath(_guts.isForeign) { return _foreignDistance(from: start, to: end) } @@ -250,8 +275,14 @@ extension String.UTF16View: BidirectionalCollection { /// less than the view's end index. @inlinable @inline(__always) public subscript(idx: Index) -> UTF16.CodeUnit { - String(_guts)._boundsCheck(idx) + let idx = _guts.ensureMatchingEncoding(idx) + _precondition(idx._encodedOffset < _guts.count, + "String index is out of bounds") + return self[_unchecked: idx] + } + @_alwaysEmitIntoClient @inline(__always) + internal subscript(_unchecked idx: Index) -> UTF16.CodeUnit { if _fastPath(_guts.isFastUTF8) { let scalar = _guts.fastUTF8Scalar( startingAt: _guts.scalarAlign(idx)._encodedOffset) @@ -427,6 +458,7 @@ extension String.UTF16View { public typealias SubSequence = Substring.UTF16View public subscript(r: Range) -> Substring.UTF16View { + let r = _guts.validateSubscalarRange(r) return Substring.UTF16View(self, _bounds: r) } } @@ -474,14 +506,20 @@ extension String.UTF16View { if n > 0 ? 
l >= 0 && l < n : l <= 0 && n < l { return nil } - return i.strippingTranscoding.encoded(offsetBy: n) + let offset = i._encodedOffset &+ n + _precondition(offset >= 0 && offset <= _guts.count, + "String index is out of bounds") + return Index(_encodedOffset: offset)._knownUTF16 } @usableFromInline @inline(never) @_effects(releasenone) internal func _foreignIndex(_ i: Index, offsetBy n: Int) -> Index { _internalInvariant(_guts.isForeign) - return i.strippingTranscoding.encoded(offsetBy: n) + let offset = i._encodedOffset &+ n + _precondition(offset >= 0 && offset <= _guts.count, + "String index is out of bounds") + return Index(_encodedOffset: offset)._knownUTF16 } @usableFromInline @inline(never) @@ -631,10 +669,11 @@ extension String.UTF16View { return utf16Count } } - + @usableFromInline @_effects(releasenone) internal func _nativeGetOffset(for idx: Index) -> Int { + _internalInvariant(idx._encodedOffset <= _guts.count) // Trivial and common: start if idx == startIndex { return 0 } @@ -656,13 +695,14 @@ extension String.UTF16View { // Otherwise, find the nearest lower-bound breadcrumb and count from there let (crumb, crumbOffset) = breadcrumbsPtr.pointee.getBreadcrumb( forIndex: idx) - return crumbOffset + _utf16Distance(from: crumb, to: idx) } @usableFromInline @_effects(releasenone) internal func _nativeGetIndex(for offset: Int) -> Index { + _precondition(offset >= 0, "String index is out of bounds") + // Trivial and common: start if offset == 0 { return startIndex } @@ -701,6 +741,7 @@ extension String.UTF16View { } while true { + _precondition(readIdx < readEnd, "String index is out of bounds") let len = _utf8ScalarLength(utf8[_unchecked: readIdx]) let utf16Len = len == 4 ? 
2 : 1 utf16I &+= utf16Len diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index b2285a602b252..b229cfce663fe 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -886,11 +886,14 @@ extension Substring { /// Creates an instance that slices `base` at `_bounds`. @inlinable internal init(_ base: String.UTF16View, _bounds: Range) { - // TODO(lorentey): Review index validation - _slice = Slice( - base: String(base._guts).utf16, - bounds: _bounds) + _slice = Slice(base: base, bounds: _bounds) } + + @_alwaysEmitIntoClient @inline(__always) + internal var _wholeGuts: _StringGuts { _slice._base._guts } + + @_alwaysEmitIntoClient @inline(__always) + internal var _base: String.UTF16View { _slice._base } } } @@ -900,19 +903,18 @@ extension Substring.UTF16View: BidirectionalCollection { public typealias Element = String.UTF16View.Element public typealias SubSequence = Substring.UTF16View - // - // Plumb slice operations through - // @inlinable - public var startIndex: Index { return _slice.startIndex } + public var startIndex: Index { _slice._startIndex } @inlinable - public var endIndex: Index { return _slice.endIndex } + public var endIndex: Index { _slice._endIndex } @inlinable public subscript(index: Index) -> Element { - // TODO(lorentey): Review index validation - return _slice[index] + let index = _wholeGuts.ensureMatchingEncoding(index) + _precondition(index >= startIndex && index < endIndex, + "String index is out of bounds") + return _base[_unchecked: index] } @inlinable @@ -920,65 +922,66 @@ extension Substring.UTF16View: BidirectionalCollection { @inlinable public func index(after i: Index) -> Index { - // TODO(lorentey): Review index validation - return _slice.index(after: i) + // Note: deferred bounds check + return _base.index(after: i) } @inlinable public func formIndex(after i: inout Index) { - // TODO(lorentey): Review index validation - _slice.formIndex(after: &i) + // Note: deferred 
bounds check + _base.formIndex(after: &i) } @inlinable public func index(_ i: Index, offsetBy n: Int) -> Index { - // TODO(lorentey): Review index validation - return _slice.index(i, offsetBy: n) + // Note: deferred bounds check + return _base.index(i, offsetBy: n) } @inlinable public func index( _ i: Index, offsetBy n: Int, limitedBy limit: Index ) -> Index? { - // TODO(lorentey): Review index validation - return _slice.index(i, offsetBy: n, limitedBy: limit) + // Note: deferred bounds check + return _base.index(i, offsetBy: n, limitedBy: limit) } @inlinable public func distance(from start: Index, to end: Index) -> Int { - // TODO(lorentey): Review index validation - return _slice.distance(from: start, to: end) + return _base.distance(from: start, to: end) } @inlinable public func _failEarlyRangeCheck(_ index: Index, bounds: Range) { - // TODO(lorentey): Review index validation - _slice._failEarlyRangeCheck(index, bounds: bounds) + // FIXME: This probably ought to ensure that all three indices have matching + // encodings. + _base._failEarlyRangeCheck(index, bounds: bounds) } @inlinable public func _failEarlyRangeCheck( _ range: Range, bounds: Range ) { - // TODO(lorentey): Review index validation - _slice._failEarlyRangeCheck(range, bounds: bounds) + // FIXME: This probably ought to ensure that all three indices have matching + // encodings. 
+ _base._failEarlyRangeCheck(range, bounds: bounds) } @inlinable public func index(before i: Index) -> Index { - // TODO(lorentey): Review index validation - return _slice.index(before: i) + // Note: deferred bounds check + return _base.index(before: i) } @inlinable public func formIndex(before i: inout Index) { - // TODO(lorentey): Review index validation - _slice.formIndex(before: &i) + // Note: deferred bounds check + _base.formIndex(before: &i) } @inlinable public subscript(r: Range) -> Substring.UTF16View { - // TODO(lorentey): Review index validation + let r = _wholeGuts.validateSubscalarRange(r, from: startIndex, to: endIndex) return Substring.UTF16View(_slice.base, _bounds: r) } } From e8212690d1ca30e87ebfc6885c7d11922a1766b4 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Mon, 28 Mar 2022 20:16:03 -0700 Subject: [PATCH 27/83] [stdlib] String: Apply transcoded offset when converting indices from UTF-16 --- stdlib/public/core/StringGuts.swift | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/stdlib/public/core/StringGuts.swift b/stdlib/public/core/StringGuts.swift index 9d13b48542f92..66eae2df0afe4 100644 --- a/stdlib/public/core/StringGuts.swift +++ b/stdlib/public/core/StringGuts.swift @@ -417,7 +417,9 @@ extension _StringGuts { // FIXME: Consider performing a linked-on-or-after check & trapping if the // client executable was built on some particular future Swift release. 
let utf16 = String(self).utf16 - return utf16.index(utf16.startIndex, offsetBy: i._encodedOffset) + let base = utf16.index(utf16.startIndex, offsetBy: i._encodedOffset) + if i.transcodedOffset == 0 { return base } + return base.encoded(offsetBy: i.transcodedOffset)._knownUTF8 } } From 4aae824124a864b12494e711ae837b2ae43791f0 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Mon, 28 Mar 2022 20:18:02 -0700 Subject: [PATCH 28/83] [stdlib] String: Deprecate old bounds checking methods MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These weren’t doing the right thing, and all callers have now migrated to the new `_StringGuts.validate*` methods, which combine bounds checks with encoding validation and scalar alignment. --- .../core/StringRangeReplaceableCollection.swift | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/stdlib/public/core/StringRangeReplaceableCollection.swift b/stdlib/public/core/StringRangeReplaceableCollection.swift index eca06fa2f4251..0ebc7a14c1424 100644 --- a/stdlib/public/core/StringRangeReplaceableCollection.swift +++ b/stdlib/public/core/StringRangeReplaceableCollection.swift @@ -311,24 +311,28 @@ extension String: RangeReplaceableCollection { } extension String { - @inlinable @inline(__always) + @available(*, deprecated, + message: "Use one of the _StringGuts.validateScalarIndex methods") + @usableFromInline // Used to be inlinable before 5.7 internal func _boundsCheck(_ index: Index) { - _precondition(index._encodedOffset >= 0 && index._encodedOffset < _guts.count, + _precondition(index._encodedOffset < _guts.count, "String index is out of bounds") } - @inlinable @inline(__always) + @available(*, deprecated, + message: "Use one of the _StringGuts.validateScalarIndexRange methods") + @usableFromInline // Used to be inlinable before 5.7 internal func _boundsCheck(_ range: Range) { _precondition( - range.lowerBound._encodedOffset >= 0 && 
range.upperBound._encodedOffset <= _guts.count, "String index range is out of bounds") } - @inlinable @inline(__always) + @available(*, deprecated, + message: "Use one of the _StringGuts.validateScalarIndex methods") + @usableFromInline // Used to be inlinable before 5.7 internal func _boundsCheck(_ range: ClosedRange) { _precondition( - range.lowerBound._encodedOffset >= 0 && range.upperBound._encodedOffset < _guts.count, "String index range is out of bounds") } From 06090ce7f216e78862fc7326f5d4c47f2677bbc7 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Mon, 28 Mar 2022 20:18:24 -0700 Subject: [PATCH 29/83] [test] Add more String coverage --- test/stdlib/StringIndex.swift | 798 +++++++++++++++++----------------- test/stdlib/StringTraps.swift | 113 ++++- 2 files changed, 508 insertions(+), 403 deletions(-) diff --git a/test/stdlib/StringIndex.swift b/test/stdlib/StringIndex.swift index 6f72768089035..1191387a8b0be 100644 --- a/test/stdlib/StringIndex.swift +++ b/test/stdlib/StringIndex.swift @@ -7,7 +7,8 @@ import StdlibUnittest import Foundation #endif -var StringIndexTests = TestSuite("StringIndexTests") +var suite = TestSuite("StringIndexTests") +defer { runAllTests() } enum SimpleString: String { case smallASCII = "abcdefg" @@ -17,15 +18,6 @@ enum SimpleString: String { case emoji = "😀😃🤢🤮👩🏿‍🎤🧛🏻‍♂️🧛🏻‍♂️👩‍👩‍👦‍👦" } -let simpleStrings: [String] = [ - SimpleString.smallASCII.rawValue, - SimpleString.smallUnicode.rawValue, - SimpleString.largeASCII.rawValue, - SimpleString.largeUnicode.rawValue, - SimpleString.emoji.rawValue, - "", -] - func dumpIndices(_ string: String) { print("-------------------------------------------------------------------") print("String: \(String(reflecting: string))") @@ -58,25 +50,34 @@ func dumpIndices(_ string: String) { } } -StringIndexTests.test("basic sanity checks") { - for s in simpleStrings { - let utf8 = Array(s.utf8) - let subUTF8 = Array(s[...].utf8) - let utf16 = Array(s.utf16) - let subUTF16 = Array(s[...].utf16) - let 
utf32 = Array(s.unicodeScalars.map { $0.value }) - let subUTF32 = Array(s[...].unicodeScalars.map { $0.value }) - - expectEqual(s, String(decoding: utf8, as: UTF8.self)) - expectEqual(s, String(decoding: subUTF8, as: UTF8.self)) - expectEqual(s, String(decoding: utf16, as: UTF16.self)) - expectEqual(s, String(decoding: subUTF16, as: UTF16.self)) - expectEqual(s, String(decoding: utf32, as: UTF32.self)) - expectEqual(s, String(decoding: subUTF32, as: UTF32.self)) - } +let simpleStrings: [String] = [ + SimpleString.smallASCII.rawValue, + SimpleString.smallUnicode.rawValue, + SimpleString.largeASCII.rawValue, + SimpleString.largeUnicode.rawValue, + SimpleString.emoji.rawValue, + "", +] + +suite.test("basic sanity checks") +.forEach(in: simpleStrings) { s in + let utf8 = Array(s.utf8) + let subUTF8 = Array(s[...].utf8) + let utf16 = Array(s.utf16) + let subUTF16 = Array(s[...].utf16) + let utf32 = Array(s.unicodeScalars.map { $0.value }) + let subUTF32 = Array(s[...].unicodeScalars.map { $0.value }) + + expectEqual(s, String(decoding: utf8, as: UTF8.self)) + expectEqual(s, String(decoding: subUTF8, as: UTF8.self)) + expectEqual(s, String(decoding: utf16, as: UTF16.self)) + expectEqual(s, String(decoding: subUTF16, as: UTF16.self)) + expectEqual(s, String(decoding: utf32, as: UTF32.self)) + expectEqual(s, String(decoding: subUTF32, as: UTF32.self)) } -StringIndexTests.test("view counts") { +suite.test("view counts") +.forEach(in: simpleStrings) { s in func validateViewCount( _ view: View, for string: String, stackTrace: SourceLocStack = SourceLocStack(), @@ -118,93 +119,81 @@ StringIndexTests.test("view counts") { } } - for s in simpleStrings { - validateViewCount(s, for: s) - validateViewCount(s.utf8, for: s) - validateViewCount(s.utf16, for: s) - validateViewCount(s.unicodeScalars, for: s) + validateViewCount(s, for: s) + validateViewCount(s.utf8, for: s) + validateViewCount(s.utf16, for: s) + validateViewCount(s.unicodeScalars, for: s) - validateViewCount(s[...], 
for: s) - validateViewCount(s[...].utf8, for: s) - validateViewCount(s[...].utf16, for: s) - validateViewCount(s[...].unicodeScalars, for: s) - } + validateViewCount(s[...], for: s) + validateViewCount(s[...].utf8, for: s) + validateViewCount(s[...].utf16, for: s) + validateViewCount(s[...].unicodeScalars, for: s) } -StringIndexTests.test("interchange") { +suite.test("interchange") +.forEach(in: simpleStrings) { s in // Basic index alignment - func validateIndices(_ s: String) { - for idx in s.utf8.indices { - let char = s.utf8[idx] + for idx in s.utf8.indices { + let char = s.utf8[idx] - // ASCII or leading code unit in the scalar - if char <= 0x7F || char >= 0b1100_0000 { - expectEqual(idx, idx.samePosition(in: s.unicodeScalars)) - expectEqual(idx, idx.samePosition(in: s.utf16)) + // ASCII or leading code unit in the scalar + if char <= 0x7F || char >= 0b1100_0000 { + expectEqual(idx, idx.samePosition(in: s.unicodeScalars)) + expectEqual(idx, idx.samePosition(in: s.utf16)) - // ASCII - if char <= 0x7F { - expectEqual(UInt16(char), s.utf16[idx]) - expectEqual(UInt32(char), s.unicodeScalars[idx].value) - } - } else { - // Continuation code unit - assert(char & 0b1100_0000 == 0b1000_0000) - expectNil(idx.samePosition(in: s)) - expectNil(idx.samePosition(in: s.utf16)) - expectNil(idx.samePosition(in: s.unicodeScalars)) + // ASCII + if char <= 0x7F { + expectEqual(UInt16(char), s.utf16[idx]) + expectEqual(UInt32(char), s.unicodeScalars[idx].value) } + } else { + // Continuation code unit + assert(char & 0b1100_0000 == 0b1000_0000) + expectNil(idx.samePosition(in: s)) + expectNil(idx.samePosition(in: s.utf16)) + expectNil(idx.samePosition(in: s.unicodeScalars)) } } - - for s in simpleStrings { - validateIndices(s) - } } -StringIndexTests.test("UTF-16 Offsets") { - func validateOffsets(_ s: String) { - let end = s.endIndex - let utf16Count = s.utf16.count +suite.test("UTF-16 Offsets") +.forEach(in: simpleStrings) { s in + let end = s.endIndex + let utf16Count = 
s.utf16.count - expectEqual(end, String.Index(utf16Offset: utf16Count, in: s)) - expectEqual(end, String.Index(utf16Offset: utf16Count, in: s[...])) + expectEqual(end, String.Index(utf16Offset: utf16Count, in: s)) + expectEqual(end, String.Index(utf16Offset: utf16Count, in: s[...])) - let pastEnd = String.Index(utf16Offset: utf16Count+1, in: s) + let pastEnd = String.Index(utf16Offset: utf16Count+1, in: s) - expectNotEqual(end, pastEnd) - expectEqual(pastEnd, String.Index(utf16Offset: utf16Count+1, in: s[...])) - expectEqual(pastEnd, String.Index(utf16Offset: utf16Count+2, in: s)) - expectEqual(pastEnd, String.Index(utf16Offset: -1, in: s)) - expectEqual( - pastEnd, String.Index(utf16Offset: Swift.max(1, utf16Count), in: s.dropFirst())) + expectNotEqual(end, pastEnd) + expectEqual(pastEnd, String.Index(utf16Offset: utf16Count+1, in: s[...])) + expectEqual(pastEnd, String.Index(utf16Offset: utf16Count+2, in: s)) + expectEqual(pastEnd, String.Index(utf16Offset: -1, in: s)) + expectEqual( + pastEnd, String.Index(utf16Offset: Swift.max(1, utf16Count), in: s.dropFirst())) - let utf16Indices = Array(s.utf16.indices) - expectEqual(utf16Count, utf16Indices.count) - for i in 0.. String.Index { @@ -213,7 +202,7 @@ func swift5ScalarAlign(_ idx: String.Index, in str: String) -> String.Index { return idx } -StringIndexTests.test("Scalar Align UTF-8 indices") { +suite.test("Scalar Align UTF-8 indices") { // TODO: Test a new aligning API when we add it. 
For now, we // test scalar-aligning UTF-8 indices @@ -234,7 +223,7 @@ StringIndexTests.test("Scalar Align UTF-8 indices") { } #if _runtime(_ObjC) -StringIndexTests.test("String.Index(_:within) / Range(_:in:)") { +suite.test("String.Index(_:within) / Range(_:in:)") { guard #available(SwiftStdlib 5.1, *) else { return } @@ -275,8 +264,10 @@ StringIndexTests.test("String.Index(_:within) / Range(_:in:)") { } } } +#endif -StringIndexTests.test("Misaligned") { +#if _runtime(_ObjC) +suite.test("Misaligned") { // Misaligned indices were fixed in 5.1 guard _hasSwift_5_1() else { return } @@ -334,167 +325,158 @@ StringIndexTests.test("Misaligned") { let string = "aодиde\u{301}日🧟‍♀️" doIt(string) } +#endif -StringIndexTests.test("Exhaustive Index Interchange") { - // Exhaustively test aspects of string index interchange - func testInterchange( - _ str: String, - stackTrace: SourceLocStack = SourceLocStack(), - showFrame: Bool = true, - file: String = #file, - line: UInt = #line - ) { - guard #available(SwiftStdlib 5.1, *) else { - return - } +let _examples: [StaticString] = [ + "abc\r\ndefg", + "ab\r\ncдe\u{301}日🧟‍♀️x🧟x🏳️‍🌈🇺🇸🇨🇦", +] - let stackTrace = stackTrace.pushIf(showFrame, file: file, line: line) - func expect( - _ condition: @autoclosure () -> Bool, - _ message: String = "", - file: String = #file, - line: UInt = #line - ) { - expectTrue(condition(), message, - stackTrace: stackTrace, showFrame: showFrame, - file: file, line: line) - } +let examples: [String] = _examples.flatMap { s in + let str = "\(s)" + #if _runtime(_ObjC) + let unichars = Array(str.utf16) + let nsstr = NSString(characters: unichars, length: unichars.count) + return [str, nsstr as String] + #else + return [str] + #endif +} - var curCharIdx = str.startIndex - var curScalarIdx = str.startIndex - var curUTF8Idx = str.startIndex - var curUTF16Idx = str.startIndex +#if _runtime(_ObjC) +suite.test("Exhaustive Index Interchange") +.forEach(in: examples) { str in + guard #available(SwiftStdlib 5.1, *) else 
{ + return + } - while curCharIdx < str.endIndex { - let curChar = str[curCharIdx] - expect(curChar == str[curScalarIdx]) - expect(curChar == str[curUTF8Idx]) - expect(curChar == str[curUTF16Idx]) + dumpIndices(str) - // Advance the character index once and have the scalar index catch up - str.formIndex(after: &curCharIdx) + var curCharIdx = str.startIndex + var curScalarIdx = str.startIndex + var curUTF8Idx = str.startIndex + var curUTF16Idx = str.startIndex + + while curCharIdx < str.endIndex { + let curChar = str[curCharIdx] + expectEqual(curChar, str[curScalarIdx]) + expectEqual(curChar, str[curUTF8Idx]) + expectEqual(curChar, str[curUTF16Idx]) + + // Advance the character index once and have the scalar index catch up + str.formIndex(after: &curCharIdx) + + let scalarStartIdx = curScalarIdx + defer { + let sub = str[scalarStartIdx.. utf8StartIdx { - str.utf8.formIndex(before: &utf8RevIdx) - - expect(curScalar == str.unicodeScalars[utf8RevIdx]) - expect(curSubChar == str[utf8RevIdx]) - expect(!UTF16.isTrailSurrogate(str.utf16[utf8RevIdx])) - expect(utf8StartIdx == str[utf8RevIdx...].startIndex) - expect(str[utf8StartIdx.. utf8StartIdx { + str.utf8.formIndex(before: &utf8RevIdx) + + expectEqual(curScalar, str.unicodeScalars[utf8RevIdx]) + expectEqual(curSubChar, str[utf8RevIdx]) + expectFalse(UTF16.isTrailSurrogate(str.utf16[utf8RevIdx])) + expectEqual(utf8StartIdx, str[utf8RevIdx...].startIndex) + expectTrue(str[utf8StartIdx.. utf16StartIdx { - str.utf16.formIndex(before: &utf16RevIdx) - - expect(curScalar == str.unicodeScalars[utf16RevIdx]) - expect(curSubChar == str[utf16RevIdx]) - expect(!UTF8.isContinuation(str.utf8[utf16RevIdx])) - expect(utf16StartIdx == str[utf16RevIdx...].startIndex) - expect(str[utf16StartIdx.. 
utf16StartIdx { + str.utf16.formIndex(before: &utf16RevIdx) + + expectEqual(curScalar, str.unicodeScalars[utf16RevIdx]) + expectEqual(curSubChar, str[utf16RevIdx]) + expectFalse(UTF8.isContinuation(str.utf8[utf16RevIdx])) + expectEqual(utf16StartIdx, str[utf16RevIdx...].startIndex) + expectTrue(str[utf16StartIdx.. [String.Index] { + var r = Array(self.indices) + if includingEnd { r.append(self.endIndex) } + r += Array(self.unicodeScalars.indices) + if includingEnd { r.append(self.unicodeScalars.endIndex) } + r += Array(self.utf8.indices) + if includingEnd { r.append(self.utf8.endIndex) } + r += Array(self.utf16.indices) + if includingEnd { r.append(self.utf16.endIndex) } + return r + } + /// Returns a dictionary mapping each valid index to the index that lies on /// the nearest scalar boundary, rounding down. func scalarMap() -> [String.Index: (index: String.Index, offset: Int)] { @@ -570,7 +567,8 @@ extension String { } } -StringIndexTests.test("Extra Exhaustive Index Interchange") { +suite.test("Fully exhaustive index interchange") +.forEach(in: examples) { string in guard #available(SwiftStdlib 5.7, *) else { // Index navigation in 5.7 always rounds input indices down to the nearest // Character, so that we always have a well-defined distance between @@ -580,186 +578,139 @@ StringIndexTests.test("Extra Exhaustive Index Interchange") { return } - func check( - _ string: String, - stackTrace: SourceLocStack = SourceLocStack(), - showFrame: Bool = true, - file: String = #file, - line: UInt = #line - ) { - dumpIndices(string) - - let scalarMap = string.scalarMap() - let characterMap = string.characterMap() - - // This is a list of every valid index in every string view, including end - // indices. We keep equal indices because they may have different grapheme - // size caches or flags etc. 
- var allIndices = Array(string.indices) + [string.endIndex] - allIndices += Array(string.unicodeScalars.indices) + [string.unicodeScalars.endIndex] - allIndices += Array(string.utf8.indices) + [string.utf8.endIndex] - allIndices += Array(string.utf16.indices) + [string.utf16.endIndex] - - func referenceCharacterDistance( - from i: String.Index, to j: String.Index - ) -> Int { - let ci = characterMap[i]! - let cj = characterMap[j]! - return cj.offset - ci.offset - } + dumpIndices(string) - func referenceScalarDistance( - from i: String.Index, to j: String.Index - ) -> Int { - let si = scalarMap[i]! - let sj = scalarMap[j]! - return sj.offset - si.offset - } + let scalarMap = string.scalarMap() + let characterMap = string.characterMap() + let allIndices = string.allIndices() - for i in allIndices { - for j in allIndices { - let characterDistance = referenceCharacterDistance(from: i, to: j) - let scalarDistance = referenceScalarDistance(from: i, to: j) - - let substringDistance: Int - if i <= j { - // The substring `string[i.. Int { + let ci = characterMap[i]! + let cj = characterMap[j]! + return cj.offset - ci.offset + } - // Check distance calculations. - if #available(SwiftStdlib 5.7, *) { - expectEqual( - string.distance(from: i, to: j), - characterDistance, - """ - string: \(string.debugDescription) - i: \(i) - j: \(j) - """) - if i <= j { - expectEqual(string[i ..< j].count, substringDistance, - """ - string: \(string.debugDescription) - i: \(i) - j: \(j) - """) - } - } + func referenceScalarDistance( + from i: String.Index, to j: String.Index + ) -> Int { + let si = scalarMap[i]! + let sj = scalarMap[j]! + return sj.offset - si.offset + } - expectEqual( - string.unicodeScalars.distance(from: i, to: j), - scalarDistance, + for i in allIndices { + for j in allIndices { + let characterDistance = referenceCharacterDistance(from: i, to: j) + let scalarDistance = referenceScalarDistance(from: i, to: j) + + // Check distance calculations. 
+ expectEqual( + string.distance(from: i, to: j), + characterDistance, + """ + string: \(string.debugDescription) + i: \(i) + j: \(j) + """) + expectEqual( + string.unicodeScalars.distance(from: i, to: j), + scalarDistance, + """ + string: \(string.debugDescription) + i: \(i) + j: \(j) + """) + + if i <= j { + // The substring `string[i.. 0 && i <= limit && j > limit ? true - : distance < 0 && i >= limit && j < limit ? true - : false) - expectEqual( - string.index(i, offsetBy: distance, limitedBy: limit), - expectHit ? nil : j, - """ - string: \(string.debugDescription) - i: \(i) - j: \(j) (distance: \(distance)) - limit: \(limit) - """) - } - } + // Check `String.index(_:offsetBy:limitedBy:)`. + for limit in allIndices { + let dest = characterMap[j]!.index + let expectHit = ( + characterDistance > 0 && i <= limit && dest > limit ? true + : characterDistance < 0 && i >= limit && dest < limit ? true + : false) + expectEqual( + string.index(i, offsetBy: characterDistance, limitedBy: limit), + expectHit ? 
nil : dest, + """ + string: \(string.debugDescription) + i: \(i) + j: \(j) (distance: \(characterDistance)) + limit: \(limit) + """) } } } - - let strings: [StaticString] = [ - "abc\r\ndefg", - "ab\r\ncдe\u{301}日🧟‍♀️x🧟x🏳️‍🌈🇺🇸🇨🇦", - ] - - for s in strings { - let str = "\(s)" - check(str) - - #if _runtime(_ObjC) - let unichars = Array(str.utf16) - let nsstr = NSString(characters: unichars, length: unichars.count) - check(nsstr as String) - #endif - } } -StringIndexTests.test("Global vs local grapheme cluster boundaries") { +suite.test("Global vs local grapheme cluster boundaries") { guard #available(SwiftStdlib 5.7, *) else { // Index navigation in 5.7 always rounds input indices down to the nearest // Character, so that we always have a well-defined distance between @@ -859,4 +810,49 @@ StringIndexTests.test("Global vs local grapheme cluster boundaries") { expectEqual(slice.utf16.count, 4) } -runAllTests() +#if _runtime(_ObjC) +suite.test("Index encoding correction") { + guard #available(SwiftStdlib 5.7, *) else { + // String indices did not track their encoding until 5.7. + return + } + // This tests a special case in String's index validation where we allow + // UTF-16-encoded indices to be applied to UTF-8 strings, by transcoding the + // offset of the index. Applying such indices is always an error, but it + // happens relatively often when someone is erroneously holding on to an index + // of a UTF-16-encoded bridged NSString value through a mutation. Mutating a + // bridged string converts it to a UTF-8 native string, changing the meaning + // of the offset value. (Even if the mutation wouldn't otherwise affect the + // original index.) + // + // Before 5.7, the stdlib did not track the offset encoding of String indices, + // so they simply used the UTF-16 offset to access UTF-8 storage. This + // generally produces nonsensical results, but if the string happens to be + // ASCII, then this actually did work "fine". 
+ // + // To avoid breaking binaries that rely on this behavior, the 5.7 stdlib + // automatically converts UTF-16-encoded indices to UTF-8 when needed. + // This can be quite slow, but it always produces the "right" results. + // + // This conversion is one way only (see StringTraps.swift for the other + // direction), and it does not account for the effect of the actual mutation. + // If the mutation's effect included the data addressed by the original index, + // then we may still get nonsensical results. + var s = ("🫱🏼‍🫲🏽 a 🧑🏽‍🌾 b" as NSString) as String + dumpIndices(s) + + let originals = s.allIndices(includingEnd: false).map { + ($0, s[$0], s.unicodeScalars[$0], s.utf8[$0], s.utf16[$0]) + } + + s.append(".") + dumpIndices(s) + + for (i, char, scalar, u8, u16) in originals { + expectEqual(s[i], char, "i: \(i)") + expectEqual(s.unicodeScalars[i], scalar, "i: \(i)") + expectEqual(s.utf8[i], u8, "i: \(i)") + expectEqual(s.utf16[i], u16, "i: \(i)") + } +} +#endif diff --git a/test/stdlib/StringTraps.swift b/test/stdlib/StringTraps.swift index 011c97fe3bbf8..0d22cb5fd98c4 100644 --- a/test/stdlib/StringTraps.swift +++ b/test/stdlib/StringTraps.swift @@ -17,6 +17,7 @@ import Foundation // For NSString let testSuiteSuffix = _isDebugAssertConfiguration() ? "_debug" : "_release" var StringTraps = TestSuite("StringTraps" + testSuiteSuffix) +defer { runAllTests() } StringTraps.test("startIndex/predecessor") .skip(.custom( @@ -353,5 +354,113 @@ StringTraps.test("UTF8View foreign index(before:) trap on i == startIndex") } #endif -runAllTests() - +#if _runtime(_ObjC) +if #available(SwiftStdlib 5.7, *) { + let native = "🫱🏼‍🫲🏽 a 🧑🏽‍🌾 b" + let cocoa = ("🫱🏼‍🫲🏽 a 🧑🏽‍🌾 b" as NSString) as String + + let goodIndices: [String.Index] = [ + native.startIndex, + native.unicodeScalars.startIndex, + native.utf8.startIndex, + native.utf16.startIndex, + ] + + StringTraps.test("Start index encoding").forEach(in: goodIndices) { i in + // The startIndex works fine in both encodings. 
+ print(i) + expectEqual(cocoa[i], native[i]) + } + + let badIndices: [String.Index] = [ + native.index(native.startIndex, offsetBy: 3), + native.unicodeScalars.index(native.startIndex, offsetBy: 3), + native.utf8.index(native.startIndex, offsetBy: 3), + native.utf16.index(native.startIndex, offsetBy: 3), + ] + + StringTraps.test("String.subscript/encoding trap") + .forEach(in: badIndices) { i in + print(i) + expectCrashLater() + print(cocoa[i]) + } + + StringTraps.test("String.index(after:)/encoding trap") + .forEach(in: badIndices) { i in + print(i) + expectCrashLater() + print(cocoa.index(after: i)) + } + + StringTraps.test("String.index(before:)/encoding trap") + .forEach(in: badIndices) { i in + print(i) + expectCrashLater() + print(cocoa.index(before: i)) + } + + StringTraps.test("String.UnicodeScalarView.subscript/encoding trap") + .forEach(in: badIndices) { i in + print(i) + expectCrashLater() + print(cocoa.unicodeScalars[i]) + } + + StringTraps.test("String.UnicodeScalarView.index(after:)/encoding trap") + .forEach(in: badIndices) { i in + print(i) + expectCrashLater() + print(cocoa.unicodeScalars.index(after: i)) + } + + StringTraps.test("String.UnicodeScalarView.index(before:)/encoding trap") + .forEach(in: badIndices) { i in + print(i) + expectCrashLater() + print(cocoa.unicodeScalars.index(before: i)) + } + + StringTraps.test("String.UTF8View.subscript/encoding trap") + .forEach(in: badIndices) { i in + print(i) + expectCrashLater() + print(cocoa.utf8[i]) + } + + StringTraps.test("String.UTF8View.index(after:)/encoding trap") + .forEach(in: badIndices) { i in + print(i) + expectCrashLater() + print(cocoa.utf8.index(after: i)) + } + + StringTraps.test("String.UTF8View.index(before:)/encoding trap") + .forEach(in: badIndices) { i in + print(i) + expectCrashLater() + print(cocoa.utf8.index(before: i)) + } + + StringTraps.test("String.UTF16View.subscript/encoding trap") + .forEach(in: badIndices) { i in + print(i) + expectCrashLater() + 
print(cocoa.utf16[i]) + } + + StringTraps.test("String.UTF16View.index(after:)/encoding trap") + .forEach(in: badIndices) { i in + print(i) + expectCrashLater() + print(cocoa.utf16.index(after: i)) + } + + StringTraps.test("String.UTF16View.index(before:)/encoding trap") + .forEach(in: badIndices) { i in + print(i) + expectCrashLater() + print(cocoa.utf16.index(before: i)) + } +} +#endif From 4eab8355cac55281fc8aef766d41ddcb7f4a7c4f Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Tue, 29 Mar 2022 18:22:06 -0700 Subject: [PATCH 30/83] [stdlib] String: prefer passing ranges to start+end argument pairs --- .../public/core/StringGraphemeBreaking.swift | 21 ++--- stdlib/public/core/StringGuts.swift | 36 +++++---- stdlib/public/core/Substring.swift | 80 +++++++++++++------ 3 files changed, 87 insertions(+), 50 deletions(-) diff --git a/stdlib/public/core/StringGraphemeBreaking.swift b/stdlib/public/core/StringGraphemeBreaking.swift index d3e0f0d674428..4cdc764d7d23c 100644 --- a/stdlib/public/core/StringGraphemeBreaking.swift +++ b/stdlib/public/core/StringGraphemeBreaking.swift @@ -109,22 +109,25 @@ extension _StringGuts { internal func roundDownToNearestCharacter( _ i: String.Index, - from start: String.Index, - to end: String.Index + in bounds: Range ) -> String.Index { - _internalInvariant(start._isScalarAligned && end._isScalarAligned) - _internalInvariant(hasMatchingEncoding(start) && hasMatchingEncoding(end)) - _internalInvariant(start <= end && end <= endIndex) + _internalInvariant( + bounds.lowerBound._isScalarAligned && bounds.upperBound._isScalarAligned) + _internalInvariant( + hasMatchingEncoding(bounds.lowerBound) && hasMatchingEncoding(bounds.upperBound)) + _internalInvariant(bounds.upperBound <= endIndex) _internalInvariant(i._isScalarAligned) _internalInvariant(hasMatchingEncoding(i)) - _internalInvariant(i >= start && i <= end) + _internalInvariant(i >= bounds.lowerBound && i <= bounds.upperBound) // We can only use the `_isCharacterAligned` bit if 
the start index is also // character-aligned. - if start._isCharacterAligned && i._isCharacterAligned { return i } + if bounds.lowerBound._isCharacterAligned && i._isCharacterAligned { + return i + } - if i == start || i == end { return i } + if i == bounds.lowerBound || i == bounds.upperBound { return i } let offset = i._encodedOffset let prior = offset - _opaqueCharacterStride(endingAt: offset) @@ -136,7 +139,7 @@ extension _StringGuts { return i } var r = String.Index(encodedOffset: prior, characterStride: stride) - if start._isCharacterAligned { + if bounds.lowerBound._isCharacterAligned { r = r._characterAligned } else { r = r._scalarAligned diff --git a/stdlib/public/core/StringGuts.swift b/stdlib/public/core/StringGuts.swift index 66eae2df0afe4..669d1d546fd27 100644 --- a/stdlib/public/core/StringGuts.swift +++ b/stdlib/public/core/StringGuts.swift @@ -449,13 +449,13 @@ extension _StringGuts { @_alwaysEmitIntoClient internal func validateScalarIndex( _ i: String.Index, - from start: String.Index, - to end: String.Index + in bounds: Range ) -> String.Index { - _internalInvariant(start <= end && end <= endIndex) + _internalInvariant(bounds.upperBound <= endIndex) let i = ensureMatchingEncoding(i) - _precondition(i >= start && i < end, "Substring index is out of bounds") + _precondition(i >= bounds.lowerBound && i < bounds.upperBound, + "Substring index is out of bounds") return scalarAlign(i) } } @@ -486,13 +486,13 @@ extension _StringGuts { /// - is aligned on a scalar boundary. 
internal func validateInclusiveScalarIndex( _ i: String.Index, - from start: String.Index, - to end: String.Index + in bounds: Range ) -> String.Index { - _internalInvariant(start <= end && end <= endIndex) + _internalInvariant(bounds.upperBound <= endIndex) let i = ensureMatchingEncoding(i) - _precondition(i >= start && i <= end, "Substring index is out of bounds") + _precondition(i >= bounds.lowerBound && i <= bounds.upperBound, + "Substring index is out of bounds") return scalarAlign(i) } } @@ -517,10 +517,9 @@ extension _StringGuts { @_alwaysEmitIntoClient internal func validateSubscalarRange( _ range: Range, - from start: String.Index, - to end: String.Index + in bounds: Range ) -> Range { - _internalInvariant(start <= end && end <= endIndex) + _internalInvariant(bounds.upperBound <= endIndex) let upper = ensureMatchingEncoding(range.upperBound) let lower = ensureMatchingEncoding(range.lowerBound) @@ -528,7 +527,10 @@ extension _StringGuts { // Note: if only `lower` was miscoded, then the range invariant `lower <= // upper` may no longer hold after the above conversions, so we need to // re-check it here. - _precondition(upper <= end && lower >= start && lower <= upper, + _precondition( + upper <= bounds.upperBound + && lower >= bounds.lowerBound + && lower <= upper, "Substring index range is out of bounds") return Range(_uncheckedBounds: (lower, upper)) @@ -578,10 +580,9 @@ extension _StringGuts { /// - are aligned on a scalar boundary. 
internal func validateScalarRange( _ range: Range, - from start: String.Index, - to end: String.Index + in bounds: Range ) -> Range { - _internalInvariant(start <= end && end <= endIndex) + _internalInvariant(bounds.upperBound <= endIndex) var upper = ensureMatchingEncoding(range.upperBound) var lower = ensureMatchingEncoding(range.lowerBound) @@ -589,7 +590,10 @@ extension _StringGuts { // Note: if only `lower` was miscoded, then the range invariant `lower <= // upper` may no longer hold after the above conversions, so we need to // re-check it here. - _precondition(upper <= end && lower >= start && lower <= upper, + _precondition( + upper <= bounds.upperBound + && lower >= bounds.lowerBound + && lower <= upper, "Substring index range is out of bounds") upper = scalarAlign(upper) diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index b229cfce663fe..84fa3d45e7aac 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -145,6 +145,23 @@ extension Substring { return Range(_uncheckedBounds: (lower, upper)) } + @inlinable @inline(__always) + internal var _bounds: Range { + Range(_uncheckedBounds: (startIndex, endIndex)) + } +} + +extension Substring { + internal var _startIsCharacterAligned: Bool { + startIndex._isCharacterAligned + } + + internal var _endIsCharacterAligned: Bool { + endIndex._isCharacterAligned + } +} + +extension Substring { #if !INTERNAL_CHECKS_ENABLED @inlinable @inline(__always) internal func _invariantCheck() {} #else @@ -167,28 +184,28 @@ extension Substring { extension Substring { @inline(__always) internal func _validateScalarIndex(_ i: String.Index) -> String.Index { - _wholeGuts.validateScalarIndex(i, from: startIndex, to: endIndex) + _wholeGuts.validateScalarIndex(i, in: _bounds) } @inline(__always) internal func _validateInclusiveScalarIndex( _ i: String.Index ) -> String.Index { - _wholeGuts.validateInclusiveScalarIndex(i, from: startIndex, to: endIndex) + 
_wholeGuts.validateInclusiveScalarIndex(i, in: _bounds) } @inline(__always) internal func _validateScalarRange( _ range: Range ) -> Range { - _wholeGuts.validateScalarRange(range, from: startIndex, to: endIndex) + _wholeGuts.validateScalarRange(range, in: _bounds) } @inline(__always) internal func _roundDownToNearestCharacter( _ i: String.Index ) -> String.Index { - _wholeGuts.roundDownToNearestCharacter(i, from: startIndex, to: endIndex) + _wholeGuts.roundDownToNearestCharacter(i, in: _bounds) } /// Return true if and only if `i` is a valid index in this substring, @@ -647,16 +664,6 @@ extension Substring: StringProtocol { } } -extension Substring { - internal var _startIsCharacterAligned: Bool { - startIndex._isCharacterAligned - } - - internal var _endIsCharacterAligned: Bool { - endIndex._isCharacterAligned - } -} - extension Substring { internal func _characterStride(startingAt i: Index) -> Int { _internalInvariant(i._isScalarAligned) @@ -737,6 +744,11 @@ extension Substring { @_alwaysEmitIntoClient @inline(__always) internal var _base: String.UTF8View { _slice._base } + + @_alwaysEmitIntoClient @inline(__always) + internal var _bounds: Range { + Range(_uncheckedBounds: (_slice._startIndex, _slice._endIndex)) + } } } @@ -833,7 +845,7 @@ extension Substring.UTF8View: BidirectionalCollection { @inlinable public subscript(r: Range) -> Substring.UTF8View { // FIXME(strings): tests. 
- let r = _wholeGuts.validateSubscalarRange(r, from: startIndex, to: endIndex) + let r = _wholeGuts.validateSubscalarRange(r, in: _bounds) return Substring.UTF8View(_slice.base, _bounds: r) } } @@ -894,6 +906,11 @@ extension Substring { @_alwaysEmitIntoClient @inline(__always) internal var _base: String.UTF16View { _slice._base } + + @_alwaysEmitIntoClient @inline(__always) + internal var _bounds: Range { + Range(_uncheckedBounds: (_slice._startIndex, _slice._endIndex)) + } } } @@ -981,7 +998,7 @@ extension Substring.UTF16View: BidirectionalCollection { @inlinable public subscript(r: Range) -> Substring.UTF16View { - let r = _wholeGuts.validateSubscalarRange(r, from: startIndex, to: endIndex) + let r = _wholeGuts.validateSubscalarRange(r, in: _bounds) return Substring.UTF16View(_slice.base, _bounds: r) } } @@ -1039,16 +1056,31 @@ extension Substring { /// Creates an instance that slices `base` at `_bounds`. @usableFromInline // This used to be inlinable before 5.7 - @available(*, deprecated) // Use `init(_unchecked:)` in new code. 
+ @available(*, deprecated, message: "Use `init(_unchecked:bounds)` in new code") internal init(_ base: String.UnicodeScalarView, _bounds: Range) { let start = base._guts.scalarAlign(_bounds.lowerBound) let end = base._guts.scalarAlign(_bounds.upperBound) _slice = Slice(base: base, bounds: Range(_uncheckedBounds: (start, end))) } + } +} - @_alwaysEmitIntoClient - @inline(__always) - internal var _wholeGuts: _StringGuts { _slice._base._guts } +extension Substring.UnicodeScalarView { + @_alwaysEmitIntoClient + @inline(__always) + internal var _wholeGuts: _StringGuts { _slice._base._guts } + + @inline(__always) + internal var _offsetRange: Range { + let lower = _slice._startIndex._encodedOffset + let upper = _slice._endIndex._encodedOffset + return Range(_uncheckedBounds: (lower, upper)) + } + + @_alwaysEmitIntoClient + @inline(__always) + internal var _bounds: Range { + Range(_uncheckedBounds: (startIndex, endIndex)) } } @@ -1069,8 +1101,7 @@ extension Substring.UnicodeScalarView: BidirectionalCollection { @inlinable public subscript(index: Index) -> Element { - let index = _wholeGuts.validateScalarIndex( - index, from: startIndex, to: endIndex) + let index = _wholeGuts.validateScalarIndex(index, in: _bounds) return _wholeGuts.errorCorrectedScalar(startingAt: index._encodedOffset).0 } @@ -1130,7 +1161,7 @@ extension Substring.UnicodeScalarView: BidirectionalCollection { public subscript(r: Range) -> Substring.UnicodeScalarView { // Note: This used to be inlinable until Swift 5.7 - let r = _wholeGuts.validateScalarRange(r, from: startIndex, to: endIndex) + let r = _wholeGuts.validateScalarRange(r, in: _bounds) return Substring.UnicodeScalarView(_unchecked: _slice._base, bounds: r) } } @@ -1175,8 +1206,7 @@ extension Substring.UnicodeScalarView: RangeReplaceableCollection { _ subrange: Range, with replacement: C ) where C.Element == Element { // TODO(lorentey): Review index validation - let subrange = _slice._base._guts.validateScalarRange( - subrange, from: 
startIndex, to: endIndex) + let subrange = _wholeGuts.validateScalarRange(subrange, in: _bounds) _slice.replaceSubrange(subrange, with: replacement) } } From dc6990370e025ab067c3ca9558fe8d5b3e699e2f Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Tue, 29 Mar 2022 18:23:08 -0700 Subject: [PATCH 31/83] [stdlib] StringGuts.scalarAlign: Preserve encoding flags in returned index --- stdlib/public/core/StringIndex.swift | 5 +++++ stdlib/public/core/UnicodeHelpers.swift | 8 ++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/stdlib/public/core/StringIndex.swift b/stdlib/public/core/StringIndex.swift index 869ef54c94fed..3dd4825fd3218 100644 --- a/stdlib/public/core/StringIndex.swift +++ b/stdlib/public/core/StringIndex.swift @@ -439,6 +439,11 @@ extension String.Index { @_alwaysEmitIntoClient // Swift 5.7 @inline(__always) internal var __isUTF16: Bool { _rawBits & 0x8 != 0 } + + @_alwaysEmitIntoClient // Swift 5.7 + internal func _copyEncoding(from index: Self) -> Self { + Self((_rawBits & ~0xC) | (index._rawBits & 0xC)) + } } extension String.Index: Equatable { diff --git a/stdlib/public/core/UnicodeHelpers.swift b/stdlib/public/core/UnicodeHelpers.swift index 63ae5e9c8f4e3..adb277077e3dc 100644 --- a/stdlib/public/core/UnicodeHelpers.swift +++ b/stdlib/public/core/UnicodeHelpers.swift @@ -167,7 +167,7 @@ extension _StringGuts { result = idx } else { // TODO(String performance): isASCII check - result = scalarAlignSlow(idx) + result = scalarAlignSlow(idx)._scalarAligned._copyEncoding(from: idx) } _internalInvariant(isOnUnicodeScalarBoundary(result), @@ -183,7 +183,7 @@ extension _StringGuts { if _slowPath(idx.transcodedOffset != 0 || idx._encodedOffset == 0) { // Transcoded index offsets are already scalar aligned - return String.Index(_encodedOffset: idx._encodedOffset)._scalarAligned + return String.Index(_encodedOffset: idx._encodedOffset) } if _slowPath(self.isForeign) { // In 5.1 this check was added to foreignScalarAlign, but when this is @@ 
-191,7 +191,7 @@ extension _StringGuts { // a version of foreignScalarAlign that doesn't check for this, which // ends up asking CFString for its endIndex'th character, which throws // an exception. So we duplicate the check here for back deployment. - guard idx._encodedOffset != self.count else { return idx._scalarAligned } + guard idx._encodedOffset != self.count else { return idx } let foreignIdx = foreignScalarAlign(idx) _internalInvariant_5_1(foreignIdx._isScalarAligned) @@ -200,7 +200,7 @@ extension _StringGuts { return String.Index(_encodedOffset: self.withFastUTF8 { _scalarAlign($0, idx._encodedOffset) } - )._scalarAligned + ) } @inlinable From 755712a25d3abf7ab2ed286c51fc5b57cb0b75cc Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Tue, 29 Mar 2022 18:26:01 -0700 Subject: [PATCH 32/83] [stdlib] StringGuts.replaceSubrange: Fast path for replacing with a fast substring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the replacement collection is a fast UTF-8 substring, we can simply access its backing store directly — we don’t need to use a circuitous lazy algorithm. --- .../core/StringGutsRangeReplaceable.swift | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/stdlib/public/core/StringGutsRangeReplaceable.swift b/stdlib/public/core/StringGutsRangeReplaceable.swift index 1983c45f1ad51..c5acd5c148bbb 100644 --- a/stdlib/public/core/StringGutsRangeReplaceable.swift +++ b/stdlib/public/core/StringGutsRangeReplaceable.swift @@ -307,10 +307,19 @@ extension _StringGuts { ) -> Range where C: Collection, C.Iterator.Element == Character { if isUniqueNative { - if let replStr = newElements as? String, replStr._guts.isFastUTF8 { - return replStr._guts.withFastUTF8 { - uniqueNativeReplaceSubrange( - bounds, with: $0, isASCII: replStr._guts.isASCII) + if let repl = newElements as? 
String { + if repl._guts.isFastUTF8 { + return repl._guts.withFastUTF8 { + uniqueNativeReplaceSubrange( + bounds, with: $0, isASCII: repl._guts.isASCII) + } + } + } else if let repl = newElements as? Substring { + if repl._wholeGuts.isFastUTF8 { + return repl._wholeGuts.withFastUTF8(range: repl._offsetRange) { + uniqueNativeReplaceSubrange( + bounds, with: $0, isASCII: repl._wholeGuts.isASCII) + } } } return uniqueNativeReplaceSubrange( From 9714f97ad81a2c27f96765ec45cfa715edad6f6b Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Tue, 29 Mar 2022 18:28:50 -0700 Subject: [PATCH 33/83] [stdlib] Substring: round indices down to nearest character in indexing operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Distances between indices aren’t well-defined without this. --- stdlib/public/core/Substring.swift | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index 84fa3d45e7aac..827f18578d252 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -354,7 +354,8 @@ extension Substring: StringProtocol { // `Substring`'s bounds do not fall on grapheme boundaries in `base`. // TODO: known-ASCII and single-scalar-grapheme fast path, etc. 
- var i = _validateInclusiveScalarIndex(i) + var i = _roundDownToNearestCharacter( + _validateInclusiveScalarIndex(i)) if distance >= 0 { for _ in stride(from: 0, to: distance, by: 1) { _precondition(i < endIndex, "String index is out of bounds") @@ -390,7 +391,7 @@ extension Substring: StringProtocol { let limit = _wholeGuts.ensureMatchingEncoding(limit) let start = _wholeGuts.ensureMatchingEncoding(i) - var i = _validateInclusiveScalarIndex(i) + var i = _roundDownToNearestCharacter(_validateInclusiveScalarIndex(i)) if distance >= 0 { for _ in stride(from: 0, to: distance, by: 1) { guard limit < start || i < limit else { return nil } @@ -422,8 +423,10 @@ extension Substring: StringProtocol { // grapheme breaks -- swapping `start` and `end` may change the magnitude of // the result. - let start = _validateInclusiveScalarIndex(start) - let end = _validateInclusiveScalarIndex(end) + let start = _roundDownToNearestCharacter( + _validateInclusiveScalarIndex(start)) + let end = _roundDownToNearestCharacter( + _validateInclusiveScalarIndex(end)) // TODO: known-ASCII and single-scalar-grapheme fast path, etc. 
From b29d8f48058db71fd5d5b61e119eed044711971a Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Tue, 29 Mar 2022 18:33:06 -0700 Subject: [PATCH 34/83] [stdlib] Substring: restrict grapheme breaking to the bounds of the substring (Oops) --- stdlib/public/core/StringGraphemeBreaking.swift | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/stdlib/public/core/StringGraphemeBreaking.swift b/stdlib/public/core/StringGraphemeBreaking.swift index 4cdc764d7d23c..94ecfa56756cc 100644 --- a/stdlib/public/core/StringGraphemeBreaking.swift +++ b/stdlib/public/core/StringGraphemeBreaking.swift @@ -130,8 +130,13 @@ extension _StringGuts { if i == bounds.lowerBound || i == bounds.upperBound { return i } let offset = i._encodedOffset - let prior = offset - _opaqueCharacterStride(endingAt: offset) - let stride = _opaqueCharacterStride(startingAt: prior) + + let offsetBounds = Range( + _uncheckedBounds: ( + bounds.lowerBound._encodedOffset, bounds.upperBound._encodedOffset)) + + let prior = offset - _opaqueCharacterStride(endingAt: offset, in: offsetBounds) + let stride = _opaqueCharacterStride(startingAt: prior, in: offsetBounds) _internalInvariant(offset <= prior + stride, "Grapheme breaking inconsistency") if offset >= prior + stride { From 1c9c5ccbf6583e662b068153bd0bdccfa3973f87 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Tue, 29 Mar 2022 19:01:10 -0700 Subject: [PATCH 35/83] [test] test/StringIndex: Add some tests exercising replaceSubrange MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The exhaustive substring.replaceSubrange test probably takes too long to include in regular testing, but let’s enable it for now: it has caught a bunch of problems already and it will probably catch more before this lands. 
--- test/stdlib/StringIndex.swift | 200 +++++++++++++++++++++++++++++++--- 1 file changed, 187 insertions(+), 13 deletions(-) diff --git a/test/stdlib/StringIndex.swift b/test/stdlib/StringIndex.swift index 1191387a8b0be..c647568586644 100644 --- a/test/stdlib/StringIndex.swift +++ b/test/stdlib/StringIndex.swift @@ -18,6 +18,8 @@ enum SimpleString: String { case emoji = "😀😃🤢🤮👩🏿‍🎤🧛🏻‍♂️🧛🏻‍♂️👩‍👩‍👦‍👦" } +/// Print out a full list of indices in every view of `string`. +/// This is useful while debugging test failures in this test. func dumpIndices(_ string: String) { print("-------------------------------------------------------------------") print("String: \(String(reflecting: string))") @@ -350,7 +352,7 @@ suite.test("Exhaustive Index Interchange") return } - dumpIndices(str) + //dumpIndices(str) var curCharIdx = str.startIndex var curScalarIdx = str.startIndex @@ -481,10 +483,9 @@ suite.test("Exhaustive Index Interchange") #endif extension Collection { - // Assuming both `self` and `other` are sorted, call `body` for each element - // `a` in `other` together with the slice in `self` that starts with the first - // element in `self` that is greater than or equal to `a`, up to the first - // element that is greater than or equal to the next value in `other`. + // Assuming both `self` and `other` use the same index space, call `body` for + // each index `i` in `other`, along with the slice in `self` that begins at + // `i` and ends at the index following it in `other`. // // `other` must start with an item that is less than or equal to the first // item in `self`. @@ -516,9 +517,9 @@ extension Collection { } extension String { - // Returns a list of every valid index in every string view, including end - // indices. We keep equal indices originating from different views because - // they may have different grapheme size caches or flags etc. + // Returns a list of every valid index in every string view, optionally + // including end indices. 
We keep equal indices originating from different + // views because they may have different grapheme size caches or flags etc. func allIndices(includingEnd: Bool = true) -> [String.Index] { var r = Array(self.indices) if includingEnd { r.append(self.endIndex) } @@ -567,6 +568,23 @@ extension String { } } +extension Substring { + // Returns a list of every valid index in every string view, optionally + // including end indices. We keep equal indices originating from different + // views because they may have different grapheme size caches or flags etc. + func allIndices(includingEnd: Bool = true) -> [String.Index] { + var r = Array(self.indices) + if includingEnd { r.append(self.endIndex) } + r += Array(self.unicodeScalars.indices) + if includingEnd { r.append(self.unicodeScalars.endIndex) } + r += Array(self.utf8.indices) + if includingEnd { r.append(self.utf8.endIndex) } + r += Array(self.utf16.indices) + if includingEnd { r.append(self.utf16.endIndex) } + return r + } +} + suite.test("Fully exhaustive index interchange") .forEach(in: examples) { string in guard #available(SwiftStdlib 5.7, *) else { @@ -578,7 +596,7 @@ suite.test("Fully exhaustive index interchange") return } - dumpIndices(string) + //dumpIndices(string) let scalarMap = string.scalarMap() let characterMap = string.characterMap() @@ -627,7 +645,9 @@ suite.test("Fully exhaustive index interchange") // The substring `string[i.. 
Date: Tue, 29 Mar 2022 19:33:00 -0700 Subject: [PATCH 36/83] [stdlib] Substring.init: Stop checking things twice --- stdlib/public/core/Substring.swift | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index 827f18578d252..40c72b19674a0 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -99,12 +99,6 @@ public struct Substring: Sendable { @inline(__always) internal init(_unchecked slice: Slice) { - _internalInvariant(slice.endIndex <= slice._base._guts.endIndex) - _internalInvariant( - slice._base._guts.hasMatchingEncoding(slice.startIndex) && - slice._base._guts.hasMatchingEncoding(slice.endIndex)) - _internalInvariant( - slice.startIndex._isScalarAligned && slice.endIndex._isScalarAligned) self._slice = slice _invariantCheck() } @@ -167,15 +161,12 @@ extension Substring { #else @usableFromInline @inline(never) @_effects(releasenone) internal func _invariantCheck() { - _internalInvariant(_slice.endIndex <= _wholeGuts.endIndex) + _internalInvariant(endIndex <= _wholeGuts.endIndex) _internalInvariant( - _wholeGuts.hasMatchingEncoding(_slice.startIndex) && - _wholeGuts.hasMatchingEncoding(_slice.endIndex)) - // Indices are always scalar aligned + _wholeGuts.hasMatchingEncoding(startIndex) && + _wholeGuts.hasMatchingEncoding(endIndex)) _internalInvariant( - _slice.startIndex == _wholeGuts.scalarAlign(_slice.startIndex) && - _slice.endIndex == _wholeGuts.scalarAlign(_slice.endIndex)) - + startIndex._isScalarAligned && endIndex._isScalarAligned) self.base._invariantCheck() } #endif // INTERNAL_CHECKS_ENABLED From c9adf7aaea4e38688b4c8ec17043ee6cb518ab2b Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Tue, 29 Mar 2022 19:35:01 -0700 Subject: [PATCH 37/83] [stdlib] Substring: Review view creation/conversion code --- stdlib/public/core/Substring.swift | 57 +++++++++++++++++++----------- 1 file changed, 36 insertions(+), 21 
deletions(-) diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index 40c72b19674a0..7386d9ab297b9 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -97,12 +97,19 @@ public struct Substring: Sendable { @usableFromInline internal var _slice: Slice + @_alwaysEmitIntoClient // Swift 5.7 @inline(__always) internal init(_unchecked slice: Slice) { self._slice = slice _invariantCheck() } + @_alwaysEmitIntoClient // Swift 5.7 + @inline(__always) + internal init(_unchecked guts: _StringGuts, bounds: Range) { + self.init(_unchecked: Slice(base: String(guts), bounds: bounds)) + } + @usableFromInline // This used to be @inlinable before 5.7 @available(*, deprecated) // Use `init(_unchecked:)` in new code. internal init(_ slice: Slice) { @@ -728,9 +735,7 @@ extension Substring { /// Creates an instance that slices `base` at `_bounds`. @inlinable internal init(_ base: String.UTF8View, _bounds: Range) { - _slice = Slice( - base: String(base._guts).utf8, - bounds: _bounds) + _slice = Slice(base: base, bounds: _bounds) } @_alwaysEmitIntoClient @inline(__always) @@ -848,7 +853,8 @@ extension Substring { @inlinable public var utf8: UTF8View { get { - return base.utf8[startIndex.. /// Creates an instance that slices `base` at `_bounds`. + @_alwaysEmitIntoClient internal init( _unchecked base: String.UnicodeScalarView, bounds: Range ) { @@ -1164,7 +1178,8 @@ extension Substring { @inlinable public var unicodeScalars: UnicodeScalarView { get { - return base.unicodeScalars[startIndex.. 
Date: Tue, 29 Mar 2022 19:55:17 -0700 Subject: [PATCH 38/83] [stdlib] Substring.makeContiguousUTF8: Do not lose context in `base` --- stdlib/public/core/StringProtocol.swift | 35 ++++++++++++++++++------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/stdlib/public/core/StringProtocol.swift b/stdlib/public/core/StringProtocol.swift index eac94b5a6ecb7..196596ff6b521 100644 --- a/stdlib/public/core/StringProtocol.swift +++ b/stdlib/public/core/StringProtocol.swift @@ -235,10 +235,31 @@ extension Substring { /// /// Complexity: O(n) if non-contiguous, O(1) if already contiguous /// - @_alwaysEmitIntoClient + @_alwaysEmitIntoClient @inline(__always) public mutating func makeContiguousUTF8() { - if _fastPath(isContiguousUTF8) { return } - self = String._copying(self)[...] + if isContiguousUTF8 { return } + return _slowMakeContiguousUTF8() + } + + @_alwaysEmitIntoClient // Swift 5.7 + @inline(never) + internal mutating func _slowMakeContiguousUTF8() { + _internalInvariant(!isContiguousUTF8) + + let scalarOffset = base.unicodeScalars.distance( + from: base.startIndex, to: startIndex) + let scalarCount = base.unicodeScalars.distance( + from: startIndex, to: endIndex) + + let scalars = String._copying(base).unicodeScalars + + var newStart = scalars.index(scalars.startIndex, offsetBy: scalarOffset) + var newEnd = scalars.index(newStart, offsetBy: scalarCount) + + if startIndex._isCharacterAligned { newStart = newStart._characterAligned } + if endIndex._isCharacterAligned { newEnd = newEnd._characterAligned } + + self = Substring(_unchecked: scalars._guts, bounds: newStart ..< newEnd) } /// Runs `body` over the content of this substring in contiguous memory. 
If @@ -258,13 +279,7 @@ extension Substring { public mutating func withUTF8( _ body: (UnsafeBufferPointer) throws -> R ) rethrows -> R { - if _fastPath(isContiguousUTF8) { - return try _wholeGuts.withFastUTF8(range: self._offsetRange) { - return try body($0) - } - } - makeContiguousUTF8() - return try _wholeGuts.withFastUTF8(body) + return try _wholeGuts.withFastUTF8(range: _offsetRange, body) } } From ff58d545652b5b91308ba5b5bb8c589b0bff4450 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Tue, 29 Mar 2022 20:07:04 -0700 Subject: [PATCH 39/83] [stdlib][NFC] Substring adjustments --- stdlib/public/core/Substring.swift | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index 7386d9ab297b9..6afe69667cff0 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -251,7 +251,7 @@ extension Substring: StringProtocol { /// /// - has the right encoding, /// - is within bounds, and - /// - is scalar aligned. + /// - is character aligned within this substring. /// /// It does not mark the encoding of the returned index. internal func _uncheckedIndex(after i: Index) -> Index { @@ -259,8 +259,8 @@ extension Substring: StringProtocol { _internalInvariant(i._isScalarAligned) _internalInvariant(i >= startIndex && i < endIndex) - // Implicit precondition: `i` must be `Character`-aligned within this - // substring, even if it doesn't have the corresponding flag set. + // Note: `i` must be `Character`-aligned within this substring, even if it + // doesn't have the corresponding flag set. // TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc. let stride = _characterStride(startingAt: i) @@ -309,7 +309,7 @@ extension Substring: StringProtocol { /// /// - has the right encoding, /// - is within bounds, and - /// - is scalar aligned. + /// - is character aligned within this substring. 
/// /// It does not mark the encoding of the returned index. internal func _uncheckedIndex(before i: Index) -> Index { @@ -317,8 +317,8 @@ extension Substring: StringProtocol { _internalInvariant(i._isScalarAligned) _internalInvariant(i > startIndex && i <= endIndex) - // Implicit precondition: `i` must be `Character`-aligned within this - // substring, even if it doesn't have the corresponding flag set. + // Note: `i` must be `Character`-aligned within this substring, even if it + // doesn't have the corresponding flag set. // TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc. let priorStride = _characterStride(endingAt: i) @@ -562,17 +562,18 @@ extension Substring: StringProtocol { in: newOffsetBounds.lowerBound ..< _wholeGuts.count) _slice._startIndex = String.Index( encodedOffset: startIndex._encodedOffset, - transcodedOffset: 0, - characterStride: newStride)._scalarAligned._knownUTF8 + characterStride: newStride + )._scalarAligned._knownUTF8 } // Update endIndex. if newOffsetBounds.upperBound != endIndex._encodedOffset { _slice._endIndex = Index( - encodedOffset: newOffsetBounds.upperBound, - transcodedOffset: 0 + _encodedOffset: newOffsetBounds.upperBound )._scalarAligned._knownUTF8 } + + // TODO(lorentey): Mark new bounds character aligned if possible } /// Creates a string from the given Unicode code units in the specified @@ -1214,7 +1215,7 @@ extension Substring.UnicodeScalarView: RangeReplaceableCollection { public mutating func replaceSubrange( _ subrange: Range, with replacement: C ) where C.Element == Element { - // TODO(lorentey): Review index validation + // TODO(lorentey): Don't forward to slice let subrange = _wholeGuts.validateScalarRange(subrange, in: _bounds) _slice.replaceSubrange(subrange, with: replacement) } From f7c674ed558146ae6adda4fd0254e4ff4543d132 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Mon, 4 Apr 2022 17:59:30 -0700 Subject: [PATCH 40/83] [stdlib] Slice._bounds, Range._encodedOffsetRange: New helpers --- 
stdlib/public/core/Range.swift | 11 +++++++++ stdlib/public/core/Slice.swift | 19 +++++++++------ stdlib/public/core/Substring.swift | 37 ++++++++---------------------- 3 files changed, 33 insertions(+), 34 deletions(-) diff --git a/stdlib/public/core/Range.swift b/stdlib/public/core/Range.swift index 731ac88e36254..094d33c3fc416 100644 --- a/stdlib/public/core/Range.swift +++ b/stdlib/public/core/Range.swift @@ -1027,3 +1027,14 @@ extension PartialRangeUpTo: Sendable where Bound: Sendable { } extension PartialRangeThrough: Sendable where Bound: Sendable { } extension PartialRangeFrom: Sendable where Bound: Sendable { } extension PartialRangeFrom.Iterator: Sendable where Bound: Sendable { } + +extension Range where Bound == String.Index { + @_alwaysEmitIntoClient // Swift 5.7 + internal var _encodedOffsetRange: Range { + _internalInvariant( + (lowerBound._canBeUTF8 && upperBound._canBeUTF8) + || (lowerBound._canBeUTF16 && upperBound._canBeUTF16)) + return Range( + _uncheckedBounds: (lowerBound._encodedOffset, upperBound._encodedOffset)) + } +} diff --git a/stdlib/public/core/Slice.swift b/stdlib/public/core/Slice.swift index 79001aba29bec..8825f0ee04eb0 100644 --- a/stdlib/public/core/Slice.swift +++ b/stdlib/public/core/Slice.swift @@ -135,6 +135,11 @@ public struct Slice { public var base: Base { return _base } + + @_alwaysEmitIntoClient @inline(__always) + internal var _bounds: Range { + Range(_uncheckedBounds: (_startIndex, _endIndex)) + } } extension Slice: Collection { @@ -157,7 +162,7 @@ extension Slice: Collection { @inlinable // generic-performance public subscript(index: Index) -> Base.Element { get { - _failEarlyRangeCheck(index, bounds: startIndex..) -> Slice { get { - _failEarlyRangeCheck(bounds, bounds: startIndex.. Base.Element { get { - _failEarlyRangeCheck(index, bounds: startIndex..) -> Slice { get { - _failEarlyRangeCheck(bounds, bounds: startIndex..) 
{ - let r = slice.base._guts.validateScalarRange( - slice.startIndex ..< slice.endIndex) - self._slice = Slice(base: slice.base, bounds: r) + let r = slice._base._guts.validateScalarRange(slice._bounds) + self._slice = Slice(base: slice._base, bounds: r) _invariantCheck() } @@ -137,19 +136,13 @@ extension Substring { public var base: String { return _slice._base } @inlinable @inline(__always) - internal var _wholeGuts: _StringGuts { return _slice._base._guts } + internal var _wholeGuts: _StringGuts { _slice._base._guts } @inlinable @inline(__always) - internal var _offsetRange: Range { - let lower = _slice._startIndex._encodedOffset - let upper = _slice._endIndex._encodedOffset - return Range(_uncheckedBounds: (lower, upper)) - } + internal var _offsetRange: Range { _slice._bounds._encodedOffsetRange } @inlinable @inline(__always) - internal var _bounds: Range { - Range(_uncheckedBounds: (startIndex, endIndex)) - } + internal var _bounds: Range { _slice._bounds } } extension Substring { @@ -746,9 +739,7 @@ extension Substring { internal var _base: String.UTF8View { _slice._base } @_alwaysEmitIntoClient @inline(__always) - internal var _bounds: Range { - Range(_uncheckedBounds: (_slice._startIndex, _slice._endIndex)) - } + internal var _bounds: Range { _slice._bounds } } } @@ -912,9 +903,7 @@ extension Substring { internal var _base: String.UTF16View { _slice._base } @_alwaysEmitIntoClient @inline(__always) - internal var _bounds: Range { - Range(_uncheckedBounds: (_slice._startIndex, _slice._endIndex)) - } + internal var _bounds: Range { _slice._bounds } } } @@ -1075,21 +1064,15 @@ extension Substring { } extension Substring.UnicodeScalarView { - @_alwaysEmitIntoClient - @inline(__always) + @_alwaysEmitIntoClient @inline(__always) internal var _wholeGuts: _StringGuts { _slice._base._guts } @inline(__always) - internal var _offsetRange: Range { - let lower = _slice._startIndex._encodedOffset - let upper = _slice._endIndex._encodedOffset - return 
Range(_uncheckedBounds: (lower, upper)) - } + internal var _offsetRange: Range { _slice._bounds._encodedOffsetRange } @_alwaysEmitIntoClient @inline(__always) - internal var _bounds: Range { - Range(_uncheckedBounds: (startIndex, endIndex)) + internal var _bounds: Range { _slice._bounds } } } From 2e9fd9eb6b959df11799784b0fe93a2f77a8b325 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Mon, 4 Apr 2022 18:01:42 -0700 Subject: [PATCH 41/83] [stdlib] Substring.UnicodeScalarView: Add _invariantCheck --- stdlib/public/core/StringGuts.swift | 1 + stdlib/public/core/Substring.swift | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/stdlib/public/core/StringGuts.swift b/stdlib/public/core/StringGuts.swift index 669d1d546fd27..0c3d0ba18114b 100644 --- a/stdlib/public/core/StringGuts.swift +++ b/stdlib/public/core/StringGuts.swift @@ -357,6 +357,7 @@ extension _StringGuts { /// index, but it is guaranteed to never incorrectly return false. If all /// loaded binaries were built in 5.7+, then this method is guaranteed to /// always return the correct value. + @_alwaysEmitIntoClient internal func hasMatchingEncoding(_ i: String.Index) -> Bool { (isForeign && i._canBeUTF16) || (!isForeign && i._canBeUTF8) } diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index 7c4ba9016077b..cdbcccad3b702 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -1050,6 +1050,7 @@ extension Substring { _unchecked base: String.UnicodeScalarView, bounds: Range ) { _slice = Slice(base: base, bounds: bounds) + _invariantCheck() } /// Creates an instance that slices `base` at `_bounds`. 
@@ -1073,7 +1074,25 @@ extension Substring.UnicodeScalarView { @_alwaysEmitIntoClient @inline(__always) internal var _bounds: Range { _slice._bounds } +} + +extension Substring.UnicodeScalarView { + #if !INTERNAL_CHECKS_ENABLED + @_alwaysEmitIntoClient @inline(__always) + internal func _invariantCheck() {} + #else + @_alwaysEmitIntoClient + @inline(never) @_effects(releasenone) + internal func _invariantCheck() { + _internalInvariant(endIndex <= _wholeGuts.endIndex) + _internalInvariant( + _wholeGuts.hasMatchingEncoding(startIndex) && + _wholeGuts.hasMatchingEncoding(endIndex)) + _internalInvariant( + startIndex._isScalarAligned && endIndex._isScalarAligned) + _slice._base._invariantCheck() } + #endif // INTERNAL_CHECKS_ENABLED } extension Substring.UnicodeScalarView: BidirectionalCollection { From e0bd5f7a79ba013b154b1e554f2ff1141662b797 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Mon, 4 Apr 2022 18:02:24 -0700 Subject: [PATCH 42/83] [stdlib] Fix Substring.UnicodeScalarView.replaceSubrange --- .../core/StringGutsRangeReplaceable.swift | 165 ++++++++++++++++++ stdlib/public/core/Substring.swift | 107 +++--------- 2 files changed, 190 insertions(+), 82 deletions(-) diff --git a/stdlib/public/core/StringGutsRangeReplaceable.swift b/stdlib/public/core/StringGutsRangeReplaceable.swift index c5acd5c148bbb..d13ac157b213c 100644 --- a/stdlib/public/core/StringGutsRangeReplaceable.swift +++ b/stdlib/public/core/StringGutsRangeReplaceable.swift @@ -342,6 +342,60 @@ extension _StringGuts { return Range(_uncheckedBounds: (i, j)) } + // - Returns: The encoded offset range of the replaced contents in the result. + @discardableResult + internal mutating func replaceSubrange( + _ bounds: Range, + with newElements: C + ) -> Range + where C: Collection, C.Iterator.Element == UnicodeScalar { + if isUniqueNative { + if let repl = newElements as? 
String.UnicodeScalarView { + if repl._guts.isFastUTF8 { + return repl._guts.withFastUTF8 { + uniqueNativeReplaceSubrange( + bounds, with: $0, isASCII: repl._guts.isASCII) + } + } + } else if let repl = newElements as? Substring.UnicodeScalarView { + if repl._wholeGuts.isFastUTF8 { + return repl._wholeGuts.withFastUTF8(range: repl._offsetRange) { + uniqueNativeReplaceSubrange( + bounds, with: $0, isASCII: repl._wholeGuts.isASCII) + } + } + } + if #available(SwiftStdlib 5.1, *) { + return uniqueNativeReplaceSubrange( + bounds, with: newElements.lazy.flatMap { $0.utf8 }) + } else { + // FIXME: The stdlib should not have a deployment target this ancient. + let c = newElements.reduce(0) { $0 + UTF8.width($1) } + var utf8: [UInt8] = [] + utf8.reserveCapacity(c) + utf8 = newElements.reduce(into: utf8) { utf8, next in + next.withUTF8CodeUnits { utf8.append(contentsOf: $0) } + } + return uniqueNativeReplaceSubrange(bounds, with: utf8) + } + } + + var result = String.UnicodeScalarView() + // FIXME: It should be okay to get rid of excess capacity + // here. rdar://problem/45635432 + if let capacity = self.nativeCapacity { + result.reserveCapacity(capacity) + } + let selfStr = String.UnicodeScalarView(self) + result.append(contentsOf: selfStr[.., @@ -386,5 +440,116 @@ extension _StringGuts { self = _StringGuts(_object.nativeStorage) return Range(_uncheckedBounds: (start, start + replCount)) } + + /// Run `body` to mutate the given `subrange` of this string within + /// `startIndex ..< endIndex`, then update `startIndex` and `endIndex` to be + /// valid positions in the resulting string, addressing the same (logical) + /// locations as in the original string. + /// + /// This is used by both `Substring` and `Substring.UnicodeScalarView` to + /// implement their `replaceSubrange` methods. + /// + /// - Parameter subrange: A scalar-aligned offset range in this string. + /// - Parameter startIndex: The start index of the substring that performs + /// this operation. 
+ /// - Parameter endIndex: The end index of the substring that performs this + /// operation. + /// - Parameter body: The mutation operation to execute on `self`. The + /// returned offset range must correspond to `subrange` in the resulting + /// string. + internal mutating func mutateSubrangeInSubstring( + subrange: Range, + startIndex: inout Index, + endIndex: inout Index, + with body: (inout _StringGuts) -> Range + ) { + _internalInvariant( + subrange.lowerBound >= startIndex && subrange.upperBound <= endIndex) + + if _slowPath(isKnownUTF16) { + // UTF-16 (i.e., foreign) string. The mutation will convert this to the + // native UTF-8 encoding, so we need to do some extra work to preserve our + // bounds. + let utf8StartOffset = String(self).utf8.distance( + from: self.startIndex, to: startIndex) + let oldUTF8Count = String(self).utf8.distance( + from: startIndex, to: endIndex) + + let oldUTF8SubrangeCount = String(self).utf8.distance( + from: subrange.lowerBound, to: subrange.upperBound) + + let newUTF8Subrange = body(&self) + _internalInvariant(!isKnownUTF16) + + let newUTF8Count = + oldUTF8Count + newUTF8Subrange.count - oldUTF8SubrangeCount + + // Get the character stride in the entire string, not just the substring. + // (Characters in a substring may end beyond the bounds of it.) + let newStride = _opaqueCharacterStride( + startingAt: utf8StartOffset, in: utf8StartOffset ..< count) + + startIndex = String.Index( + encodedOffset: utf8StartOffset, + transcodedOffset: 0, + characterStride: newStride)._scalarAligned._knownUTF8 + if isOnGraphemeClusterBoundary(startIndex) { + startIndex = startIndex._characterAligned + } + + endIndex = String.Index( + encodedOffset: utf8StartOffset + newUTF8Count, + transcodedOffset: 0)._scalarAligned._knownUTF8 + return + } + + // UTF-8 string.
+ + let oldRange = subrange._encodedOffsetRange + let newRange = body(&self) + + let oldBounds = Range( + _uncheckedBounds: (startIndex._encodedOffset, endIndex._encodedOffset)) + let newBounds = Range(_uncheckedBounds: ( + oldBounds.lowerBound, + oldBounds.upperBound &+ newRange.count &- oldRange.count)) + + // Update `startIndex` if necessary. The replacement may have invalidated + // its cached character stride and character alignment flag, but not its + // stored offset, encoding, or scalar alignment. + // + // We are exploiting the fact that mutating the string _after_ the scalar + // following the end of the character at `startIndex` cannot possibly change + // the length of that character. (This is true because `index(after:)` never + // needs to look ahead by more than one Unicode scalar.) + let oldStride = startIndex.characterStride ?? 0 + if oldRange.lowerBound <= oldBounds.lowerBound &+ oldStride { + // Get the character stride in the entire string, not just the substring. + // (Characters in a substring may end beyond the bounds of it.) + let newStride = _opaqueCharacterStride( + startingAt: newBounds.lowerBound, + in: newBounds.lowerBound ..< self.count) + var newStart = String.Index( + encodedOffset: newBounds.lowerBound, + characterStride: newStride + )._scalarAligned._knownUTF8 + + // Preserve character alignment flag if possible + if startIndex._isCharacterAligned, + (oldRange.lowerBound > oldBounds.lowerBound || + isOnGraphemeClusterBoundary(newStart)) { + newStart = newStart._characterAligned + } + + startIndex = newStart + } + + // Update `endIndex`. 
+ if newBounds.upperBound != endIndex._encodedOffset { + endIndex = Index( + _encodedOffset: newBounds.upperBound + )._scalarAligned._knownUTF8 + } + } } diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index cdbcccad3b702..8d986af371d86 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -470,7 +470,6 @@ extension Substring: StringProtocol { // Note: SE-0180 requires us to use `subrange` bounds even if they aren't // `Character` aligned. (We still have to round things down to the nearest // scalar boundary, though, or we may generate ill-formed encodings.) - defer { _invariantCheck() } let subrange = _validateScalarRange(subrange) // Replacing the range is easy -- we can just reuse `String`'s @@ -488,85 +487,14 @@ extension Substring: StringProtocol { // merged with the Character preceding/following the replaced range. // // The best way to avoid problems in these cases is to lower index - // calculations to Unicode scalars (or below). In this implementation, we - // are measuring things in UTF-8 code units, for efficiency. - - if _slowPath(_wholeGuts.isKnownUTF16) { - // UTF-16 (i.e., foreign) string. The mutation will convert this to the - // native UTF-8 encoding, so we need to do some extra work to preserve our - // bounds. - let utf8StartOffset = _slice._base.utf8.distance( - from: _slice._base.startIndex, - to: _slice._startIndex) - let oldUTF8Count = self.utf8.count - - let oldSubrangeCount = self.utf8.distance( - from: subrange.lowerBound, to: subrange.upperBound) - - let newUTF8Subrange = _slice._base._guts.replaceSubrange( - subrange, with: newElements) - _internalInvariant(!_wholeGuts.isKnownUTF16) - - let newUTF8Count = oldUTF8Count + newUTF8Subrange.count - oldSubrangeCount - - // Get the character stride in the entire string, not just the substring. - // (Characters in a substring may end beyond the bounds of it.) 
- let newStride = _wholeGuts._opaqueCharacterStride( - startingAt: utf8StartOffset, - in: utf8StartOffset ..< _wholeGuts.count) - - _slice._startIndex = String.Index( - encodedOffset: utf8StartOffset, - transcodedOffset: 0, - characterStride: newStride)._scalarAligned._knownUTF8 - _slice._endIndex = String.Index( - encodedOffset: utf8StartOffset + newUTF8Count, - transcodedOffset: 0)._scalarAligned._knownUTF8 - return - } - - // UTF-8 string. - - let oldRange = Range(_uncheckedBounds: ( - subrange.lowerBound._encodedOffset, subrange.upperBound._encodedOffset)) - - let newRange = _slice._base._guts.replaceSubrange( - subrange, with: newElements) - - let newOffsetBounds = Range(_uncheckedBounds: ( - startIndex._encodedOffset, - endIndex._encodedOffset &+ newRange.count &- oldRange.count)) - - // Update `startIndex` if necessary. The replacement may have invalidated - // its cached character stride, but not its stored offset. - // - // We are exploiting the fact that mutating the string _after_ the scalar - // following the end of the character at `startIndex` cannot possibly change - // the length of that character. (This is true because `index(after:)` never - // needs to look ahead by more than one Unicode scalar.) - if - let stride = startIndex.characterStride, - oldRange.lowerBound <= startIndex._encodedOffset &+ stride - { - // Get the character stride in the entire string, not just the substring. - // (Characters in a substring may end beyond the bounds of it.) - let newStride = _wholeGuts._opaqueCharacterStride( - startingAt: newOffsetBounds.lowerBound, - in: newOffsetBounds.lowerBound ..< _wholeGuts.count) - _slice._startIndex = String.Index( - encodedOffset: startIndex._encodedOffset, - characterStride: newStride - )._scalarAligned._knownUTF8 - } - - // Update endIndex. 
- if newOffsetBounds.upperBound != endIndex._encodedOffset { - _slice._endIndex = Index( - _encodedOffset: newOffsetBounds.upperBound - )._scalarAligned._knownUTF8 - } + // calculations to Unicode scalars (or below). + _slice._base._guts.mutateSubrangeInSubstring( + subrange: subrange, + startIndex: &_slice._startIndex, + endIndex: &_slice._endIndex, + with: { $0.replaceSubrange(subrange, with: newElements) }) - // TODO(lorentey): Mark new bounds character aligned if possible + _invariantCheck() } /// Creates a string from the given Unicode code units in the specified @@ -1209,7 +1137,6 @@ extension String { } } -// FIXME: The other String views should be RangeReplaceable too. extension Substring.UnicodeScalarView: RangeReplaceableCollection { @inlinable public init() { _slice = Slice.init() } @@ -1217,9 +1144,25 @@ extension Substring.UnicodeScalarView: RangeReplaceableCollection { public mutating func replaceSubrange( _ subrange: Range, with replacement: C ) where C.Element == Element { - // TODO(lorentey): Don't forward to slice let subrange = _wholeGuts.validateScalarRange(subrange, in: _bounds) - _slice.replaceSubrange(subrange, with: replacement) + + // Replacing the range is easy -- we can just reuse `String`'s + // implementation. However, we must also update `startIndex` and `endIndex` + // to keep them valid & pointing to the same positions, which is somewhat + // tricky. + // + // In Swift <=5.6, this used to forward to `Slice.replaceSubrange`, which + // (incorrectly) assumes that indices before the replaced subrange are + // preserved after the mutation. (This isn't true for strings, esp. when the + // original value is UTF-16 encoded.) 
+ + _slice._base._guts.mutateSubrangeInSubstring( + subrange: subrange, + startIndex: &_slice._startIndex, + endIndex: &_slice._endIndex, + with: { $0.replaceSubrange(subrange, with: replacement) }) + + _invariantCheck() } } From 42c823847e4646ad39ddbf0304883c403d47b70d Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Mon, 4 Apr 2022 18:02:43 -0700 Subject: [PATCH 43/83] [test] stdlib/StringIndex: Simplify --- test/stdlib/StringIndex.swift | 37 +++++++++++++---------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/test/stdlib/StringIndex.swift b/test/stdlib/StringIndex.swift index c647568586644..a28ef86d177b7 100644 --- a/test/stdlib/StringIndex.swift +++ b/test/stdlib/StringIndex.swift @@ -28,7 +28,7 @@ func dumpIndices(_ string: String) { let char = string[i] print(" \(i) -> \(String(reflecting: char))") } - print("Unicode Scalars:") + print("Scalars:") string.unicodeScalars.indices.forEach { i in let scalar = string.unicodeScalars[i] let value = String(scalar.value, radix: 16, uppercase: true) @@ -978,19 +978,20 @@ suite.test("Substring.replaceSubrange index validation") let sm = scalarMap[m]!.index let sn = scalarMap[n]!.index - // Check Substring.replaceSubrange(_:with:) - do { - let replacement = "x" + let replacement = "x" - var expected = "".unicodeScalars - expected += string.unicodeScalars[si ..< sm] - expected += replacement.unicodeScalars - expected += string.unicodeScalars[sn ..< sj] + var _expected = "".unicodeScalars + _expected += string.unicodeScalars[si ..< sm] + _expected += replacement.unicodeScalars + _expected += string.unicodeScalars[sn ..< sj] + let expected = String(_expected)[...] 
+ // Check Substring.replaceSubrange(_:with:) + do { var actual = substring - actual.replaceSubrange(m ..< n, with: replacement) + actual.replaceSubrange(m ..< n, with: Array(replacement)) - expectEqual(actual, Substring(expected[...]), + expectEqual(actual, expected, """ string: \(string.debugDescription) i: \(i) @@ -1002,27 +1003,17 @@ suite.test("Substring.replaceSubrange index validation") // Check String.unicodeScalars.replaceSubrange(_:with:) do { - let replacement = "x".unicodeScalars - - var expected = "".unicodeScalars - expected += string.unicodeScalars[si ..< sm] - expected += replacement - expected += string.unicodeScalars[sn ..< sj] - var actual = substring - actual.unicodeScalars.replaceSubrange(m ..< n, with: replacement) + actual.unicodeScalars.replaceSubrange( + m ..< n, with: Array(replacement.unicodeScalars)) - expectEqual(actual, Substring(expected[...]), + expectEqual(actual, expected, """ string: \(string.debugDescription) i: \(i) j: \(j) m: \(m) n: \(n) - substring.startIndex: \(substring.startIndex) - substring.endIndex: \(substring.endIndex) - actual.startIndex: \(actual.startIndex) - actual.endIndex: \(actual.endIndex) """) } } From 424727988921a7786d87e1173ab99b1dbf98303b Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Tue, 5 Apr 2022 17:18:51 -0700 Subject: [PATCH 44/83] [stdlib] String.UnicodeScalarView: Optimize replaceSubrange --- stdlib/public/core/StringUnicodeScalarView.swift | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/stdlib/public/core/StringUnicodeScalarView.swift b/stdlib/public/core/StringUnicodeScalarView.swift index 1b9abea121c78..e21280e3298ab 100644 --- a/stdlib/public/core/StringUnicodeScalarView.swift +++ b/stdlib/public/core/StringUnicodeScalarView.swift @@ -379,7 +379,7 @@ extension String.UnicodeScalarView: RangeReplaceableCollection { /// string. /// /// - Parameters: - /// - bounds: The range of elements to replace. 
The bounds of the range + /// - subrange: The range of elements to replace. The bounds of the range /// must be valid indices of the view. /// - newElements: The new Unicode scalar values to add to the string. /// @@ -388,18 +388,12 @@ extension String.UnicodeScalarView: RangeReplaceableCollection { /// removes elements at the end of the string, the complexity is O(*n*), /// where *n* is equal to `bounds.count`. public mutating func replaceSubrange( - _ bounds: Range, + _ subrange: Range, with newElements: C ) where C: Collection, C.Element == Unicode.Scalar { - // TODO(String performance): Skip extra String and Array allocation - let bounds = _guts.validateScalarRange(bounds) - let utf8Replacement = newElements.flatMap { String($0).utf8 } - let replacement = utf8Replacement.withUnsafeBufferPointer { - return String._uncheckedFromUTF8($0) - } - var copy = String(_guts) - copy.replaceSubrange(bounds, with: replacement) - self = copy.unicodeScalars + let subrange = _guts.validateScalarRange(subrange) + _guts.replaceSubrange(subrange, with: newElements) + _invariantCheck() } } From 73312fedd4d4e88b98f236b7606a49fa55de1d89 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Tue, 5 Apr 2022 20:40:48 -0700 Subject: [PATCH 45/83] [stdlib] Grapheme breaking: Refactor to simplify logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Split forward and backward direction into separate code paths. This makes the code more readable and paves the way for future improvements. (E.g., switching to a linear-time algorithm for breaking backwards.) - `Substring.index(after:)` now uses the same grapheme breaking paths as `String.index(after:)`. - The cached stride value in string indices is now well-defined even on indices that aren’t character-aligned. 
--- .../public/core/StringGraphemeBreaking.swift | 382 +++++++++++------- .../core/StringGutsRangeReplaceable.swift | 7 +- stdlib/public/core/StringIndex.swift | 11 +- stdlib/public/core/Substring.swift | 76 ++-- 4 files changed, 291 insertions(+), 185 deletions(-) diff --git a/stdlib/public/core/StringGraphemeBreaking.swift b/stdlib/public/core/StringGraphemeBreaking.swift index 94ecfa56756cc..3646636a9cd25 100644 --- a/stdlib/public/core/StringGraphemeBreaking.swift +++ b/stdlib/public/core/StringGraphemeBreaking.swift @@ -84,6 +84,7 @@ internal func _hasGraphemeBreakBetween( } extension _StringGuts { + @inline(__always) internal func roundDownToNearestCharacter( _ i: String.Index ) -> String.Index { @@ -92,9 +93,16 @@ extension _StringGuts { _internalInvariant(i._encodedOffset <= count) let offset = i._encodedOffset - if i._isCharacterAligned { return i } + if _fastPath(i._isCharacterAligned) { return i } if offset == 0 || offset == count { return i._characterAligned } + return _slowRoundDownToNearestCharacter(i) + } + @inline(never) + internal func _slowRoundDownToNearestCharacter( + _ i: String.Index + ) -> String.Index { + let offset = i._encodedOffset let start = offset - _opaqueCharacterStride(endingAt: offset) let stride = _opaqueCharacterStride(startingAt: start) _internalInvariant(offset <= start + stride, @@ -107,6 +115,7 @@ extension _StringGuts { return markEncoding(r._characterAligned) } + @inline(__always) internal func roundDownToNearestCharacter( _ i: String.Index, in bounds: Range @@ -114,7 +123,8 @@ extension _StringGuts { _internalInvariant( bounds.lowerBound._isScalarAligned && bounds.upperBound._isScalarAligned) _internalInvariant( - hasMatchingEncoding(bounds.lowerBound) && hasMatchingEncoding(bounds.upperBound)) + hasMatchingEncoding(bounds.lowerBound) + && hasMatchingEncoding(bounds.upperBound)) _internalInvariant(bounds.upperBound <= endIndex) _internalInvariant(i._isScalarAligned) @@ -123,20 +133,26 @@ extension _StringGuts { // We can 
only use the `_isCharacterAligned` bit if the start index is also // character-aligned. - if bounds.lowerBound._isCharacterAligned && i._isCharacterAligned { + if _fastPath( + bounds.lowerBound._isCharacterAligned && i._isCharacterAligned + ) { return i } - if i == bounds.lowerBound || i == bounds.upperBound { return i } + return _slowRoundDownToNearestCharacter(i, in: bounds) + } + @inline(never) + internal func _slowRoundDownToNearestCharacter( + _ i: String.Index, + in bounds: Range + ) -> String.Index { let offset = i._encodedOffset - let offsetBounds = Range( - _uncheckedBounds: ( - bounds.lowerBound._encodedOffset, bounds.upperBound._encodedOffset)) - - let prior = offset - _opaqueCharacterStride(endingAt: offset, in: offsetBounds) - let stride = _opaqueCharacterStride(startingAt: prior, in: offsetBounds) + let offsetBounds = bounds._encodedOffsetRange + let prior = + offset - _opaqueCharacterStride(endingAt: offset, in: offsetBounds) + let stride = _opaqueCharacterStride(startingAt: prior) _internalInvariant(offset <= prior + stride, "Grapheme breaking inconsistency") if offset >= prior + stride { @@ -172,6 +188,14 @@ extension _StringGuts { } extension _StringGuts { + /// Return the length of the extended grapheme cluster starting at offset `i`, + /// assuming it falls on a grapheme cluster boundary. + /// + /// Note: This does not look behind at data preceding `i`, so if `i` is not on + /// a grapheme cluster boundary, then it may return results that are + /// inconsistent with `_opaqueCharacterStride(endingAt:)`. On the other hand, + /// this behavior makes this suitable for use in substrings whose start index + /// itself does not fall on a cluster boundary. 
@usableFromInline @inline(never) @_effects(releasenone) internal func _opaqueCharacterStride(startingAt i: Int) -> Int { @@ -180,7 +204,8 @@ extension _StringGuts { } let nextIdx = withFastUTF8 { utf8 in - nextBoundary(startingAt: i, startIndex: 0) { j in + nextBoundary(startingAt: i) { j in + _internalInvariant(j >= 0) guard j < utf8.count else { return nil } let (scalar, len) = _decodeScalar(utf8, startingAt: j) return (scalar, j &+ len) @@ -190,30 +215,12 @@ extension _StringGuts { return nextIdx &- i } - @_effects(releasenone) - internal func _opaqueCharacterStride( - startingAt i: Int, - in bounds: Range - ) -> Int { - _internalInvariant(bounds.lowerBound >= 0 && bounds.upperBound <= count) - _internalInvariant(bounds.contains(i)) - if _slowPath(isForeign) { - return _foreignOpaqueCharacterStride(startingAt: i, in: bounds) - } - - let nextIdx = withFastUTF8 { utf8 in - nextBoundary(startingAt: i, startIndex: bounds.lowerBound) { j in - _internalInvariant(j >= bounds.lowerBound) - guard j < bounds.upperBound else { return nil } - let (scalar, len) = _decodeScalar(utf8, startingAt: j) - return (scalar, j &+ len) - } - } - - _internalInvariant(nextIdx >= i && nextIdx <= bounds.upperBound) - return nextIdx &- i - } - + /// Return the length of the extended grapheme cluster ending at offset `i`, + /// or if `i` happens to be in the middle of a grapheme cluster, find and + /// return the distance to its start. + /// + /// Note: unlike `_opaqueCharacterStride(startingAt:)`, this method always + /// finds a correct grapheme cluster boundary. 
@usableFromInline @inline(never) @_effects(releasenone) internal func _opaqueCharacterStride(endingAt i: Int) -> Int { @@ -222,7 +229,8 @@ extension _StringGuts { } let previousIdx = withFastUTF8 { utf8 in - previousBoundary(endingAt: i, startIndex: 0) { j in + previousBoundary(endingAt: i) { j in + _internalInvariant(j <= utf8.count) guard j > 0 else { return nil } let (scalar, len) = _decodeScalar(utf8, endingAt: j) return (scalar, j &- len) @@ -232,6 +240,13 @@ extension _StringGuts { return i &- previousIdx } + /// Return the length of the extended grapheme cluster ending at offset `i` in + /// bounds, or if `i` happens to be in the middle of a grapheme cluster, find + /// and return the distance to its start. + /// + /// Note: unlike `_opaqueCharacterStride(startingAt:)`, this method always + /// finds a correct grapheme cluster boundary within the substring defined by + /// the specified bounds. @_effects(releasenone) internal func _opaqueCharacterStride( endingAt i: Int, @@ -243,7 +258,7 @@ extension _StringGuts { } let previousIdx = withFastUTF8 { utf8 in - previousBoundary(endingAt: i, startIndex: bounds.lowerBound) { j in + previousBoundary(endingAt: i) { j in _internalInvariant(j <= bounds.upperBound) guard j > bounds.lowerBound else { return nil } let (scalar, len) = _decodeScalar(utf8, endingAt: j) @@ -261,7 +276,8 @@ extension _StringGuts { #if _runtime(_ObjC) _internalInvariant(isForeign) - let nextIdx = nextBoundary(startingAt: i, startIndex: 0) { j in + let nextIdx = nextBoundary(startingAt: i) { j in + _internalInvariant(j >= 0) guard j < count else { return nil } let scalars = String.UnicodeScalarView(self) let idx = String.Index(_encodedOffset: j) @@ -288,9 +304,8 @@ extension _StringGuts { _internalInvariant(isForeign) _internalInvariant(bounds.contains(i)) - let nextIdx = nextBoundary( - startingAt: i, startIndex: bounds.lowerBound - ) { j in + let nextIdx = nextBoundary(startingAt: i) { j in + _internalInvariant(j >= bounds.lowerBound) guard 
j < bounds.upperBound else { return nil } let scalars = String.UnicodeScalarView(self) let idx = String.Index(_encodedOffset: j) @@ -313,7 +328,8 @@ extension _StringGuts { #if _runtime(_ObjC) _internalInvariant(isForeign) - let previousIdx = previousBoundary(endingAt: i, startIndex: 0) { j in + let previousIdx = previousBoundary(endingAt: i) { j in + _internalInvariant(j <= self.count) guard j > 0 else { return nil } let scalars = String.UnicodeScalarView(self) let idx = String.Index(_encodedOffset: j) @@ -340,9 +356,8 @@ extension _StringGuts { _internalInvariant(isForeign) _internalInvariant(i > bounds.lowerBound && i <= bounds.upperBound) - let previousIdx = previousBoundary( - endingAt: i, startIndex: bounds.lowerBound - ) { j in + let previousIdx = previousBoundary(endingAt: i) { j in + _internalInvariant(j <= bounds.upperBound) guard j > bounds.lowerBound else { return nil } let scalars = String.UnicodeScalarView(self) let idx = String.Index(_encodedOffset: j) @@ -422,21 +437,29 @@ internal struct _GraphemeBreakingState { } extension _StringGuts { - // Returns the stride of the grapheme cluster starting at offset `index`. + // Returns the stride of the grapheme cluster starting at offset `index`, + // assuming it is on a grapheme cluster boundary. + // + // This method never looks at data below `index`. If `index` isn't on a + // grapheme cluster boundary, then the result may not be consistent with the + // actual breaks in the string. `Substring` relies on this to generate the + // right breaks if its start index isn't aligned on one -- in this case, the + // substring's breaks may not match the ones in its base string. internal func nextBoundary( startingAt index: Int, - startIndex: Int, - nextScalar: (Int) -> (Unicode.Scalar, end: Int)? + nextScalar: (Int) -> (scalar: Unicode.Scalar, end: Int)? 
) -> Int { _internalInvariant(index < endIndex._encodedOffset) + + // Note: If `index` in't already on a boundary, then starting with an empty + // state here sometimes leads to this method returning results that diverge + // from the true breaks in the string. var state = _GraphemeBreakingState() var (scalar, index) = nextScalar(index)! while true { guard let (scalar2, nextIndex) = nextScalar(index) else { break } - if shouldBreak( - scalar, between: scalar2, &state, index, startIndex: startIndex - ) { + if shouldBreak(between: scalar, and: scalar2, at: index, with: &state) { break } index = nextIndex @@ -447,24 +470,30 @@ extension _StringGuts { } // Returns the stride of the grapheme cluster ending at offset `index`. + // + // This method uses `previousScalar` to looks back in the string as far as + // necessary to find a correct grapheme cluster boundary, whether or not + // `index` happens to be on a boundary itself. internal func previousBoundary( endingAt index: Int, - startIndex: Int, - previousScalar: (Int) -> (Unicode.Scalar, start: Int)? + previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)? ) -> Int { - _internalInvariant(index > startIndex) - var state = _GraphemeBreakingState() + // FIXME: This requires potentially arbitrary lookback in each iteration, + // leading to quadratic behavior in some edge cases. Ideally lookback should + // only be done once per cluster (or in the case of RI sequences, once per + // flag sequence). One way to avoid most quadratic behavior is to replace + // this implementation with a scheme that first searches backwards for a + // safe point then iterates forward using the regular `shouldBreak` until we + // reach `index`, as recommended in section 6.4 of TR#29. + // + // https://www.unicode.org/reports/tr29/#Random_Access + var (scalar2, index) = previousScalar(index)! 
while true { guard let (scalar1, previousIndex) = previousScalar(index) else { break } - if shouldBreak( - scalar1, - between: scalar2, - &state, - index, - startIndex: startIndex, - isBackwards: true + if shouldBreakWithLookback( + between: scalar1, and: scalar2, at: index, with: previousScalar ) { break } @@ -477,18 +506,22 @@ extension _StringGuts { } extension _StringGuts { - // The "algorithm" that determines whether or not we should break between - // certain grapheme break properties. + // Return true if there is an extended grapheme cluster boundary between two + // scalars, based on state information previously collected about preceding + // scalars. // - // This is based off of the Unicode Annex #29 for [Grapheme Cluster Boundary + // This method never looks at scalars other than the two that are explicitly + // passed to it. The `state` parameter is assumed to hold all contextual + // information necessary to make a correct decision; it gets updated with more + // data as needed. + // + // This is based on the Unicode Annex #29 for [Grapheme Cluster Boundary // Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules). internal func shouldBreak( - _ scalar1: Unicode.Scalar, - between scalar2: Unicode.Scalar, - _ state: inout _GraphemeBreakingState, - _ index: Int, - startIndex: Int = 0, - isBackwards: Bool = false + between scalar1: Unicode.Scalar, + and scalar2: Unicode.Scalar, + at index: Int, + with state: inout _GraphemeBreakingState ) -> Bool { // GB3 if scalar1.value == 0xD, scalar2.value == 0xA { @@ -565,8 +598,7 @@ extension _StringGuts { // If we're currently in an indic sequence (or if our lhs is a linking // consonant), then this check and everything underneath ensures that // we continue being in one and may check if this extend is a Virama. 
- if state.isInIndicSequence || - (!isBackwards && scalar1._isLinkingConsonant) { + if state.isInIndicSequence || scalar1._isLinkingConsonant { if y == .extend { let extendNormData = Unicode._NormData(scalar2, fastUpperbound: 0x300) @@ -595,18 +627,10 @@ extension _StringGuts { // GB11 case (.zwj, .extendedPictographic): - if isBackwards { - return !checkIfInEmojiSequence(index, startIndex: startIndex) - } - return !state.isInEmojiSequence // GB12 & GB13 case (.regionalIndicator, .regionalIndicator): - if isBackwards { - return countRIs(index, startIndex: startIndex) - } - defer { state.shouldBreakRI.toggle() } @@ -616,33 +640,119 @@ extension _StringGuts { // GB999 default: // GB9c - if !isBackwards, state.isInIndicSequence, state.hasSeenVirama, - scalar2._isLinkingConsonant { + if + state.isInIndicSequence, + state.hasSeenVirama, + scalar2._isLinkingConsonant + { state.hasSeenVirama = false return false } - // Handle GB9c when walking backwards. - if isBackwards { - switch (x, scalar2._isLinkingConsonant) { - case (.extend, true): - let extendNormData = Unicode._NormData(scalar1, fastUpperbound: 0x300) + return true + } + } - guard extendNormData.ccc != 0 else { - return true - } + // Return true if there is an extended grapheme cluster boundary between two + // scalars, with no previous knowledge about preceding scalars. + // + // This method looks back as far as it needs to determine the correct + // placement of boundaries. + // + // This is based off of the Unicode Annex #29 for [Grapheme Cluster Boundary + // Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules). + internal func shouldBreakWithLookback( + between scalar1: Unicode.Scalar, + and scalar2: Unicode.Scalar, + at index: Int, + with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)? 
+ ) -> Bool { + // GB3 + if scalar1.value == 0xD, scalar2.value == 0xA { + return false + } - return !checkIfInIndicSequence(index, startIndex: startIndex) + if _hasGraphemeBreakBetween(scalar1, scalar2) { + return true + } + + let x = Unicode._GraphemeBreakProperty(from: scalar1) + let y = Unicode._GraphemeBreakProperty(from: scalar2) - case (.zwj, true): - return !checkIfInIndicSequence(index, startIndex: startIndex) + switch (x, y) { - default: + // Fast path: If we know our scalars have no properties the decision is + // trivial and we don't need to crawl to the default statement. + case (.any, .any): + return true + + // GB4 + case (.control, _): + return true + + // GB5 + case (_, .control): + return true + + // GB6 + case (.l, .l), + (.l, .v), + (.l, .lv), + (.l, .lvt): + return false + + // GB7 + case (.lv, .v), + (.v, .v), + (.lv, .t), + (.v, .t): + return false + + // GB8 + case (.lvt, .t), + (.t, .t): + return false + + // GB9 (partial GB11) + case (_, .extend), + (_, .zwj): + return false + + // GB9a + case (_, .spacingMark): + return false + + // GB9b + case (.prepend, _): + return false + + // GB11 + case (.zwj, .extendedPictographic): + return !checkIfInEmojiSequence(at: index, with: previousScalar) + + // GB12 & GB13 + case (.regionalIndicator, .regionalIndicator): + return countRIs(at: index, with: previousScalar) + + // GB999 + default: + // GB9c + switch (x, scalar2._isLinkingConsonant) { + case (.extend, true): + let extendNormData = Unicode._NormData(scalar1, fastUpperbound: 0x300) + + guard extendNormData.ccc != 0 else { return true } - } - return true + return !checkIfInIndicSequence(at: index, with: previousScalar) + + case (.zwj, true): + return !checkIfInIndicSequence(at: index, with: previousScalar) + + default: + return true + } } } @@ -689,18 +799,14 @@ extension _StringGuts { // | = We found our starting .extendedPictographic letting us // know that we are in an emoji sequence so our initial // break question is answered as NO. 
- internal func checkIfInEmojiSequence(_ index: Int, startIndex: Int) -> Bool { - guard index > startIndex else { return false } - - var emojiIdx = String.Index(_encodedOffset: index) - let scalars = String.UnicodeScalarView(self) - scalars.formIndex(before: &emojiIdx) - - while emojiIdx._encodedOffset > startIndex { - scalars.formIndex(before: &emojiIdx) - let scalar = scalars[emojiIdx] - - let gbp = Unicode._GraphemeBreakProperty(from: scalar) + internal func checkIfInEmojiSequence( + at index: Int, + with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)? + ) -> Bool { + guard var i = previousScalar(index)?.start else { return false } + while let prev = previousScalar(i) { + i = prev.start + let gbp = Unicode._GraphemeBreakProperty(from: prev.scalar) switch gbp { case .extend: @@ -711,7 +817,6 @@ extension _StringGuts { return false } } - return false } @@ -743,26 +848,17 @@ extension _StringGuts { // ^ // | = Is a linking consonant and we've seen a virama, so this is a // legitimate indic sequence, so do NOT break the initial question. - internal func checkIfInIndicSequence(_ index: Int, startIndex: Int) -> Bool { - guard index > startIndex else { return false } - - var indicIdx = String.Index(_encodedOffset: index) - let scalars = String.UnicodeScalarView(self) - scalars.formIndex(before: &indicIdx) - - var hasSeenVirama = false - - // Check if the first extend was the Virama. - let scalar = scalars[indicIdx] - - if scalar._isVirama { - hasSeenVirama = true - } + internal func checkIfInIndicSequence( + at index: Int, + with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)? 
+ ) -> Bool { + guard let p = previousScalar(index) else { return false } - while indicIdx._encodedOffset > startIndex { - scalars.formIndex(before: &indicIdx) - let scalar = scalars[indicIdx] + var hasSeenVirama = p.scalar._isVirama + var i = p.start + while let (scalar, prev) = previousScalar(i) { + i = prev let gbp = Unicode._GraphemeBreakProperty(from: scalar) switch (gbp, scalar._isLinkingConsonant) { @@ -782,17 +878,12 @@ extension _StringGuts { // LinkingConsonant case (_, true): - guard hasSeenVirama else { - return false - } - - return true + return hasSeenVirama default: return false } } - return false } @@ -826,28 +917,23 @@ extension _StringGuts { // ^ // | = Not a .regionalIndicator. riCount = 1 which is odd, so break // the last two .regionalIndicators. - internal func countRIs(_ index: Int, startIndex: Int) -> Bool { - guard index > startIndex else { return false } - - var riIdx = String.Index(_encodedOffset: index) + internal func countRIs( + at index: Int, + with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)? 
+ ) -> Bool { + guard let p = previousScalar(index) else { return false } + var i = p.start var riCount = 0 + while let p = previousScalar(i) { + i = p.start - let scalars = String.UnicodeScalarView(self) - scalars.formIndex(before: &riIdx) - - while riIdx._encodedOffset > startIndex { - scalars.formIndex(before: &riIdx) - let scalar = scalars[riIdx] - - let gbp = Unicode._GraphemeBreakProperty(from: scalar) - + let gbp = Unicode._GraphemeBreakProperty(from: p.scalar) guard gbp == .regionalIndicator else { break } riCount += 1 } - return riCount & 1 != 0 } } diff --git a/stdlib/public/core/StringGutsRangeReplaceable.swift b/stdlib/public/core/StringGutsRangeReplaceable.swift index d13ac157b213c..36b01bda4d277 100644 --- a/stdlib/public/core/StringGutsRangeReplaceable.swift +++ b/stdlib/public/core/StringGutsRangeReplaceable.swift @@ -486,8 +486,7 @@ extension _StringGuts { // Get the character stride in the entire string, not just the substring. // (Characters in a substring may end beyond the bounds of it.) - let newStride = _opaqueCharacterStride( - startingAt: utf8StartOffset, in: utf8StartOffset ..< count) + let newStride = _opaqueCharacterStride(startingAt: utf8StartOffset) startIndex = String.Index( encodedOffset: utf8StartOffset, @@ -526,9 +525,7 @@ extension _StringGuts { if oldRange.lowerBound <= oldBounds.lowerBound &+ oldStride { // Get the character stride in the entire string, not just the substring. // (Characters in a substring may end beyond the bounds of it.) 
- let newStride = _opaqueCharacterStride( - startingAt: newBounds.lowerBound, - in: newBounds.lowerBound ..< self.count) + let newStride = _opaqueCharacterStride(startingAt: newBounds.lowerBound) var newStart = String.Index( encodedOffset: newBounds.lowerBound, characterStride: newStride diff --git a/stdlib/public/core/StringIndex.swift b/stdlib/public/core/StringIndex.swift index 3dd4825fd3218..a34b9f4ceb168 100644 --- a/stdlib/public/core/StringIndex.swift +++ b/stdlib/public/core/StringIndex.swift @@ -37,8 +37,11 @@ isn't frozen. -- grapheme cache: A 6-bit value remembering the distance to the next grapheme -boundary. +- grapheme cache: A 6-bit value remembering the distance to the next extended + grapheme cluster boundary, or 0 if unknown. The value stored (if any) must be + calculated assuming that the index addresses a boundary itself, i.e., without + looking back at scalars preceding the index. (Substrings that don't start on a + `Character` boundary heavily rely on this.) - reserved: 4 unused bits available for future flags etc. The meaning of each bit may change between stdlib versions. These must be set to zero if @@ -317,8 +320,8 @@ extension String.Index { // Note that `startIndex` and `endIndex` have fully inlinable implementations. // This means that when code built on older releases runs on 5.7, this bit may // not be set on these, even though they are always `Character`-aligned. This is -// fine -- `index(after:)` and `index(before:)` do the right thing with -// minimal/no performance loss. +// fine -- `index(after:)` and `index(before:)` still do the right thing with +// minimal/no performance loss. (The start/end index is handled specially.) 
extension String.Index { @_alwaysEmitIntoClient // Swift 5.7 @inline(__always) diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index 8d986af371d86..7ece46a5bde7e 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -257,23 +257,26 @@ extension Substring: StringProtocol { // TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc. let stride = _characterStride(startingAt: i) - let nextOffset = i._encodedOffset &+ stride - _internalInvariant(nextOffset <= endIndex._encodedOffset) + + // Make sure a cached stride cannot lead us beyond the substring's end + // index. (This can happen if the substring's end isn't `Character` + // aligned.) + let nextOffset = Swift.min( + i._encodedOffset &+ stride, + endIndex._encodedOffset) let nextIndex = Index(_encodedOffset: nextOffset)._scalarAligned let nextStride = _characterStride(startingAt: nextIndex) var r = Index( encodedOffset: nextOffset, characterStride: nextStride)._scalarAligned - if - // Don't set the `_isCharacterAligned` bit in indices of exotic substrings - // whose startIndex isn't aligned on a grapheme cluster boundary. (Their - // grapheme breaks may not match with those in `base`.) - _startIsCharacterAligned, - // Likewise if this is the last character in a substring ending on a - // partial grapheme cluster. - _endIsCharacterAligned || nextOffset + nextStride < endIndex._encodedOffset - { + // Don't set the `_isCharacterAligned` bit in indices of exotic substrings + // whose startIndex isn't aligned on a grapheme cluster boundary. (Their + // grapheme breaks may not match with those in `base`.) + // + // Note that we don't need to care about whether the end index is aligned + // here. + if _startIsCharacterAligned { r = r._characterAligned } @@ -446,9 +449,13 @@ extension Substring: StringProtocol { // Note: SE-0180 requires us not to round `i` down to the nearest whole // `Character` boundary. 
let i = _validateScalarIndex(i) - let distance = _characterStride(startingAt: i) + let stride = _characterStride(startingAt: i) + // Don't let the subscript return data outside this substring. + let endOffset = Swift.min( + i._encodedOffset &+ stride, + endIndex._encodedOffset) return _wholeGuts.errorCorrectedCharacter( - startingAt: i._encodedOffset, endingAt: i._encodedOffset &+ distance) + startingAt: i._encodedOffset, endingAt: endOffset) } public mutating func replaceSubrange( @@ -588,30 +595,43 @@ extension Substring: StringProtocol { } extension Substring { + /// Return the length of the extended grapheme cluster that begins at `i`. + /// + /// This method assumes that `i` starts a new grapheme cluster; it does not + /// verify that this is actually the case. If it isn't, then the return value + /// reflects grapheme breaking results as if the string started at `i`, + /// ignoring every preceding scalar. + /// + /// - Parameter `i`: An index within the bounds of this substring. internal func _characterStride(startingAt i: Index) -> Int { _internalInvariant(i._isScalarAligned) _internalInvariant(i._encodedOffset <= _wholeGuts.count) - // Implicit precondition: `i` must be `Character`-aligned within this - // substring, even if it doesn't have the corresponding flag set. - - // If the index has a character stride, we are therefore free to use it. - if let d = i.characterStride { - // However, make sure a cached stride cannot lead us beyond the - // substring's end index. This can happen if the substring's end isn't - // also `Character` aligned, and someone passes us an index that comes - // from the base string. - return Swift.min(d, endIndex._encodedOffset &- i._encodedOffset) - } + // If the index has a character stride, it reflects the stride assuming that + // it addresses a `Character` boundary, which is exactly what we want. 
+ if let d = i.characterStride { return d } if i._encodedOffset == endIndex._encodedOffset { return 0 } - // If we don't have cached information, we can simply invoke the forward-only - // grapheme breaking algorithm. - return _wholeGuts._opaqueCharacterStride( - startingAt: i._encodedOffset, in: _offsetRange) + // If we don't have cached information, we can simply invoke the + // forward-only grapheme breaking algorithm. Note that this ignores the + // Substring bounds; this is okay because this method never looks back at + // preceding scalars, so it will place the boundary at the right position in + // the substring. The reported stride may go above the end index, but that + // case is handled in the caller. + return _wholeGuts._opaqueCharacterStride(startingAt: i._encodedOffset) } + + /// Return the length of the extended grapheme cluster that ends with, or + /// includes, `i`. + /// + /// This method does not assume that `i` addresses a grapheme cluster + /// boundary; it looks back as far as necessary within the substring to find + /// the right boundary location, stopping at the start index to prevent + /// results that are inconsistent with `_characterStride(startingAt:)`. + /// + /// - Parameter `i`: An index within the bounds of this substring. internal func _characterStride(endingAt i: Index) -> Int { // Implicit precondition: `i` must be `Character`-aligned within this // substring, even if it doesn't have the corresponding flag set. 
From 3f2550feb4dc37235a6c46223d0803d0c0be8f30 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Tue, 5 Apr 2022 21:42:17 -0700 Subject: [PATCH 46/83] [test] stdlib/StringGraphemeBreaking: Add backwards checks --- .../stdlib/StringGraphemeBreaking.swift | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/validation-test/stdlib/StringGraphemeBreaking.swift b/validation-test/stdlib/StringGraphemeBreaking.swift index 650e772292427..3ae6c5147cca4 100644 --- a/validation-test/stdlib/StringGraphemeBreaking.swift +++ b/validation-test/stdlib/StringGraphemeBreaking.swift @@ -11,10 +11,29 @@ import Foundation let StringGraphemeBreaking = TestSuite("StringGraphemeBreaking") +extension String { + var backwardsCount: Int { + var c = 0 + var index = endIndex + while index != startIndex { + c += 1 + formIndex(before: &index) + } + return c + } +} + if #available(SwiftStdlib 5.6, *) { StringGraphemeBreaking.test("grapheme breaking") { for graphemeBreakTest in graphemeBreakTests { - expectEqual(graphemeBreakTest.1, graphemeBreakTest.0.count) + expectEqual( + graphemeBreakTest.1, + graphemeBreakTest.0.count, + "string: \(String(reflecting: graphemeBreakTest.0))") + expectEqual( + graphemeBreakTest.1, + graphemeBreakTest.0.backwardsCount, + "string: \(String(reflecting: graphemeBreakTest.0))") } } } @@ -80,7 +99,12 @@ if #available(SwiftStdlib 5.6, *) { let test = foreignTest as String expectTrue(test._guts._isForeign()) - expectEqual(graphemeBreakTest.1, test.count) + expectEqual( + graphemeBreakTest.1, test.count, + "string: \(String(reflecting: graphemeBreakTest.0))") + expectEqual( + graphemeBreakTest.1, test.backwardsCount, + "string: \(String(reflecting: graphemeBreakTest.0))") } } } From 71216009e32d0af71dd141d67c27b63270ff50e5 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Wed, 6 Apr 2022 17:09:39 -0700 Subject: [PATCH 47/83] [test] Move useful helpers into StdlibUnicodeUnittest --- .../StdlibUnicodeUnittest.swift | 139 ++++++++++++++ 
test/stdlib/StringIndex.swift | 169 +++--------------- 2 files changed, 162 insertions(+), 146 deletions(-) diff --git a/stdlib/private/StdlibUnicodeUnittest/StdlibUnicodeUnittest.swift b/stdlib/private/StdlibUnicodeUnittest/StdlibUnicodeUnittest.swift index ecf18524a1680..8540b71865ec9 100644 --- a/stdlib/private/StdlibUnicodeUnittest/StdlibUnicodeUnittest.swift +++ b/stdlib/private/StdlibUnicodeUnittest/StdlibUnicodeUnittest.swift @@ -779,3 +779,142 @@ public let utf16Tests = [ ], ] +extension String { + /// Print out a full list of indices in every view of this string. + /// This is useful while debugging string indexing issues. + public func dumpIndices() { + print("-------------------------------------------------------------------") + print("String: \(String(reflecting: self))") + print("Characters:") + self.indices.forEach { i in + let char = self[i] + print(" \(i) -> \(String(reflecting: char))") + } + print("Scalars:") + self.unicodeScalars.indices.forEach { i in + let scalar = self.unicodeScalars[i] + let value = String(scalar.value, radix: 16, uppercase: true) + let padding = String(repeating: "0", count: max(0, 4 - value.count)) + let name = scalar.properties.name ?? "\(scalar.debugDescription)" + print(" \(i) -> U+\(padding)\(value) \(name)") + } + print("UTF-8:") + self.utf8.indices.forEach { i in + let code = self.utf8[i] + let value = String(code, radix: 16, uppercase: true) + let padding = value.count < 2 ? "0" : "" + print(" \(i) -> \(padding)\(value)") + } + print("UTF-16:") + self.utf16.indices.forEach { i in + let code = self.utf16[i] + let value = String(code, radix: 16, uppercase: true) + let padding = String(repeating: "0", count: 4 - value.count) + print(" \(i) -> \(padding)\(value)") + } + } + + // Returns a list of every valid index in every string view, optionally + // including end indices. We keep equal indices originating from different + // views because they may have different grapheme size caches or flags etc. 
+ public func allIndices(includingEnd: Bool = true) -> [String.Index] { + var r = Array(self.indices) + if includingEnd { r.append(self.endIndex) } + r += Array(self.unicodeScalars.indices) + if includingEnd { r.append(self.unicodeScalars.endIndex) } + r += Array(self.utf8.indices) + if includingEnd { r.append(self.utf8.endIndex) } + r += Array(self.utf16.indices) + if includingEnd { r.append(self.utf16.endIndex) } + return r + } +} + +extension Substring { + // Returns a list of every valid index in every substring view, optionally + // including end indices. We keep equal indices originating from different + // views because they may have different grapheme size caches or flags etc. + public func allIndices(includingEnd: Bool = true) -> [String.Index] { + var r = Array(self.indices) + if includingEnd { r.append(self.endIndex) } + r += Array(self.unicodeScalars.indices) + if includingEnd { r.append(self.unicodeScalars.endIndex) } + r += Array(self.utf8.indices) + if includingEnd { r.append(self.utf8.endIndex) } + r += Array(self.utf16.indices) + if includingEnd { r.append(self.utf16.endIndex) } + return r + } +} + +extension Collection { + // Assuming both `self` and `other` use the same index space, call `body` for + // each index `i` in `other`, along with the slice in `self` that begins at + // `i` and ends at the index following it in `other`. + // + // `other` must start with an item that is less than or equal to the first + // item in `self`. 
+ func forEachIndexGroup( + by other: G, + body: (G.Index, Self.SubSequence, Int) throws -> Void + ) rethrows + where G.Index == Self.Index + { + if other.isEmpty { + assert(self.isEmpty) + return + } + var i = other.startIndex + var j = self.startIndex + var offset = 0 + while i != other.endIndex { + let current = i + other.formIndex(after: &i) + let start = j + while j < i, j < self.endIndex { + self.formIndex(after: &j) + } + let end = j + try body(current, self[start ..< end], offset) + offset += 1 + } + } +} + +extension String { + /// Returns a dictionary mapping each valid index to the index that addresses + /// the nearest scalar boundary, rounding down. + public func scalarMap() -> [Index: (index: Index, offset: Int)] { + var map: [Index: (index: Index, offset: Int)] = [:] + + utf8.forEachIndexGroup(by: unicodeScalars) { scalar, slice, offset in + for i in slice.indices { map[i] = (scalar, offset) } + } + utf16.forEachIndexGroup(by: unicodeScalars) { scalar, slice, offset in + for i in slice.indices { map[i] = (scalar, offset) } + } + self.forEachIndexGroup(by: unicodeScalars) { scalar, slice, offset in + for i in slice.indices { map[i] = (scalar, offset) } + } + map[endIndex] = (endIndex, unicodeScalars.count) + return map + } + + /// Returns a dictionary mapping each valid index to the index that addresses + /// the nearest character boundary, rounding down. 
+ public func characterMap() -> [Index: (index: Index, offset: Int)] { + var map: [Index: (index: Index, offset: Int)] = [:] + utf8.forEachIndexGroup(by: self) { char, slice, offset in + for i in slice.indices { map[i] = (char, offset) } + } + utf16.forEachIndexGroup(by: self) { char, slice, offset in + for i in slice.indices { map[i] = (char, offset) } + } + unicodeScalars.forEachIndexGroup(by: self) { char, slice, offset in + for i in slice.indices { map[i] = (char, offset) } + } + map[endIndex] = (endIndex, count) + return map + } +} + diff --git a/test/stdlib/StringIndex.swift b/test/stdlib/StringIndex.swift index a28ef86d177b7..6e64926051423 100644 --- a/test/stdlib/StringIndex.swift +++ b/test/stdlib/StringIndex.swift @@ -1,4 +1,4 @@ -// RUN: %target-run-simple-swift +// RUN: %target-run-stdlib-swift %S/Inputs/ // REQUIRES: executable_test // UNSUPPORTED: freestanding @@ -6,6 +6,7 @@ import StdlibUnittest #if _runtime(_ObjC) import Foundation #endif +import StdlibUnicodeUnittest var suite = TestSuite("StringIndexTests") defer { runAllTests() } @@ -18,40 +19,6 @@ enum SimpleString: String { case emoji = "😀😃🤢🤮👩🏿‍🎤🧛🏻‍♂️🧛🏻‍♂️👩‍👩‍👦‍👦" } -/// Print out a full list of indices in every view of `string`. -/// This is useful while debugging test failures in this test. -func dumpIndices(_ string: String) { - print("-------------------------------------------------------------------") - print("String: \(String(reflecting: string))") - print("Characters:") - string.indices.forEach { i in - let char = string[i] - print(" \(i) -> \(String(reflecting: char))") - } - print("Scalars:") - string.unicodeScalars.indices.forEach { i in - let scalar = string.unicodeScalars[i] - let value = String(scalar.value, radix: 16, uppercase: true) - let padding = String(repeating: "0", count: max(0, 4 - value.count)) - let name = scalar.properties.name ?? 
"\(scalar.debugDescription)" - print(" \(i) -> U+\(padding)\(value) \(name)") - } - print("UTF-8:") - string.utf8.indices.forEach { i in - let code = string.utf8[i] - let value = String(code, radix: 16, uppercase: true) - let padding = value.count < 2 ? "0" : "" - print(" \(i) -> \(padding)\(value)") - } - print("UTF-16:") - string.utf16.indices.forEach { i in - let code = string.utf16[i] - let value = String(code, radix: 16, uppercase: true) - let padding = String(repeating: "0", count: 4 - value.count) - print(" \(i) -> \(padding)\(value)") - } -} - let simpleStrings: [String] = [ SimpleString.smallASCII.rawValue, SimpleString.smallUnicode.rawValue, @@ -352,7 +319,7 @@ suite.test("Exhaustive Index Interchange") return } - //dumpIndices(str) + //str.dumpIndices() var curCharIdx = str.startIndex var curScalarIdx = str.startIndex @@ -482,111 +449,7 @@ suite.test("Exhaustive Index Interchange") } #endif -extension Collection { - // Assuming both `self` and `other` use the same index space, call `body` for - // each index `i` in `other`, along with the slice in `self` that begins at - // `i` and ends at the index following it in `other`. - // - // `other` must start with an item that is less than or equal to the first - // item in `self`. - func forEachIndexGroup( - by other: G, - body: (G.Index, Self.SubSequence, Int) throws -> Void - ) rethrows - where G.Index == Self.Index - { - if other.isEmpty { - assert(self.isEmpty) - return - } - var i = other.startIndex - var j = self.startIndex - var offset = 0 - while i != other.endIndex { - let current = i - other.formIndex(after: &i) - let start = j - while j < i, j < self.endIndex { - self.formIndex(after: &j) - } - let end = j - try body(current, self[start ..< end], offset) - offset += 1 - } - } -} - -extension String { - // Returns a list of every valid index in every string view, optionally - // including end indices. 
We keep equal indices originating from different - // views because they may have different grapheme size caches or flags etc. - func allIndices(includingEnd: Bool = true) -> [String.Index] { - var r = Array(self.indices) - if includingEnd { r.append(self.endIndex) } - r += Array(self.unicodeScalars.indices) - if includingEnd { r.append(self.unicodeScalars.endIndex) } - r += Array(self.utf8.indices) - if includingEnd { r.append(self.utf8.endIndex) } - r += Array(self.utf16.indices) - if includingEnd { r.append(self.utf16.endIndex) } - return r - } - - /// Returns a dictionary mapping each valid index to the index that lies on - /// the nearest scalar boundary, rounding down. - func scalarMap() -> [String.Index: (index: String.Index, offset: Int)] { - var map: [String.Index: (index: String.Index, offset: Int)] = [:] - - self.utf8.forEachIndexGroup(by: self.unicodeScalars) { scalar, slice, offset in - for i in slice.indices { map[i] = (scalar, offset) } - } - self.utf16.forEachIndexGroup(by: self.unicodeScalars) { scalar, slice, offset in - for i in slice.indices { map[i] = (scalar, offset) } - } - self.forEachIndexGroup(by: self.unicodeScalars) { scalar, slice, offset in - for i in slice.indices { map[i] = (scalar, offset) } - } - map[endIndex] = (endIndex, self.unicodeScalars.count) - return map - } - - /// Returns a dictionary mapping each valid index to the index that lies on - /// the nearest character boundary, rounding down. 
- func characterMap() -> [String.Index: (index: String.Index, offset: Int)] { - var map: [String.Index: (index: String.Index, offset: Int)] = [:] - self.utf8.forEachIndexGroup(by: self) { char, slice, offset in - for i in slice.indices { map[i] = (char, offset) } - } - self.utf16.forEachIndexGroup(by: self) { char, slice, offset in - for i in slice.indices { map[i] = (char, offset) } - } - self.unicodeScalars.forEachIndexGroup(by: self) { char, slice, offset in - for i in slice.indices { map[i] = (char, offset) } - } - map[endIndex] = (endIndex, count) - return map - } -} - -extension Substring { - // Returns a list of every valid index in every string view, optionally - // including end indices. We keep equal indices originating from different - // views because they may have different grapheme size caches or flags etc. - func allIndices(includingEnd: Bool = true) -> [String.Index] { - var r = Array(self.indices) - if includingEnd { r.append(self.endIndex) } - r += Array(self.unicodeScalars.indices) - if includingEnd { r.append(self.unicodeScalars.endIndex) } - r += Array(self.utf8.indices) - if includingEnd { r.append(self.utf8.endIndex) } - r += Array(self.utf16.indices) - if includingEnd { r.append(self.utf16.endIndex) } - return r - } -} - -suite.test("Fully exhaustive index interchange") -.forEach(in: examples) { string in +func fullyExhaustiveIndexInterchange(_ string: String) { guard #available(SwiftStdlib 5.7, *) else { // Index navigation in 5.7 always rounds input indices down to the nearest // Character, so that we always have a well-defined distance between @@ -596,7 +459,7 @@ suite.test("Fully exhaustive index interchange") return } - //dumpIndices(string) + //string.dumpIndices() let scalarMap = string.scalarMap() let characterMap = string.characterMap() @@ -736,6 +599,18 @@ suite.test("Fully exhaustive index interchange") } } +suite.test("Fully exhaustive index interchange") +.forEach(in: examples) { string in + 
fullyExhaustiveIndexInterchange(string) +} + +suite.test("Fully exhaustive index interchange/GraphemeBreakTests") { + for string in graphemeBreakTests.map { $0.0 } { + fullyExhaustiveIndexInterchange(string) + } +} + + suite.test("Global vs local grapheme cluster boundaries") { guard #available(SwiftStdlib 5.7, *) else { // Index navigation in 5.7 always rounds input indices down to the nearest @@ -864,14 +739,14 @@ suite.test("Index encoding correction") { // If the mutation's effect included the data addressed by the original index, // then we may still get nonsensical results. var s = ("🫱🏼‍🫲🏽 a 🧑🏽‍🌾 b" as NSString) as String - //dumpIndices(s) + //s.dumpIndices() let originals = s.allIndices(includingEnd: false).map { ($0, s[$0], s.unicodeScalars[$0], s.utf8[$0], s.utf16[$0]) } s.append(".") - //dumpIndices(s) + //s.dumpIndices() for (i, char, scalar, u8, u16) in originals { expectEqual(s[i], char, "i: \(i)") @@ -893,7 +768,7 @@ suite.test("String.replaceSubrange index validation") return } - //dumpIndices(string) + //string.dumpIndices() let scalarMap = string.scalarMap() let allIndices = string.allIndices() @@ -958,12 +833,13 @@ suite.test("Substring.replaceSubrange index validation") return } - dumpIndices(string) + string.dumpIndices() let scalarMap = string.scalarMap() let allIndices = string.allIndices() for i in allIndices { + print(i) for j in allIndices { guard i <= j else { continue } let si = scalarMap[i]!.index @@ -1021,3 +897,4 @@ suite.test("Substring.replaceSubrange index validation") } } } + From b034c48b1daf92e1bce7e22b20c5a8948862918c Mon Sep 17 00:00:00 2001 From: Alex Hoppen Date: Thu, 7 Apr 2022 08:16:09 +0200 Subject: [PATCH 48/83] [CodeCompletion] Record fixes while solving result builders for code completion We record fixes while solving normal expressions for code completion and we should do the same when solving result builders if we are reporting the solutions to completion callbacks. 
--- include/swift/Sema/ConstraintSystem.h | 11 ++++--- lib/Sema/BuilderTransform.cpp | 9 ++++-- lib/Sema/CSSolver.cpp | 46 +++++++++++++++------------ 3 files changed, 39 insertions(+), 27 deletions(-) diff --git a/include/swift/Sema/ConstraintSystem.h b/include/swift/Sema/ConstraintSystem.h index 546c705d5a51c..3edd230ebb928 100644 --- a/include/swift/Sema/ConstraintSystem.h +++ b/include/swift/Sema/ConstraintSystem.h @@ -5296,18 +5296,21 @@ class ConstraintSystem { = FreeTypeVariableBinding::Disallow, bool allowFixes = false); - /// Construct and solve a system of constraints based on the given expression - /// and its contextual information. + /// Assuming that constraints have already been generated, solve the + /// constraint system for code completion, writing all solutions to + /// \p solutions. /// /// This method is designed to be used for code completion which means that /// it doesn't mutate given expression, even if there is a single valid /// solution, and constraint solver is allowed to produce partially correct /// solutions. Such solutions can have any number of holes in them. /// - /// \param target The expression involved in code completion. - /// /// \param solutions The solutions produced for the given target without /// filtering. + void solveForCodeCompletion(SmallVectorImpl &solutions); + + /// Generate constraints for \p target and solve the resulting constraint + /// system for code completion (see overload above). /// /// \returns `false` if this call fails (e.g. pre-check or constraint /// generation fails), `true` otherwise. diff --git a/lib/Sema/BuilderTransform.cpp b/lib/Sema/BuilderTransform.cpp index e0f4b6df0b6bb..3a44e894aed9d 100644 --- a/lib/Sema/BuilderTransform.cpp +++ b/lib/Sema/BuilderTransform.cpp @@ -1714,10 +1714,10 @@ Optional TypeChecker::applyResultBuilderBodyTransform( } // Solve the constraint system. 
- SmallVector solutions; - bool solvingFailed = cs.solve(solutions); - if (cs.getASTContext().CompletionCallback) { + SmallVector solutions; + cs.solveForCodeCompletion(solutions); + CompletionContextFinder analyzer(func, func->getDeclContext()); filterSolutionsForCodeCompletion(solutions, analyzer); for (const auto &solution : solutions) { @@ -1726,6 +1726,9 @@ Optional TypeChecker::applyResultBuilderBodyTransform( return nullptr; } + SmallVector solutions; + bool solvingFailed = cs.solve(solutions); + if (solvingFailed || solutions.size() != 1) { // Try to fix the system or provide a decent diagnostic. auto salvagedResult = cs.salvage(); diff --git a/lib/Sema/CSSolver.cpp b/lib/Sema/CSSolver.cpp index 10e1e2642a593..eb2aadbdbf751 100644 --- a/lib/Sema/CSSolver.cpp +++ b/lib/Sema/CSSolver.cpp @@ -1516,26 +1516,8 @@ void ConstraintSystem::solveImpl(SmallVectorImpl &solutions) { } } -bool ConstraintSystem::solveForCodeCompletion( - SolutionApplicationTarget &target, SmallVectorImpl &solutions) { - auto *expr = target.getAsExpr(); - // Tell the constraint system what the contextual type is. - setContextualType(expr, target.getExprContextualTypeLoc(), - target.getExprContextualTypePurpose()); - - // Set up the expression type checker timer. - Timer.emplace(expr, *this); - - shrink(expr); - - if (isDebugMode()) { - auto &log = llvm::errs(); - log << "--- Code Completion ---\n"; - } - - if (generateConstraints(target, FreeTypeVariableBinding::Disallow)) - return false; - +void ConstraintSystem::solveForCodeCompletion( + SmallVectorImpl &solutions) { { SolverState state(*this, FreeTypeVariableBinding::Disallow); @@ -1556,6 +1538,30 @@ bool ConstraintSystem::solveForCodeCompletion( } } + return; +} + +bool ConstraintSystem::solveForCodeCompletion( + SolutionApplicationTarget &target, SmallVectorImpl &solutions) { + auto *expr = target.getAsExpr(); + // Tell the constraint system what the contextual type is. 
+ setContextualType(expr, target.getExprContextualTypeLoc(), + target.getExprContextualTypePurpose()); + + // Set up the expression type checker timer. + Timer.emplace(expr, *this); + + shrink(expr); + + if (isDebugMode()) { + auto &log = llvm::errs(); + log << "--- Code Completion ---\n"; + } + + if (generateConstraints(target, FreeTypeVariableBinding::Disallow)) + return false; + + solveForCodeCompletion(solutions); return true; } From 1abd15ba23c2a6bf978a8608c24101f1aabf2d81 Mon Sep 17 00:00:00 2001 From: Alex Hoppen Date: Wed, 6 Apr 2022 14:44:53 +0200 Subject: [PATCH 49/83] [Sema] Print whether a type variable allows binding to a hole Helped me while debugging and might help others as well. --- lib/Sema/TypeCheckConstraints.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/Sema/TypeCheckConstraints.cpp b/lib/Sema/TypeCheckConstraints.cpp index a035bb77f8bb1..2275140eb6bc6 100644 --- a/lib/Sema/TypeCheckConstraints.cpp +++ b/lib/Sema/TypeCheckConstraints.cpp @@ -1410,6 +1410,8 @@ void ConstraintSystem::print(raw_ostream &out) const { out << " [inout allowed]"; if (tv->getImpl().canBindToNoEscape()) out << " [noescape allowed]"; + if (tv->getImpl().canBindToHole()) + out << " [hole allowed]"; auto rep = getRepresentative(tv); if (rep == tv) { if (auto fixed = getFixedType(tv)) { From 0b9644a0d4884268fd2a4f4558cea4fb737f9529 Mon Sep 17 00:00:00 2001 From: Alex Hoppen Date: Fri, 8 Apr 2022 11:24:31 +0200 Subject: [PATCH 50/83] [CodeCompletion] Report type relations when completing inside result builders MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This requires navigating the constraint system solution to retrieve the argument type of the `buildBlock` call. The comments in the code should describe what I’m doing well enough. 
rdar://83846531 --- lib/Sema/ConstraintSystem.cpp | 75 ++++++++++++++++++++--- test/IDE/complete_in_result_builder.swift | 48 ++++++++++++--- 2 files changed, 107 insertions(+), 16 deletions(-) diff --git a/lib/Sema/ConstraintSystem.cpp b/lib/Sema/ConstraintSystem.cpp index b10cecec71474..bf34c5e694762 100644 --- a/lib/Sema/ConstraintSystem.cpp +++ b/lib/Sema/ConstraintSystem.cpp @@ -3558,18 +3558,79 @@ Type Solution::simplifyTypeForCodeCompletion(Type Ty) const { // Replace all type variables (which must come from placeholders) by their // generic parameters. Because we call into simplifyTypeImpl - Ty = CS.simplifyTypeImpl(Ty, [](TypeVariableType *typeVar) -> Type { - if (auto *GP = typeVar->getImpl().getGenericParameter()) { - // Code completion depends on generic parameter type being - // represented in terms of `ArchetypeType` since it's easy - // to extract protocol requirements from it. - if (auto *GPD = GP->getDecl()) { - return GPD->getInnermostDeclContext()->mapTypeIntoContext(GP); + Ty = CS.simplifyTypeImpl(Ty, [&CS](TypeVariableType *typeVar) -> Type { + // Code completion depends on generic parameter type being represented in + // terms of `ArchetypeType` since it's easy to extract protocol requirements + // from it. + auto getTypeVarAsArchetype = [](TypeVariableType *typeVar) -> Type { + if (auto *GP = typeVar->getImpl().getGenericParameter()) { + if (auto *GPD = GP->getDecl()) { + return GPD->getInnermostDeclContext()->mapTypeIntoContext(GP); + } + } + return Type(); + }; + + if (auto archetype = getTypeVarAsArchetype(typeVar)) { + return archetype; + } + + // When applying the logic below to get contextual types inside result + // builders, the code completion type variable is connected by a one-way + // constraint to a type variable in the buildBlock call, but that is not the + // type variable that represents the argument type. We need to find the type + // variable representing the argument to retrieve protocol requirements from + // it. 
Look for an ArgumentConversion constraint that allows us to retrieve + // the argument type var. + for (auto argConstraint : + CS.getConstraintGraph()[typeVar].getConstraints()) { + if (argConstraint->getKind() == ConstraintKind::ArgumentConversion && + argConstraint->getFirstType()->getRValueType()->isEqual(typeVar)) { + if (auto argTV = + argConstraint->getSecondType()->getAs<TypeVariableType>()) { + if (auto archetype = getTypeVarAsArchetype(argTV)) { + return archetype; + } + } } } + return typeVar; }); + // Logic to determine the contextual type inside buildBlock result builders: + // + // When completing inside a result builder, the result builder + // @ViewBuilder var body: some View { + // Text("Foo") + // #^COMPLETE^# + // } + // gets rewritten to + // @ViewBuilder var body: some View { + // let $__builder2: Text + // let $__builder0 = Text("Foo") + // let $__builder1 = #^COMPLETE^# + // $__builder2 = ViewBuilder.buildBlock($__builder0, $__builder1) + // return $__builder2 + // } + // Inside the constraint system + // let $__builder1 = #^COMPLETE^# + // gets type checked without context, so we can't know the contextual type for + // the code completion token. But we know that $__builder1 (and thus the type + // of #^COMPLETE^#) is used as the second argument to ViewBuilder.buildBlock, + // so we can extract the contextual type from that call. To do this, figure + // out the type variable that is used for $__builder1 in the buildBlock call. + // This type variable is connected to the type variable of $__builder1's + // definition by a one-way constraint.
+ if (auto TV = Ty->getAs<TypeVariableType>()) { + for (auto constraint : CS.getConstraintGraph()[TV].getConstraints()) { + if (constraint->getKind() == ConstraintKind::OneWayEqual && + constraint->getSecondType()->isEqual(TV)) { + return simplifyTypeForCodeCompletion(constraint->getFirstType()); + } + } + } + // Remove any remaining type variables and placeholders Ty = simplifyType(Ty); diff --git a/test/IDE/complete_in_result_builder.swift b/test/IDE/complete_in_result_builder.swift index 151a7d76b32f0..9ddd32a3b7a71 100644 --- a/test/IDE/complete_in_result_builder.swift +++ b/test/IDE/complete_in_result_builder.swift @@ -36,25 +36,28 @@ func testGlobalLookup() { @TupleBuilder var x1 { #^GLOBAL_LOOKUP^# // GLOBAL_LOOKUP: Begin completions - // GLOBAL_LOOKUP: Decl[GlobalVar]/CurrModule: MyConstantString[#String#]; + // GLOBAL_LOOKUP: Decl[GlobalVar]/CurrModule/TypeRelation[Identical]: MyConstantString[#String#]; // GLOBAL_LOOKUP: End completions } @TupleBuilder var x2 { if true { - #^GLOBAL_LOOKUP_IN_IF_BODY?check=GLOBAL_LOOKUP^# + #^GLOBAL_LOOKUP_IN_IF_BODY?check=GLOBAL_LOOKUP_NO_TYPE_RELATION^# +// GLOBAL_LOOKUP_NO_TYPE_RELATION: Begin completions +// GLOBAL_LOOKUP_NO_TYPE_RELATION: Decl[GlobalVar]/CurrModule: MyConstantString[#String#]; +// GLOBAL_LOOKUP_NO_TYPE_RELATION: End completions } } @TupleBuilder var x3 { if { - #^GLOBAL_LOOKUP_IN_IF_BODY_WITHOUT_CONDITION?check=GLOBAL_LOOKUP^# + #^GLOBAL_LOOKUP_IN_IF_BODY_WITHOUT_CONDITION?check=GLOBAL_LOOKUP_NO_TYPE_RELATION^# } } @TupleBuilder var x4 { guard else { - #^GLOBAL_LOOKUP_IN_GUARD_BODY_WITHOUT_CONDITION?check=GLOBAL_LOOKUP^# + #^GLOBAL_LOOKUP_IN_GUARD_BODY_WITHOUT_CONDITION?check=GLOBAL_LOOKUP_NO_TYPE_RELATION^# } } @@ -78,13 +81,16 @@ func testStaticMemberLookup() { @TupleBuilder var x1 { StringFactory.#^COMPLETE_STATIC_MEMBER^# // COMPLETE_STATIC_MEMBER: Begin completions - // COMPLETE_STATIC_MEMBER: Decl[StaticMethod]/CurrNominal: makeString({#x: String#})[#String#]; + // COMPLETE_STATIC_MEMBER:
Decl[StaticMethod]/CurrNominal/TypeRelation[Identical]: makeString({#x: String#})[#String#]; // COMPLETE_STATIC_MEMBER: End completions } @TupleBuilder var x2 { if true { - StringFactory.#^COMPLETE_STATIC_MEMBER_IN_IF_BODY?check=COMPLETE_STATIC_MEMBER^# + StringFactory.#^COMPLETE_STATIC_MEMBER_IN_IF_BODY^# +// COMPLETE_STATIC_MEMBER_IN_IF_BODY: Begin completions +// COMPLETE_STATIC_MEMBER_IN_IF_BODY: Decl[StaticMethod]/CurrNominal: makeString({#x: String#})[#String#]; +// COMPLETE_STATIC_MEMBER_IN_IF_BODY: End completions } } @@ -208,13 +214,37 @@ func testCompleteInStringLiteral() { // STRING_LITERAL_VAR-DAG: Keyword[self]/CurrNominal: self[#Island#]; name=self // STRING_LITERAL_VAR-DAG: Decl[InstanceVar]/CurrNominal/TypeRelation[Convertible]: turnipPrice[#String#]; name=turnipPrice // STRING_LITERAL_VAR: End completions + } - - func bar(island: Island) { + func bar(island: Island) { BStack { Text("\(island.#^STRING_LITERAL_AS_ARGUMENT?check=STRING_LITERAL_VAR^#turnipPrice)") takeTrailingClosure {} - } + } + } +} + +func testTypeRelationInResultBuilder() { + protocol View2 {} + + @resultBuilder public struct ViewBuilder2 { + static func buildBlock<Content>(_ content: Content) -> Content where Content : View2 { fatalError() } + static func buildBlock<C0, C1>(_ c0: C0, _ c1: C1) -> C0 where C0 : View2, C1: View2 { fatalError() } } + struct MyText: View2 {} + + struct MyView { + @ViewBuilder2 var body: some View2 { + #^SINGLE_ELEMENT^# + } + // SINGLE_ELEMENT: Begin completions + // SINGLE_ELEMENT-DAG: Decl[Struct]/Local/TypeRelation[Convertible]: MyText[#MyText#]; + // SINGLE_ELEMENT: End completions + + @ViewBuilder2 var body2: some View2 { + MyText() + #^SECOND_ELEMENT?check=SINGLE_ELEMENT^# + } + } } From 83df814c6310c2957bee95d5c1c21df90f0273fc Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Wed, 6 Apr 2022 20:04:14 -0700 Subject: [PATCH 51/83] =?UTF-8?q?[stdlib]=20=5FStringObject.isKnownUTF16?= =?UTF-8?q?=20=E2=86=92=20isForeignUTF8?= MIME-Version: 1.0 Content-Type:
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes a compatibility issue with potential future UTF-8 encoded foreign String forms, as well as simplifying the code a bit — we no longer need to do an availability check on inlinable fast paths. The isForeignUTF8 bit is never set by any past or current stdlib version, but it allows us to introduce UTF-8 encoded foreign forms without breaking inlinable index encoding validation introduced in Swift 5.7. --- stdlib/public/core/StringBridge.swift | 3 +- stdlib/public/core/StringCharacterView.swift | 18 +-- stdlib/public/core/StringGuts.swift | 79 ++++------ .../core/StringGutsRangeReplaceable.swift | 4 +- stdlib/public/core/StringIndex.swift | 16 +- stdlib/public/core/StringObject.swift | 138 +++++++----------- .../public/core/StringUnicodeScalarView.swift | 4 +- stdlib/public/core/Substring.swift | 8 +- 8 files changed, 103 insertions(+), 167 deletions(-) diff --git a/stdlib/public/core/StringBridge.swift b/stdlib/public/core/StringBridge.swift index 195d702aacd38..a26bdc8d3cf37 100644 --- a/stdlib/public/core/StringBridge.swift +++ b/stdlib/public/core/StringBridge.swift @@ -616,8 +616,7 @@ extension String { let gutsCountAndFlags = _guts._object._countAndFlags let countAndFlags = _StringObject.CountAndFlags( sharedCount: _guts.count, - isASCII: gutsCountAndFlags.isASCII, - isUTF16: false) + isASCII: gutsCountAndFlags.isASCII) return __SharedStringStorage( immortal: _guts._object.fastUTF8.baseAddress!, countAndFlags: countAndFlags) diff --git a/stdlib/public/core/StringCharacterView.swift b/stdlib/public/core/StringCharacterView.swift index be27f67629b89..1b431d32883b7 100644 --- a/stdlib/public/core/StringCharacterView.swift +++ b/stdlib/public/core/StringCharacterView.swift @@ -55,8 +55,7 @@ extension String: BidirectionalCollection { /// - Returns: The index value immediately after `i`. 
public func index(after i: Index) -> Index { let i = _guts.roundDownToNearestCharacter(_guts.validateScalarIndex(i)) - let r = _uncheckedIndex(after: i) - return _guts.internalMarkEncoding(r) + return _uncheckedIndex(after: i) } /// A version of `index(after:)` that assumes that the given index: @@ -64,8 +63,6 @@ extension String: BidirectionalCollection { /// - has the right encoding, /// - is within bounds, and /// - is scalar aligned. - /// - /// It does not mark the encoding of the returned index. internal func _uncheckedIndex(after i: Index) -> Index { _internalInvariant(_guts.hasMatchingEncoding(i)) _internalInvariant(i < endIndex) @@ -77,7 +74,7 @@ extension String: BidirectionalCollection { let nextIndex = Index(_encodedOffset: nextOffset)._characterAligned let nextStride = _characterStride(startingAt: nextIndex) let r = Index(encodedOffset: nextOffset, characterStride: nextStride) - return _guts.internalMarkEncoding(r._characterAligned) + return _guts.markEncoding(r._characterAligned) } /// Returns the position immediately before the given index. @@ -92,8 +89,7 @@ extension String: BidirectionalCollection { // the `i > startIndex` check needs to come after rounding. _precondition(i > startIndex, "String index is out of bounds") - let r = _uncheckedIndex(before: i) - return _guts.internalMarkEncoding(r) + return _uncheckedIndex(before: i) } /// A version of `index(before:)` that assumes that the given index: @@ -101,8 +97,6 @@ extension String: BidirectionalCollection { /// - has the right encoding, /// - is within bounds, and /// - is character aligned. - /// - /// It does not mark the encoding of the returned index. 
internal func _uncheckedIndex(before i: Index) -> Index { _internalInvariant(_guts.hasMatchingEncoding(i)) _internalInvariant(i > startIndex && i <= endIndex) @@ -113,7 +107,7 @@ extension String: BidirectionalCollection { let priorOffset = i._encodedOffset &- stride let r = Index(encodedOffset: priorOffset, characterStride: stride) - return r._characterAligned + return _guts.markEncoding(r._characterAligned) } /// Returns an index that is the specified distance from the given index. @@ -158,7 +152,7 @@ extension String: BidirectionalCollection { i = _uncheckedIndex(before: i) } } - return _guts.internalMarkEncoding(i) + return i } /// Returns an index that is the specified distance from the given index, @@ -238,7 +232,7 @@ extension String: BidirectionalCollection { } guard limit > start || i >= limit else { return nil } } - return _guts.internalMarkEncoding(i) + return i } /// Returns the distance between two indices. diff --git a/stdlib/public/core/StringGuts.swift b/stdlib/public/core/StringGuts.swift index 0c3d0ba18114b..331aa6ad59731 100644 --- a/stdlib/public/core/StringGuts.swift +++ b/stdlib/public/core/StringGuts.swift @@ -311,55 +311,36 @@ func _isSwiftStdlib_5_7() -> Bool { // Encoding extension _StringGuts { - /// Returns whether this string is known to use UTF-16 code units. + /// Returns whether this string has a UTF-8 storage representation. /// - /// This always returns a value corresponding to the string's actual encoding - /// on stdlib versions >=5.7. + /// This always returns a value corresponding to the string's actual encoding. + @_alwaysEmitIntoClient + @inline(__always) + internal var isUTF8: Bool { _object.isUTF8 } + + /// Returns whether this string has a UTF-16 storage representation. /// - /// Standard Library versions <=5.6 did not set the corresponding flag, so - /// this property always returns false. + /// This always returns a value corresponding to the string's actual encoding. 
@_alwaysEmitIntoClient @inline(__always) - internal var isKnownUTF16: Bool { _object.isKnownUTF16 } + internal var isUTF16: Bool { _object.isUTF16 } @_alwaysEmitIntoClient // Swift 5.7 internal func markEncoding(_ i: String.Index) -> String.Index { - // In this inlinable function, we cannot assume that all foreign strings are - // UTF-16 encoded, as this code may run on a future stdlib that may have - // introduced other foreign forms. - if #available(macOS 9999, iOS 9999, watchOS 9999, tvOS 9999, *) { // SwiftStdlib 5.7 - // With a >=5.7 stdlib, we can rely on `isKnownUTF16` to contain the truth. - return isKnownUTF16 ? i._knownUTF16 : i._knownUTF8 - } - // We know that in stdlibs 5.0..<5.7, all foreign strings were UTF-16, - // so we can use `isForeign` to determine the encoding. - return isForeign ? i._knownUTF16 : i._knownUTF8 - } - - @inline(__always) - internal func internalMarkEncoding(_ i: String.Index) -> String.Index { - // This code is behind a resiliance boundary, so it always runs on a >=5.7 - // stdlib. Note though that it doesn't match the 5.7+ case in the inlinable - // version above! - // - // We know that in this version of the stdlib, foreign strings happen to - // always be UTF-16 encoded (like they were between 5.0 and 5.6), and - // looking at `isForeign` instead of `isKnownUTF16` may allow the stdlib's - // internal code to be better optimized -- so let's do that. - isForeign ? i._knownUTF16 : i._knownUTF8 + isUTF8 ? i._knownUTF8 : i._knownUTF16 } /// Returns true if the encoding of the given index isn't known to be in /// conflict with this string's encoding. /// - /// If the index or the string was created by code that was built on stdlibs - /// below 5.7, then this check may incorrectly return true on a mismatching - /// index, but it is guaranteed to never incorrectly return false. If all - /// loaded binaries were built in 5.7+, then this method is guaranteed to - /// always return the correct value. 
- @_alwaysEmitIntoClient + /// If the index was created by code that was built on a stdlib below 5.7, + /// then this check may incorrectly return true on a mismatching index, but it + /// is guaranteed to never incorrectly return false. If all loaded binaries + /// were built in 5.7+, then this method is guaranteed to always return the + /// correct value. + @_alwaysEmitIntoClient @inline(__always) internal func hasMatchingEncoding(_ i: String.Index) -> Bool { - (isForeign && i._canBeUTF16) || (!isForeign && i._canBeUTF8) + isUTF8 ? i._canBeUTF8 : i._canBeUTF16 } /// Return an index whose encoding can be assumed to match that of `self`. @@ -371,22 +352,20 @@ extension _StringGuts { @_alwaysEmitIntoClient @inline(__always) internal func ensureMatchingEncoding(_ i: String.Index) -> String.Index { - if _fastPath(!isForeign && i._canBeUTF8) { return i } + if _fastPath(hasMatchingEncoding(i)) { return i } return _slowEnsureMatchingEncoding(i) } @_alwaysEmitIntoClient @inline(never) internal func _slowEnsureMatchingEncoding(_ i: String.Index) -> String.Index { - _internalInvariant(isForeign || !i._canBeUTF8) - if isForeign { - // Opportunistically detect attempts to use an UTF-8 index on a UTF-16 - // string. Strings don't usually get converted to UTF-16 storage, so it - // seems okay to trap in this case -- the index most likely comes from an - // unrelated string. (Trapping here may still turn out to affect binary - // compatibility with broken code in existing binaries running with new - // stdlibs. If so, we can replace this with the same transcoding hack as - // in the UTF-16->8 case below.) + guard isUTF8 else { + // Attempt to use an UTF-8 index on a UTF-16 string. Strings don't usually + // get converted to UTF-16 storage, so it seems okay to trap in this case + // -- the index most likely comes from an unrelated string. (Trapping here + // may still turn out to affect binary compatibility with broken code in + // existing binaries running with new stdlibs. 
If so, we can replace this + // with the same transcoding hack as in the UTF-16->8 case below.) // // Note that this trap is not guaranteed to trigger when the process // includes client binaries compiled with a previous Swift release. @@ -397,13 +376,9 @@ extension _StringGuts { // // This trap can never trigger on OSes that have stdlibs <= 5.6, because // those versions never set the `isKnownUTF16` flag in `_StringObject`. - // - _precondition(!isKnownUTF16 || i._canBeUTF16, - "Invalid string index") - return i + _preconditionFailure("Invalid string index") } - // If we get here, then we know for sure that this is an attempt to use an - // UTF-16 index on a UTF-8 string. + // Attempt to use an UTF-16 index on a UTF-8 string. // // This can happen if `self` was originally verbatim-bridged, and someone // mistakenly attempts to keep using an old index after a mutation. This is diff --git a/stdlib/public/core/StringGutsRangeReplaceable.swift b/stdlib/public/core/StringGutsRangeReplaceable.swift index 36b01bda4d277..31b5ca32cf3e3 100644 --- a/stdlib/public/core/StringGutsRangeReplaceable.swift +++ b/stdlib/public/core/StringGutsRangeReplaceable.swift @@ -466,7 +466,7 @@ extension _StringGuts { _internalInvariant( subrange.lowerBound >= startIndex && subrange.upperBound <= endIndex) - if _slowPath(isKnownUTF16) { + if _slowPath(isUTF16) { // UTF-16 (i.e., foreign) string. The mutation will convert this to the // native UTF-8 encoding, so we need to do some extra work to preserve our // bounds. 
@@ -479,7 +479,7 @@ extension _StringGuts { from: subrange.lowerBound, to: subrange.upperBound) let newUTF8Subrange = body(&self) - _internalInvariant(!isKnownUTF16) + _internalInvariant(isUTF8) let newUTF8Count = oldUTF8Count + newUTF8Subrange.count - oldUTF8SubrangeCount diff --git a/stdlib/public/core/StringIndex.swift b/stdlib/public/core/StringIndex.swift index a34b9f4ceb168..dad8b6cfd447d 100644 --- a/stdlib/public/core/StringIndex.swift +++ b/stdlib/public/core/StringIndex.swift @@ -364,14 +364,14 @@ extension String.Index { // this way: position zero is the same no matter how what encoding is used for // the rest of string.) // -// These two bits (along with the isKnownUTF16 flag in StringObject) allow newer -// versions of the Standard Library to more reliably catch runtime errors where -// client code is applying an index from a UTF-16 string to a UTF-8 one, or vice -// versa. This typically happens when indices from a UTF-16 Cocoa string that -// was verbatim bridged into Swift are accidentally applied to a mutated version -// of the same string. (The mutation turns it into a UTF-8 native string, where -// the same numerical offsets might correspond to wildly different logical -// positions.) +// These two bits (along with the isForeignUTF8 flag in StringObject) allow +// newer versions of the Standard Library to more reliably catch runtime errors +// where client code is applying an index from a UTF-16 string to a UTF-8 one, +// or vice versa. This typically happens when indices from a UTF-16 Cocoa string +// that was verbatim bridged into Swift are accidentally applied to a mutated +// version of the same string. (The mutation turns it into a UTF-8 native +// string, where the same numerical offsets might correspond to wildly different +// logical positions.) 
// // Such code has always been broken, as the old indices are documented to be no // longer valid after the mutation; however, in previous releases such cases diff --git a/stdlib/public/core/StringObject.swift b/stdlib/public/core/StringObject.swift index e49cbb4a3bbc2..f03004bbb8dd5 100644 --- a/stdlib/public/core/StringObject.swift +++ b/stdlib/public/core/StringObject.swift @@ -588,7 +588,7 @@ extension _StringObject { ┌──────┬──────┬──────┬──────┬──────┬──────────┬───────────────────────────────┐ │ b63 │ b62 │ b61 │ b60 │ b59 │ b58:48 │ b47:0 │ ├──────┼──────┼──────┼──────┼──────┼──────────┼───────────────────────────────┤ -│ ASCII│ NFC │native│ tail │ UTF16│ reserved │ count │ +│ ASCII│ NFC │native│ tail │ UTF8 │ reserved │ count │ └──────┴──────┴──────┴──────┴──────┴──────────┴───────────────────────────────┘ b63: isASCII. set when all code units are known to be ASCII, enabling: @@ -613,26 +613,17 @@ extension _StringObject { (e.g. literals) - `isTailAllocated` always implies `isFastUTF8` - b59: isKnownUTF16. This bit is set if index positions in the string are known - to be measured in UTF-16 code units, rather than the default UTF-8. - - This is only ever set on UTF-16 foreign strings created in noninlinable - code in stdlib versions >= 5.7. On stdlibs <= 5.6, this bit is always set - to zero. - - Note that while as of 5.7 all foreign strings are UTF-16, this isn't - guaranteed to remain this way -- future versions of the stdlib may - introduce new foreign forms that use a different encoding. (Likely UTF-8.) - - Foreign strings are only created in non-inlinable code, so on stdlib - versions >=5.7, this bit always correctly reflects the correct encoding - for the string's offset values. - - This bit along with the two related bits in String.Index allow us to - opportunistically catch cases where an UTF-16 index is used on an UTF-8 - string (or vice versa), and to provide better error reporting & recovery. 
- As more code gets rebuilt with Swift 5.7+, the stdlib will gradually become - able to reliably catch all such issues. - - It is okay for isASCII strings to not set this flag, even if they are - UTF-16 encoded -- the offsets in that case can work in either encoding. - (This is not currently exercised, as foreign bridged strings never set - the isASCII flag.) + b59: isForeignUTF8. This bit is to be set on future UTF-8 encoded string + variants, i.e. on strings whose index positions are measured in UTF-8 code + units, even though their storage isn't continuous. As of Swift 5.7, we + don't have any such foreign forms, but inlinable index validation methods + need to prepare for the possibility of their introduction, so we need to + assign this bit in preparation. + + If we decide to never introduce such forms, we can stop checking this bit + at any time, but we cannot reuse it for something else -- we need to + preserve its current meaning to keep inlined index validation code + working. b48-58: Reserved for future usage. - Because Swift is ABI stable (on some platforms at least), these bits can @@ -649,7 +640,7 @@ extension _StringObject { performance shortcuts, e.g. to signal the availability of a potential fast path. (However, it is also possible to store information here that allows more reliable detection & handling of runtime errors, like the - `isKnownUTF16` bit above.) + `isForeignUTF8` bit above.) b0-47: count. Stores the number of code units. Corresponds to the position of the `endIndex`. 
@@ -680,7 +671,7 @@ extension _StringObject.CountAndFlags { @_alwaysEmitIntoClient // Swift 5.7 @inline(__always) - internal static var isKnownUTF16Mask: UInt64 { + internal static var isForeignUTF8Mask: UInt64 { return 0x0800_0000_0000_0000 } @@ -722,49 +713,6 @@ extension _StringObject.CountAndFlags { _internalInvariant(isTailAllocated == self.isTailAllocated) } - @inline(__always) - internal init( - count: Int, - isASCII: Bool, - isNFC: Bool, - isNativelyStored: Bool, - isTailAllocated: Bool, - isKnownUTF16: Bool - ) { - var rawBits = UInt64(truncatingIfNeeded: count) - _internalInvariant(rawBits <= _StringObject.CountAndFlags.countMask) - - if isASCII { - _internalInvariant(isNFC) - rawBits |= _StringObject.CountAndFlags.isASCIIMask - } - - if isNFC { - rawBits |= _StringObject.CountAndFlags.isNFCMask - } - - if isNativelyStored { - _internalInvariant(isTailAllocated) - rawBits |= _StringObject.CountAndFlags.isNativelyStoredMask - } - - if isTailAllocated { - rawBits |= _StringObject.CountAndFlags.isTailAllocatedMask - } - - if isKnownUTF16 { - rawBits |= _StringObject.CountAndFlags.isKnownUTF16Mask - } - - self.init(raw: rawBits) - _internalInvariant(count == self.count) - _internalInvariant(isASCII == self.isASCII) - _internalInvariant(isNFC == self.isNFC) - _internalInvariant(isNativelyStored == self.isNativelyStored) - _internalInvariant(isTailAllocated == self.isTailAllocated) - _internalInvariant(isKnownUTF16 == self.isKnownUTF16) - } - @inlinable @inline(__always) internal init(count: Int, flags: UInt16) { // Currently, we only use top 5 flags @@ -798,14 +746,13 @@ extension _StringObject.CountAndFlags { isTailAllocated: true) } @inline(__always) - internal init(sharedCount: Int, isASCII: Bool, isUTF16: Bool) { + internal init(sharedCount: Int, isASCII: Bool) { self.init( count: sharedCount, isASCII: isASCII, isNFC: isASCII, isNativelyStored: false, - isTailAllocated: false, - isKnownUTF16: isUTF16) + isTailAllocated: false) } // @@ -840,17 +787,15 @@ 
extension _StringObject.CountAndFlags { return 0 != _storage & _StringObject.CountAndFlags.isTailAllocatedMask } - /// Returns whether this string is known to use UTF-16 code units. - /// - /// This always returns a value corresponding to the string's actual encoding - /// on stdlib versions >=5.7. + /// Returns whether this string is a foreign form with a UTF-8 storage + /// representation. /// - /// Standard Library versions <=5.6 did not set the corresponding flag, so - /// this property always returns false. + /// As of Swift 5.7, this bit is never set; however, future releases may + /// introduce such forms. @_alwaysEmitIntoClient @inline(__always) // Swift 5.7 - internal var isKnownUTF16: Bool { - return 0 != _storage & _StringObject.CountAndFlags.isKnownUTF16Mask + internal var isForeignUTF8: Bool { + (_storage & Self.isForeignUTF8Mask) != 0 } #if !INTERNAL_CHECKS_ENABLED @@ -864,7 +809,7 @@ extension _StringObject.CountAndFlags { if isNativelyStored { _internalInvariant(isTailAllocated) } - if isKnownUTF16 { + if isForeignUTF8 { _internalInvariant(!isNativelyStored) _internalInvariant(!isTailAllocated) } @@ -1001,11 +946,35 @@ extension _StringObject { return _countAndFlags.isNFC } - @_alwaysEmitIntoClient // Swift 5.7 - @inline(__always) - internal var isKnownUTF16: Bool { - if isSmall { return false } - return _countAndFlags.isKnownUTF16 + /// Returns whether this string has a UTF-8 storage representation. + /// + /// This always returns a value corresponding to the string's actual encoding. + @_alwaysEmitIntoClient + @inline(__always) // Swift 5.7 + internal var isUTF8: Bool { + // This is subtle. It is designed to return the right value in all past & + // future stdlibs. + // + // If `providesFastUTF8` is true, then we know we have an UTF-8 string. + // + // Otherwise we have a foreign string. On Swift <=5.7, foreign strings are + // always UTF-16 encoded, but a future Swift release may introduce UTF-8 + // encoded foreign strings. 
To allow this, we have a dedicated + // `isForeignUTF8` bit that future UTF-8 encoded foreign forms will need to + // set to avoid breaking index validation. + // + // Note that `providesFastUTF8` returns true for small strings, so we don't + // need to check for smallness before accessing the `isForeignUTF8` bit. + providesFastUTF8 || _countAndFlags.isForeignUTF8 + } + + /// Returns whether this string has a UTF-16 storage representation. + /// + /// This always returns a value corresponding to the string's actual encoding. + @_alwaysEmitIntoClient + @inline(__always) // Swift 5.7 + internal var isUTF16: Bool { + !isUTF8 } // Get access to fast UTF-8 contents for large strings which provide it. @@ -1107,8 +1076,7 @@ extension _StringObject { internal init( cocoa: AnyObject, providesFastUTF8: Bool, isASCII: Bool, length: Int ) { - let countAndFlags = CountAndFlags( - sharedCount: length, isASCII: isASCII, isUTF16: !providesFastUTF8) + let countAndFlags = CountAndFlags(sharedCount: length, isASCII: isASCII) let discriminator = Nibbles.largeCocoa(providesFastUTF8: providesFastUTF8) #if arch(i386) || arch(arm) || arch(arm64_32) || arch(wasm32) self.init( diff --git a/stdlib/public/core/StringUnicodeScalarView.swift b/stdlib/public/core/StringUnicodeScalarView.swift index e21280e3298ab..0cb8f32fc14e2 100644 --- a/stdlib/public/core/StringUnicodeScalarView.swift +++ b/stdlib/public/core/StringUnicodeScalarView.swift @@ -504,7 +504,7 @@ extension String.UnicodeScalarView { let len = UTF16.isLeadSurrogate(cu) ? 2 : 1 let r = i.encoded(offsetBy: len)._scalarAligned - return _guts.internalMarkEncoding(r) + return r._knownUTF16 } @usableFromInline @inline(never) @@ -516,6 +516,6 @@ extension String.UnicodeScalarView { let len = UTF16.isTrailSurrogate(cu) ? 
2 : 1 let r = i.encoded(offsetBy: -len)._scalarAligned - return _guts.internalMarkEncoding(r) + return r._knownUTF16 } } diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index 7ece46a5bde7e..023386cf17f88 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -237,7 +237,7 @@ extension Substring: StringProtocol { let i = _roundDownToNearestCharacter(_validateScalarIndex(i)) let r = _uncheckedIndex(after: i) - return _wholeGuts.internalMarkEncoding(r) + return _wholeGuts.markEncoding(r) } /// A version of `index(after:)` that assumes that the given index: @@ -298,7 +298,7 @@ extension Substring: StringProtocol { _precondition(i > startIndex, "Substring index is out of bounds") let r = _uncheckedIndex(before: i) - return _wholeGuts.internalMarkEncoding(r) + return _wholeGuts.markEncoding(r) } /// A version of `index(before:)` that assumes that the given index: @@ -361,7 +361,7 @@ extension Substring: StringProtocol { i = _uncheckedIndex(before: i) } } - return _wholeGuts.internalMarkEncoding(i) + return _wholeGuts.markEncoding(i) } public func index( @@ -401,7 +401,7 @@ extension Substring: StringProtocol { } guard limit > start || i >= limit else { return nil } } - return _wholeGuts.internalMarkEncoding(i) + return _wholeGuts.markEncoding(i) } public func distance(from start: Index, to end: Index) -> Int { From d24ae9dfcda7158ed6bcbd9135ca4c77e2ed42a1 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Wed, 6 Apr 2022 20:26:31 -0700 Subject: [PATCH 52/83] [stdlib] Remove Substring._endIsCharacterAligned Now that the cached character stride in indices always mean the stride in the full string, we can stop looking at whether a substring has a character-aligned end index. 
--- stdlib/public/core/Substring.swift | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index 023386cf17f88..2ca923c126954 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -149,10 +149,6 @@ extension Substring { internal var _startIsCharacterAligned: Bool { startIndex._isCharacterAligned } - - internal var _endIsCharacterAligned: Bool { - endIndex._isCharacterAligned - } } extension Substring { @@ -322,20 +318,16 @@ extension Substring: StringProtocol { _internalInvariant(priorOffset >= startIndex._encodedOffset) var r = Index( - encodedOffset: priorOffset, characterStride: priorStride)._scalarAligned - - if - // Don't set the `_isCharacterAligned` bit in indices of exotic substrings - // whose startIndex isn't aligned on a grapheme cluster boundary. (Their - // grapheme breaks may not match with those in `base`.) - _startIsCharacterAligned, - // Likewise if this is the last character in a substring ending on a - // partial grapheme cluster. - _endIsCharacterAligned || i < endIndex - { + encodedOffset: priorOffset, characterStride: priorStride) + + // Don't set the `_isCharacterAligned` bit in indices of exotic substrings + // whose startIndex isn't aligned on a grapheme cluster boundary. (Their + // grapheme breaks may not match with those in `base`.) 
+ if _startIsCharacterAligned { r = r._characterAligned + } else { + r = r._scalarAligned } - return r } From b06e6e5dd31a386344680a79564c3a85ec2c7f3a Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Wed, 6 Apr 2022 23:15:28 -0700 Subject: [PATCH 53/83] [stdlib] String: Fix major perf regression due to extra arc traffic --- stdlib/public/core/StringGuts.swift | 1 + 1 file changed, 1 insertion(+) diff --git a/stdlib/public/core/StringGuts.swift b/stdlib/public/core/StringGuts.swift index 331aa6ad59731..c3cd4f24db57f 100644 --- a/stdlib/public/core/StringGuts.swift +++ b/stdlib/public/core/StringGuts.swift @@ -358,6 +358,7 @@ extension _StringGuts { @_alwaysEmitIntoClient @inline(never) + @_effects(releasenone) internal func _slowEnsureMatchingEncoding(_ i: String.Index) -> String.Index { guard isUTF8 else { // Attempt to use an UTF-8 index on a UTF-16 string. Strings don't usually From eadef7a204f7b82d8cbeab9d3024840c1852f0f2 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Sat, 9 Apr 2022 15:59:21 -0700 Subject: [PATCH 54/83] [stdlib] String.Index: Use symbolic names rather than magic constants --- stdlib/public/core/StringIndex.swift | 74 +++++++++++++++++----------- 1 file changed, 45 insertions(+), 29 deletions(-) diff --git a/stdlib/public/core/StringIndex.swift b/stdlib/public/core/StringIndex.swift index dad8b6cfd447d..d1c2d36e748ac 100644 --- a/stdlib/public/core/StringIndex.swift +++ b/stdlib/public/core/StringIndex.swift @@ -248,6 +248,26 @@ extension String.Index { } } +extension String.Index { + @_alwaysEmitIntoClient @inline(__always) // Swift 5.7 + internal static var __scalarAlignmentBit: UInt64 { 0x1 } + + @_alwaysEmitIntoClient @inline(__always) // Swift 5.7 + internal static var __characterAlignmentBit: UInt64 { 0x2 } + + @_alwaysEmitIntoClient @inline(__always) // Swift 5.7 + internal static var __utf8Bit: UInt64 { 0x4 } + + @_alwaysEmitIntoClient @inline(__always) // Swift 5.7 + internal static var __utf16Bit: UInt64 { 0x8 } + + 
@_alwaysEmitIntoClient @inline(__always) // Swift 5.7 + internal static func __encodingBit(utf16: Bool) -> UInt64 { + let utf16 = Int8(Builtin.zext_Int1_Int8(utf16._value)) + return __utf8Bit &<< utf16 + } +} + /* Index Scalar Alignment @@ -289,13 +309,15 @@ extension String.Index { extension String.Index { @_alwaysEmitIntoClient // Swift 5.1 @inline(__always) - internal var _isScalarAligned: Bool { return 0 != _rawBits & 0x1 } + internal var _isScalarAligned: Bool { + 0 != _rawBits & Self.__scalarAlignmentBit + } @_alwaysEmitIntoClient // Swift 5.1 @inline(__always) internal var _scalarAligned: String.Index { var idx = self - idx._rawBits |= 0x1 + idx._rawBits |= Self.__scalarAlignmentBit idx._invariantCheck() return idx } @@ -325,7 +347,9 @@ extension String.Index { extension String.Index { @_alwaysEmitIntoClient // Swift 5.7 @inline(__always) - internal var _isCharacterAligned: Bool { return 0 != _rawBits & 0x2 } + internal var _isCharacterAligned: Bool { + 0 != _rawBits & Self.__characterAlignmentBit + } /// Return the same index with both the scalar- and `Character`-aligned bits /// set. @@ -334,7 +358,8 @@ extension String.Index { @_alwaysEmitIntoClient // Swift 5.7 @inline(__always) internal var _characterAligned: String.Index { - let idx = Self(_rawBits | 0x3) + let r = _rawBits | Self.__characterAlignmentBit | Self.__scalarAlignmentBit + let idx = Self(r) idx._invariantCheck() return idx } @@ -385,6 +410,12 @@ extension String.Index { // handled in `_StringGuts.ensureMatchingEncoding(_:)`; see there for the sordid // details. extension String.Index { + @_alwaysEmitIntoClient // Swift 5.7 + @inline(__always) + internal var _encodingBits: UInt64 { + _rawBits & (Self.__utf8Bit | Self.__utf16Bit) + } + /// Returns true if the position in this index can be interpreted as an offset /// into UTF-8-encoded string storage. 
/// @@ -394,7 +425,7 @@ extension String.Index { @inline(__always) internal var _canBeUTF8: Bool { // The only way an index cannot be UTF-8 is it has only the UTF-16 flag set. - _rawBits & 0xC != 0x08 + _encodingBits != Self.__utf16Bit } /// Returns true if the position in this index can be interpreted as offset @@ -407,45 +438,30 @@ extension String.Index { @inline(__always) internal var _canBeUTF16: Bool { // The only way an index cannot be UTF-16 is it has only the UTF-8 flag set. - _rawBits & 0xC != 0x04 + _encodingBits != Self.__utf8Bit } /// Returns the same index with the UTF-8 bit set. @_alwaysEmitIntoClient // Swift 5.7 @inline(__always) - internal var _knownUTF8: Self { Self(_rawBits | 0x4) } + internal var _knownUTF8: Self { Self(_rawBits | Self.__utf8Bit) } /// Returns the same index with the UTF-16 bit set. @_alwaysEmitIntoClient // Swift 5.7 @inline(__always) - internal var _knownUTF16: Self { Self(_rawBits | 0x8) } + internal var _knownUTF16: Self { Self(_rawBits | Self.__utf16Bit) } /// Returns the same index with both UTF-8 & UTF-16 bits set. @_alwaysEmitIntoClient // Swift 5.7 @inline(__always) - internal var _encodingIndependent: Self { Self(_rawBits | 0xC) } - - /// Returns true if the UTF-8 flag is set. - /// - /// This is for debugging purposes only. Do not use this property to determine - /// whether an index is compatible with UTF-8 storage; instead, use - /// `_canBeUTF8`. - @_alwaysEmitIntoClient // Swift 5.7 - @inline(__always) - internal var __isUTF8: Bool { _rawBits & 0x4 != 0 } - - /// Returns true if the UTF-16 flag is set. - /// - /// This is for debugging purposes only. Do not use this property to determine - /// whether an index is compatible with UTF-16 storage; instead, use - /// `_canBeUTF16`. 
- @_alwaysEmitIntoClient // Swift 5.7 - @inline(__always) - internal var __isUTF16: Bool { _rawBits & 0x8 != 0 } + internal var _encodingIndependent: Self { + Self(_rawBits | Self.__utf8Bit | Self.__utf16Bit) + } @_alwaysEmitIntoClient // Swift 5.7 internal func _copyEncoding(from index: Self) -> Self { - Self((_rawBits & ~0xC) | (index._rawBits & 0xC)) + let mask = Self.__utf8Bit | Self.__utf16Bit + return Self((_rawBits & ~mask) | (index._rawBits & mask)) } } @@ -487,7 +503,7 @@ extension String.Index { } d += ", encoding: " - switch (__isUTF8, __isUTF16) { + switch (_rawBits & Self.__utf8Bit != 0, _rawBits & Self.__utf16Bit != 0) { case (false, false): d += "unknown" case (true, false): d += "utf8" case (false, true): d += "utf16" From d18b5f573f204e425db13ea376ddca4b533e5d33 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Sat, 9 Apr 2022 16:03:54 -0700 Subject: [PATCH 55/83] [stdlib] Branchless _StringGuts.hasMatchingEncoding --- stdlib/public/core/StringGuts.swift | 2 +- stdlib/public/core/StringIndex.swift | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/stdlib/public/core/StringGuts.swift b/stdlib/public/core/StringGuts.swift index c3cd4f24db57f..c1bf3a1480a7d 100644 --- a/stdlib/public/core/StringGuts.swift +++ b/stdlib/public/core/StringGuts.swift @@ -340,7 +340,7 @@ extension _StringGuts { /// correct value. @_alwaysEmitIntoClient @inline(__always) internal func hasMatchingEncoding(_ i: String.Index) -> Bool { - isUTF8 ? i._canBeUTF8 : i._canBeUTF16 + i._hasMatchingEncoding(isUTF8: isUTF8) } /// Return an index whose encoding can be assumed to match that of `self`. 
diff --git a/stdlib/public/core/StringIndex.swift b/stdlib/public/core/StringIndex.swift index d1c2d36e748ac..cf845945b4d93 100644 --- a/stdlib/public/core/StringIndex.swift +++ b/stdlib/public/core/StringIndex.swift @@ -441,6 +441,20 @@ extension String.Index { _encodingBits != Self.__utf8Bit } + /// Returns true if the encoding of this index isn't known to be in conflict + /// with the specified encoding. + /// + /// If the index was created by code that was built on a stdlib below 5.7, + /// then this check may incorrectly return true on a mismatching index, but it + /// is guaranteed to never incorrectly return false. If all loaded binaries + /// were built in 5.7+, then this method is guaranteed to always return the + /// correct value. + @_alwaysEmitIntoClient // Swift 5.7 + @inline(__always) + internal func _hasMatchingEncoding(isUTF8 utf8: Bool) -> Bool { + _encodingBits != Self.__encodingBit(utf16: utf8) + } + /// Returns the same index with the UTF-8 bit set. @_alwaysEmitIntoClient // Swift 5.7 @inline(__always) From 3c9968945ebcd45264a0db5bda5010196f5c061f Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Sat, 9 Apr 2022 21:33:41 -0700 Subject: [PATCH 56/83] [stdlib] String: Implement happy paths for index validation --- stdlib/public/core/CMakeLists.txt | 1 + stdlib/public/core/GroupInfo.json | 1 + stdlib/public/core/StringCharacterView.swift | 17 +- stdlib/public/core/StringGuts.swift | 180 --------- stdlib/public/core/StringIndex.swift | 12 + .../public/core/StringIndexValidation.swift | 344 ++++++++++++++++++ stdlib/public/core/Substring.swift | 65 ++-- 7 files changed, 389 insertions(+), 231 deletions(-) create mode 100644 stdlib/public/core/StringIndexValidation.swift diff --git a/stdlib/public/core/CMakeLists.txt b/stdlib/public/core/CMakeLists.txt index 8cad9b1013e17..401eab5d8aa53 100644 --- a/stdlib/public/core/CMakeLists.txt +++ b/stdlib/public/core/CMakeLists.txt @@ -157,6 +157,7 @@ set(SWIFTLIB_ESSENTIAL StringProtocol.swift 
StringIndex.swift StringIndexConversions.swift + StringIndexValidation.swift StringInterpolation.swift StringLegacy.swift StringNormalization.swift diff --git a/stdlib/public/core/GroupInfo.json b/stdlib/public/core/GroupInfo.json index 7a4b57089c302..cd67f67f00798 100644 --- a/stdlib/public/core/GroupInfo.json +++ b/stdlib/public/core/GroupInfo.json @@ -26,6 +26,7 @@ "StringHashable.swift", "StringIndex.swift", "StringIndexConversions.swift", + "StringIndexValidation.swift", "StringInterpolation.swift", "StringLegacy.swift", "StringNormalization.swift", diff --git a/stdlib/public/core/StringCharacterView.swift b/stdlib/public/core/StringCharacterView.swift index 1b431d32883b7..007660c97c65c 100644 --- a/stdlib/public/core/StringCharacterView.swift +++ b/stdlib/public/core/StringCharacterView.swift @@ -54,7 +54,7 @@ extension String: BidirectionalCollection { /// `endIndex`. /// - Returns: The index value immediately after `i`. public func index(after i: Index) -> Index { - let i = _guts.roundDownToNearestCharacter(_guts.validateScalarIndex(i)) + let i = _guts.validateCharacterIndex(i) return _uncheckedIndex(after: i) } @@ -83,8 +83,7 @@ extension String: BidirectionalCollection { /// `startIndex`. /// - Returns: The index value immediately before `i`. public func index(before i: Index) -> Index { - let i = _guts.roundDownToNearestCharacter( - _guts.validateInclusiveScalarIndex(i)) + let i = _guts.validateInclusiveCharacterIndex(i) // Note: Aligning an index may move it closer towards the `startIndex`, so // the `i > startIndex` check needs to come after rounding. _precondition(i > startIndex, "String index is out of bounds") @@ -138,8 +137,7 @@ extension String: BidirectionalCollection { // TODO: known-ASCII and single-scalar-grapheme fast path, etc. 
- var i = _guts.roundDownToNearestCharacter( - _guts.validateInclusiveScalarIndex(i)) + var i = _guts.validateInclusiveCharacterIndex(i) if distance >= 0 { for _ in stride(from: 0, to: distance, by: 1) { @@ -214,8 +212,7 @@ extension String: BidirectionalCollection { let limit = _guts.ensureMatchingEncoding(limit) let start = _guts.ensureMatchingEncoding(i) - var i = _guts.roundDownToNearestCharacter( - _guts.validateInclusiveScalarIndex(i)) + var i = _guts.validateInclusiveCharacterIndex(i) if distance >= 0 { for _ in stride(from: 0, to: distance, by: 1) { @@ -253,10 +250,8 @@ extension String: BidirectionalCollection { // grapheme breaks -- swapping `start` and `end` may change the magnitude of // the result. - let start = _guts.roundDownToNearestCharacter( - _guts.validateInclusiveScalarIndex(start)) - let end = _guts.roundDownToNearestCharacter( - _guts.validateInclusiveScalarIndex(end)) + let start = _guts.validateInclusiveCharacterIndex(start) + let end = _guts.validateInclusiveCharacterIndex(end) // TODO: known-ASCII and single-scalar-grapheme fast path, etc. diff --git a/stdlib/public/core/StringGuts.swift b/stdlib/public/core/StringGuts.swift index c1bf3a1480a7d..7ed4ef8691f89 100644 --- a/stdlib/public/core/StringGuts.swift +++ b/stdlib/public/core/StringGuts.swift @@ -400,186 +400,6 @@ extension _StringGuts { } } -// Index validation -extension _StringGuts { - /// Validate `i` and adjust its position toward the start, returning the - /// resulting index or trapping as appropriate. If this function returns, then - /// the returned value - /// - /// - has an encoding that matches this string, - /// - is within the bounds of this string, and - /// - is aligned on a scalar boundary. 
- @_alwaysEmitIntoClient - internal func validateScalarIndex(_ i: String.Index) -> String.Index { - let i = ensureMatchingEncoding(i) - _precondition(i._encodedOffset < count, "String index is out of bounds") - return scalarAlign(i) - } - - /// Validate `i` and adjust its position toward the start, returning the - /// resulting index or trapping as appropriate. If this function returns, then - /// the returned value - /// - /// - has an encoding that matches this string, - /// - is within `start ..< end`, and - /// - is aligned on a scalar boundary. - @_alwaysEmitIntoClient - internal func validateScalarIndex( - _ i: String.Index, - in bounds: Range - ) -> String.Index { - _internalInvariant(bounds.upperBound <= endIndex) - - let i = ensureMatchingEncoding(i) - _precondition(i >= bounds.lowerBound && i < bounds.upperBound, - "Substring index is out of bounds") - return scalarAlign(i) - } -} - -extension _StringGuts { - /// Validate `i` and adjust its position toward the start, returning the - /// resulting index or trapping as appropriate. If this function returns, then - /// the returned value - /// - /// - has an encoding that matches this string, - /// - is within the bounds of this string (including the `endIndex`), and - /// - is aligned on a scalar boundary. - @_alwaysEmitIntoClient - internal func validateInclusiveScalarIndex( - _ i: String.Index - ) -> String.Index { - let i = ensureMatchingEncoding(i) - _precondition(i._encodedOffset <= count, "String index is out of bounds") - return scalarAlign(i) - } - - /// Validate `i` and adjust its position toward the start, returning the - /// resulting index or trapping as appropriate. If this function returns, then - /// the returned value - /// - /// - has an encoding that matches this string, - /// - is within the bounds of this string (including the `endIndex`), and - /// - is aligned on a scalar boundary. 
- internal func validateInclusiveScalarIndex( - _ i: String.Index, - in bounds: Range - ) -> String.Index { - _internalInvariant(bounds.upperBound <= endIndex) - - let i = ensureMatchingEncoding(i) - _precondition(i >= bounds.lowerBound && i <= bounds.upperBound, - "Substring index is out of bounds") - return scalarAlign(i) - } -} - -extension _StringGuts { - @_alwaysEmitIntoClient - internal func validateSubscalarRange( - _ range: Range - ) -> Range { - let upper = ensureMatchingEncoding(range.upperBound) - let lower = ensureMatchingEncoding(range.lowerBound) - - // Note: if only `lower` was miscoded, then the range invariant `lower <= - // upper` may no longer hold after the above conversions, so we need to - // re-check it here. - _precondition(upper._encodedOffset <= count && lower <= upper, - "String index range is out of bounds") - - return Range(_uncheckedBounds: (lower, upper)) - } - - @_alwaysEmitIntoClient - internal func validateSubscalarRange( - _ range: Range, - in bounds: Range - ) -> Range { - _internalInvariant(bounds.upperBound <= endIndex) - - let upper = ensureMatchingEncoding(range.upperBound) - let lower = ensureMatchingEncoding(range.lowerBound) - - // Note: if only `lower` was miscoded, then the range invariant `lower <= - // upper` may no longer hold after the above conversions, so we need to - // re-check it here. - _precondition( - upper <= bounds.upperBound - && lower >= bounds.lowerBound - && lower <= upper, - "Substring index range is out of bounds") - - return Range(_uncheckedBounds: (lower, upper)) - } -} - -extension _StringGuts { - /// Validate `range` and adjust the position of its bounds, returning the - /// resulting range or trapping as appropriate. If this function returns, then - /// the bounds of the returned value - /// - /// - have an encoding that matches this string, - /// - are within the bounds of this string, and - /// - are aligned on a scalar boundary. 
- internal func validateScalarRange( - _ range: Range - ) -> Range { - var upper = ensureMatchingEncoding(range.upperBound) - var lower = ensureMatchingEncoding(range.lowerBound) - - // Note: if only `lower` was miscoded, then the range invariant `lower <= - // upper` may no longer hold after the above conversions, so we need to - // re-check it here. - _precondition(upper._encodedOffset <= count && lower <= upper, - "String index range is out of bounds") - - upper = scalarAlign(upper) - lower = scalarAlign(lower) - - // Older binaries may generate `startIndex` without the - // `_isCharacterAligned` flag. Compensate for that here so that substrings - // that start at the beginning will never get the sad path in - // `index(after:)`. Note that we don't need to do this for `upper` and we - // don't need to compare against the `endIndex` -- those aren't nearly as - // critical. - if lower._encodedOffset == 0 { lower = lower._characterAligned } - - return Range(_uncheckedBounds: (lower, upper)) - } - - /// Validate `range` and adjust the position of its bounds, returning the - /// resulting range or trapping as appropriate. If this function returns, then - /// the bounds of the returned value - /// - /// - have an encoding that matches this string, - /// - are within `start ..< end`, and - /// - are aligned on a scalar boundary. - internal func validateScalarRange( - _ range: Range, - in bounds: Range - ) -> Range { - _internalInvariant(bounds.upperBound <= endIndex) - - var upper = ensureMatchingEncoding(range.upperBound) - var lower = ensureMatchingEncoding(range.lowerBound) - - // Note: if only `lower` was miscoded, then the range invariant `lower <= - // upper` may no longer hold after the above conversions, so we need to - // re-check it here. 
- _precondition( - upper <= bounds.upperBound - && lower >= bounds.lowerBound - && lower <= upper, - "Substring index range is out of bounds") - - upper = scalarAlign(upper) - lower = scalarAlign(lower) - - return Range(_uncheckedBounds: (lower, upper)) - } -} - // Old SPI(corelibs-foundation) extension _StringGuts { @available(*, deprecated) diff --git a/stdlib/public/core/StringIndex.swift b/stdlib/public/core/StringIndex.swift index cf845945b4d93..048c558854320 100644 --- a/stdlib/public/core/StringIndex.swift +++ b/stdlib/public/core/StringIndex.swift @@ -479,6 +479,18 @@ extension String.Index { } } +extension String.Index { + @_alwaysEmitIntoClient @inline(__always) // Swift 5.7 + internal var _isUTF8CharacterIndex: Bool { + _canBeUTF8 && _isCharacterAligned + } + + @_alwaysEmitIntoClient @inline(__always) // Swift 5.7 + internal var _isUTF8ScalarIndex: Bool { + _canBeUTF8 && _isScalarAligned + } +} + extension String.Index: Equatable { @inlinable @inline(__always) public static func == (lhs: String.Index, rhs: String.Index) -> Bool { diff --git a/stdlib/public/core/StringIndexValidation.swift b/stdlib/public/core/StringIndexValidation.swift new file mode 100644 index 0000000000000..08713a860fa7e --- /dev/null +++ b/stdlib/public/core/StringIndexValidation.swift @@ -0,0 +1,344 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2014 - 2022 Apple Inc. 
and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors +// +//===----------------------------------------------------------------------===// + +// Index validation +extension _StringGuts { + @_alwaysEmitIntoClient @inline(__always) + internal func isFastScalarIndex(_ i: String.Index) -> Bool { + hasMatchingEncoding(i) && i._isScalarAligned + } + + @_alwaysEmitIntoClient @inline(__always) + internal func isFastCharacterIndex(_ i: String.Index) -> Bool { + hasMatchingEncoding(i) && i._isCharacterAligned + } +} + +// Subscalar index validation (UTF-8 & UTF-16 views) +extension _StringGuts { + @_alwaysEmitIntoClient + internal func validateSubscalarIndex(_ i: String.Index) -> String.Index { + let i = ensureMatchingEncoding(i) + _precondition(i._encodedOffset < count, "String index is out of bounds") + return i + } + + @_alwaysEmitIntoClient + internal func validateSubscalarIndex( + _ i: String.Index, + in bounds: Range + ) -> String.Index { + _internalInvariant(bounds.upperBound <= endIndex) + + let i = ensureMatchingEncoding(i) + _precondition(i >= bounds.lowerBound && i < bounds.upperBound, + "Substring index is out of bounds") + return i + } + + @_alwaysEmitIntoClient + internal func validateInclusiveSubscalarIndex( + _ i: String.Index + ) -> String.Index { + let i = ensureMatchingEncoding(i) + _precondition(i._encodedOffset <= count, "String index is out of bounds") + return i + } + + internal func validateInclusiveSubscalarIndex( + _ i: String.Index, + in bounds: Range + ) -> String.Index { + _internalInvariant(bounds.upperBound <= endIndex) + + let i = ensureMatchingEncoding(i) + _precondition(i >= bounds.lowerBound && i <= bounds.upperBound, + "Substring index is out of bounds") + return i + } + + @_alwaysEmitIntoClient + internal func validateSubscalarRange( + _ range: 
Range + ) -> Range { + let upper = ensureMatchingEncoding(range.upperBound) + let lower = ensureMatchingEncoding(range.lowerBound) + + // Note: if only `lower` was miscoded, then the range invariant `lower <= + // upper` may no longer hold after the above conversions, so we need to + // re-check it here. + _precondition(upper <= endIndex && lower <= upper, + "String index range is out of bounds") + + return Range(_uncheckedBounds: (lower, upper)) + } + + @_alwaysEmitIntoClient + internal func validateSubscalarRange( + _ range: Range, + in bounds: Range + ) -> Range { + _internalInvariant(bounds.upperBound <= endIndex) + + let upper = ensureMatchingEncoding(range.upperBound) + let lower = ensureMatchingEncoding(range.lowerBound) + + // Note: if only `lower` was miscoded, then the range invariant `lower <= + // upper` may no longer hold after the above conversions, so we need to + // re-check it here. + _precondition( + lower >= bounds.lowerBound + && lower <= upper + && upper <= bounds.upperBound, + "Substring index range is out of bounds") + + return Range(_uncheckedBounds: (lower, upper)) + } +} + +// Scalar index validation (Unicode scalar views) +extension _StringGuts { + /// Validate `i` and adjust its position toward the start, returning the + /// resulting index or trapping as appropriate. If this function returns, then + /// the returned value + /// + /// - has an encoding that matches this string, + /// - is within the bounds of this string, and + /// - is aligned on a scalar boundary. + @_alwaysEmitIntoClient + internal func validateScalarIndex(_ i: String.Index) -> String.Index { + if isFastScalarIndex(i) { + _precondition(i._encodedOffset < count, "String index is out of bounds") + return i + } + + return scalarAlign(validateSubscalarIndex(i)) + } + + /// Validate `i` and adjust its position toward the start, returning the + /// resulting index or trapping as appropriate. 
If this function returns, then + /// the returned value + /// + /// - has an encoding that matches this string, + /// - is within `start ..< end`, and + /// - is aligned on a scalar boundary. + @_alwaysEmitIntoClient + internal func validateScalarIndex( + _ i: String.Index, + in bounds: Range + ) -> String.Index { + _internalInvariant(bounds.upperBound <= endIndex) + + if isFastScalarIndex(i) { + _precondition(i >= bounds.lowerBound && i < bounds.upperBound, + "Substring index is out of bounds") + return i + } + + return scalarAlign(validateSubscalarIndex(i, in: bounds)) + } +} + +extension _StringGuts { + /// Validate `i` and adjust its position toward the start, returning the + /// resulting index or trapping as appropriate. If this function returns, then + /// the returned value + /// + /// - has an encoding that matches this string, + /// - is within the bounds of this string (including the `endIndex`), and + /// - is aligned on a scalar boundary. + @_alwaysEmitIntoClient + internal func validateInclusiveScalarIndex( + _ i: String.Index + ) -> String.Index { + if isFastScalarIndex(i) { + _precondition(i._encodedOffset <= count, "String index is out of bounds") + return i + } + + return scalarAlign(validateInclusiveSubscalarIndex(i)) + } + + /// Validate `i` and adjust its position toward the start, returning the + /// resulting index or trapping as appropriate. If this function returns, then + /// the returned value + /// + /// - has an encoding that matches this string, + /// - is within the bounds of this string (including the `endIndex`), and + /// - is aligned on a scalar boundary. 
+ internal func validateInclusiveScalarIndex( + _ i: String.Index, + in bounds: Range + ) -> String.Index { + _internalInvariant(bounds.upperBound <= endIndex) + + if isFastScalarIndex(i) { + _precondition(i >= bounds.lowerBound && i <= bounds.upperBound, + "Substring index is out of bounds") + return i + } + + return scalarAlign(validateInclusiveSubscalarIndex(i, in: bounds)) + } +} + +extension _StringGuts { + /// Validate `range` and adjust the position of its bounds, returning the + /// resulting range or trapping as appropriate. If this function returns, then + /// the bounds of the returned value + /// + /// - have an encoding that matches this string, + /// - are within the bounds of this string, and + /// - are aligned on a scalar boundary. + internal func validateScalarRange( + _ range: Range + ) -> Range { + if + isFastScalarIndex(range.lowerBound), isFastScalarIndex(range.upperBound) + { + _precondition(range.upperBound._encodedOffset <= count, + "String index range is out of bounds") + return range + } + + let r = validateSubscalarRange(range) + return Range( + _uncheckedBounds: (scalarAlign(r.lowerBound), scalarAlign(r.upperBound))) + } + + /// Validate `range` and adjust the position of its bounds, returning the + /// resulting range or trapping as appropriate. If this function returns, then + /// the bounds of the returned value + /// + /// - have an encoding that matches this string, + /// - are within `start ..< end`, and + /// - are aligned on a scalar boundary. 
+ internal func validateScalarRange( + _ range: Range, + in bounds: Range + ) -> Range { + _internalInvariant(bounds.upperBound <= endIndex) + + if + isFastScalarIndex(range.lowerBound), isFastScalarIndex(range.upperBound) + { + _precondition( + range.lowerBound >= bounds.lowerBound + && range.upperBound <= bounds.upperBound, + "String index range is out of bounds") + return range + } + + let r = validateSubscalarRange(range, in: bounds) + let upper = scalarAlign(r.upperBound) + let lower = scalarAlign(r.lowerBound) + return Range(_uncheckedBounds: (lower, upper)) + } +} + +// Character index validation (String & Substring) +extension _StringGuts { + internal func validateCharacterIndex(_ i: String.Index) -> String.Index { + if isFastCharacterIndex(i) { + _precondition(i._encodedOffset < count, "String index is out of bounds") + return i + } + return roundDownToNearestCharacter(scalarAlign(validateSubscalarIndex(i))) + } + + internal func validateCharacterIndex( + _ i: String.Index, + in bounds: Range + ) -> String.Index { + _internalInvariant(bounds.upperBound <= endIndex) + + if isFastCharacterIndex(i) { + _precondition(i >= bounds.lowerBound && i < bounds.upperBound, + "Substring index is out of bounds") + return i + } + + return roundDownToNearestCharacter( + scalarAlign(validateSubscalarIndex(i, in: bounds)), + in: bounds) + } + + internal func validateInclusiveCharacterIndex( + _ i: String.Index + ) -> String.Index { + if isFastCharacterIndex(i) { + _precondition(i._encodedOffset <= count, "String index is out of bounds") + return i + } + + return roundDownToNearestCharacter( + scalarAlign(validateInclusiveSubscalarIndex(i))) + } + + internal func validateInclusiveCharacterIndex( + _ i: String.Index, + in bounds: Range + ) -> String.Index { + _internalInvariant(bounds.upperBound <= endIndex) + + if isFastCharacterIndex(i) { + _precondition(i >= bounds.lowerBound && i <= bounds.upperBound, + "Substring index is out of bounds") + return i + } + + return 
roundDownToNearestCharacter( + scalarAlign(validateInclusiveSubscalarIndex(i, in: bounds)), + in: bounds) + } + + internal func validateCharacterRange( + _ range: Range + ) -> Range { + if + isFastCharacterIndex(range.lowerBound), + isFastCharacterIndex(range.upperBound) + { + _precondition(range.upperBound._encodedOffset <= count, + "String index range is out of bounds") + return range + } + + let r = validateSubscalarRange(range) + let l = roundDownToNearestCharacter(scalarAlign(r.lowerBound)) + let u = roundDownToNearestCharacter(scalarAlign(r.upperBound)) + return Range(_uncheckedBounds: (l, u)) + } + + internal func validateCharacterRange( + _ range: Range, + in bounds: Range + ) -> Range { + _internalInvariant(bounds.upperBound <= endIndex) + + if + isFastCharacterIndex(range.lowerBound), + isFastCharacterIndex(range.upperBound) + { + _precondition( + range.lowerBound >= bounds.lowerBound + && range.upperBound <= bounds.upperBound, + "String index range is out of bounds") + return range + } + + let r = validateSubscalarRange(range, in: bounds) + let l = roundDownToNearestCharacter(scalarAlign(r.lowerBound), in: bounds) + let u = roundDownToNearestCharacter(scalarAlign(r.upperBound), in: bounds) + return Range(_uncheckedBounds: (l, u)) + } +} diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index 2ca923c126954..a21b43bada6e4 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -169,32 +169,6 @@ extension Substring { } extension Substring { - @inline(__always) - internal func _validateScalarIndex(_ i: String.Index) -> String.Index { - _wholeGuts.validateScalarIndex(i, in: _bounds) - } - - @inline(__always) - internal func _validateInclusiveScalarIndex( - _ i: String.Index - ) -> String.Index { - _wholeGuts.validateInclusiveScalarIndex(i, in: _bounds) - } - - @inline(__always) - internal func _validateScalarRange( - _ range: Range - ) -> Range { - _wholeGuts.validateScalarRange(range, in: 
_bounds) - } - - @inline(__always) - internal func _roundDownToNearestCharacter( - _ i: String.Index - ) -> String.Index { - _wholeGuts.roundDownToNearestCharacter(i, in: _bounds) - } - /// Return true if and only if `i` is a valid index in this substring, /// that is to say, it exactly addresses one of the `Character`s in it. /// @@ -209,7 +183,9 @@ extension Substring { else { return false } - return i == _roundDownToNearestCharacter(i._scalarAligned) + let c = _wholeGuts.roundDownToNearestCharacter( + i._scalarAligned, in: _bounds) + return i == c } } @@ -231,7 +207,7 @@ extension Substring: StringProtocol { // leads to Collection conformance issues when the `Substring`'s bounds do // not fall on grapheme boundaries in `base`. - let i = _roundDownToNearestCharacter(_validateScalarIndex(i)) + let i = _wholeGuts.validateCharacterIndex(i, in: _bounds) let r = _uncheckedIndex(after: i) return _wholeGuts.markEncoding(r) } @@ -287,7 +263,7 @@ extension Substring: StringProtocol { // leads to Collection conformance issues when the `Substring`'s bounds do // not fall on grapheme boundaries in `base`. - let i = _roundDownToNearestCharacter(_validateInclusiveScalarIndex(i)) + let i = _wholeGuts.validateInclusiveCharacterIndex(i, in: _bounds) // Note: Aligning an index may move it closer towards the `startIndex`, so // this `i > startIndex` check needs to come after all the // alignment/validation work. @@ -340,8 +316,7 @@ extension Substring: StringProtocol { // `Substring`'s bounds do not fall on grapheme boundaries in `base`. // TODO: known-ASCII and single-scalar-grapheme fast path, etc. 
- var i = _roundDownToNearestCharacter( - _validateInclusiveScalarIndex(i)) + var i = _wholeGuts.validateInclusiveCharacterIndex(i, in: _bounds) if distance >= 0 { for _ in stride(from: 0, to: distance, by: 1) { _precondition(i < endIndex, "String index is out of bounds") @@ -377,7 +352,7 @@ extension Substring: StringProtocol { let limit = _wholeGuts.ensureMatchingEncoding(limit) let start = _wholeGuts.ensureMatchingEncoding(i) - var i = _roundDownToNearestCharacter(_validateInclusiveScalarIndex(i)) + var i = _wholeGuts.validateInclusiveCharacterIndex(i, in: _bounds) if distance >= 0 { for _ in stride(from: 0, to: distance, by: 1) { guard limit < start || i < limit else { return nil } @@ -409,10 +384,8 @@ extension Substring: StringProtocol { // grapheme breaks -- swapping `start` and `end` may change the magnitude of // the result. - let start = _roundDownToNearestCharacter( - _validateInclusiveScalarIndex(start)) - let end = _roundDownToNearestCharacter( - _validateInclusiveScalarIndex(end)) + let start = _wholeGuts.validateInclusiveCharacterIndex(start, in: _bounds) + let end = _wholeGuts.validateInclusiveCharacterIndex(end, in: _bounds) // TODO: known-ASCII and single-scalar-grapheme fast path, etc. @@ -440,7 +413,7 @@ extension Substring: StringProtocol { public subscript(i: Index) -> Character { // Note: SE-0180 requires us not to round `i` down to the nearest whole // `Character` boundary. - let i = _validateScalarIndex(i) + let i = _wholeGuts.validateScalarIndex(i, in: _bounds) let stride = _characterStride(startingAt: i) // Don't let the subscript return data outside this substring. let endOffset = Swift.min( @@ -469,7 +442,7 @@ extension Substring: StringProtocol { // Note: SE-0180 requires us to use `subrange` bounds even if they aren't // `Character` aligned. (We still have to round things down to the nearest // scalar boundary, though, or we may generate ill-formed encodings.) 
- let subrange = _validateScalarRange(subrange) + let subrange = _wholeGuts.validateScalarRange(subrange, in: _bounds) // Replacing the range is easy -- we can just reuse `String`'s // implementation. However, we must also update `startIndex` and `endIndex` @@ -1258,7 +1231,19 @@ extension Substring: ExpressibleByStringLiteral { extension String { @available(swift, introduced: 4) public subscript(r: Range) -> Substring { - let r = _guts.validateScalarRange(r) + var r = _guts.validateScalarRange(r) + + // Older binaries may generate `startIndex` without the + // `_isCharacterAligned` flag. Compensate for that here so that substrings + // that start at the beginning will never get the sad path in + // `index(after:)`. Note that we don't need to do this for `upperBound` and + // we don't need to compare against the `endIndex` -- those aren't nearly as + // critical. + if r.lowerBound._encodedOffset == 0 { + r = Range(_uncheckedBounds: + (r.lowerBound._characterAligned, r.upperBound)) + } + return Substring(_unchecked: Slice(base: self, bounds: r)) } } @@ -1266,7 +1251,7 @@ extension String { extension Substring { @available(swift, introduced: 4) public subscript(r: Range) -> Substring { - let r = _validateScalarRange(r) + let r = _wholeGuts.validateScalarRange(r, in: _bounds) return Substring(_unchecked: Slice(base: base, bounds: r)) } } From 58ab3fea34d2db04b65815a7a0a937ddb6aefc97 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Sat, 9 Apr 2022 22:54:49 -0700 Subject: [PATCH 57/83] Apply suggestions from code review Co-authored-by: Alejandro Alonso --- stdlib/public/core/StringCharacterView.swift | 2 +- stdlib/public/core/StringUTF16View.swift | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/stdlib/public/core/StringCharacterView.swift b/stdlib/public/core/StringCharacterView.swift index 007660c97c65c..dd3b85e2a0018 100644 --- a/stdlib/public/core/StringCharacterView.swift +++ b/stdlib/public/core/StringCharacterView.swift @@ -313,7 +313,7 
@@ extension String: BidirectionalCollection { /// present!) /// /// This method is called from inlinable `subscript` implementations in - /// current and previous versions of the stdlib, wich require this contract + /// current and previous versions of the stdlib, which require this contract /// not to be violated. @usableFromInline @inline(__always) diff --git a/stdlib/public/core/StringUTF16View.swift b/stdlib/public/core/StringUTF16View.swift index e5fc9fa74f2e3..50768539da613 100644 --- a/stdlib/public/core/StringUTF16View.swift +++ b/stdlib/public/core/StringUTF16View.swift @@ -145,7 +145,7 @@ extension String.UTF16View: BidirectionalCollection { "String index is out of bounds") if _slowPath(_guts.isForeign) { return _foreignIndex(after: idx) } if _guts.isASCII { - return idx.nextEncoded._scalarAligned._knownUTF8._knownUTF16 + return idx.nextEncoded._scalarAligned._encodingIndependent } // For a BMP scalar (1-3 UTF-8 code units), advance past it. For a non-BMP @@ -173,7 +173,7 @@ extension String.UTF16View: BidirectionalCollection { "String index is out of bounds") if _slowPath(_guts.isForeign) { return _foreignIndex(before: idx) } if _guts.isASCII { - return idx.priorEncoded._scalarAligned._knownUTF8._knownUTF16 + return idx.priorEncoded._scalarAligned._encodingIndependent } if idx.transcodedOffset != 0 { @@ -709,7 +709,7 @@ extension String.UTF16View { if _guts.isASCII { return Index( _encodedOffset: offset - )._scalarAligned._knownUTF8._knownUTF16 + )._scalarAligned._encodingIndependent } guard _guts._useBreadcrumbs(forEncodedOffset: offset) else { From 67adcabefc84c0240e0e32c72da686183ac4b825 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Sat, 9 Apr 2022 22:55:44 -0700 Subject: [PATCH 58/83] Apply notes from code review --- stdlib/public/core/StringCharacterView.swift | 15 +++-- stdlib/public/core/StringGutsSlice.swift | 3 - .../public/core/StringUnicodeScalarView.swift | 3 +- stdlib/public/core/Substring.swift | 57 +++++++++---------- 4 files 
changed, 36 insertions(+), 42 deletions(-) diff --git a/stdlib/public/core/StringCharacterView.swift b/stdlib/public/core/StringCharacterView.swift index dd3b85e2a0018..96f9a193ce6fb 100644 --- a/stdlib/public/core/StringCharacterView.swift +++ b/stdlib/public/core/StringCharacterView.swift @@ -132,8 +132,8 @@ extension String: BidirectionalCollection { /// `index(before:)`. /// - Complexity: O(*n*), where *n* is the absolute value of `distance`. public func index(_ i: Index, offsetBy distance: Int) -> Index { - // Note: in Swift 5.6 and below, this method used to be inlinable, - // forwarding to `_index(_:offsetBy:)`. + // Note: prior to Swift 5.7, this method used to be inlinable, forwarding to + // `_index(_:offsetBy:)`. // TODO: known-ASCII and single-scalar-grapheme fast path, etc. @@ -194,8 +194,8 @@ extension String: BidirectionalCollection { public func index( _ i: Index, offsetBy distance: Int, limitedBy limit: Index ) -> Index? { - // Note: In Swift 5.6 and below, this function used to be inlinable, - // forwarding to `BidirectionalCollection._index(_:offsetBy:limitedBy:)`. + // Note: Prior to Swift 5.7, this function used to be inlinable, forwarding + // to `BidirectionalCollection._index(_:offsetBy:limitedBy:)`. // Unfortunately, that approach isn't compatible with SE-0180, as it doesn't // support cases where `i` or `limit` aren't character aligned. @@ -242,8 +242,8 @@ extension String: BidirectionalCollection { /// /// - Complexity: O(*n*), where *n* is the resulting distance. public func distance(from start: Index, to end: Index) -> Int { - // Note: In Swift 5.6 and below, this function used to be inlinable, - // forwarding to `BidirectionalCollection._distance(from:to:)`. + // Note: Prior to Swift 5.7, this function used to be inlinable, forwarding + // to `BidirectionalCollection._distance(from:to:)`. 
// FIXME: Due to the `index(after:)` problem above, this function doesn't // always return consistent results when the given indices fall between @@ -266,8 +266,7 @@ extension String: BidirectionalCollection { count += 1 i = _uncheckedIndex(after: i) } - } - else if i > end { + } else if i > end { while i > end { // Note `<` instead of `==` count -= 1 i = _uncheckedIndex(before: i) diff --git a/stdlib/public/core/StringGutsSlice.swift b/stdlib/public/core/StringGutsSlice.swift index a09cb6c96fb83..3f92c40ab408d 100644 --- a/stdlib/public/core/StringGutsSlice.swift +++ b/stdlib/public/core/StringGutsSlice.swift @@ -31,9 +31,6 @@ internal struct _StringGutsSlice { internal init(_ guts: _StringGuts, _ offsetRange: Range) { _internalInvariant( offsetRange.lowerBound >= 0 && offsetRange.upperBound <= guts.count) - _internalInvariant( - guts.isOnUnicodeScalarBoundary(offsetRange.lowerBound) - && guts.isOnUnicodeScalarBoundary(offsetRange.upperBound)) self._guts = guts self._offsetRange = offsetRange } diff --git a/stdlib/public/core/StringUnicodeScalarView.swift b/stdlib/public/core/StringUnicodeScalarView.swift index 0cb8f32fc14e2..bfbf0cdfb498b 100644 --- a/stdlib/public/core/StringUnicodeScalarView.swift +++ b/stdlib/public/core/StringUnicodeScalarView.swift @@ -183,8 +183,7 @@ extension String.UnicodeScalarView: BidirectionalCollection { count += 1 i = _uncheckedIndex(after: i) } - } - else if i > end { + } else if i > end { while i > end { count -= 1 i = _uncheckedIndex(before: i) diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index a21b43bada6e4..f6cb1bf9f879f 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -200,9 +200,9 @@ extension Substring: StringProtocol { public var endIndex: Index { _slice._endIndex } public func index(after i: Index) -> Index { - // Note: in Swift 5.6 and below, this method used to be inlinable, - // forwarding to `_slice.base.index(after:)`. 
Unfortunately, that approach - // isn't compatible with SE-0180, as it allows Unicode scalars outside the + // Note: Prior to Swift 5.7, this method used to be inlinable, forwarding to + // `_slice.base.index(after:)`. Unfortunately, that approach isn't + // compatible with SE-0180, as it allows Unicode scalars outside the // substring to affect grapheme breaking results within the substring. This // leads to Collection conformance issues when the `Substring`'s bounds do // not fall on grapheme boundaries in `base`. @@ -256,9 +256,9 @@ extension Substring: StringProtocol { } public func index(before i: Index) -> Index { - // Note: in Swift 5.6 and below, this method used to be inlinable, - // forwarding to `_slice.base.index(before:)`. Unfortunately, that approach - // isn't compatible with SE-0180, as it allows Unicode scalars outside the + // Note: Prior to Swift 5.7, this method used to be inlinable, forwarding to + // `_slice.base.index(before:)`. Unfortunately, that approach isn't + // compatible with SE-0180, as it allows Unicode scalars outside the // substring to affect grapheme breaking results within the substring. This // leads to Collection conformance issues when the `Substring`'s bounds do // not fall on grapheme boundaries in `base`. @@ -294,26 +294,26 @@ extension Substring: StringProtocol { _internalInvariant(priorOffset >= startIndex._encodedOffset) var r = Index( - encodedOffset: priorOffset, characterStride: priorStride) + encodedOffset: priorOffset, characterStride: priorStride + )._scalarAligned // Don't set the `_isCharacterAligned` bit in indices of exotic substrings // whose startIndex isn't aligned on a grapheme cluster boundary. (Their // grapheme breaks may not match with those in `base`.) 
if _startIsCharacterAligned { r = r._characterAligned - } else { - r = r._scalarAligned } + return r } public func index(_ i: Index, offsetBy distance: Int) -> Index { - // Note: in Swift 5.6 and below, this method used to be inlinable, - // forwarding to `_slice.base.index(_:offsetBy:)`. Unfortunately, that - // approach isn't compatible with SE-0180, as it allows Unicode scalars - // outside the substring to affect grapheme breaking results within the - // substring. This leads to Collection conformance issues when the - // `Substring`'s bounds do not fall on grapheme boundaries in `base`. + // Note: Prior to Swift 5.7, this method used to be inlinable, forwarding to + // `_slice.base.index(_:offsetBy:)`. Unfortunately, that approach isn't + // compatible with SE-0180, as it allows Unicode scalars outside the + // substring to affect grapheme breaking results within the substring. This + // leads to Collection conformance issues when the `Substring`'s bounds do + // not fall on grapheme boundaries in `base`. // TODO: known-ASCII and single-scalar-grapheme fast path, etc. var i = _wholeGuts.validateInclusiveCharacterIndex(i, in: _bounds) @@ -334,12 +334,12 @@ extension Substring: StringProtocol { public func index( _ i: Index, offsetBy distance: Int, limitedBy limit: Index ) -> Index? { - // Note: in Swift 5.6 and below, this method used to be inlinable, - // forwarding to `_slice.base.index(_:offsetBy:limitedBy:)`. Unfortunately, - // that approach isn't compatible with SE-0180, as it allows Unicode scalars - // outside the substring to affect grapheme breaking results within the - // substring. This leads to Collection conformance issues when the - // `Substring`'s bounds do not fall on grapheme boundaries in `base`. + // Note: Prior to Swift 5.7, this method used to be inlinable, forwarding to + // `_slice.base.index(_:offsetBy:limitedBy:)`. 
Unfortunately, that approach + // isn't compatible with SE-0180, as it allows Unicode scalars outside the + // substring to affect grapheme breaking results within the substring. This + // leads to Collection conformance issues when the `Substring`'s bounds do + // not fall on grapheme boundaries in `base`. // Per SE-0180, `i` and `limit` are allowed to fall in between grapheme // breaks, in which case this function must still terminate without trapping @@ -372,12 +372,12 @@ extension Substring: StringProtocol { } public func distance(from start: Index, to end: Index) -> Int { - // Note: in Swift 5.6 and below, this method used to be inlinable, - // forwarding to `_slice.base.distance(from:to:)`. Unfortunately, that - // approach isn't compatible with SE-0180, as it allows Unicode scalars - // outside the substring to affect grapheme breaking results within the - // substring. This leads to Collection conformance issues when the - // `Substring`'s bounds do not fall on grapheme boundaries in `base`. + // Note: Prior to Swift 5.7, this method used to be inlinable, forwarding to + // `_slice.base.distance(from:to:)`. Unfortunately, that approach isn't + // compatible with SE-0180, as it allows Unicode scalars outside the + // substring to affect grapheme breaking results within the substring. This + // leads to Collection conformance issues when the `Substring`'s bounds do + // not fall on grapheme boundaries in `base`. 
// FIXME: Due to the `index(after:)` problem above, this function doesn't // always return consistent results when the given indices fall between @@ -400,8 +400,7 @@ extension Substring: StringProtocol { count += 1 i = _uncheckedIndex(after: i) } - } - else if i > end { + } else if i > end { while i > end { // Note `<` instead of `==` count -= 1 i = _uncheckedIndex(before: i) From bbb004854ea0fc4a7bbb2a957b204b286530c41b Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Sun, 10 Apr 2022 16:49:01 -0700 Subject: [PATCH 59/83] [stdlib] Minor enhancements --- stdlib/public/core/StringUnicodeScalarView.swift | 2 +- stdlib/public/core/UnicodeHelpers.swift | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/stdlib/public/core/StringUnicodeScalarView.swift b/stdlib/public/core/StringUnicodeScalarView.swift index bfbf0cdfb498b..1bc33b7776248 100644 --- a/stdlib/public/core/StringUnicodeScalarView.swift +++ b/stdlib/public/core/StringUnicodeScalarView.swift @@ -143,7 +143,7 @@ extension String.UnicodeScalarView: BidirectionalCollection { _utf8ScalarLength(utf8, endingAt: i._encodedOffset) } _internalInvariant(len <= 4, "invalid UTF8") - return i.encoded(offsetBy: -len)._scalarAligned._knownUTF8 + return i.encoded(offsetBy: 0 &- len)._scalarAligned._knownUTF8 } return _foreignIndex(before: i) diff --git a/stdlib/public/core/UnicodeHelpers.swift b/stdlib/public/core/UnicodeHelpers.swift index adb277077e3dc..1ef9ad7a0ff9b 100644 --- a/stdlib/public/core/UnicodeHelpers.swift +++ b/stdlib/public/core/UnicodeHelpers.swift @@ -178,6 +178,7 @@ extension _StringGuts { @inline(never) // slow-path @_alwaysEmitIntoClient // Swift 5.1 + @_effects(releasenone) internal func scalarAlignSlow(_ idx: Index) -> Index { _internalInvariant_5_1(!idx._isScalarAligned) From d3df05cb30577097c8378cc488b41540bc8bfcfa Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Sun, 10 Apr 2022 16:51:51 -0700 Subject: [PATCH 60/83] [stdlib] String.Index: Remove custom printing --- 
stdlib/public/core/StringIndex.swift | 34 ---------------------------- 1 file changed, 34 deletions(-) diff --git a/stdlib/public/core/StringIndex.swift b/stdlib/public/core/StringIndex.swift index 048c558854320..71b52400adb94 100644 --- a/stdlib/public/core/StringIndex.swift +++ b/stdlib/public/core/StringIndex.swift @@ -516,37 +516,3 @@ extension String.Index: Hashable { hasher.combine(orderingValue) } } - -// FIXME: This is for debugging only; remove before merging. -extension String.Index { - @_alwaysEmitIntoClient - @inline(never) - public var description: String { - var d = "String.Index(" - d += "offset: \(_encodedOffset)" - if transcodedOffset != 0 { - d += "+\(transcodedOffset)" - } - - d += ", encoding: " - switch (_rawBits & Self.__utf8Bit != 0, _rawBits & Self.__utf16Bit != 0) { - case (false, false): d += "unknown" - case (true, false): d += "utf8" - case (false, true): d += "utf16" - case (true, true): d += "any" - } - if _isCharacterAligned { - d += ", aligned: character" - } else if _isScalarAligned { - d += ", aligned: scalar" - } - if let stride = characterStride { - d += ", stride: \(stride)" - } - d += ")" - return d - } -} - -@available(SwiftStdlib 5.7, *) -extension String.Index: CustomStringConvertible {} From dcfc26cbc563e3ccfd9bb41c6d600dc42dd65d64 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Sun, 10 Apr 2022 16:53:48 -0700 Subject: [PATCH 61/83] [stdlib][NFC] Doc adjustments --- stdlib/public/core/StringIndex.swift | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/stdlib/public/core/StringIndex.swift b/stdlib/public/core/StringIndex.swift index 71b52400adb94..3ac30b97e2130 100644 --- a/stdlib/public/core/StringIndex.swift +++ b/stdlib/public/core/StringIndex.swift @@ -49,9 +49,15 @@ isn't frozen. - * b3: UTF-16 encoding + * b0: `_isScalarAligned` - If set, the position is known to be expressed in UTF-16 code units. + If set, index is known to be on a Unicode scalar boundary (see below). 
+ (Introduced in Swift 5.1) + + * b1: `_isCharacterAligned` + + If set, the index is known to be on an extended grapheme cluster + boundary (i.e., on a Swift `Character`.) (Introduced in Swift 5.7) * b2: UTF-8 encoding @@ -59,17 +65,11 @@ isn't frozen. If set, the position is known to be expressed in UTF-8 code units. (Introduced in Swift 5.7) - * b1: `_isCharacterAligned` + * b3: UTF-16 encoding - If set, the index is known to be on an extended grapheme cluster - boundary (i.e., on a Swift `Character`.) + If set, the position is known to be expressed in UTF-16 code units. (Introduced in Swift 5.7) - * b0: `_isScalarAligned` - - If set, index is known to be on a Unicode scalar boundary (see below). - (Introduced in Swift 5.1) - Before Swift 5.7, bits b1, b2 and b3 used to be part of the resilient slice. See the notes on Character Alignment and Index Encoding below to see how this works. From ed7d60c711be891e88969f62c1b4010bff7cba40 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Mon, 11 Apr 2022 14:03:01 -0700 Subject: [PATCH 62/83] [stdlib] Remove unused fn --- stdlib/public/core/StringGuts.swift | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/stdlib/public/core/StringGuts.swift b/stdlib/public/core/StringGuts.swift index 7ed4ef8691f89..f7d79e48f1607 100644 --- a/stdlib/public/core/StringGuts.swift +++ b/stdlib/public/core/StringGuts.swift @@ -299,16 +299,6 @@ extension _StringGuts { } } -@_alwaysEmitIntoClient -@inline(__always) -func _isSwiftStdlib_5_7() -> Bool { - if #available(macOS 9999, iOS 9999, watchOS 9999, tvOS 9999, *) { // SwiftStdlib 5.7 - return true - } else { - return false - } -} - // Encoding extension _StringGuts { /// Returns whether this string has a UTF-8 storage representation. 
From f27005b8d0e7e2f9a0a3bcc8247761940b32f3ae Mon Sep 17 00:00:00 2001 From: Allan Shortlidge Date: Fri, 8 Apr 2022 19:11:14 -0700 Subject: [PATCH 63/83] ModuleInterface: When printing synthesized extensions, we need to be sure to guard them with required features if applicable. Not doing so can result in broken interfaces that do not typecheck because, for instance, a conformance can refer to a nominal type that is only declared when certain features are enabled. Also, fix a typo where `#elsif` was printed into interfaces instead of `#elseif`. Resolves rdar://91509673 --- include/swift/AST/PrintOptions.h | 6 ++ lib/AST/ASTPrinter.cpp | 25 +++--- lib/Frontend/ModuleInterfaceSupport.cpp | 103 +++++++++++++----------- test/ModuleInterface/features.swift | 11 ++- 4 files changed, 82 insertions(+), 63 deletions(-) diff --git a/include/swift/AST/PrintOptions.h b/include/swift/AST/PrintOptions.h index 9419aef5312f9..de095ed34659a 100644 --- a/include/swift/AST/PrintOptions.h +++ b/include/swift/AST/PrintOptions.h @@ -391,6 +391,12 @@ struct PrintOptions { /// Whether to use an empty line to separate two members in a single decl. bool EmptyLineBetweenMembers = false; + /// Whether to print empty members of a declaration on a single line, e.g.: + /// ``` + /// extension Foo: Bar {} + /// ``` + bool PrintEmptyMembersOnSameLine = false; + /// Whether to print the extensions from conforming protocols. 
bool PrintExtensionFromConformingProtocols = false; diff --git a/lib/AST/ASTPrinter.cpp b/lib/AST/ASTPrinter.cpp index 299882abf81a6..0b8af5f8e5eb8 100644 --- a/lib/AST/ASTPrinter.cpp +++ b/lib/AST/ASTPrinter.cpp @@ -2335,7 +2335,8 @@ void PrintAST::printMembers(ArrayRef members, bool needComma, bool openBracket, bool closeBracket) { if (openBracket) { Printer << " {"; - Printer.printNewline(); + if (!Options.PrintEmptyMembersOnSameLine || !members.empty()) + Printer.printNewline(); } { IndentRAII indentMore(*this); @@ -3128,13 +3129,12 @@ static FeatureSet getUniqueFeaturesUsed(Decl *decl) { return features; } -static void printCompatibilityCheckIf(ASTPrinter &printer, - bool isElsif, +static void printCompatibilityCheckIf(ASTPrinter &printer, bool isElseIf, bool includeCompilerCheck, const BasicFeatureSet &features) { assert(!features.empty()); - printer << (isElsif ? "#elsif " : "#if "); + printer << (isElseIf ? "#elseif " : "#if "); if (includeCompilerCheck) printer << "compiler(>=5.3) && "; @@ -3150,7 +3150,7 @@ static void printCompatibilityCheckIf(ASTPrinter &printer, printer.printNewline(); } -/// Generate a #if ... #elsif ... #endif chain for the given +/// Generate a #if ... #elseif ... #endif chain for the given /// suppressible feature checks. static void printWithSuppressibleFeatureChecks(ASTPrinter &printer, PrintOptions &options, @@ -3171,18 +3171,17 @@ static void printWithSuppressibleFeatureChecks(ASTPrinter &printer, return; } - // Otherwise, enter a `#if` or `#elsif` for the next feature. + // Otherwise, enter a `#if` or `#elseif` for the next feature. Feature feature = generator.next(); - printCompatibilityCheckIf(printer, /*elsif*/ !firstInChain, - includeCompilerCheck, - {feature}); + printCompatibilityCheckIf(printer, /*elseif*/ !firstInChain, + includeCompilerCheck, {feature}); // Print the body. 
printBody(); printer.printNewline(); // Start suppressing the feature and recurse to either generate - // more `#elsif` clauses or finish off with `#endif`. + // more `#elseif` clauses or finish off with `#endif`. suppressingFeature(options, feature, [&] { printWithSuppressibleFeatureChecks(printer, options, /*first*/ false, includeCompilerCheck, generator, @@ -3195,13 +3194,13 @@ static void printWithSuppressibleFeatureChecks(ASTPrinter &printer, /// /// In the most general form, with both required features and multiple /// suppressible features in play, the generated code pattern looks like -/// the following (assuming that feaature $bar implies feature $baz): +/// the following (assuming that feature $bar implies feature $baz): /// /// ``` /// #if compiler(>=5.3) && $foo /// #if $bar /// @foo @bar @baz func @test() {} -/// #elsif $baz +/// #elseif $baz /// @foo @baz func @test() {} /// #else /// @foo func @test() {} @@ -3229,7 +3228,7 @@ void swift::printWithCompatibilityFeatureChecks(ASTPrinter &printer, bool hasRequiredFeatures = features.hasAnyRequired(); if (hasRequiredFeatures) { printCompatibilityCheckIf(printer, - /*elsif*/ false, + /*elseif*/ false, /*compiler check*/ true, features.requiredFeatures()); } diff --git a/lib/Frontend/ModuleInterfaceSupport.cpp b/lib/Frontend/ModuleInterfaceSupport.cpp index aa710bb0c82d3..9e248fbf2eb4e 100644 --- a/lib/Frontend/ModuleInterfaceSupport.cpp +++ b/lib/Frontend/ModuleInterfaceSupport.cpp @@ -19,7 +19,9 @@ #include "swift/AST/FileSystem.h" #include "swift/AST/Module.h" #include "swift/AST/ModuleNameLookup.h" +#include "swift/AST/NameLookupRequests.h" #include "swift/AST/ProtocolConformance.h" +#include "swift/AST/TypeCheckRequests.h" #include "swift/AST/TypeRepr.h" #include "swift/Basic/STLExtras.h" #include "swift/Frontend/Frontend.h" @@ -569,13 +571,14 @@ class InheritedProtocolCollector { }); } + // Preserve the behavior of previous implementations which formatted of + // empty extensions compactly with '{}' 
on the same line. + PrintOptions extensionPrintOptions = printOptions; + extensionPrintOptions.PrintEmptyMembersOnSameLine = true; + // Then walk the remaining ones, and see what we need to print. - // Note: We could do this in one pass, but the logic is easier to - // understand if we build up the list and then print it, even if it takes - // a bit more memory. // FIXME: This will pick the availability attributes from the first sight // of a protocol rather than the maximally available case. - SmallVector protocolsToPrint; for (const auto &protoAndAvailability : ExtraProtocols) { auto proto = std::get<0>(protoAndAvailability); auto availability = std::get<1>(protoAndAvailability); @@ -601,58 +604,64 @@ class InheritedProtocolCollector { if (isPublicOrUsableFromInline(inherited) && conformanceDeclaredInModule(M, nominal, inherited) && !M->isImportedImplementationOnly(inherited->getParentModule())) { - protocolsToPrint.push_back( - ProtocolAndAvailability(inherited, availability, isUnchecked, - otherAttrs)); + auto protoAndAvailability = ProtocolAndAvailability( + inherited, availability, isUnchecked, otherAttrs); + printSynthesizedExtension(out, extensionPrintOptions, M, nominal, + protoAndAvailability); return TypeWalker::Action::SkipChildren; } return TypeWalker::Action::Continue; }); } - if (protocolsToPrint.empty()) - return; - - for (const auto &protoAndAvailability : protocolsToPrint) { - StreamPrinter printer(out); - auto proto = std::get<0>(protoAndAvailability); - auto availability = std::get<1>(protoAndAvailability); - auto isUnchecked = std::get<2>(protoAndAvailability); - auto otherAttrs = std::get<3>(protoAndAvailability); - - PrintOptions curPrintOptions = printOptions; - auto printBody = [&] { - // FIXME: Shouldn't this be an implicit conversion? 
- TinyPtrVector attrs; - attrs.insert(attrs.end(), availability.begin(), availability.end()); - auto spiAttributes = proto->getAttrs().getAttributes(); - attrs.insert(attrs.end(), spiAttributes.begin(), spiAttributes.end()); - attrs.insert(attrs.end(), otherAttrs.begin(), otherAttrs.end()); - DeclAttributes::print(printer, curPrintOptions, attrs); - - printer << "extension "; - { - bool oldFullyQualifiedTypesIfAmbiguous = - curPrintOptions.FullyQualifiedTypesIfAmbiguous; - curPrintOptions.FullyQualifiedTypesIfAmbiguous = - curPrintOptions.FullyQualifiedExtendedTypesIfAmbiguous; - nominal->getDeclaredType().print(printer, curPrintOptions); - curPrintOptions.FullyQualifiedTypesIfAmbiguous = - oldFullyQualifiedTypesIfAmbiguous; - } - printer << " : "; - - if (isUnchecked) - printer << "@unchecked "; + } - proto->getDeclaredInterfaceType()->print(printer, curPrintOptions); + /// Prints a dummy extension on \p nominal to \p out for a public conformance + /// to the protocol contained by \p protoAndAvailability. + static void + printSynthesizedExtension(raw_ostream &out, const PrintOptions &printOptions, + ModuleDecl *M, const NominalTypeDecl *nominal, + ProtocolAndAvailability &protoAndAvailability) { + StreamPrinter printer(out); + + auto proto = std::get<0>(protoAndAvailability); + auto availability = std::get<1>(protoAndAvailability); + auto isUnchecked = std::get<2>(protoAndAvailability); + auto otherAttrs = std::get<3>(protoAndAvailability); + + // Create a synthesized ExtensionDecl for the conformance. + ASTContext &ctx = M->getASTContext(); + auto inherits = ctx.AllocateCopy(llvm::makeArrayRef(InheritedEntry( + TypeLoc::withoutLoc(proto->getDeclaredInterfaceType()), isUnchecked))); + auto extension = + ExtensionDecl::create(ctx, SourceLoc(), nullptr, inherits, + nominal->getModuleScopeContext(), nullptr); + extension->setImplicit(); + + // Build up synthesized DeclAttributes for the extension. 
+ TinyPtrVector attrs; + attrs.insert(attrs.end(), availability.begin(), availability.end()); + auto spiAttributes = + proto->getAttrs().getAttributes(); + attrs.insert(attrs.end(), spiAttributes.begin(), spiAttributes.end()); + attrs.insert(attrs.end(), otherAttrs.begin(), otherAttrs.end()); + + // Since DeclAttributes is a linked list where each added attribute becomes + // the head, we need to add these attributes in reverse order to reproduce + // the order in which previous implementations printed these attributes. + DeclAttributes declAttrs; + for (auto attr = attrs.rbegin(), end = attrs.rend(); attr != end; ++attr) { + declAttrs.add(const_cast(*attr)); + } + extension->getAttrs() = declAttrs; - printer << " {}"; - }; + ctx.evaluator.cacheOutput(ExtendedTypeRequest{extension}, + nominal->getDeclaredType()); + ctx.evaluator.cacheOutput(ExtendedNominalRequest{extension}, + const_cast(nominal)); - printBody(); - printer << "\n"; - } + extension->print(printer, printOptions); + printer << "\n"; } /// If there were any conditional conformances that couldn't be printed, diff --git a/test/ModuleInterface/features.swift b/test/ModuleInterface/features.swift index 206365aba309d..0254acb2fe9c0 100644 --- a/test/ModuleInterface/features.swift +++ b/test/ModuleInterface/features.swift @@ -1,7 +1,8 @@ // RUN: %empty-directory(%t) -// RUN: %target-swift-frontend -typecheck -swift-version 5 -module-name FeatureTest -emit-module-interface-path - -enable-library-evolution -disable-availability-checking %s | %FileCheck %s -// REQUIRES: concurrency +// RUN: %target-swift-frontend -typecheck -swift-version 5 -module-name FeatureTest -emit-module-interface-path %t/FeatureTest.swiftinterface -enable-library-evolution -disable-availability-checking %s +// RUN: %FileCheck %s < %t/FeatureTest.swiftinterface +// RUN: %target-swift-frontend -typecheck-module-from-interface -disable-availability-checking -swift-version 5 -module-name FeatureTest %t/FeatureTest.swiftinterface // 
REQUIRES: concurrency @@ -166,7 +167,7 @@ public func unsafeInheritExecutor() async {} // CHECK-NEXT: #if $UnsafeInheritExecutor // CHECK-NEXT: @_specialize{{.*}} // CHECK-NEXT: @_unsafeInheritExecutor public func multipleSuppressible(value: T) async -// CHECK-NEXT: #elsif $SpecializeAttributeWithAvailability +// CHECK-NEXT: #elseif $SpecializeAttributeWithAvailability // CHECK-NEXT: @_specialize{{.*}} // CHECK-NEXT: public func multipleSuppressible(value: T) async // CHECK-NEXT: #else @@ -195,3 +196,7 @@ public func unavailableFromAsyncFunc() { } public func noAsyncFunc() { } // CHECK-NOT: extension FeatureTest.MyActor : Swift.Sendable + +// CHECK: #if compiler(>=5.3) && $GlobalActors +// CHECK-NEXT: extension FeatureTest.SomeGlobalActor : _Concurrency.GlobalActor {} +// CHECK-NEXT: #endif From 680bf2e7537f66cf707ce0f735f31a53c2df4b77 Mon Sep 17 00:00:00 2001 From: Ben Barham Date: Wed, 13 Apr 2022 14:51:43 -0700 Subject: [PATCH 64/83] [CursorInfo] Add ObjC location to generated symbol graph Add the file for an imported ObjC-symbol to the generated symbol graph when ObjC documentation is requested. Skips line/column information for now since it's not needed. If we add those we should extract location retrieval to a common method for both cursor info and symbol graph gen. Resolves rdar://91658873. 
--- lib/SymbolGraphGen/Symbol.cpp | 25 +++++++++++++++++++ .../CursorInfo/cursor_symbol_graph_objc.swift | 17 +++++++++++++ 2 files changed, 42 insertions(+) diff --git a/lib/SymbolGraphGen/Symbol.cpp b/lib/SymbolGraphGen/Symbol.cpp index 47c57f68b644d..f411b6b60ac7a 100644 --- a/lib/SymbolGraphGen/Symbol.cpp +++ b/lib/SymbolGraphGen/Symbol.cpp @@ -21,6 +21,7 @@ #include "swift/Basic/Unicode.h" #include "clang/AST/ASTContext.h" #include "clang/AST/Decl.h" +#include "clang/Basic/SourceManager.h" #include "AvailabilityMixin.h" #include "JSON.h" #include "Symbol.h" @@ -409,6 +410,30 @@ void Symbol::serializeAccessLevelMixin(llvm::json::OStream &OS) const { } void Symbol::serializeLocationMixin(llvm::json::OStream &OS) const { + if (ClangNode ClangN = VD->getClangNode()) { + if (!Graph->Walker.Options.IncludeClangDocs) + return; + + if (auto *ClangD = ClangN.getAsDecl()) { + clang::SourceManager &ClangSM = + ClangD->getASTContext().getSourceManager(); + + clang::PresumedLoc Loc = ClangSM.getPresumedLoc(ClangD->getLocation()); + if (Loc.isValid()) { + // TODO: We should use a common function to fill in the location + // information for both cursor info and symbol graph gen, then also + // include position here. + OS.attributeObject("location", [&](){ + SmallString<1024> FileURI("file://"); + FileURI.append(Loc.getFilename()); + OS.attribute("uri", FileURI.str()); + }); + } + } + + return; + } + auto Loc = VD->getLoc(/*SerializedOK=*/true); if (Loc.isInvalid()) { return; diff --git a/test/SourceKit/CursorInfo/cursor_symbol_graph_objc.swift b/test/SourceKit/CursorInfo/cursor_symbol_graph_objc.swift index c57137d609e44..f2a04bbe51d6e 100644 --- a/test/SourceKit/CursorInfo/cursor_symbol_graph_objc.swift +++ b/test/SourceKit/CursorInfo/cursor_symbol_graph_objc.swift @@ -1,5 +1,10 @@ // REQUIRES: objc_interop +// TODO: Add some way to specify extra options to symbolgraph-extract and then +// split this test into parts under the SymbolGraph directory. 
The SK +// tests should just check that we run the generation on 1. the correct +// symbol and 2. with the correct options. + // RUN: %empty-directory(%t) // RUN: split-file --leading-lines %s %t @@ -10,6 +15,9 @@ func test(s: ObjCStruct) { // RUN: %sourcekitd-test -req=cursor -pos=%(line+1):3 -req-opts=retrieve_symbol_graph=1 %t/use.swift -- -I %t/mod -target %target-triple %t/use.swift | %FileCheck -check-prefix=CHECK-FUNC %s someFunc() + // RUN: %sourcekitd-test -req=cursor -pos=%(line+1):3 -req-opts=retrieve_symbol_graph=1 %t/use.swift -- -I %t/mod -target %target-triple %t/use.swift | %FileCheck -check-prefix=CHECK-DIRECTIVE-FUNC %s + funcUnderDirective() + // RUN: %sourcekitd-test -req=cursor -pos=%(line+1):9 -req-opts=retrieve_symbol_graph=1 %t/use.swift -- -I %t/mod -target %target-triple %t/use.swift | %FileCheck -check-prefix=CHECK-NO %s _ = s.noDoc @@ -100,6 +108,9 @@ func test(s: ObjCStruct) { // CHECK-FUNC: "displayName": "Function", // CHECK-FUNC: "identifier": "swift.func" // CHECK-FUNC: }, +// CHECK-FUNC: "location": { +// CHECK-FUNC: "uri": "file://{{.*}}mod{{\\\\|/}}M.h" +// CHECK-FUNC: }, // CHECK-FUNC: "names": { // CHECK-FUNC: "subHeading": [ // CHECK-FUNC: { @@ -266,3 +277,9 @@ struct ObjCStruct { // CHECK-MIXED-DOC-NEXT: ] // CHECK-MIXED-DOC-NEXT: } }; + +#line 10 "other.h" +void funcUnderDirective(void); +// CHECK-DIRECTIVE-FUNC: "location": { +// CHECK-DIRECTIVE-FUNC: "uri": "file://{{.*}}other.h" +// CHECK-DIRECTIVE-FUNC: } From b33fefb71c41d0f12395599c029de816eb22bec0 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Wed, 13 Apr 2022 18:38:41 -0700 Subject: [PATCH 65/83] [stdlib] String: be more consistent about when markEncoding is called --- stdlib/public/core/StringGuts.swift | 1 + .../public/core/StringUnicodeScalarView.swift | 4 ++-- stdlib/public/core/Substring.swift | 18 ++++++------------ 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/stdlib/public/core/StringGuts.swift b/stdlib/public/core/StringGuts.swift index 
f7d79e48f1607..f1dcf8442e107 100644 --- a/stdlib/public/core/StringGuts.swift +++ b/stdlib/public/core/StringGuts.swift @@ -316,6 +316,7 @@ extension _StringGuts { internal var isUTF16: Bool { _object.isUTF16 } @_alwaysEmitIntoClient // Swift 5.7 + @inline(__always) internal func markEncoding(_ i: String.Index) -> String.Index { isUTF8 ? i._knownUTF8 : i._knownUTF16 } diff --git a/stdlib/public/core/StringUnicodeScalarView.swift b/stdlib/public/core/StringUnicodeScalarView.swift index 1bc33b7776248..a5d945308dd37 100644 --- a/stdlib/public/core/StringUnicodeScalarView.swift +++ b/stdlib/public/core/StringUnicodeScalarView.swift @@ -207,7 +207,7 @@ extension String.UnicodeScalarView: BidirectionalCollection { i = _uncheckedIndex(before: i) } } - return _guts.markEncoding(i) + return i } @_alwaysEmitIntoClient @@ -238,7 +238,7 @@ extension String.UnicodeScalarView: BidirectionalCollection { } guard limit > start || i >= limit else { return nil } } - return _guts.markEncoding(i) + return i } } diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index f6cb1bf9f879f..ac1476f1f1a72 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -208,8 +208,7 @@ extension Substring: StringProtocol { // not fall on grapheme boundaries in `base`. let i = _wholeGuts.validateCharacterIndex(i, in: _bounds) - let r = _uncheckedIndex(after: i) - return _wholeGuts.markEncoding(r) + return _uncheckedIndex(after: i) } /// A version of `index(after:)` that assumes that the given index: @@ -217,8 +216,6 @@ extension Substring: StringProtocol { /// - has the right encoding, /// - is within bounds, and /// - is character aligned within this substring. - /// - /// It does not mark the encoding of the returned index. 
internal func _uncheckedIndex(after i: Index) -> Index { _internalInvariant(_wholeGuts.hasMatchingEncoding(i)) _internalInvariant(i._isScalarAligned) @@ -252,7 +249,7 @@ extension Substring: StringProtocol { r = r._characterAligned } - return r + return _wholeGuts.markEncoding(r) } public func index(before i: Index) -> Index { @@ -269,8 +266,7 @@ extension Substring: StringProtocol { // alignment/validation work. _precondition(i > startIndex, "Substring index is out of bounds") - let r = _uncheckedIndex(before: i) - return _wholeGuts.markEncoding(r) + return _uncheckedIndex(before: i) } /// A version of `index(before:)` that assumes that the given index: @@ -278,8 +274,6 @@ extension Substring: StringProtocol { /// - has the right encoding, /// - is within bounds, and /// - is character aligned within this substring. - /// - /// It does not mark the encoding of the returned index. internal func _uncheckedIndex(before i: Index) -> Index { _internalInvariant(_wholeGuts.hasMatchingEncoding(i)) _internalInvariant(i._isScalarAligned) @@ -304,7 +298,7 @@ extension Substring: StringProtocol { r = r._characterAligned } - return r + return _wholeGuts.markEncoding(r) } public func index(_ i: Index, offsetBy distance: Int) -> Index { @@ -328,7 +322,7 @@ extension Substring: StringProtocol { i = _uncheckedIndex(before: i) } } - return _wholeGuts.markEncoding(i) + return i } public func index( @@ -368,7 +362,7 @@ extension Substring: StringProtocol { } guard limit > start || i >= limit else { return nil } } - return _wholeGuts.markEncoding(i) + return i } public func distance(from start: Index, to end: Index) -> Int { From 89d69a946729566a8590df182b38a10e66070ba6 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Wed, 13 Apr 2022 18:39:14 -0700 Subject: [PATCH 66/83] [stdlib] Revert noop change --- stdlib/public/core/StringBridge.swift | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/stdlib/public/core/StringBridge.swift 
b/stdlib/public/core/StringBridge.swift index a26bdc8d3cf37..a62dbeec5365a 100644 --- a/stdlib/public/core/StringBridge.swift +++ b/stdlib/public/core/StringBridge.swift @@ -614,12 +614,10 @@ extension String { // TODO: We'd rather emit a valid ObjC object statically than create a // shared string class instance. let gutsCountAndFlags = _guts._object._countAndFlags - let countAndFlags = _StringObject.CountAndFlags( - sharedCount: _guts.count, - isASCII: gutsCountAndFlags.isASCII) return __SharedStringStorage( immortal: _guts._object.fastUTF8.baseAddress!, - countAndFlags: countAndFlags) + countAndFlags: _StringObject.CountAndFlags( + sharedCount: _guts.count, isASCII: gutsCountAndFlags.isASCII)) } _internalInvariant(_guts._object.hasObjCBridgeableObject, From cb2194c024d9d8e95344886b470140d1ccbf0908 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Wed, 13 Apr 2022 19:15:30 -0700 Subject: [PATCH 67/83] [stdlib] Fix ABI and portability issues --- stdlib/public/core/StringUnicodeScalarView.swift | 2 +- stdlib/public/core/Substring.swift | 2 +- test/stdlib/StringIndex.swift | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/stdlib/public/core/StringUnicodeScalarView.swift b/stdlib/public/core/StringUnicodeScalarView.swift index a5d945308dd37..fa00d00bdfc05 100644 --- a/stdlib/public/core/StringUnicodeScalarView.swift +++ b/stdlib/public/core/StringUnicodeScalarView.swift @@ -319,7 +319,7 @@ extension String { @inline(__always) get { return UnicodeScalarView(_guts) } @inline(__always) set { _guts = newValue._guts } - @_alwaysEmitIntoClient @inline(__always) // 5.7 + @inlinable @inline(__always) _modify { var view = self.unicodeScalars self = "" diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index ac1476f1f1a72..c9c81206c53f6 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -141,7 +141,7 @@ extension Substring { @inlinable @inline(__always) internal var _offsetRange: 
Range { _slice._bounds._encodedOffsetRange } - @inlinable @inline(__always) + @_alwaysEmitIntoClient @inline(__always) internal var _bounds: Range { _slice._bounds } } diff --git a/test/stdlib/StringIndex.swift b/test/stdlib/StringIndex.swift index 6e64926051423..e75024be579a8 100644 --- a/test/stdlib/StringIndex.swift +++ b/test/stdlib/StringIndex.swift @@ -604,12 +604,13 @@ suite.test("Fully exhaustive index interchange") fullyExhaustiveIndexInterchange(string) } +#if _runtime(_ObjC) suite.test("Fully exhaustive index interchange/GraphemeBreakTests") { for string in graphemeBreakTests.map { $0.0 } { fullyExhaustiveIndexInterchange(string) } } - +#endif suite.test("Global vs local grapheme cluster boundaries") { guard #available(SwiftStdlib 5.7, *) else { From c67f08b2aff966d11a1d10aae4fa134046d7dbc5 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Wed, 13 Apr 2022 19:15:42 -0700 Subject: [PATCH 68/83] [stdlib][NFC] Remove obsolete note --- stdlib/public/core/StringCharacterView.swift | 5 ----- 1 file changed, 5 deletions(-) diff --git a/stdlib/public/core/StringCharacterView.swift b/stdlib/public/core/StringCharacterView.swift index 96f9a193ce6fb..ad722071b43f4 100644 --- a/stdlib/public/core/StringCharacterView.swift +++ b/stdlib/public/core/StringCharacterView.swift @@ -245,11 +245,6 @@ extension String: BidirectionalCollection { // Note: Prior to Swift 5.7, this function used to be inlinable, forwarding // to `BidirectionalCollection._distance(from:to:)`. - // FIXME: Due to the `index(after:)` problem above, this function doesn't - // always return consistent results when the given indices fall between - // grapheme breaks -- swapping `start` and `end` may change the magnitude of - // the result. 
- let start = _guts.validateInclusiveCharacterIndex(start) let end = _guts.validateInclusiveCharacterIndex(end) From 8cef6d5572529342bfab89f2001e7d80515ccfd9 Mon Sep 17 00:00:00 2001 From: Doug Gregor Date: Wed, 13 Apr 2022 23:19:50 -0700 Subject: [PATCH 69/83] Don't emit `@preconcurrency import` warnings for Swift interfaces Fixes rdar://88758592. --- lib/Sema/TypeChecker.cpp | 10 ++++++++++ .../SilencePreconcurrency.swiftinterface | 7 +++++++ 2 files changed, 17 insertions(+) create mode 100644 test/ModuleInterface/SilencePreconcurrency.swiftinterface diff --git a/lib/Sema/TypeChecker.cpp b/lib/Sema/TypeChecker.cpp index f366e90e8047d..6d9287e8d8ffc 100644 --- a/lib/Sema/TypeChecker.cpp +++ b/lib/Sema/TypeChecker.cpp @@ -268,6 +268,16 @@ void swift::performTypeChecking(SourceFile &SF) { /// there were no diagnostics downgraded or suppressed due to that /// @preconcurrency, suggest that the attribute be removed. static void diagnoseUnnecessaryPreconcurrencyImports(SourceFile &sf) { + switch (sf.Kind) { + case SourceFileKind::Interface: + case SourceFileKind::SIL: + return; + + case SourceFileKind::Library: + case SourceFileKind::Main: + break; + } + ASTContext &ctx = sf.getASTContext(); for (const auto &import : sf.getImports()) { if (import.options.contains(ImportFlags::Preconcurrency) && diff --git a/test/ModuleInterface/SilencePreconcurrency.swiftinterface b/test/ModuleInterface/SilencePreconcurrency.swiftinterface new file mode 100644 index 0000000000000..d3ed5bd250c30 --- /dev/null +++ b/test/ModuleInterface/SilencePreconcurrency.swiftinterface @@ -0,0 +1,7 @@ +// swift-interface-format-version: 1.0 +// swift-module-flags: -target x86_64-apple-macos10.9 -module-name SilencePreconcurrency + +// RUN: %empty-directory(%t) +// RUN: %target-swift-frontend -compile-module-from-interface -o %/t/SilencePreconcurrency.swiftmodule %s -verify + +@preconcurrency import Swift From c5e99e453f1ac0a7140517773bbe760078cc2f79 Mon Sep 17 00:00:00 2001 From: Saleem Abdulrasool 
Date: Thu, 14 Apr 2022 06:52:25 -0700 Subject: [PATCH 70/83] Concurrency: include missing header Include the new header for placement new. --- stdlib/public/Concurrency/AsyncLet.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/stdlib/public/Concurrency/AsyncLet.cpp b/stdlib/public/Concurrency/AsyncLet.cpp index e1542cb18df91..70e2fd833de47 100644 --- a/stdlib/public/Concurrency/AsyncLet.cpp +++ b/stdlib/public/Concurrency/AsyncLet.cpp @@ -31,6 +31,8 @@ #include #endif +#include + using namespace swift; namespace { From 4a0f6cedc988252a9e72681d36de2b307ee9e882 Mon Sep 17 00:00:00 2001 From: Anthony Latsis Date: Thu, 14 Apr 2022 10:27:00 +0300 Subject: [PATCH 71/83] CS: `optional` storage key path components are read-only --- lib/IRGen/GenObjC.cpp | 11 ++++- lib/Sema/TypeCheckStorage.cpp | 4 +- test/SILGen/keypaths_objc.swift | 37 +++++++++------ ...wift-keypath-objc-optional-component.swift | 45 +++++++++++++++++++ 4 files changed, 81 insertions(+), 16 deletions(-) create mode 100644 test/expr/primary/keypath/swift-keypath-objc-optional-component.swift diff --git a/lib/IRGen/GenObjC.cpp b/lib/IRGen/GenObjC.cpp index d99ef7b718d7c..595cc67242619 100644 --- a/lib/IRGen/GenObjC.cpp +++ b/lib/IRGen/GenObjC.cpp @@ -1279,7 +1279,10 @@ irgen::emitObjCGetterDescriptorParts(IRGenModule &IGM, ObjCMethodDescriptor irgen::emitObjCSetterDescriptorParts(IRGenModule &IGM, VarDecl *property) { - assert(property->isSettable(property->getDeclContext()) && + // Optional properties support mutation on the Objective-C side, but not the + // Swift side. 
+ assert((property->getAttrs().hasAttribute() || + property->isSettable(property->getDeclContext())) && "not a settable property?!"); Selector setterSel(property, Selector::ForSetter); @@ -1320,7 +1323,11 @@ irgen::emitObjCSetterDescriptorParts(IRGenModule &IGM, ObjCMethodDescriptor irgen::emitObjCSetterDescriptorParts(IRGenModule &IGM, SubscriptDecl *subscript) { - assert(subscript->supportsMutation() && "not a settable subscript?!"); + // Optional subscripts support mutation on the Objective-C side, but not the + // Swift side. + assert((subscript->getAttrs().hasAttribute() || + subscript->supportsMutation()) && + "not a settable subscript?!"); Selector setterSel(subscript, Selector::ForSetter); ObjCMethodDescriptor descriptor{}; diff --git a/lib/Sema/TypeCheckStorage.cpp b/lib/Sema/TypeCheckStorage.cpp index 752d15c3551a7..1856b35a2086c 100644 --- a/lib/Sema/TypeCheckStorage.cpp +++ b/lib/Sema/TypeCheckStorage.cpp @@ -3309,7 +3309,9 @@ StorageImplInfoRequest::evaluate(Evaluator &evaluator, WriteImplKind writeImpl = WriteImplKind::Immutable; ReadWriteImplKind readWriteImpl = ReadWriteImplKind::Immutable; - if (storage->getParsedAccessor(AccessorKind::Set)) { + // TODO: Writing to optional storage requirements is not supported. 
+ if (!storage->getAttrs().hasAttribute() && + storage->getParsedAccessor(AccessorKind::Set)) { readImpl = ReadImplKind::Get; writeImpl = WriteImplKind::Set; readWriteImpl = ReadWriteImplKind::MaterializeToTemporary; diff --git a/test/SILGen/keypaths_objc.swift b/test/SILGen/keypaths_objc.swift index d7e817b582768..d145bc106c723 100644 --- a/test/SILGen/keypaths_objc.swift +++ b/test/SILGen/keypaths_objc.swift @@ -158,32 +158,43 @@ func dynamicMemberLookupMixedKeypaths(foo: DynamicClass) { _ = foo.bar.foo.nonobjc.y } +@objc class Object: NSObject { + var name: String + init(name: String) { + self.name = name + } +} @objc protocol ObjCProtoOptional { - @objc optional var optionalProperty: Bool { get } + @objc optional var object: Object { get set } - @objc optional subscript(_: Int) -> Bool { get } + @objc optional subscript(_: Bool) -> Object { get set } } // CHECK-LABEL: sil hidden [ossa] @{{.*}}0B28ProtocolOptionalRequirementsyyF -// CHECK: keypath $KeyPath>, (objc "optionalProperty"; root $ObjCProtoOptional; gettable_property $Optional, id #ObjCProtoOptional.optionalProperty!getter.foreign : (Self) -> () -> Bool, getter @$[[PROP_GETTER:[_a-zA-Z0-9]+]] -// CHECK: keypath $KeyPath>, (root $ObjCProtoOptional; gettable_property $Optional, id #ObjCProtoOptional.subscript!getter.foreign : (Self) -> (Int) -> Bool, getter @$[[SUBSCR_GETTER:[_a-zA-Z0-9]+]] +// CHECK: keypath $KeyPath>, (objc "object"; root $ObjCProtoOptional; gettable_property $Optional, id #ObjCProtoOptional.object!getter.foreign : (Self) -> () -> Object, getter @$[[PROP_GETTER:[_a-zA-Z0-9]+]] +// CHECK: keypath $KeyPath>, (root $ObjCProtoOptional; gettable_property $Optional, id #ObjCProtoOptional.subscript!getter.foreign : (Self) -> (Bool) -> Object, getter @$[[SUBSCR_GETTER:[_a-zA-Z0-9]+]] +// CHECK: keypath $ReferenceWritableKeyPath, (root $ObjCProtoOptional; gettable_property $Optional, id #ObjCProtoOptional.object!getter.foreign : (Self) -> () -> Object, getter @$[[PROP_GETTER]] : {{.*}}; 
optional_force : $Object; settable_property $String, +// CHECK: keypath $ReferenceWritableKeyPath, (root $ObjCProtoOptional; gettable_property $Optional, id #ObjCProtoOptional.subscript!getter.foreign : (Self) -> (Bool) -> Object, getter @$[[SUBSCR_GETTER]] : {{.*}}; optional_force : $Object; settable_property $String, // CHECK: } // end sil function '${{.*}}0B28ProtocolOptionalRequirementsyyF' // -// CHECK: sil shared [thunk] [ossa] @$[[PROP_GETTER]] : $@convention(thin) (@in_guaranteed ObjCProtoOptional) -> @out Optional { +// CHECK: sil shared [thunk] [ossa] @$[[PROP_GETTER]] : $@convention(thin) (@in_guaranteed ObjCProtoOptional) -> @out Optional { // CHECK: [[BASE:%[0-9]+]] = open_existential_ref {{%[0-9]+}} : $ObjCProtoOptional to $[[OPENED_TY:@opened\("[-A-F0-9]+"\) ObjCProtoOptional]] -// CHECK: dynamic_method_br [[BASE]] : $[[OPENED_TY]], #ObjCProtoOptional.optionalProperty!getter.foreign, bb1 -// CHECK: bb1({{%[0-9]+}} : $@convention(objc_method) ([[OPENED_TY]]) -> ObjCBool) +// CHECK: dynamic_method_br [[BASE]] : $[[OPENED_TY]], #ObjCProtoOptional.object!getter.foreign, bb1 +// CHECK: bb1({{%[0-9]+}} : $@convention(objc_method) ([[OPENED_TY]]) -> @autoreleased Object) // CHECK: } // end sil function '$[[PROP_GETTER]]' // -// CHECK: sil shared [thunk] [ossa] @$[[SUBSCR_GETTER]] : $@convention(thin) (@in_guaranteed ObjCProtoOptional, UnsafeRawPointer) -> @out Optional { +// CHECK: sil shared [thunk] [ossa] @$[[SUBSCR_GETTER]] : $@convention(thin) (@in_guaranteed ObjCProtoOptional, UnsafeRawPointer) -> @out Optional { // CHECK: [[BASE:%[0-9]+]] = open_existential_ref {{%[0-9]+}} : $ObjCProtoOptional to $[[OPENED_TY:@opened\("[-A-F0-9]+"\) ObjCProtoOptional]] -// CHECK: [[INDEX:%[0-9]+]] = load [trivial] {{%[0-9]+}} : $*Int +// CHECK: [[INDEX:%[0-9]+]] = load [trivial] {{%[0-9]+}} : $*Bool // CHECK: dynamic_method_br [[BASE]] : $[[OPENED_TY]], #ObjCProtoOptional.subscript!getter.foreign, bb1, bb2 -// CHECK: bb1({{%[0-9]+}} : $@convention(objc_method) (Int, 
[[OPENED_TY]]) -> ObjCBool): -// CHECK: %17 = apply {{%[0-9]+}}([[INDEX]]) : $@callee_guaranteed (Int) -> Bool +// CHECK: bb1({{%[0-9]+}} : $@convention(objc_method) (ObjCBool, [[OPENED_TY]]) -> @autoreleased Object): +// CHECK: %17 = apply {{%[0-9]+}}([[INDEX]]) : $@callee_guaranteed (Bool) -> @owned Object // CHECK: bb2: // CHECK: } // end sil function '$[[SUBSCR_GETTER]]' func objcProtocolOptionalRequirements() { - _ = \ObjCProtoOptional.optionalProperty - _ = \ObjCProtoOptional.[0] + _ = \ObjCProtoOptional.object + _ = \ObjCProtoOptional.[true] + + _ = \ObjCProtoOptional.object!.name + _ = \ObjCProtoOptional.[true]!.name } diff --git a/test/expr/primary/keypath/swift-keypath-objc-optional-component.swift b/test/expr/primary/keypath/swift-keypath-objc-optional-component.swift new file mode 100644 index 0000000000000..4a9ca3c57ab3d --- /dev/null +++ b/test/expr/primary/keypath/swift-keypath-objc-optional-component.swift @@ -0,0 +1,45 @@ +// RUN: %target-typecheck-verify-swift -disable-objc-attr-requires-foundation-module -enable-objc-interop + +@objc class Object { + var name: String + + init(name: String) { + self.name = name + } +} + +@objc protocol P { + @objc optional var object: Object { get set } + + @objc optional subscript(_: Int) -> Object { get set } +} + +func assertExactType(of _: T, is _: T.Type) {} + +// An optional storage component makes the key path read-only... +do { + let kp_property = \P.object + let kp_subscript = \P.[0] + + var p: P + // expected-error@+1 {{cannot assign through subscript: 'kp_property' is a read-only key path}} + p[keyPath: kp_property] = Object(name: "nope") + // expected-error@+1 {{cannot assign through subscript: 'kp_subscript' is a read-only key path}} + p[keyPath: kp_subscript] = Object(name: "nope") + + assertExactType(of: kp_property, is: KeyPath.self) + assertExactType(of: kp_subscript, is: KeyPath.self) +} + +// ...unless a reference-writable component shows up later. 
+do { + let kp_propertyForce_name = \P.object!.name + let kp_subscriptForce_name = \P.[0]!.name + + let p: P + p[keyPath: kp_propertyForce_name] = "yes" + p[keyPath: kp_subscriptForce_name] = "yes" + + assertExactType(of: kp_propertyForce_name, is: ReferenceWritableKeyPath.self) + assertExactType(of: kp_subscriptForce_name, is: ReferenceWritableKeyPath.self) +} From 364dc39c3cf37752499300f282d890ec068f1dee Mon Sep 17 00:00:00 2001 From: Robert Widmann Date: Wed, 13 Apr 2022 17:06:21 -0700 Subject: [PATCH 72/83] Turn on Existential Metatypes These were accidentally left off - turn them on subject to the -enable-parameterized-existential-types flag. --- lib/Sema/TypeCheckType.h | 2 +- .../parameterized_existential_metatypes.swift | 31 +++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 test/Constraints/parameterized_existential_metatypes.swift diff --git a/lib/Sema/TypeCheckType.h b/lib/Sema/TypeCheckType.h index 766b849686631..2cbbfe485ee56 100644 --- a/lib/Sema/TypeCheckType.h +++ b/lib/Sema/TypeCheckType.h @@ -288,11 +288,11 @@ class TypeResolutionOptions { case Context::GenericRequirement: return true; case Context::ExistentialConstraint: + case Context::MetatypeBase: return opts.EnableParameterizedExistentialTypes; case Context::None: case Context::TypeAliasDecl: case Context::GenericTypeAliasDecl: - case Context::MetatypeBase: case Context::InExpression: case Context::ExplicitCastExpr: case Context::ForEachStmt: diff --git a/test/Constraints/parameterized_existential_metatypes.swift b/test/Constraints/parameterized_existential_metatypes.swift new file mode 100644 index 0000000000000..40394685941bf --- /dev/null +++ b/test/Constraints/parameterized_existential_metatypes.swift @@ -0,0 +1,31 @@ +// RUN: %target-typecheck-verify-swift -enable-parameterized-existential-types +// +// FIXME: Merge this file with existential_metatypes.swift once -enable-parameterized-existential-types becomes the default + +protocol P { + 
associatedtype T +} + +protocol Q { + associatedtype T +} + +protocol PP: P { + associatedtype U: P +} + +var qp: (any Q).Type +var pp: (any P).Type = qp // expected-error{{cannot convert value of type '(any Q).Type' to specified type '(any P).Type'}} + +var qt: any Q.Type +qt = qp // expected-error{{cannot assign value of type '(any Q).Type' to type 'any Q.Type'}} +qp = qt // expected-error{{cannot assign value of type 'any Q.Type' to type '(any Q).Type'}} +var pt: any P.Type = qt // expected-error{{cannot convert value of type 'any Q.Type' to specified type 'any P.Type'}} +pt = pp // expected-error{{cannot assign value of type '(any P).Type' to type 'any P.Type'}} +pp = pt // expected-error{{cannot assign value of type 'any P.Type' to type '(any P).Type'}} + +var ppp: (any PP).Type +pp = ppp // expected-error{{cannot assign value of type '(any PP).Type' to type '(any P).Type'}} + +var ppt: any PP.Type +pt = ppt From 52d224c401a831b540d64edaf415d1f404344d08 Mon Sep 17 00:00:00 2001 From: Ben Barham Date: Thu, 14 Apr 2022 11:47:18 -0700 Subject: [PATCH 73/83] [Test] Fix broken IDE test `complete_in_result_builder.swift` was modified in 0b9644a0d4884268fd2a4f4558cea4fb737f9529 but ended up being merged *after* 5d01a097e1e70963929c995ace134d15cb11a548 which removed the `Identical` type relation entirely. Replace the two added test cases with `Convertible` instead. 
--- test/IDE/complete_in_result_builder.swift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/IDE/complete_in_result_builder.swift b/test/IDE/complete_in_result_builder.swift index ed09edecc6a02..cf72094da006e 100644 --- a/test/IDE/complete_in_result_builder.swift +++ b/test/IDE/complete_in_result_builder.swift @@ -36,7 +36,7 @@ func testGlobalLookup() { @TupleBuilder var x1 { #^GLOBAL_LOOKUP^# // GLOBAL_LOOKUP: Begin completions - // GLOBAL_LOOKUP: Decl[GlobalVar]/CurrModule/TypeRelation[Identical]: MyConstantString[#String#]; + // GLOBAL_LOOKUP: Decl[GlobalVar]/CurrModule/TypeRelation[Convertible]: MyConstantString[#String#]; // GLOBAL_LOOKUP: End completions } @@ -81,7 +81,7 @@ func testStaticMemberLookup() { @TupleBuilder var x1 { StringFactory.#^COMPLETE_STATIC_MEMBER^# // COMPLETE_STATIC_MEMBER: Begin completions - // COMPLETE_STATIC_MEMBER: Decl[StaticMethod]/CurrNominal/TypeRelation[Identical]: makeString({#x: String#})[#String#]; + // COMPLETE_STATIC_MEMBER: Decl[StaticMethod]/CurrNominal/TypeRelation[Convertible]: makeString({#x: String#})[#String#]; // COMPLETE_STATIC_MEMBER: End completions } From e52ccf4c0de77b2285e2d65916e62f6161d405e4 Mon Sep 17 00:00:00 2001 From: Ben Barham Date: Thu, 14 Apr 2022 13:14:03 -0700 Subject: [PATCH 74/83] [SymbolGraph] Initialize newly added IncludeClangDocs field This was added recently but not initialized in symbolgraph-extract. We never output Clang imported symbols from this tool, so always passing false is fine for now. 
--- lib/DriverTool/swift_symbolgraph_extract_main.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/DriverTool/swift_symbolgraph_extract_main.cpp b/lib/DriverTool/swift_symbolgraph_extract_main.cpp index 9d13e3c1c0411..4fc9dfe0febee 100644 --- a/lib/DriverTool/swift_symbolgraph_extract_main.cpp +++ b/lib/DriverTool/swift_symbolgraph_extract_main.cpp @@ -170,6 +170,7 @@ int swift_symbolgraph_extract_main(ArrayRef Args, ParsedArgs.hasArg(OPT_v), ParsedArgs.hasArg(OPT_skip_inherited_docs), ParsedArgs.hasArg(OPT_include_spi_symbols), + /*IncludeClangDocs=*/false, }; if (auto *A = ParsedArgs.getLastArg(OPT_minimum_access_level)) { From d8ade63521193950869d226555404c6a9ff96179 Mon Sep 17 00:00:00 2001 From: Pavel Yaskevich Date: Thu, 14 Apr 2022 13:37:59 -0700 Subject: [PATCH 75/83] [TypeChecker] NFC: Un-XFAIL SwiftUI test-case which has been fixed Test-case should no longer produce "type of expression is ambiguous" fallback diagnostic. Resolves: rdar://66110075 --- validation-test/Sema/SwiftUI/rdar57201781.swift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validation-test/Sema/SwiftUI/rdar57201781.swift b/validation-test/Sema/SwiftUI/rdar57201781.swift index 4033ed3cc61b0..21d919d40aa2d 100644 --- a/validation-test/Sema/SwiftUI/rdar57201781.swift +++ b/validation-test/Sema/SwiftUI/rdar57201781.swift @@ -1,5 +1,5 @@ // RUN: %target-typecheck-verify-swift -target x86_64-apple-macosx10.15 -swift-version 5 -// REQUIRES: rdar66110075 + // REQUIRES: objc_interop // REQUIRES: OS=macosx @@ -9,7 +9,7 @@ struct ContentView : View { @State var foo: [String] = Array(repeating: "", count: 5) var body: some View { - VStack { // expected-error{{type of expression is ambiguous without more context}} + VStack { HStack { Text("") TextFi // expected-error {{cannot find 'TextFi' in scope}} From 3f24533da154acddeb9b2c186a2f95a8558ee259 Mon Sep 17 00:00:00 2001 From: Jonathan Grynspan Date: Wed, 13 Apr 2022 23:15:38 -0400 Subject: [PATCH 76/83] Ensure 
AtomicWaitQueue allocates its inner queues in an aligned fashion even when the compiler does not support the C++17 over-aligned new feature (and avoid using new anyway since it might be overridden by something else in the process.) --- include/swift/Runtime/AtomicWaitQueue.h | 12 +++---- include/swift/Runtime/HeapObject.h | 45 +++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 8 deletions(-) diff --git a/include/swift/Runtime/AtomicWaitQueue.h b/include/swift/Runtime/AtomicWaitQueue.h index cc0d99890d571..0985067828b25 100644 --- a/include/swift/Runtime/AtomicWaitQueue.h +++ b/include/swift/Runtime/AtomicWaitQueue.h @@ -20,6 +20,7 @@ #define SWIFT_RUNTIME_ATOMICWAITQUEUE_H #include "swift/Runtime/Heap.h" +#include "swift/Runtime/HeapObject.h" #include "swift/Runtime/Mutex.h" #include @@ -84,7 +85,7 @@ class AtomicWaitQueue { /// global lock and while *not* holding the wait queue lock. void release_locked() { if (referenceCount == 1) { - delete &asImpl(); + swift_cxx_deleteObject(&asImpl()); } else { referenceCount--; } @@ -211,7 +212,7 @@ class AtomicWaitQueue { // If we created the queue but never published it, destroy it. 
if (CurrentQueue) { CurrentQueue->WaitQueueLock.unlock(); - delete CurrentQueue; + swift_cxx_deleteObject(CurrentQueue); } } @@ -425,12 +426,7 @@ class AtomicWaitQueue { private: template static Impl *createNewQueue(Args &&...args) { -#if !defined(__cpp_aligned_new) - static_assert(!swift::requires_aligned_alloc::value>::value || - is_aligned_alloc_aware::value, - "type is over-aligned for non-alignment aware operator new"); -#endif - auto queue = new Impl(std::forward(args)...); + auto queue = swift_cxx_newObject(std::forward(args)...); queue->WaitQueueLock.lock(); return queue; } diff --git a/include/swift/Runtime/HeapObject.h b/include/swift/Runtime/HeapObject.h index efb50b3f93c57..7825484670df1 100644 --- a/include/swift/Runtime/HeapObject.h +++ b/include/swift/Runtime/HeapObject.h @@ -19,6 +19,8 @@ #include #include +#include +#include #include "swift/Runtime/Config.h" #if SWIFT_OBJC_INTEROP @@ -131,6 +133,49 @@ void *swift_slowAlloc(size_t bytes, size_t alignMask); SWIFT_RUNTIME_EXPORT void swift_slowDealloc(void *ptr, size_t bytes, size_t alignMask); +/// Allocate and construct an instance of type \c T. +/// +/// \param args The arguments to pass to the constructor for \c T. +/// +/// \returns A pointer to a new, fully constructed instance of \c T. This +/// function never returns \c nullptr. The caller is responsible for +/// eventually destroying the resulting object by passing it to +/// \c swift_cxx_deleteObject(). +/// +/// This function avoids the use of the global \c operator \c new (which may be +/// overridden by other code in a process) in favor of calling +/// \c swift_slowAlloc() and constructing the new object with placement new. +/// +/// This function is capable of returning well-aligned memory even on platforms +/// that do not implement the C++17 "over-aligned new" feature. +template +static inline T *swift_cxx_newObject(Args &&... 
args) { + auto result = reinterpret_cast(swift_slowAlloc(sizeof(T), + alignof(T) - 1)); + ::new (result) T(std::forward(args)...); + return result; +} + +/// Destruct and deallocate an instance of type \c T. +/// +/// \param ptr A pointer to an instance of type \c T previously created with a +/// call to \c swift_cxx_newObject(). +/// +/// This function avoids the use of the global \c operator \c delete (which may +/// be overridden by other code in a process) in favor of directly calling the +/// destructor for \a *ptr and then freeing its memory by calling +/// \c swift_slowDealloc(). +/// +/// The effect of passing a pointer to this function that was \em not returned +/// from \c swift_cxx_newObject() is undefined. +template +static inline void swift_cxx_deleteObject(T *ptr) { + if (ptr) { + ptr->~T(); + swift_slowDealloc(ptr, sizeof(T), alignof(T) - 1); + } +} + /// Atomically increments the retain count of an object. /// /// \param object - may be null, in which case this is a no-op From a8b0ee24dcb2a8758b4854df226f17a203821d08 Mon Sep 17 00:00:00 2001 From: Saleem Abdulrasool Date: Thu, 14 Apr 2022 14:20:03 -0700 Subject: [PATCH 77/83] runtime: blanket application of namespacing and inclusion of `new` Apply a blanket pass of including `new` for the placement new allocation and namespacing the call to the global placement new allocator. This should repair the Android ARMv7 builds. 
--- stdlib/public/Concurrency/Actor.cpp | 3 ++- stdlib/public/Concurrency/AsyncLet.cpp | 2 +- stdlib/public/Concurrency/Task.cpp | 11 ++++++----- stdlib/public/Concurrency/TaskGroup.cpp | 3 ++- stdlib/public/Concurrency/TaskLocal.cpp | 3 ++- stdlib/public/Concurrency/TaskPrivate.h | 5 +++-- stdlib/public/runtime/AccessibleFunction.cpp | 3 ++- stdlib/public/runtime/AnyHashableSupport.cpp | 4 +++- stdlib/public/runtime/AutoDiffSupport.cpp | 4 +++- stdlib/public/runtime/HeapObject.cpp | 3 ++- stdlib/public/runtime/KeyPaths.cpp | 3 ++- stdlib/public/runtime/Metadata.cpp | 4 ++-- stdlib/public/runtime/MetadataLookup.cpp | 5 +++-- stdlib/public/runtime/ProtocolConformance.cpp | 3 ++- stdlib/public/runtime/StackAllocator.h | 7 ++++--- stdlib/public/runtime/SwiftRT-COFF.cpp | 2 +- stdlib/public/runtime/SwiftRT-ELF.cpp | 2 +- stdlib/public/runtime/SwiftValue.mm | 4 +++- 18 files changed, 44 insertions(+), 27 deletions(-) diff --git a/stdlib/public/Concurrency/Actor.cpp b/stdlib/public/Concurrency/Actor.cpp index 1fa854444e8fa..f45f8bb4082fc 100644 --- a/stdlib/public/Concurrency/Actor.cpp +++ b/stdlib/public/Concurrency/Actor.cpp @@ -17,6 +17,7 @@ #include "swift/Runtime/Concurrency.h" #include +#include #ifdef _WIN32 // On Windows, an include below triggers an indirect include of minwindef.h @@ -1217,7 +1218,7 @@ void DefaultActorImpl::scheduleActorProcessJob(JobPriority priority, bool useInl if (useInlineJob) { if (JobStorageHeapObject.metadata != nullptr) JobStorage.~ProcessInlineJob(); - job = new (&JobStorage) ProcessInlineJob(priority); + job = ::new (&JobStorage) ProcessInlineJob(priority); } else { assert(false && "Should not be here - we don't have support for any OOL actor process jobs yet"); // TODO (rokhinip): Don't we need to take a +1 per ref count rules specified? 
diff --git a/stdlib/public/Concurrency/AsyncLet.cpp b/stdlib/public/Concurrency/AsyncLet.cpp index 70e2fd833de47..6ac84a265caf3 100644 --- a/stdlib/public/Concurrency/AsyncLet.cpp +++ b/stdlib/public/Concurrency/AsyncLet.cpp @@ -141,7 +141,7 @@ static AsyncLetImpl *asImpl(const AsyncLet *alet) { void swift::asyncLet_addImpl(AsyncTask *task, AsyncLet *asyncLet, bool didAllocateInParentTask) { - AsyncLetImpl *impl = new (asyncLet) AsyncLetImpl(task); + AsyncLetImpl *impl = ::new (asyncLet) AsyncLetImpl(task); impl->setDidAllocateFromParentTask(didAllocateInParentTask); auto record = impl->getTaskRecord(); diff --git a/stdlib/public/Concurrency/Task.cpp b/stdlib/public/Concurrency/Task.cpp index 1bdbea0bc1b6b..1d232e1ca2245 100644 --- a/stdlib/public/Concurrency/Task.cpp +++ b/stdlib/public/Concurrency/Task.cpp @@ -28,6 +28,7 @@ #include "Debug.h" #include "Error.h" #include +#include #if SWIFT_CONCURRENCY_ENABLE_DISPATCH #include @@ -770,20 +771,20 @@ static AsyncTaskAndContext swift_task_create_commonImpl( // Initialize the child fragment if applicable. if (parent) { auto childFragment = task->childFragment(); - new (childFragment) AsyncTask::ChildFragment(parent); + ::new (childFragment) AsyncTask::ChildFragment(parent); } // Initialize the group child fragment if applicable. if (group) { auto groupChildFragment = task->groupChildFragment(); - new (groupChildFragment) AsyncTask::GroupChildFragment(group); + ::new (groupChildFragment) AsyncTask::GroupChildFragment(group); } // Initialize the future fragment if applicable. if (futureResultType) { assert(task->isFuture()); auto futureFragment = task->futureFragment(); - new (futureFragment) FutureFragment(futureResultType); + ::new (futureFragment) FutureFragment(futureResultType); // Set up the context for the future so there is no error, and a successful // result will be written into the future fragment's storage. 
@@ -1202,7 +1203,7 @@ swift_task_addCancellationHandlerImpl( void *allocation = swift_task_alloc(sizeof(CancellationNotificationStatusRecord)); auto unsigned_handler = swift_auth_code(handler, 3848); - auto *record = new (allocation) + auto *record = ::new (allocation) CancellationNotificationStatusRecord(unsigned_handler, context); bool fireHandlerNow = false; @@ -1237,7 +1238,7 @@ swift_task_createNullaryContinuationJobImpl( void *allocation = swift_task_alloc(sizeof(NullaryContinuationJob)); auto *job = - new (allocation) NullaryContinuationJob( + ::new (allocation) NullaryContinuationJob( swift_task_getCurrent(), static_cast(priority), continuation); diff --git a/stdlib/public/Concurrency/TaskGroup.cpp b/stdlib/public/Concurrency/TaskGroup.cpp index fa29de226f1e7..60ff5d8670c21 100644 --- a/stdlib/public/Concurrency/TaskGroup.cpp +++ b/stdlib/public/Concurrency/TaskGroup.cpp @@ -33,6 +33,7 @@ #include "string" #include "queue" // TODO: remove and replace with usage of our mpsc queue #include +#include #include #if SWIFT_CONCURRENCY_ENABLE_DISPATCH #include @@ -469,7 +470,7 @@ SWIFT_CC(swift) static void swift_taskGroup_initializeImpl(TaskGroup *group, const Metadata *T) { SWIFT_TASK_DEBUG_LOG("creating task group = %p", group); - TaskGroupImpl *impl = new (group) TaskGroupImpl(T); + TaskGroupImpl *impl = ::new (group) TaskGroupImpl(T); auto record = impl->getTaskRecord(); assert(impl == record && "the group IS the task record"); diff --git a/stdlib/public/Concurrency/TaskLocal.cpp b/stdlib/public/Concurrency/TaskLocal.cpp index 2961403988ce8..dec28b157a012 100644 --- a/stdlib/public/Concurrency/TaskLocal.cpp +++ b/stdlib/public/Concurrency/TaskLocal.cpp @@ -24,6 +24,7 @@ #include "swift/ABI/Metadata.h" #include "llvm/ADT/PointerIntPair.h" #include "TaskPrivate.h" +#include #include #if SWIFT_STDLIB_HAS_ASL @@ -207,7 +208,7 @@ TaskLocal::Item::createLink(AsyncTask *task, size_t amountToAllocate = Item::itemSize(valueType); void *allocation = task ? 
_swift_task_alloc_specific(task, amountToAllocate) : malloc(amountToAllocate); - Item *item = new (allocation) Item(key, valueType); + Item *item = ::new (allocation) Item(key, valueType); auto next = task ? task->_private().Local.head : FallbackTaskLocalStorage::get()->head; diff --git a/stdlib/public/Concurrency/TaskPrivate.h b/stdlib/public/Concurrency/TaskPrivate.h index ed7daaaf0c81b..f5429db28330b 100644 --- a/stdlib/public/Concurrency/TaskPrivate.h +++ b/stdlib/public/Concurrency/TaskPrivate.h @@ -29,6 +29,7 @@ #include "swift/Runtime/Heap.h" #include "swift/Runtime/HeapObject.h" #include +#include #define SWIFT_FATAL_ERROR swift_Concurrency_fatalError #include "../runtime/StackAllocator.h" @@ -655,11 +656,11 @@ AsyncTask::OpaquePrivateStorage::get() const { return reinterpret_cast(*this); } inline void AsyncTask::OpaquePrivateStorage::initialize(JobPriority basePri) { - new (this) PrivateStorage(basePri); + ::new (this) PrivateStorage(basePri); } inline void AsyncTask::OpaquePrivateStorage::initializeWithSlab( JobPriority basePri, void *slab, size_t slabCapacity) { - new (this) PrivateStorage(basePri, slab, slabCapacity); + ::new (this) PrivateStorage(basePri, slab, slabCapacity); } inline void AsyncTask::OpaquePrivateStorage::complete(AsyncTask *task) { get().complete(task); diff --git a/stdlib/public/runtime/AccessibleFunction.cpp b/stdlib/public/runtime/AccessibleFunction.cpp index 95e9678323bf6..e71cf09206af8 100644 --- a/stdlib/public/runtime/AccessibleFunction.cpp +++ b/stdlib/public/runtime/AccessibleFunction.cpp @@ -23,6 +23,7 @@ #include "swift/Runtime/Metadata.h" #include +#include using namespace swift; @@ -153,7 +154,7 @@ swift::runtime::swift_findAccessibleFunction(const char *targetNameStart, S.Cache.getOrInsert( name, [&](AccessibleFunctionCacheEntry *entry, bool created) { if (created) - new (entry) AccessibleFunctionCacheEntry{name, record}; + ::new (entry) AccessibleFunctionCacheEntry{name, record}; return true; }); } diff --git 
a/stdlib/public/runtime/AnyHashableSupport.cpp b/stdlib/public/runtime/AnyHashableSupport.cpp index 65d9e810bdbcf..3002ccca53b9f 100644 --- a/stdlib/public/runtime/AnyHashableSupport.cpp +++ b/stdlib/public/runtime/AnyHashableSupport.cpp @@ -21,6 +21,8 @@ #include "swift/Runtime/Debug.h" #include "swift/Runtime/HeapObject.h" +#include + using namespace swift; using namespace swift::hashable_support; @@ -103,7 +105,7 @@ findHashableBaseTypeImpl(const Metadata *type) { HashableConformances.getOrInsert(key, [&](HashableConformanceEntry *entry, bool created) { if (created) - new (entry) HashableConformanceEntry(key, baseTypeThatConformsToHashable); + ::new (entry) HashableConformanceEntry(key, baseTypeThatConformsToHashable); return true; // Keep the new entry. }); return baseTypeThatConformsToHashable; diff --git a/stdlib/public/runtime/AutoDiffSupport.cpp b/stdlib/public/runtime/AutoDiffSupport.cpp index 467ae2b3e37ce..6e8ec4d1d8190 100644 --- a/stdlib/public/runtime/AutoDiffSupport.cpp +++ b/stdlib/public/runtime/AutoDiffSupport.cpp @@ -14,6 +14,8 @@ #include "swift/ABI/Metadata.h" #include "swift/Runtime/HeapObject.h" +#include + using namespace swift; using namespace llvm; @@ -59,7 +61,7 @@ AutoDiffLinearMapContext *swift::swift_autoDiffCreateLinearMapContext( sizeof(AutoDiffLinearMapContext), alignof(AutoDiffLinearMapContext)) + topLevelLinearMapStructSize; auto *buffer = (AutoDiffLinearMapContext *)malloc(allocationSize); - return new (buffer) AutoDiffLinearMapContext; + return ::new (buffer) AutoDiffLinearMapContext; } void *swift::swift_autoDiffProjectTopLevelSubcontext( diff --git a/stdlib/public/runtime/HeapObject.cpp b/stdlib/public/runtime/HeapObject.cpp index 74c13899e52db..32f5b6aaf3021 100644 --- a/stdlib/public/runtime/HeapObject.cpp +++ b/stdlib/public/runtime/HeapObject.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #include #include "../SwiftShims/GlobalObjects.h" #include "../SwiftShims/RuntimeShims.h" @@ -124,7 +125,7 @@ static 
HeapObject *_swift_allocObject_(HeapMetadata const *metadata, // NOTE: this relies on the C++17 guaranteed semantics of no null-pointer // check on the placement new allocator which we have observed on Windows, // Linux, and macOS. - new (object) HeapObject(metadata); + ::new (object) HeapObject(metadata); // If leak tracking is enabled, start tracking this object. SWIFT_LEAKS_START_TRACKING_OBJECT(object); diff --git a/stdlib/public/runtime/KeyPaths.cpp b/stdlib/public/runtime/KeyPaths.cpp index 4afed3e71ee1a..d504b2374b0ff 100644 --- a/stdlib/public/runtime/KeyPaths.cpp +++ b/stdlib/public/runtime/KeyPaths.cpp @@ -14,6 +14,7 @@ #include "swift/Runtime/Metadata.h" #include #include +#include using namespace swift; @@ -98,7 +99,7 @@ namespace { static OpaqueValue *allocateIn(const Metadata *type, YieldOnceBuffer *buffer) { auto *temp = - new (reinterpret_cast(buffer)) YieldOnceTemporary(type); + ::new (reinterpret_cast(buffer)) YieldOnceTemporary(type); return type->allocateBufferIn(&temp->Buffer); } diff --git a/stdlib/public/runtime/Metadata.cpp b/stdlib/public/runtime/Metadata.cpp index fda3470bc8e46..bca2539c68684 100644 --- a/stdlib/public/runtime/Metadata.cpp +++ b/stdlib/public/runtime/Metadata.cpp @@ -2385,7 +2385,7 @@ static ValueWitnessTable *getMutableVWTableForInit(StructMetadata *self, // Otherwise, allocate permanent memory for it and copy the existing table. 
void *memory = allocateMetadata(sizeof(ValueWitnessTable), alignof(ValueWitnessTable)); - auto newTable = new (memory) ValueWitnessTable(*oldTable); + auto newTable = ::new (memory) ValueWitnessTable(*oldTable); // If we ever need to check layout-completeness asynchronously from // initialization, we'll need this to be a store-release (and rely on @@ -4650,7 +4650,7 @@ static const WitnessTable *_getForeignWitnessTable( ForeignWitnessTables.getOrInsert( key, [&](ForeignWitnessTableCacheEntry *entryPtr, bool created) { if (created) - new (entryPtr) + ::new (entryPtr) ForeignWitnessTableCacheEntry(key, witnessTableCandidate); result = entryPtr->data; return true; diff --git a/stdlib/public/runtime/MetadataLookup.cpp b/stdlib/public/runtime/MetadataLookup.cpp index d0815b7e04736..fde4220c496ee 100644 --- a/stdlib/public/runtime/MetadataLookup.cpp +++ b/stdlib/public/runtime/MetadataLookup.cpp @@ -38,6 +38,7 @@ #include #include #include +#include using namespace swift; using namespace Demangle; @@ -774,7 +775,7 @@ _findContextDescriptor(Demangle::NodePointer node, *entry, bool created) { if (created) - new (entry) NominalTypeDescriptorCacheEntry{mangledName, foundContext}; + ::new (entry) NominalTypeDescriptorCacheEntry{mangledName, foundContext}; return true; }); @@ -931,7 +932,7 @@ _findProtocolDescriptor(NodePointer node, *entry, bool created) { if (created) - new (entry) ProtocolDescriptorCacheEntry{mangledName, foundProtocol}; + ::new (entry) ProtocolDescriptorCacheEntry{mangledName, foundProtocol}; return true; }); } diff --git a/stdlib/public/runtime/ProtocolConformance.cpp b/stdlib/public/runtime/ProtocolConformance.cpp index c3ebd11cbf44b..856132b92452d 100644 --- a/stdlib/public/runtime/ProtocolConformance.cpp +++ b/stdlib/public/runtime/ProtocolConformance.cpp @@ -31,6 +31,7 @@ #include "ImageInspection.h" #include "Private.h" +#include #include #if __has_include() @@ -510,7 +511,7 @@ struct ConformanceState { SectionsToScan.snapshot().count() != 
sectionsCount) return false; // abandon the new entry - new (entry) ConformanceCacheEntry( + ::new (entry) ConformanceCacheEntry( ConformanceCacheKey(type, proto), witness); return true; // keep the new entry }); diff --git a/stdlib/public/runtime/StackAllocator.h b/stdlib/public/runtime/StackAllocator.h index 2f23ff41cf1c5..9ab0c84030722 100644 --- a/stdlib/public/runtime/StackAllocator.h +++ b/stdlib/public/runtime/StackAllocator.h @@ -25,6 +25,7 @@ #include "swift/Runtime/Debug.h" #include "llvm/Support/Alignment.h" #include +#include // Notes: swift::fatalError is not shared between libswiftCore and libswift_Concurrency // and libswift_Concurrency uses swift_Concurrency_fatalError instead. @@ -170,7 +171,7 @@ class StackAllocator { assert(llvm::isAligned(llvm::Align(alignment), alignedSize)); assert(canAllocate(alignedSize)); void *buffer = getAddr(currentOffset); - auto *allocation = new (buffer) Allocation(lastAllocation, this); + auto *allocation = ::new (buffer) Allocation(lastAllocation, this); currentOffset += Allocation::includingHeader(alignedSize); if (guardAllocations) { uintptr_t *endOfCurrentAllocation = (uintptr_t *)getAddr(currentOffset); @@ -251,7 +252,7 @@ class StackAllocator { size_t capacity = std::max(SlabCapacity, Allocation::includingHeader(size)); void *slabBuffer = malloc(Slab::includingHeader(capacity)); - Slab *newSlab = new (slabBuffer) Slab(capacity); + Slab *newSlab = ::new (slabBuffer) Slab(capacity); if (slab) slab->next = newSlab; else @@ -292,7 +293,7 @@ class StackAllocator { char *end = (char *)firstSlabBuffer + bufferCapacity; assert(start + Slab::headerSize() <= end && "buffer for first slab too small"); - firstSlab = new (start) Slab(end - start - Slab::headerSize()); + firstSlab = ::new (start) Slab(end - start - Slab::headerSize()); firstSlabIsPreallocated = true; numAllocatedSlabs = 0; } diff --git a/stdlib/public/runtime/SwiftRT-COFF.cpp b/stdlib/public/runtime/SwiftRT-COFF.cpp index 61fc81b3c6fc7..5f94fcbe25bb3 100644 
--- a/stdlib/public/runtime/SwiftRT-COFF.cpp +++ b/stdlib/public/runtime/SwiftRT-COFF.cpp @@ -65,7 +65,7 @@ static void swift_image_constructor() { { reinterpret_cast(&__start_##name) + sizeof(__start_##name), \ reinterpret_cast(&__stop_##name) - reinterpret_cast(&__start_##name) - sizeof(__start_##name) } - new (&sections) swift::MetadataSections { + ::new (&sections) swift::MetadataSections { swift::CurrentSectionMetadataVersion, { __ImageBase }, diff --git a/stdlib/public/runtime/SwiftRT-ELF.cpp b/stdlib/public/runtime/SwiftRT-ELF.cpp index 577843e449714..0c2253d50f93b 100644 --- a/stdlib/public/runtime/SwiftRT-ELF.cpp +++ b/stdlib/public/runtime/SwiftRT-ELF.cpp @@ -56,7 +56,7 @@ static void swift_image_constructor() { { reinterpret_cast(&__start_##name), \ static_cast(&__stop_##name - &__start_##name) } - new (&sections) swift::MetadataSections { + ::new (&sections) swift::MetadataSections { swift::CurrentSectionMetadataVersion, { __dso_handle }, diff --git a/stdlib/public/runtime/SwiftValue.mm b/stdlib/public/runtime/SwiftValue.mm index b7ea34b864a61..31b8fc3561083 100644 --- a/stdlib/public/runtime/SwiftValue.mm +++ b/stdlib/public/runtime/SwiftValue.mm @@ -34,6 +34,8 @@ #include #include +#include + using namespace swift; using namespace swift::hashable_support; @@ -196,7 +198,7 @@ static size_t getSwiftValuePayloadAlignMask(const Metadata *type) { */ auto header = getSwiftValueHeader(instance); - new (header) SwiftValueHeader(); + ::new (header) SwiftValueHeader(); header->type = srcType; auto payload = getSwiftValuePayload(instance, alignMask); From a2328b084352b9c46a73609e39e6d19d4a1940c2 Mon Sep 17 00:00:00 2001 From: Holly Borla Date: Thu, 14 Apr 2022 16:00:45 -0700 Subject: [PATCH 78/83] [Type Resolution] Resolve (any P.Type).Type as the metatype of an existential metatype. Until we model ExistentialMetatypeType as ExistentialType(MetatypeType), type resolution needs to look through the instance type repr when resolving a metatype.
Otherwise, there's no way to distinguish between P.Type.Type, which is an existential metatype, and (any P.Type).Type, which is the static metatype of an existential metatype. --- lib/Sema/TypeCheckType.cpp | 24 ++++++++++++++++++++++++ test/type/explicit_existential.swift | 6 ++++++ 2 files changed, 30 insertions(+) diff --git a/lib/Sema/TypeCheckType.cpp b/lib/Sema/TypeCheckType.cpp index bafbcb705ce1b..52290e3ce03b3 100644 --- a/lib/Sema/TypeCheckType.cpp +++ b/lib/Sema/TypeCheckType.cpp @@ -4008,6 +4008,30 @@ NeverNullType TypeResolver::resolveMetatypeType(MetatypeTypeRepr *repr, NeverNullType TypeResolver::buildMetatypeType(MetatypeTypeRepr *repr, Type instanceType, Optional storedRepr) { + // If the instance type is an existential metatype, figure out if + // the syntax is of the form '(any <protocol metatype>).Type'. In + // this case, type resolution should produce the static metatype + // of that existential metatype, versus another existential metatype + // via the old '<protocol metatype>.Type' syntax. + if (instanceType->is()) { + // First, look for the paren type. + auto *tuple = dyn_cast(repr->getBase()); + if (tuple && tuple->isParenType()) { + // Then, look through parens for the 'any' keyword. + auto *element = tuple->getWithoutParens(); + if (auto *existential = dyn_cast(element)) { + // Finally, look for a constraint ending with '.Type'. Assume the + // base is a protocol, otherwise resolveExistentialType would + // have emitted an error message and returned the concrete type + // instead of an existential metatype. + auto *constraint = existential->getConstraint()->getWithoutParens(); + if (isa(constraint)) { + return MetatypeType::get(instanceType, storedRepr); + } + } + } + } + if (instanceType->isAnyExistentialType() && !instanceType->is()) { // TODO: diagnose invalid representations?
diff --git a/test/type/explicit_existential.swift b/test/type/explicit_existential.swift index 77efd8a54426f..67a90ed803b14 100644 --- a/test/type/explicit_existential.swift +++ b/test/type/explicit_existential.swift @@ -314,3 +314,9 @@ func testAnyFixIt() { // expected-error@+1 {{optional 'any' type must be written '(any HasAssoc.Type)?'}}{{10-28=(any HasAssoc.Type)?}} let _: any HasAssoc.Type? = nil } + +func testNestedMetatype() { + let _: (any P.Type).Type = (any P.Type).self + let _: (any (P.Type)).Type = (any P.Type).self + let _: ((any (P.Type))).Type = (any P.Type).self +} From aad67a828b9667db7b68addf21dab9550bdc3205 Mon Sep 17 00:00:00 2001 From: Doug Gregor Date: Thu, 14 Apr 2022 17:25:48 -0700 Subject: [PATCH 79/83] Only run this new test on macOS --- test/ModuleInterface/SilencePreconcurrency.swiftinterface | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/ModuleInterface/SilencePreconcurrency.swiftinterface b/test/ModuleInterface/SilencePreconcurrency.swiftinterface index d3ed5bd250c30..95c160b4eb787 100644 --- a/test/ModuleInterface/SilencePreconcurrency.swiftinterface +++ b/test/ModuleInterface/SilencePreconcurrency.swiftinterface @@ -4,4 +4,6 @@ // RUN: %empty-directory(%t) // RUN: %target-swift-frontend -compile-module-from-interface -o %/t/SilencePreconcurrency.swiftmodule %s -verify +// REQUIRES: OS=macosx + @preconcurrency import Swift From 245321199e327d5e4502a1e5c131ddf3a713207f Mon Sep 17 00:00:00 2001 From: Robert Widmann Date: Thu, 14 Apr 2022 18:38:21 -0700 Subject: [PATCH 80/83] [NFC] Remove Legacy Parser-Based Redeclaration Diagnostics --- include/swift/AST/DiagnosticsCommon.def | 3 --- include/swift/AST/DiagnosticsParse.def | 2 -- include/swift/Parse/Parser.h | 4 +--- lib/Parse/Parser.cpp | 9 --------- 4 files changed, 1 insertion(+), 17 deletions(-) diff --git a/include/swift/AST/DiagnosticsCommon.def b/include/swift/AST/DiagnosticsCommon.def index 14701f6756c5a..e597eb6c8adc1 100644 --- 
a/include/swift/AST/DiagnosticsCommon.def +++ b/include/swift/AST/DiagnosticsCommon.def @@ -38,9 +38,6 @@ ERROR(cannot_parse_group_info_file,none, ERROR(error_no_group_info,none, "no group info found for file: '%0'", (StringRef)) -NOTE(previous_decldef,none, - "previous definition of %0 is here", (DeclBaseName)) - NOTE(brace_stmt_suggest_do,none, "did you mean to use a 'do' statement?", ()) diff --git a/include/swift/AST/DiagnosticsParse.def b/include/swift/AST/DiagnosticsParse.def index e1493f59fff94..fe02f3f491ae2 100644 --- a/include/swift/AST/DiagnosticsParse.def +++ b/include/swift/AST/DiagnosticsParse.def @@ -219,8 +219,6 @@ ERROR(number_cant_start_decl_name,none, (StringRef)) ERROR(expected_identifier_after_case_comma, PointsToFirstBadToken, "expected identifier after comma in enum 'case' declaration", ()) -ERROR(decl_redefinition,none, - "definition conflicts with previous value", ()) ERROR(let_cannot_be_computed_property,none, "'let' declarations cannot be computed properties", ()) ERROR(let_cannot_be_observing_property,none, diff --git a/include/swift/Parse/Parser.h b/include/swift/Parse/Parser.h index d425d1b4f639e..30e938d8c8113 100644 --- a/include/swift/Parse/Parser.h +++ b/include/swift/Parse/Parser.h @@ -784,9 +784,7 @@ class Parser { return diagnose(Tok.getLoc(), Diagnostic(DiagID, std::forward(Args)...)); } - - void diagnoseRedefinition(ValueDecl *Prev, ValueDecl *New); - + /// Add a fix-it to remove the space in consecutive identifiers. /// Add a camel-cased option if it is different than the first option. void diagnoseConsecutiveIDs(StringRef First, SourceLoc FirstLoc, diff --git a/lib/Parse/Parser.cpp b/lib/Parse/Parser.cpp index 56229a73dd653..63dc294bf9749 100644 --- a/lib/Parse/Parser.cpp +++ b/lib/Parse/Parser.cpp @@ -1159,15 +1159,6 @@ Parser::parseList(tok RightK, SourceLoc LeftLoc, SourceLoc &RightLoc, return Status; } -/// diagnoseRedefinition - Diagnose a redefinition error, with a note -/// referring back to the original definition. 
- -void Parser::diagnoseRedefinition(ValueDecl *Prev, ValueDecl *New) { - assert(New != Prev && "Cannot conflict with self"); - diagnose(New->getLoc(), diag::decl_redefinition); - diagnose(Prev->getLoc(), diag::previous_decldef, Prev->getBaseName()); -} - Optional Parser::getStringLiteralIfNotInterpolated(SourceLoc Loc, StringRef DiagText) { From fb4720d3553656f1c454e17bf1560b5385f438e6 Mon Sep 17 00:00:00 2001 From: Pavel Yaskevich Date: Tue, 12 Apr 2022 17:23:06 -0700 Subject: [PATCH 81/83] [BuilderTransform] Replace use of TypeExpr with a special $builderSelf variable For all of the `build*` calls, let's use a special variable declaration `$builderSelf` which refers to a type of the builder used. This allows us to remove hacks related to use of `TypeExpr`. Reference to `$builderSelf` is replaced with `TypeExpr` during solution application when the builder type is completely resolved. --- include/swift/AST/KnownIdentifiers.def | 1 + lib/Sema/BuilderTransform.cpp | 45 ++++++++----------- lib/Sema/CSApply.cpp | 14 ++++++ test/Constraints/result_builder_one_way.swift | 22 ++++----- 4 files changed, 44 insertions(+), 38 deletions(-) diff --git a/include/swift/AST/KnownIdentifiers.def b/include/swift/AST/KnownIdentifiers.def index d33f5e285a198..2da0dc6dbe05d 100644 --- a/include/swift/AST/KnownIdentifiers.def +++ b/include/swift/AST/KnownIdentifiers.def @@ -303,6 +303,7 @@ IDENTIFIER(InvocationEncoder) IDENTIFIER(whenLocal) IDENTIFIER(decodeNextArgument) IDENTIFIER(SerializationRequirement) +IDENTIFIER_WITH_NAME(builderSelf, "$builderSelf") #undef IDENTIFIER #undef IDENTIFIER_ diff --git a/lib/Sema/BuilderTransform.cpp b/lib/Sema/BuilderTransform.cpp index e0f4b6df0b6bb..9df8a83c49b2c 100644 --- a/lib/Sema/BuilderTransform.cpp +++ b/lib/Sema/BuilderTransform.cpp @@ -73,6 +73,10 @@ class BuilderClosureVisitor Identifier buildOptionalId; llvm::SmallDenseMap supportedOps; + /// The variable used as a base for all `build*` operations added + /// by this transform. 
+ VarDecl *builderVar = nullptr; + SkipUnhandledConstructInResultBuilder::UnhandledNode unhandledNode; /// Whether an error occurred during application of the builder closure, @@ -94,32 +98,6 @@ class BuilderClosureVisitor if (!cs) return nullptr; - // FIXME: Setting a base on this expression is necessary in order - // to get diagnostics if something about this builder call fails, - // e.g. if there isn't a matching overload for `buildBlock`. - TypeExpr *typeExpr; - auto simplifiedTy = cs->simplifyType(builderType); - if (!simplifiedTy->hasTypeVariable()) { - typeExpr = TypeExpr::createImplicitHack(loc, simplifiedTy, ctx); - } else if (auto *decl = simplifiedTy->getAnyGeneric()) { - // HACK: If there's not enough information to completely resolve the - // builder type, but we have the base available to us, form an *explicit* - // TypeExpr pointing at it. We cannot form an implicit base without - // a fully-resolved concrete type. Really, whatever we put here has no - // bearing on the generated solution because we're going to use this node - // to stash the builder type and hand it back to the ambient - // constraint system. - typeExpr = TypeExpr::createForDecl(DeclNameLoc(loc), decl, dc); - } else { - // HACK: If there's not enough information in the constraint system, - // create a garbage base type to force it to diagnose - // this as an ambiguous expression. - // FIXME: We can also construct an UnresolvedMemberExpr here instead of - // an UnresolvedDotExpr and get a slightly better diagnostic. 
- typeExpr = TypeExpr::createImplicitHack(loc, ErrorType::get(ctx), ctx); - } - cs->setType(typeExpr, MetatypeType::get(builderType)); - SmallVector args; for (auto i : indices(argExprs)) { auto *expr = argExprs[i]; @@ -128,8 +106,11 @@ class BuilderClosureVisitor args.emplace_back(labelLoc, label, expr); } + auto *baseExpr = new (ctx) DeclRefExpr({builderVar}, DeclNameLoc(loc), + /*isImplicit=*/true); + auto memberRef = new (ctx) UnresolvedDotExpr( - typeExpr, loc, DeclNameRef(fnName), DeclNameLoc(loc), + baseExpr, loc, DeclNameRef(fnName), DeclNameLoc(loc), /*implicit=*/true); memberRef->setFunctionRefKind(FunctionRefKind::SingleApply); @@ -223,6 +204,16 @@ class BuilderClosureVisitor buildOptionalId = ctx.Id_buildOptional; else buildOptionalId = ctx.Id_buildIf; + + // If we are about to generate constraints, let's establish builder + // variable for the base of `build*` calls. + if (cs) { + builderVar = new (ctx) VarDecl( + /*isStatic=*/false, VarDecl::Introducer::Let, + /*nameLoc=*/SourceLoc(), ctx.Id_builderSelf, dc); + builderVar->setImplicit(); + cs->setType(builderVar, MetatypeType::get(cs->simplifyType(builderType))); + } } /// Apply the builder transform to the given statement. diff --git a/lib/Sema/CSApply.cpp b/lib/Sema/CSApply.cpp index 93208e0567b9b..d612b6016b440 100644 --- a/lib/Sema/CSApply.cpp +++ b/lib/Sema/CSApply.cpp @@ -2869,6 +2869,20 @@ namespace { Expr *visitDeclRefExpr(DeclRefExpr *expr) { auto locator = cs.getConstraintLocator(expr); + // Check whether this is a reference to `$builderSelf`, and if so, + // replace it with a type expression with fully resolved type.
+ if (auto *var = dyn_cast(expr->getDecl())) { + auto &ctx = cs.getASTContext(); + if (var->getName() == ctx.Id_builderSelf) { + assert(expr->isImplicit() && var->isImplicit()); + auto builderTy = + solution.getResolvedType(var)->getMetatypeInstanceType(); + + return cs.cacheType( + TypeExpr::createImplicitHack(expr->getLoc(), builderTy, ctx)); + } + } + // Find the overload choice used for this declaration reference. auto selected = solution.getOverloadChoice(locator); return buildDeclRef(selected, expr->getNameLoc(), locator, diff --git a/test/Constraints/result_builder_one_way.swift b/test/Constraints/result_builder_one_way.swift index a45c99cac066a..4764e22092935 100644 --- a/test/Constraints/result_builder_one_way.swift +++ b/test/Constraints/result_builder_one_way.swift @@ -49,17 +49,17 @@ func tuplify(_ collection: C, @TupleBuilder body: (C.Element) } // CHECK: ---Connected components--- -// CHECK-NEXT: 1: $T10 depends on 0 -// CHECK-NEXT: 0: $T1 $T2 $T3 $T5 $T6 $T7 $T8 $T77 $T78 depends on 3 -// CHECK-NEXT: 3: $T12 $T17 $T28 $T42 $T53 $T54 $T55 $T56 $T57 $T58 $T59 $T60 $T61 $T62 $T63 $T64 $T65 $T66 $T68 $T69 $T70 $T71 $T72 $T73 $T74 $T75 $T76 depends on 2, 4, 5, 7, 10 -// CHECK-NEXT: 10: $T48 $T49 $T50 $T51 $T52 depends on 9 -// CHECK-NEXT: 9: $T43 $T44 $T45 $T46 $T47 -// CHECK-NEXT: 7: $T31 $T35 $T36 $T37 $T38 $T39 $T40 $T41 depends on 6, 8 -// CHECK-NEXT: 8: $T32 $T33 $T34 -// CHECK-NEXT: 6: $T30 -// CHECK-NEXT: 5: $T18 $T19 $T20 $T21 $T22 $T23 $T24 $T25 $T26 $T27 -// CHECK-NEXT: 4: $T15 $T16 -// CHECK-NEXT: 2: $T11 +// CHECK-NEXT: 1: $T10 depends on 0 +// CHECK-NEXT: 0: $T1 $T2 $T3 $T5 $T6 $T7 $T8 $T82 $T83 depends on 3 +// CHECK-NEXT: 3: $T12 $T17 $T28 $T43 $T55 $T57 $T58 $T59 $T60 $T61 $T63 $T64 $T65 $T66 $T67 $T68 $T70 $T71 $T73 $T74 $T75 $T76 $T77 $T78 $T79 $T80 $T81 depends on 2, 4, 5, 7, 10 +// CHECK-NEXT: 10: $T49 $T51 $T52 $T53 $T54 depends on 9 +// CHECK-NEXT: 9: $T44 $T45 $T46 $T47 $T48 +// CHECK-NEXT: 7: $T31 $T35 $T37 $T38 $T39 $T40 $T41 $T42 
depends on 6, 8 +// CHECK-NEXT: 8: $T32 $T33 $T34 +// CHECK-NEXT: 6: $T30 +// CHECK-NEXT: 5: $T18 $T19 $T20 $T21 $T22 $T23 $T24 $T25 $T26 $T27 +// CHECK-NEXT: 4: $T15 $T16 +// CHECK-NEXT: 2: $T11 let names = ["Alice", "Bob", "Charlie"] let b = true var number = 17 From eb8c43e131e8986b3a83b4d8da28689f11380383 Mon Sep 17 00:00:00 2001 From: Pavel Yaskevich Date: Tue, 12 Apr 2022 17:27:05 -0700 Subject: [PATCH 82/83] [CSGen] Remove a hack from `visitTypeExpr` that used to support result builders --- lib/Sema/CSGen.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/lib/Sema/CSGen.cpp b/lib/Sema/CSGen.cpp index cfd7fc74bb266..6c865ad2b64a2 100644 --- a/lib/Sema/CSGen.cpp +++ b/lib/Sema/CSGen.cpp @@ -1410,13 +1410,6 @@ namespace { type = CS.getInstanceType(CS.cacheType(E)); assert(type && "Implicit type expr must have type set!"); type = CS.replaceInferableTypesWithTypeVars(type, locator); - } else if (CS.hasType(E)) { - // If there's a type already set into the constraint system, honor it. - // FIXME: This supports the result builder transform, which sneakily - // stashes a type in the constraint system through a TypeExpr in order - // to pass it down to the rest of CSGen. This is a terribly - // unprincipled thing to do. - return CS.getType(E); } else { auto *repr = E->getTypeRepr(); assert(repr && "Explicit node has no type repr!"); From 66a8ae07dc47340768701943519208932b3922b0 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Thu, 14 Apr 2022 21:27:40 -0700 Subject: [PATCH 83/83] [test] Move string test helper methods to StdlibUnittest This fixes a Windows regression triggered by https://github.com/apple/swift/pull/41417. 
--- .../StdlibUnicodeUnittest.swift | 140 ------------------ stdlib/private/StdlibUnittest/CMakeLists.txt | 1 + .../StdlibUnittest/StringTestHelpers.swift | 139 +++++++++++++++++ test/stdlib/StringIndex.swift | 2 +- 4 files changed, 141 insertions(+), 141 deletions(-) create mode 100644 stdlib/private/StdlibUnittest/StringTestHelpers.swift diff --git a/stdlib/private/StdlibUnicodeUnittest/StdlibUnicodeUnittest.swift b/stdlib/private/StdlibUnicodeUnittest/StdlibUnicodeUnittest.swift index 8540b71865ec9..ed17c8a0249a6 100644 --- a/stdlib/private/StdlibUnicodeUnittest/StdlibUnicodeUnittest.swift +++ b/stdlib/private/StdlibUnicodeUnittest/StdlibUnicodeUnittest.swift @@ -778,143 +778,3 @@ public let utf16Tests = [ [ 0xDC00, 0xD800, 0xD800, 0xDC00 ]), ], ] - -extension String { - /// Print out a full list of indices in every view of this string. - /// This is useful while debugging string indexing issues. - public func dumpIndices() { - print("-------------------------------------------------------------------") - print("String: \(String(reflecting: self))") - print("Characters:") - self.indices.forEach { i in - let char = self[i] - print(" \(i) -> \(String(reflecting: char))") - } - print("Scalars:") - self.unicodeScalars.indices.forEach { i in - let scalar = self.unicodeScalars[i] - let value = String(scalar.value, radix: 16, uppercase: true) - let padding = String(repeating: "0", count: max(0, 4 - value.count)) - let name = scalar.properties.name ?? "\(scalar.debugDescription)" - print(" \(i) -> U+\(padding)\(value) \(name)") - } - print("UTF-8:") - self.utf8.indices.forEach { i in - let code = self.utf8[i] - let value = String(code, radix: 16, uppercase: true) - let padding = value.count < 2 ? 
"0" : "" - print(" \(i) -> \(padding)\(value)") - } - print("UTF-16:") - self.utf16.indices.forEach { i in - let code = self.utf16[i] - let value = String(code, radix: 16, uppercase: true) - let padding = String(repeating: "0", count: 4 - value.count) - print(" \(i) -> \(padding)\(value)") - } - } - - // Returns a list of every valid index in every string view, optionally - // including end indices. We keep equal indices originating from different - // views because they may have different grapheme size caches or flags etc. - public func allIndices(includingEnd: Bool = true) -> [String.Index] { - var r = Array(self.indices) - if includingEnd { r.append(self.endIndex) } - r += Array(self.unicodeScalars.indices) - if includingEnd { r.append(self.unicodeScalars.endIndex) } - r += Array(self.utf8.indices) - if includingEnd { r.append(self.utf8.endIndex) } - r += Array(self.utf16.indices) - if includingEnd { r.append(self.utf16.endIndex) } - return r - } -} - -extension Substring { - // Returns a list of every valid index in every substring view, optionally - // including end indices. We keep equal indices originating from different - // views because they may have different grapheme size caches or flags etc. - public func allIndices(includingEnd: Bool = true) -> [String.Index] { - var r = Array(self.indices) - if includingEnd { r.append(self.endIndex) } - r += Array(self.unicodeScalars.indices) - if includingEnd { r.append(self.unicodeScalars.endIndex) } - r += Array(self.utf8.indices) - if includingEnd { r.append(self.utf8.endIndex) } - r += Array(self.utf16.indices) - if includingEnd { r.append(self.utf16.endIndex) } - return r - } -} - -extension Collection { - // Assuming both `self` and `other` use the same index space, call `body` for - // each index `i` in `other`, along with the slice in `self` that begins at - // `i` and ends at the index following it in `other`. 
- // - // `other` must start with an item that is less than or equal to the first - // item in `self`. - func forEachIndexGroup( - by other: G, - body: (G.Index, Self.SubSequence, Int) throws -> Void - ) rethrows - where G.Index == Self.Index - { - if other.isEmpty { - assert(self.isEmpty) - return - } - var i = other.startIndex - var j = self.startIndex - var offset = 0 - while i != other.endIndex { - let current = i - other.formIndex(after: &i) - let start = j - while j < i, j < self.endIndex { - self.formIndex(after: &j) - } - let end = j - try body(current, self[start ..< end], offset) - offset += 1 - } - } -} - -extension String { - /// Returns a dictionary mapping each valid index to the index that addresses - /// the nearest scalar boundary, rounding down. - public func scalarMap() -> [Index: (index: Index, offset: Int)] { - var map: [Index: (index: Index, offset: Int)] = [:] - - utf8.forEachIndexGroup(by: unicodeScalars) { scalar, slice, offset in - for i in slice.indices { map[i] = (scalar, offset) } - } - utf16.forEachIndexGroup(by: unicodeScalars) { scalar, slice, offset in - for i in slice.indices { map[i] = (scalar, offset) } - } - self.forEachIndexGroup(by: unicodeScalars) { scalar, slice, offset in - for i in slice.indices { map[i] = (scalar, offset) } - } - map[endIndex] = (endIndex, unicodeScalars.count) - return map - } - - /// Returns a dictionary mapping each valid index to the index that addresses - /// the nearest character boundary, rounding down. 
- public func characterMap() -> [Index: (index: Index, offset: Int)] { - var map: [Index: (index: Index, offset: Int)] = [:] - utf8.forEachIndexGroup(by: self) { char, slice, offset in - for i in slice.indices { map[i] = (char, offset) } - } - utf16.forEachIndexGroup(by: self) { char, slice, offset in - for i in slice.indices { map[i] = (char, offset) } - } - unicodeScalars.forEachIndexGroup(by: self) { char, slice, offset in - for i in slice.indices { map[i] = (char, offset) } - } - map[endIndex] = (endIndex, count) - return map - } -} - diff --git a/stdlib/private/StdlibUnittest/CMakeLists.txt b/stdlib/private/StdlibUnittest/CMakeLists.txt index 29d98d1e51bbc..d9d144bcf4705 100644 --- a/stdlib/private/StdlibUnittest/CMakeLists.txt +++ b/stdlib/private/StdlibUnittest/CMakeLists.txt @@ -42,6 +42,7 @@ add_swift_target_library(swiftStdlibUnittest ${SWIFT_STDLIB_LIBRARY_BUILD_TYPES} Statistics.swift StdlibCoreExtras.swift StringConvertible.swift + StringTestHelpers.swift SymbolLookup.swift TestHelpers.swift TypeIndexed.swift diff --git a/stdlib/private/StdlibUnittest/StringTestHelpers.swift b/stdlib/private/StdlibUnittest/StringTestHelpers.swift new file mode 100644 index 0000000000000..a2994b65d9a49 --- /dev/null +++ b/stdlib/private/StdlibUnittest/StringTestHelpers.swift @@ -0,0 +1,139 @@ +extension String { + /// Print out a full list of indices in every view of this string. + /// This is useful while debugging string indexing issues. 
+ public func dumpIndices() { + print("-------------------------------------------------------------------") + print("String: \(String(reflecting: self))") + print("Characters:") + self.indices.forEach { i in + let char = self[i] + print(" \(i) -> \(String(reflecting: char))") + } + print("Scalars:") + self.unicodeScalars.indices.forEach { i in + let scalar = self.unicodeScalars[i] + let value = String(scalar.value, radix: 16, uppercase: true) + let padding = String(repeating: "0", count: max(0, 4 - value.count)) + let name = scalar.properties.name ?? "\(scalar.debugDescription)" + print(" \(i) -> U+\(padding)\(value) \(name)") + } + print("UTF-8:") + self.utf8.indices.forEach { i in + let code = self.utf8[i] + let value = String(code, radix: 16, uppercase: true) + let padding = value.count < 2 ? "0" : "" + print(" \(i) -> \(padding)\(value)") + } + print("UTF-16:") + self.utf16.indices.forEach { i in + let code = self.utf16[i] + let value = String(code, radix: 16, uppercase: true) + let padding = String(repeating: "0", count: 4 - value.count) + print(" \(i) -> \(padding)\(value)") + } + } + + // Returns a list of every valid index in every string view, optionally + // including end indices. We keep equal indices originating from different + // views because they may have different grapheme size caches or flags etc. + public func allIndices(includingEnd: Bool = true) -> [String.Index] { + var r = Array(self.indices) + if includingEnd { r.append(self.endIndex) } + r += Array(self.unicodeScalars.indices) + if includingEnd { r.append(self.unicodeScalars.endIndex) } + r += Array(self.utf8.indices) + if includingEnd { r.append(self.utf8.endIndex) } + r += Array(self.utf16.indices) + if includingEnd { r.append(self.utf16.endIndex) } + return r + } +} + +extension Substring { + // Returns a list of every valid index in every substring view, optionally + // including end indices. 
We keep equal indices originating from different + // views because they may have different grapheme size caches or flags etc. + public func allIndices(includingEnd: Bool = true) -> [String.Index] { + var r = Array(self.indices) + if includingEnd { r.append(self.endIndex) } + r += Array(self.unicodeScalars.indices) + if includingEnd { r.append(self.unicodeScalars.endIndex) } + r += Array(self.utf8.indices) + if includingEnd { r.append(self.utf8.endIndex) } + r += Array(self.utf16.indices) + if includingEnd { r.append(self.utf16.endIndex) } + return r + } +} + +extension Collection { + // Assuming both `self` and `other` use the same index space, call `body` for + // each index `i` in `other`, along with the slice in `self` that begins at + // `i` and ends at the index following it in `other`. + // + // `other` must start with an item that is less than or equal to the first + // item in `self`. + func forEachIndexGroup( + by other: G, + body: (G.Index, Self.SubSequence, Int) throws -> Void + ) rethrows + where G.Index == Self.Index + { + if other.isEmpty { + assert(self.isEmpty) + return + } + var i = other.startIndex + var j = self.startIndex + var offset = 0 + while i != other.endIndex { + let current = i + other.formIndex(after: &i) + let start = j + while j < i, j < self.endIndex { + self.formIndex(after: &j) + } + let end = j + try body(current, self[start ..< end], offset) + offset += 1 + } + } +} + +extension String { + /// Returns a dictionary mapping each valid index to the index that addresses + /// the nearest scalar boundary, rounding down. 
+ public func scalarMap() -> [Index: (index: Index, offset: Int)] { + var map: [Index: (index: Index, offset: Int)] = [:] + + utf8.forEachIndexGroup(by: unicodeScalars) { scalar, slice, offset in + for i in slice.indices { map[i] = (scalar, offset) } + } + utf16.forEachIndexGroup(by: unicodeScalars) { scalar, slice, offset in + for i in slice.indices { map[i] = (scalar, offset) } + } + self.forEachIndexGroup(by: unicodeScalars) { scalar, slice, offset in + for i in slice.indices { map[i] = (scalar, offset) } + } + map[endIndex] = (endIndex, unicodeScalars.count) + return map + } + + /// Returns a dictionary mapping each valid index to the index that addresses + /// the nearest character boundary, rounding down. + public func characterMap() -> [Index: (index: Index, offset: Int)] { + var map: [Index: (index: Index, offset: Int)] = [:] + utf8.forEachIndexGroup(by: self) { char, slice, offset in + for i in slice.indices { map[i] = (char, offset) } + } + utf16.forEachIndexGroup(by: self) { char, slice, offset in + for i in slice.indices { map[i] = (char, offset) } + } + unicodeScalars.forEachIndexGroup(by: self) { char, slice, offset in + for i in slice.indices { map[i] = (char, offset) } + } + map[endIndex] = (endIndex, count) + return map + } +} + diff --git a/test/stdlib/StringIndex.swift b/test/stdlib/StringIndex.swift index e75024be579a8..24683ff582f98 100644 --- a/test/stdlib/StringIndex.swift +++ b/test/stdlib/StringIndex.swift @@ -5,8 +5,8 @@ import StdlibUnittest #if _runtime(_ObjC) import Foundation -#endif import StdlibUnicodeUnittest +#endif var suite = TestSuite("StringIndexTests") defer { runAllTests() }