From 751cf26ef508c1c9ad53bc7b4ffb4f1d1e97a4aa Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Tue, 26 Feb 2019 11:13:12 -0800 Subject: [PATCH 1/8] [Unicode.Scalar] Add RAC UTF8View This adds new (availability-controlled) API to Unicode.Scalar, exposing the scalar's UTF-8 code units as a random-access collection similarly to how it currently exposes UTF-16 code units. Tests added. --- stdlib/public/core/UnicodeScalar.swift | 45 ++++++++++++++++++++++++++ test/stdlib/Character.swift | 15 +++++++++ 2 files changed, 60 insertions(+) diff --git a/stdlib/public/core/UnicodeScalar.swift b/stdlib/public/core/UnicodeScalar.swift index 18215e52a127f..882757688d3b7 100644 --- a/stdlib/public/core/UnicodeScalar.swift +++ b/stdlib/public/core/UnicodeScalar.swift @@ -434,6 +434,51 @@ extension Unicode.Scalar.UTF16View : RandomAccessCollection { } } +extension Unicode.Scalar { + @available(macOS 9999, iOS 9999, tvOS 9999, watchOS 9999, *) + @_fixed_layout + public struct UTF8View { + @inlinable + internal init(value: Unicode.Scalar) { + self.value = value + } + @usableFromInline + internal var value: Unicode.Scalar + } + + @available(macOS 9999, iOS 9999, tvOS 9999, watchOS 9999, *) + @inlinable + public var utf8: UTF8View { return UTF8View(value: self) } +} + +@available(macOS 9999, iOS 9999, tvOS 9999, watchOS 9999, *) +extension Unicode.Scalar.UTF8View : RandomAccessCollection { + public typealias Indices = Range + + /// The position of the first code unit. + @inlinable + public var startIndex: Int { return 0 } + + /// The "past the end" position---that is, the position one + /// greater than the last valid subscript argument. + /// + /// If the collection is empty, `endIndex` is equal to `startIndex`. + @inlinable + public var endIndex: Int { return 0 + UTF8.width(value) } + + /// Accesses the code unit at the specified position. + /// + /// - Parameter position: The position of the element to access. 
`position` + /// must be a valid index of the collection that is not equal to the + /// `endIndex` property. + @inlinable + public subscript(position: Int) -> UTF8.CodeUnit { + _precondition(position >= startIndex && position < endIndex, + "Unicode.Scalar.UTF8View index is out of bounds") + return value.withUTF8CodeUnits { $0[position] } + } +} + extension Unicode.Scalar { internal static var _replacementCharacter: Unicode.Scalar { return Unicode.Scalar(_value: UTF32._replacementCodeUnit) diff --git a/test/stdlib/Character.swift b/test/stdlib/Character.swift index 6f6acc549ffab..da31d0a5d68a0 100644 --- a/test/stdlib/Character.swift +++ b/test/stdlib/Character.swift @@ -404,5 +404,20 @@ UnicodeScalarTests.test("LosslessStringConvertible") { checkLosslessStringConvertible((0...127).map { UnicodeScalar(Int($0))! }) } +if #available(macOS 9999, iOS 9999, tvOS 9999, watchOS 9999, *) { + UnicodeScalarTests.test("Views") { + let scalars = baseScalars + continuingScalars + for scalar in scalars { + expectEqual(scalar, String(scalar).unicodeScalars.first!) 
+ expectEqualSequence(String(scalar).utf8, scalar.utf8) + expectEqualSequence(String(scalar).utf16, scalar.utf16) + + expectEqualSequence(String(scalar).utf8.reversed(), scalar.utf8.reversed()) + expectEqualSequence( + String(scalar).utf16.reversed(), scalar.utf16.reversed()) + } + } +} + runAllTests() From b6a158e1fc45487c652b6eb46ee6d0e70ac5a132 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Fri, 1 Mar 2019 10:46:52 -0800 Subject: [PATCH 2/8] [stdlib] Add LegacyABI.swift for legacy non-API declarations --- stdlib/public/core/CMakeLists.txt | 1 + stdlib/public/core/GroupInfo.json | 1 + stdlib/public/core/LegacyABI.swift | 15 +++++++++++++++ 3 files changed, 17 insertions(+) create mode 100644 stdlib/public/core/LegacyABI.swift diff --git a/stdlib/public/core/CMakeLists.txt b/stdlib/public/core/CMakeLists.txt index 3e181ab60d403..f7874df471288 100644 --- a/stdlib/public/core/CMakeLists.txt +++ b/stdlib/public/core/CMakeLists.txt @@ -83,6 +83,7 @@ set(SWIFTLIB_ESSENTIAL KeyValuePairs.swift LazyCollection.swift LazySequence.swift + LegacyABI.swift LifetimeManager.swift ManagedBuffer.swift Map.swift diff --git a/stdlib/public/core/GroupInfo.json b/stdlib/public/core/GroupInfo.json index ca2c2ab879526..06ba858a9f666 100644 --- a/stdlib/public/core/GroupInfo.json +++ b/stdlib/public/core/GroupInfo.json @@ -223,6 +223,7 @@ "Equatable.swift", "Comparable.swift", "Codable.swift", + "LegacyABI.swift", "MigrationSupport.swift" ], "Result": [ diff --git a/stdlib/public/core/LegacyABI.swift b/stdlib/public/core/LegacyABI.swift new file mode 100644 index 0000000000000..959e09ee2fc46 --- /dev/null +++ b/stdlib/public/core/LegacyABI.swift @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2014 - 2019 Apple Inc. 
and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors +// +//===----------------------------------------------------------------------===// + +// This file contains non-API (or underscored) declarations that are needed to +// be kept around for ABI compatibility + From 4967fc08eb103166493b9227ae37ce52f86de4cb Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Tue, 26 Feb 2019 13:59:53 -0800 Subject: [PATCH 3/8] [Unicode] Add convenience APIs to Unicode encodings Add convenience APIs to the stdlib's Unicode encodings: * Unicode.UTF16 * isASCII * isSurrogate * Unicode.UTF8 * isASCII * width * Unicode.UTF32 * isASCII * Unicode.ASCII * isASCII Tests added --- stdlib/public/core/ASCII.swift | 4 + stdlib/public/core/LegacyABI.swift | 20 ++ stdlib/public/core/StringComparison.swift | 2 +- stdlib/public/core/StringNormalization.swift | 4 +- stdlib/public/core/StringUTF8Validation.swift | 14 +- stdlib/public/core/StringUTF8View.swift | 4 +- .../public/core/StringUnicodeScalarView.swift | 4 +- stdlib/public/core/UTF16.swift | 250 +++++++++++++++++- stdlib/public/core/UTF32.swift | 10 +- stdlib/public/core/UTF8.swift | 43 ++- stdlib/public/core/Unicode.swift | 236 ----------------- stdlib/public/core/UnicodeHelpers.swift | 131 +++------ test/stdlib/Unicode.swift | 53 ++++ 13 files changed, 423 insertions(+), 352 deletions(-) diff --git a/stdlib/public/core/ASCII.swift b/stdlib/public/core/ASCII.swift index 9affc2a7db735..27b9efde18d18 100644 --- a/stdlib/public/core/ASCII.swift +++ b/stdlib/public/core/ASCII.swift @@ -23,6 +23,10 @@ extension Unicode.ASCII : Unicode.Encoding { return EncodedScalar(0x1a) // U+001A SUBSTITUTE; best we can do for ASCII } + /// Returns whether the given code unit represents an ASCII scalar + @_alwaysEmitIntoClient + public static func isASCII(_ x: 
CodeUnit) -> Bool { return UTF8.isASCII(x) } + @inline(__always) @inlinable public static func _isScalar(_ x: CodeUnit) -> Bool { diff --git a/stdlib/public/core/LegacyABI.swift b/stdlib/public/core/LegacyABI.swift index 959e09ee2fc46..3ba450a3a93e2 100644 --- a/stdlib/public/core/LegacyABI.swift +++ b/stdlib/public/core/LegacyABI.swift @@ -13,3 +13,23 @@ // This file contains non-API (or underscored) declarations that are needed to // be kept around for ABI compatibility +extension Unicode.UTF16 { + @available(*, unavailable, renamed: "Unicode.UTF16.isASCII") + @inlinable + public static func _isASCII(_ x: CodeUnit) -> Bool { + return Unicode.UTF16.isASCII(x) + } +} + +@available(*, unavailable, renamed: "Unicode.UTF8.isASCII") +@inlinable +internal func _isASCII(_ x: UInt8) -> Bool { + return Unicode.UTF8.isASCII(x) +} + +@available(*, unavailable, renamed: "Unicode.UTF8.isContinuation") +@inlinable +internal func _isContinuation(_ x: UInt8) -> Bool { + return UTF8.isContinuation(x) +} + diff --git a/stdlib/public/core/StringComparison.swift b/stdlib/public/core/StringComparison.swift index 3f776bf54f347..3eceb4b01e2a0 100644 --- a/stdlib/public/core/StringComparison.swift +++ b/stdlib/public/core/StringComparison.swift @@ -239,7 +239,7 @@ private func _findBoundary( } // Back up to scalar boundary - while _isContinuation(utf8[_unchecked: idx]) { + while UTF8.isContinuation(utf8[_unchecked: idx]) { idx &-= 1 } diff --git a/stdlib/public/core/StringNormalization.swift b/stdlib/public/core/StringNormalization.swift index c4330b4c1fe3b..c50285baf1df8 100644 --- a/stdlib/public/core/StringNormalization.swift +++ b/stdlib/public/core/StringNormalization.swift @@ -108,7 +108,7 @@ extension UnsafeBufferPointer where Element == UInt8 { if index == 0 || index == count { return true } - assert(!_isContinuation(self[_unchecked: index])) + assert(!UTF8.isContinuation(self[_unchecked: index])) // Sub-300 latiny fast-path if self[_unchecked: index] < 0xCC { return true } @@ 
-165,7 +165,7 @@ extension UnsafeBufferPointer where Element == UInt8 { _internalInvariant(index == count) return true } - return !_isContinuation(self[index]) + return !UTF8.isContinuation(self[index]) } } diff --git a/stdlib/public/core/StringUTF8Validation.swift b/stdlib/public/core/StringUTF8Validation.swift index bfb4d145ed2d1..bd0ec9f72c504 100644 --- a/stdlib/public/core/StringUTF8Validation.swift +++ b/stdlib/public/core/StringUTF8Validation.swift @@ -7,7 +7,7 @@ private func _isNotOverlong_F0(_ x: UInt8) -> Bool { } private func _isNotOverlong_F4(_ x: UInt8) -> Bool { - return _isContinuation(x) && x <= 0x8F + return UTF8.isContinuation(x) && x <= 0x8F } private func _isNotOverlong_E0(_ x: UInt8) -> Bool { @@ -15,11 +15,7 @@ private func _isNotOverlong_E0(_ x: UInt8) -> Bool { } private func _isNotOverlong_ED(_ x: UInt8) -> Bool { - return _isContinuation(x) && x <= 0x9F -} - -private func _isASCII_cmp(_ x: UInt8) -> Bool { - return x <= 0x7F + return UTF8.isContinuation(x) && x <= 0x9F } internal struct UTF8ExtraInfo: Equatable { @@ -48,7 +44,7 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer) -> UTF8ValidationR guard f(cu) else { throw UTF8ValidationError() } } @inline(__always) func guaranteeContinuation() throws { - try guaranteeIn(_isContinuation) + try guaranteeIn(UTF8.isContinuation) } func _legacyInvalidLengthCalculation(_ _buffer: (_storage: UInt32, ())) -> Int { @@ -94,7 +90,7 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer) -> UTF8ValidationR var endIndex = buf.startIndex var iter = buf.makeIterator() _ = iter.next() - while let cu = iter.next(), !_isASCII(cu) && !_isUTF8MultiByteLeading(cu) { + while let cu = iter.next(), UTF8.isContinuation(cu) { endIndex += 1 } let illegalRange = Range(buf.startIndex...endIndex) @@ -107,7 +103,7 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer) -> UTF8ValidationR do { var isASCII = true while let cu = iter.next() { - if _isASCII(cu) { lastValidIndex &+= 1; continue } + if 
UTF8.isASCII(cu) { lastValidIndex &+= 1; continue } isASCII = false if _slowPath(!_isUTF8MultiByteLeading(cu)) { throw UTF8ValidationError() diff --git a/stdlib/public/core/StringUTF8View.swift b/stdlib/public/core/StringUTF8View.swift index 6addee3aaa041..81b8bdb6d6795 100644 --- a/stdlib/public/core/StringUTF8View.swift +++ b/stdlib/public/core/StringUTF8View.swift @@ -415,7 +415,7 @@ extension String.UTF8View { let (scalar, scalarLen) = _guts.foreignErrorCorrectedScalar( startingAt: i.strippingTranscoding) - let utf8Len = _numUTF8CodeUnits(scalar) + let utf8Len = UTF8.width(scalar) if utf8Len == 1 { _internalInvariant(i.transcodedOffset == 0) @@ -442,7 +442,7 @@ extension String.UTF8View { let (scalar, scalarLen) = _guts.foreignErrorCorrectedScalar( endingAt: i) - let utf8Len = _numUTF8CodeUnits(scalar) + let utf8Len = UTF8.width(scalar) return i.encoded(offsetBy: -scalarLen).transcoded(withOffset: utf8Len &- 1) } diff --git a/stdlib/public/core/StringUnicodeScalarView.swift b/stdlib/public/core/StringUnicodeScalarView.swift index 48465eab9cfeb..d46792def1537 100644 --- a/stdlib/public/core/StringUnicodeScalarView.swift +++ b/stdlib/public/core/StringUnicodeScalarView.swift @@ -418,7 +418,7 @@ extension String.UnicodeScalarView { internal func _foreignIndex(after i: Index) -> Index { _internalInvariant(_guts.isForeign) let cu = _guts.foreignErrorCorrectedUTF16CodeUnit(at: i) - let len = _isLeadingSurrogate(cu) ? 2 : 1 + let len = UTF16.isLeadSurrogate(cu) ? 2 : 1 return i.encoded(offsetBy: len) } @@ -429,7 +429,7 @@ extension String.UnicodeScalarView { _internalInvariant(_guts.isForeign) let priorIdx = i.priorEncoded let cu = _guts.foreignErrorCorrectedUTF16CodeUnit(at: priorIdx) - let len = _isTrailingSurrogate(cu) ? 2 : 1 + let len = UTF16.isTrailSurrogate(cu) ? 
2 : 1 return i.encoded(offsetBy: -len) } diff --git a/stdlib/public/core/UTF16.swift b/stdlib/public/core/UTF16.swift index 88bd4c1283629..c683284f2e6f7 100644 --- a/stdlib/public/core/UTF16.swift +++ b/stdlib/public/core/UTF16.swift @@ -16,6 +16,249 @@ extension Unicode { } } +extension Unicode.UTF16 { + /// Returns the number of code units required to encode the given Unicode + /// scalar. + /// + /// Because a Unicode scalar value can require up to 21 bits to store its + /// value, some Unicode scalars are represented in UTF-16 by a pair of + /// 16-bit code units. The first and second code units of the pair, + /// designated *leading* and *trailing* surrogates, make up a *surrogate + /// pair*. + /// + /// let anA: Unicode.Scalar = "A" + /// print(anA.value) + /// // Prints "65" + /// print(UTF16.width(anA)) + /// // Prints "1" + /// + /// let anApple: Unicode.Scalar = "🍎" + /// print(anApple.value) + /// // Prints "127822" + /// print(UTF16.width(anApple)) + /// // Prints "2" + /// + /// - Parameter x: A Unicode scalar value. + /// - Returns: The width of `x` when encoded in UTF-16, either `1` or `2`. + @inlinable + public static func width(_ x: Unicode.Scalar) -> Int { + return x.value <= UInt16.max ? 1 : 2 + } + + /// Returns the high-surrogate code unit of the surrogate pair representing + /// the specified Unicode scalar. + /// + /// Because a Unicode scalar value can require up to 21 bits to store its + /// value, some Unicode scalars are represented in UTF-16 by a pair of + /// 16-bit code units. The first and second code units of the pair, + /// designated *leading* and *trailing* surrogates, make up a *surrogate + /// pair*. + /// + /// let apple: Unicode.Scalar = "🍎" + /// print(UTF16.leadSurrogate(apple)) + /// // Prints "55356" + /// + /// - Parameter x: A Unicode scalar value. `x` must be represented by a + /// surrogate pair when encoded in UTF-16. To check whether `x` is + /// represented by a surrogate pair, use `UTF16.width(x) == 2`. 
+ /// - Returns: The leading surrogate code unit of `x` when encoded in UTF-16. + @inlinable + public static func leadSurrogate(_ x: Unicode.Scalar) -> UTF16.CodeUnit { + _precondition(width(x) == 2) + return 0xD800 + UTF16.CodeUnit(truncatingIfNeeded: + (x.value - 0x1_0000) &>> (10 as UInt32)) + } + + /// Returns the low-surrogate code unit of the surrogate pair representing + /// the specified Unicode scalar. + /// + /// Because a Unicode scalar value can require up to 21 bits to store its + /// value, some Unicode scalars are represented in UTF-16 by a pair of + /// 16-bit code units. The first and second code units of the pair, + /// designated *leading* and *trailing* surrogates, make up a *surrogate + /// pair*. + /// + /// let apple: Unicode.Scalar = "🍎" + /// print(UTF16.trailSurrogate(apple)) + /// // Prints "57166" + /// + /// - Parameter x: A Unicode scalar value. `x` must be represented by a + /// surrogate pair when encoded in UTF-16. To check whether `x` is + /// represented by a surrogate pair, use `UTF16.width(x) == 2`. + /// - Returns: The trailing surrogate code unit of `x` when encoded in UTF-16. + @inlinable + public static func trailSurrogate(_ x: Unicode.Scalar) -> UTF16.CodeUnit { + _precondition(width(x) == 2) + return 0xDC00 + UTF16.CodeUnit(truncatingIfNeeded: + (x.value - 0x1_0000) & (((1 as UInt32) &<< 10) - 1)) + } + + /// Returns a Boolean value indicating whether the specified code unit is a + /// high-surrogate code unit. + /// + /// Here's an example of checking whether each code unit in a string's + /// `utf16` view is a lead surrogate. The `apple` string contains a single + /// emoji character made up of a surrogate pair when encoded in UTF-16. + /// + /// let apple = "🍎" + /// for unit in apple.utf16 { + /// print(UTF16.isLeadSurrogate(unit)) + /// } + /// // Prints "true" + /// // Prints "false" + /// + /// This method does not validate the encoding of a UTF-16 sequence beyond + /// the specified code unit. 
Specifically, it does not validate that a + /// low-surrogate code unit follows `x`. + /// + /// - Parameter x: A UTF-16 code unit. + /// - Returns: `true` if `x` is a high-surrogate code unit; otherwise, + /// `false`. + @inlinable + public static func isLeadSurrogate(_ x: CodeUnit) -> Bool { + return (x & 0xFC00) == 0xD800 + } + + /// Returns a Boolean value indicating whether the specified code unit is a + /// low-surrogate code unit. + /// + /// Here's an example of checking whether each code unit in a string's + /// `utf16` view is a trailing surrogate. The `apple` string contains a + /// single emoji character made up of a surrogate pair when encoded in + /// UTF-16. + /// + /// let apple = "🍎" + /// for unit in apple.utf16 { + /// print(UTF16.isTrailSurrogate(unit)) + /// } + /// // Prints "false" + /// // Prints "true" + /// + /// This method does not validate the encoding of a UTF-16 sequence beyond + /// the specified code unit. Specifically, it does not validate that a + /// high-surrogate code unit precedes `x`. + /// + /// - Parameter x: A UTF-16 code unit. + /// - Returns: `true` if `x` is a low-surrogate code unit; otherwise, + /// `false`. + @inlinable + public static func isTrailSurrogate(_ x: CodeUnit) -> Bool { + return (x & 0xFC00) == 0xDC00 + } + + /// Returns a Boolean value indicating whether the specified code unit is a + /// high or low surrogate code unit. 
+ @_alwaysEmitIntoClient + public static func isSurrogate(_ x: CodeUnit) -> Bool { + return isLeadSurrogate(x) || isTrailSurrogate(x) + } + + @inlinable + public // @testable + static func _copy( + source: UnsafeMutablePointer, + destination: UnsafeMutablePointer, + count: Int + ) { + if MemoryLayout.stride == MemoryLayout.stride { + _memcpy( + dest: UnsafeMutablePointer(destination), + src: UnsafeMutablePointer(source), + size: UInt(count) * UInt(MemoryLayout.stride)) + } + else { + for i in 0..( + of input: Input, + decodedAs sourceEncoding: Encoding.Type, + repairingIllFormedSequences: Bool + ) -> (count: Int, isASCII: Bool)? + where Encoding.CodeUnit == Input.Element { + + var utf16Count = 0 + var i = input + var d = Encoding.ForwardParser() + + // Fast path for ASCII in a UTF8 buffer + if sourceEncoding == Unicode.UTF8.self { + var peek: Encoding.CodeUnit = 0 + while let u = i.next() { + peek = u + guard _fastPath(peek < 0x80) else { break } + utf16Count = utf16Count + 1 + } + if _fastPath(peek < 0x80) { return (utf16Count, true) } + + var d1 = UTF8.ForwardParser() + d1._buffer.append(numericCast(peek)) + d = _identityCast(d1, to: Encoding.ForwardParser.self) + } + + var utf16BitUnion: CodeUnit = 0 + while true { + let s = d.parseScalar(from: &i) + if _fastPath(s._valid != nil), let scalarContent = s._valid { + let utf16 = transcode(scalarContent, from: sourceEncoding) + ._unsafelyUnwrappedUnchecked + utf16Count += utf16.count + for x in utf16 { utf16BitUnion |= x } + } + else if let _ = s._error { + guard _fastPath(repairingIllFormedSequences) else { return nil } + utf16Count += 1 + utf16BitUnion |= UTF16._replacementCodeUnit + } + else { + return (utf16Count, utf16BitUnion < 0x80) + } + } + } +} + extension Unicode.UTF16 : Unicode.Encoding { public typealias CodeUnit = UInt16 public typealias EncodedScalar = _UIntBuffer @@ -30,13 +273,14 @@ extension Unicode.UTF16 : Unicode.Encoding { return EncodedScalar(_storage: 0xFFFD, _bitCount: 16) } - @inlinable - 
public static func _isASCII(_ x: CodeUnit) -> Bool { + /// Returns whether the given code unit represents an ASCII scalar + @_alwaysEmitIntoClient + public static func isASCII(_ x: CodeUnit) -> Bool { return x <= 0x7f } @inlinable - public static func _isScalar(_ x: CodeUnit) -> Bool { + public static func _isScalar(_ x: CodeUnit) -> Bool { return x & 0xf800 != 0xd800 } diff --git a/stdlib/public/core/UTF32.swift b/stdlib/public/core/UTF32.swift index 6c60df8ff03db..2f2d870ad1e9a 100644 --- a/stdlib/public/core/UTF32.swift +++ b/stdlib/public/core/UTF32.swift @@ -24,7 +24,7 @@ extension Unicode.UTF32 : Unicode.Encoding { internal static var _replacementCodeUnit: CodeUnit { @inline(__always) get { return 0xFFFD } } - + @inlinable public static var encodedReplacementCharacter : EncodedScalar { return EncodedScalar(_replacementCodeUnit) @@ -32,10 +32,16 @@ extension Unicode.UTF32 : Unicode.Encoding { @inlinable @inline(__always) - public static func _isScalar(_ x: CodeUnit) -> Bool { + public static func _isScalar(_ x: CodeUnit) -> Bool { return true } + /// Returns whether the given code unit represents an ASCII scalar + @_alwaysEmitIntoClient + public static func isASCII(_ x: CodeUnit) -> Bool { + return x <= 0x7F + } + @inlinable @inline(__always) public static func decode(_ source: EncodedScalar) -> Unicode.Scalar { diff --git a/stdlib/public/core/UTF8.swift b/stdlib/public/core/UTF8.swift index 288172180dc2a..64f2d89bb84c6 100644 --- a/stdlib/public/core/UTF8.swift +++ b/stdlib/public/core/UTF8.swift @@ -16,6 +16,40 @@ extension Unicode { } } +extension Unicode.UTF8 { + /// Returns the number of code units required to encode the given Unicode + /// scalar. + /// + /// Because a Unicode scalar value can require up to 21 bits to store its + /// value, some Unicode scalars are represented in UTF-8 by a sequence of up + /// to 4 code units. The first code unit is designated a *lead* byte and the + /// rest are *continuation* bytes. 
+ /// + /// let anA: Unicode.Scalar = "A" + /// print(anA.value) + /// // Prints "65" + /// print(UTF8.width(anA)) + /// // Prints "1" + /// + /// let anApple: Unicode.Scalar = "🍎" + /// print(anApple.value) + /// // Prints "127822" + /// print(UTF8.width(anApple)) + /// // Prints "4" + /// + /// - Parameter x: A Unicode scalar value. + /// - Returns: The width of `x` when encoded in UTF-8, from `1` to `4`. + @_alwaysEmitIntoClient + public static func width(_ x: Unicode.Scalar) -> Int { + switch x.value { + case 0..<0x80: return 1 + case 0x80..<0x0800: return 2 + case 0x0800..<0x1_0000: return 3 + default: return 4 + } + } +} + extension Unicode.UTF8 : _UnicodeEncoding { public typealias CodeUnit = UInt8 public typealias EncodedScalar = _ValidUTF8Buffer @@ -28,7 +62,14 @@ extension Unicode.UTF8 : _UnicodeEncoding { @inline(__always) @inlinable public static func _isScalar(_ x: CodeUnit) -> Bool { - return x & 0x80 == 0 + return isASCII(x) + } + + /// Returns whether the given code unit represents an ASCII scalar + @_alwaysEmitIntoClient + @inline(__always) + public static func isASCII(_ x: CodeUnit) -> Bool { + return x & 0b1000_0000 == 0 } @inline(__always) diff --git a/stdlib/public/core/Unicode.swift b/stdlib/public/core/Unicode.swift index 3cfaf14d12b8d..fd2123797bc08 100644 --- a/stdlib/public/core/Unicode.swift +++ b/stdlib/public/core/Unicode.swift @@ -628,242 +628,6 @@ extension UTF8.CodeUnit : _StringElement { } } -extension UTF16 { - /// Returns the number of code units required to encode the given Unicode - /// scalar. - /// - /// Because a Unicode scalar value can require up to 21 bits to store its - /// value, some Unicode scalars are represented in UTF-16 by a pair of - /// 16-bit code units. The first and second code units of the pair, - /// designated *leading* and *trailing* surrogates, make up a *surrogate - /// pair*. 
- /// - /// let anA: Unicode.Scalar = "A" - /// print(anA.value) - /// // Prints "65" - /// print(UTF16.width(anA)) - /// // Prints "1" - /// - /// let anApple: Unicode.Scalar = "🍎" - /// print(anApple.value) - /// // Prints "127822" - /// print(UTF16.width(anApple)) - /// // Prints "2" - /// - /// - Parameter x: A Unicode scalar value. - /// - Returns: The width of `x` when encoded in UTF-16, either `1` or `2`. - @inlinable - public static func width(_ x: Unicode.Scalar) -> Int { - return x.value <= 0xFFFF ? 1 : 2 - } - - /// Returns the high-surrogate code unit of the surrogate pair representing - /// the specified Unicode scalar. - /// - /// Because a Unicode scalar value can require up to 21 bits to store its - /// value, some Unicode scalars are represented in UTF-16 by a pair of - /// 16-bit code units. The first and second code units of the pair, - /// designated *leading* and *trailing* surrogates, make up a *surrogate - /// pair*. - /// - /// let apple: Unicode.Scalar = "🍎" - /// print(UTF16.leadSurrogate(apple) - /// // Prints "55356" - /// - /// - Parameter x: A Unicode scalar value. `x` must be represented by a - /// surrogate pair when encoded in UTF-16. To check whether `x` is - /// represented by a surrogate pair, use `UTF16.width(x) == 2`. - /// - Returns: The leading surrogate code unit of `x` when encoded in UTF-16. - @inlinable - public static func leadSurrogate(_ x: Unicode.Scalar) -> UTF16.CodeUnit { - _precondition(width(x) == 2) - return 0xD800 + UTF16.CodeUnit(truncatingIfNeeded: - (x.value - 0x1_0000) &>> (10 as UInt32)) - } - - /// Returns the low-surrogate code unit of the surrogate pair representing - /// the specified Unicode scalar. - /// - /// Because a Unicode scalar value can require up to 21 bits to store its - /// value, some Unicode scalars are represented in UTF-16 by a pair of - /// 16-bit code units. 
The first and second code units of the pair, - /// designated *leading* and *trailing* surrogates, make up a *surrogate - /// pair*. - /// - /// let apple: Unicode.Scalar = "🍎" - /// print(UTF16.trailSurrogate(apple) - /// // Prints "57166" - /// - /// - Parameter x: A Unicode scalar value. `x` must be represented by a - /// surrogate pair when encoded in UTF-16. To check whether `x` is - /// represented by a surrogate pair, use `UTF16.width(x) == 2`. - /// - Returns: The trailing surrogate code unit of `x` when encoded in UTF-16. - @inlinable - public static func trailSurrogate(_ x: Unicode.Scalar) -> UTF16.CodeUnit { - _precondition(width(x) == 2) - return 0xDC00 + UTF16.CodeUnit(truncatingIfNeeded: - (x.value - 0x1_0000) & (((1 as UInt32) &<< 10) - 1)) - } - - /// Returns a Boolean value indicating whether the specified code unit is a - /// high-surrogate code unit. - /// - /// Here's an example of checking whether each code unit in a string's - /// `utf16` view is a lead surrogate. The `apple` string contains a single - /// emoji character made up of a surrogate pair when encoded in UTF-16. - /// - /// let apple = "🍎" - /// for unit in apple.utf16 { - /// print(UTF16.isLeadSurrogate(unit)) - /// } - /// // Prints "true" - /// // Prints "false" - /// - /// This method does not validate the encoding of a UTF-16 sequence beyond - /// the specified code unit. Specifically, it does not validate that a - /// low-surrogate code unit follows `x`. - /// - /// - Parameter x: A UTF-16 code unit. - /// - Returns: `true` if `x` is a high-surrogate code unit; otherwise, - /// `false`. - @inlinable - public static func isLeadSurrogate(_ x: CodeUnit) -> Bool { - return (x & 0xFC00) == 0xD800 - } - - /// Returns a Boolean value indicating whether the specified code unit is a - /// low-surrogate code unit. - /// - /// Here's an example of checking whether each code unit in a string's - /// `utf16` view is a trailing surrogate. 
The `apple` string contains a - /// single emoji character made up of a surrogate pair when encoded in - /// UTF-16. - /// - /// let apple = "🍎" - /// for unit in apple.utf16 { - /// print(UTF16.isTrailSurrogate(unit)) - /// } - /// // Prints "false" - /// // Prints "true" - /// - /// This method does not validate the encoding of a UTF-16 sequence beyond - /// the specified code unit. Specifically, it does not validate that a - /// high-surrogate code unit precedes `x`. - /// - /// - Parameter x: A UTF-16 code unit. - /// - Returns: `true` if `x` is a low-surrogate code unit; otherwise, - /// `false`. - @inlinable - public static func isTrailSurrogate(_ x: CodeUnit) -> Bool { - return (x & 0xFC00) == 0xDC00 - } - - @inlinable - public // @testable - static func _copy( - source: UnsafeMutablePointer, - destination: UnsafeMutablePointer, - count: Int - ) { - if MemoryLayout.stride == MemoryLayout.stride { - _memcpy( - dest: UnsafeMutablePointer(destination), - src: UnsafeMutablePointer(source), - size: UInt(count) * UInt(MemoryLayout.stride)) - } - else { - for i in 0..( - of input: Input, - decodedAs sourceEncoding: Encoding.Type, - repairingIllFormedSequences: Bool - ) -> (count: Int, isASCII: Bool)? 
- where Encoding.CodeUnit == Input.Element { - - var utf16Count = 0 - var i = input - var d = Encoding.ForwardParser() - - // Fast path for ASCII in a UTF8 buffer - if sourceEncoding == Unicode.UTF8.self { - var peek: Encoding.CodeUnit = 0 - while let u = i.next() { - peek = u - guard _fastPath(peek < 0x80) else { break } - utf16Count = utf16Count + 1 - } - if _fastPath(peek < 0x80) { return (utf16Count, true) } - - var d1 = UTF8.ForwardParser() - d1._buffer.append(numericCast(peek)) - d = _identityCast(d1, to: Encoding.ForwardParser.self) - } - - var utf16BitUnion: CodeUnit = 0 - while true { - let s = d.parseScalar(from: &i) - if _fastPath(s._valid != nil), let scalarContent = s._valid { - let utf16 = transcode(scalarContent, from: sourceEncoding) - ._unsafelyUnwrappedUnchecked - utf16Count += utf16.count - for x in utf16 { utf16BitUnion |= x } - } - else if let _ = s._error { - guard _fastPath(repairingIllFormedSequences) else { return nil } - utf16Count += 1 - utf16BitUnion |= UTF16._replacementCodeUnit - } - else { - return (utf16Count, utf16BitUnion < 0x80) - } - } - } -} - // Unchecked init to avoid precondition branches in hot code paths where we // already know the value is a valid unicode scalar. 
extension Unicode.Scalar { diff --git a/stdlib/public/core/UnicodeHelpers.swift b/stdlib/public/core/UnicodeHelpers.swift index 668c1eea0234b..d0addb39ddb23 100644 --- a/stdlib/public/core/UnicodeHelpers.swift +++ b/stdlib/public/core/UnicodeHelpers.swift @@ -13,34 +13,10 @@ // // Low-level helper functions and utilities for interpreting Unicode // - -internal let _leadingSurrogateBias: UInt16 = 0xd800 -internal let _trailingSurrogateBias: UInt16 = 0xdc00 -internal let _surrogateMask: UInt16 = 0xfc00 - -@inline(__always) -internal func _isTrailingSurrogate(_ cu: UInt16) -> Bool { - return cu & _surrogateMask == _trailingSurrogateBias -} -@inline(__always) -internal func _isLeadingSurrogate(_ cu: UInt16) -> Bool { - return cu & _surrogateMask == _leadingSurrogateBias -} -@inline(__always) -internal func _isSurrogate(_ cu: UInt16) -> Bool { - // TODO(String micro-performance): check codegen - return _isLeadingSurrogate(cu) || _isTrailingSurrogate(cu) -} - -@inlinable @inline(__always) -internal func _isASCII(_ x: UInt8) -> Bool { - return x & 0b1000_0000 == 0 -} - @inlinable @inline(__always) internal func _decodeUTF8(_ x: UInt8) -> Unicode.Scalar { - _internalInvariant(_isASCII(x)) + _internalInvariant(UTF8.isASCII(x)) return Unicode.Scalar(_unchecked: UInt32(x)) } @@ -48,7 +24,7 @@ internal func _decodeUTF8(_ x: UInt8) -> Unicode.Scalar { @inline(__always) internal func _decodeUTF8(_ x: UInt8, _ y: UInt8) -> Unicode.Scalar { _internalInvariant(_utf8ScalarLength(x) == 2) - _internalInvariant(_isContinuation(y)) + _internalInvariant(UTF8.isContinuation(y)) let x = UInt32(x) let value = ((x & 0b0001_1111) &<< 6) | _continuationPayload(y) return Unicode.Scalar(_unchecked: value) @@ -60,7 +36,7 @@ internal func _decodeUTF8( _ x: UInt8, _ y: UInt8, _ z: UInt8 ) -> Unicode.Scalar { _internalInvariant(_utf8ScalarLength(x) == 3) - _internalInvariant(_isContinuation(y) && _isContinuation(z)) + _internalInvariant(UTF8.isContinuation(y) && UTF8.isContinuation(z)) let x = 
UInt32(x) let value = ((x & 0b0000_1111) &<< 12) | (_continuationPayload(y) &<< 6) @@ -75,7 +51,8 @@ internal func _decodeUTF8( ) -> Unicode.Scalar { _internalInvariant(_utf8ScalarLength(x) == 4) _internalInvariant( - _isContinuation(y) && _isContinuation(z) && _isContinuation(w)) + UTF8.isContinuation(y) && UTF8.isContinuation(z) + && UTF8.isContinuation(w)) let x = UInt32(x) let value = ((x & 0b0000_1111) &<< 18) | (_continuationPayload(y) &<< 12) @@ -89,20 +66,20 @@ internal func _decodeScalar( ) -> (Unicode.Scalar, scalarLength: Int) { let high = utf16[i] if i + 1 >= utf16.count { - _internalInvariant(!_isLeadingSurrogate(high)) - _internalInvariant(!_isTrailingSurrogate(high)) + _internalInvariant(!UTF16.isLeadSurrogate(high)) + _internalInvariant(!UTF16.isTrailSurrogate(high)) return (Unicode.Scalar(_unchecked: UInt32(high)), 1) - } - - if !_isLeadingSurrogate(high) { - _internalInvariant(!_isTrailingSurrogate(high)) + } + + if !UTF16.isLeadSurrogate(high) { + _internalInvariant(!UTF16.isTrailSurrogate(high)) return (Unicode.Scalar(_unchecked: UInt32(high)), 1) } - + let low = utf16[i+1] - _internalInvariant(_isLeadingSurrogate(high)) - _internalInvariant(_isTrailingSurrogate(low)) - return (Unicode.Scalar(_unchecked: _decodeSurrogatePair(leading: high, trailing: low)), 2) + _internalInvariant(UTF16.isLeadSurrogate(high)) + _internalInvariant(UTF16.isTrailSurrogate(low)) + return (UTF16._decodeSurrogates(high, low), 2) } @inlinable @@ -139,8 +116,8 @@ internal func _decodeScalar( @inlinable @inline(__always) internal func _utf8ScalarLength(_ x: UInt8) -> Int { - _internalInvariant(!_isContinuation(x)) - if _isASCII(x) { return 1 } + _internalInvariant(!UTF8.isContinuation(x)) + if UTF8.isASCII(x) { return 1 } // TODO(String micro-performance): check codegen return (~x).leadingZeroBitCount } @@ -150,57 +127,25 @@ internal func _utf8ScalarLength( _ utf8: UnsafeBufferPointer, endingAt i: Int ) -> Int { var len = 1 - while _isContinuation(utf8[_unchecked: i &- 
len]) { + while UTF8.isContinuation(utf8[_unchecked: i &- len]) { len &+= 1 } _internalInvariant(len == _utf8ScalarLength(utf8[i &- len])) return len } -@inlinable @inline(__always) -internal func _isContinuation(_ x: UInt8) -> Bool { - return x & 0b1100_0000 == 0b1000_0000 -} - @inlinable @inline(__always) internal func _continuationPayload(_ x: UInt8) -> UInt32 { return UInt32(x & 0x3F) } -@inline(__always) -internal func _decodeSurrogatePair( - leading high: UInt16, trailing low: UInt16 -) -> UInt32 { - _internalInvariant(_isLeadingSurrogate(high) && _isTrailingSurrogate(low)) - let hi10: UInt32 = UInt32(high) &- UInt32(_leadingSurrogateBias) - _internalInvariant(hi10 < 1<<10, "I said high 10. Not high, like, 20 or something") - let lo10: UInt32 = UInt32(low) &- UInt32(_trailingSurrogateBias) - _internalInvariant(lo10 < 1<<10, "I said low 10. Not low, like, 20 or something") - - return ((hi10 &<< 10) | lo10) &+ 0x1_00_00 -} - -@inline(__always) -internal func _numUTF8CodeUnits(_ scalar: Unicode.Scalar) -> Int { - switch scalar.value { - case 0..<0x80: return 1 - case 0x80..<0x0800: return 2 - case 0x0800..<0x1_0000: return 3 - default: return 4 - } -} -@inline(__always) -internal func _numUTF16CodeUnits(_ scalar: Unicode.Scalar) -> Int { - return scalar.value <= UInt16.max ? 
1 : 2 -} - @inlinable @inline(__always) internal func _scalarAlign( _ utf8: UnsafeBufferPointer, _ idx: Int ) -> Int { var i = idx - while _slowPath(_isContinuation(utf8[_unchecked: i])) { + while _slowPath(UTF8.isContinuation(utf8[_unchecked: i])) { i &-= 1 _internalInvariant(i >= 0, "Malformed contents: starts with continuation byte") @@ -250,9 +195,9 @@ extension _StringGuts { _internalInvariant(isFastUTF8) return self.withFastUTF8 { utf8 in - _internalInvariant(i == utf8.count || !_isContinuation(utf8[i])) + _internalInvariant(i == utf8.count || !UTF8.isContinuation(utf8[i])) var len = 1 - while _isContinuation(utf8[i &- len]) { + while UTF8.isContinuation(utf8[i &- len]) { _internalInvariant(i &- len > 0) len += 1 } @@ -277,7 +222,9 @@ extension _StringGuts { if i == self.startIndex || i == self.endIndex { return true } if _fastPath(isFastUTF8) { - return self.withFastUTF8 { return !_isContinuation($0[i._encodedOffset]) } + return self.withFastUTF8 { + return !UTF8.isContinuation($0[i._encodedOffset]) + } } return i == foreignScalarAlign(i) @@ -310,7 +257,7 @@ extension _StringGuts { let start = idx._encodedOffset let leading = _getForeignCodeUnit(at: start) - if _fastPath(!_isSurrogate(leading)) { + if _fastPath(!UTF16.isSurrogate(leading)) { return (Unicode.Scalar(_unchecked: UInt32(leading)), 1) } @@ -320,17 +267,15 @@ extension _StringGuts { // TODO(String performance): Consider having a valid performance flag // available to check, and assert it's not set in the condition here. 
let nextOffset = start &+ 1 - if _slowPath(_isTrailingSurrogate(leading) || nextOffset == self.count) { + if _slowPath(UTF16.isTrailSurrogate(leading) || nextOffset == self.count) { return (Unicode.Scalar._replacementCharacter, 1) } let trailing = _getForeignCodeUnit(at: nextOffset) - if _slowPath(!_isTrailingSurrogate(trailing)) { + if _slowPath(!UTF16.isTrailSurrogate(trailing)) { return (Unicode.Scalar._replacementCharacter, 1) } - return (Unicode.Scalar( - _unchecked: _decodeSurrogatePair(leading: leading, trailing: trailing)), - 2) + return (UTF16._decodeSurrogates(leading, trailing), 2) } @_effects(releasenone) @@ -343,7 +288,7 @@ extension _StringGuts { let end = idx._encodedOffset let trailing = _getForeignCodeUnit(at: end &- 1) - if _fastPath(!_isSurrogate(trailing)) { + if _fastPath(!UTF16.isSurrogate(trailing)) { return (Unicode.Scalar(_unchecked: UInt32(trailing)), 1) } @@ -353,17 +298,15 @@ extension _StringGuts { // TODO(String performance): Consider having a valid performance flag // available to check, and assert it's not set in the condition here. 
let priorOffset = end &- 2 - if _slowPath(_isLeadingSurrogate(trailing) || priorOffset < 0) { + if _slowPath(UTF16.isLeadSurrogate(trailing) || priorOffset < 0) { return (Unicode.Scalar._replacementCharacter, 1) } let leading = _getForeignCodeUnit(at: priorOffset) - if _slowPath(!_isLeadingSurrogate(leading)) { + if _slowPath(!UTF16.isLeadSurrogate(leading)) { return (Unicode.Scalar._replacementCharacter, 1) } - return (Unicode.Scalar( - _unchecked: _decodeSurrogatePair(leading: leading, trailing: trailing)), - 2) + return (UTF16._decodeSurrogates(leading, trailing), 2) } @_effects(releasenone) @@ -375,7 +318,7 @@ extension _StringGuts { let start = idx._encodedOffset let cu = _getForeignCodeUnit(at: start) - if _fastPath(!_isSurrogate(cu)) { + if _fastPath(!UTF16.isSurrogate(cu)) { return cu } @@ -384,15 +327,15 @@ extension _StringGuts { // // TODO(String performance): Consider having a valid performance flag // available to check, and assert it's not set in the condition here. - if _isLeadingSurrogate(cu) { + if UTF16.isLeadSurrogate(cu) { let nextOffset = start &+ 1 guard nextOffset < self.count, - _isTrailingSurrogate(_getForeignCodeUnit(at: nextOffset)) + UTF16.isTrailSurrogate(_getForeignCodeUnit(at: nextOffset)) else { return UTF16._replacementCodeUnit } } else { let priorOffset = start &- 1 guard priorOffset >= 0, - _isLeadingSurrogate(_getForeignCodeUnit(at: priorOffset)) + UTF16.isLeadSurrogate(_getForeignCodeUnit(at: priorOffset)) else { return UTF16._replacementCodeUnit } } @@ -405,7 +348,7 @@ extension _StringGuts { _internalInvariant(idx._encodedOffset < self.count) let ecCU = foreignErrorCorrectedUTF16CodeUnit(at: idx) - if _fastPath(!_isTrailingSurrogate(ecCU)) { + if _fastPath(!UTF16.isTrailSurrogate(ecCU)) { return idx } _internalInvariant(idx._encodedOffset > 0, diff --git a/test/stdlib/Unicode.swift b/test/stdlib/Unicode.swift index 007ff0280eaf9..57061384796dc 100644 --- a/test/stdlib/Unicode.swift +++ b/test/stdlib/Unicode.swift @@ -57,4 
+57,57 @@ UnicodeAPIs.test("UnicodeDecodingResult/Equatable") { checkEquatable(instances, oracle: ==) } +typealias ASCII = Unicode.ASCII +typealias UTF8 = Unicode.UTF8 +typealias UTF16 = Unicode.UTF16 +typealias UTF32 = Unicode.UTF32 + +UnicodeAPIs.test("UTF-8 and UTF-16 queries") { + guard #available(macOS 9999, iOS 9999, tvOS 9999, watchOS 9999, *) else { + return + } + let str = "abรฉร01๐Ÿ˜“๐ŸŽƒ๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆใ‚ขใ‚คใ‚ฆใ‚จใ‚ช" + let scalars = Array(str.unicodeScalars) + for scalar in scalars { + expectEqual(String(scalar).utf16.count, UTF16.width(scalar)) + expectEqual(String(scalar).utf8.count, UTF8.width(scalar)) + + expectEqual(UTF32.isASCII(scalar.value), UTF8.isASCII(scalar.utf8[0])) + expectEqual(UTF32.isASCII(scalar.value), ASCII.isASCII(scalar.utf8[0])) + expectEqual(UTF32.isASCII(scalar.value), UTF16.isASCII(scalar.utf16[0])) + + if scalar.utf16.count == 2 { + let lead = scalar.utf16[0] + let trail = scalar.utf16[1] + expectTrue(UTF16.isLeadSurrogate(lead)) + expectTrue(UTF16.isSurrogate(lead)) + expectFalse(UTF16.isASCII(lead)) + + expectTrue(UTF16.isTrailSurrogate(trail)) + expectTrue(UTF16.isSurrogate(trail)) + } else { + let codeUnit = scalar.utf16[0] + expectFalse(UTF16.isLeadSurrogate(codeUnit)) + expectFalse(UTF16.isTrailSurrogate(codeUnit)) + expectFalse(UTF16.isSurrogate(codeUnit)) + + expectEqual(codeUnit <= 0x7F, UTF16.isASCII(codeUnit)) + } + + expectFalse(UTF8.isContinuation(scalar.utf8[0])) + if scalar.utf8.count == 1 { + let ascii = scalar.utf8[0] + expectFalse(UTF8.isContinuation(ascii)) + expectTrue(UTF8.isASCII(ascii)) + } else { + expectFalse(UTF8.isASCII(scalar.utf8[0])) + expectFalse(UTF8.isContinuation(scalar.utf8[0])) + for i in 1.. Date: Tue, 26 Feb 2019 11:15:10 -0800 Subject: [PATCH 4/8] [String] Add generic String.Index and range inits within a String Adds a generic version of String.Index.init?(_:within:) and Range.init?(_:in:). 
Tests added --- stdlib/public/Darwin/Foundation/NSRange.swift | 24 +++++-- .../public/core/StringIndexConversions.swift | 64 ++++++++++++++++--- test/stdlib/StringIndex.swift | 38 +++++++++++ 3 files changed, 114 insertions(+), 12 deletions(-) diff --git a/stdlib/public/Darwin/Foundation/NSRange.swift b/stdlib/public/Darwin/Foundation/NSRange.swift index ce16c74cb36f6..39d3d26243c7e 100644 --- a/stdlib/public/Darwin/Foundation/NSRange.swift +++ b/stdlib/public/Darwin/Foundation/NSRange.swift @@ -175,17 +175,33 @@ extension Range where Bound == Int { } extension Range where Bound == String.Index { - public init?(_ range: NSRange, in string: __shared String) { + private init?( + _ range: NSRange, _genericIn string: __shared S + ) { + // Corresponding stdlib version + guard #available(macOS 9999, iOS 9999, tvOS 9999, watchOS 9999, *) else { + fatalError() + } let u = string.utf16 guard range.location != NSNotFound, - let start = u.index(u.startIndex, offsetBy: range.location, limitedBy: u.endIndex), - let end = u.index(u.startIndex, offsetBy: range.location + range.length, limitedBy: u.endIndex), + let start = u.index( + u.startIndex, offsetBy: range.location, limitedBy: u.endIndex), + let end = u.index( + start, offsetBy: range.length, limitedBy: u.endIndex), let lowerBound = String.Index(start, within: string), let upperBound = String.Index(end, within: string) else { return nil } - + self = lowerBound..(_ range: NSRange, in string: __shared S) { + self.init(range, _genericIn: string) + } } extension NSRange : CustomReflectable { diff --git a/stdlib/public/core/StringIndexConversions.swift b/stdlib/public/core/StringIndexConversions.swift index b9a03fbb2e006..60de4afa2da4a 100644 --- a/stdlib/public/core/StringIndexConversions.swift +++ b/stdlib/public/core/StringIndexConversions.swift @@ -11,6 +11,15 @@ //===----------------------------------------------------------------------===// extension String.Index { + private init?( + _ sourcePosition: String.Index, 
_genericWithin target: S + ) { + guard target._wholeGuts.isOnGraphemeClusterBoundary(sourcePosition) else { + return nil + } + self = sourcePosition + } + /// Creates an index in the given string that corresponds exactly to the /// specified position. /// @@ -49,14 +58,53 @@ extension String.Index { /// `sourcePosition` must be a valid index of at least one of the views /// of `target`. /// - target: The string referenced by the resulting index. - public init?( - _ sourcePosition: String.Index, - within target: String + public init?(_ sourcePosition: String.Index, within target: String) { + self.init(sourcePosition, _genericWithin: target) + } + + /// Creates an index in the given string that corresponds exactly to the + /// specified position. + /// + /// If the index passed as `sourcePosition` represents the start of an + /// extended grapheme cluster---the element type of a string---then the + /// initializer succeeds. + /// + /// The following example converts the position of the Unicode scalar `"e"` + /// into its corresponding position in the string. The character at that + /// position is the composed `"é"` character. + /// + /// let cafe = "Cafe\u{0301}" + /// print(cafe) + /// // Prints "Café" + /// + /// let scalarsIndex = cafe.unicodeScalars.firstIndex(of: "e")! + /// let stringIndex = String.Index(scalarsIndex, within: cafe)! + /// + /// print(cafe[...stringIndex]) + /// // Prints "Café" + /// + /// If the index passed as `sourcePosition` doesn't have an exact + /// corresponding position in `target`, the result of the initializer is + /// `nil`. For example, an attempt to convert the position of the combining + /// acute accent (`"\u{0301}"`) fails. Combining Unicode scalars do not have + /// their own position in a string. 
+ /// + /// let nextScalarsIndex = cafe.unicodeScalars.index(after: scalarsIndex) + /// let nextStringIndex = String.Index(nextScalarsIndex, within: cafe) + /// + /// print(nextStringIndex) + /// // Prints "nil" + /// + /// - Parameters: + /// - sourcePosition: A position in a view of the `target` parameter. + /// `sourcePosition` must be a valid index of at least one of the views + /// of `target`. + /// - target: The string referenced by the resulting index. + @available(macOS 9999, iOS 9999, tvOS 9999, watchOS 9999, *) + public init?( + _ sourcePosition: String.Index, within target: S ) { - guard target._guts.isOnGraphemeClusterBoundary(sourcePosition) else { - return nil - } - self = sourcePosition + self.init(sourcePosition, _genericWithin: target) } /// Returns the position in the given UTF-8 view that corresponds exactly to @@ -81,7 +129,7 @@ extension String.Index { /// position of a UTF-16 trailing surrogate returns `nil`. public func samePosition( in utf8: String.UTF8View - ) -> String.UTF8View.Index? { + ) -> String.UTF8View.Index? { return String.UTF8View.Index(self, within: utf8) } diff --git a/test/stdlib/StringIndex.swift b/test/stdlib/StringIndex.swift index 53329686ed29e..45d8bb310443b 100644 --- a/test/stdlib/StringIndex.swift +++ b/test/stdlib/StringIndex.swift @@ -202,6 +202,44 @@ StringIndexTests.test("Scalar Align UTF-8 indices") { expectEqual(roundedIdx, roundedIdx3) } +import Foundation +StringIndexTests.test("String.Index(_:within) / Range(_:in:)") { + guard #available(macOS 9999, iOS 9999, tvOS 9999, watchOS 9999, *) else { + return + } + let str = simpleStrings.joined() + let substr = str[...] 
+ for idx in str.utf8.indices { + expectEqual( + String.Index(idx, within: str), String.Index(idx, within: substr)) + } + + let utf16Count = str.utf16.count + let utf16Indices = Array(str.utf16.indices) + [str.utf16.endIndex] + for location in 0..(nsRange, in: str) + let substrRange = Range(nsRange, in: substr) + + expectEqual(strRange, substrRange) + guard strLB != nil && strUB != nil else { + expectNil(strRange) + continue + } + expectEqual(strRange, Range(uncheckedBounds: (strLB!, strUB!))) + } + } + } +} runAllTests() \ No newline at end of file From aa519362f36081d1cafe4e94219f8e487c010b94 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Tue, 26 Feb 2019 11:15:48 -0800 Subject: [PATCH 5/8] [String] Add Character.UTF16View and Character.UTF8View Adds these collections, which are just String's views. Tests added. --- stdlib/public/core/Character.swift | 27 ++++++++++++++------------- test/stdlib/Character.swift | 13 +++++++++++++ 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/stdlib/public/core/Character.swift b/stdlib/public/core/Character.swift index 1264ad32ba0ae..c5fb278f88f10 100644 --- a/stdlib/public/core/Character.swift +++ b/stdlib/public/core/Character.swift @@ -88,25 +88,26 @@ extension Character { } extension Character { - @usableFromInline - typealias UTF8View = String.UTF8View + /// A view of a character's contents as a collection of UTF-8 code units. See + /// String.UTF8View for more information + public typealias UTF8View = String.UTF8View + /// A UTF-8 encoding of `self`. @inlinable - internal var utf8: UTF8View { - return _str.utf8 - } - @usableFromInline - typealias UTF16View = String.UTF16View + public var utf8: UTF8View { return _str.utf8 } + + /// A view of a character's contents as a collection of UTF-16 code units. See + /// String.UTF16View for more information + public typealias UTF16View = String.UTF16View + /// A UTF-16 encoding of `self`. 
@inlinable - internal var utf16: UTF16View { - return _str.utf16 - } + public var utf16: UTF16View { return _str.utf16 } + public typealias UnicodeScalarView = String.UnicodeScalarView + @inlinable - public var unicodeScalars: UnicodeScalarView { - return _str.unicodeScalars - } + public var unicodeScalars: UnicodeScalarView { return _str.unicodeScalars } } extension Character : diff --git a/test/stdlib/Character.swift b/test/stdlib/Character.swift index da31d0a5d68a0..9a06a8ade8cfc 100644 --- a/test/stdlib/Character.swift +++ b/test/stdlib/Character.swift @@ -347,6 +347,19 @@ CharacterTests.test("String.append(_: Character)") { } } +CharacterTests.test("utf6/16/unicodescalar views") { + for c in testCharacters { + expectEqualSequence(String(c).unicodeScalars, c.unicodeScalars) + expectEqualSequence(String(c).utf8, c.utf8) + expectEqualSequence(String(c).utf16, c.utf16) + + expectEqualSequence( + String(c).unicodeScalars.reversed(), c.unicodeScalars.reversed()) + expectEqualSequence(String(c).utf8.reversed(), c.utf8.reversed()) + expectEqualSequence(String(c).utf16.reversed(), c.utf16.reversed()) + } +} + var UnicodeScalarTests = TestSuite("UnicodeScalar") UnicodeScalarTests.test("UInt8(ascii: UnicodeScalar)") { From 0ece62d9113a3f6ce77a8f6a713249508d63b9b4 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Tue, 26 Feb 2019 11:16:10 -0800 Subject: [PATCH 6/8] [String] Add Substring.base Adds Substring.base, analogous to Slice.base, to access the entire String. Tests added. 
--- stdlib/public/core/LegacyABI.swift | 6 ++++++ stdlib/public/core/StringCreate.swift | 4 ++-- stdlib/public/core/Substring.swift | 22 ++++++++++------------ test/stdlib/subString.swift | 9 +++++++++ validation-test/stdlib/String.swift | 2 +- 5 files changed, 28 insertions(+), 15 deletions(-) diff --git a/stdlib/public/core/LegacyABI.swift b/stdlib/public/core/LegacyABI.swift index 3ba450a3a93e2..6f64dfd270db8 100644 --- a/stdlib/public/core/LegacyABI.swift +++ b/stdlib/public/core/LegacyABI.swift @@ -33,3 +33,9 @@ internal func _isContinuation(_ x: UInt8) -> Bool { return UTF8.isContinuation(x) } +extension Substring { +@available(*, unavailable, renamed: "Substring.base") + @inlinable + internal var _wholeString: String { return base } +} + diff --git a/stdlib/public/core/StringCreate.swift b/stdlib/public/core/StringCreate.swift index fb7e7f9799cd4..648420e30ab7d 100644 --- a/stdlib/public/core/StringCreate.swift +++ b/stdlib/public/core/StringCreate.swift @@ -193,8 +193,8 @@ extension String { internal static func _fromSubstring( _ substring: __shared Substring ) -> String { - if substring._offsetRange == substring._wholeString._offsetRange { - return substring._wholeString + if substring._offsetRange == substring.base._offsetRange { + return substring.base } return String._copying(substring) diff --git a/stdlib/public/core/Substring.swift b/stdlib/public/core/Substring.swift index 91b124dcc921c..1d89c3b245a64 100644 --- a/stdlib/public/core/Substring.swift +++ b/stdlib/public/core/Substring.swift @@ -116,14 +116,12 @@ public struct Substring { } extension Substring { - @inlinable - internal var _wholeGuts: _StringGuts { - @inline(__always) get { return _slice.base._guts } - } - @inlinable - internal var _wholeString: String { - @inline(__always) get { return String(self._wholeGuts) } - } + /// Returns the underlying string from which this Substring was derived. 
+ @_alwaysEmitIntoClient + public var base: String { return _slice.base } + + @inlinable @inline(__always) + internal var _wholeGuts: _StringGuts { return base._guts } @inlinable internal var _offsetRange: Range { @@ -141,7 +139,7 @@ extension Substring { #else @usableFromInline @inline(never) @_effects(releasenone) internal func _invariantCheck() { - self._wholeString._invariantCheck() + self.base._invariantCheck() } #endif // INTERNAL_CHECKS_ENABLED } @@ -420,7 +418,7 @@ extension Substring { @inlinable public var utf8: UTF8View { get { - return _wholeString.utf8[startIndex.. Date: Wed, 27 Feb 2019 14:53:56 -0800 Subject: [PATCH 7/8] [String] String.Index.init(_:within:) bounds checks Bounds check the given index for String.Index's generic initializer that makes sure a passed index is a valid one for the given StringProtocol. --- stdlib/public/core/StringIndexConversions.swift | 9 ++++++--- test/stdlib/StringIndex.swift | 5 +++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/stdlib/public/core/StringIndexConversions.swift b/stdlib/public/core/StringIndexConversions.swift index 60de4afa2da4a..dfb5c351d8534 100644 --- a/stdlib/public/core/StringIndexConversions.swift +++ b/stdlib/public/core/StringIndexConversions.swift @@ -12,12 +12,15 @@ extension String.Index { private init?( - _ sourcePosition: String.Index, _genericWithin target: S + _ idx: String.Index, _genericWithin target: S ) { - guard target._wholeGuts.isOnGraphemeClusterBoundary(sourcePosition) else { + guard target._wholeGuts.isOnGraphemeClusterBoundary(idx), + idx >= target.startIndex && idx <= target.endIndex + else { return nil } - self = sourcePosition + + self = idx } /// Creates an index in the given string that corresponds exactly to the diff --git a/test/stdlib/StringIndex.swift b/test/stdlib/StringIndex.swift index 45d8bb310443b..c2fb07d841c1b 100644 --- a/test/stdlib/StringIndex.swift +++ b/test/stdlib/StringIndex.swift @@ -215,6 +215,11 @@ 
StringIndexTests.test("String.Index(_:within) / Range(_:in:)") { String.Index(idx, within: str), String.Index(idx, within: substr)) } + expectNil(String.Index(str.startIndex, within: str.dropFirst())) + expectNil(String.Index(str.endIndex, within: str.dropLast())) + expectNotNil(String.Index(str.startIndex, within: str)) + expectNotNil(String.Index(str.endIndex, within: str)) + let utf16Count = str.utf16.count let utf16Indices = Array(str.utf16.indices) + [str.utf16.endIndex] for location in 0.. Date: Mon, 1 Apr 2019 09:29:20 -0700 Subject: [PATCH 8/8] [tests] Adjust tests for Linux --- test/stdlib/StringIndex.swift | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/stdlib/StringIndex.swift b/test/stdlib/StringIndex.swift index c2fb07d841c1b..eabe0f50ece59 100644 --- a/test/stdlib/StringIndex.swift +++ b/test/stdlib/StringIndex.swift @@ -202,6 +202,7 @@ StringIndexTests.test("Scalar Align UTF-8 indices") { expectEqual(roundedIdx, roundedIdx3) } +#if _runtime(_ObjC) import Foundation StringIndexTests.test("String.Index(_:within) / Range(_:in:)") { guard #available(macOS 9999, iOS 9999, tvOS 9999, watchOS 9999, *) else { @@ -246,5 +247,6 @@ StringIndexTests.test("String.Index(_:within) / Range(_:in:)") { } } } +#endif // _runtime(_ObjC) runAllTests() \ No newline at end of file