diff --git a/stdlib/public/core/String.swift b/stdlib/public/core/String.swift
index 1384626a33a7f..37f71928d3e94 100644
--- a/stdlib/public/core/String.swift
+++ b/stdlib/public/core/String.swift
@@ -493,6 +493,116 @@ extension String {
     self = String._fromNonContiguousUnsafeBitcastUTF8Repairing(codeUnits).0
   }
 
+  /// Creates a new string by copying and validating the sequence of
+  /// code units passed in, according to the specified encoding.
+  ///
+  /// This initializer does not try to repair ill-formed code unit sequences.
+  /// If any are found, the result of the initializer is `nil`.
+  ///
+  /// The following example calls this initializer with the contents of two
+  /// different arrays---first with a well-formed UTF-8 code unit sequence and
+  /// then with an ill-formed UTF-16 code unit sequence.
+  ///
+  ///     let validUTF8: [UInt8] = [67, 97, 102, 195, 169]
+  ///     let valid = String(validating: validUTF8, as: UTF8.self)
+  ///     print(valid ?? "nil")
+  ///     // Prints "Café"
+  ///
+  ///     let invalidUTF16: [UInt16] = [0x41, 0x42, 0xd801]
+  ///     let invalid = String(validating: invalidUTF16, as: UTF16.self)
+  ///     print(invalid ?? "nil")
+  ///     // Prints "nil"
+  ///
+  /// - Parameters:
+  ///   - codeUnits: A sequence of code units that encode a `String`.
+  ///   - encoding: A conformer to `Unicode.Encoding` to be used
+  ///               to decode `codeUnits`.
+  @inlinable
+  @available(SwiftStdlib 5.11, *)
+  public init?<Encoding: Unicode.Encoding>(
+    validating codeUnits: some Sequence<Encoding.CodeUnit>,
+    as encoding: Encoding.Type
+  ) {
+    let contiguousResult = codeUnits.withContiguousStorageIfAvailable {
+      String._validate($0, as: Encoding.self)
+    }
+    if let validationResult = contiguousResult {
+      guard let validatedString = validationResult else {
+        return nil
+      }
+      self = validatedString
+      return
+    }
+
+    // slow-path: no contiguous storage, so transcode element by element
+    var transcoded: [UTF8.CodeUnit] = []
+    transcoded.reserveCapacity(codeUnits.underestimatedCount)
+    var isASCII = true
+    let error = transcode(
+      codeUnits.makeIterator(),
+      from: Encoding.self,
+      to: UTF8.self,
+      stoppingOnError: true,
+      into: {
+        uint8 in
+        transcoded.append(uint8)
+        if isASCII && (uint8 & 0x80) == 0x80 { isASCII = false }
+      }
+    )
+    if error { return nil }
+    self = transcoded.withUnsafeBufferPointer {
+      String._uncheckedFromUTF8($0, asciiPreScanResult: isASCII)
+    }
+  }
+
+  /// Creates a new string by copying and validating the sequence of
+  /// code units passed in, according to the specified encoding.
+  ///
+  /// This initializer does not try to repair ill-formed code unit sequences.
+  /// If any are found, the result of the initializer is `nil`.
+  ///
+  /// The following example calls this initializer with the contents of two
+  /// different arrays---first with a well-formed UTF-8 code unit sequence and
+  /// then with an ill-formed ASCII code unit sequence.
+  ///
+  ///     let validUTF8: [Int8] = [67, 97, 102, -61, -87]
+  ///     let valid = String(validating: validUTF8, as: UTF8.self)
+  ///     print(valid ?? "nil")
+  ///     // Prints "Café"
+  ///
+  ///     let invalidASCII: [Int8] = [67, 97, -5]
+  ///     let invalid = String(validating: invalidASCII, as: Unicode.ASCII.self)
+  ///     print(invalid ?? "nil")
+  ///     // Prints "nil"
+  ///
+  /// - Parameters:
+  ///   - codeUnits: A sequence of code units that encode a `String`.
+  ///   - encoding: A conformer to `Unicode.Encoding` that can decode
+  ///               `codeUnits` as `UInt8`.
+  @inlinable
+  @available(SwiftStdlib 5.11, *)
+  public init?<Encoding>(
+    validating codeUnits: some Sequence<Int8>,
+    as encoding: Encoding.Type
+  ) where Encoding: Unicode.Encoding, Encoding.CodeUnit == UInt8 {
+    let contiguousResult = codeUnits.withContiguousStorageIfAvailable {
+      $0.withMemoryRebound(to: UInt8.self) {
+        String._validate($0, as: Encoding.self)
+      }
+    }
+    if let validationResult = contiguousResult {
+      guard let validatedString = validationResult else {
+        return nil
+      }
+      self = validatedString
+      return
+    }
+
+    // slow-path: reinterpret each Int8 as UInt8 and use the generic initializer
+    let uint8s = codeUnits.lazy.map(UInt8.init(bitPattern:))
+    self.init(validating: uint8s, as: Encoding.self)
+  }
+
   /// Creates a new string with the specified capacity in UTF-8 code units, and
   /// then calls the given closure with a buffer covering the string's
   /// uninitialized memory.
diff --git a/stdlib/public/core/StringCreate.swift b/stdlib/public/core/StringCreate.swift
index 4ea7b11eb0573..7a21c51f2b81b 100644
--- a/stdlib/public/core/StringCreate.swift
+++ b/stdlib/public/core/StringCreate.swift
@@ -2,7 +2,7 @@
 //
 // This source file is part of the Swift.org open source project
 //
-// Copyright (c) 2014 - 2018 Apple Inc. and the Swift project authors
+// Copyright (c) 2014 - 2023 Apple Inc. and the Swift project authors
 // Licensed under Apache License v2.0 with Runtime Library Exception
 //
 // See https://swift.org/LICENSE.txt for license information
@@ -298,4 +298,96 @@ extension String {
       String._uncheckedFromUTF8($0)
     }
   }
+
+  /// Validates `input` as `encoding` and, when it is well-formed, returns a
+  /// `String` made from the scalars it encodes.
+  ///
+  /// UTF-8 and ASCII input is validated directly on the incoming bytes. Any
+  /// other encoding is transcoded to UTF-8 one scalar at a time into a
+  /// temporary buffer, which is then passed to
+  /// `__SharedStringStorage(_mortal:)`.
+  ///
+  /// - Parameters:
+  ///   - input: The code units to validate.
+  ///   - encoding: The encoding that `input` is expected to follow.
+  /// - Returns: The validated string, or `nil` if `input` contains an
+  ///   ill-formed sequence.
+  @usableFromInline
+  @available(SwiftStdlib 5.11, *)
+  internal static func _validate<Encoding: Unicode.Encoding>(
+    _ input: UnsafeBufferPointer<Encoding.CodeUnit>,
+    as encoding: Encoding.Type
+  ) -> String? {
+    if encoding.CodeUnit.self == UInt8.self {
+      let bytes = _identityCast(input, to: UnsafeBufferPointer<UInt8>.self)
+      if encoding.self == UTF8.self {
+        guard case .success(let info) = validateUTF8(bytes) else { return nil }
+        return String._uncheckedFromUTF8(bytes, asciiPreScanResult: info.isASCII)
+      } else if encoding.self == Unicode.ASCII.self {
+        guard _allASCII(bytes) else { return nil }
+        return String._uncheckedFromASCII(bytes)
+      }
+    }
+
+    // slow-path: transcode scalar by scalar into a temporary UTF-8 buffer
+    var isASCII = true
+    var buffer: UnsafeMutableBufferPointer<UInt8>
+    buffer = UnsafeMutableBufferPointer.allocate(capacity: input.count*3)
+    var written = buffer.startIndex
+
+    var parser = Encoding.ForwardParser()
+    var input = input.makeIterator()
+
+    transcodingLoop:
+    while true {
+      switch parser.parseScalar(from: &input) {
+      case .valid(let s):
+        let scalar = Encoding.decode(s)
+        guard let utf8 = Unicode.UTF8.encode(scalar) else {
+          // transcoding error: clean up and return nil
+          fallthrough
+        }
+        if buffer.count < written + utf8.count {
+          // Grow by half, but never to less than the pending scalar needs.
+          let newCapacity = Swift.max(
+            buffer.count + (buffer.count >> 1), written + utf8.count
+          )
+          let copy: UnsafeMutableBufferPointer<UInt8>
+          copy = UnsafeMutableBufferPointer.allocate(capacity: newCapacity)
+          let copied = copy.moveInitialize(
+            fromContentsOf: buffer.prefix(upTo: written)
+          )
+          buffer.deallocate()
+          buffer = copy
+          written = copied
+        }
+        if isASCII && utf8.count > 1 {
+          isASCII = false
+        }
+        written = buffer.suffix(from: written).initialize(fromContentsOf: utf8)
+        break
+      case .error:
+        // validation error: clean up and return nil
+        buffer.prefix(upTo: written).deinitialize()
+        buffer.deallocate()
+        return nil
+      case .emptyInput:
+        break transcodingLoop
+      }
+    }
+
+    let storage = buffer.baseAddress.map {
+      __SharedStringStorage(
+        _mortal: $0,
+        countAndFlags: _StringObject.CountAndFlags(
+          count: buffer.startIndex.distance(to: written),
+          isASCII: isASCII,
+          isNFC: isASCII,
+          isNativelyStored: false,
+          isTailAllocated: false
+        )
+      )
+    }
+    return storage?.asString
+  }
 }
diff --git
a/stdlib/public/core/StringStorage.swift b/stdlib/public/core/StringStorage.swift
index 961bf033bf288..c6ef8c99f8039 100644
--- a/stdlib/public/core/StringStorage.swift
+++ b/stdlib/public/core/StringStorage.swift
@@ -2,7 +2,7 @@
 //
 // This source file is part of the Swift.org open source project
 //
-// Copyright (c) 2014 - 2020 Apple Inc. and the Swift project authors
+// Copyright (c) 2014 - 2023 Apple Inc. and the Swift project authors
 // Licensed under Apache License v2.0 with Runtime Library Exception
 //
 // See https://swift.org/LICENSE.txt for license information
@@ -681,6 +681,8 @@ final internal class __SharedStringStorage
 
   internal var _breadcrumbs: _StringBreadcrumbs? = nil
 
+  internal var immortal = false
+
   internal var count: Int { _countAndFlags.count }
 
   internal init(
@@ -689,6 +691,7 @@ final internal class __SharedStringStorage
   ) {
     self._owner = nil
     self.start = ptr
+    self.immortal = true
 #if _pointerBitWidth(_64)
     self._countAndFlags = countAndFlags
 #elseif _pointerBitWidth(_32)
@@ -709,6 +712,40 @@ final internal class __SharedStringStorage
       return String(_StringGuts(self))
     }
   }
+
+  /// Creates storage that refers to the memory starting at `ptr`, marked as
+  /// not immortal so that `deinit` deallocates it.
+  ///
+  /// - Parameters:
+  ///   - ptr: The start of the contents. It *must* be the start of an
+  ///     allocation, because `deinit` calls `deallocate()` on it.
+  ///   - countAndFlags: The count and flags to record for the contents.
+  internal init(
+    _mortal ptr: UnsafePointer<UInt8>,
+    countAndFlags: _StringObject.CountAndFlags
+  ) {
+    self._owner = nil
+    self.start = ptr
+    self.immortal = false
+#if _pointerBitWidth(_64)
+    self._countAndFlags = countAndFlags
+#elseif _pointerBitWidth(_32)
+    self._count = countAndFlags.count
+    self._countFlags = countAndFlags.flags
+#else
+#error("Unknown platform")
+#endif
+    super.init()
+    self._invariantCheck()
+  }
+
+  /// Deallocates `start` when nothing else owns it: there is no `_owner`
+  /// and the storage was not created as immortal.
+  deinit {
+    if _owner == nil && !immortal {
+      start.deallocate()
+    }
+  }
 }
 
 extension __SharedStringStorage {
diff --git a/test/abi/macOS/arm64/stdlib.swift b/test/abi/macOS/arm64/stdlib.swift
index 4d56470f092f8..d667196e36486 100644
--- a/test/abi/macOS/arm64/stdlib.swift
+++ b/test/abi/macOS/arm64/stdlib.swift
@@ -45,6 +45,15 @@ Added: _$ss19_getWeakRetainCountySuyXlF
 // 
Swift._getUnownedRetainCount(Swift.AnyObject) -> Swift.UInt
 Added: _$ss22_getUnownedRetainCountySuyXlF
 
+// Swift.String.init<A, B where A: Swift._UnicodeEncoding, B: Swift.Sequence, A.CodeUnit == B.Element>(validating: B, as: A.Type) -> Swift.String?
+Added: _$sSS10validating2asSSSgq__xmtcs16_UnicodeEncodingRzSTR_7ElementQy_8CodeUnitRtzr0_lufC
+
+// Swift.String.init<A, B where A: Swift._UnicodeEncoding, B: Swift.Sequence, A.CodeUnit == Swift.UInt8, B.Element == Swift.Int8>(validating: B, as: A.Type) -> Swift.String?
+Added: _$sSS10validating2asSSSgq__xmtcs16_UnicodeEncodingRzSTR_s5UInt8V8CodeUnitRtzs4Int8V7ElementRt_r0_lufC
+
+// static Swift.String._validate<A where A: Swift._UnicodeEncoding>(_: Swift.UnsafeBufferPointer<A.CodeUnit>, as: A.Type) -> Swift.String?
+Added: _$sSS9_validate_2asSSSgSRy8CodeUnitQzG_xmts16_UnicodeEncodingRzlFZ
+
 // class __StaticArrayStorage
 Added: _$ss20__StaticArrayStorageC12_doNotCallMeAByt_tcfC
 Added: _$ss20__StaticArrayStorageC12_doNotCallMeAByt_tcfCTj
diff --git a/test/abi/macOS/x86_64/stdlib.swift b/test/abi/macOS/x86_64/stdlib.swift
index 71127971381dc..1b28d30dcbabb 100644
--- a/test/abi/macOS/x86_64/stdlib.swift
+++ b/test/abi/macOS/x86_64/stdlib.swift
@@ -45,6 +45,15 @@ Added: _$ss19_getWeakRetainCountySuyXlF
 // Swift._getUnownedRetainCount(Swift.AnyObject) -> Swift.UInt
 Added: _$ss22_getUnownedRetainCountySuyXlF
 
+// Swift.String.init<A, B where A: Swift._UnicodeEncoding, B: Swift.Sequence, A.CodeUnit == B.Element>(validating: B, as: A.Type) -> Swift.String?
+Added: _$sSS10validating2asSSSgq__xmtcs16_UnicodeEncodingRzSTR_7ElementQy_8CodeUnitRtzr0_lufC
+
+// Swift.String.init<A, B where A: Swift._UnicodeEncoding, B: Swift.Sequence, A.CodeUnit == Swift.UInt8, B.Element == Swift.Int8>(validating: B, as: A.Type) -> Swift.String?
+Added: _$sSS10validating2asSSSgq__xmtcs16_UnicodeEncodingRzSTR_s5UInt8V8CodeUnitRtzs4Int8V7ElementRt_r0_lufC
+
+// static Swift.String._validate<A where A: Swift._UnicodeEncoding>(_: Swift.UnsafeBufferPointer<A.CodeUnit>, as: A.Type) -> Swift.String?
+Added: _$sSS9_validate_2asSSSgSRy8CodeUnitQzG_xmts16_UnicodeEncodingRzlFZ
+
 // class __StaticArrayStorage
 Added: _$ss20__StaticArrayStorageC12_doNotCallMeAByt_tcfC
 Added: _$ss20__StaticArrayStorageC12_doNotCallMeAByt_tcfCTj
diff --git a/test/stdlib/StringCreate.swift b/test/stdlib/StringCreate.swift
index 10793058cf9b3..e53bd8416df31 100644
--- a/test/stdlib/StringCreate.swift
+++ b/test/stdlib/StringCreate.swift
@@ -143,3 +143,157 @@ if #available(SwiftStdlib 5.3, *) {
     expectEqual(str1, str6)
   }
 }
+
+// Shared inputs: one string with non-ASCII characters, one that is pure ASCII.
+let s1 = "Long string containing the characters é, ß, 🦆, and 👨‍👧‍👦."
+let s2 = "Long ascii string with no accented characters (obviously)."
+
+StringCreateTests.test("Validating.utf8")
+.skip(.custom(
+  { if #available(SwiftStdlib 5.11, *) { false } else { true } },
+  reason: "Requires Swift 5.11's standard library"
+))
+.code {
+  guard #available(SwiftStdlib 5.11, *) else { return }
+
+  let i1 = Array(s1.utf8)
+  let i2 = Array(s2.utf8)
+  // Overwrite the last 0xF0 lead byte so its continuation bytes are orphaned.
+  let i3 = {
+    var modified = i1
+    let index = modified.lastIndex(of: 240)
+    expectNotNil(index)
+    index.map { modified[$0] = 0 }
+    return modified
+  }()
+
+  for simpleString in SimpleString.allCases {
+    let expected = simpleString.rawValue
+    let actual = String(validating: expected.utf8, as: Unicode.UTF8.self)
+    expectEqual(actual, expected)
+  }
+
+  expectEqual(String(validating: i1, as: UTF8.self), s1)
+  expectEqual(String(validating: i2, as: UTF8.self), s2)
+  expectNil(String(validating: i3, as: UTF8.self))
+
+  expectEqual(String(validating: AnyCollection(i1), as: UTF8.self), s1)
+  expectEqual(String(validating: AnyCollection(i2), as: UTF8.self), s2)
+  expectNil(String(validating: AnyCollection(i3), as: UTF8.self))
+}
+
+StringCreateTests.test("Validating.utf8.from.int8")
+.skip(.custom(
+  { if #available(SwiftStdlib 5.11, *) { false } else { true } },
+  reason: "Requires Swift 5.11's standard library"
+))
+.code {
+  guard #available(SwiftStdlib 5.11, *) else { return }
+
+  let i1 = s1.utf8.map(Int8.init(bitPattern:))
+  let i2 = s2.utf8.map(Int8.init(bitPattern:))
+  // Overwrite the last 0xF0 lead byte (as Int8) to make the input ill-formed.
+  let i3 = {
+    var modified = i1
+    let index = modified.lastIndex(of: Int8(bitPattern: 240))
+    expectNotNil(index)
+    index.map { modified[$0] = 0 }
+    return modified
+  }()
+
+  expectEqual(String(validating: i1, as: UTF8.self), s1)
+  expectEqual(String(validating: i2, as: UTF8.self), s2)
+  expectNil(String(validating: i3, as: UTF8.self))
+
+  expectEqual(String(validating: AnyCollection(i1), as: UTF8.self), s1)
+  expectEqual(String(validating: AnyCollection(i2), as: UTF8.self), s2)
+  expectNil(String(validating: AnyCollection(i3), as: UTF8.self))
+}
+
+StringCreateTests.test("Validating.ascii")
+.skip(.custom(
+  { if #available(SwiftStdlib 5.11, *) { false } else { true } },
+  reason: "Requires Swift 5.11's standard library"
+))
+.code {
+  guard #available(SwiftStdlib 5.11, *) else { return }
+
+  let i1 = Array(s1.utf8)
+  let i2 = Array(s2.utf8)
+
+  expectNil(String(validating: i1, as: Unicode.ASCII.self))
+  expectEqual(String(validating: i2, as: Unicode.ASCII.self), s2)
+
+  expectNil(String(validating: AnyCollection(i1), as: Unicode.ASCII.self))
+  expectEqual(String(validating: AnySequence(i2), as: Unicode.ASCII.self), s2)
+
+  // The same bytes, reinterpreted as Int8.
+  let i3 = i1.map(Int8.init(bitPattern:))
+  let i4 = i2.map(Int8.init(bitPattern:))
+
+  expectNil(String(validating: i3, as: Unicode.ASCII.self))
+  expectEqual(String(validating: i4, as: Unicode.ASCII.self), s2)
+
+  expectNil(String(validating: AnyCollection(i3), as: Unicode.ASCII.self))
+  expectEqual(String(validating: AnySequence(i4), as: Unicode.ASCII.self), s2)
+}
+
+StringCreateTests.test("Validating.utf16")
+.skip(.custom(
+  { if #available(SwiftStdlib 5.11, *) { false } else { true } },
+  reason: "Requires Swift 5.11's standard library"
+))
+.code {
+  guard #available(SwiftStdlib 5.11, *) else { return }
+
+  let i1 = Array(s1.utf16)
+  let i2 = Array(s2.utf16)
+  // Replace the last space with an unpaired high surrogate.
+  let i3 = {
+    var modified = i1
+    let index = modified.lastIndex(of: 32)
+    expectNotNil(index)
+    index.map { modified[$0] = 0xd801 }
+    return modified
+  }()
+
+  expectEqual(String(validating: i1, as: UTF16.self), s1)
+  expectEqual(String(validating: i2, as: UTF16.self), s2)
+  expectNil(String(validating: i3, as: UTF16.self))
+
+  expectEqual(String(validating: AnySequence(i1), as: UTF16.self), s1)
+  expectEqual(String(validating: AnySequence(i2), as: UTF16.self), s2)
+  expectNil(String(validating: AnyCollection(i3), as: UTF16.self))
+}
+
+StringCreateTests.test("Validating.utf32")
+.skip(.custom(
+  { if #available(SwiftStdlib 5.11, *) { false } else { true } },
+  reason: "Requires Swift 5.11's standard library"
+))
+.code {
+  guard #available(SwiftStdlib 5.11, *) else { return }
+
+  let i1 = s1.unicodeScalars.map(\.value)
+  let i2 = s2.unicodeScalars.map(\.value)
+  // Replace the last space with a value that is not a Unicode scalar.
+  let i3 = {
+    var modified = i1
+    let index = modified.lastIndex(of: .init(bitPattern: 32))
+    expectNotNil(index)
+    index.map { modified[$0] = .max }
+    return modified
+  }()
+  let s4 = SimpleString.emoji.rawValue
+  let i4 = s4.unicodeScalars.map(\.value)
+
+  expectEqual(String(validating: i1, as: UTF32.self), s1)
+  expectEqual(String(validating: i2, as: UTF32.self), s2)
+  expectNil(String(validating: i3, as: UTF32.self))
+  expectEqual(String(validating: i4, as: UTF32.self), s4)
+
+  expectEqual(String(validating: AnySequence(i1), as: UTF32.self), s1)
+  expectEqual(String(validating: AnySequence(i2), as: UTF32.self), s2)
+  expectNil(String(validating: AnyCollection(i3), as: UTF32.self))
+  expectEqual(String(validating: AnySequence(i4), as: UTF32.self), s4)
+}