From 6d77eb5b14bed5747711043d4a18318dd69a39bf Mon Sep 17 00:00:00 2001 From: Guillaume Lessard Date: Fri, 8 Sep 2023 15:09:09 -0700 Subject: [PATCH 1/2] [se-0405] adapt implementation from staging package --- stdlib/public/core/String.swift | 106 ++++++++++++++++++++++++++ stdlib/public/core/StringCreate.swift | 48 ++++++++++++ 2 files changed, 154 insertions(+) diff --git a/stdlib/public/core/String.swift b/stdlib/public/core/String.swift index 1384626a33a7f..5a550b26ca148 100644 --- a/stdlib/public/core/String.swift +++ b/stdlib/public/core/String.swift @@ -493,6 +493,112 @@ extension String { self = String._fromNonContiguousUnsafeBitcastUTF8Repairing(codeUnits).0 } + /// Creates a new `String` by copying and validating the sequence of + /// code units passed in, according to the specified encoding. + /// + /// This initializer does not try to repair ill-formed code unit sequences. + /// If any are found, the result of the initializer is `nil`. + /// + /// The following example calls this initializer with the contents of two + /// different arrays---first with a well-formed UTF-8 code unit sequence and + /// then with an ill-formed UTF-16 code unit sequence. + /// + /// let validUTF8: [UInt8] = [67, 97, 102, 195, 169] + /// let valid = String(validating: validUTF8, as: UTF8.self) + /// print(valid) + /// // Prints "Optional("Café")" + /// + /// let invalidUTF16: [UInt16] = [0x41, 0x42, 0xd801] + /// let invalid = String(validating: invalidUTF16, as: UTF16.self) + /// print(invalid) + /// // Prints "nil" + /// + /// - Parameters: + /// - codeUnits: A sequence of code units that encode a `String`. + /// - encoding: A conformer to `Unicode.Encoding` to be used + /// to decode `codeUnits`. + @inlinable + public init?( + validating codeUnits: some Sequence, + as encoding: Encoding.Type + ) { + let newString: String?? 
= codeUnits.withContiguousStorageIfAvailable { + String._validate($0, as: Encoding.self) + } + if let newString { + guard let newString else { return nil } + self = newString + return + } + + // slow-path + var transcoded: [UTF8.CodeUnit] = [] + transcoded.reserveCapacity(codeUnits.underestimatedCount) + var isASCII = true + let error = transcode( + codeUnits.makeIterator(), + from: Encoding.self, + to: UTF8.self, + stoppingOnError: true, + into: { + uint8 in + transcoded.append(uint8) + if isASCII && (uint8 & 0x80) == 0x80 { isASCII = false } + } + ) + if error { return nil } + self = transcoded.withUnsafeBufferPointer{ + String._uncheckedFromUTF8($0, asciiPreScanResult: isASCII) + } + } + + /// Creates a new `String` by copying and validating the sequence of + /// `Int8` passed in, according to the specified encoding. + /// + /// This initializer does not try to repair ill-formed code unit sequences. + /// If any are found, the result of the initializer is `nil`. + /// + /// The following example calls this initializer with the contents of two + /// different arrays---first with a well-formed UTF-8 code unit sequence and + /// then with an ill-formed ASCII code unit sequence. + /// + /// let validUTF8: [Int8] = [67, 97, 102, -61, -87] + /// let valid = String(validating: validUTF8, as: UTF8.self) + /// print(valid) + /// // Prints "Optional("Café")" + /// + /// let invalidASCII: [Int8] = [67, 97, -5] + /// let invalid = String(validating: invalidASCII, as: Unicode.ASCII.self) + /// print(invalid) + /// // Prints "nil" + /// + /// - Parameters: + /// - codeUnits: A sequence of code units that encode a `String`. + /// - encoding: A conformer to `Unicode.Encoding` that can decode + /// `codeUnits` as `UInt8`. + @inlinable + public init?( + validating codeUnits: some Sequence, + as encoding: Encoding.Type + ) where Encoding: Unicode.Encoding, Encoding.CodeUnit == UInt8 { + let newString: String?? 
= codeUnits.withContiguousStorageIfAvailable { + $0.withMemoryRebound(to: UInt8.self) { + String._validate($0, as: Encoding.self) + } + } + if let newString { + guard let newString else { return nil } + self = newString + return + } + + // slow-path + let uint8s = codeUnits.lazy.map(UInt8.init(bitPattern:)) + let string = String(validating: uint8s, as: Encoding.self) + guard let string else { return nil } + self = string + } + /// Creates a new string with the specified capacity in UTF-8 code units, and /// then calls the given closure with a buffer covering the string's /// uninitialized memory. diff --git a/stdlib/public/core/StringCreate.swift b/stdlib/public/core/StringCreate.swift index 4ea7b11eb0573..26f9d335a26bc 100644 --- a/stdlib/public/core/StringCreate.swift +++ b/stdlib/public/core/StringCreate.swift @@ -298,4 +298,52 @@ extension String { String._uncheckedFromUTF8($0) } } + + @usableFromInline + internal static func _validate( + _ input: UnsafeBufferPointer, + as encoding: Encoding.Type + ) -> String? { + fast: // fast-path + if encoding.CodeUnit.self == UInt8.self { + let bytes = _identityCast(input, to: UnsafeBufferPointer.self) + let isASCII: Bool + if encoding.self == UTF8.self { + guard case .success(let info) = validateUTF8(bytes) else { return nil } + isASCII = info.isASCII + } else if encoding.self == Unicode.ASCII.self { + guard _allASCII(bytes) else { return nil } + isASCII = true + } else { + break fast + } + return String._uncheckedFromUTF8(bytes, asciiPreScanResult: isASCII) + } + + // slow-path + // this multiplier is a worst-case estimate of UTF-8 code units per input code unit + let multiplier = if encoding.self == UTF16.self { 3 } else { 4 } + return withUnsafeTemporaryAllocation( + of: UInt8.self, capacity: input.count * multiplier + ) { + output -> String? 
in + var isASCII = true + var index = output.startIndex + let error = transcode( + input.makeIterator(), + from: encoding.self, + to: UTF8.self, + stoppingOnError: true, + into: { + uint8 in + output[index] = uint8 + output.formIndex(after: &index) + if isASCII && (uint8 & 0x80) == 0x80 { isASCII = false } + } + ) + if error { return nil } + let bytes = UnsafeBufferPointer(start: output.baseAddress, count: index) + return String._uncheckedFromUTF8(bytes, asciiPreScanResult: isASCII) + } + } } From 25fe233645fa966e7e869b2a90de7c0f67fc6b65 Mon Sep 17 00:00:00 2001 From: Guillaume Lessard Date: Sat, 9 Sep 2023 14:13:55 -0700 Subject: [PATCH 2/2] [test] se-0405 input-validating String initializers --- test/stdlib/StringCreate.swift | 145 +++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) diff --git a/test/stdlib/StringCreate.swift b/test/stdlib/StringCreate.swift index 10793058cf9b3..7bca2931c64bf 100644 --- a/test/stdlib/StringCreate.swift +++ b/test/stdlib/StringCreate.swift @@ -143,3 +143,148 @@ if #available(SwiftStdlib 5.3, *) { expectEqual(str1, str6) } } + +let s1 = "Long string containing the characters é, ß, 🦆, and 👨‍👧‍👦." +let s2 = "Long ascii string with no accented characters (obviously)." + +StringCreateTests.test("Validating.utf8") +.skip(.custom( + { if #available(SwiftStdlib 5.10, *) { false } else { true } }, + reason: "Requires Swift 5.10's standard library" +)) +.code { + guard #available(SwiftStdlib 5.10, *) else { return } + + let i1 = Array(s1.utf8) + let i2 = Array(s2.utf8) + let i3 = { + var modified = i1 + let index = modified.lastIndex(of: 240) + expectNotNil(index) + index.map { modified[$0] = 0 } + return modified + }() + + var actual: String? 
+ for simpleString in SimpleString.allCases { + let expected = simpleString.rawValue + actual = String(validating: expected.utf8, as: Unicode.UTF8.self) + expectEqual(actual, expected) + } + + expectEqual(String(validating: i1, as: UTF8.self), s1) + expectEqual(String(validating: i2, as: UTF8.self), s2) + expectNil(String(validating: i3, as: UTF8.self)) + + expectEqual(String(validating: AnyCollection(i1), as: UTF8.self), s1) + expectEqual(String(validating: AnyCollection(i2), as: UTF8.self), s2) + expectNil(String(validating: AnyCollection(i3), as: UTF8.self)) +} + +StringCreateTests.test("Validating.utf8.from.int8") +.skip(.custom( + { if #available(SwiftStdlib 5.10, *) { false } else { true } }, + reason: "Requires Swift 5.10's standard library" +)) +.code { + guard #available(SwiftStdlib 5.10, *) else { return } + + let i1 = s1.utf8.map(Int8.init(bitPattern:)) + let i2 = s2.utf8.map(Int8.init(bitPattern:)) + let i3 = { + var modified = i1 + let index = modified.lastIndex(of: Int8(bitPattern: 240)) + expectNotNil(index) + index.map { modified[$0] = 0 } + return modified + }() + + expectEqual(String(validating: i1, as: UTF8.self), s1) + expectEqual(String(validating: i2, as: UTF8.self), s2) + expectNil(String(validating: i3, as: UTF8.self)) + + expectEqual(String(validating: AnyCollection(i1), as: UTF8.self), s1) + expectEqual(String(validating: AnyCollection(i2), as: UTF8.self), s2) + expectNil(String(validating: AnyCollection(i3), as: UTF8.self)) +} + +StringCreateTests.test("Validating.ascii") +.skip(.custom( + { if #available(SwiftStdlib 5.10, *) { false } else { true } }, + reason: "Requires Swift 5.10's standard library" +)) +.code { + guard #available(SwiftStdlib 5.10, *) else { return } + + let i1 = Array(s1.utf8) + let i2 = Array(s2.utf8) + + expectNil(String(validating: i1, as: Unicode.ASCII.self)) + expectEqual(String(validating: i2, as: Unicode.ASCII.self), s2) + + expectNil(String(validating: AnyCollection(i1), as: Unicode.ASCII.self)) + 
expectEqual(String(validating: AnySequence(i2), as: Unicode.ASCII.self), s2) + + let i3 = i1.map(Int8.init(bitPattern:)) + let i4 = i2.map(Int8.init(bitPattern:)) + + expectNil(String(validating: i3, as: Unicode.ASCII.self)) + expectEqual(String(validating: i4, as: Unicode.ASCII.self), s2) + + expectNil(String(validating: AnyCollection(i3), as: Unicode.ASCII.self)) + expectEqual(String(validating: AnySequence(i4), as: Unicode.ASCII.self), s2) +} + +StringCreateTests.test("Validating.utf16") +.skip(.custom( + { if #available(SwiftStdlib 5.10, *) { false } else { true } }, + reason: "Requires Swift 5.10's standard library" +)) +.code { + guard #available(SwiftStdlib 5.10, *) else { return } + + let i1 = Array(s1.utf16) + let i2 = Array(s2.utf16) + let i3 = { + var modified = i1 + let index = modified.lastIndex(of: 32) + expectNotNil(index) + index.map { modified[$0] = 0xd801 } + return modified + }() + + expectEqual(String(validating: i1, as: UTF16.self), s1) + expectEqual(String(validating: i2, as: UTF16.self), s2) + expectNil(String(validating: i3, as: UTF16.self)) + + expectEqual(String(validating: AnySequence(i1), as: UTF16.self), s1) + expectEqual(String(validating: AnySequence(i2), as: UTF16.self), s2) + expectNil(String(validating: AnyCollection(i3), as: UTF16.self)) +} + +StringCreateTests.test("Validating.utf32") +.skip(.custom( + { if #available(SwiftStdlib 5.10, *) { false } else { true } }, + reason: "Requires Swift 5.10's standard library" +)) +.code { + guard #available(SwiftStdlib 5.10, *) else { return } + + let i1 = s1.unicodeScalars.map(\.value) + let i2 = s2.unicodeScalars.map(\.value) + let i3 = { + var modified = i1 + let index = modified.lastIndex(of: .init(bitPattern: 32)) + expectNotNil(index) + index.map { modified[$0] = .max } + return modified + }() + + expectEqual(String(validating: i1, as: UTF32.self), s1) + expectEqual(String(validating: i2, as: UTF32.self), s2) + expectNil(String(validating: i3, as: UTF32.self)) + + 
expectEqual(String(validating: AnySequence(i1), as: UTF32.self), s1) + expectEqual(String(validating: AnySequence(i2), as: UTF32.self), s2) + expectNil(String(validating: AnyCollection(i3), as: UTF32.self)) +}