Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 106 additions & 0 deletions stdlib/public/core/String.swift
Original file line number Diff line number Diff line change
Expand Up @@ -493,6 +493,112 @@ extension String {
self = String._fromNonContiguousUnsafeBitcastUTF8Repairing(codeUnits).0
}

/// Creates a new `String` by copying and validating the sequence of
/// code units passed in, according to the specified encoding.
///
/// Ill-formed code unit sequences are never repaired by this initializer.
/// If one is encountered, the initializer fails and the result is `nil`.
///
/// The following example calls this initializer with the contents of two
/// different arrays---first with a well-formed UTF-8 code unit sequence and
/// then with an ill-formed UTF-16 code unit sequence.
///
///     let validUTF8: [UInt8] = [67, 97, 102, 195, 169]
///     let valid = String(validating: validUTF8, as: UTF8.self)
///     print(valid)
///     // Prints "Optional("Café")"
///
///     let invalidUTF16: [UInt16] = [0x41, 0x42, 0xd801]
///     let invalid = String(validating: invalidUTF16, as: UTF16.self)
///     print(invalid)
///     // Prints "nil"
///
/// - Parameters:
///   - codeUnits: A sequence of code units that encode a `String`.
///   - encoding: A conformer to `Unicode.Encoding` to be used
///     to decode `codeUnits`.
@inlinable
public init?<Encoding: Unicode.Encoding>(
  validating codeUnits: some Sequence<Encoding.CodeUnit>,
  as encoding: Encoding.Type
) {
  // Fast path: when the sequence exposes contiguous storage, validate the
  // buffer in place. The outer optional is `nil` when no contiguous storage
  // is available; the inner optional is `nil` when validation failed.
  let contiguousResult: String?? = codeUnits.withContiguousStorageIfAvailable {
    buffer in String._validate(buffer, as: Encoding.self)
  }
  switch contiguousResult {
  case .some(.some(let validated)):
    self = validated
    return
  case .some(.none):
    return nil
  case .none:
    break
  }

  // Slow path: transcode element by element into a temporary UTF-8 array,
  // stopping at the first decoding error.
  var utf8Bytes: [UTF8.CodeUnit] = []
  utf8Bytes.reserveCapacity(codeUnits.underestimatedCount)
  var allASCII = true
  let failed = transcode(
    codeUnits.makeIterator(),
    from: Encoding.self,
    to: UTF8.self,
    stoppingOnError: true,
    into: { byte in
      utf8Bytes.append(byte)
      // A set high bit means the output contains a non-ASCII byte.
      allASCII = allASCII && (byte & 0x80) == 0
    }
  )
  guard !failed else { return nil }
  self = utf8Bytes.withUnsafeBufferPointer { validatedUTF8 in
    String._uncheckedFromUTF8(validatedUTF8, asciiPreScanResult: allASCII)
  }
}

/// Creates a new `String` by copying and validating the sequence of
/// `Int8` passed in, according to the specified encoding.
///
/// Ill-formed code unit sequences are never repaired by this initializer.
/// If one is encountered, the initializer fails and the result is `nil`.
///
/// The following example calls this initializer with the contents of two
/// different arrays---first with a well-formed UTF-8 code unit sequence and
/// then with an ill-formed ASCII code unit sequence.
///
///     let validUTF8: [Int8] = [67, 97, 102, -61, -87]
///     let valid = String(validating: validUTF8, as: UTF8.self)
///     print(valid)
///     // Prints "Optional("Café")"
///
///     let invalidASCII: [Int8] = [67, 97, -5]
///     let invalid = String(validating: invalidASCII, as: Unicode.ASCII.self)
///     print(invalid)
///     // Prints "nil"
///
/// - Parameters:
///   - codeUnits: A sequence of code units that encode a `String`.
///   - encoding: A conformer to `Unicode.Encoding` that can decode
///     `codeUnits` as `UInt8`.
@inlinable
public init?<Encoding>(
  validating codeUnits: some Sequence<Int8>,
  as encoding: Encoding.Type
) where Encoding: Unicode.Encoding, Encoding.CodeUnit == UInt8 {
  // Fast path: reinterpret contiguous `Int8` storage as `UInt8` and validate
  // it in place. The outer optional is `nil` when no contiguous storage is
  // available; the inner optional is `nil` when validation failed.
  let contiguousResult: String?? = codeUnits.withContiguousStorageIfAvailable {
    signedBytes in
    signedBytes.withMemoryRebound(to: UInt8.self) { unsignedBytes in
      String._validate(unsignedBytes, as: Encoding.self)
    }
  }
  switch contiguousResult {
  case .some(.some(let validated)):
    self = validated
    return
  case .some(.none):
    return nil
  case .none:
    break
  }

  // Slow path: lazily convert each element to its `UInt8` bit pattern and
  // defer to the initializer that accepts `UInt8` code units.
  let unsignedUnits = codeUnits.lazy.map { UInt8(bitPattern: $0) }
  guard let validated = String(validating: unsignedUnits, as: Encoding.self)
  else { return nil }
  self = validated
}

/// Creates a new string with the specified capacity in UTF-8 code units, and
/// then calls the given closure with a buffer covering the string's
/// uninitialized memory.
Expand Down
48 changes: 48 additions & 0 deletions stdlib/public/core/StringCreate.swift
Original file line number Diff line number Diff line change
Expand Up @@ -298,4 +298,52 @@ extension String {
String._uncheckedFromUTF8($0)
}
}

/// Validates `input` as a sequence of code units in `encoding` and, when it
/// is well-formed, returns a `String` holding the equivalent UTF-8 contents.
///
/// - Parameters:
///   - input: A buffer of code units to validate.
///   - encoding: The encoding used to interpret `input`.
/// - Returns: The validated string, or `nil` if `input` is ill-formed.
@usableFromInline
internal static func _validate<Encoding: Unicode.Encoding>(
  _ input: UnsafeBufferPointer<Encoding.CodeUnit>,
  as encoding: Encoding.Type
) -> String? {
  // Fast path: UTF-8 and ASCII input is already in the storage
  // representation, so it only needs to be checked, not transcoded.
  if encoding.CodeUnit.self == UInt8.self {
    let utf8 = _identityCast(input, to: UnsafeBufferPointer<UInt8>.self)
    if encoding.self == UTF8.self {
      guard case .success(let scan) = validateUTF8(utf8) else { return nil }
      return String._uncheckedFromUTF8(utf8, asciiPreScanResult: scan.isASCII)
    }
    if encoding.self == Unicode.ASCII.self {
      guard _allASCII(utf8) else { return nil }
      return String._uncheckedFromUTF8(utf8, asciiPreScanResult: true)
    }
    // Any other encoding with byte-sized code units is transcoded below.
  }

  // Slow path: transcode into a temporary UTF-8 buffer sized for the worst
  // case. A single UTF-16 code unit yields at most three UTF-8 bytes (a
  // surrogate pair yields four bytes from two units); for other encodings
  // four bytes per code unit is used as the upper bound.
  let worstCaseBytesPerUnit: Int
  if encoding.self == UTF16.self {
    worstCaseBytesPerUnit = 3
  } else {
    worstCaseBytesPerUnit = 4
  }
  return withUnsafeTemporaryAllocation(
    of: UInt8.self, capacity: input.count * worstCaseBytesPerUnit
  ) { scratch -> String? in
    var written = scratch.startIndex
    var allASCII = true
    let failed = transcode(
      input.makeIterator(),
      from: encoding.self,
      to: UTF8.self,
      stoppingOnError: true,
      into: { byte in
        scratch[written] = byte
        written += 1
        // A set high bit means the output contains a non-ASCII byte.
        allASCII = allASCII && (byte & 0x80) == 0
      }
    )
    guard !failed else { return nil }
    // Only the prefix that was actually written holds transcoded output.
    let utf8 = UnsafeBufferPointer(start: scratch.baseAddress, count: written)
    return String._uncheckedFromUTF8(utf8, asciiPreScanResult: allASCII)
  }
}
}
145 changes: 145 additions & 0 deletions test/stdlib/StringCreate.swift
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,148 @@ if #available(SwiftStdlib 5.3, *) {
expectEqual(str1, str6)
}
}

// Fixture containing non-ASCII characters (accented letters and emoji).
let s1 = "Long string containing the characters é, ß, 🦆, and 👨‍👧‍👦."
// Fixture containing ASCII characters only.
let s2 = "Long ascii string with no accented characters (obviously)."

// Checks `String(validating:as:)` with UTF-8 input supplied as `UInt8`.
StringCreateTests.test("Validating.utf8")
.skip(.custom(
  {
    guard #available(SwiftStdlib 5.10, *) else { return true }
    return false
  },
  reason: "Requires Swift 5.10's standard library"
))
.code {
  guard #available(SwiftStdlib 5.10, *) else { return }

  let mixedBytes = Array(s1.utf8)
  let asciiBytes = Array(s2.utf8)

  // Overwrite the last 0xF0 lead byte (240) with zero so the continuation
  // bytes that follow it are orphaned and the sequence becomes ill-formed.
  var brokenBytes = mixedBytes
  let lastLeadByte = brokenBytes.lastIndex(of: 240)
  expectNotNil(lastLeadByte)
  if let lastLeadByte { brokenBytes[lastLeadByte] = 0 }

  // Every sample from `SimpleString` (declared elsewhere in this file) must
  // round-trip through its UTF-8 view.
  for sample in SimpleString.allCases {
    let expected = sample.rawValue
    let decoded: String? =
      String(validating: expected.utf8, as: Unicode.UTF8.self)
    expectEqual(decoded, expected)
  }

  // Array input.
  expectEqual(String(validating: mixedBytes, as: UTF8.self), s1)
  expectEqual(String(validating: asciiBytes, as: UTF8.self), s2)
  expectNil(String(validating: brokenBytes, as: UTF8.self))

  // Type-erased input -- presumably to exercise the path taken when
  // contiguous storage is not offered; confirm against `AnyCollection`.
  expectEqual(String(validating: AnyCollection(mixedBytes), as: UTF8.self), s1)
  expectEqual(String(validating: AnyCollection(asciiBytes), as: UTF8.self), s2)
  expectNil(String(validating: AnyCollection(brokenBytes), as: UTF8.self))
}

// Checks `String(validating:as:)` with UTF-8 input supplied as `Int8`.
StringCreateTests.test("Validating.utf8.from.int8")
.skip(.custom(
  {
    guard #available(SwiftStdlib 5.10, *) else { return true }
    return false
  },
  reason: "Requires Swift 5.10's standard library"
))
.code {
  guard #available(SwiftStdlib 5.10, *) else { return }

  let mixedBytes = s1.utf8.map { Int8(bitPattern: $0) }
  let asciiBytes = s2.utf8.map { Int8(bitPattern: $0) }

  // Overwrite the last 0xF0 lead byte (240, viewed as `Int8`) with zero so
  // the continuation bytes that follow it are orphaned.
  var brokenBytes = mixedBytes
  let lastLeadByte = brokenBytes.lastIndex(of: Int8(bitPattern: 240))
  expectNotNil(lastLeadByte)
  if let lastLeadByte { brokenBytes[lastLeadByte] = 0 }

  // Array input.
  expectEqual(String(validating: mixedBytes, as: UTF8.self), s1)
  expectEqual(String(validating: asciiBytes, as: UTF8.self), s2)
  expectNil(String(validating: brokenBytes, as: UTF8.self))

  // Type-erased input -- presumably to exercise the path taken when
  // contiguous storage is not offered; confirm against `AnyCollection`.
  expectEqual(String(validating: AnyCollection(mixedBytes), as: UTF8.self), s1)
  expectEqual(String(validating: AnyCollection(asciiBytes), as: UTF8.self), s2)
  expectNil(String(validating: AnyCollection(brokenBytes), as: UTF8.self))
}

// Checks `String(validating:as:)` with `Unicode.ASCII`, for both `UInt8` and
// `Int8` input.
StringCreateTests.test("Validating.ascii")
.skip(.custom(
  {
    guard #available(SwiftStdlib 5.10, *) else { return true }
    return false
  },
  reason: "Requires Swift 5.10's standard library"
))
.code {
  guard #available(SwiftStdlib 5.10, *) else { return }

  // `s1` contains non-ASCII characters, so it must be rejected; `s2` is
  // ASCII-only, so it must be accepted.
  let nonASCIIBytes = Array(s1.utf8)
  let asciiBytes = Array(s2.utf8)

  expectNil(String(validating: nonASCIIBytes, as: Unicode.ASCII.self))
  expectEqual(String(validating: asciiBytes, as: Unicode.ASCII.self), s2)

  expectNil(
    String(validating: AnyCollection(nonASCIIBytes), as: Unicode.ASCII.self))
  expectEqual(
    String(validating: AnySequence(asciiBytes), as: Unicode.ASCII.self), s2)

  // The same checks, with the bytes reinterpreted as `Int8`.
  let nonASCIISigned = nonASCIIBytes.map { Int8(bitPattern: $0) }
  let asciiSigned = asciiBytes.map { Int8(bitPattern: $0) }

  expectNil(String(validating: nonASCIISigned, as: Unicode.ASCII.self))
  expectEqual(String(validating: asciiSigned, as: Unicode.ASCII.self), s2)

  expectNil(
    String(validating: AnyCollection(nonASCIISigned), as: Unicode.ASCII.self))
  expectEqual(
    String(validating: AnySequence(asciiSigned), as: Unicode.ASCII.self), s2)
}

// Checks `String(validating:as:)` with UTF-16 input.
StringCreateTests.test("Validating.utf16")
.skip(.custom(
  {
    guard #available(SwiftStdlib 5.10, *) else { return true }
    return false
  },
  reason: "Requires Swift 5.10's standard library"
))
.code {
  guard #available(SwiftStdlib 5.10, *) else { return }

  let mixedUnits = Array(s1.utf16)
  let asciiUnits = Array(s2.utf16)

  // Replace the last space (32) with 0xD801, a high surrogate that is not
  // followed by a low surrogate, which makes the sequence ill-formed.
  var brokenUnits = mixedUnits
  let lastSpace = brokenUnits.lastIndex(of: 32)
  expectNotNil(lastSpace)
  if let lastSpace { brokenUnits[lastSpace] = 0xd801 }

  // Array input.
  expectEqual(String(validating: mixedUnits, as: UTF16.self), s1)
  expectEqual(String(validating: asciiUnits, as: UTF16.self), s2)
  expectNil(String(validating: brokenUnits, as: UTF16.self))

  // Type-erased input.
  expectEqual(String(validating: AnySequence(mixedUnits), as: UTF16.self), s1)
  expectEqual(String(validating: AnySequence(asciiUnits), as: UTF16.self), s2)
  expectNil(String(validating: AnyCollection(brokenUnits), as: UTF16.self))
}

// Checks `String(validating:as:)` with UTF-32 input.
StringCreateTests.test("Validating.utf32")
.skip(.custom(
  {
    guard #available(SwiftStdlib 5.10, *) else { return true }
    return false
  },
  reason: "Requires Swift 5.10's standard library"
))
.code {
  guard #available(SwiftStdlib 5.10, *) else { return }

  let mixedScalars = s1.unicodeScalars.map { $0.value }
  let asciiScalars = s2.unicodeScalars.map { $0.value }

  // Replace the last space (32) with `UInt32.max`, which is outside the
  // Unicode scalar range and therefore ill-formed.
  var brokenScalars = mixedScalars
  let lastSpace = brokenScalars.lastIndex(of: UInt32(bitPattern: 32))
  expectNotNil(lastSpace)
  if let lastSpace { brokenScalars[lastSpace] = UInt32.max }

  // Array input.
  expectEqual(String(validating: mixedScalars, as: UTF32.self), s1)
  expectEqual(String(validating: asciiScalars, as: UTF32.self), s2)
  expectNil(String(validating: brokenScalars, as: UTF32.self))

  // Type-erased input.
  expectEqual(String(validating: AnySequence(mixedScalars), as: UTF32.self), s1)
  expectEqual(String(validating: AnySequence(asciiScalars), as: UTF32.self), s2)
  expectNil(String(validating: AnyCollection(brokenScalars), as: UTF32.self))
}