Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
6ae47a3
WIP vectorization for UTF16->UTF8
Catfish-Man Jul 15, 2025
246939c
Lots of fixes
Catfish-Man Jul 15, 2025
f85efe5
Fun fact: UInt16 is not the same size as UInt8
Catfish-Man Jul 16, 2025
4b84ced
See if the scalar version autovectorizes on arm64 too
Catfish-Man Jul 19, 2025
25ac970
Build fix for experiment
Catfish-Man Jul 19, 2025
931ae62
Remove arm64-specific code
Catfish-Man Jul 19, 2025
8e9f5e0
Adjust for 32 bit
Catfish-Man Jul 19, 2025
f0cee25
Stop doing size math, stop duplicating work in some cases, and delete…
Catfish-Man Jul 20, 2025
4b9be8f
Adopt the new implementation in another place, add unsafe annotations…
Catfish-Man Jul 20, 2025
f326f61
Actually detect non-ascii in the fallback path
Catfish-Man Jul 21, 2025
9263ce6
Remove pointless failed attempt at being clever
Catfish-Man Jul 21, 2025
3177957
Do it all by hand, since empirically it's a lot faster for runs of no…
Catfish-Man Jul 22, 2025
9d6d225
Merge branch 'main' into asciivec
Catfish-Man Jul 28, 2025
9dc0c96
Add a (slow) scalar fallback path, and add more unsafe annotations
Catfish-Man Jul 28, 2025
bb2437d
Fix precondition
Catfish-Man Jul 29, 2025
3105736
Optimize utf8Length to handle values < 0x1000 with the vector path
Catfish-Man Jul 29, 2025
af65484
Expand the table to 128 entries and use an InlineArray instead of a t…
Catfish-Man Jul 29, 2025
1c80d60
Fix 32 bit
Catfish-Man Jul 30, 2025
72738d2
Fix comment
Catfish-Man Jul 30, 2025
3358192
Sigh, don't get to use InlineArray after all
Catfish-Man Jul 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 69 additions & 15 deletions stdlib/public/core/StringCreate.swift
Original file line number Diff line number Diff line change
Expand Up @@ -247,26 +247,57 @@ extension String {
initializingFrom: input, isASCII: isASCII)
return storage.asString
}

/// Builds a `String` by transcoding a buffer of UTF-16 code units to UTF-8.
///
/// The UTF-8 length is computed first via `utf8Length(of:repairing:)`; the
/// result then selects between a `_SmallString` and a `__StringStorage`
/// allocation, and `transcodeUTF16ToUTF8` writes directly into whichever
/// destination was chosen.
///
/// - Parameters:
///   - input: The UTF-16 code units to transcode.
///   - repairing: Forwarded unchanged to `utf8Length` and
///     `transcodeUTF16ToUTF8`. NOTE(review): presumably controls whether
///     invalid UTF-16 is replaced rather than rejected — confirm against
///     those helpers.
/// - Returns: The new string plus the `repairsMade` flag reported by
///   `transcodeUTF16ToUTF8`, or `nil` when `utf8Length` returns `nil`.
internal static func _fromUTF16(
_ input: UnsafeBufferPointer<UInt16>,
repairing: Bool = true
) -> (String, repairsMade: Bool)? {
// Empty input short-circuits before any length computation.
if input.isEmpty { return ("", repairsMade: false) }
// Pre-compute the UTF-8 byte count and ASCII-ness; a nil result is
// propagated to the caller as nil.
guard let (utf8Len, isASCII) = unsafe utf8Length(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Once we're ready to push forward on this again, a test I just ran locally suggests that adding a callback to resize the allocation if needed (guess all-ascii, then resize by 2x, then by 1.5x) is actually quite a bit faster than sizing up front.

We'll see if I can make that work for String, will require a bit of shenanigans probably.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fun option: port String growth-on-append to realloc then use that

of: input,
repairing: repairing
) else {
return nil
}
// Written from inside the initializer closures below, which can only hand
// back the code-unit count through their return value.
var repairsMade = false
// Small path: the transcoded bytes fit within `_SmallString.capacity`.
if utf8Len <= _SmallString.capacity {
let smol = unsafe _SmallString(initializingUTF8With: {
let (count, tmpRepairsMade) = unsafe transcodeUTF16ToUTF8(
UTF16CodeUnits: input,
into: $0,
repairing: repairing
)
repairsMade = tmpRepairsMade
return count
})
return (String(_StringGuts(smol)), repairsMade: repairsMade)
}
// Large path: create storage with capacity `utf8Len` and transcode
// straight into its uninitialized buffer.
let result = unsafe __StringStorage.create(
uninitializedCodeUnitCapacity: utf8Len,
initializingUncheckedUTF8With: { buffer -> Int in
let (count, tmpRepairsMade) = unsafe transcodeUTF16ToUTF8(
UTF16CodeUnits: input,
into: buffer,
repairing: repairing
)
repairsMade = tmpRepairsMade
return count
}
)
// Keep the existing count and record the ASCII flag computed by
// `utf8Length`.
result._updateCountAndFlags(
newCount: result.count,
newIsASCII: isASCII
)
return (result.asString, repairsMade: repairsMade)
}

/// Creates a `String` from UTF-16 code units that the caller asserts are
/// already well-formed.
///
/// Delegates to `_fromUTF16(_:repairing:)`, which transcodes directly into
/// small-string or `__StringStorage` memory, so no intermediate `[UInt8]`
/// buffer is built.
///
/// - Parameter input: The UTF-16 code units to transcode.
/// - Returns: The transcoded string.
@usableFromInline
internal static func _uncheckedFromUTF16(
  _ input: UnsafeBufferPointer<UInt16>
) -> String {
  // NOTE(review): the force-unwrap assumes `_fromUTF16` never returns nil
  // when `repairing` is true — confirm against `utf8Length`.
  let (result, repaired) = unsafe _fromUTF16(input, repairing: true)!
  // "Unchecked" means the input was promised to be valid, so any repair
  // indicates a broken caller contract.
  _internalInvariant(!repaired, "Error present")
  return result
}

@inline(never) // slow path
Expand Down Expand Up @@ -311,7 +342,30 @@ extension String {
repair: Bool
) -> (String, repairsMade: Bool)?
where Input.Element == Encoding.CodeUnit {
guard _fastPath(encoding == Unicode.ASCII.self) else {
if encoding != Unicode.ASCII.self {
if encoding == Unicode.UTF16.self {
if let str = input.withContiguousStorageIfAvailable({ buffer in
unsafe _fromUTF16(
UnsafeRawBufferPointer(buffer).assumingMemoryBound(to: UInt16.self),
repairing: repair
)
}) {
return str
}
#if !$Embedded
if let contigBytes = input as? _HasContiguousBytes,
contigBytes._providesContiguousBytesNoCopy {
if let str = contigBytes.withUnsafeBytes({ buffer in
unsafe _fromUTF16(
buffer.assumingMemoryBound(to: UInt16.self),
repairing: repair
)
}) {
return str
}
}
#endif
}
return _slowFromCodeUnits(input, encoding: encoding, repair: repair)
}

Expand Down
Loading