From 2f1ed631e2fbe325a72d40fa60e6280ee1d173e4 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Fri, 6 Jan 2023 13:52:27 -0800 Subject: [PATCH] [stdlib] _CharacterRecognizer: Add Sendable, Equatable, CustomStringConvertible conformances Equatability allows faster implementations for updating cached grapheme boundary state after a text mutation, because it enables quick detection of before/after state equality, without having to feed the recognizers until they produce a synchronized grapheme break. The CustomStringConvertible conformance makes it orders of magnitude more pleasant to debug code that uses this. Sendable is a baseline requirement for value types these days. --- .../public/core/StringGraphemeBreaking.swift | 31 +++++++++++++++-- test/stdlib/CharacterRecognizer.swift | 33 +++++++++++++++++++ 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/stdlib/public/core/StringGraphemeBreaking.swift b/stdlib/public/core/StringGraphemeBreaking.swift index b602485b72da4..ff842039d7b47 100644 --- a/stdlib/public/core/StringGraphemeBreaking.swift +++ b/stdlib/public/core/StringGraphemeBreaking.swift @@ -407,7 +407,7 @@ extension Unicode.Scalar { } } -internal struct _GraphemeBreakingState { +internal struct _GraphemeBreakingState: Sendable, Equatable { // When we're looking through an indic sequence, one of the requirements is // that there is at LEAST 1 Virama present between two linking consonants. // This value helps ensure that when we ultimately need to decide whether or @@ -436,6 +436,18 @@ internal struct _GraphemeBreakingState { var shouldBreakRI = false } +extension _GraphemeBreakingState: CustomStringConvertible { + var description: String { + var r = "[" + if hasSeenVirama { r += "V" } + if isInEmojiSequence { r += "E" } + if isInIndicSequence { r += "I" } + if shouldBreakRI { r += "R" } + r += "]" + return r + } +} + extension Unicode { /// A state machine for recognizing character (i.e., extended grapheme /// cluster) boundaries in an arbitrary series of Unicode scalars. @@ -448,7 +460,7 @@ extension Unicode { /// `String` splits its contents into `Character` values. @available(SwiftStdlib 5.8, *) public // SPI(Foundation) FIXME: We need API for this - struct _CharacterRecognizer { + struct _CharacterRecognizer: Sendable { internal var _previous: Unicode.Scalar internal var _state: _GraphemeBreakingState @@ -547,6 +559,21 @@ extension Unicode { } } +@available(SwiftStdlib 5.8, *) +extension Unicode._CharacterRecognizer: Equatable { + public static func ==(left: Self, right: Self) -> Bool { + left._previous == right._previous && left._state == right._state + } +} + +@available(SwiftStdlib 5.8, *) +extension Unicode._CharacterRecognizer: CustomStringConvertible { + public var description: String { + return "\(_state)U+\(String(_previous.value, radix: 16, uppercase: true))" + } +} + + extension _StringGuts { // Returns the stride of the grapheme cluster starting at offset `index`, // assuming it is on a grapheme cluster boundary. diff --git a/test/stdlib/CharacterRecognizer.swift b/test/stdlib/CharacterRecognizer.swift index 6bacbaaacac54..12816c6a164f7 100644 --- a/test/stdlib/CharacterRecognizer.swift +++ b/test/stdlib/CharacterRecognizer.swift @@ -114,3 +114,36 @@ if #available(SwiftStdlib 5.8, *) { """) } } + +if #available(SwiftStdlib 5.8, *) { + suite.test("Equatable") { + var r1 = Unicode._CharacterRecognizer() + var r2 = Unicode._CharacterRecognizer() + expectEqual(r1, r2) + expectTrue(r1.hasBreak(before: "a")) + expectNotEqual(r1, r2) + expectTrue(r2.hasBreak(before: "a")) + expectEqual(r1, r2) + expectTrue(r2.hasBreak(before: "\u{1f44f}")) // CLAPPING HANDS SIGN + expectNotEqual(r1, r2) + expectTrue(r1.hasBreak(before: "b")) + expectNotEqual(r1, r2) + expectFalse(r2.hasBreak(before: "\u{1f3fc}")) // EMOJI MODIFIER FITZPATRICK TYPE-3 + expectNotEqual(r1, r2) + expectTrue(r2.hasBreak(before: "b")) + expectEqual(r1, r2) // breaks should reset state + } +} + +if #available(SwiftStdlib 5.8, *) { + suite.test("CustomStringConvertible") { + var r = Unicode._CharacterRecognizer() + expectEqual("\(r)", "[]U+0") + expectTrue(r.hasBreak(before: "\u{1F1FA}")) // REGIONAL INDICATOR SYMBOL LETTER U + expectEqual("\(r)", "[]U+1F1FA") + expectFalse(r.hasBreak(before: "\u{1F1F8}")) // REGIONAL INDICATOR SYMBOL LETTER S + expectEqual("\(r)", "[R]U+1F1F8") + expectTrue(r.hasBreak(before: "$")) + expectEqual("\(r)", "[]U+24") + } +}