Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix fmt UTF-8 character as fill #18533

Merged
merged 18 commits into from
Jan 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 61 additions & 34 deletions lib/std/fmt.zig
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ pub const FormatOptions = struct {
precision: ?usize = null,
width: ?usize = null,
alignment: Alignment = .right,
fill: u8 = ' ',
fill: u21 = ' ',
};

/// Renders fmt string with args, calling `writer` with slices of bytes.
Expand Down Expand Up @@ -211,14 +211,18 @@ fn cacheString(str: anytype) []const u8 {

pub const Placeholder = struct {
specifier_arg: []const u8,
fill: u8,
fill: u21,
alignment: Alignment,
arg: Specifier,
width: Specifier,
precision: Specifier,

pub fn parse(comptime str: anytype) Placeholder {
comptime var parser = Parser{ .buf = &str };
const view = std.unicode.Utf8View.initComptime(&str);
comptime var parser = Parser{
.buf = &str,
.iter = view.iterator(),
};

// Parse the positional argument number
const arg = comptime parser.specifier() catch |err|
Expand All @@ -230,7 +234,7 @@ pub const Placeholder = struct {
// Skip the colon, if present
if (comptime parser.char()) |ch| {
if (ch != ':') {
@compileError("expected : or }, found '" ++ [1]u8{ch} ++ "'");
@compileError("expected : or }, found '" ++ unicode.utf8EncodeComptime(ch) ++ "'");
}
}

Expand Down Expand Up @@ -265,7 +269,7 @@ pub const Placeholder = struct {
// Skip the dot, if present
if (comptime parser.char()) |ch| {
if (ch != '.') {
@compileError("expected . or }, found '" ++ [1]u8{ch} ++ "'");
@compileError("expected . or }, found '" ++ unicode.utf8EncodeComptime(ch) ++ "'");
}
}

Expand All @@ -274,7 +278,7 @@ pub const Placeholder = struct {
@compileError(@errorName(err));

if (comptime parser.char()) |ch| {
@compileError("extraneous trailing character '" ++ [1]u8{ch} ++ "'");
@compileError("extraneous trailing character '" ++ unicode.utf8EncodeComptime(ch) ++ "'");
}

return Placeholder{
Expand All @@ -297,53 +301,51 @@ pub const Specifier = union(enum) {
pub const Parser = struct {
buf: []const u8,
pos: usize = 0,
iter: std.unicode.Utf8Iterator = undefined,

// Returns a decimal number or null if the current character is not a
// digit
pub fn number(self: *@This()) ?usize {
var r: ?usize = null;

while (self.pos < self.buf.len) : (self.pos += 1) {
switch (self.buf[self.pos]) {
while (self.peek(0)) |code_point| {
switch (code_point) {
'0'...'9' => {
if (r == null) r = 0;
r.? *= 10;
r.? += self.buf[self.pos] - '0';
r.? += code_point - '0';
},
else => break,
}
_ = self.iter.nextCodepoint();
}

return r;
}

// Returns a substring of the input starting from the current position
// and ending where `ch` is found or until the end if not found
pub fn until(self: *@This(), ch: u8) []const u8 {
const start = self.pos;

if (start >= self.buf.len)
return &[_]u8{};

while (self.pos < self.buf.len) : (self.pos += 1) {
if (self.buf[self.pos] == ch) break;
pub fn until(self: *@This(), ch: u21) []const u8 {
var result: []const u8 = &[_]u8{};
while (self.peek(0)) |code_point| {
if (code_point == ch)
break;
result = result ++ (self.iter.nextCodepointSlice() orelse &[_]u8{});
}
return self.buf[start..self.pos];
return result;
}

// Returns one character, if available
pub fn char(self: *@This()) ?u8 {
if (self.pos < self.buf.len) {
const ch = self.buf[self.pos];
self.pos += 1;
return ch;
// Consumes the next code point from the iterator and returns it, or returns
// null once the input is exhausted.
pub fn char(self: *@This()) ?u21 {
    return self.iter.nextCodepoint();
}

pub fn maybe(self: *@This(), val: u8) bool {
if (self.pos < self.buf.len and self.buf[self.pos] == val) {
self.pos += 1;
pub fn maybe(self: *@This(), val: u21) bool {
if (self.peek(0) == val) {
_ = self.iter.nextCodepoint();
return true;
}
return false;
Expand All @@ -367,8 +369,17 @@ pub const Parser = struct {
}

// Returns the n-th next character or null if that's past the end
pub fn peek(self: *@This(), n: usize) ?u8 {
return if (self.pos + n < self.buf.len) self.buf[self.pos + n] else null;
// Looks ahead without consuming: returns the code point `n` positions past
// the cursor (`peek(0)` is the one the next `char()` call would yield), or
// null if the input ends before that position.
pub fn peek(self: *@This(), n: usize) ?u21 {
    // Remember the iterator offset and restore it on every exit path so that
    // peeking never moves the cursor.
    const original_i = self.iter.i;
    defer self.iter.i = original_i;

    // The counter needs an explicit type: an untyped `var i = 0` is a
    // `comptime_int` variable, which only compiles when the whole call is
    // evaluated at comptime.
    var i: usize = 0;
    var code_point: ?u21 = null;
    while (i <= n) : (i += 1) {
        code_point = self.iter.nextCodepoint() orelse return null;
    }
    return code_point;
}
};

Expand Down Expand Up @@ -965,8 +976,7 @@ pub fn formatUnicodeCodepoint(
var buf: [4]u8 = undefined;
const len = unicode.utf8Encode(c, &buf) catch |err| switch (err) {
error.Utf8CannotEncodeSurrogateHalf, error.CodepointTooLarge => {
const len = unicode.utf8Encode(unicode.replacement_character, &buf) catch unreachable;
return formatBuf(buf[0..len], options, writer);
return formatBuf(&unicode.utf8EncodeComptime(unicode.replacement_character), options, writer);
},
};
return formatBuf(buf[0..len], options, writer);
Expand All @@ -985,20 +995,28 @@ pub fn formatBuf(
if (padding == 0)
return writer.writeAll(buf);

var fill_buffer: [4]u8 = undefined;
const fill_utf8 = if (unicode.utf8Encode(options.fill, &fill_buffer)) |len|
fill_buffer[0..len]
else |err| switch (err) {
error.Utf8CannotEncodeSurrogateHalf,
error.CodepointTooLarge,
=> &unicode.utf8EncodeComptime(unicode.replacement_character),
};
switch (options.alignment) {
.left => {
try writer.writeAll(buf);
try writer.writeByteNTimes(options.fill, padding);
try writer.writeBytesNTimes(fill_utf8, padding);
},
.center => {
const left_padding = padding / 2;
const right_padding = (padding + 1) / 2;
try writer.writeByteNTimes(options.fill, left_padding);
try writer.writeBytesNTimes(fill_utf8, left_padding);
try writer.writeAll(buf);
try writer.writeByteNTimes(options.fill, right_padding);
try writer.writeBytesNTimes(fill_utf8, right_padding);
},
.right => {
try writer.writeByteNTimes(options.fill, padding);
try writer.writeBytesNTimes(fill_utf8, padding);
try writer.writeAll(buf);
},
}
Expand Down Expand Up @@ -2793,6 +2811,15 @@ test "padding" {
try expectFmt("a====", "{c:=<5}", .{'a'});
}

test "padding fill char utf" {
    // A string argument padded to width 10 with a multi-byte fill character,
    // once per alignment.
    const word = "crêpe";
    try expectFmt("──crêpe───", "{s:─^10}", .{word});
    try expectFmt("─────crêpe", "{s:─>10}", .{word});
    try expectFmt("crêpe─────", "{s:─<10}", .{word});

    // A single character argument padded to width 5 with the same fill.
    const letter = 'a';
    try expectFmt("────a", "{c:─>5}", .{letter});
    try expectFmt("──a──", "{c:─^5}", .{letter});
    try expectFmt("a────", "{c:─<5}", .{letter});
}

test "decimal float padding" {
const number: f32 = 3.1415;
try expectFmt("left-pad: **3.141\n", "left-pad: {d:*>7.3}\n", .{number});
Expand Down
7 changes: 7 additions & 0 deletions lib/std/io/writer.zig
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,13 @@ pub fn Writer(
}
}

/// Writes the entire `bytes` slice `n` times in a row. Returns the first
/// error reported by `writeAll`, leaving the remaining repetitions unwritten.
pub fn writeBytesNTimes(self: Self, bytes: []const u8, n: usize) Error!void {
    for (0..n) |_| {
        try self.writeAll(bytes);
    }
}

pub inline fn writeInt(self: Self, comptime T: type, value: T, endian: std.builtin.Endian) Error!void {
var bytes: [@divExact(@typeInfo(T).Int.bits, 8)]u8 = undefined;
mem.writeInt(std.math.ByteAlignedInt(@TypeOf(value)), &bytes, value, endian);
Expand Down
20 changes: 20 additions & 0 deletions lib/std/unicode.zig
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,19 @@ pub fn utf8Encode(c: u21, out: []u8) !u3 {
return length;
}

/// Encodes the comptime-known code point `c` as UTF-8 and returns the bytes
/// as an array whose length is the value reported by
/// `utf8CodepointSequenceLength(c)`.
/// If that length lookup or the call to `utf8Encode` fails, compilation stops
/// with a compile error whose message is the name of the error.
pub inline fn utf8EncodeComptime(comptime c: u21) [
utf8CodepointSequenceLength(c) catch |err|
@compileError(@errorName(err))
]u8 {
// The same length lookup already succeeded while the return type above was
// being resolved, so it is treated as infallible here.
comptime var result: [
utf8CodepointSequenceLength(c) catch
unreachable
]u8 = undefined;
// Encode into `result` and check that the number of bytes written matches
// the array length computed above.
comptime assert((utf8Encode(c, &result) catch |err|
@compileError(@errorName(err))) == result.len);
return result;
}

const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error;

/// Decodes the UTF-8 codepoint encoded in the given slice of bytes.
Expand Down Expand Up @@ -525,6 +538,13 @@ fn testUtf8Encode() !void {
try testing.expect(array[3] == 0b10001000);
}

test "utf8 encode comptime" {
    // Each entry pairs the expected UTF-8 bytes with the code point to encode.
    // `inline for` keeps the code point comptime-known, as the function requires.
    const cases = .{
        .{ "€", '€' },
        .{ "$", '$' },
        .{ "¢", '¢' },
        .{ "𐍈", '𐍈' },
    };
    inline for (cases) |case| {
        try testing.expectEqualSlices(u8, case[0], &utf8EncodeComptime(case[1]));
    }
}

test "utf8 encode error" {
try comptime testUtf8EncodeError();
try testUtf8EncodeError();
Expand Down