-
Notifications
You must be signed in to change notification settings - Fork 14.4k
[libc] mbtowc implementation #145405
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[libc] mbtowc implementation #145405
Conversation
Implemented mbtowc and tests for the function.
@llvm/pr-subscribers-libc Author: None (sribee8) Changes: Implemented mbtowc and tests for the function. Full diff: https://github.com/llvm/llvm-project/pull/145405.diff 7 Files Affected:
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index e1a2a26479de9..f0e17d6a2544f 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -1248,6 +1248,7 @@ if(LLVM_LIBC_FULL_BUILD)
# wchar.h entrypoints
libc.src.wchar.mbrtowc
+ libc.src.wchar.mbtowc
libc.src.wchar.wcrtomb
)
endif()
diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml
index 0a6a75ebbbf91..d5044e1728f80 100644
--- a/libc/include/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -38,6 +38,14 @@ functions:
- type: const char *__restrict
- type: size_t
- type: mbstate_t *__restrict
+ - name: mbtowc
+ standards:
+ - stdc
+ return_type: int
+ arguments:
+ - type: wchar_t *__restrict
+ - type: const char *__restrict
+ - type: size_t
- name: wmemset
standards:
- stdc
diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt
index 476cf38f4662e..fe1a9e7cc6e17 100644
--- a/libc/src/wchar/CMakeLists.txt
+++ b/libc/src/wchar/CMakeLists.txt
@@ -65,6 +65,21 @@ add_entrypoint_object(
libc.src.__support.wchar.mbstate
)
+add_entrypoint_object(
+ mbtowc
+ SRCS
+ mbtowc.cpp
+ HDRS
+ mbtowc.h
+ DEPENDS
+ libc.hdr.types.size_t
+ libc.hdr.types.wchar_t
+ libc.src.__support.common
+ libc.src.__support.macros.config
+ libc.src.__support.wchar.mbrtowc # internal decoder called by mbtowc.cpp
+ libc.src.__support.wchar.mbstate # conversion-state type used by mbtowc.cpp
+)
+
add_entrypoint_object(
wmemset
SRCS
diff --git a/libc/src/wchar/mbtowc.cpp b/libc/src/wchar/mbtowc.cpp
new file mode 100644
index 0000000000000..128fe9e84a201
--- /dev/null
+++ b/libc/src/wchar/mbtowc.cpp
@@ -0,0 +1,34 @@
+//===-- Implementation of mbtowc -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/mbtowc.h"
+
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/wchar/mbrtowc.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+// Converts the multibyte character starting at `s` into a wide character,
+// examining at most `n` bytes.
+//
+// Parameters:
+//   pwc - destination for the decoded wide character; may be null, in which
+//         case the character is decoded but not handed back to the caller.
+//   s   - start of the multibyte sequence; may be null.
+//   n   - maximum number of bytes of `s` to examine.
+//
+// Returns:
+//   0 when `s` is null; -1 when internal::mbrtowc reports an error or an
+//   incomplete sequence (-2); otherwise the value produced by
+//   internal::mbrtowc converted to int.
+LLVM_LIBC_FUNCTION(int, mbtowc,
+                   (wchar_t *__restrict pwc, const char *__restrict s,
+                    size_t n)) {
+  // Per the C standard a null `s` queries whether the encoding is
+  // state-dependent; 0 reports that it is not.
+  if (s == nullptr)
+    return 0;
+  // The conversion state is local to this call, so no partial sequence is
+  // carried over between calls.
+  internal::mbstate internal_mbstate;
+  // `pwc` is allowed to be null. Give the decoder a scratch slot in that case
+  // so it always has a valid place to store the decoded character.
+  wchar_t scratch = 0;
+  wchar_t *dest = (pwc == nullptr) ? &scratch : pwc;
+  auto ret = internal::mbrtowc(dest, s, n, &internal_mbstate);
+  if (!ret.has_value() || static_cast<int>(ret.value()) == -2) {
+    // Encoding failure, or the first `n` bytes do not form a complete
+    // character.
+    // NOTE(review): POSIX lists EILSEQ for invalid sequences; errno is not
+    // assigned in this function -- confirm whether it should be.
+    return -1;
+  }
+  return static_cast<int>(ret.value());
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/wchar/mbtowc.h b/libc/src/wchar/mbtowc.h
new file mode 100644
index 0000000000000..f974197f81b58
--- /dev/null
+++ b/libc/src/wchar/mbtowc.h
@@ -0,0 +1,22 @@
+//===-- Implementation header for mbtowc ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_WCHAR_MBTOWC_H
+#define LLVM_LIBC_SRC_WCHAR_MBTOWC_H
+
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+// Decodes one multibyte character from `s` (at most `n` bytes) into `*pwc`.
+int mbtowc(wchar_t *__restrict pwc, const char *__restrict s, size_t n);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_WCHAR_MBTOWC_H
diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt
index c932f3632c7ab..510f01d33fbeb 100644
--- a/libc/test/src/wchar/CMakeLists.txt
+++ b/libc/test/src/wchar/CMakeLists.txt
@@ -37,6 +37,18 @@ add_libc_test(
libc.hdr.types.wchar_t
)
+add_libc_test(
+ mbtowc_test
+ SUITE
+ libc_wchar_unittests
+ SRCS
+ mbtowc_test.cpp
+ DEPENDS
+ libc.src.__support.libc_errno # NOTE(review): mbtowc_test.cpp as shown never references errno; confirm this is needed
+ libc.src.wchar.mbtowc
+ libc.hdr.types.wchar_t
+)
+
add_libc_test(
wctob_test
SUITE
diff --git a/libc/test/src/wchar/mbtowc_test.cpp b/libc/test/src/wchar/mbtowc_test.cpp
new file mode 100644
index 0000000000000..a3bd654d3ef04
--- /dev/null
+++ b/libc/test/src/wchar/mbtowc_test.cpp
@@ -0,0 +1,132 @@
+//===-- Unittests for mbtowc ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/types/wchar_t.h"
+#include "src/wchar/mbtowc.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcMBToWC, OneByte) {
+  // A single ASCII byte decodes to itself and consumes exactly one byte.
+  const char *input = "A";
+  wchar_t out[2];
+  int result = LIBC_NAMESPACE::mbtowc(out, input, 1);
+  ASSERT_EQ(static_cast<char>(out[0]), 'A');
+  ASSERT_EQ(result, 1);
+
+  // With a budget of zero bytes nothing can be decoded, so the call fails.
+  result = LIBC_NAMESPACE::mbtowc(out, input, 0);
+  ASSERT_EQ(result, -1);
+}
+
+TEST(LlvmLibcMBToWC, TwoByte) {
+  // 0xC2 0x8E is the two-byte encoding of U+008E (decimal 142).
+  const char input[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};
+  wchar_t out[2];
+  int result = LIBC_NAMESPACE::mbtowc(out, input, 2);
+  ASSERT_EQ(static_cast<int>(out[0]), 142);
+  ASSERT_EQ(result, 2);
+
+  // Only the lead byte is visible, so the character is incomplete.
+  result = LIBC_NAMESPACE::mbtowc(out, input, 1);
+  ASSERT_EQ(result, -1);
+  // The continuation byte by itself is not a valid start of a character.
+  result = LIBC_NAMESPACE::mbtowc(out, input + 1, 1);
+  ASSERT_EQ(result, -1);
+}
+
+TEST(LlvmLibcMBToWC, ThreeByte) {
+  // 0xE2 0x88 0x91 encodes U+2211, the summation sign (decimal 8721).
+  const char input[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
+                         static_cast<char>(0x91)};
+  wchar_t out[2];
+  int result = LIBC_NAMESPACE::mbtowc(out, input, 3);
+  ASSERT_EQ(static_cast<int>(out[0]), 8721);
+  ASSERT_EQ(result, 3);
+
+  // Two of the three bytes are not enough to finish the character.
+  result = LIBC_NAMESPACE::mbtowc(out, input, 2);
+  ASSERT_EQ(result, -1);
+}
+
+TEST(LlvmLibcMBToWC, FourByte) {
+  // 0xF0 0x9F 0xA4 0xA1 encodes U+1F921, the clown emoji (decimal 129313).
+  const char input[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
+                         static_cast<char>(0xA4), static_cast<char>(0xA1)};
+  wchar_t out[2];
+  int result = LIBC_NAMESPACE::mbtowc(out, input, 4);
+  ASSERT_EQ(static_cast<int>(out[0]), 129313);
+  ASSERT_EQ(result, 4);
+
+  // Two of the four bytes are not enough to finish the character.
+  result = LIBC_NAMESPACE::mbtowc(out, input, 2);
+  ASSERT_EQ(result, -1);
+}
+
+TEST(LlvmLibcMBToWC, InvalidByte) {
+  // 0x80 has the 10xxxxxx continuation pattern, so it cannot begin a
+  // character and the conversion must be rejected.
+  const char input[1] = {static_cast<char>(0x80)};
+  wchar_t out[2];
+  int result = LIBC_NAMESPACE::mbtowc(out, input, 1);
+  ASSERT_EQ(result, -1);
+}
+
+TEST(LlvmLibcMBToWC, InvalidMultiByte) {
+  // Not a valid sequence: it starts with a continuation byte.
+  const char input[4] = {static_cast<char>(0x80), static_cast<char>(0x00),
+                         static_cast<char>(0x80), static_cast<char>(0x00)};
+  wchar_t out[2];
+  // Decoding from the start is rejected.
+  int result = LIBC_NAMESPACE::mbtowc(out, input, 4);
+  ASSERT_EQ(result, -1);
+  // Starting at the second byte (0x00) yields the null wide character, for
+  // which the return value is 0.
+  result = LIBC_NAMESPACE::mbtowc(out, input + 1, 2);
+  ASSERT_EQ(result, 0);
+  ASSERT_TRUE(out[0] == L'\0');
+}
+
+TEST(LlvmLibcMBToWC, InvalidLastByte) {
+  // The final byte must be a continuation byte (10xxxxxx), but 0xC0 is
+  // 11000000, so the four-byte sequence is malformed.
+  const char input[4] = {static_cast<char>(0xF1), static_cast<char>(0x80),
+                         static_cast<char>(0x80), static_cast<char>(0xC0)};
+  wchar_t out[2];
+  // Decoding all four bytes must report an error.
+  int result = LIBC_NAMESPACE::mbtowc(out, input, 4);
+  ASSERT_EQ(result, -1);
+}
+
+TEST(LlvmLibcMBToWC, ValidTwoByteWithExtraRead) {
+  // A valid two-byte character followed by one extra byte.
+  const char input[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                         static_cast<char>(0x80)};
+  wchar_t out[2];
+  // Allowing three bytes still decodes just the leading two-byte character.
+  int result = LIBC_NAMESPACE::mbtowc(out, input, 3);
+  ASSERT_EQ(result, 2);
+  ASSERT_EQ(static_cast<int>(out[0]), 142);
+}
+
+TEST(LlvmLibcMBToWC, TwoValidTwoBytes) {
+  // Two consecutive two-byte characters: U+008E (142) then U+01CC (460).
+  const char input[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                         static_cast<char>(0xC7), static_cast<char>(0x8C)};
+  wchar_t out[2];
+  // Decode the first character into the first output slot.
+  int result = LIBC_NAMESPACE::mbtowc(out, input, 2);
+  ASSERT_EQ(result, 2);
+  ASSERT_EQ(static_cast<int>(out[0]), 142);
+  // Decode the second character into the second output slot.
+  result = LIBC_NAMESPACE::mbtowc(out + 1, input + 2, 2);
+  ASSERT_EQ(result, 2);
+  ASSERT_EQ(static_cast<int>(out[1]), 460);
+}
+
+TEST(LlvmLibcMBToWC, NullString) {
+  wchar_t out[2] = {L'O', L'K'};
+  // A null source pointer returns 0 and leaves the destination untouched.
+  int result = LIBC_NAMESPACE::mbtowc(out, nullptr, 2);
+  ASSERT_EQ(result, 0);
+  ASSERT_TRUE(out[0] == L'O');
+  // Decoding a null terminator also returns 0.
+  const char *input = "\0";
+  result = LIBC_NAMESPACE::mbtowc(out, input, 1);
+  ASSERT_EQ(result, 0);
+}
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
https://pubs.opengroup.org/onlinepubs/9799919799/functions/mbtowc.html
POSIX clarifies some of the behavior, since the C standard is very vague.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, tests pass on my machine
Implemented mbtowc and tests for the function. --------- Co-authored-by: Sriya Pratipati <sriyap@google.com>
Implemented mbtowc and tests for the function. --------- Co-authored-by: Sriya Pratipati <sriyap@google.com>
Implemented mbtowc and tests for the function.