Skip to content

[libc] mbtowc implementation #145405

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jun 23, 2025
Merged

[libc] mbtowc implementation #145405

merged 4 commits into from
Jun 23, 2025

Conversation

sribee8
Copy link
Contributor

@sribee8 sribee8 commented Jun 23, 2025

Implemented mbtowc and tests for the function.

Implemented mbtowc and tests for the function.
@llvmbot llvmbot added the libc label Jun 23, 2025
@llvmbot
Copy link
Member

llvmbot commented Jun 23, 2025

@llvm/pr-subscribers-libc

Author: None (sribee8)

Changes

Implemented mbtowc and tests for the function.


Full diff: https://github.com/llvm/llvm-project/pull/145405.diff

7 Files Affected:

  • (modified) libc/config/linux/x86_64/entrypoints.txt (+1)
  • (modified) libc/include/wchar.yaml (+8)
  • (modified) libc/src/wchar/CMakeLists.txt (+15)
  • (added) libc/src/wchar/mbtowc.cpp (+34)
  • (added) libc/src/wchar/mbtowc.h (+22)
  • (modified) libc/test/src/wchar/CMakeLists.txt (+12)
  • (added) libc/test/src/wchar/mbtowc_test.cpp (+132)
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index e1a2a26479de9..f0e17d6a2544f 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -1248,6 +1248,7 @@ if(LLVM_LIBC_FULL_BUILD)
 
     # wchar.h entrypoints
     libc.src.wchar.mbrtowc
+    libc.src.wchar.mbtowc
     libc.src.wchar.wcrtomb
   )
 endif()
diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml
index 0a6a75ebbbf91..d5044e1728f80 100644
--- a/libc/include/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -38,6 +38,14 @@ functions:
       - type: const char *__restrict
       - type: size_t
       - type: mbstate_t *__restrict
+  - name: mbtowc # int mbtowc(wchar_t *restrict, const char *restrict, size_t)
+    standards:
+      - stdc
+    return_type: int
+    arguments:
+      - type: wchar_t *__restrict # destination for the converted wide character
+      - type: const char *__restrict # multibyte input to convert
+      - type: size_t # maximum number of input bytes to examine
   - name: wmemset
     standards:
       - stdc
diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt
index 476cf38f4662e..fe1a9e7cc6e17 100644
--- a/libc/src/wchar/CMakeLists.txt
+++ b/libc/src/wchar/CMakeLists.txt
@@ -65,6 +65,21 @@ add_entrypoint_object(
     libc.src.__support.wchar.mbstate
 )
 
+add_entrypoint_object( # build target for the public mbtowc entrypoint
+  mbtowc
+  SRCS
+    mbtowc.cpp
+  HDRS
+    mbtowc.h
+  DEPENDS
+    libc.hdr.types.size_t
+    libc.hdr.types.wchar_t
+    libc.src.__support.common
+    libc.src.__support.macros.config
+    libc.src.__support.wchar.mbrtowc # internal routine that mbtowc.cpp calls
+    libc.src.__support.wchar.mbstate # state type that mbtowc.cpp instantiates
+)
+
 add_entrypoint_object(
   wmemset
   SRCS
diff --git a/libc/src/wchar/mbtowc.cpp b/libc/src/wchar/mbtowc.cpp
new file mode 100644
index 0000000000000..128fe9e84a201
--- /dev/null
+++ b/libc/src/wchar/mbtowc.cpp
@@ -0,0 +1,34 @@
+//===-- Implementation of mbtowc -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/mbtowc.h"
+
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/wchar/mbrtowc.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+// Converts the multibyte character at the start of s (examining at most n
+// bytes) to a wide character.
+//
+// Returns 0 when s is nullptr. Otherwise returns the value reported by
+// internal::mbrtowc on success, or -1 when it reports an error or an
+// incomplete sequence (-2).
+//
+// POSIX permits pwc to be a null pointer (the caller only wants the return
+// value), so in that case the result is written to a local scratch object
+// instead of forwarding a null destination.
+//
+// NOTE(review): POSIX also specifies errno = EILSEQ on the failure path; this
+// function does not set errno -- confirm whether that is intended.
+LLVM_LIBC_FUNCTION(int, mbtowc,
+                   (wchar_t *__restrict pwc, const char *__restrict s,
+                    size_t n)) {
+  if (s == nullptr)
+    return 0;
+  // A fresh local state object is used for every call.
+  internal::mbstate internal_mbstate;
+  // Destination used only when the caller passes pwc == nullptr.
+  wchar_t scratch;
+  auto ret = internal::mbrtowc(pwc != nullptr ? pwc : &scratch, s, n,
+                               &internal_mbstate);
+  if (!ret.has_value() || static_cast<int>(ret.value()) == -2) {
+    // Encoding failure
+    return -1;
+  }
+  return static_cast<int>(ret.value());
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/wchar/mbtowc.h b/libc/src/wchar/mbtowc.h
new file mode 100644
index 0000000000000..f974197f81b58
--- /dev/null
+++ b/libc/src/wchar/mbtowc.h
@@ -0,0 +1,22 @@
+//===-- Implementation header for mbtowc ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_WCHAR_MBTOWC_H
+#define LLVM_LIBC_SRC_WCHAR_MBTOWC_H
+
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int mbtowc(wchar_t *__restrict pwc, const char *__restrict s, size_t n);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_WCHAR_MBTOWC_H
diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt
index c932f3632c7ab..510f01d33fbeb 100644
--- a/libc/test/src/wchar/CMakeLists.txt
+++ b/libc/test/src/wchar/CMakeLists.txt
@@ -37,6 +37,18 @@ add_libc_test(
     libc.hdr.types.wchar_t
 )
 
+add_libc_test( # unit-test target for the mbtowc entrypoint
+  mbtowc_test
+  SUITE
+    libc_wchar_unittests
+  SRCS
+    mbtowc_test.cpp
+  DEPENDS
+    libc.src.__support.libc_errno # NOTE(review): mbtowc_test.cpp as shown never references errno -- confirm this is needed
+    libc.src.wchar.mbtowc
+    libc.hdr.types.wchar_t
+)
+
 add_libc_test(
   wctob_test
   SUITE
diff --git a/libc/test/src/wchar/mbtowc_test.cpp b/libc/test/src/wchar/mbtowc_test.cpp
new file mode 100644
index 0000000000000..a3bd654d3ef04
--- /dev/null
+++ b/libc/test/src/wchar/mbtowc_test.cpp
@@ -0,0 +1,132 @@
+//===-- Unittests for mbtowc ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/types/wchar_t.h"
+#include "src/wchar/mbtowc.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcMBToWC, OneByte) { // a single ASCII byte converts to itself
+  const char *ch = "A";
+  wchar_t dest[2];
+  int n = LIBC_NAMESPACE::mbtowc(dest, ch, 1);
+  ASSERT_EQ(static_cast<char>(*dest), 'A');
+  ASSERT_EQ(n, 1);
+
+  // With n == 0 no bytes may be examined, so no character can be completed
+  n = LIBC_NAMESPACE::mbtowc(dest, ch, 0);
+  ASSERT_EQ(n, -1);
+}
+
+TEST(LlvmLibcMBToWC, TwoByte) { // a complete two-byte sequence yields 142
+  const char ch[2] = {static_cast<char>(0xC2),
+                      static_cast<char>(0x8E)}; // U+008E (decimal 142)
+  wchar_t dest[2];
+  int n = LIBC_NAMESPACE::mbtowc(dest, ch, 2);
+  ASSERT_EQ(static_cast<int>(*dest), 142);
+  ASSERT_EQ(n, 2);
+
+  // Only the lead byte may be read, so the sequence is incomplete
+  n = LIBC_NAMESPACE::mbtowc(dest, ch, 1);
+  ASSERT_EQ(n, -1);
+  // Starting at the second byte (0x8E) on its own must also fail
+  n = LIBC_NAMESPACE::mbtowc(dest, ch + 1, 1);
+  ASSERT_EQ(n, -1);
+}
+
+TEST(LlvmLibcMBToWC, ThreeByte) { // a complete three-byte sequence yields 8721
+  const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
+                      static_cast<char>(0x91)}; // ∑ summation sign (U+2211)
+  wchar_t dest[2];
+  int n = LIBC_NAMESPACE::mbtowc(dest, ch, 3);
+  ASSERT_EQ(static_cast<int>(*dest), 8721);
+  ASSERT_EQ(n, 3);
+
+  // Only 2 of the 3 bytes may be read, so the sequence is incomplete
+  n = LIBC_NAMESPACE::mbtowc(dest, ch, 2);
+  ASSERT_EQ(n, -1);
+}
+
+TEST(LlvmLibcMBToWC, FourByte) { // a complete four-byte sequence yields 129313
+  const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
+                      static_cast<char>(0xA4),
+                      static_cast<char>(0xA1)}; // 🤡 clown face (U+1F921)
+  wchar_t dest[2];
+  int n = LIBC_NAMESPACE::mbtowc(dest, ch, 4);
+  ASSERT_EQ(static_cast<int>(*dest), 129313);
+  ASSERT_EQ(n, 4);
+
+  // Only 2 of the 4 bytes may be read, so the sequence is incomplete
+  n = LIBC_NAMESPACE::mbtowc(dest, ch, 2);
+  ASSERT_EQ(n, -1);
+}
+
+TEST(LlvmLibcMBToWC, InvalidByte) { // a lone 0x80 byte must be rejected
+  const char ch[1] = {static_cast<char>(0x80)};
+  wchar_t dest[2];
+  int n = LIBC_NAMESPACE::mbtowc(dest, ch, 1);
+  ASSERT_EQ(n, -1);
+}
+
+TEST(LlvmLibcMBToWC, InvalidMultiByte) { // bad first byte fails; 0x00 decodes
+  const char ch[4] = {static_cast<char>(0x80), static_cast<char>(0x00),
+                      static_cast<char>(0x80),
+                      static_cast<char>(0x00)}; // invalid sequence of bytes
+  wchar_t dest[2];
+  // Starting at the first byte (0x80) must report an error
+  int n = LIBC_NAMESPACE::mbtowc(dest, ch, 4);
+  ASSERT_EQ(n, -1);
+  // Starting at the second byte (0x00) yields the null wide character
+  n = LIBC_NAMESPACE::mbtowc(dest, ch + 1, 2);
+  ASSERT_EQ(n, 0);
+  ASSERT_TRUE(*dest == L'\0');
+}
+
+TEST(LlvmLibcMBToWC, InvalidLastByte) { // bad trailing byte fails the sequence
+  // The last byte is invalid because it lacks the continuation-byte prefix:
+  // 0xC0 is 11000000, whereas a continuation byte must look like 10xxxxxx
+  const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0x80),
+                      static_cast<char>(0x80), static_cast<char>(0xC0)};
+  wchar_t dest[2];
+  // Converting with all 4 bytes available must report an error
+  int n = LIBC_NAMESPACE::mbtowc(dest, ch, 4);
+  ASSERT_EQ(n, -1);
+}
+
+TEST(LlvmLibcMBToWC, ValidTwoByteWithExtraRead) { // n may exceed the length
+  const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                      static_cast<char>(0x80)};
+  wchar_t dest[2];
+  // With 3 bytes available, only the leading 2-byte character is converted
+  int n = LIBC_NAMESPACE::mbtowc(dest, ch, 3);
+  ASSERT_EQ(n, 2);
+  ASSERT_EQ(static_cast<int>(*dest), 142);
+}
+
+TEST(LlvmLibcMBToWC, TwoValidTwoBytes) { // two back-to-back conversions
+  const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                      static_cast<char>(0xC7), static_cast<char>(0x8C)};
+  wchar_t dest[2];
+  int n = LIBC_NAMESPACE::mbtowc(dest, ch, 2); // first character -> dest[0]
+  ASSERT_EQ(n, 2);
+  ASSERT_EQ(static_cast<int>(*dest), 142);
+  n = LIBC_NAMESPACE::mbtowc(dest + 1, ch + 2, 2); // second -> dest[1]
+  ASSERT_EQ(n, 2);
+  ASSERT_EQ(static_cast<int>(*(dest + 1)), 460);
+}
+
+TEST(LlvmLibcMBToWC, NullString) { // null input pointer and null character
+  wchar_t dest[2] = {L'O', L'K'};
+  // A nullptr input returns 0 and must leave the destination untouched
+  int n = LIBC_NAMESPACE::mbtowc(dest, nullptr, 2);
+  ASSERT_EQ(n, 0);
+  ASSERT_TRUE(dest[0] == L'O');
+  // Converting the null terminator itself also returns 0
+  const char *ch = "\0";
+  n = LIBC_NAMESPACE::mbtowc(dest, ch, 1);
+  ASSERT_EQ(n, 0);
+}

@michaelrj-google michaelrj-google self-requested a review June 23, 2025 20:53
Copy link
Contributor

@michaelrj-google michaelrj-google left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://pubs.opengroup.org/onlinepubs/9799919799/functions/mbtowc.html

POSIX clarifies some of the behavior since the C standard is very vague.

Copy link
Contributor

@michaelrj-google michaelrj-google left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM, tests pass on my machine

@sribee8 sribee8 merged commit 10d46cf into llvm:main Jun 23, 2025
13 checks passed
DrSergei pushed a commit to DrSergei/llvm-project that referenced this pull request Jun 24, 2025
Implemented mbtowc and tests for the function.

---------

Co-authored-by: Sriya Pratipati <sriyap@google.com>
anthonyhatran pushed a commit to anthonyhatran/llvm-project that referenced this pull request Jun 26, 2025
Implemented mbtowc and tests for the function.

---------

Co-authored-by: Sriya Pratipati <sriyap@google.com>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants