From 8ef434d7519e6d21ff0e9fb7d3b2ca851f9c2b99 Mon Sep 17 00:00:00 2001 From: mattsu Date: Thu, 9 Apr 2026 19:06:29 +0900 Subject: [PATCH 1/2] fix(ls): use GetACP to detect UTF-8 encoding on Windows On Windows, locale environment variables (LC_ALL, LC_COLLATE, LANG) are typically unset, causing get_locale_from_env() to default to UEncoding::Ascii. This makes non-ASCII filenames display as octal escape sequences or `?` characters in ls output. Fix by querying the system ANSI code page via GetACP() when no locale variables are set. If the active code page is 65001 (UTF-8), use UEncoding::Utf8. This aligns with GNU coreutils' gnulib approach which calls locale_charset() -> GetACP() on Windows. Fixes: uutils#11103 --- src/uucore/src/lib/features/i18n/mod.rs | 32 +++++++++- tests/by-util/test_ls.rs | 78 +++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 2 deletions(-) diff --git a/src/uucore/src/lib/features/i18n/mod.rs b/src/uucore/src/lib/features/i18n/mod.rs index 282baf2e768..9989aed5569 100644 --- a/src/uucore/src/lib/features/i18n/mod.rs +++ b/src/uucore/src/lib/features/i18n/mod.rs @@ -28,6 +28,24 @@ pub enum UEncoding { // This ensures real locales like "en-US" won't match const DEFAULT_LOCALE: Locale = locale!("und"); +/// On Windows, detect the encoding from the system ANSI code page. +/// Returns `UEncoding::Utf8` if the active code page is 65001 (UTF-8), +/// otherwise `UEncoding::Ascii`. +/// +/// This mirrors the GNU lib approach where `locale_charset()` calls `GetACP()` on Windows. +#[cfg(target_os = "windows")] +fn get_windows_encoding() -> UEncoding { + unsafe extern "system" { + fn GetACP() -> u32; + } + let acp = unsafe { GetACP() }; + if acp == 65001 { + UEncoding::Utf8 + } else { + UEncoding::Ascii + } +} + /// Look at 3 environment variables in the following order /// /// 1. 
LC_ALL @@ -70,8 +88,18 @@ pub fn get_locale_from_env(locale_name: &str) -> (Locale, UEncoding) { return (locale, encoding); } } - // Default POSIX locale representing LC_ALL=C - (DEFAULT_LOCALE, UEncoding::Ascii) + // No locale environment variables set. + // On Windows, check the system ANSI code page to determine encoding, + // matching GNU coreutils' approach (locale_charset -> GetACP). + #[cfg(target_os = "windows")] + { + (DEFAULT_LOCALE, get_windows_encoding()) + } + #[cfg(not(target_os = "windows"))] + { + // Default POSIX locale representing LC_ALL=C + (DEFAULT_LOCALE, UEncoding::Ascii) + } } /// Get the collating locale from the environment diff --git a/tests/by-util/test_ls.rs b/tests/by-util/test_ls.rs index fb9a00fa2df..67b17a96f2d 100644 --- a/tests/by-util/test_ls.rs +++ b/tests/by-util/test_ls.rs @@ -7195,3 +7195,81 @@ fn test_ls_a_dotdot_no_error_on_wasi() { .stdout_contains("..") .no_stderr(); } + +/// Verify that ls correctly detects encoding from locale environment variables. +/// Non-ASCII filenames should be escaped in C/POSIX/non-UTF-8 locales +/// and displayed as-is in UTF-8 locales. +#[cfg(not(any(target_vendor = "apple", target_os = "windows", target_os = "openbsd")))] +mod locale_encoding { + use uutests::util::TestScenario; + use uutests::util_name; + + /// Create a file with a non-ASCII name and check ls output with the given locale. + /// If `expect_utf8` is true, assert the filename is shown as-is (UTF-8 locale). + /// Otherwise, assert the non-ASCII character is escaped (ASCII locale). 
+ fn check_locale(locale: &str, expect_utf8: bool) { + let scene = TestScenario::new(util_name!()); + let at = &scene.fixtures; + let filename = uucore::os_str_from_bytes("é".as_bytes()) + .expect("should be valid Unicode"); + at.touch(filename); + + let result = scene + .ucmd() + .env("LC_ALL", locale) + .arg("--quoting-style=shell-escape") + .succeeds(); + + if expect_utf8 { + result.stdout_contains("é"); + } else { + result.stdout_does_not_contain("é"); + } + } + + #[test] + fn test_ls_locale_c_escapes_non_ascii() { + check_locale("C", false); + } + + #[test] + fn test_ls_locale_posix_escapes_non_ascii() { + check_locale("POSIX", false); + } + + #[test] + fn test_ls_locale_utf8_suffix_shows_non_ascii() { + check_locale("en_US.UTF-8", true); + } + + #[test] + fn test_ls_locale_utf8_lowercase_shows_non_ascii() { + check_locale("en_US.utf8", true); + } + + #[test] + fn test_ls_locale_iso8859_escapes_non_ascii() { + check_locale("en_US.ISO-8859-1", false); + } + + #[test] + fn test_ls_locale_no_encoding_suffix_escapes_non_ascii() { + check_locale("en_US", false); + } +} + +/// On Windows, verify that ls can display non-ASCII filenames correctly +/// when the system ANSI code page is set to UTF-8 (ACP 65001). +#[cfg(target_os = "windows")] +#[test] +fn test_ls_windows_non_ascii_filename() { + let scene = TestScenario::new(util_name!()); + let at = &scene.fixtures; + at.touch("文件1"); + + scene + .ucmd() + .succeeds() + .stdout_contains("文件1") + .no_stderr(); +} From b423916134f5941c47e8d1a929257d00dd7c4456 Mon Sep 17 00:00:00 2001 From: mattsu Date: Thu, 9 Apr 2026 19:10:02 +0900 Subject: [PATCH 2/2] refactor: simplify string handling and improve code formatting in ls tests This commit reformats two statements in the ls test module; no `expect()` call is removed and no behavior changes.
The changes include: - Joining the `os_str_from_bytes(...).expect(...)` call in `check_locale` onto a single line - Consolidating multi-line method chaining into single lines for better readability - These are purely cosmetic improvements that maintain the same functionality while making the code cleaner --- tests/by-util/test_ls.rs | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tests/by-util/test_ls.rs b/tests/by-util/test_ls.rs index 67b17a96f2d..e205aa77ee6 100644 --- a/tests/by-util/test_ls.rs +++ b/tests/by-util/test_ls.rs @@ -7210,8 +7210,7 @@ mod locale_encoding { fn check_locale(locale: &str, expect_utf8: bool) { let scene = TestScenario::new(util_name!()); let at = &scene.fixtures; - let filename = uucore::os_str_from_bytes("é".as_bytes()) - .expect("should be valid Unicode"); + let filename = uucore::os_str_from_bytes("é".as_bytes()).expect("should be valid Unicode"); at.touch(filename); let result = scene @@ -7267,9 +7266,5 @@ fn test_ls_windows_non_ascii_filename() { let at = &scene.fixtures; at.touch("文件1"); - scene - .ucmd() - .succeeds() - .stdout_contains("文件1") - .no_stderr(); + scene.ucmd().succeeds().stdout_contains("文件1").no_stderr(); }