Skip to content

Commit

Permalink
Fix recognizing locales using UTF-8 charset
Browse files Browse the repository at this point in the history
Do not assume that C locale uses UTF-8, as this is simply not true and
none of the CRT functions handle UTF-8 correctly with this locale.

Do recognize locales explicitly using UTF-8 charset as being in UTF-8.
On most Unix systems (including Linux), it didn't really matter that we
didn't do it, because we used nl_langinfo() there, but it does matter a
lot with MSVC under MSW whose CRT supports UTF-8 now, but UTF-8
functions were not used there -- do use them now.
  • Loading branch information
vadz committed Mar 28, 2023
1 parent e4e3e7e commit e48c101
Showing 1 changed file with 24 additions and 21 deletions.
45 changes: 24 additions & 21 deletions src/common/wxcrt.cpp
Expand Up @@ -1041,6 +1041,19 @@ char *strdup(const char *s)
bool wxLocaleIsUtf8 = false; // the safer setting if not known
#endif

// Return true if the given charset name designates UTF-8.
//
// The name is accepted with or without the dash and in any letter case, i.e.
// "UTF-8", "utf-8", "UTF8", "utf8" but also mixed-case spellings such as
// "Utf-8" all match. Anything else, including a null pointer or an empty
// string, yields false.
//
// The comparison deliberately folds only ASCII letters by hand instead of
// using tolower() or a CRT case-insensitive comparison, so that the result
// can't depend on the very locale whose charset is being examined.
static bool wxIsCharsetUtf8(const char* charset)
{
    if ( !charset )
        return false;

    // Match the "utf" prefix ignoring ASCII case. A too short string stops
    // the loop at its terminating NUL, so we never read past the end.
    static const char prefix[] = "utf";
    for ( const char* p = prefix; *p; ++p, ++charset )
    {
        char c = *charset;
        if ( c >= 'A' && c <= 'Z' )
            c = static_cast<char>(c - 'A' + 'a');

        if ( c != *p )
            return false;
    }

    // The dash between "UTF" and "8" is optional.
    if ( *charset == '-' )
        ++charset;

    // Nothing but the final "8" may follow.
    return charset[0] == '8' && charset[1] == '\0';
}

static bool wxIsLocaleUtf8()
{
// NB: we intentionally don't use wxLocale::GetSystemEncodingName(),
Expand All @@ -1051,31 +1064,21 @@ static bool wxIsLocaleUtf8()
// GNU libc provides current character set this way (this conforms to
// Unix98)
const char *charset = nl_langinfo(CODESET);
if ( charset )
{
// "UTF-8" is used by modern glibc versions, but test other variants
// as well, just in case:
if ( strcmp(charset, "UTF-8") == 0 ||
strcmp(charset, "utf-8") == 0 ||
strcmp(charset, "UTF8") == 0 ||
strcmp(charset, "utf8") == 0 )
{
return true;
}
}
#endif // HAVE_LANGINFO_H

// check if we're running under the "C" locale: it is 7bit subset
// of UTF-8, so it can be safely used with the UTF-8 build:
if ( charset && wxIsCharsetUtf8(charset) )
return true;
#else // !HAVE_LANGINFO_H
// check charset of the LC_CTYPE string: this also works with (sufficiently
// recent) MSVC and on any other system without nl_langinfo()
const char *lc_ctype = setlocale(LC_CTYPE, nullptr);
if ( lc_ctype &&
(strcmp(lc_ctype, "C") == 0 || strcmp(lc_ctype, "POSIX") == 0) )
if ( lc_ctype )
{
return true;
const char* charset = strrchr(lc_ctype, '.');
if ( charset && wxIsCharsetUtf8(charset + 1) )
return true;
}
#endif // HAVE_LANGINFO_H/!HAVE_LANGINFO_H

// we don't know what charset libc is using, so assume the worst
// to be safe:
// by default assume that we don't use UTF-8
return false;
}

Expand Down

0 comments on commit e48c101

Please sign in to comment.