Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP

Loading…

Charsetconverter UTF-32 fix #3351

Closed
wants to merge 3 commits into from

1 participant

@Karlson2k
Collaborator
  • The most common iconv implementation (GNU libiconv) uses big-endian byte order and emits a BOM when the output is UTF-32. Most other (but not all) iconv implementations behave similarly. To ensure that the conversion is correct, we need to specify the output endianness explicitly.

  • On some platforms (e.g. FreeBSD) wchar_t is not Unicode-encoded, so the code checks the __STDC_ISO_10646__ macro before treating wchar_t as a UTF encoding.

  • If wchar_t is 4 bytes wide and Unicode-encoded, perform a proper conversion from wchar_t to UTF-32 (instead of a plain copy), because wchar_t holds UCS-4, which is a superset of UTF-32. In the other direction, UTF-32 can simply be copied to wchar_t.

This is an alternative, extended version of PR #3342.

@Karlson2k
Collaborator

jenkins build this please

@Karlson2k
Collaborator

Better solution: #3353

@Karlson2k Karlson2k closed this
@Karlson2k Karlson2k deleted the Karlson2k:charsetconverter_fix_02_alt branch
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Commits on Sep 29, 2013
  1. @Karlson2k

    CharsetConverter: add extra care on using wchar_t as UTF-32

    Karlson2k authored
    * check for __STDC_ISO_10646__
    * copy-convert only for UTF-32 -> wchar_t, but use converter for wchar_t -> UTF-32 (UTF-32 has more restrictions)
Commits on Sep 30, 2013
  1. @Karlson2k
  2. @Karlson2k
This page is out of date. Refresh to see the latest.
View
3  configure.in
@@ -1939,7 +1939,8 @@ AC_HEADER_SYS_WAIT
AC_CHECK_HEADERS([arpa/inet.h fcntl.h float.h inttypes.h limits.h locale.h \
malloc.h memory.h netdb.h netinet/in.h stddef.h stdint.h stdlib.h string.h \
strings.h sys/file.h sys/ioctl.h sys/mount.h sys/param.h sys/socket.h \
- sys/time.h sys/timeb.h sys/vfs.h termios.h unistd.h utime.h wchar.h wctype.h])
+ sys/time.h sys/timeb.h sys/vfs.h termios.h unistd.h utime.h wchar.h \
+ wctype.h endian.h])
AC_CHECK_HEADERS([cdio/iso9660.h],,AC_MSG_ERROR([$missing_headers]))
# Checks for typedefs, structures, and compiler characteristics.
View
1  project/VS2010Express/XBMC.vcxproj
@@ -1208,6 +1208,7 @@
<ClInclude Include="..\..\xbmc\utils\IRssObserver.h" />
<ClInclude Include="..\..\xbmc\utils\IXmlDeserializable.h" />
<ClInclude Include="..\..\xbmc\utils\LegacyPathTranslation.h" />
+ <ClInclude Include="..\..\xbmc\utils\PlatformEndian.h" />
<ClInclude Include="..\..\xbmc\utils\RssManager.h" />
<ClInclude Include="..\..\xbmc\utils\StringValidation.h" />
<ClInclude Include="..\..\xbmc\utils\uXstrings.h" />
View
3  project/VS2010Express/XBMC.vcxproj.filters
@@ -6037,6 +6037,9 @@
<ClInclude Include="..\..\xbmc\utils\uXstrings.h">
<Filter>utils</Filter>
</ClInclude>
+ <ClInclude Include="..\..\xbmc\utils\PlatformEndian.h">
+ <Filter>utils</Filter>
+ </ClInclude>
</ItemGroup>
<ItemGroup>
<ResourceCompile Include="..\..\xbmc\win32\XBMC_PC.rc">
View
60 xbmc/utils/CharsetConverter.cpp
@@ -27,28 +27,31 @@
#include "settings/Setting.h"
#include "threads/SingleLock.h"
#include "log.h"
+#include "utils/PlatformEndian.h"
#include <errno.h>
#include <iconv.h>
#if defined(TARGET_DARWIN)
- #define WCHAR_IS_UTF32 1
+ #define WCHAR_IS_ALMOST_UTF32 1
#undef WCHAR_IS_UTF16
#ifdef __POWERPC__
#define WCHAR_CHARSET "UTF-32BE"
#else
#define WCHAR_CHARSET "UTF-32LE"
#endif
+ #define UTF32_CHARSET WCHAR_CHARSET
#define UTF8_SOURCE "UTF-8-MAC"
#elif defined(TARGET_WINDOWS)
- #undef WCHAR_IS_UTF32
+ #undef WCHAR_IS_ALMOST_UTF32
#define WCHAR_IS_UTF16 1
#define WCHAR_CHARSET "UTF-16LE"
+ #define UTF32_CHARSET "UTF-32LE"
#define UTF8_SOURCE "UTF-8"
#pragma comment(lib, "libfribidi.lib")
#pragma comment(lib, "libiconv.lib")
#elif defined(TARGET_ANDROID)
- #define WCHAR_IS_UTF32 1
+ #define WCHAR_IS_ALMOST_UTF32 1
#undef WCHAR_IS_UTF16
#define UTF8_SOURCE "UTF-8"
#ifdef __BIG_ENDIAN__
@@ -56,21 +59,29 @@
#else
#define WCHAR_CHARSET "UTF-32LE"
#endif
+ #define UTF32_CHARSET WCHAR_CHARSET
#else
#define WCHAR_CHARSET "WCHAR_T"
#define UTF8_SOURCE "UTF-8"
- #ifdef HAVE_CONFIG_H
- #include "config.h"
- #endif // HAVE_CONFIG_H
- #undef WCHAR_IS_UTF32
+ #undef WCHAR_IS_ALMOST_UTF32
#undef WCHAR_IS_UTF16
- #ifdef SIZEOF_WCHAR_T
- #if SIZEOF_WCHAR_T == 4
- #define WCHAR_IS_UTF32 1
- #elif SIZEOF_WCHAR_T == 2
- #define WCHAR_IS_UTF16 1
+ #if __STDC_ISO_10646__
+ #ifdef HAVE_CONFIG_H
+ #include "config.h"
+ #endif // HAVE_CONFIG_H
+ #ifdef SIZEOF_WCHAR_T
+ #if SIZEOF_WCHAR_T == 4
+ #define WCHAR_IS_ALMOST_UTF32 1
+ #elif SIZEOF_WCHAR_T == 2
+ #define WCHAR_IS_UTF16 1
+ #endif
#endif
#endif
+ #ifdef TARGET_BIGENDIAN
+ #define UTF32_CHARSET "UTF-32BE"
+ #else
+ #define UTF32_CHARSET "UTF-32LE"
+ #endif
#endif
@@ -479,7 +490,7 @@ void CCharsetConverter::reset(void)
bool CCharsetConverter::utf8ToUtf32(const std::string& utf8StringSrc, std::u32string& utf32StringDst, bool failOnBadChar /*= true*/)
{
CSingleLock lock(m_critSection);
- return convert(m_iconvUtf8ToUtf32, 1, UTF8_SOURCE, "UTF-32", utf8StringSrc, utf32StringDst, failOnBadChar);
+ return convert(m_iconvUtf8ToUtf32, 1, UTF8_SOURCE, UTF32_CHARSET, utf8StringSrc, utf32StringDst, failOnBadChar);
}
std::u32string CCharsetConverter::utf8ToUtf32(const std::string& utf8StringSrc, bool failOnBadChar /*= true*/)
@@ -497,16 +508,16 @@ bool CCharsetConverter::utf8ToUtf32Visual(const std::string& utf8StringSrc, std:
if (!logicalToVisualBiDi(utf8StringSrc, strFlipped, FRIBIDI_UTF8, forceLTRReadingOrder ? FRIBIDI_TYPE_LTR : FRIBIDI_TYPE_PDF))
return false;
CSingleLock lock(m_critSection);
- return convert(m_iconvUtf8ToUtf32, 1, UTF8_SOURCE, "UTF-32", strFlipped, utf32StringDst, failOnBadChar);
+ return convert(m_iconvUtf8ToUtf32, 1, UTF8_SOURCE, UTF32_CHARSET, strFlipped, utf32StringDst, failOnBadChar);
}
CSingleLock lock(m_critSection);
- return convert(m_iconvUtf8ToUtf32, 1, UTF8_SOURCE, "UTF-32", utf8StringSrc, utf32StringDst, failOnBadChar);
+ return convert(m_iconvUtf8ToUtf32, 1, UTF8_SOURCE, UTF32_CHARSET, utf8StringSrc, utf32StringDst, failOnBadChar);
}
bool CCharsetConverter::utf32ToUtf8(const std::u32string& utf32StringSrc, std::string& utf8StringDst, bool failOnBadChar /*= true*/)
{
CSingleLock lock(m_critSection);
- return convert(m_iconvUtf32ToUtf8, m_Utf8CharMaxSize, "UTF-32", "UTF-8", utf32StringSrc, utf8StringDst, failOnBadChar);
+ return convert(m_iconvUtf32ToUtf8, m_Utf8CharMaxSize, UTF32_CHARSET, "UTF-8", utf32StringSrc, utf8StringDst, failOnBadChar);
}
std::string CCharsetConverter::utf32ToUtf8(const std::u32string& utf32StringSrc, bool failOnBadChar /*= false*/)
@@ -518,13 +529,13 @@ std::string CCharsetConverter::utf32ToUtf8(const std::u32string& utf32StringSrc,
bool CCharsetConverter::utf32ToW(const std::u32string& utf32StringSrc, std::wstring& wStringDst, bool failOnBadChar /*= true*/)
{
-#ifdef WCHAR_IS_UTF32
+#ifdef WCHAR_IS_ALMOST_UTF32
wStringDst.assign((const wchar_t*)utf32StringSrc.c_str(), utf32StringSrc.length());
return true;
-#else // !WCHAR_IS_UTF32
+#else // !WCHAR_IS_ALMOST_UTF32
CSingleLock lock(m_critSection);
- return convert(m_iconvUtf32ToW, 1, "UTF-32", WCHAR_CHARSET, utf32StringSrc, wStringDst, failOnBadChar);
-#endif // !WCHAR_IS_UTF32
+ return convert(m_iconvUtf32ToW, 1, UTF32_CHARSET, WCHAR_CHARSET, utf32StringSrc, wStringDst, failOnBadChar);
+#endif // !WCHAR_IS_ALMOST_UTF32
}
bool CCharsetConverter::utf32logicalToVisualBiDi(const std::u32string& logicalStringSrc, std::u32string& visualStringDst, bool forceLTRReadingOrder /*= false*/)
@@ -539,13 +550,8 @@ bool CCharsetConverter::utf32logicalToVisualBiDi(const std::u32string& logicalSt
bool CCharsetConverter::wToUtf32(const std::wstring& wStringSrc, std::u32string& utf32StringDst, bool failOnBadChar /*= true*/)
{
-#ifdef WCHAR_IS_UTF32
- utf32StringDst.assign((const char32_t*)wStringSrc.c_str(), wStringSrc.length());
- return true;
-#else // !WCHAR_IS_UTF32
CSingleLock lock(m_critSection);
- return convert(m_iconvWToUtf32, 1, WCHAR_CHARSET, "UTF-32", wStringSrc, utf32StringDst, failOnBadChar);
-#endif // !WCHAR_IS_UTF32
+ return convert(m_iconvWToUtf32, 1, WCHAR_CHARSET, UTF32_CHARSET, wStringSrc, utf32StringDst, failOnBadChar);
}
// The bVisualBiDiFlip forces a flip of characters for hebrew/arabic languages, only set to false if the flipping
@@ -706,7 +712,7 @@ bool CCharsetConverter::utf16LEtoW(const std::u16string& utf16String, std::wstri
bool CCharsetConverter::utf32ToStringCharset(const std::u32string& utf32StringSrc, std::string& stringDst)
{
CSingleLock lock(m_critSection);
- return convert(m_iconvUtf32ToStringCharset, 1, g_langInfo.GetGuiCharSet().c_str(), "UTF-32", utf32StringSrc, stringDst);
+ return convert(m_iconvUtf32ToStringCharset, 1, g_langInfo.GetGuiCharSet().c_str(), UTF32_CHARSET, utf32StringSrc, stringDst);
}
bool CCharsetConverter::utf8ToSystem(std::string& stringSrcDst, bool failOnBadChar /*= false*/)
View
79 xbmc/utils/PlatformEndian.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2013 Team XBMC
+ * http://xbmc.org
+ *
+ * This Program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This Program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with XBMC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ */
+#pragma once
+
+#ifdef TARGET_WINDOWS
+ /* All versions of Windows are little-endian */
+ #define TARGET_LITTLEENDIAN 1
+#else // ! TARGET_WINDOWS
+
+#ifdef HAVE_CONFIG_H
+ #include "config.h"
+ #if defined(HAVE_ENDIAN_H)
+ #include <endian.h>
+ #elif defined (HAVE_SYS_PARAM_H)
+ #include <sys/param.h>
+ #endif
+#endif
+
+// check system headers macros
+#if (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) || \
+ (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN)
+#define TARGET_BIGENDIAN 1
+
+#elif (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) || \
+ (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && BYTE_ORDER == LITTLE_ENDIAN)
+#define TARGET_LITTLEENDIAN 1
+
+// check predefined macros
+#elif defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) || \
+ defined(__ARMEB__) || defined (__THUMBEB__) || defined (__AARCH64EB__) || defined(_MIPSEB) || defined(__MIPSEB) || defined(__MIPSEB__)
+#define TARGET_BIGENDIAN 1
+
+#elif defined(__LITTLE_ENDIAN__) || (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || \
+ defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64) || \
+ defined(i386) || defined(__i386) || defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || defined(_M_IX86) || \
+ defined(_X86_) || defined(__ARMEL__) || defined(__THUMBEL__) || defined(__AARCH64EL__) || defined(_MIPSEL) || defined(__MIPSEL) || \
+ defined(__MIPSEL__)
+#define TARGET_LITTLEENDIAN 1
+
+// detection by "configure" can be inaccurate, use it only if other methods failed
+#elif defined(WORDS_BIGENDIAN)
+#define TARGET_BIGENDIAN 1
+#else
+#define TARGET_LITTLEENDIAN 1
+
+#endif
+#endif // ! TARGET_WINDOWS
+
+#ifndef BIG_ENDIAN
+ #define BIG_ENDIAN 4321
+#endif
+
+#ifndef LITTLE_ENDIAN
+ #define LITTLE_ENDIAN 1234
+#endif
+
+#ifdef TARGET_BIGENDIAN
+ #define PLATFROM_ENDIANNESS BIG_ENDIAN
+#else
+ #define PLATFROM_ENDIANNESS LITTLE_ENDIAN
+#endif
+
Something went wrong with that request. Please try again.