Skip to content

Commit

Permalink
CharsetDetection: add "ConvertPlainTextToUtf8()"
Browse files Browse the repository at this point in the history
  • Loading branch information
Karlson2k committed Jan 5, 2014
1 parent 72731b1 commit 6277558
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 0 deletions.
72 changes: 72 additions & 0 deletions xbmc/utils/CharsetDetection.cpp
Expand Up @@ -355,6 +355,78 @@ bool CCharsetDetection::ConvertHtmlToUtf8(const std::string& htmlContent, std::s
return false;
}

/**
 * Convert plain text to UTF-8 by trying candidate source charsets one after
 * another and stopping at the first one that converts successfully.
 *
 * Candidates, in order: charset reported by GetBomEncoding(), the
 * server-reported charset, UTF-8, the GUI charset, the system charset and
 * finally WINDOWS-1252.
 *
 * If none of them succeeds, a warning is logged and one more conversion is
 * run with a fallback charset; the function then returns false.
 *
 * @param textContent           text to convert
 * @param converted             receives the result of the conversion
 * @param serverReportedCharset charset from an out-of-band source (e.g. HTTP header), may be empty
 * @param usedCharset           receives the name of the charset that was used
 * @return true if one of the candidates converted the text, false otherwise
 */
bool CCharsetDetection::ConvertPlainTextToUtf8(const std::string& textContent, std::string& converted, const std::string& serverReportedCharset, std::string& usedCharset)
{
  converted.clear();
  usedCharset.clear();

  if (textContent.empty())
  { // nothing to convert: any charset fits empty content, report UTF-8 as default
    usedCharset = "UTF-8";
    return true;
  }

  // candidate 1: charset indicated by the Byte Order Mark
  const std::string charsetFromBom(GetBomEncoding(textContent));
  if (checkConversion(charsetFromBom, textContent, converted))
  {
    usedCharset = charsetFromBom;
    return true;
  }

  // candidate 2: charset from HTTP header (or from other out-of-band source)
  if (checkConversion(serverReportedCharset, textContent, converted))
  {
    usedCharset = serverReportedCharset;
    return true;
  }

  // candidate 3: UTF-8, skipped when one of the previous candidates already was UTF-8
  const bool utf8AlreadyTried = (charsetFromBom == "UTF-8") || (serverReportedCharset == "UTF-8");
  if (!utf8AlreadyTried && checkConversion("UTF-8", textContent, converted))
  {
    usedCharset = "UTF-8";
    return true;
  }

  // candidate 4: charset configured for the GUI
  const std::string guiCharset(g_langInfo.GetGuiCharSet());
  if (checkConversion(guiCharset, textContent, converted))
  {
    usedCharset = guiCharset;
    return true;
  }

  // candidate 5: system default charset
  if (g_charsetConverter.systemToUtf8(textContent, converted, true))
  {
    usedCharset = "char"; // "char" is a synonym for the system charset
    return true;
  }

  // candidate 6: WINDOWS-1252
  if (checkConversion("WINDOWS-1252", textContent, converted))
  {
    usedCharset = "WINDOWS-1252";
    return true;
  }

  // No candidate worked. Pick the first non-empty hint as fallback, in order of
  // priority: server-reported, BOM, GUI; WINDOWS-1252 if there is no hint at all.
  usedCharset = !serverReportedCharset.empty() ? serverReportedCharset
              : !charsetFromBom.empty()        ? charsetFromBom
              : !guiCharset.empty()            ? guiCharset
              :                                  std::string("WINDOWS-1252");

  CLog::Log(LOGWARNING, "%s: Can't correctly convert to UTF-8 charset, converting as \"%s\"", __FUNCTION__, usedCharset.c_str());
  g_charsetConverter.ToUtf8(usedCharset, textContent, converted, false);

  return false;
}


bool CCharsetDetection::checkConversion(const std::string& srcCharset, const std::string& src, std::string& dst)
{
if (srcCharset.empty())
Expand Down
10 changes: 10 additions & 0 deletions xbmc/utils/CharsetDetection.h
Expand Up @@ -70,6 +70,16 @@ class CCharsetDetection
*/
static bool ConvertHtmlToUtf8(const std::string& htmlContent, std::string& converted, const std::string& serverReportedCharset, std::string& usedHtmlCharset);

/**
 * Try to convert plain text to UTF-8 using the most suitable charset.
 *
 * Candidate charsets are tried in this order until one converts successfully:
 * charset from the Byte Order Mark, @a serverReportedCharset, UTF-8,
 * the GUI charset, the system charset, WINDOWS-1252.
 * Empty @a textContent is treated as success with "UTF-8" as used charset.
 * If no candidate succeeds, a warning is logged, a fallback conversion is
 * still attempted into @a converted and false is returned.
 *
 * @param textContent text to convert
 * @param converted receives the result of the conversion
 * @param serverReportedCharset charset from HTTP header or from other out-of-band source, empty if unknown or unset
 * @param usedCharset receives the charset used for the conversion ("char" denotes the system charset)
 * @return true if converted without errors, false otherwise
 */
static bool ConvertPlainTextToUtf8(const std::string& textContent, std::string& converted, const std::string& serverReportedCharset, std::string& usedCharset);

private:
static bool GetXmlEncodingFromDeclaration(const char* const xmlContent, const size_t contentLength, std::string& declaredEncoding);
/**
Expand Down

0 comments on commit 6277558

Please sign in to comment.