Permalink
Browse files

Merge pull request #3650 from Karlson2k/fix_pcre_utf8

Fix PCRE UTF-8 error (mostly for scrapers)
  • Loading branch information...
2 parents 79aee04 + 7ad9c35 commit e20eee92df9fe2c18c2bf70ba96dba0134d2f58d @jmarshallnz jmarshallnz committed Dec 10, 2013
View
@@ -2324,7 +2324,7 @@ void CFileItemList::StackFolders()
{
// Precompile our REs
VECCREGEXP folderRegExps;
- CRegExp folderRegExp(true, true);
+ CRegExp folderRegExp(true, CRegExp::autoUtf8);
const CStdStringArray& strFolderRegExps = g_advancedSettings.m_folderStackRegExps;
CStdStringArray::const_iterator strExpression = strFolderRegExps.begin();
@@ -2416,7 +2416,7 @@ void CFileItemList::StackFiles()
{
// Precompile our REs
VECCREGEXP stackRegExps;
- CRegExp tmpRegExp(true, true);
+ CRegExp tmpRegExp(true, CRegExp::autoUtf8);
const CStdStringArray& strStackRegExps = g_advancedSettings.m_videoStackRegExps;
CStdStringArray::const_iterator strRegExp = strStackRegExps.begin();
while (strRegExp != strStackRegExps.end())
@@ -3242,7 +3242,7 @@ CStdString CFileItem::FindTrailer() const
// Precompile our REs
VECCREGEXP matchRegExps;
- CRegExp tmpRegExp(true, true);
+ CRegExp tmpRegExp(true, CRegExp::autoUtf8);
const CStdStringArray& strMatchRegExps = g_advancedSettings.m_trailerMatchRegExps;
CStdStringArray::const_iterator strRegExp = strMatchRegExps.begin();
View
@@ -243,8 +243,8 @@ void CUtil::CleanString(const CStdString& strFileName, CStdString& strTitle, CSt
const CStdStringArray &regexps = g_advancedSettings.m_videoCleanStringRegExps;
- CRegExp reTags(true, true);
- CRegExp reYear(false, true);
+ CRegExp reTags(true, CRegExp::autoUtf8);
+ CRegExp reYear(false, CRegExp::autoUtf8);
if (!reYear.RegComp(g_advancedSettings.m_videoCleanDateTimeRegExp))
{
@@ -519,7 +519,7 @@ bool CUtil::ExcludeFileOrFolder(const CStdString& strFileOrFolder, const CStdStr
if (strFileOrFolder.empty())
return false;
- CRegExp regExExcludes(true, true); // case insensitive regex
+ CRegExp regExExcludes(true, CRegExp::autoUtf8); // case insensitive regex
for (unsigned int i = 0; i < regexps.size(); i++)
{
@@ -185,7 +185,7 @@ void CExternalPlayer::Process()
CStdString strMatch = vecSplit[0];
StringUtils::Replace(strMatch, ",,",",");
bool bCaseless = vecSplit[3].find('i') != std::string::npos;
- CRegExp regExp(bCaseless, true);
+ CRegExp regExp(bCaseless, CRegExp::autoUtf8);
if (!regExp.RegComp(strMatch.c_str()))
{ // invalid regexp - complain in logs
@@ -118,7 +118,7 @@ void CPlayerSelectionRule::GetPlayers(const CFileItem& item, VECPLAYERCORES &vec
if (m_tDVDFile >= 0 && (m_tDVDFile > 0) != item.IsDVDFile()) return;
if (m_tDVDImage >= 0 && (m_tDVDImage > 0) != item.IsDVDImage()) return;
- CRegExp regExp(false, true);
+ CRegExp regExp(false, CRegExp::autoUtf8);
if (m_bStreamDetails)
{
@@ -59,7 +59,7 @@ namespace XFILE
{
// Load up our REs
VECCREGEXP RegExps;
- CRegExp tempRE(true, true);
+ CRegExp tempRE(true, CRegExp::autoUtf8);
const CStdStringArray& strRegExps = g_advancedSettings.m_videoStackRegExps;
CStdStringArray::const_iterator itRegExp = strRegExps.begin();
vector<pair<int, CStdString> > badStacks;
View
@@ -53,19 +53,20 @@ int CRegExp::m_UcpSupported = -1;
int CRegExp::m_JitSupported = -1;
-CRegExp::CRegExp(bool caseless /*= false*/, bool utf8 /*= false*/)
+// Create a regexp object without compiling any pattern.
+// 'caseless' and the UTF-8 mode are only recorded here (via InitValues());
+// a pattern has to be compiled later with RegComp().
+CRegExp::CRegExp(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/)
{
InitValues(caseless, utf8);
}
-void CRegExp::InitValues(bool caseless /*= false*/, bool utf8 /*= false*/)
+void CRegExp::InitValues(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/)
{
+ m_utf8Mode = utf8;
m_re = NULL;
m_sd = NULL;
m_iOptions = PCRE_DOTALL | PCRE_NEWLINE_ANY;
if(caseless)
m_iOptions |= PCRE_CASELESS;
- if (utf8)
+ if (m_utf8Mode == forceUtf8)
{
if (IsUtf8Supported())
m_iOptions |= PCRE_UTF8;
@@ -82,17 +83,162 @@ void CRegExp::InitValues(bool caseless /*= false*/, bool utf8 /*= false*/)
memset(m_iOvector, 0, sizeof(m_iOvector));
}
-CRegExp::CRegExp(bool caseless, bool utf8, const char *re, studyMode study /*= NoStudy*/)
+// Create a regexp object and compile 're' in one step.
+// 're' is converted to std::string for requireUtf8() when auto mode is requested.
+CRegExp::CRegExp(bool caseless, CRegExp::utf8Mode utf8, const char *re, studyMode study /*= NoStudy*/)
{
+ // auto mode is resolved once, from this pattern, before the values are initialised:
+ // InitValues() then stores forceUtf8 or asciiOnly, not autoUtf8
+ // NOTE(review): a later RegComp() with another pattern only re-checks the pattern when
+ // m_utf8Mode == autoUtf8, so it appears to reuse the mode chosen here - confirm this is intended
+ if (utf8 == autoUtf8)
+ utf8 = requireUtf8(re) ? forceUtf8 : asciiOnly;
+
InitValues(caseless, utf8);
RegComp(re, study);
}
+/**
+ * Decide whether a regexp pattern has to be compiled in UTF-8 mode.
+ * @param regexp the regexp pattern to analyze
+ * @return true if the pattern contains UTF-8 multibyte sequences, explicit Unicode
+ *         properties (\p, \P, \X) or a character code above 0xFF in form \x{hhh..};
+ *         false otherwise, and also when an unterminated "\Q", "(?#" comment or
+ *         character class stops the scan before anything was found
+ */
+bool CRegExp::requireUtf8(const std::string& regexp)
+{
+  // UTF-8 multibyte sequences in the pattern text itself always need UTF-8 mode
+  if (CUtf8Utils::checkStrForUtf8(regexp) == CUtf8Utils::utf8string)
+    return true;
+
+  // note: PCRE change meaning of \w, \s, \d (and \W, \S, \D) when Unicode Properties are enabled,
+  // but in auto mode Unicode Properties are enabled for US-ASCII regexp only if it contains
+  // explicit \p, \P, \X or Unicode character code
+  const char* const pattern = regexp.c_str(); // null-terminated, so reading one char past a valid index is safe
+  const size_t patternLen = regexp.length();
+
+  for (size_t pos = 0; pos < patternLen; pos++)
+  {
+    switch (pattern[pos])
+    {
+    case '\\':
+      switch (pattern[pos + 1])
+      {
+      case 'p':
+      case 'P':
+      case 'X':
+        return true; // explicit Unicode property
+      case 'Q':
+        pos = regexp.find("\\E", pos + 2); // everything up to "\E" is literal text
+        break;
+      case 'x':
+        // only the "\x{hhh..}" form can carry a code above 0xFF
+        if (pattern[pos + 2] == '{' && readCharXCode(regexp, pos) >= 0x100)
+          return true;
+        break;
+      case '\\':
+      case '(':
+      case ')':
+      case '[':
+      case ']':
+        pos++; // escaped character is a literal, keep it out of the analysis
+        break;
+      default:
+        break;
+      }
+      break;
+
+    case '(':
+      if (pattern[pos + 1] == '?' && pattern[pos + 2] == '#')
+        pos = regexp.find(')', pos); // "(?#...)" is a comment, jump to its end
+      break;
+
+    case '[':
+      if (isCharClassWithUnicode(regexp, pos))
+        return true;
+      break;
+
+    default:
+      break;
+    }
+
+    // find() or isCharClassWithUnicode() did not find the terminator: nothing more to scan
+    if (pos == std::string::npos)
+      return false;
+  }
+
+  // nothing that requires UTF-8 mode was found
+  return false;
+}
+
+/**
+ * Read a hex character code written as "\x{hh..}".
+ * @param regexp the whole regexp pattern
+ * @param pos    in: index of the '\' that starts "\x{";
+ *               out: index of the closing '}' when a code was read,
+ *                    index of 'x' when the sequence is not a valid code,
+ *                    unchanged when -1 is returned
+ * @return -1 if 'pos' does not point to "\x{";
+ *          0 if there is no closing '}' or a non-hex character is found between the braces;
+ *          otherwise the character code. Codes beyond the Unicode range are not
+ *          accumulated further: the result stays above 0x10FFFF instead of overflowing
+ */
+inline int CRegExp::readCharXCode(const std::string& regexp, size_t& pos)
+{
+  // 'pos' must point to '\'
+  if (pos >= regexp.length())
+    return -1;
+  const char* const regexpC = regexp.c_str();
+  if (regexpC[pos] != '\\' || regexpC[pos + 1] != 'x' || regexpC[pos + 2] != '{')
+    return -1;
+
+  pos++;
+  const size_t startPos = pos; // 'startPos' points to 'x'
+  const size_t closingBracketPos = regexp.find('}', startPos + 2);
+  if (closingBracketPos == std::string::npos)
+    return 0; // return character zero code, leave 'pos' at 'x'
+
+  // largest valid Unicode code point; anything above it only needs to stay "large"
+  static const int maxUnicodeCode = 0x10FFFF;
+
+  pos++; // 'pos' points to '{'
+  int chCode = 0;
+  while (++pos < closingBracketPos)
+  {
+    const int xdigitVal = StringUtils::asciixdigitvalue(regexpC[pos]);
+    if (xdigitVal < 0)
+    { // found non-hexdigit
+      pos = startPos; // reset 'pos' to 'startPos', process "{hh..}" as non-code
+      return 0; // return character zero code
+    }
+
+    // stop accumulating once the value left the Unicode range: an over-long
+    // digit run (e.g. "\x{FFFFFFFFF}") must not overflow the signed int
+    if (chCode <= maxUnicodeCode)
+      chCode = chCode * 16 + xdigitVal;
+  }
+
+  return chCode;
+}
+
+/**
+ * Scan one character class (like "[a-h45]") for constructs that need UTF-8 mode.
+ * Nested POSIX classes like "[[:lower:]]", escaped brackets like "[\]]", escaped
+ * backslashes like "[\\]" and literal runs "\Q...\E" are skipped.
+ * @param regexp the whole regexp pattern
+ * @param pos    in: index of the opening '[';
+ *               out: index of the closing ']', or std::string::npos when the class
+ *                    (or a "\Q" inside it) is not terminated;
+ *                    unchanged when 'pos' does not point to '['
+ * @return true if the scanned part contains "\p", "\P", "\X" or a character code
+ *         "\x{hhh..}" of 0x100 or above, false otherwise
+ */
+bool CRegExp::isCharClassWithUnicode(const std::string& regexp, size_t& pos)
+{
+  const char* const regexpC = regexp.c_str();
+  const size_t len = regexp.length();
+  if (pos >= len || regexpC[pos] != '[')
+    return false;
+
+  bool needUnicode = false;
+  while (++pos < len)
+  {
+    if (regexpC[pos] == '[' && regexpC[pos + 1] == ':')
+    { // possible POSIX character class, like "[:alpha:]"
+      // don't care about "\]", as it produces an error if used inside POSIX char class
+      const size_t nextClosingBracketPos = regexp.find(']', pos + 2);
+
+      if (nextClosingBracketPos == std::string::npos)
+      { // error in regexp: no closing ']' for character class
+        pos = std::string::npos;
+        return needUnicode;
+      }
+      else if (regexpC[nextClosingBracketPos - 1] == ':')
+        pos = nextClosingBracketPos; // skip POSIX character class
+      // if ":]" is not found, process "[:..." as part of normal character class
+    }
+    else if (regexpC[pos] == ']')
+      return needUnicode; // end of character class
+    else if (regexpC[pos] == '\\')
+    {
+      const char nextChar = regexpC[pos + 1];
+      // an escaped backslash must be skipped as well: otherwise its second '\' is
+      // taken as the start of a new escape and "[\\]" looks like an unterminated class
+      if (nextChar == ']' || nextChar == '[' || nextChar == '\\')
+        pos++; // skip next character
+      else if (nextChar == 'Q')
+      {
+        pos = regexp.find("\\E", pos + 2);
+        if (pos == std::string::npos)
+          return needUnicode; // error in regexp: no closing "\E" after "\Q" in character class
+        else
+          pos++; // skip "\E"
+      }
+      else if (nextChar == 'p' || nextChar == 'P' || nextChar == 'X')
+        needUnicode = true; // don't care about property name as it can contain only ASCII chars
+      else if (nextChar == 'x')
+      {
+        // readCharXCode() returns -1 and leaves 'pos' alone for a plain "\xhh"
+        if (readCharXCode(regexp, pos) >= 0x100)
+          needUnicode = true;
+      }
+    }
+  }
+  pos = std::string::npos; // closing square bracket was not found
+
+  return needUnicode;
+}
+
+
+// Copy constructor: put the pointer members into a known (NULL) state, take over the
+// UTF-8 mode and compile options of 're', then let the assignment operator do the copy.
+// NOTE(review): the NULL initialisation presumably protects cleanup code inside
+// operator= from reading uninitialised pointers - operator= is not visible here, confirm
CRegExp::CRegExp(const CRegExp& re)
{
m_re = NULL;
m_sd = NULL;
m_jitStack = NULL;
+ m_utf8Mode = re.m_utf8Mode;
m_iOptions = re.m_iOptions;
*this = re;
}
@@ -140,10 +286,13 @@ bool CRegExp::RegComp(const char *re, studyMode study /*= NoStudy*/)
m_iMatchCount = 0;
const char *errMsg = NULL;
int errOffset = 0;
+ int options = m_iOptions;
+ if (m_utf8Mode == autoUtf8 && requireUtf8(re))
+ options |= (IsUtf8Supported() ? PCRE_UTF8 : 0) | (AreUnicodePropertiesSupported() ? PCRE_UCP : 0);
Cleanup();
- m_re = pcre_compile(re, m_iOptions, &errMsg, &errOffset, NULL);
+ m_re = pcre_compile(re, options, &errMsg, &errOffset, NULL);
if (!m_re)
{
m_pattern.clear();
View
@@ -48,25 +48,32 @@ class CRegExp
StudyRegExp = 1, // study expression (slower compilation, faster find)
StudyWithJitComp // study expression and JIT-compile it, if possible (heavyweight optimization)
};
+ // Selects how the UTF-8 mode of a regexp is chosen
+ enum utf8Mode
+ {
+ autoUtf8 = -1, // analyze the regexp for UTF-8 multi-byte chars, for Unicode codes > 0xFF
+ // or for explicit Unicode properties (\p, \P and \X); enable UTF-8 mode if any of them is found
+ asciiOnly = 0, // process regexp and strings as single-byte encoded strings
+ forceUtf8 = 1 // enable UTF-8 mode (with Unicode properties)
+ };
static const int m_MaxNumOfBackrefrences = 20;
/**
* @param caseless (optional) Matching will be case insensitive if set to true
* or case sensitive if set to false
- * @param utf8 (optional) If set to true all string will be processed as UTF-8 strings
+ * @param utf8 (optional) Control UTF-8 processing
*/
- CRegExp(bool caseless = false, bool utf8 = false);
+ CRegExp(bool caseless = false, utf8Mode utf8 = asciiOnly);
/**
* Create new CRegExp object and compile regexp expression in one step
* @warning Use only with hardcoded regexp when you're sure that regexp is compiled without errors
* @param caseless Matching will be case insensitive if set to true
* or case sensitive if set to false
- * @param utf8 If set to true all string will be processed as UTF-8 strings
+ * @param utf8 Control UTF-8 processing
* @param re The regular expression
* @param study (optional) Controls study of expression, useful if expression will be used
* several times
*/
- CRegExp(bool caseless, bool utf8, const char *re, studyMode study = NoStudy);
+ CRegExp(bool caseless, utf8Mode utf8, const char *re, studyMode study = NoStudy);
CRegExp(const CRegExp& re);
~CRegExp();
@@ -143,7 +150,10 @@ class CRegExp
private:
int PrivateRegFind(size_t bufferLen, const char *str, unsigned int startoffset = 0, int maxNumberOfCharsToTest = -1);
- void InitValues(bool caseless = false, bool utf8 = false);
+ void InitValues(bool caseless = false, CRegExp::utf8Mode utf8 = asciiOnly);
+ static bool requireUtf8(const std::string& regexp);
+ static int readCharXCode(const std::string& regexp, size_t& pos);
+ static bool isCharClassWithUnicode(const std::string& regexp, size_t& pos);
void Cleanup();
inline bool IsValidSubNumber(int iSub) const;
@@ -153,6 +163,7 @@ class CRegExp
static const int OVECCOUNT=(m_MaxNumOfBackrefrences + 1) * 3;
unsigned int m_offset;
int m_iOvector[OVECCOUNT];
+ utf8Mode m_utf8Mode;
int m_iMatchCount;
int m_iOptions;
bool m_jitCompiled;
@@ -204,7 +204,19 @@ void CScraperParser::ParseExpression(const CStdString& input, CStdString& dest,
if (stricmp(sensitive,"yes") == 0)
bInsensitive=false; // match case sensitive
- CRegExp reg(bInsensitive, true);
+ CRegExp::utf8Mode eUtf8 = CRegExp::autoUtf8;
+ const char* const strUtf8 = pExpression->Attribute("utf8");
+ if (strUtf8)
+ {
+ if (stricmp(strUtf8, "yes") == 0)
+ eUtf8 = CRegExp::forceUtf8;
+ else if (stricmp(strUtf8, "no") == 0)
+ eUtf8 = CRegExp::asciiOnly;
+ else if (stricmp(strUtf8, "auto") == 0)
+ eUtf8 = CRegExp::autoUtf8;
+ }
+
+ CRegExp reg(bInsensitive, eUtf8);
CStdString strExpression;
if (pExpression->FirstChild())
strExpression = pExpression->FirstChild()->Value();
View
@@ -733,6 +733,28 @@ bool StringUtils::IsInteger(const CStdString& str)
return i == str.size() && n > 0;
}
+/**
+ * Convert an ASCII decimal digit character to its numeric value.
+ * @param chr character to convert
+ * @return the digit value, or -1 if isasciidigit() rejects 'chr'
+ */
+int StringUtils::asciidigitvalue(char chr)
+{
+  return isasciidigit(chr) ? chr - '0' : -1;
+}
+
+/**
+ * Convert an ASCII hexadecimal digit character to its numeric value.
+ * Both lower-case and upper-case letters are accepted.
+ * @param chr character to convert
+ * @return value in range 0..15, or -1 if 'chr' is not a hexadecimal digit
+ */
+int StringUtils::asciixdigitvalue(char chr)
+{
+  // decimal digits are handled by asciidigitvalue()
+  const int decimalValue = asciidigitvalue(chr);
+  if (decimalValue >= 0)
+    return decimalValue;
+
+  // letters: 'a'/'A' stands for 10 ... 'f'/'F' stands for 15
+  if ('a' <= chr && chr <= 'f')
+    return 10 + (chr - 'a');
+  if ('A' <= chr && chr <= 'F')
+    return 10 + (chr - 'A');
+
+  return -1;
+}
+
+
void StringUtils::RemoveCRLF(CStdString& strLine)
{
StringUtils::TrimRight(strLine, "\n\r");
Oops, something went wrong.

0 comments on commit e20eee9

Please sign in to comment.