Merge pull request #3986 from Karlson2k/fix_mime_01

Fix archive processing by ScraperUrl after 872de5f
Merged by xbmc on Jan 8, 2014 as commit df7398c (2 parents: ec1cf22 + 8e1c9f7).

Showing 5 changed files with 150 additions and 135 deletions.
diff --git a/xbmc/utils/FileUtils.cpp b/xbmc/utils/FileUtils.cpp
@@ -160,114 +160,3 @@ unsigned int CFileUtils::LoadFile(const std::string &filename, void* &outputBuff
 
   return total_read;
 }
-
-CFileUtils::EFileType CFileUtils::GetFileTypeFromMime(const std::string& mimeType)
-{
-  // based on http://mimesniff.spec.whatwg.org/
-
-  std::string type, subtype;
-  if (!parseMimeType(mimeType, type, subtype))
-    return FileTypeUnknown;
-
-  if (type == "application")
-  {
-    if (subtype == "zip")
-      return FileTypeZip;
-    if (subtype == "x-gzip")
-      return FileTypeGZip;
-    if (subtype == "x-rar-compressed")
-      return FileTypeRar;
-
-    if (subtype == "xml")
-      return FileTypeXml;
-  }
-  else if (type == "text")
-  {
-    if (subtype == "xml")
-      return FileTypeXml;
-    if (subtype == "html")
-      return FileTypeHtml;
-    if (subtype == "plain")
-      return FileTypePlainText;
-  }
-  else if (type == "image")
-  {
-    if (subtype == "bmp")
-      return FileTypeBmp;
-    if (subtype == "gif")
-      return FileTypeGif;
-    if (subtype == "png")
-      return FileTypePng;
-    if (subtype == "jpeg" || subtype == "pjpeg")
-      return FileTypeJpeg;
-  }
-
-  if (StringUtils::EndsWith(subtype, "+zip"))
-    return FileTypeZip;
-  if (StringUtils::EndsWith(subtype, "+xml"))
-    return FileTypeXml;
-
-  return FileTypeUnknown;
-}
-
-bool CFileUtils::parseMimeType(const std::string& mimeType, std::string& type, std::string& subtype)
-{
-  // this is an modified implementation of http://mimesniff.spec.whatwg.org/#parsing-a-mime-type with additional checks for non-empty type and subtype
-  // note: only type and subtype are parsed, parameters are ignored
-
-  static const char* const whitespaceChars = "\x09\x0A\x0C\x0D\x20"; // tab, LF, FF, CR and space
-  static const std::string whitespaceSmclnChars("\x09\x0A\x0C\x0D\x20\x3B"); // tab, LF, FF, CR, space and semicolon
-
-  type.clear();
-  subtype.clear();
-
-  const size_t len = mimeType.length();
-  if (len < 1)
-    return false;
-
-  const char* const mimeTypeC = mimeType.c_str();
-  size_t pos = mimeType.find_first_not_of(whitespaceChars);
-  if (pos == std::string::npos)
-    return false;
-
-  // find "type"
-  size_t t = 0;
-  do
-  {
-    const char chr = mimeTypeC[pos];
-    if (t > 127 || !chr)
-    {
-      type.clear();
-      return false;
-    }
-
-    if (chr >= 'A' && chr <= 'Z')
-      type.push_back(chr + ('a' - 'A')); // convert to lowercase
-    else
-      type.push_back(chr);
-    t++;
-    pos++;
-  } while (mimeTypeC[pos] != '/');
-
-  pos++; // skip '/'
-  t = 0;
-
-  while (mimeTypeC[pos] && whitespaceSmclnChars.find(mimeTypeC[pos]) == std::string::npos && t++ <= 127)
-  {
-    const char chr = mimeTypeC[pos];
-    if (chr >= 'A' && chr <= 'Z')
-      subtype.push_back(chr + ('a' - 'A')); // convert to lowercase
-    else
-      subtype.push_back(chr);
-    pos++;
-  }
-
-  if (subtype.empty() || t > 127)
-  {
-    type.clear();
-    subtype.clear();
-    return false;
-  }
-
-  return true;
-}
diff --git a/xbmc/utils/FileUtils.h b/xbmc/utils/FileUtils.h
@@ -29,21 +29,4 @@ class CFileUtils
   static bool RenameFile(const CStdString &strFile);
   static bool RemoteAccessAllowed(const CStdString &strPath);
   static unsigned int LoadFile(const std::string &filename, void* &outputBuffer);
-
-  enum EFileType
-  {
-    FileTypeUnknown = 0,
-    FileTypeHtml,
-    FileTypeXml,
-    FileTypePlainText,
-    FileTypeZip,
-    FileTypeGZip,
-    FileTypeRar,
-    FileTypeBmp,
-    FileTypeGif,
-    FileTypePng,
-    FileTypeJpeg,
-  };
-  static EFileType GetFileTypeFromMime(const std::string& mimeType);
-  static bool parseMimeType(const std::string& mimeType, std::string& type, std::string& subtype);
 };
diff --git a/xbmc/utils/Mime.cpp b/xbmc/utils/Mime.cpp
@@ -576,3 +576,121 @@ string CMime::GetMimeType(const CURL &url, bool lookup)
 
   return strMimeType;
 }
+
+CMime::EFileType CMime::GetFileTypeFromMime(const std::string& mimeType)
+{
+  // based on http://mimesniff.spec.whatwg.org/
+
+  std::string type, subtype;
+  if (!parseMimeType(mimeType, type, subtype))
+    return FileTypeUnknown;
+
+  if (type == "application")
+  {
+    if (subtype == "zip")
+      return FileTypeZip;
+    if (subtype == "x-gzip")
+      return FileTypeGZip;
+    if (subtype == "x-rar-compressed")
+      return FileTypeRar;
+
+    if (subtype == "xml")
+      return FileTypeXml;
+  }
+  else if (type == "text")
+  {
+    if (subtype == "xml")
+      return FileTypeXml;
+    if (subtype == "html")
+      return FileTypeHtml;
+    if (subtype == "plain")
+      return FileTypePlainText;
+  }
+  else if (type == "image")
+  {
+    if (subtype == "bmp")
+      return FileTypeBmp;
+    if (subtype == "gif")
+      return FileTypeGif;
+    if (subtype == "png")
+      return FileTypePng;
+    if (subtype == "jpeg" || subtype == "pjpeg")
+      return FileTypeJpeg;
+  }
+
+  if (StringUtils::EndsWith(subtype, "+zip"))
+    return FileTypeZip;
+  if (StringUtils::EndsWith(subtype, "+xml"))
+    return FileTypeXml;
+
+  return FileTypeUnknown;
+}
+
+CMime::EFileType CMime::GetFileTypeFromContent(const std::string& fileContent)
+{
+  // based on http://mimesniff.spec.whatwg.org/#matching-a-mime-type-pattern
+
+  const size_t len = fileContent.length();
+  if (len < 2)
+    return FileTypeUnknown;
+
+  const unsigned char* const b = (const unsigned char*)fileContent.c_str();
+
+  // TODO: add detection for text types
+
+  // check image types
+  if (b[0] == 'B' && b[1] == 'M')
+    return FileTypeBmp;
+  if (len >= 6 && b[0] == 'G' && b[1] == 'I' && b[2] == 'F' && b[3] == '8' && (b[4] == '7' || b[4] == '9') && b[5] == 'a')
+    return FileTypeGif;
+  if (len >= 8 && b[0] == 0x89 && b[1] == 'P' && b[2] == 'N' && b[3] == 'G' && b[4] == 0x0D && b[5] == 0x0A && b[6] == 0x1A && b[7] == 0x0A)
+    return FileTypePng;
+  if (len >= 3 && b[0] == 0xFF && b[1] == 0xD8 && b[2] == 0xFF)
+    return FileTypeJpeg;
+
+  // check archive types
+  if (len >= 3 && b[0] == 0x1F && b[1] == 0x8B && b[2] == 0x08)
+    return FileTypeGZip;
+  if (len >= 4 && b[0] == 'P' && b[1] == 'K' && b[2] == 0x03 && b[3] == 0x04)
+    return FileTypeZip;
+  if (len >= 7 && b[0] == 'R' && b[1] == 'a' && b[2] == 'r' && b[3] == ' ' && b[4] == 0x1A && b[5] == 0x07 && b[6] == 0x00)
+    return FileTypeRar;
+
+  // TODO: add detection for other types if required
+
+  return FileTypeUnknown;
+}
+
+bool CMime::parseMimeType(const std::string& mimeType, std::string& type, std::string& subtype)
+{
+  static const char* const whitespaceChars = "\x09\x0A\x0C\x0D\x20"; // tab, LF, FF, CR and space
+
+  type.clear();
+  subtype.clear();
+
+  const size_t slashPos = mimeType.find('/');
+  if (slashPos == std::string::npos)
+    return false;
+
+  type.assign(mimeType, 0, slashPos);
+  subtype.assign(mimeType, slashPos + 1, std::string::npos);
+
+  const size_t semicolonPos = subtype.find(';');
+  if (semicolonPos != std::string::npos)
+    subtype.erase(semicolonPos);
+
+  StringUtils::Trim(type, whitespaceChars);
+  StringUtils::Trim(subtype, whitespaceChars);
+
+  if (type.empty() || subtype.empty())
+  {
+    type.clear();
+    subtype.clear();
+    return false;
+  }
+
+  StringUtils::ToLower(type);
+  StringUtils::ToLower(subtype);
+
+  return true;
+}
diff --git a/xbmc/utils/Mime.h b/xbmc/utils/Mime.h
@@ -33,6 +33,24 @@ class CMime
   static std::string GetMimeType(const CFileItem &item);
   static std::string GetMimeType(const CURL &url, bool lookup = true);
 
+  enum EFileType
+  {
+    FileTypeUnknown = 0,
+    FileTypeHtml,
+    FileTypeXml,
+    FileTypePlainText,
+    FileTypeZip,
+    FileTypeGZip,
+    FileTypeRar,
+    FileTypeBmp,
+    FileTypeGif,
+    FileTypePng,
+    FileTypeJpeg,
+  };
+  static EFileType GetFileTypeFromMime(const std::string& mimeType);
+  static EFileType GetFileTypeFromContent(const std::string& fileContent);
+  static bool parseMimeType(const std::string& mimeType, std::string& type, std::string& subtype);
+
 private:
   static std::map<std::string, std::string> m_mimetypes;
 };
diff --git a/xbmc/utils/ScraperUrl.cpp b/xbmc/utils/ScraperUrl.cpp
@@ -30,7 +30,7 @@
 #include "filesystem/ZipFile.h"
 #include "URIUtils.h"
 #include "utils/XBMCTinyXML.h"
-#include "utils/FileUtils.h"
+#include "utils/Mime.h"
 
 #include <cstring>
 #include <sstream>
@@ -239,19 +239,26 @@ bool CScraperUrl::Get(const SUrlEntry& scrURL, std::string& strHTML, XFILE::CCur
   strHTML = strHTML1;
 
   std::string mimeType(http.GetMimeType());
-  CFileUtils::EFileType ftype = CFileUtils::GetFileTypeFromMime(mimeType);
+  CMime::EFileType ftype = CMime::GetFileTypeFromMime(mimeType);
+  if (ftype == CMime::FileTypeUnknown)
+    ftype = CMime::GetFileTypeFromContent(strHTML);
 
-  if (ftype == CFileUtils::FileTypeZip || ftype == CFileUtils::FileTypeGZip)
+  if (ftype == CMime::FileTypeZip || ftype == CMime::FileTypeGZip)
   {
     XFILE::CZipFile file;
     std::string strBuffer;
     int iSize = file.UnpackFromMemory(strBuffer,strHTML,scrURL.m_isgz); // FIXME: use FileTypeGZip instead of scrURL.m_isgz?
     if (iSize > 0)
+    {
       strHTML = strBuffer;
+      CLog::Log(LOGDEBUG, "%s: Archive \"%s\" was unpacked in memory", __FUNCTION__, scrURL.m_url.c_str());
+    }
+    else
+      CLog::Log(LOGWARNING, "%s: \"%s\" looks like archive, but cannot be unpacked", __FUNCTION__, scrURL.m_url.c_str());
   }
 
   std::string reportedCharset(http.GetServerReportedCharset());
-  if (ftype == CFileUtils::FileTypeHtml)
+  if (ftype == CMime::FileTypeHtml)
   {
     std::string realHtmlCharset, converted;
     if (!CCharsetDetection::ConvertHtmlToUtf8(strHTML, converted, reportedCharset, realHtmlCharset))
@@ -261,7 +268,7 @@ bool CScraperUrl::Get(const SUrlEntry& scrURL, std::string& strHTML, XFILE::CCur
 
     strHTML = converted;
   }
-  else if (ftype == CFileUtils::FileTypeXml)
+  else if (ftype == CMime::FileTypeXml)
   {
     CXBMCTinyXML xmlDoc;
     xmlDoc.Parse(strHTML, reportedCharset);
@@ -275,7 +282,7 @@ bool CScraperUrl::Get(const SUrlEntry& scrURL, std::string& strHTML, XFILE::CCur
       strHTML = converted;
     }
   }
-  else if (ftype == CFileUtils::FileTypePlainText || StringUtils::CompareNoCase(mimeType.substr(0, 5), "text/") == 0)
+  else if (ftype == CMime::FileTypePlainText || StringUtils::CompareNoCase(mimeType.substr(0, 5), "text/") == 0)
   {
     std::string realTextCharset, converted;
     CCharsetDetection::ConvertPlainTextToUtf8(strHTML, converted, reportedCharset, realTextCharset);
@@ -293,7 +300,7 @@ bool CScraperUrl::Get(const SUrlEntry& scrURL, std::string& strHTML, XFILE::CCur
     strHTML = converted;
   }
   else
-    CLog::Log(LOGDEBUG, "%s: Assuming \"UTF-8\" charset for content of \"%s\"", __FUNCTION__, scrURL.m_url.c_str());
+    CLog::Log(LOGDEBUG, "%s: Using content of \"%s\" as binary or text with \"UTF-8\" charset", __FUNCTION__, scrURL.m_url.c_str());
 
   if (!scrURL.m_cache.empty())
   {