Permalink
Browse files

Merge pull request #3986 from Karlson2k/fix_mime_01

Fix archive processing by ScraperUrl after 872de5f
  • Loading branch information...
2 parents ec1cf22 + 8e1c9f7 commit df7398c4cbf940d78f6e334e2ddf68ede36f9314 @jmarshallnz jmarshallnz committed Jan 8, 2014
Showing with 150 additions and 135 deletions.
  1. +0 −111 xbmc/utils/FileUtils.cpp
  2. +0 −17 xbmc/utils/FileUtils.h
  3. +118 −0 xbmc/utils/Mime.cpp
  4. +18 −0 xbmc/utils/Mime.h
  5. +14 −7 xbmc/utils/ScraperUrl.cpp
View
@@ -160,114 +160,3 @@ unsigned int CFileUtils::LoadFile(const std::string &filename, void* &outputBuff
return total_read;
}
-
-CFileUtils::EFileType CFileUtils::GetFileTypeFromMime(const std::string& mimeType)
-{
- // based on http://mimesniff.spec.whatwg.org/
-
- std::string type, subtype;
- if (!parseMimeType(mimeType, type, subtype))
- return FileTypeUnknown;
-
- if (type == "application")
- {
- if (subtype == "zip")
- return FileTypeZip;
- if (subtype == "x-gzip")
- return FileTypeGZip;
- if (subtype == "x-rar-compressed")
- return FileTypeRar;
-
- if (subtype == "xml")
- return FileTypeXml;
- }
- else if (type == "text")
- {
- if (subtype == "xml")
- return FileTypeXml;
- if (subtype == "html")
- return FileTypeHtml;
- if (subtype == "plain")
- return FileTypePlainText;
- }
- else if (type == "image")
- {
- if (subtype == "bmp")
- return FileTypeBmp;
- if (subtype == "gif")
- return FileTypeGif;
- if (subtype == "png")
- return FileTypePng;
- if (subtype == "jpeg" || subtype == "pjpeg")
- return FileTypeJpeg;
- }
-
- if (StringUtils::EndsWith(subtype, "+zip"))
- return FileTypeZip;
- if (StringUtils::EndsWith(subtype, "+xml"))
- return FileTypeXml;
-
- return FileTypeUnknown;
-}
-
-bool CFileUtils::parseMimeType(const std::string& mimeType, std::string& type, std::string& subtype)
-{
- // this is a modified implementation of http://mimesniff.spec.whatwg.org/#parsing-a-mime-type with additional checks for non-empty type and subtype
- // note: only type and subtype are parsed, parameters are ignored
-
- static const char* const whitespaceChars = "\x09\x0A\x0C\x0D\x20"; // tab, LF, FF, CR and space
- static const std::string whitespaceSmclnChars("\x09\x0A\x0C\x0D\x20\x3B"); // tab, LF, FF, CR, space and semicolon
-
- type.clear();
- subtype.clear();
-
- const size_t len = mimeType.length();
- if (len < 1)
- return false;
-
- const char* const mimeTypeC = mimeType.c_str();
- size_t pos = mimeType.find_first_not_of(whitespaceChars);
- if (pos == std::string::npos)
- return false;
-
- // find "type"
- size_t t = 0;
- do
- {
- const char chr = mimeTypeC[pos];
- if (t > 127 || !chr)
- {
- type.clear();
- return false;
- }
-
- if (chr >= 'A' && chr <= 'Z')
- type.push_back(chr + ('a' - 'A')); // convert to lowercase
- else
- type.push_back(chr);
- t++;
- pos++;
- } while (mimeTypeC[pos] != '/');
-
- pos++; // skip '/'
- t = 0;
-
- while (mimeTypeC[pos] && whitespaceSmclnChars.find(mimeTypeC[pos]) == std::string::npos && t++ <= 127)
- {
- const char chr = mimeTypeC[pos];
- if (chr >= 'A' && chr <= 'Z')
- subtype.push_back(chr + ('a' - 'A')); // convert to lowercase
- else
- subtype.push_back(chr);
- pos++;
- }
-
- if (subtype.empty() || t > 127)
- {
- type.clear();
- subtype.clear();
- return false;
- }
-
- return true;
-}
View
@@ -29,21 +29,4 @@ class CFileUtils
static bool RenameFile(const CStdString &strFile);
static bool RemoteAccessAllowed(const CStdString &strPath);
static unsigned int LoadFile(const std::string &filename, void* &outputBuffer);
-
- enum EFileType
- {
- FileTypeUnknown = 0,
- FileTypeHtml,
- FileTypeXml,
- FileTypePlainText,
- FileTypeZip,
- FileTypeGZip,
- FileTypeRar,
- FileTypeBmp,
- FileTypeGif,
- FileTypePng,
- FileTypeJpeg,
- };
- static EFileType GetFileTypeFromMime(const std::string& mimeType);
- static bool parseMimeType(const std::string& mimeType, std::string& type, std::string& subtype);
};
View
@@ -576,3 +576,121 @@ string CMime::GetMimeType(const CURL &url, bool lookup)
return strMimeType;
}
+
+CMime::EFileType CMime::GetFileTypeFromMime(const std::string& mimeType)
+{
+  // Translates a MIME type string into one of the coarse EFileType values.
+  // The set of recognized types is based on http://mimesniff.spec.whatwg.org/
+  // Returns FileTypeUnknown when the string cannot be parsed or is not recognized.
+
+  std::string major, minor;
+  if (!parseMimeType(mimeType, major, minor))
+    return FileTypeUnknown;
+
+  // exact "type/subtype" pairs
+  struct SKnownMime
+  {
+    const char* type;
+    const char* subtype;
+    EFileType   fileType;
+  };
+  static const SKnownMime knownMimes[] =
+  {
+    { "application", "zip",              FileTypeZip       },
+    { "application", "x-gzip",           FileTypeGZip      },
+    { "application", "x-rar-compressed", FileTypeRar       },
+    { "application", "xml",              FileTypeXml       },
+    { "text",        "xml",              FileTypeXml       },
+    { "text",        "html",             FileTypeHtml      },
+    { "text",        "plain",            FileTypePlainText },
+    { "image",       "bmp",              FileTypeBmp       },
+    { "image",       "gif",              FileTypeGif       },
+    { "image",       "png",              FileTypePng       },
+    { "image",       "jpeg",             FileTypeJpeg      },
+    { "image",       "pjpeg",            FileTypeJpeg      },
+  };
+  static const size_t knownCount = sizeof(knownMimes) / sizeof(knownMimes[0]);
+
+  for (size_t i = 0; i < knownCount; ++i)
+  {
+    if (major == knownMimes[i].type && minor == knownMimes[i].subtype)
+      return knownMimes[i].fileType;
+  }
+
+  // no exact match: fall back to the subtype suffix, whatever the top-level type is
+  if (StringUtils::EndsWith(minor, "+zip"))
+    return FileTypeZip;
+
+  return StringUtils::EndsWith(minor, "+xml") ? FileTypeXml : FileTypeUnknown;
+}
+
+CMime::EFileType CMime::GetFileTypeFromContent(const std::string& fileContent)
+{
+  // Detects the file type from the signature ("magic bytes") at the very beginning of fileContent.
+  // Based on http://mimesniff.spec.whatwg.org/#matching-a-mime-type-pattern
+  // Returns FileTypeUnknown when the content is shorter than two bytes or no signature matches.
+
+  const size_t len = fileContent.length();
+  if (len < 2)
+    return FileTypeUnknown;
+
+  // compare as unsigned bytes so that values above 0x7F match regardless of the signedness of plain char
+  const unsigned char* const b = reinterpret_cast<const unsigned char*>(fileContent.c_str());
+
+  // TODO: add detection for text types
+
+  // check image types
+  if (b[0] == 'B' && b[1] == 'M')
+    return FileTypeBmp;
+  if (len >= 6 && b[0] == 'G' && b[1] == 'I' && b[2] == 'F' && b[3] == '8' && (b[4] == '7' || b[4] == '9') && b[5] == 'a')
+    return FileTypeGif;
+  if (len >= 8 && b[0] == 0x89 && b[1] == 'P' && b[2] == 'N' && b[3] == 'G' && b[4] == 0x0D && b[5] == 0x0A && b[6] == 0x1A && b[7] == 0x0A)
+    return FileTypePng;
+  if (len >= 3 && b[0] == 0xFF && b[1] == 0xD8 && b[2] == 0xFF)
+    return FileTypeJpeg;
+
+  // check archive types
+  if (len >= 3 && b[0] == 0x1F && b[1] == 0x8B && b[2] == 0x08)
+    return FileTypeGZip;
+  if (len >= 4 && b[0] == 'P' && b[1] == 'K' && b[2] == 0x03 && b[3] == 0x04)
+    return FileTypeZip;
+
+  // RAR archives start with "Rar!" 0x1A 0x07, followed by 0x00 (format 1.5 - 4.x) or 0x01 0x00 (format 5.x).
+  // The previous check compared the fourth byte with a space (0x20), the value listed in the pattern table
+  // of the spec above, so real archives were never detected. The space variant is still accepted
+  // to keep every previously matching input matching.
+  if (len >= 7 && b[0] == 'R' && b[1] == 'a' && b[2] == 'r' && (b[3] == '!' || b[3] == ' ') && b[4] == 0x1A && b[5] == 0x07)
+  {
+    if (b[6] == 0x00)
+      return FileTypeRar;
+    if (len >= 8 && b[6] == 0x01 && b[7] == 0x00)
+      return FileTypeRar;
+  }
+
+  // TODO: add detection for other types if required
+
+  return FileTypeUnknown;
+}
+
+bool CMime::parseMimeType(const std::string& mimeType, std::string& type, std::string& subtype)
+{
+  // Splits "type/subtype[;parameters]" into lower-cased type and subtype.
+  // Parameters (everything from the first ';' after the slash) are dropped.
+  // Only whitespace at the edges of type and subtype is stripped; the characters inside are not validated.
+  // On failure both outputs are left empty and false is returned.
+
+  static const char* const blanks = "\x09\x0A\x0C\x0D\x20"; // tab, LF, FF, CR and space
+
+  type.clear();
+  subtype.clear();
+
+  const size_t slash = mimeType.find('/');
+  if (slash == std::string::npos)
+    return false;
+
+  // the subtype runs from just after the slash up to the first semicolon, or to the end of the string
+  const size_t subtypeStart = slash + 1;
+  const size_t paramsStart = mimeType.find(';', subtypeStart);
+  const size_t subtypeLen = (paramsStart == std::string::npos) ? std::string::npos : paramsStart - subtypeStart;
+
+  type.assign(mimeType, 0, slash);
+  subtype.assign(mimeType, subtypeStart, subtypeLen);
+
+  StringUtils::Trim(type, blanks);
+  StringUtils::Trim(subtype, blanks);
+
+  if (!type.empty() && !subtype.empty())
+  {
+    StringUtils::ToLower(type);
+    StringUtils::ToLower(subtype);
+    return true;
+  }
+
+  // one of the two parts is missing: report failure with both outputs empty
+  type.clear();
+  subtype.clear();
+  return false;
+}
View
@@ -33,6 +33,24 @@ class CMime
static std::string GetMimeType(const CFileItem &item);
static std::string GetMimeType(const CURL &url, bool lookup = true);
+ enum EFileType // coarse file type as returned by GetFileTypeFromMime() and GetFileTypeFromContent()
+ {
+ FileTypeUnknown = 0, // MIME type could not be parsed, or no known MIME type / content signature matched
+ FileTypeHtml, // "text/html"
+ FileTypeXml, // "application/xml", "text/xml" or any subtype ending with "+xml"
+ FileTypePlainText, // "text/plain"
+ FileTypeZip, // "application/zip", any subtype ending with "+zip", or content starting with "PK" 0x03 0x04
+ FileTypeGZip, // "application/x-gzip", or content starting with 0x1F 0x8B 0x08
+ FileTypeRar, // "application/x-rar-compressed", or a RAR signature at the start of the content
+ FileTypeBmp, // "image/bmp", or content starting with "BM"
+ FileTypeGif, // "image/gif", or content starting with "GIF87a" / "GIF89a"
+ FileTypePng, // "image/png", or content starting with the 8-byte PNG signature
+ FileTypeJpeg, // "image/jpeg", "image/pjpeg", or content starting with 0xFF 0xD8 0xFF
+ };
+ static EFileType GetFileTypeFromMime(const std::string& mimeType); // maps a MIME type string to EFileType; FileTypeUnknown if unparsable or not recognized
+ static EFileType GetFileTypeFromContent(const std::string& fileContent); // detects EFileType from the leading signature bytes of the content
+ static bool parseMimeType(const std::string& mimeType, std::string& type, std::string& subtype); // extracts lower-cased type and subtype, parameters are ignored; returns false (outputs empty) on failure
+
private:
static std::map<std::string, std::string> m_mimetypes;
};
View
@@ -30,7 +30,7 @@
#include "filesystem/ZipFile.h"
#include "URIUtils.h"
#include "utils/XBMCTinyXML.h"
-#include "utils/FileUtils.h"
+#include "utils/Mime.h"
#include <cstring>
#include <sstream>
@@ -239,19 +239,26 @@ bool CScraperUrl::Get(const SUrlEntry& scrURL, std::string& strHTML, XFILE::CCur
strHTML = strHTML1;
std::string mimeType(http.GetMimeType());
- CFileUtils::EFileType ftype = CFileUtils::GetFileTypeFromMime(mimeType);
+ CMime::EFileType ftype = CMime::GetFileTypeFromMime(mimeType);
+ if (ftype == CMime::FileTypeUnknown)
+ ftype = CMime::GetFileTypeFromContent(strHTML);
- if (ftype == CFileUtils::FileTypeZip || ftype == CFileUtils::FileTypeGZip)
+ if (ftype == CMime::FileTypeZip || ftype == CMime::FileTypeGZip)
{
XFILE::CZipFile file;
std::string strBuffer;
int iSize = file.UnpackFromMemory(strBuffer,strHTML,scrURL.m_isgz); // FIXME: use FileTypeGZip instead of scrURL.m_isgz?
if (iSize > 0)
+ {
strHTML = strBuffer;
+ CLog::Log(LOGDEBUG, "%s: Archive \"%s\" was unpacked in memory", __FUNCTION__, scrURL.m_url.c_str());
+ }
+ else
+ CLog::Log(LOGWARNING, "%s: \"%s\" looks like archive, but cannot be unpacked", __FUNCTION__, scrURL.m_url.c_str());
}
std::string reportedCharset(http.GetServerReportedCharset());
- if (ftype == CFileUtils::FileTypeHtml)
+ if (ftype == CMime::FileTypeHtml)
{
std::string realHtmlCharset, converted;
if (!CCharsetDetection::ConvertHtmlToUtf8(strHTML, converted, reportedCharset, realHtmlCharset))
@@ -261,7 +268,7 @@ bool CScraperUrl::Get(const SUrlEntry& scrURL, std::string& strHTML, XFILE::CCur
strHTML = converted;
}
- else if (ftype == CFileUtils::FileTypeXml)
+ else if (ftype == CMime::FileTypeXml)
{
CXBMCTinyXML xmlDoc;
xmlDoc.Parse(strHTML, reportedCharset);
@@ -275,7 +282,7 @@ bool CScraperUrl::Get(const SUrlEntry& scrURL, std::string& strHTML, XFILE::CCur
strHTML = converted;
}
}
- else if (ftype == CFileUtils::FileTypePlainText || StringUtils::CompareNoCase(mimeType.substr(0, 5), "text/") == 0)
+ else if (ftype == CMime::FileTypePlainText || StringUtils::CompareNoCase(mimeType.substr(0, 5), "text/") == 0)
{
std::string realTextCharset, converted;
CCharsetDetection::ConvertPlainTextToUtf8(strHTML, converted, reportedCharset, realTextCharset);
@@ -293,7 +300,7 @@ bool CScraperUrl::Get(const SUrlEntry& scrURL, std::string& strHTML, XFILE::CCur
strHTML = converted;
}
else
- CLog::Log(LOGDEBUG, "%s: Assuming \"UTF-8\" charset for content of \"%s\"", __FUNCTION__, scrURL.m_url.c_str());
+ CLog::Log(LOGDEBUG, "%s: Using content of \"%s\" as binary or text with \"UTF-8\" charset", __FUNCTION__, scrURL.m_url.c_str());
if (!scrURL.m_cache.empty())
{

0 comments on commit df7398c

Please sign in to comment.