Skip to content

Commit

Permalink
Merge pull request #3986 from Karlson2k/fix_mime_01
Browse files Browse the repository at this point in the history
Fix archive processing by ScraperUrl after 872de5f
  • Loading branch information
jmarshallnz committed Jan 8, 2014
2 parents ec1cf22 + 8e1c9f7 commit df7398c
Show file tree
Hide file tree
Showing 5 changed files with 150 additions and 135 deletions.
111 changes: 0 additions & 111 deletions xbmc/utils/FileUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,114 +160,3 @@ unsigned int CFileUtils::LoadFile(const std::string &filename, void* &outputBuff

return total_read;
}

CFileUtils::EFileType CFileUtils::GetFileTypeFromMime(const std::string& mimeType)
{
  // Classification follows http://mimesniff.spec.whatwg.org/
  // Exact "type/subtype" pairs are looked up first; structured-syntax
  // suffixes ("+zip", "+xml") act as a fallback for every type.

  struct ExactMatch
  {
    const char* type;
    const char* subtype;
    EFileType   fileType;
  };
  static const ExactMatch exactMatches[] =
  {
    { "application", "zip",              FileTypeZip       },
    { "application", "x-gzip",           FileTypeGZip      },
    { "application", "x-rar-compressed", FileTypeRar       },
    { "application", "xml",              FileTypeXml       },
    { "text",        "xml",              FileTypeXml       },
    { "text",        "html",             FileTypeHtml      },
    { "text",        "plain",            FileTypePlainText },
    { "image",       "bmp",              FileTypeBmp       },
    { "image",       "gif",              FileTypeGif       },
    { "image",       "png",              FileTypePng       },
    { "image",       "jpeg",             FileTypeJpeg      },
    { "image",       "pjpeg",            FileTypeJpeg      },
  };

  std::string majorType, minorType;
  if (!parseMimeType(mimeType, majorType, minorType))
    return FileTypeUnknown;

  const size_t matchCount = sizeof(exactMatches) / sizeof(exactMatches[0]);
  for (size_t i = 0; i < matchCount; ++i)
  {
    const ExactMatch& candidate = exactMatches[i];
    if (majorType == candidate.type && minorType == candidate.subtype)
      return candidate.fileType;
  }

  // no exact match: fall back to the subtype suffix
  if (StringUtils::EndsWith(minorType, "+zip"))
    return FileTypeZip;

  return StringUtils::EndsWith(minorType, "+xml") ? FileTypeXml : FileTypeUnknown;
}

// Lowercases ASCII letters 'A'-'Z' in place; all other bytes are left untouched.
static void mimeAsciiToLower(std::string& str)
{
  for (std::string::iterator it = str.begin(); it != str.end(); ++it)
  {
    if (*it >= 'A' && *it <= 'Z')
      *it = static_cast<char>(*it + ('a' - 'A'));
  }
}

/*!
 \brief Split a MIME type string into lowercase "type" and "subtype".

 This is a modified implementation of
 http://mimesniff.spec.whatwg.org/#parsing-a-mime-type with additional checks
 for non-empty type and subtype. Only type and subtype are parsed, parameters
 are ignored.

 \param mimeType string to parse; leading whitespace is skipped
 \param type     [out] lowercase type, empty on failure
 \param subtype  [out] lowercase subtype, empty on failure
 \return true if both a non-empty type and a non-empty subtype were found
         and neither exceeds its length limit, false otherwise
 */
bool CFileUtils::parseMimeType(const std::string& mimeType, std::string& type, std::string& subtype)
{
  static const char* const whitespaceChars = "\x09\x0A\x0C\x0D\x20"; // tab, LF, FF, CR and space
  static const std::string subtypeEndChars("\x09\x0A\x0C\x0D\x20\x3B"); // tab, LF, FF, CR, space and semicolon
  static const size_t maxTypeLength = 128;
  static const size_t maxSubtypeLength = 127;

  type.clear();
  subtype.clear();

  // skip leading whitespace; this also rejects empty and all-whitespace input
  const size_t typeStart = mimeType.find_first_not_of(whitespaceChars);
  if (typeStart == std::string::npos)
    return false;

  // find "type": everything up to the first '/'
  const size_t slashPos = mimeType.find('/', typeStart);
  if (slashPos == std::string::npos)
    return false;
  if (slashPos == typeStart)
    return false; // empty type, e.g. "/html" or "//x"

  const size_t typeLength = slashPos - typeStart;
  if (typeLength > maxTypeLength)
    return false;
  // an embedded NUL inside the type is treated as end of input
  if (mimeType.find('\0', typeStart) < slashPos)
    return false;

  // find "subtype": runs until whitespace, semicolon, NUL or end of string
  const size_t subtypeStart = slashPos + 1;
  size_t subtypeEnd = subtypeStart;
  while (subtypeEnd < mimeType.length() &&
         subtypeEnd - subtypeStart <= maxSubtypeLength && // stop scanning once over the limit
         mimeType[subtypeEnd] != '\0' &&
         subtypeEndChars.find(mimeType[subtypeEnd]) == std::string::npos)
  {
    ++subtypeEnd;
  }

  const size_t subtypeLength = subtypeEnd - subtypeStart;
  if (subtypeLength == 0 || subtypeLength > maxSubtypeLength)
    return false;

  type.assign(mimeType, typeStart, typeLength);
  subtype.assign(mimeType, subtypeStart, subtypeLength);
  mimeAsciiToLower(type);
  mimeAsciiToLower(subtype);

  return true;
}
17 changes: 0 additions & 17 deletions xbmc/utils/FileUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,21 +29,4 @@ class CFileUtils
static bool RenameFile(const CStdString &strFile);
static bool RemoteAccessAllowed(const CStdString &strPath);
static unsigned int LoadFile(const std::string &filename, void* &outputBuffer);

// Coarse file categories that a MIME type can be mapped to.
enum EFileType
{
FileTypeUnknown = 0, // MIME type could not be parsed or is not recognised
FileTypeHtml,
FileTypeXml,
FileTypePlainText,
FileTypeZip,
FileTypeGZip,
FileTypeRar,
FileTypeBmp,
FileTypeGif,
FileTypePng,
FileTypeJpeg,
};
// Maps a MIME type string (e.g. "text/html") to an EFileType;
// returns FileTypeUnknown when the string cannot be parsed or is not recognised.
static EFileType GetFileTypeFromMime(const std::string& mimeType);
// Splits mimeType into lowercase type and subtype (parameters are ignored);
// returns false and leaves both outputs empty when parsing fails.
static bool parseMimeType(const std::string& mimeType, std::string& type, std::string& subtype);
};
118 changes: 118 additions & 0 deletions xbmc/utils/Mime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -576,3 +576,121 @@ string CMime::GetMimeType(const CURL &url, bool lookup)

return strMimeType;
}

CMime::EFileType CMime::GetFileTypeFromMime(const std::string& mimeType)
{
  // Classification follows http://mimesniff.spec.whatwg.org/
  // Known "type/subtype" pairs are resolved through a table; any other
  // subtype is classified by its "+zip" / "+xml" suffix, if present.

  struct KnownMime
  {
    const char* type;
    const char* subtype;
    EFileType   result;
  };
  static const KnownMime knownMimes[] =
  {
    // archives and XML delivered as "application/..."
    { "application", "zip",              FileTypeZip       },
    { "application", "x-gzip",           FileTypeGZip      },
    { "application", "x-rar-compressed", FileTypeRar       },
    { "application", "xml",              FileTypeXml       },
    // textual content
    { "text",        "xml",              FileTypeXml       },
    { "text",        "html",             FileTypeHtml      },
    { "text",        "plain",            FileTypePlainText },
    // images
    { "image",       "bmp",              FileTypeBmp       },
    { "image",       "gif",              FileTypeGif       },
    { "image",       "png",              FileTypePng       },
    { "image",       "jpeg",             FileTypeJpeg      },
    { "image",       "pjpeg",            FileTypeJpeg      },
  };
  static const size_t knownMimesCount = sizeof(knownMimes) / sizeof(knownMimes[0]);

  std::string parsedType, parsedSubtype;
  if (!parseMimeType(mimeType, parsedType, parsedSubtype))
    return FileTypeUnknown;

  for (const KnownMime* entry = knownMimes; entry != knownMimes + knownMimesCount; ++entry)
  {
    if (parsedType == entry->type && parsedSubtype == entry->subtype)
      return entry->result;
  }

  // suffix fallback applies regardless of the top-level type
  if (StringUtils::EndsWith(parsedSubtype, "+zip"))
    return FileTypeZip;
  if (StringUtils::EndsWith(parsedSubtype, "+xml"))
    return FileTypeXml;

  return FileTypeUnknown;
}

/*!
 \brief Guess the file type from the leading "magic" bytes of the content.

 Based on http://mimesniff.spec.whatwg.org/#matching-a-mime-type-pattern
 Only image and archive signatures are checked; text types (HTML, XML,
 plain text) are never returned by this function.

 \param fileContent raw file content (only the first few bytes are inspected)
 \return detected file type or FileTypeUnknown if no signature matches
 */
CMime::EFileType CMime::GetFileTypeFromContent(const std::string& fileContent)
{
  const size_t len = fileContent.length();
  if (len < 2)
    return FileTypeUnknown;

  // compare as unsigned bytes so that values >= 0x80 match on platforms with signed char
  const unsigned char* const b = reinterpret_cast<const unsigned char*>(fileContent.data());

  // TODO: add detection for text types

  // check image types
  if (b[0] == 'B' && b[1] == 'M')
    return FileTypeBmp;
  if (len >= 6 && b[0] == 'G' && b[1] == 'I' && b[2] == 'F' && b[3] == '8' && (b[4] == '7' || b[4] == '9') && b[5] == 'a')
    return FileTypeGif;
  if (len >= 8 && b[0] == 0x89 && b[1] == 'P' && b[2] == 'N' && b[3] == 'G' && b[4] == 0x0D && b[5] == 0x0A && b[6] == 0x1A && b[7] == 0x0A)
    return FileTypePng;
  if (len >= 3 && b[0] == 0xFF && b[1] == 0xD8 && b[2] == 0xFF)
    return FileTypeJpeg;

  // check archive types
  if (len >= 3 && b[0] == 0x1F && b[1] == 0x8B && b[2] == 0x08)
    return FileTypeGZip;
  if (len >= 4 && b[0] == 'P' && b[1] == 'K' && b[2] == 0x03 && b[3] == 0x04)
    return FileTypeZip;

  // RAR archives start with "Rar!" (0x52 0x61 0x72 0x21) followed by 0x1A 0x07.
  // The sniffing spec pattern this code was written from uses a space (0x20)
  // as the fourth byte, which real archives do not have; the space is still
  // accepted so that previously matching input keeps matching.
  if (len >= 7 && b[0] == 'R' && b[1] == 'a' && b[2] == 'r' && (b[3] == '!' || b[3] == ' ') && b[4] == 0x1A && b[5] == 0x07)
  {
    if (b[6] == 0x00)
      return FileTypeRar; // RAR 1.5 - 4.x: 7-byte signature ending in 0x00
    if (len >= 8 && b[6] == 0x01 && b[7] == 0x00)
      return FileTypeRar; // RAR 5.0: 8-byte signature ending in 0x01 0x00
  }

  // TODO: add detection for other types if required

  return FileTypeUnknown;
}

bool CMime::parseMimeType(const std::string& mimeType, std::string& type, std::string& subtype)
{
  // Splits "type/subtype[;parameters]" into trimmed, lowercase type and subtype.
  // Both outputs are left empty whenever false is returned.
  static const char* const whitespaceChars = "\x09\x0A\x0C\x0D\x20"; // tab, LF, FF, CR and space

  type.clear();
  subtype.clear();

  const std::string::size_type separator = mimeType.find('/');
  if (separator != std::string::npos)
  {
    // everything before the first '/' is the type
    std::string parsedType(mimeType, 0, separator);

    // the subtype ends at the first ';' after the '/', or at the end of the string
    const std::string::size_type subtypeStart = separator + 1;
    const std::string::size_type paramsStart = mimeType.find(';', subtypeStart);
    std::string parsedSubtype = (paramsStart == std::string::npos)
                                ? mimeType.substr(subtypeStart)
                                : mimeType.substr(subtypeStart, paramsStart - subtypeStart);

    StringUtils::Trim(parsedType, whitespaceChars);
    StringUtils::Trim(parsedSubtype, whitespaceChars);

    if (!parsedType.empty() && !parsedSubtype.empty())
    {
      StringUtils::ToLower(parsedType);
      StringUtils::ToLower(parsedSubtype);

      type.swap(parsedType);
      subtype.swap(parsedSubtype);
      return true;
    }
  }

  return false;
}
18 changes: 18 additions & 0 deletions xbmc/utils/Mime.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,24 @@ class CMime
static std::string GetMimeType(const CFileItem &item);
static std::string GetMimeType(const CURL &url, bool lookup = true);

// Coarse file categories produced by the MIME type and content detection helpers below.
enum EFileType
{
FileTypeUnknown = 0, // nothing recognised (unparsable MIME type or no matching signature)
FileTypeHtml,
FileTypeXml,
FileTypePlainText,
FileTypeZip,
FileTypeGZip,
FileTypeRar,
FileTypeBmp,
FileTypeGif,
FileTypePng,
FileTypeJpeg,
};
// Maps a MIME type string (e.g. "text/html") to an EFileType;
// returns FileTypeUnknown when the string cannot be parsed or is not recognised.
static EFileType GetFileTypeFromMime(const std::string& mimeType);
// Guesses the type from the leading signature bytes of the content;
// only image and archive types are detected, text types are never returned.
static EFileType GetFileTypeFromContent(const std::string& fileContent);
// Splits mimeType into trimmed, lowercase type and subtype (anything after ';' is dropped);
// returns false and leaves both outputs empty when either part is missing.
static bool parseMimeType(const std::string& mimeType, std::string& type, std::string& subtype);

private:
static std::map<std::string, std::string> m_mimetypes;
};
21 changes: 14 additions & 7 deletions xbmc/utils/ScraperUrl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
#include "filesystem/ZipFile.h"
#include "URIUtils.h"
#include "utils/XBMCTinyXML.h"
#include "utils/FileUtils.h"
#include "utils/Mime.h"

#include <cstring>
#include <sstream>
Expand Down Expand Up @@ -239,19 +239,26 @@ bool CScraperUrl::Get(const SUrlEntry& scrURL, std::string& strHTML, XFILE::CCur
strHTML = strHTML1;

std::string mimeType(http.GetMimeType());
CFileUtils::EFileType ftype = CFileUtils::GetFileTypeFromMime(mimeType);
CMime::EFileType ftype = CMime::GetFileTypeFromMime(mimeType);
if (ftype == CMime::FileTypeUnknown)
ftype = CMime::GetFileTypeFromContent(strHTML);

if (ftype == CFileUtils::FileTypeZip || ftype == CFileUtils::FileTypeGZip)
if (ftype == CMime::FileTypeZip || ftype == CMime::FileTypeGZip)
{
XFILE::CZipFile file;
std::string strBuffer;
int iSize = file.UnpackFromMemory(strBuffer,strHTML,scrURL.m_isgz); // FIXME: use FileTypeGZip instead of scrURL.m_isgz?
if (iSize > 0)
{
strHTML = strBuffer;
CLog::Log(LOGDEBUG, "%s: Archive \"%s\" was unpacked in memory", __FUNCTION__, scrURL.m_url.c_str());
}
else
CLog::Log(LOGWARNING, "%s: \"%s\" looks like archive, but cannot be unpacked", __FUNCTION__, scrURL.m_url.c_str());
}

std::string reportedCharset(http.GetServerReportedCharset());
if (ftype == CFileUtils::FileTypeHtml)
if (ftype == CMime::FileTypeHtml)
{
std::string realHtmlCharset, converted;
if (!CCharsetDetection::ConvertHtmlToUtf8(strHTML, converted, reportedCharset, realHtmlCharset))
Expand All @@ -261,7 +268,7 @@ bool CScraperUrl::Get(const SUrlEntry& scrURL, std::string& strHTML, XFILE::CCur

strHTML = converted;
}
else if (ftype == CFileUtils::FileTypeXml)
else if (ftype == CMime::FileTypeXml)
{
CXBMCTinyXML xmlDoc;
xmlDoc.Parse(strHTML, reportedCharset);
Expand All @@ -275,7 +282,7 @@ bool CScraperUrl::Get(const SUrlEntry& scrURL, std::string& strHTML, XFILE::CCur
strHTML = converted;
}
}
else if (ftype == CFileUtils::FileTypePlainText || StringUtils::CompareNoCase(mimeType.substr(0, 5), "text/") == 0)
else if (ftype == CMime::FileTypePlainText || StringUtils::CompareNoCase(mimeType.substr(0, 5), "text/") == 0)
{
std::string realTextCharset, converted;
CCharsetDetection::ConvertPlainTextToUtf8(strHTML, converted, reportedCharset, realTextCharset);
Expand All @@ -293,7 +300,7 @@ bool CScraperUrl::Get(const SUrlEntry& scrURL, std::string& strHTML, XFILE::CCur
strHTML = converted;
}
else
CLog::Log(LOGDEBUG, "%s: Assuming \"UTF-8\" charset for content of \"%s\"", __FUNCTION__, scrURL.m_url.c_str());
CLog::Log(LOGDEBUG, "%s: Using content of \"%s\" as binary or text with \"UTF-8\" charset", __FUNCTION__, scrURL.m_url.c_str());

if (!scrURL.m_cache.empty())
{
Expand Down

0 comments on commit df7398c

Please sign in to comment.