Permalink
Browse files

Improved character encoding detection from Content-Type header

Also updated some related JavaDocs
  • Loading branch information...
luccioman committed Jun 22, 2017
1 parent 1acb700 commit 64cec2790d3b0816ad551c4c090f40a6e34c5926
@@ -377,18 +377,20 @@ public String toString() {
*/
/**
 * Get mime type from header field Content-Type.
 * Strips any parameter denoted by ';'.
 * A value without ';' (or whose first character is ';') is returned unmodified, without trimming.
 * References : RFC 7231 on HTTP/1.1 and RFC 2045 on Multipurpose Internet Mail Extensions (MIME)
 * @see <a href="https://tools.ietf.org/html/rfc7231#section-3.1.1.1">RFC 7231 (HTTP/1.1) - "Media Type" section</a>
 * @see <a href="https://tools.ietf.org/html/rfc2045#section-5">RFC 2045 (MIME) - "Content-Type Header Field" section</a>
 * @return mime or on missing header field "application/octet-stream"
 */
public String mime() {
    final String tmpstr = this.get(CONTENT_TYPE, "application/octet-stream");
    final int pos = tmpstr.indexOf(';');
    if (pos > 0) {
        // keep only the media type part located before the first parameter separator
        return tmpstr.substring(0, pos).trim();
    }
    return tmpstr;
}
/*
@@ -398,15 +400,25 @@ public String mime() {
* org.apache.commons.fileupload.RequestContext#getCharacterEncoding()
*/
/**
 * Get the character encoding declared in the Content-Type value of this header.
 * The header value is read once and handed to {@link #getCharacterEncoding(String)},
 * which itself returns null for a null value, so no separate null check is needed here.
 * @return the result of {@link #getCharacterEncoding(String)} applied to {@link #getContentType()}
 */
public String getCharacterEncoding() {
    return getCharacterEncoding(getContentType());
}
/**
* References : RFC 7231 on HTTP/1.1 and RFC 2045 on Multipurpose Internet Mail Extensions (MIME)
* @param contentType a Content-Type header value
* @return the character set name extracted from the header, or null when not in the header
* @see <a href="https://tools.ietf.org/html/rfc7231#section-3.1.1.1">RFC 7231 (HTTP/1.1) - "Media Type" section</a>
* @see <a href="https://tools.ietf.org/html/rfc2045#section-5">RFC 2045 (MIME) - "Content-Type Header Field" section</a>
*/
public static final String getCharacterEncoding(final String contentType) {
if (contentType == null) return null;
final String[] parts = CommonPattern.SEMICOLON.split(mimeType);
final String[] parts = CommonPattern.SEMICOLON.split(contentType);
if (parts == null || parts.length <= 1) return null;
for (int i=1; i < parts.length; i++) {
final String param = parts[i].trim();
if (param.startsWith("charset=")) {
if (param.toLowerCase(Locale.ROOT).startsWith("charset=")) {
String charset = param.substring("charset=".length()).trim();
if (charset.length() > 0 && (charset.charAt(0) == '\"' || charset.charAt(0) == '\'')) charset = charset.substring(1);
if (charset.endsWith("\"") || charset.endsWith("'")) charset = charset.substring(0,charset.length()-1);
@@ -453,7 +465,7 @@ public long getContentLengthLong() {
/**
* Get header field content-type (unmodified)
* which may include additional parameter (RFC 2616)
* which may include additional parameter (RFC 2616, obsoleted by RFC 7231)
* see also mime()
* @see org.apache.commons.fileupload.RequestContext#getContentType()
*/
@@ -9,7 +9,7 @@
/**
* Test of httpHeader date parsing routine
*/
@Test
@Test
public void testParseHTTPDate() {
Date parsedDate = HeaderFramework.parseHTTPDate("Tue, 08 Jul 2003 21:22:46 GMT");
@@ -19,4 +19,16 @@ public void testParseHTTPDate() {
// Print Result
System.out.println("testParseHTTPDate: " + parsedDate.toString());
}
/**
 * Unit test for character encoding retrieval from a Content-Type header value
 */
@Test
public void testGetCharacterEncoding() {
    /* Examples from RFC 7231 - HTTP/1.1, section "Media Type" (https://tools.ietf.org/html/rfc7231#section-3.1.1.1)
     * Each row is { Content-Type header value, expected character set name } */
    final String[][] samples = {
        { "text/html;charset=utf-8", "utf-8" },
        { "text/html;charset=UTF-8", "UTF-8" },
        { "Text/HTML;Charset=\"utf-8\"", "utf-8" },
        { "text/html; charset=\"utf-8\"", "utf-8" }
    };
    for (final String[] sample : samples) {
        final String headerValue = sample[0];
        final String expectedCharset = sample[1];
        assertEquals(expectedCharset, HeaderFramework.getCharacterEncoding(headerValue));
    }
}
}

0 comments on commit 64cec27

Please sign in to comment.