Skip to content

Commit 2d80bf9

Browse files
committed
Check whether data with a BOM actually decoded instead of blindly continuing.
Noticed this while studying the code around issue #86. It is hard to say whether this bug is the cause of the crash reported there, as we can't reliably reproduce that crash.
1 parent 326b770 commit 2d80bf9

File tree

3 files changed

+35
-15
lines changed

3 files changed

+35
-15
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
## [Unreleased]
44

5+
* Fix crash when given data that starts with a UTF-8 BOM but is not valid UTF-8.
6+
57
## [2.1.2][]
68

79
* Avoid symbol collision for `parseNumber` with Tapjoy SDK by marking it and other helper functions in `HTMLSelector.m` as `static`. (Fixes #85.)

HTMLReaderTests/HTMLDocumentTests.m

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,4 +37,13 @@ - (void)testParsedStringEncodingContentTypeUTF8
3737
XCTAssertEqual(document.parsedStringEncoding, (NSStringEncoding)NSUTF8StringEncoding);
3838
}
3939

40+
// Regression test: input that begins with a UTF-8 byte order mark but is not
// valid UTF-8. The document is expected to report Windows-1252 as its parsed
// encoding rather than UTF-8.
- (void)testInvalidUTF8WithBOM
41+
{
42+
char buffer[] = "\xEF\xBB\xBF" // UTF-8 BOM
43+
"\xF5"; // invalid byte in UTF-8
44+
// sizeof(buffer) - 1 drops the string literal's trailing NUL so only the four
// bytes above are wrapped; freeWhenDone:NO because buffer is a local array.
NSData *data = [NSData dataWithBytesNoCopy:buffer length:(sizeof(buffer) - 1) freeWhenDone:NO];
45+
// No Content-Type header is supplied (contentTypeHeader:nil).
HTMLDocument *document = [HTMLDocument documentWithData:data contentTypeHeader:nil];
46+
// The BOM alone must not make the reported encoding UTF-8 for this input.
XCTAssertEqual(document.parsedStringEncoding, (NSStringEncoding)NSWindowsCP1252StringEncoding);
47+
}
48+
4049
@end

Sources/HTMLEncoding.m

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,23 +21,32 @@ HTMLStringEncoding DeterminedStringEncodingForData(NSData *data, NSString *conte
2121
unsigned char buffer[3] = {0};
2222
[data getBytes:buffer length:MIN(data.length, 3U)];
2323
if (buffer[0] == 0xFE && buffer[1] == 0xFF) {
24-
*outDecodedString = [[NSString alloc] initWithData:data encoding:NSUTF16BigEndianStringEncoding];
25-
return (HTMLStringEncoding){
26-
.encoding = NSUTF16BigEndianStringEncoding,
27-
.confidence = Certain
28-
};
24+
NSString *decodedString = [[NSString alloc] initWithData:data encoding:NSUTF16BigEndianStringEncoding];
25+
if (decodedString) {
26+
*outDecodedString = decodedString;
27+
return (HTMLStringEncoding){
28+
.encoding = NSUTF16BigEndianStringEncoding,
29+
.confidence = Certain
30+
};
31+
}
2932
} else if (buffer[0] == 0xFF && buffer[1] == 0xFE) {
30-
*outDecodedString = [[NSString alloc] initWithData:data encoding:NSUTF16LittleEndianStringEncoding];
31-
return (HTMLStringEncoding){
32-
.encoding = NSUTF16LittleEndianStringEncoding,
33-
.confidence = Certain
34-
};
33+
NSString *decodedString = [[NSString alloc] initWithData:data encoding:NSUTF16LittleEndianStringEncoding];
34+
if (decodedString) {
35+
*outDecodedString = decodedString;
36+
return (HTMLStringEncoding){
37+
.encoding = NSUTF16LittleEndianStringEncoding,
38+
.confidence = Certain
39+
};
40+
}
3541
} else if (buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF) {
36-
*outDecodedString = [[NSString alloc] initWithData:data encoding:NSUTF8StringEncoding];
37-
return (HTMLStringEncoding){
38-
.encoding = NSUTF8StringEncoding,
39-
.confidence = Certain
40-
};
42+
NSString *decodedString = [[NSString alloc] initWithData:data encoding:NSUTF8StringEncoding];
43+
if (decodedString) {
44+
*outDecodedString = decodedString;
45+
return (HTMLStringEncoding){
46+
.encoding = NSUTF8StringEncoding,
47+
.confidence = Certain
48+
};
49+
}
4150
}
4251

4352
if (contentType) {

0 commit comments

Comments
 (0)