/
TrivialHtmlParser.h
101 lines (78 loc) · 3.07 KB
/
TrivialHtmlParser.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/* Copyright 2022 the SumatraPDF project authors (see AUTHORS file).
License: Simplified BSD (see COPYING.BSD) */
// Error codes reported by HtmlParser through its public `error` member.
// The numeric values are written out explicitly; they are the same values
// the enumerators get from implicit numbering.
enum HtmlParseError {
    // no error recorded
    ErrParsingNoError = 0,
    // syntax error parsing element
    ErrParsingElement = 1,
    // NOTE(review): presumably a syntax error in a `<!...>` declaration or a
    // `<?...?>` processing instruction - confirm against the parser
    ErrParsingExclOrPI = 2,
    // syntax error in closing element
    ErrParsingClosingElement = 3,
    // syntax error after element name
    ErrParsingElementName = 4,
    // syntax error in attributes
    ErrParsingAttributes = 5,
    // syntax error after attribute name
    ErrParsingAttributeName = 6,
    // NOTE(review): presumably a syntax error in an attribute value - confirm
    ErrParsingAttributeValue = 7,
};
// Forward declaration: in this header HtmlToken is only used through
// pointers (parameters of HtmlParser's private helpers), so the full
// definition is not needed here.
struct HtmlToken;
// One attribute of an element. Attributes are chained through `next`;
// HtmlElement::firstAttr points at the head of an element's chain.
// NOTE(review): the strings are presumably not owned by this struct
// (parser-managed memory) - confirm in the implementation.
struct HtmlAttr {
// attribute name
char* name;
// attribute value
char* val;
// next attribute in the chain
HtmlAttr* next;
};
// A node of the element tree returned by HtmlParser.
struct HtmlElement {
    HtmlTag tag;
    // name is nullptr whenever tag != Tag_NotFound
    char* name;
    // head of this element's attribute chain (linked via HtmlAttr::next)
    HtmlAttr* firstAttr;
    // tree links
    // NOTE(review): presumably parent / first child / next sibling - confirm
    HtmlElement* up;
    HtmlElement* down;
    HtmlElement* next;
    uint codepage;

    // name tests
    bool NameIs(const char* name) const;
    bool NameIsNS(const char* name, const char* ns) const;

    // attribute lookup by name; two variants differing in return type
    // NOTE(review): presumably GetAttribute returns memory the caller must
    // free and GetAttributeTemp returns temp-allocated memory - confirm
    WCHAR* GetAttribute(const char* name) const;
    char* GetAttributeTemp(const char* name) const;

    // child lookup by tag; idx defaults to 0
    // NOTE(review): idx presumably selects the idx-th matching child - confirm
    HtmlElement* GetChildByTag(HtmlTag tag, int idx = 0) const;
};
class HtmlParser {
PoolAllocator allocator;
// text to parse. It can be changed.
char* html = nullptr;
// true if s was allocated by ourselves, false if managed
// by the caller
bool freeHtml = false;
// the codepage used for converting text to Unicode
uint codepage{CP_ACP};
size_t elementsCount = 0;
size_t attributesCount = 0;
HtmlElement* rootElement = nullptr;
HtmlElement* currElement = nullptr;
HtmlElement* AllocElement(HtmlTag tag, char* name, HtmlElement* parent);
HtmlAttr* AllocAttr(char* name, HtmlAttr* next);
void CloseTag(HtmlToken* tok);
void StartTag(HtmlToken* tok);
void AppendAttr(char* name, char* value);
HtmlElement* FindParent(HtmlToken* tok);
HtmlElement* ParseError(HtmlParseError err) {
error = err;
return nullptr;
}
void Reset();
public:
HtmlParseError error{ErrParsingNoError}; // parsing error, a static string
const char* errorContext = nullptr; // pointer within html showing which part we failed to parse
HtmlParser();
~HtmlParser();
HtmlElement* Parse(const ByteSlice& d, uint codepage = CP_ACP);
HtmlElement* ParseInPlace(const ByteSlice& d, uint codepage = CP_ACP);
size_t ElementsCount() const;
size_t TotalAttrCount() const;
HtmlElement* FindElementByName(const char* name, HtmlElement* from = nullptr);
HtmlElement* FindElementByNameNS(const char* name, const char* ns, HtmlElement* from = nullptr);
};
// HTML entity decoding helpers; declared here, defined elsewhere.
// Both take a string plus the codepage it is encoded in; they differ in
// the character type of the result (WCHAR* vs. char*).
// NOTE(review): "Entitites" is a misspelling of "Entities" but is part of
// the public name, so it is left as is.
// NOTE(review): the *Temp variant presumably returns temp-allocated memory
// the caller must not free, while the other returns memory the caller
// owns - confirm in the implementation.
WCHAR* DecodeHtmlEntitites(const char* string, uint codepage);
char* DecodeHtmlEntititesTemp(const char* string, uint codepage);
namespace strconv {

// Duplicates (s, len) with str::DupTemp and decodes the result with
// DecodeHtmlEntitites, treating the text as CP_UTF8. Returns WCHAR*.
inline WCHAR* FromHtmlUtf8(const char* s, size_t len) {
    return DecodeHtmlEntitites(str::DupTemp(s, len), CP_UTF8);
}

// Same as FromHtmlUtf8, but goes through DecodeHtmlEntititesTemp and
// returns char*.
inline char* FromHtmlUtf8Temp(const char* s, size_t len) {
    return DecodeHtmlEntititesTemp(str::DupTemp(s, len), CP_UTF8);
}

} // namespace strconv