/
whitelist.py
155 lines (129 loc) · 5.53 KB
/
whitelist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
"""
A generic HTML whitelisting engine, designed to accommodate subclassing to override
specific rules.
"""
import re
from bs4 import BeautifulSoup, Comment, NavigableString, Tag
from django.utils.html import escape
ALLOWED_URL_SCHEMES = ["http", "https", "ftp", "mailto", "tel"]

# Matches an absolute-URL scheme prefix, e.g. "http:" — used to decide whether
# a URL carries a scheme that must be checked against ALLOWED_URL_SCHEMES.
PROTOCOL_RE = re.compile("^[a-z0-9][-+.a-z0-9]*:")


def check_url(url_string):
    """
    Attribute rule for URL-valued attributes (see attribute_rule).

    Returns url_string unchanged if it is a relative URL, or an absolute URL
    whose scheme is in ALLOWED_URL_SCHEMES; returns None otherwise (which
    causes the attribute to be dropped).
    """
    # Remove control characters and other disallowed characters
    # Browsers sometimes ignore these, so that 'jav\tascript:alert("XSS")'
    # is treated as a valid javascript: link
    unescaped = url_string.lower()
    # Undo HTML entity escaping of < > & before inspecting the scheme, since
    # browsers may decode these when following the link.
    unescaped = unescaped.replace("&lt;", "<")
    unescaped = unescaped.replace("&gt;", ">")
    unescaped = unescaped.replace("&amp;", "&")
    # Strip backticks, C0/C1 control characters and whitespace entirely.
    unescaped = re.sub(r"[`\000-\040\177-\240\s]+", "", unescaped)
    # Drop Unicode replacement characters left behind by bad decoding.
    unescaped = unescaped.replace("\ufffd", "")
    if PROTOCOL_RE.match(unescaped):
        protocol = unescaped.split(":", 1)[0]
        if protocol not in ALLOWED_URL_SCHEMES:
            return None
    return url_string
def attribute_rule(allowed_attrs):
    """
    Generator for functions that can be used as entries in Whitelister.element_rules.

    The returned function takes a tag and filters its attributes against the
    'allowed_attrs' dict:
    * an attribute with no entry (or a falsy entry) is removed;
    * an attribute whose entry is callable is replaced by the result of
      calling it on the current value — e.g. `{'title': uppercase}` replaces
      'title' with its uppercased value; a None result removes the attribute;
    * any other truthy entry keeps the attribute untouched.
    """
    def fn(tag):
        # Snapshot the attribute names: we delete entries while iterating.
        for name in list(tag.attrs):
            rule = allowed_attrs.get(name)
            if not rule:
                # No rule (or a falsy one) - the attribute is not allowed.
                del tag[name]
            elif callable(rule):
                replacement = rule(tag.attrs[name])
                if replacement is None:
                    del tag[name]
                else:
                    tag[name] = replacement
            # else: truthy, non-callable rule - keep the attribute as-is.

    return fn
# Rule that keeps the tag itself but strips every attribute from it.
allow_without_attributes = attribute_rule({})

# Default tag -> rule mapping for Whitelister.element_rules. Tags listed here
# are kept, with their attributes filtered by the associated rule; tags absent
# from this dict are unwrapped (the tag is removed, its contents kept).
# "[document]" is BeautifulSoup's name for the document root node.
DEFAULT_ELEMENT_RULES = {
    "[document]": allow_without_attributes,
    "a": attribute_rule({"href": check_url}),
    "b": allow_without_attributes,
    "br": allow_without_attributes,
    "div": allow_without_attributes,
    "em": allow_without_attributes,
    "h1": allow_without_attributes,
    "h2": allow_without_attributes,
    "h3": allow_without_attributes,
    "h4": allow_without_attributes,
    "h5": allow_without_attributes,
    "h6": allow_without_attributes,
    "hr": allow_without_attributes,
    "i": allow_without_attributes,
    "img": attribute_rule(
        {"src": check_url, "width": True, "height": True, "alt": True}
    ),
    "li": allow_without_attributes,
    "ol": allow_without_attributes,
    "p": allow_without_attributes,
    "strong": allow_without_attributes,
    "sub": allow_without_attributes,
    "sup": allow_without_attributes,
    "ul": allow_without_attributes,
}
class Whitelister:
    """
    Strips disallowed elements and attributes from an HTML fragment according
    to element_rules. Subclass and override element_rules (or the clean_*
    hooks) to customise the behaviour.
    """

    element_rules = DEFAULT_ELEMENT_RULES

    def clean(self, html):
        """Clean up an HTML string to contain just the allowed elements /
        attributes"""
        soup = BeautifulSoup(html, "html.parser")
        self.clean_node(soup, soup)
        # Serialise via django.utils.html.escape rather than BeautifulSoup's
        # default EntitySubstitution.substitute_html formatter: escape also
        # turns " into &quot; (in addition to escaping < > &). Without this,
        # BeautifulSoup tries to be clever and wraps attribute values in
        # single quotes, which confuses our regexp-based db-HTML-to-real-HTML
        # conversion.
        return soup.decode(formatter=escape)

    def clean_node(self, doc, node):
        """Clean a BeautifulSoup node in-place, dispatching on its type."""
        if isinstance(node, NavigableString):
            self.clean_string_node(doc, node)
            return
        if isinstance(node, Tag):
            self.clean_tag_node(doc, node)
            return
        # Fallback for a BeautifulSoup object that inherits from neither
        # NavigableString nor Tag. No example of such a thing is known at
        # the moment, so this branch is untested.
        self.clean_unknown_node(doc, node)  # pragma: no cover

    def clean_string_node(self, doc, node):
        """Clean a string node: comments are dropped, plain text is kept."""
        if isinstance(node, Comment):
            # Comments are never whitelisted - remove them.
            node.extract()
        # Ordinary string nodes need no changes.

    def clean_tag_node(self, doc, tag):
        """Clean a tag and everything beneath it, in-place."""
        # Whitelist the children first. NB cleaning can mutate tag.contents
        # while we walk it (e.g. unwrap splices grandchildren in), so take a
        # static snapshot with list() to avoid losing our place.
        for child in list(tag.contents):
            self.clean_node(doc, child)

        # Tags with no entry in element_rules are not recognised, so
        # KILL THEM WITH FIRE (unwrap: drop the tag, keep its contents).
        if tag.name not in self.element_rules:
            tag.unwrap()
            return

        # Recognised tag - apply its attribute rule.
        self.element_rules[tag.name](tag)

    def clean_unknown_node(self, doc, node):
        """Node of unknown type, so KILL IT WITH FIRE (remove entirely)."""
        node.decompose()