Skip to content

Commit

Permalink
Merge pull request #1459 from tracim/fix/1154__check_empty_htlml_tags…
Browse files Browse the repository at this point in the history
…_in_comments

Fix/1154  check empty htlml tags in comments
  • Loading branch information
buxx committed Feb 28, 2019
2 parents 413085f + 2755083 commit 35883ff
Show file tree
Hide file tree
Showing 8 changed files with 180 additions and 34 deletions.
1 change: 1 addition & 0 deletions backend/tracim_backend/error.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
# Validation Error
GENERIC_SCHEMA_VALIDATION_ERROR = 2001
INTERNAL_TRACIM_VALIDATION_ERROR = 2002
EMPTY_COMMENT_NOT_ALLOWED = 2003
# Not in Tracim Request #
USER_NOT_IN_TRACIM_REQUEST = 2011
WORKSPACE_NOT_IN_TRACIM_REQUEST = 2012
Expand Down
2 changes: 1 addition & 1 deletion backend/tracim_backend/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ class EmptyLabelNotAllowed(EmptyValueNotAllowed):


class EmptyCommentContentNotAllowed(EmptyValueNotAllowed):
pass
error_code = error.EMPTY_COMMENT_NOT_ALLOWED


class EmptyEmailBody(EmptyValueNotAllowed):
Expand Down
2 changes: 1 addition & 1 deletion backend/tracim_backend/fixtures/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ def insert(self):

content_api.create_comment(
parent=best_cake_thread,
content='<p>What is for you the best cake ever? </br> I personnally vote for Chocolate cupcake!</p>', # nopep8
content='<p>What is for you the best cake ever? <br/> I personnally vote for Chocolate cupcake!</p>', # nopep8
do_save=True,
)
bob_content_api.create_comment(
Expand Down
9 changes: 9 additions & 0 deletions backend/tracim_backend/lib/core/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@
from tracim_backend.lib.utils.utils import cmp_to_key
from tracim_backend.lib.utils.utils import current_date_for_filename
from tracim_backend.lib.utils.utils import preview_manager_page_format
from tracim_backend.lib.mail_fetcher.email_processing.sanitizer import HtmlSanitizer # nopep8
from tracim_backend.lib.mail_fetcher.email_processing.sanitizer import HtmlSanitizerConfig # nopep8
from tracim_backend.models.auth import User
from tracim_backend.models.context_models import ContentInContext
from tracim_backend.models.context_models import PreviewAllowedDim
Expand Down Expand Up @@ -617,6 +619,13 @@ def create_comment(self, workspace: Workspace=None, parent: Content=None, conten
if not content:
raise EmptyCommentContentNotAllowed()

config = HtmlSanitizerConfig(tag_blacklist=['script'])
satinizer = HtmlSanitizer(html_body=content, config=config)
if satinizer.is_html():
content = satinizer.sanitize()
if satinizer.html_is_empty():
raise EmptyCommentContentNotAllowed()

item = self.create(
content_type_slug=content_type_list.Comment.slug,
workspace=workspace,
Expand Down
4 changes: 2 additions & 2 deletions backend/tracim_backend/lib/mail_fetcher/email_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,14 @@ def get_body(
if use_txt_parsing:
txt_body = EmailReplyParser.parse_reply(txt_body)
html_body = markdown.markdown(txt_body)
body = HtmlSanitizer.sanitize(html_body)
body = HtmlSanitizer(html_body).sanitize()

elif content_type == CONTENT_TYPE_TEXT_HTML:
html_body = body_part.get_payload(decode=True).decode(
charset)
if use_html_parsing:
html_body = str(ParsedHTMLMail(html_body))
body = HtmlSanitizer.sanitize(html_body)
body = HtmlSanitizer(html_body).sanitize()
if not body:
raise EmptyEmailBody()
return body
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,22 @@


class HtmlSanitizerConfig(object):
# whitelist : keep tag and content
Tag_whitelist = TAG_WHITELIST
Attrs_whitelist = ATTRS_WHITELIST
# blacklist : remove content
Tag_blacklist = TAG_BLACKLIST
Class_blacklist = CLASS_BLACKLIST
Id_blacklist = ID_BLACKLIST

def __init__(
self,
tag_whitelist: list = TAG_WHITELIST,
attrs_whitelist: list = ATTRS_WHITELIST,
tag_blacklist: list = TAG_BLACKLIST,
class_blacklist: list = CLASS_BLACKLIST,
id_blacklist: list = ID_BLACKLIST,
parser: str = 'html.parser'
):
self.tag_whitelist = tag_whitelist
self.attrs_whitelist = attrs_whitelist
self.tag_blacklist = tag_blacklist
self.class_blacklist = class_blacklist
self.id_blacklist = id_blacklist
self.parser = parser


class HtmlSanitizer(object):
Expand All @@ -28,41 +37,78 @@ class HtmlSanitizer(object):
- Remove non-whitelisted attributes
"""

@classmethod
def sanitize(cls, html_body: str) -> typing.Optional[str]:
soup = BeautifulSoup(html_body, 'html.parser')
for tag in soup.findAll():
if cls._tag_to_extract(tag):
def __init__(self, html_body: str, config: HtmlSanitizerConfig = None):
if config is None:
config = HtmlSanitizerConfig()
self.config = config
self.soup = BeautifulSoup(html_body, config.parser)

def sanitize(self) -> typing.Optional[str]:
for tag in self.soup.findAll():
if self._tag_to_extract(tag):
tag.extract()
elif tag.name.lower() in HtmlSanitizerConfig.Tag_whitelist:
elif tag.name.lower() in self.config.tag_whitelist:
attrs = dict(tag.attrs)
for attr in attrs:
if attr not in HtmlSanitizerConfig.Attrs_whitelist:
if attr not in self.config.attrs_whitelist:
del tag.attrs[attr]
else:
tag.unwrap()

if cls._is_content_empty(soup):
if self._is_content_empty(self.soup):
return None
else:
return str(soup)
return str(self.soup)

@classmethod
def _is_content_empty(cls, soup):
def _is_content_empty(self, soup: Tag) -> bool:
img = soup.find('img')
txt = soup.get_text().replace('\n', '').strip()
return (not img and not txt)

@classmethod
def _tag_to_extract(cls, tag: Tag) -> bool:
if tag.name.lower() in HtmlSanitizerConfig.Tag_blacklist:
def _tag_to_extract(self, tag: Tag) -> bool:
if tag.name.lower() in self.config.tag_blacklist:
return True
if 'class' in tag.attrs:
for elem in HtmlSanitizerConfig.Class_blacklist:
for elem in self.config.class_blacklist:
if elem in tag.attrs['class']:
return True
if 'id' in tag.attrs:
for elem in HtmlSanitizerConfig.Id_blacklist:
for elem in self.config.id_blacklist:
if elem in tag.attrs['id']:
return True
return False

def html_is_empty(self, node_to_inspect: Tag = None) -> bool:
"""
Inspects all nodes of an html tag and returns True if all of them are
empty.
:param node_to_inspect: Node to inspect, if None the root node is used
:type node_to_inspect: bs4.Tag
:returns: boolean
:Example:
<p></p> return True
<p><p></p></p> returns True
<p><p>some text</p></p> return False
"""
if not node_to_inspect:
node_to_inspect = self.soup
if not self._is_content_empty(node_to_inspect):
return False
return all(
(self.html_is_empty(child) for child in node_to_inspect.children)
)

def is_html(self) -> bool:
"""
Checks if the html_body given to the constructor contains html
:returns: boolean
:Example:
<p>some text</p> return True
'some text' return False
"""
return self.soup.find() is not None
4 changes: 3 additions & 1 deletion backend/tracim_backend/lib/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import datetime
import random
import string
from lxml import html
from lxml.etree import ParseError
from os.path import normpath as base_normpath
from urllib.parse import urljoin
from urllib.parse import urlencode
Expand Down Expand Up @@ -308,4 +310,4 @@ def normpath(path):
path = b'/'
elif path == '':
path = '/'
return base_normpath(path)
return base_normpath(path)
100 changes: 94 additions & 6 deletions backend/tracim_backend/tests/functional/test_comments.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def test_api__get_contents_comments__ok_200__nominal_case(self) -> None:
comment = res.json_body[0]
assert comment['content_id'] == 18
assert comment['parent_id'] == 7
assert comment['raw_content'] == '<p>What is for you the best cake ever? </br> I personnally vote for Chocolate cupcake!</p>' # nopep8
assert comment['raw_content'] == '<p>What is for you the best cake ever? <br/> I personnally vote for Chocolate cupcake!</p>' # nopep8
assert comment['author']
assert comment['author']['user_id'] == 1
# TODO - G.M - 2018-06-172 - [avatar] setup avatar url
Expand Down Expand Up @@ -204,9 +204,6 @@ def test_api__post_content_comment__err_400__content_not_editable(self) -> None:
assert res.json_body['code'] == error.CONTENT_IN_NOT_EDITABLE_STATE

def test_api__post_content_comment__err_400__empty_raw_content(self) -> None:
"""
Get alls comments of a content
"""
self.testapp.authorization = (
'Basic',
(
Expand All @@ -225,7 +222,98 @@ def test_api__post_content_comment__err_400__empty_raw_content(self) -> None:
# INFO - G.M - 2018-09-10 - error handle by marshmallow validator.
assert res.json_body
assert 'code' in res.json_body
assert res.json_body['code'] == error.GENERIC_SCHEMA_VALIDATION_ERROR # nopep8
assert res.json_body['code'] == error.GENERIC_SCHEMA_VALIDATION_ERROR

def test_api__post_content_comment__err_400__empty_simple_html(self) -> None:
self.testapp.authorization = (
'Basic',
('admin@admin.admin', 'admin@admin.admin')
)
params = {'raw_content': '<p></p>'}
res = self.testapp.post_json(
'/api/v2/workspaces/2/contents/7/comments',
params=params,
status=400
)
assert res.json_body
assert 'code' in res.json_body
assert res.json_body['code'] == error.EMPTY_COMMENT_NOT_ALLOWED

def test_api__post_content_comment__err_400__empty_nested_html(self) -> None:
self.testapp.authorization = (
'Basic',
('admin@admin.admin', 'admin@admin.admin')
)
params = {'raw_content': '<p><p></p><p><p></p></p></p>'}
res = self.testapp.post_json(
'/api/v2/workspaces/2/contents/7/comments',
params=params,
status=400
)
assert res.json_body
assert 'code' in res.json_body
assert res.json_body['code'] == error.EMPTY_COMMENT_NOT_ALLOWED

def test_api__post_content_comment__err_400__only__tags_nested_html(self) -> None:
self.testapp.authorization = (
'Basic',
('admin@admin.admin', 'admin@admin.admin')
)
params = {'raw_content': '<p><p></p><p><p><br/><br/></p><br/></p></p>'}
res = self.testapp.post_json(
'/api/v2/workspaces/2/contents/7/comments',
params=params,
status=400
)
assert res.json_body
assert 'code' in res.json_body
assert res.json_body['code'] == error.EMPTY_COMMENT_NOT_ALLOWED

def test_api__post_content_comment__err_400__unclosed_empty_tag(self) -> None:
self.testapp.authorization = (
'Basic',
('admin@admin.admin', 'admin@admin.admin')
)
params = {'raw_content': '<p></i>'}
res = self.testapp.post_json(
'/api/v2/workspaces/2/contents/7/comments',
params=params,
status=400
)
assert res.json_body
assert 'code' in res.json_body
assert res.json_body['code'] == error.EMPTY_COMMENT_NOT_ALLOWED

def test_api__post_content_comment__err_400__unclosed_tag_not_empty(self) -> None:
"""
This test should raise an error if we validate the html
The browser will close the p tag and removes the i tag so the html is valid
"""
self.testapp.authorization = (
'Basic',
('admin@admin.admin', 'admin@admin.admin')
)
params = {'raw_content': '<p>Hello</i>'}
self.testapp.post_json(
'/api/v2/workspaces/2/contents/7/comments',
params=params,
status=200
)

def test_api__post_content_comment__err_400__invalid_html(self) -> None:
"""
This test should raise an error as the html isn't valid
"""
self.testapp.authorization = (
'Basic',
('admin@admin.admin', 'admin@admin.admin')
)
params = {'raw_content': '<p></p>Hello'}
self.testapp.post_json(
'/api/v2/workspaces/2/contents/7/comments',
params=params,
status=200
)

def test_api__delete_content_comment__ok_200__user_is_owner_and_workspace_manager(self) -> None: # nopep8
"""
Expand All @@ -243,7 +331,7 @@ def test_api__delete_content_comment__ok_200__user_is_owner_and_workspace_manage
comment = res.json_body[0]
assert comment['content_id'] == 18
assert comment['parent_id'] == 7
assert comment['raw_content'] == '<p>What is for you the best cake ever? </br> I personnally vote for Chocolate cupcake!</p>' # nopep8
assert comment['raw_content'] == '<p>What is for you the best cake ever? <br/> I personnally vote for Chocolate cupcake!</p>' # nopep8
assert comment['author']
assert comment['author']['user_id'] == 1
# TODO - G.M - 2018-06-172 - [avatar] setup avatar url
Expand Down

0 comments on commit 35883ff

Please sign in to comment.