Skip to content

Commit

Permalink
Fixed #2416 - HTML sanitizer blocks email processing because of an endless loop.
Browse files Browse the repository at this point in the history
  • Loading branch information
Billy Zhou authored and thorsteneckel committed Jan 18, 2019
1 parent cfc1bdb commit 1c55fc0
Show file tree
Hide file tree
Showing 2 changed files with 193 additions and 160 deletions.
330 changes: 170 additions & 160 deletions lib/html_sanitizer.rb
@@ -1,5 +1,7 @@
class HtmlSanitizer
LINKABLE_URL_SCHEMES = URI.scheme_list.keys.map(&:downcase) - ['mailto'] + ['tel']
PROCESSING_TIMEOUT = 10
UNPROCESSABLE_HTML_MSG = 'This message cannot be displayed due to HTML processing issues. Download the raw message below and open it via an Email client if you still wish to view it.'.freeze

=begin
Expand All @@ -9,198 +11,202 @@ class HtmlSanitizer
=end

def self.strict(string, external = false)
@fqdn = Setting.get('fqdn')
def self.strict(string, external = false, timeout: true)
Timeout.timeout(timeout ? PROCESSING_TIMEOUT : nil) do
@fqdn = Setting.get('fqdn')

# config
tags_remove_content = Rails.configuration.html_sanitizer_tags_remove_content
tags_quote_content = Rails.configuration.html_sanitizer_tags_quote_content
tags_whitelist = Rails.configuration.html_sanitizer_tags_whitelist
attributes_whitelist = Rails.configuration.html_sanitizer_attributes_whitelist
css_properties_whitelist = Rails.configuration.html_sanitizer_css_properties_whitelist
css_values_blacklist = Rails.application.config.html_sanitizer_css_values_backlist
# config
tags_remove_content = Rails.configuration.html_sanitizer_tags_remove_content
tags_quote_content = Rails.configuration.html_sanitizer_tags_quote_content
tags_whitelist = Rails.configuration.html_sanitizer_tags_whitelist
attributes_whitelist = Rails.configuration.html_sanitizer_attributes_whitelist
css_properties_whitelist = Rails.configuration.html_sanitizer_css_properties_whitelist
css_values_blacklist = Rails.application.config.html_sanitizer_css_values_backlist

# We whitelist yahoo_quoted because Yahoo Mail marks quoted email content using
# <div class='yahoo_quoted'> and we rely on this class to identify quoted messages
classes_whitelist = ['js-signatureMarker', 'yahoo_quoted']
attributes_2_css = %w[width height]
# We whitelist yahoo_quoted because Yahoo Mail marks quoted email content using
# <div class='yahoo_quoted'> and we rely on this class to identify quoted messages
classes_whitelist = ['js-signatureMarker', 'yahoo_quoted']
attributes_2_css = %w[width height]

# remove html comments
string.gsub!(/<!--.+?-->/m, '')
# remove html comments
string.gsub!(/<!--.+?-->/m, '')

scrubber_link = Loofah::Scrubber.new do |node|
scrubber_link = Loofah::Scrubber.new do |node|

# wrap plain-text URLs in <a> tags
if node.is_a?(Nokogiri::XML::Text) && node.content.present? && node.content.include?(':') && node.ancestors.map(&:name).exclude?('a')
urls = URI.extract(node.content, LINKABLE_URL_SCHEMES)
.map { |u| u.sub(/[,.]$/, '') } # URI::extract captures trailing dots/commas
.reject { |u| u.match?(/^[^:]+:$/) } # URI::extract will match, e.g., 'tel:'
# wrap plain-text URLs in <a> tags
if node.is_a?(Nokogiri::XML::Text) && node.content.present? && node.content.include?(':') && node.ancestors.map(&:name).exclude?('a')
urls = URI.extract(node.content, LINKABLE_URL_SCHEMES)
.map { |u| u.sub(/[,.]$/, '') } # URI::extract captures trailing dots/commas
.reject { |u| u.match?(/^[^:]+:$/) } # URI::extract will match, e.g., 'tel:'

next if urls.blank?
next if urls.blank?

add_link(node.content, urls, node)
end
add_link(node.content, urls, node)
end

# prepare links
if node['href']
href = cleanup_target(node['href'], keep_spaces: true)
href_without_spaces = href.gsub(/[[:space:]]/, '')
if external && href_without_spaces.present? && !href_without_spaces.downcase.start_with?('//') && href_without_spaces.downcase !~ %r{^.{1,6}://.+?}
node['href'] = "http://#{node['href']}"
href = node['href']
# prepare links
if node['href']
href = cleanup_target(node['href'], keep_spaces: true)
href_without_spaces = href.gsub(/[[:space:]]/, '')
end
if external && href_without_spaces.present? && !href_without_spaces.downcase.start_with?('//') && href_without_spaces.downcase !~ %r{^.{1,6}://.+?}
node['href'] = "http://#{node['href']}"
href = node['href']
href_without_spaces = href.gsub(/[[:space:]]/, '')
end

next if !href_without_spaces.downcase.start_with?('http', 'ftp', '//')
next if !href_without_spaces.downcase.start_with?('http', 'ftp', '//')

node.set_attribute('href', href)
node.set_attribute('rel', 'nofollow noreferrer noopener')
node.set_attribute('target', '_blank')
end
node.set_attribute('href', href)
node.set_attribute('rel', 'nofollow noreferrer noopener')
node.set_attribute('target', '_blank')
end

if node.name == 'a' && node['href'].blank?
node.replace node.children.to_s
Loofah::Scrubber::STOP
end
if node.name == 'a' && node['href'].blank?
node.replace node.children.to_s
Loofah::Scrubber::STOP
end

# check if href is different to text
if node.name == 'a' && !url_same?(node['href'], node.text)
if node['title'].blank?
node['title'] = node['href']
# check if href is different to text
if node.name == 'a' && !url_same?(node['href'], node.text)
if node['title'].blank?
node['title'] = node['href']
end
end
end
end

scrubber_wipe = Loofah::Scrubber.new do |node|
scrubber_wipe = Loofah::Scrubber.new do |node|

# remove tags with subtree
if tags_remove_content.include?(node.name)
node.remove
Loofah::Scrubber::STOP
end

# remove tag, insert quoted content
if tags_quote_content.include?(node.name)
string = html_decode(node.content)
text = Nokogiri::XML::Text.new(string, node.document)
node.add_next_sibling(text)
node.remove
Loofah::Scrubber::STOP
end

# replace tags, keep subtree
if !tags_whitelist.include?(node.name)
node.replace node.children.to_s
Loofah::Scrubber::STOP
end
# remove tags with subtree
if tags_remove_content.include?(node.name)
node.remove
Loofah::Scrubber::STOP
end

# prepare src attribute
if node['src']
src = cleanup_target(node['src'])
if src =~ /(javascript|livescript|vbscript):/i || src.downcase.start_with?('http', 'ftp', '//')
# remove tag, insert quoted content
if tags_quote_content.include?(node.name)
string = html_decode(node.content)
text = Nokogiri::XML::Text.new(string, node.document)
node.add_next_sibling(text)
node.remove
Loofah::Scrubber::STOP
end
end

# clean class / only use allowed classes
if node['class']
classes = node['class'].gsub(/\t|\n|\r/, '').split(' ')
class_new = ''
classes.each do |local_class|
next if !classes_whitelist.include?(local_class.to_s.strip)
# replace tags, keep subtree
if !tags_whitelist.include?(node.name)
node.replace node.children.to_s
Loofah::Scrubber::STOP
end

if class_new != ''
class_new += ' '
# prepare src attribute
if node['src']
src = cleanup_target(node['src'])
if src =~ /(javascript|livescript|vbscript):/i || src.downcase.start_with?('http', 'ftp', '//')
node.remove
Loofah::Scrubber::STOP
end
class_new += local_class
end
if class_new != ''
node['class'] = class_new
else
node.delete('class')
end
end

# move style attributes to css attributes
attributes_2_css.each do |key|
next if !node[key]
# clean class / only use allowed classes
if node['class']
classes = node['class'].gsub(/\t|\n|\r/, '').split(' ')
class_new = ''
classes.each do |local_class|
next if !classes_whitelist.include?(local_class.to_s.strip)

if node['style'].blank?
node['style'] = ''
else
node['style'] += ';'
if class_new != ''
class_new += ' '
end
class_new += local_class
end
if class_new != ''
node['class'] = class_new
else
node.delete('class')
end
end
value = node[key]
node.delete(key)
next if value.blank?

value += 'px' if !value.match?(/%|px|em/i)
node['style'] += "#{key}:#{value}"
end

# clean style / only use allowed style properties
if node['style']
pears = node['style'].downcase.gsub(/\t|\n|\r/, '').split(';')
style = ''
pears.each do |local_pear|
prop = local_pear.split(':')
next if !prop[0]
# move style attributes to css attributes
attributes_2_css.each do |key|
next if !node[key]

key = prop[0].strip
next if !css_properties_whitelist.include?(node.name)
next if !css_properties_whitelist[node.name].include?(key)
next if css_values_blacklist[node.name]&.include?(local_pear.gsub(/[[:space:]]/, '').strip)
if node['style'].blank?
node['style'] = ''
else
node['style'] += ';'
end
value = node[key]
node.delete(key)
next if value.blank?

style += "#{local_pear};"
value += 'px' if !value.match?(/%|px|em/i)
node['style'] += "#{key}:#{value}"
end
node['style'] = style
if style == ''
node.delete('style')

# clean style / only use allowed style properties
if node['style']
pears = node['style'].downcase.gsub(/\t|\n|\r/, '').split(';')
style = ''
pears.each do |local_pear|
prop = local_pear.split(':')
next if !prop[0]

key = prop[0].strip
next if !css_properties_whitelist.include?(node.name)
next if !css_properties_whitelist[node.name].include?(key)
next if css_values_blacklist[node.name]&.include?(local_pear.gsub(/[[:space:]]/, '').strip)

style += "#{local_pear};"
end
node['style'] = style
if style == ''
node.delete('style')
end
end
end

# scan for invalid link content
%w[href style].each do |attribute_name|
next if !node[attribute_name]
# scan for invalid link content
%w[href style].each do |attribute_name|
next if !node[attribute_name]

href = cleanup_target(node[attribute_name])
next if href !~ /(javascript|livescript|vbscript):/i
href = cleanup_target(node[attribute_name])
next if href !~ /(javascript|livescript|vbscript):/i

node.delete(attribute_name)
end
node.delete(attribute_name)
end

# remove attributes if not whitelisted
node.each do |attribute, _value|
attribute_name = attribute.downcase
next if attributes_whitelist[:all].include?(attribute_name) || (attributes_whitelist[node.name]&.include?(attribute_name))
# remove attributes if not whitelisted
node.each do |attribute, _value|
attribute_name = attribute.downcase
next if attributes_whitelist[:all].include?(attribute_name) || (attributes_whitelist[node.name]&.include?(attribute_name))

node.delete(attribute)
end
node.delete(attribute)
end

# remove mailto links
if node['href']
href = cleanup_target(node['href'])
if href =~ /mailto:(.*)$/i
text = Nokogiri::XML::Text.new($1, node.document)
node.add_next_sibling(text)
node.remove
Loofah::Scrubber::STOP
# remove mailto links
if node['href']
href = cleanup_target(node['href'])
if href =~ /mailto:(.*)$/i
text = Nokogiri::XML::Text.new($1, node.document)
node.add_next_sibling(text)
node.remove
Loofah::Scrubber::STOP
end
end
end
end

new_string = ''
done = true
while done
new_string = Loofah.fragment(string).scrub!(scrubber_wipe).to_s
if string == new_string
done = false
new_string = ''
done = true
while done
new_string = Loofah.fragment(string).scrub!(scrubber_wipe).to_s
if string == new_string
done = false
end
string = new_string
end
string = new_string
end

Loofah.fragment(string).scrub!(scrubber_link).to_s
Loofah.fragment(string).scrub!(scrubber_link).to_s
end
rescue Timeout::Error => e
UNPROCESSABLE_HTML_MSG
end

=begin
Expand All @@ -214,21 +220,25 @@ def self.strict(string, external = false)
=end

def self.cleanup(string)
string.gsub!(/<[A-z]:[A-z]>/, '')
string.gsub!(%r{</[A-z]:[A-z]>}, '')
string.delete!("\t")
def self.cleanup(string, timeout: true)
Timeout.timeout(timeout ? PROCESSING_TIMEOUT : nil) do
string.gsub!(/<[A-z]:[A-z]>/, '')
string.gsub!(%r{</[A-z]:[A-z]>}, '')
string.delete!("\t")

# remove all new lines
string.gsub!(/(\n\r|\r\r\n|\r\n|\n)/, "\n")
# remove all new lines
string.gsub!(/(\n\r|\r\r\n|\r\n|\n)/, "\n")

# remove double multiple empty lines
string.gsub!(/\n\n\n+/, "\n\n")
# remove double multiple empty lines
string.gsub!(/\n\n\n+/, "\n\n")

string = cleanup_structure(string, 'pre')
string = cleanup_replace_tags(string)
string = cleanup_structure(string)
string
string = cleanup_structure(string, 'pre')
string = cleanup_replace_tags(string)
string = cleanup_structure(string)
string
end
rescue Timeout::Error => e
UNPROCESSABLE_HTML_MSG
end

def self.cleanup_replace_tags(string)
Expand Down

0 comments on commit 1c55fc0

Please sign in to comment.