Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

When sanitizing elements, add a space when replacing a block element such as br with its text.
  • Loading branch information...
commit 99ecb337b5056eff8b87d289876734b0d0a28f10 1 parent 3c8ab69
@libc libc authored iterationlabs committed
Showing with 31 additions and 3 deletions.
  1. +12 −2 lib/readability.rb
  2. +19 −1 spec/readability_spec.rb
View
14 lib/readability.rb
@@ -248,10 +248,15 @@ def sanitize(node, candidates, options = {})
# We'll sanitize all elements using a whitelist
base_whitelist = @options[:tags] || %w[div p]
+ # We'll add whitespace instead of block elements,
+ # so a<br>b will have a nice space between them
+ base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center]
# Use a hash for speed (don't want to make a million calls to include?)
whitelist = Hash.new
base_whitelist.each {|tag| whitelist[tag] = true }
+ replace_with_whitespace = Hash[base_replace_with_whitespace.map { |tag| [tag, true] }]
+
([node] + node.css("*")).each do |el|
# If element is in whitelist, delete all its attributes
@@ -260,13 +265,18 @@ def sanitize(node, candidates, options = {})
# Otherwise, replace the element with its contents
else
- el.swap(el.text)
+ if replace_with_whitespace[el.node_name]
+ # Adding &nbsp; here, because swap removes regular spaces
+ el.swap('&nbsp;' << el.text << '&nbsp;')
+ else
+ el.swap(el.text)
+ end
end
end
# Get rid of duplicate whitespace
- node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
+ node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t  ]+/, " ")
end
def clean_conditionally(node, candidates, selector)
View
20 spec/readability_spec.rb
@@ -178,7 +178,25 @@
@doc.content.should_not match("sidebar")
end
end
-
+
+ describe "inserting space for block elements" do
+ before do
+ @doc = Readability::Document.new(<<-HTML, :min_text_length => 0, :retry_length => 1)
+ <html><head><title>title!</title></head>
+ <body>
+ <div>
+ <p>a<br>b<hr>c<address>d</address>f/p>
+ </div>
+ </body>
+ </html>
+ HTML
+ end
+
+ it "should not return the sidebar" do
+ @doc.content.should_not match("a b c d f")
+ end
+ end
+
describe "outputs good stuff for known documents" do
before do
@html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html")
Please sign in to comment.
Something went wrong with that request. Please try again.