Permalink
Browse files

Allow remove_empty_nodes to be passed in as an option; add demo code in bin/readability to show how to keep images and links
  • Loading branch information...
1 parent 365f177 commit 330e2bd8a73ffcb2dab0e6d6175dafce71e308a6 @cantino cantino committed Jun 10, 2011
Showing with 44 additions and 8 deletions.
  1. +7 −0 README
  2. +29 −2 bin/readability
  3. +8 −6 lib/readability.rb
View
@@ -15,6 +15,13 @@ Example:
source = open('http://lab.arc90.com/experiments/readability/').read
puts Readability::Document.new(source).content
+There is also a command-line tool for testing readability in bin/readability.
+
+ Usage: readability [options] URL
+     -d, --debug                      Show debug output
+     -i, --images                     Keep images and links
+     -h, --help                       Show this message
+
===
This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
View
@@ -2,12 +2,39 @@
$KCODE='u'
require 'rubygems'
require 'open-uri'
+require 'optparse'
require File.dirname(__FILE__) + '/../lib/readability'
+options = { :debug => false, :images => false }
+options_parser = OptionParser.new do |opts|
+ opts.banner = "Usage: #{File.basename($0)} [options] URL"
+
+ opts.on("-d", "--debug", "Show debug output") do |v|
+ options[:debug] = v
+ end
+
+ opts.on("-i", "--images", "Keep images and links") do |i|
+ options[:images] = i
+ end
+
+ opts.on_tail("-h", "--help", "Show this message") do
+ puts opts
+ exit
+ end
+end
+options_parser.parse!
+
if ARGV.length != 1
- STDERR.puts "Usage: #{File.basename($0)} URL"
+ STDERR.puts options_parser
exit 1
end
text = open(ARGV.first).read
-puts Readability::Document.new(text).content
+if options[:images]
+ puts Readability::Document.new(text, :tags => %w[div p img a],
+ :attributes => %w[src href],
+ :remove_empty_nodes => false,
+ :debug => options[:debug]).content
+else
+ puts Readability::Document.new(text, :debug => options[:debug]).content
+end
View
@@ -8,7 +8,8 @@ class Document
:min_text_length => 25,
:remove_unlikely_candidates => true,
:weight_classes => true,
- :clean_conditionally => true
+ :clean_conditionally => true,
+ :remove_empty_nodes => true
}.freeze
attr_accessor :options, :html
@@ -221,7 +222,7 @@ def transform_misused_divs_into_paragraphs!
# wrap text nodes in p tags
# elem.children.each do |child|
# if child.text?
-## debug("wrapping text node with a p")
+# debug("wrapping text node with a p")
# child.swap("<p>#{child.text}</p>")
# end
# end
@@ -238,9 +239,11 @@ def sanitize(node, candidates, options = {})
elem.remove
end
- # remove empty <p> tags
- node.css("p").each do |elem|
- elem.remove if elem.content.strip.empty?
+ if @options[:remove_empty_nodes]
+ # remove <p> tags that have no text content - this will also remove p tags that contain only images.
+ node.css("p").each do |elem|
+ elem.remove if elem.content.strip.empty?
+ end
end
# Conditionally clean <table>s, <ul>s, and <div>s
@@ -259,7 +262,6 @@ def sanitize(node, candidates, options = {})
base_replace_with_whitespace.each { |tag| replace_with_whitespace[tag] = true }
([node] + node.css("*")).each do |el|
-
# If element is in whitelist, delete all its attributes
if whitelist[el.node_name]
el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }

0 comments on commit 330e2bd

Please sign in to comment.