Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Allow remove_empty_nodes to be passed in as an option; add demo code in bin/readability to show how to keep images and links
  • Loading branch information...
commit 330e2bd8a73ffcb2dab0e6d6175dafce71e308a6 1 parent 365f177
Andrew Cantino authored June 09, 2011
7  README
@@ -15,6 +15,13 @@ Example:
15 15
   source = open('http://lab.arc90.com/experiments/readability/').read
16 16
   puts Readability::Document.new(source).content
17 17
 
  18
+There is also a command-line tool for testing readability in bin/readability.
  19
+
  20
+  Usage: readability [options] URL
  21
+      -d, --debug                      Show debug output
  22
+      -i, --images                     Keep images and links
  23
+      -h, --help                       Show this message
  24
+
18 25
 ===
19 26
 
20 27
 This code is under the Apache License 2.0.  http://www.apache.org/licenses/LICENSE-2.0
31  bin/readability
@@ -2,12 +2,39 @@
2 2
 $KCODE='u'
3 3
 require 'rubygems'
4 4
 require 'open-uri'
  5
+require 'optparse'
5 6
 require File.dirname(__FILE__) + '/../lib/readability'
6 7
 
  8
+options = { :debug => false, :images => false }
  9
+options_parser = OptionParser.new do |opts|
  10
+  opts.banner = "Usage: #{File.basename($0)} [options] URL"
  11
+
  12
+  opts.on("-d", "--debug", "Show debug output") do |v|
  13
+    options[:debug] = v
  14
+  end
  15
+
  16
+  opts.on("-i", "--images", "Keep images and links") do |i|
  17
+    options[:images] = i
  18
+  end
  19
+
  20
+  opts.on_tail("-h", "--help", "Show this message") do
  21
+    puts opts
  22
+    exit
  23
+  end
  24
+end
  25
+options_parser.parse!
  26
+
7 27
 if ARGV.length != 1
8  
-  STDERR.puts "Usage: #{File.basename($0)} URL"
  28
+  STDERR.puts options_parser
9 29
   exit 1
10 30
 end
11 31
 
12 32
 text = open(ARGV.first).read
13  
-puts Readability::Document.new(text).content
  33
+if options[:images]
  34
+  puts Readability::Document.new(text, :tags => %w[div p img a], 
  35
+                                       :attributes => %w[src href], 
  36
+                                       :remove_empty_nodes => false, 
  37
+                                       :debug => options[:debug]).content
  38
+else
  39
+  puts Readability::Document.new(text, :debug => options[:debug]).content
  40
+end
14  lib/readability.rb
@@ -8,7 +8,8 @@ class Document
8 8
       :min_text_length => 25,
9 9
       :remove_unlikely_candidates => true,
10 10
       :weight_classes => true,
11  
-      :clean_conditionally => true
  11
+      :clean_conditionally => true,
  12
+      :remove_empty_nodes => true
12 13
     }.freeze
13 14
 
14 15
     attr_accessor :options, :html
@@ -221,7 +222,7 @@ def transform_misused_divs_into_paragraphs!
221 222
           # wrap text nodes in p tags
222 223
 #          elem.children.each do |child|
223 224
 #            if child.text?
224  
-##              debug("wrapping text node with a p")
  225
+#              debug("wrapping text node with a p")
225 226
 #              child.swap("<p>#{child.text}</p>")
226 227
 #            end
227 228
 #          end
@@ -238,9 +239,11 @@ def sanitize(node, candidates, options = {})
238 239
         elem.remove
239 240
       end
240 241
 
241  
-      # remove empty <p> tags
242  
-      node.css("p").each do |elem|
243  
-        elem.remove if elem.content.strip.empty?
  242
+      if @options[:remove_empty_nodes]
  243
+        # remove <p> tags that have no text content - this will also remove p tags that contain only images.
  244
+        node.css("p").each do |elem|
  245
+          elem.remove if elem.content.strip.empty?
  246
+        end
244 247
       end
245 248
 
246 249
       # Conditionally clean <table>s, <ul>s, and <div>s
@@ -259,7 +262,6 @@ def sanitize(node, candidates, options = {})
259 262
       base_replace_with_whitespace.each { |tag| replace_with_whitespace[tag] = true }
260 263
 
261 264
       ([node] + node.css("*")).each do |el|
262  
-
263 265
         # If element is in whitelist, delete all its attributes
264 266
         if whitelist[el.node_name]
265 267
           el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }

0 notes on commit 330e2bd

Please sign in to comment.
Something went wrong with that request. Please try again.