# readability.rb (forked from cantino/ruby-readability)
require 'rubygems'
require 'nokogiri'
module Readability
  # Picks the element of an HTML page that most likely holds the main article
  # text and returns it as stripped-down markup.
  #
  # Candidates are the parents of every <p> element. Each candidate is scored
  # from its class/id names, the number of <p> descendants and the number of
  # commas in its text; the best one is then sanitized.
  class Document
    # class/id fragments that suggest boilerplate rather than article text.
    NEGATIVE_HINT = /(comment|meta|footer|footnote)/i
    # Whole class tokens that suggest article text.
    POSITIVE_CLASS = /((^|\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\s|$))/i
    # Complete id values that suggest article text.
    POSITIVE_ID = /^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/i
    # Tags counted when deciding whether a <div> is mostly non-text clutter.
    COUNTED_TAGS = %w[p img li a embed].freeze
    # Elements kept by #sanitize; a hash gives constant-time lookups.
    WHITELIST = { "div" => true, "p" => true }.freeze

    # @param input the HTML to analyse; handed unchanged to Nokogiri::HTML
    def initialize(input)
      @html = Nokogiri::HTML(input)
    end

    # Returns the sanitized markup of the best-scoring parent of a <p>.
    #
    # @return [String] cleaned HTML, or "" when the document has no <p>
    #   element (previously this case raised NoMethodError on nil)
    def content
      # Get all parent elements containing a <p> tag
      @parents = @html.css("p").map { |p| p.parent }.compact.uniq
      best = @parents.max_by { |parent| score(parent) }
      best ? sanitize(best) : ""
    end

    # Heuristic "looks like article text" score for a candidate element.
    #
    # @param parent element responding to #[], #css and #text
    # @return [Integer] higher means more likely to be the main content
    def score(parent)
      s = 0
      # Adjust score based on parent's "class" attribute
      s -= 50 if parent[:class] =~ NEGATIVE_HINT
      s += 25 if parent[:class] =~ POSITIVE_CLASS
      # Adjust score based on parent id
      s -= 50 if parent[:id] =~ NEGATIVE_HINT
      s += 25 if parent[:id] =~ POSITIVE_ID
      # Adjust score based on # of <p> elements inside parent
      s += parent.css("p").size
      # Adjust score based on # of commas inside parent
      s += parent.text.count(",")
      s
    end

    # Strips a node down to plain <div>/<p> markup. The node is modified in
    # place.
    #
    # @param node the element to clean
    # @return [String] the node's HTML with whitespace runs collapsed
    def sanitize(node)
      # Get rid of divs full of non-text items
      node.css("div").each { |el| el.remove if cluttered?(el) }

      # Sanitize every element (the root included) against the whitelist.
      ([node] + node.css("*").to_a).each do |el|
        if WHITELIST[el.node_name]
          # Whitelisted: keep the element but drop all of its attributes.
          el.attributes.each_key { |name| el.delete(name) }
        else
          # Otherwise replace the element with its text. A text node is used
          # rather than a string so that the text is not re-parsed as markup;
          # otherwise escaped content such as "&lt;script&gt;" would turn
          # into live elements that were never checked against the whitelist.
          el.replace(Nokogiri::XML::Text.new(el.text, el.document))
        end
      end

      # Get rid of duplicate whitespace; non-breaking-space entities become
      # ordinary spaces.
      node.to_html.gsub(/[\r\n\f]+/, "\n").gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
    end

    private

    # True when a <div> has little prose (fewer than 10 commas) and is
    # dominated by non-paragraph content.
    def cluttered?(el)
      return false if el.text.count(",") >= 10

      # Map each tag name to the NUMBER of matching descendants.
      counts = Hash[COUNTED_TAGS.map { |kind| [kind, el.css(kind).size] }]
      paragraphs = counts["p"]
      paragraphs.zero? ||
        counts["embed"] > 0 ||
        %w[a li img].any? { |kind| counts[kind] > paragraphs }
    end
  end
end
# Demo: parse sample.html from the current directory and print the extracted
# content. The block form of File.open closes the file handle as soon as the
# document has been built, instead of leaving it open.
d = File.open("sample.html") { |file| Readability::Document.new(file) }
p d.content