This repository has been archived by the owner on Jun 8, 2019. It is now read-only.
forked from cantino/ruby-readability
-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
readability_spec.rb
237 lines (206 loc) · 7.45 KB
/
readability_spec.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
require File.expand_path(File.join(File.dirname(__FILE__), "spec_helper"))
describe Readability do
# Shared fixture markup: a page whose <body>, one <p> and one <div id='body'>
# all carry class 'comment', plus a <div> that wraps a <blockquote>.
# The heredoc body stays flush-left so the string handed to the parser keeps
# exactly the same bytes.
before(:each) do
  @simple_html_fixture = <<-HTML
<html>
<head>
<title>title!</title>
</head>
<body class='comment'>
<div>
<p class='comment'>a comment</p>
<div class='comment' id='body'>real content</div>
<div id="contains_blockquote"><blockquote>something in a table</blockquote></div>
</div>
</body>
</html>
  HTML
end
# Checks Readability::Document#transform_misused_divs_into_paragraphs! against
# the markup stored in @simple_html_fixture.
describe "transformMisusedDivsIntoParagraphs" do
  before(:each) do
    @doc = Readability::Document.new(@simple_html_fixture)
    @doc.transform_misused_divs_into_paragraphs!
  end

  # Tag name of the first node in the transformed document matching +selector+.
  def tag_name_of(selector)
    @doc.html.css(selector).first.name
  end

  it "should transform divs containing no block elements into <p>s" do
    tag_name_of("#body").should == "p"
  end

  it "should not transform divs that contain block elements" do
    tag_name_of("#contains_blockquote").should == "div"
  end
end
# Compares the :content_score that Readability::Document#score_node reports
# for two elements of a small document (#elem1 is a <div>, #elem2 a <th>).
describe "score_node" do
  before(:each) do
    # Heredoc body is flush-left on purpose: the markup must stay unchanged.
    markup = <<-HTML
<html>
<body>
<div id='elem1'>
<p>some content</p>
</div>
<th id='elem2'>
<p>some other content</p>
</th>
</body>
</html>
    HTML
    @doc = Readability::Document.new(markup)
    @elem1 = @doc.html.css("#elem1").first
    @elem2 = @doc.html.css("#elem2").first
  end

  # The :content_score entry of score_node's result for +node+.
  def content_score_of(node)
    @doc.score_node(node)[:content_score]
  end

  it "should like <div>s more than <th>s" do
    content_score_of(@elem1).should > content_score_of(@elem2)
  end

  it "should like classes like text more than classes like comment" do
    # With both elements renamed to <div> and no class set, scores must tie...
    @elem2.name = "div"
    content_score_of(@elem1).should == content_score_of(@elem2)

    # ...so the difference expected below can only come from the class names.
    @elem1['class'] = "text"
    @elem2['class'] = "comment"
    content_score_of(@elem1).should > content_score_of(@elem2)
  end
end
# Runs Readability::Document#remove_unlikely_candidates! on the markup in
# @simple_html_fixture and inspects what is left in the document.
describe "remove_unlikely_candidates!" do
  before(:each) do
    @doc = Readability::Document.new(@simple_html_fixture)
    @doc.remove_unlikely_candidates!
  end

  # Markup remaining in the document after the removal pass.
  def remaining_markup
    @doc.html.inner_html
  end

  it "should remove things that have class comment" do
    remaining_markup.should_not =~ /a comment/
  end

  it "should not remove body tags" do
    remaining_markup.should =~ /<\/body>/
  end

  it "should not remove things with class comment and id body" do
    remaining_markup.should =~ /real content/
  end
end
# Feeds a small document to Readability::Document#score_paragraphs(0) and
# checks how many candidates come back and which one has the top score.
#
# NOTE(review): the fixture contains `<div id="div2>` (no closing quote) and
# ends with `<!-- " -->`. That looks like deliberately malformed markup with a
# balancing quote in a comment -- confirm before "repairing" it, since the
# expectations below were presumably written against this exact input.
describe "score_paragraphs" do
  before(:each) do
    # Heredoc body is flush-left on purpose: the markup must stay unchanged.
    markup = <<-HTML
<html>
<head>
<title>title!</title>
</head>
<body id="body">
<div id="div1">
<div id="div2>
<p id="some_comment">a comment</p>
</div>
<p id="some_text">some text</p>
</div>
<div id="div3">
<p id="some_text2">some more text</p>
</div>
</body>
</html><!-- " -->
    HTML
    @doc = Readability::Document.new(markup)
    @candidates = @doc.score_paragraphs(0)
  end

  it "should score elements in the document" do
    scored = @candidates.values
    scored.length.should == 3
  end

  it "should prefer the body in this particular example" do
    # Order candidates from highest to lowest :content_score.
    ranked = @candidates.values.sort do |left, right|
      right[:content_score] <=> left[:content_score]
    end
    ranked.first[:elem][:id].should == "body"
  end
end
# Second score_paragraphs group: text split by pairs of <br/> tags (#post1)
# is expected to outscore a longer block of text without them (#post2).
describe "score_paragraphs" do
  context "when two consequent br tags are used instead of p" do
    before(:each) do
      # Heredoc body is flush-left on purpose: the markup must stay unchanged.
      markup = <<-HTML
<html>
<head>
<title>title!</title>
</head>
<body id="body">
<div id="post1">
This is the main content!<br/><br/>
Zebra found killed butcher with the chainsaw.<br/><br/>
If only I could think of an example, oh, wait.
</div>
<div id="post2">
This is not the content and although it's longer if you meaure it in characters,
it's supposed to have lower score than the previous paragraph. And it's only because
of the previous paragraph is not one paragraph, it's three subparagraphs
</div>
</body>
</html>
      HTML
      @doc = Readability::Document.new(markup)
      @candidates = @doc.score_paragraphs(0)
    end

    it "should assign the higher score to the first paragraph in this particular example" do
      # Negating the score sorts the best candidate to the front.
      by_descending_score = @candidates.values.sort_by { |candidate| -candidate[:content_score] }
      winner = by_descending_score.first
      winner[:elem][:id].should == 'post1'
    end
  end
end
# Extracts content from fixtures/cant_read.html with a restricted set of
# allowed tags and attributes, and looks for a known sentence in the result.
describe "the cant_read.html fixture" do
  it "should work on the cant_read.html fixture with some allowed tags" do
    fixture_path = File.dirname(__FILE__) + "/fixtures/cant_read.html"
    page = File.read(fixture_path)

    document = Readability::Document.new(
      page,
      :tags => %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a],
      :attributes => %w[href]
    )

    document.content.should match(/Can you talk a little about how you developed the looks for the/)
  end
end
# Smoke tests for Readability::Document#content and #title on tiny inline
# documents, built with :min_text_length => 0 and :retry_length => 1.
describe "general functionality" do
  # Builds a document from +html+ with the options shared by this group.
  def document_for(html)
    Readability::Document.new(html, :min_text_length => 0, :retry_length => 1)
  end

  before(:each) do
    @doc = document_for("<html><head><title>title!</title></head><body><div><p>Some content</p></div></body>")
  end

  it "should return the main page content" do
    @doc.content.should match("Some content")
  end

  it "should return the page title if present" do
    @doc.title.should match("title!")

    # Same page without a <title>: the title is expected to be nil.
    untitled = document_for("<html><head></head><body><div><p>Some content</p></div></body>")
    untitled.title.should be_nil
  end
end
# A page with one plain <div> and one <div class='sidebar'>: the extracted
# content is expected not to mention the sidebar text.
describe "ignoring sidebars" do
  before(:each) do
    page = "<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>"
    @doc = Readability::Document.new(page, :min_text_length => 0, :retry_length => 1)
  end

  it "should not return the sidebar" do
    extracted = @doc.content
    extracted.should_not match("sidebar")
  end
end
# Extracts content from a paragraph whose text is separated by <br>, <hr> and
# <address> elements and checks the resulting string.
#
# NOTE(review): the group title suggests spaces are expected between block
# elements, yet the example asserts the content does NOT match "a b c d f".
# The fixture also ends the paragraph with `f/p>`, which looks like a typo for
# `f</p>`. Both are left untouched here because changing either could flip the
# outcome -- confirm the intended behaviour against Document#content.
describe "inserting space for block elements" do
  before do
    # Heredoc body is flush-left on purpose: the markup must stay unchanged.
    @doc = Readability::Document.new(<<-HTML, :min_text_length => 0, :retry_length => 1)
<html><head><title>title!</title></head>
<body>
<div>
<p>a<br>b<hr>c<address>d</address>f/p>
</div>
</body>
</html>
    HTML
  end

  # The previous description ("should not return the sidebar") was copied from
  # the sidebar group; this one states what the example actually asserts.
  it "should not return the text as 'a b c d f'" do
    @doc.content.should_not match("a b c d f")
  end
end
# Regression run over every sample page in fixtures/samples. Each <name>.html
# is paired with a <name>-fragments.rb file which, once loaded, is expected to
# set the globals $required_fragments and $excluded_fragments.
describe "outputs good stuff for known documents" do
  # Absolute path of the directory holding the sample pages and their fragment
  # files, anchored at this spec file rather than the working directory.
  def samples_dir
    File.expand_path(File.join(File.dirname(__FILE__), "fixtures", "samples"))
  end

  before do
    # Dir.glob gives no ordering guarantee; sort for a stable run order.
    @html_files = Dir.glob(File.join(samples_dir, "*.html")).sort
    @samples = @html_files.map { |filename| File.basename(filename, '.html') }
  end

  it "should output expected fragments of text" do
    checks = 0
    @samples.each do |sample|
      html = File.read(File.join(samples_dir, "#{sample}.html"))
      content = Readability::Document.new(html).content

      # Clear the previous sample's expectations so a fragments file that
      # omits one of the globals cannot silently reuse stale values.
      $required_fragments = nil
      $excluded_fragments = nil

      # Load by absolute path, the same way the HTML is read above, instead of
      # depending on $LOAD_PATH or the current working directory.
      load File.join(samples_dir, "#{sample}-fragments.rb")
      puts "testing #{sample}..."

      # Array() turns an unset (nil) global into an empty list.
      Array($required_fragments).each do |required_text|
        content.should include(required_text)
        checks += 1
      end
      Array($excluded_fragments).each do |text_to_avoid|
        content.should_not include(text_to_avoid)
        checks += 1
      end
    end
    puts "Performed #{checks} checks."
  end
end
end