Permalink
Browse files

Remove Hpricot for Nokogiri

Seems to be way faster
  • Loading branch information...
1 parent e7ebb06 commit a8dca0176df480a51d835a0a45b22b8e9281050e @zetaben committed Nov 29, 2011
View
@@ -16,7 +16,7 @@ Gem::Specification.new do |s|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
s.require_paths = ["lib"]
s.default_executable = 'html2fb.rb'
- s.add_dependency('hpricot', '= 0.8.1')
+ s.add_dependency('nokogiri','>=1.4.1')
s.add_dependency('htmlentities', '>= 4.2.1')
s.add_dependency('launchy', '>= 2.0.0')
s.add_dependency('progressbar', '>= 0.0.3')
View
@@ -34,8 +34,8 @@ def down_url(entry_url)
req = Net::HTTP::Get.new(url.path)
req.basic_auth user,pass unless user.nil?
response = http.request(req)
- doc=Hpricot(response.body)
- e=doc.at('//entry').at('link[@rel="down"]')
+ doc=Nokogiri::XML(response.body).remove_namespaces!
+ e=doc.at('//entry/link[@rel="down"]')
return URI.parse(e[:href]).path unless e.nil?
}
end
@@ -47,7 +47,6 @@ def send
#STDERR.puts "sending to #{url}"
req = Net::HTTP::Post.new(url.path)
req.basic_auth user,pass unless user.nil?
-
req.body = '<?xml version="1.0"?>'+"\n"
req.body +='<entry xmlns="http://www.w3.org/2005/Atom">'+"\n"
req.body +='<title>'+decode_text(title)+'</title>'+"\n"
@@ -77,9 +76,9 @@ def send
def recode_text(txt)
return txt if txt.blank?
- m=Hpricot(txt)
- m.traverse_text{|t| t.content=force_decimal_entities(t.content) if t.content.match(/&[a-z][a-z0-9]+;/i)}
- m.to_html
+ m=Nokogiri::XML("<text>#{txt}</text>")
+ m.traverse{|t| next unless t.text?;t.text=force_decimal_entities(t.text) if t.text.match(/&[a-z][a-z0-9]+;/i)}
+ m.root.inner_html
end
HTMLENCODER=HTMLEntities.new
def force_decimal_entities(txt)
@@ -88,9 +87,9 @@ def force_decimal_entities(txt)
+ # Decodes HTML entities inside the text nodes of +txt+ and returns the
+ # re-serialized markup. Blank input is returned unchanged.
+ # The fragment is wrapped in a <text> element so it has a single root.
def decode_text(txt)
return txt if txt.blank?
- m=Hpricot(txt)
- m.traverse_text{|t| HTMLENCODER.decode(t.content)}
- m.to_html
+ m=Nokogiri::XML("<text>#{txt}</text>")
+ # decode returns a new string; it must be written back to the node,
+ # otherwise the traversal has no effect on the output.
+ m.traverse{|t| next unless t.text?; t.content=HTMLENCODER.decode(t.text)}
+ m.root.inner_html
end
end
View
@@ -1,5 +1,5 @@
require 'html2fb/app.rb'
-require 'hpricot'
+require 'nokogiri'
require 'digest/md5'
module HTML2FB
@@ -105,11 +105,10 @@ class Text
def to_feedbooks(conf,path=nil)
stxt=to_html
return unless stxt.strip.size > 0
- doc=Hpricot('<div xmlns:xhtml="http://www.w3.org/1999/xhtml">'+stxt+'</div>')
- doc.traverse_all_element do |e|
- unless e.is_a?Hpricot::Text
+ doc=Nokogiri::XML('<div xmlns:xhtml="http://www.w3.org/1999/xhtml">'+stxt+'</div>')
+ doc.traverse do |e|
+ if e.element?
e.name='xhtml:'+e.name
- e.etag='xhtml:'+e.etag unless (!e.respond_to?:etag) || e.etag.nil?
end
end
FBPost.push(conf,'',doc.to_html,"Text",path)
View
@@ -1,4 +1,4 @@
-require 'hpricot'
+require 'nokogiri'
require 'html2fb/document.rb'
require 'progressbar'
#require 'ruby-prof'
@@ -14,11 +14,11 @@ def initialize(conf)
def parse(txt)
puts "Parsing HTML"
- pdoc=Hpricot(txt)
+ pdoc=Nokogiri::HTML(txt)
if @conf['conv']
mc=pdoc/'meta[@http-equiv="Content-Type"]'
if mc.size>0
- charset=mc.first.attributes['content'].split(';').find do |s|
+ charset=mc.first.attributes['content'].to_s.split(';').find do |s|
s.strip[0,7]=='charset'
end
unless charset.nil?
@@ -28,7 +28,7 @@ def parse(txt)
unless tc.nil?
puts "Trying to convert source encoding from #{tc} to utf-8"
require 'iconv'
- pdoc=Hpricot(Iconv.conv('utf-8',tc.downcase,txt))
+ pdoc=Nokogiri::HTML(Iconv.conv('utf-8',tc.downcase,txt))
end
@@ -38,7 +38,7 @@ def parse(txt)
puts "Removing garbage elements"
remove_objs(pdoc)
ti=pdoc.at('title')
- doc.title= ti.extract_text.strip unless ti.nil?
+ doc.title= ti.text.strip unless ti.nil?
# pdoc.search('//h3').each do |e|
# doc.content.push(e.inner_text)
# end
@@ -58,10 +58,10 @@ def remove_objs(doc)
doc.search('.'+cl).remove
end unless @conf['remove']['class'].nil?
@conf['remove']['expr'].each do |cl|
- doc.search(cl).remove
+ doc.search(cl).remove rescue doc.xpath(cl).remove
end unless @conf['remove']['expr'].nil?
@conf['remove']['before'].each do |cl|
- x=doc.at(cl)
+ x=doc.at(cl) rescue doc.at_xpath(cl)
if x
x.preceding.remove
x.parent.children.delete(x)
@@ -73,7 +73,7 @@ def remove_objs(doc)
t.remove unless t.nil?
end unless @conf['remove']['between'].nil?
@conf['remove']['after'].each do |cl|
- x=doc.at(cl)
+ x=doc.at(cl) rescue doc.at_xpath(cl)
if x
x.following.remove
x.parent.children.delete(x)
@@ -89,13 +89,13 @@ def parse_text(doc,ret)
aut=build_autom(@conf['select'],ret)
- pbar = ProgressBar.new("Parsing", doc.search('//').size)
- doc.traverse_all_element do |el|
+ pbar = ProgressBar.new("Parsing", doc.search('//*').size)
+ doc.traverse do |el|
aut.feed(el)
pbar.inc
end
- pbar.finish
aut.finish(doc)
+ pbar.finish
=begin
result = RubyProf.stop
printer = RubyProf::FlatPrinter.new(result)
@@ -180,10 +180,10 @@ def open_section(obj,lvl,el)
if @content=='body'
tmp=el.preceding[0..-1]
else
- tmp=el.root.search(@content...(el.xpath))[1..-1]
+ tmp=el.root.between(@content,(el.path),true)[1..-1]
end
if tmp.blank? #search can't find between siblings
- tmp=el.root.deep_between(@content,(el.xpath))
+ tmp=el.root.deep_between(@content,(el.path))
end
unless tmp.blank?
tmph=tmp.to_html
@@ -195,7 +195,7 @@ def open_section(obj,lvl,el)
(lvl..@max_level).to_a.reverse.each do |l|
close_section(l)
end
- @starts[lvl]=create_fbsection(el.root.at(obj[:xpath]).extract_text,obj[:fblevel])
+ @starts[lvl]=create_fbsection(el.root.at_xpath(obj[:xpath]).text,obj[:fblevel])
@content=obj[:xpath]
@current_level=lvl
end
@@ -209,7 +209,7 @@ def close_section(lvl)
end
def feed(el)
- return if el.is_a?Hpricot::Text
+ return if el.text?
@done=[[]*@levels.size]
@levels.each_with_index do |lvl,i|
@@ -218,7 +218,7 @@ def feed(el)
if el.in_search?(expr['expr'])
- open_section({:xpath => el.xpath, :fblevel => expr['fblevel']},i+1,el)
+ open_section({:xpath => el.path, :fblevel => expr['fblevel']},i+1,el)
break
end
end
@@ -228,6 +228,9 @@ def feed(el)
end
end
+class Nokogiri::XML::NodeSet
+ alias :blank? :empty?
+end
class String
def blank?
@@ -241,52 +244,83 @@ def blank?
end
end
-module Hpricot::Traverse
+
+
+class Nokogiri::XML::Node
+
+ # Tells whether this node is one of the nodes matched by +expr+.
+ # A purely alphanumeric +expr+ is treated as a tag name and compared
+ # case-insensitively. Any other expression is evaluated from the parent
+ # node, or from the document root when it starts with '/'.
+ # When search raises (expression not accepted as a selector), the
+ # expression is retried as XPath.
def in_search?(expr)
if expr !~ /[^a-z0-9]/
return self.name.downcase()==expr.downcase()
end
- se_in=self.parent
+ se_in=self.root
+ se_in=self.parent if self.respond_to?(:parent)
- if expr[0..1]=='/'
+ # expr[0..1] is a two-character slice and never equals '/';
+ # expr[0,1] is the first character.
+ if expr[0,1]=='/'
se_in=self.root
end
- se_in.search(expr).each do |el|
+ set=se_in.search(expr) rescue se_in.xpath(expr)
+ set.each do |el|
return true if el==self
end
# puts self.name+" "+expr
return false
end
def root
- return @root unless @root.nil?
- se_in=self
- se_in=se_in.parent until se_in.parent.nil?
- @root=se_in
- se_in
+ self.document.root
end
- def between(a,b)
- root.search(a..b)
+ def node_position
+ return @node_position if @node_position
+ @node_position=parent.children.index(self)
end
- def extract_text
- t=''
- self.traverse_all_element do |e|
- t+=e.content.to_s if e.is_a?(Hpricot::Text)
+ # Returns a NodeSet of the nodes lying between the first match of +a+
+ # and the first match of +b+, both looked up from this node.
+ # With +excl+ set, the end index is shifted by -1 so the end node is left out.
+ # Only two layouts are handled: both matches are siblings, or one of them
+ # is the common ancestor found below. Anything else yields an empty set.
+ # An empty set is also returned when either expression matches nothing.
+ def between(a,b,excl=false)
+
+ #from nokogiri
+ offset=(excl ? -1 : 0)
+ ary = []
+ ele1=at(a) rescue at_xpath(a)
+ ele2=at(b) rescue at_xpath(b)
+
+ if ele1 and ele2
+ # let's quickly take care of siblings
+ if ele1.parent == ele2.parent
+
+ ary = ele1.parent.children[ele1.node_position..(ele2.node_position+offset)]
+ else
+ # find common parent
+ ele1_p=ele1.ancestors
+ ele2_p=ele2.ancestors
+ common_parent = ele1_p.zip(ele2_p).select { |p1, p2| p1 == p2 }.flatten.first
+
+ child = nil
+ if ele1 == common_parent
+ child = ele2
+ elsif ele2 == common_parent
+ child = ele1
+ end
+
+ if child
+ ary = common_parent.children[0..(child.node_position+offset)]
+ end
+ end
end
- t
+
+ # ele1 is nil when +a+ matched nothing, so build the set from the
+ # receiver's own document instead of ele1.document.
+ return Nokogiri::XML::NodeSet.new(document,ary)
end
- def deep_between(i,j)
- unless j.nil? || self.at(j).nil?
- tm=self.at(i)
+
+
+ def deep_between(i,j)
+ unless j.nil? || self.at_xpath(j).nil?
+ tm=self.at_xpath(i)
prec=tm.deep_preceding
- r=Hpricot::Elements[*self.at(j).deep_preceding.find_all{|el| !(prec.include?el || el==tm)}]
+ r=Nokogiri::XML::NodeSet.new(tm.document,[*self.at(j).deep_preceding.find_all{|el| !(prec.include?el || el==tm)}])
else
r=self.at(i).deep_following unless self.at(i).nil?
end
- Hpricot::Elements[*select_end(r,i)]
+ Nokogiri::XML::NodeSet.new(self.document,[*select_end(r,i)])
end
def select_end(tab,expr)
@@ -296,13 +330,15 @@ def select_end(tab,expr)
idx=-1
i=0
tab.each do |e|
- if e.search(expr.gsub(e.xpath,'.')).size > 0
+ nxp=expr.gsub(e.path,'.')
+ set=e.search(nxp) rescue e.xpath(nxp)
+ if set.size > 0
idx=i
#if e.search(i).size > 0
- if e.children.find{|ee| ee.xpath==expr }
+ if e.children.find{|ee| ee.path==expr }
e.children.each do |ee|
s << ee if f
- f=true if ee.xpath==expr
+ f=true if ee.path==expr
end
else
s=select_end(e.children,expr)
@@ -316,20 +352,24 @@ def select_end(tab,expr)
return s+tab[(idx+1)..-1]
end
+ def preceding
+ self.parent.children[0...node_position]
+ end
+
+ def following
+ self.parent.children[node_position+1..-1]
+ end
+
def deep_preceding()
- ret=Hpricot::Elements[]
- ret+=parent.deep_preceding if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
+ ret=Nokogiri::XML::NodeSet.new(self.document,[])
+ ret+=parent.deep_preceding if respond_to?(:parent) && !parent.is_a?(Nokogiri::XML::Document)
ret+=preceding
- Hpricot::Elements[*ret]
+ ret
end
def deep_following()
ret=following
- ret+=parent.deep_following if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
- Hpricot::Elements[*ret]
+ ret+=parent.deep_following if respond_to?(:parent) && !parent.is_a?(Nokogiri::XML::Document)
+ ret
end
end
-
-class Hpricot::Elements
- alias_method :blank?, :empty?
-end
View
@@ -1,3 +1,3 @@
module Html2fb
- VERSION = "1.2.0"
+ VERSION = "1.3.0"
end
View
@@ -10,11 +10,11 @@ remove:
before:
- "//br[8]"
after:
- - '//hr:last'
+ - '(//hr)[last()]'
select:
- - expr: "center:first"
+ - expr: "center[1]"
fblevel: section
- - expr: "center:last"
+ - expr: "center[last()]"
fblevel: section
- expr: '//center[h3]'
fblevel: chapter
View
@@ -9,9 +9,9 @@ remove:
- //pre
- hr
before:
- - "//p[11]"
+ - "(//p)[11]"
select:
- - expr: "//h2:first"
+ - expr: "(//h2)[1]"
fblevel: section
- expr: h2
fblevel: part
Oops, something went wrong.

0 comments on commit a8dca01

Please sign in to comment.