Support query css selectors and xpath (#11)

sunshineplan · Nov 20, 2023 · 4ae58b8 · 4ae58b8
1 parent ca84917
commit 4ae58b8
Show file tree

Hide file tree

Showing 11 changed files with 382 additions and 117 deletions.
diff --git a/README.md b/README.md
@@ -155,7 +155,7 @@ type Attributes interface {
 
 // Finder represents a set of methods for finding nodes.
 type Finder interface {
-	// Find searches for a single node in the parse tree based on the specified find method and filters.
+	// Find searches for the first matched node in the parse tree based on the specified find method and filters.
 	Find(FindMethod, TagFilter, ...Filter) Node
 
 	// FindN searches for up to n nodes in the parse tree based on the specified find method and filters.
@@ -164,14 +164,33 @@ type Finder interface {
 	// FindAll searches for all nodes in the parse tree based on the specified find method and filters.
 	FindAll(FindMethod, TagFilter, ...Filter) []Node
 
-	// FindString searches for a single text node in the parse tree based on the specified find method and filters.
+	// FindString searches for the first matched text node in the parse tree based on the specified find method and filters.
 	FindString(FindMethod, StringFilter) TextNode
 
 	// FindStringN searches for up to n text nodes in the parse tree based on the specified find method and filters.
 	FindStringN(FindMethod, int, StringFilter) []TextNode
 
 	// FindAllString searches for all text nodes in the parse tree based on the specified find method and filters.
 	FindAllString(FindMethod, StringFilter) []TextNode
+
+	// CSS selectors support
+
+	// Select searches for the first matched node in the parse tree based on the css selector.
+	// Panics if the selector cannot be parsed.
+	Select(string) Node
+
+	// SelectAll searches for all nodes in the parse tree based on the css selector.
+	// Panics if the selector cannot be parsed.
+	SelectAll(string) []Node
+
+	// xpath support
+
+	// XPath searches for all nodes that match the specified XPath expression. Panics if the expression cannot be parsed.
+	XPath(string) []Node
+
+	// Evaluate returns the result of the xpath expression.
+	// The result type of the expression is one of the following: bool, float64, string, *xpath.NodeIterator.
+	Evaluate(string) (any, error)
 }
 
 // FindMethod represents the method used to search for nodes in the parse tree.
@@ -223,6 +242,13 @@ type StringFilter interface {
 }
 ```
 
+## Credits
+
+This repo relies on the following third-party projects:
+
+  * [ericchiang/css](https://github.com/ericchiang/css)
+  * [antchfx/xpath](https://github.com/antchfx/xpath)
+
 ## License
 
 [The MIT License (MIT)](https://raw.githubusercontent.com/sunshineplan/node/main/LICENSE)
diff --git a/class_test.go b/class_test.go
@@ -9,11 +9,7 @@ func TestClass(t *testing.T) {
 	if nodes := soup.FindAll(0, A, Class("sister")); len(nodes) != 3 {
 		t.Errorf("expected nodes %d; got %d", 3, len(nodes))
 	} else {
-		expected := []string{
-			`<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>`,
-			`<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>`,
-			`<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>`,
-		}
+		expected := []string{elsie, lacie, tillie}
 		for i, node := range nodes {
 			if html := node.Readable(); html != expected[i] {
 				t.Errorf("expected html #%d %q; got %q", i, expected[i], html)
@@ -22,21 +18,15 @@ func TestClass(t *testing.T) {
 	}
 	if nodes := soup.FindAll(0, nil, Class(regexp.MustCompile("itl"))); len(nodes) != 1 {
 		t.Errorf("expected nodes %d; got %d", 1, len(nodes))
-	} else {
-		if html := nodes[0].Readable(); html != `<p class="title"><b>The Dormouse's story</b></p>` {
-			t.Errorf("expected html %q; got %q", `<p class="title"><b>The Dormouse's story</b></p>`, html)
-		}
+	} else if html := nodes[0].Readable(); html != `<p class="title"><b>The Dormouse's story</b></p>` {
+		t.Errorf("expected html %q; got %q", `<p class="title"><b>The Dormouse's story</b></p>`, html)
 	}
 	if nodes := soup.FindAll(0, A, Class(func(class string, node Node) bool {
 		return node.HasAttr("class") && len(class) == 6
 	})); len(nodes) != 3 {
 		t.Errorf("expected nodes %d; got %d", 3, len(nodes))
 	} else {
-		expected := []string{
-			`<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>`,
-			`<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>`,
-			`<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>`,
-		}
+		expected := []string{elsie, lacie, tillie}
 		for i, node := range nodes {
 			if html := node.Readable(); html != expected[i] {
 				t.Errorf("expected html #%d %q; got %q", i, expected[i], html)
@@ -46,11 +36,7 @@ func TestClass(t *testing.T) {
 	if nodes := soup.FindAll(0, A, Attr("class", "sister")); len(nodes) != 3 {
 		t.Errorf("expected nodes %d; got %d", 3, len(nodes))
 	} else {
-		expected := []string{
-			`<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>`,
-			`<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>`,
-			`<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>`,
-		}
+		expected := []string{elsie, lacie, tillie}
 		for i, node := range nodes {
 			if html := node.Readable(); html != expected[i] {
 				t.Errorf("expected html #%d %q; got %q", i, expected[i], html)

diff --git a/example_test.go b/example_test.go
@@ -19,14 +19,20 @@ func ExampleAttr() {
 	if err != nil {
 		log.Fatal(err)
 	}
-	if nodes := node.FindAll(0, nil, Attr("name", "email")); len(nodes) != 1 {
+	if nodes := node.SelectAll(`[name="email"]`); len(nodes) != 1 {
+		log.Fatalf("expected nodes %d; got %d", 1, len(nodes))
+	} else {
+		fmt.Println(nodes[0].Readable())
+	}
+	if nodes := node.XPath(`//*[@name="email"]`); len(nodes) != 1 {
 		log.Fatalf("expected nodes %d; got %d", 1, len(nodes))
 	} else {
 		fmt.Println(nodes[0].Readable())
 	}
 	// Output:
 	// <div data-foo="value">foo!</div>
 	// <input name="email"/>
+	// <input name="email"/>
 }
 
 func ExampleClass() {

diff --git a/filter_test.go b/filter_test.go
@@ -11,10 +11,8 @@ import (
 func TestFilter(t *testing.T) {
 	if nodes := soup.FindAll(0, B); len(nodes) != 1 {
 		t.Errorf("expected b %d; got %d", 1, len(nodes))
-	} else {
-		if html := nodes[0].Readable(); html != "<b>The Dormouse's story</b>" {
-			t.Errorf("expected html %q; got %q", "<b>The Dormouse's story</b>", html)
-		}
+	} else if html := nodes[0].Readable(); html != "<b>The Dormouse's story</b>" {
+		t.Errorf("expected html %q; got %q", "<b>The Dormouse's story</b>", html)
 	}
 	if nodes := soup.FindAll(0, Tag(regexp.MustCompile("^b"))); len(nodes) != 2 {
 		t.Errorf("expected ^b %d; got %d", 2, len(nodes))
@@ -39,12 +37,7 @@ func TestFilter(t *testing.T) {
 	if nodes := soup.FindAll(0, Tags("a", "b")); len(nodes) != 4 {
 		t.Errorf("expected nodes %d; got %d", 4, len(nodes))
 	} else {
-		expected := []string{
-			"<b>The Dormouse's story</b>",
-			`<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>`,
-			`<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>`,
-			`<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>`,
-		}
+		expected := []string{"<b>The Dormouse's story</b>", elsie, lacie, tillie}
 		for i, node := range nodes {
 			if html := node.Readable(); html != expected[i] {
 				t.Errorf("expected html #%d %q; got %q", i, expected[i], html)
@@ -82,10 +75,7 @@ func TestFilter(t *testing.T) {
 	})); len(nodes) != 2 {
 		t.Errorf("expected nodes %d; got %d", 2, len(nodes))
 	} else {
-		expected := []string{
-			`<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>`,
-			`<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>`,
-		}
+		expected := []string{elsie, tillie}
 		for i, node := range nodes {
 			if html := node.Readable(); !strings.HasPrefix(html, expected[i]) {
 				t.Errorf("expected html #%d %q; got %q", i, expected[i], html)

diff --git a/finder.go b/finder.go
@@ -3,12 +3,15 @@ package node
 import (
 	"context"
 
+	"github.com/antchfx/htmlquery"
+	"github.com/antchfx/xpath"
+	"github.com/ericchiang/css"
 	"golang.org/x/net/html"
 )
 
 // Finder represents a set of methods for finding nodes.
 type Finder interface {
-	// Find searches for a single node in the parse tree based on the specified find method and filters.
+	// Find searches for the first matched node in the parse tree based on the specified find method and filters.
 	Find(FindMethod, TagFilter, ...Filter) Node
 
 	// FindN searches for up to n nodes in the parse tree based on the specified find method and filters.
@@ -17,14 +20,33 @@ type Finder interface {
 	// FindAll searches for all nodes in the parse tree based on the specified find method and filters.
 	FindAll(FindMethod, TagFilter, ...Filter) []Node
 
-	// FindString searches for a single text node in the parse tree based on the specified find method and filters.
+	// FindString searches for the first matched text node in the parse tree based on the specified find method and filters.
 	FindString(FindMethod, StringFilter) TextNode
 
 	// FindStringN searches for up to n text nodes in the parse tree based on the specified find method and filters.
 	FindStringN(FindMethod, int, StringFilter) []TextNode
 
 	// FindAllString searches for all text nodes in the parse tree based on the specified find method and filters.
 	FindAllString(FindMethod, StringFilter) []TextNode
+
+	// CSS selectors support
+
+	// Select searches for the first matched node in the parse tree based on the css selector.
+	// Panics if the selector cannot be parsed.
+	Select(string) Node
+
+	// SelectAll searches for all nodes in the parse tree based on the css selector.
+	// Panics if the selector cannot be parsed.
+	SelectAll(string) []Node
+
+	// xpath support
+
+	// XPath searches for all nodes that match the specified XPath expression. Panics if the expression cannot be parsed.
+	XPath(string) []Node
+
+	// Evaluate returns the result of the xpath expression.
+	// The result type of the expression is one of the following: bool, float64, string, *xpath.NodeIterator.
+	Evaluate(string) (any, error)
 }
 
 // FindMethod represents the method used to search for nodes in the parse tree.
@@ -177,3 +199,33 @@ func (n *htmlNode) FindAllString(method FindMethod, filter StringFilter) (res []
 	}
 	return
 }
+
+// Select returns the first node matched by the CSS selector sel, or nil
+// when nothing matches. It delegates to SelectAll, so all matches are
+// collected before the first one is taken, and an unparsable selector
+// panics there.
+func (n *htmlNode) Select(sel string) Node {
+	nodes := n.SelectAll(sel)
+	if len(nodes) == 0 {
+		return nil
+	}
+	return nodes[0]
+}
+
+// SelectAll returns every node matched by the CSS selector sel, evaluated
+// against n.Raw(). The selector is compiled with css.MustParse, so this
+// method panics if sel cannot be parsed. The result is nil when nothing
+// matches.
+func (n *htmlNode) SelectAll(sel string) (res []Node) {
+	for _, i := range css.MustParse(sel).Select(n.Raw()) {
+		res = append(res, NewNode(i))
+	}
+	return
+}
+
+// XPath returns every node matched by the XPath expression expr, evaluated
+// against n.Raw() via htmlquery.Find. Per the Finder interface
+// documentation, it panics if expr cannot be parsed. The result is nil
+// when nothing matches.
+func (n *htmlNode) XPath(expr string) (res []Node) {
+	for _, i := range htmlquery.Find(n.Raw(), expr) {
+		res = append(res, NewNode(i))
+	}
+	return
+}
+
+// Evaluate compiles the XPath expression expr and evaluates it against a
+// navigator created from n.Raw(). A compile failure is returned as the
+// error with a nil result; otherwise the returned error is always nil.
+// Per the Finder interface documentation, the result is one of bool,
+// float64, string or *xpath.NodeIterator.
+func (n *htmlNode) Evaluate(expr string) (any, error) {
+	exp, err := xpath.Compile(expr)
+	if err != nil {
+		return nil, err
+	}
+	return exp.Evaluate(htmlquery.CreateXPathNavigator(n.Raw())), nil
+}