Skip to content

Commit

Permalink
Support query css selectors and xpath (#11)
Browse files Browse the repository at this point in the history
  • Loading branch information
sunshineplan committed Nov 20, 2023
1 parent ca84917 commit 4ae58b8
Show file tree
Hide file tree
Showing 11 changed files with 382 additions and 117 deletions.
30 changes: 28 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ type Attributes interface {

// Finder represents a set of methods for finding nodes.
type Finder interface {
// Find searches for a single node in the parse tree based on the specified find method and filters.
// Find searches for the first matched node in the parse tree based on the specified find method and filters.
Find(FindMethod, TagFilter, ...Filter) Node

// FindN searches for up to n nodes in the parse tree based on the specified find method and filters.
Expand All @@ -164,14 +164,33 @@ type Finder interface {
// FindAll searches for all nodes in the parse tree based on the specified find method and filters.
FindAll(FindMethod, TagFilter, ...Filter) []Node

// FindString searches for a single text node in the parse tree based on the specified find method and filters.
// FindString searches for the first matched text node in the parse tree based on the specified find method and filters.
FindString(FindMethod, StringFilter) TextNode

// FindStringN searches for up to n text nodes in the parse tree based on the specified find method and filters.
FindStringN(FindMethod, int, StringFilter) []TextNode

// FindAllString searches for all text nodes in the parse tree based on the specified find method and filters.
FindAllString(FindMethod, StringFilter) []TextNode

// CSS selectors support

// Select searches for the first matched node in the parse tree based on the css selector.
// Panics if the selector cannot be parsed.
Select(string) Node

// SelectAll searches for all nodes in the parse tree based on the css selector.
// Panics if the selector cannot be parsed.
SelectAll(string) []Node

// xpath support

// XPath searches for all nodes that match the specified XPath expression. Panics if the expression cannot be parsed.
XPath(string) []Node

// Evaluate returns the result of the xpath expression.
// The result type of the expression is one of the following: bool, float64, string, *xpath.NodeIterator.
Evaluate(string) (any, error)
}

// FindMethod represents the method used to search for nodes in the parse tree.
Expand Down Expand Up @@ -223,6 +242,13 @@ type StringFilter interface {
}
```

## Credits

This repo relies on the following third-party projects:

* [ericchiang/css](https://github.com/ericchiang/css)
* [antchfx/xpath](https://github.com/antchfx/xpath)

## License

[The MIT License (MIT)](https://raw.githubusercontent.com/sunshineplan/node/main/LICENSE)
24 changes: 5 additions & 19 deletions class_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,7 @@ func TestClass(t *testing.T) {
if nodes := soup.FindAll(0, A, Class("sister")); len(nodes) != 3 {
t.Errorf("expected nodes %d; got %d", 3, len(nodes))
} else {
expected := []string{
`<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>`,
`<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>`,
`<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>`,
}
expected := []string{elsie, lacie, tillie}
for i, node := range nodes {
if html := node.Readable(); html != expected[i] {
t.Errorf("expected html #%d %q; got %q", i, expected[i], html)
Expand All @@ -22,21 +18,15 @@ func TestClass(t *testing.T) {
}
if nodes := soup.FindAll(0, nil, Class(regexp.MustCompile("itl"))); len(nodes) != 1 {
t.Errorf("expected nodes %d; got %d", 1, len(nodes))
} else {
if html := nodes[0].Readable(); html != `<p class="title"><b>The Dormouse's story</b></p>` {
t.Errorf("expected html %q; got %q", `<p class="title"><b>The Dormouse's story</b></p>`, html)
}
} else if html := nodes[0].Readable(); html != `<p class="title"><b>The Dormouse's story</b></p>` {
t.Errorf("expected html %q; got %q", `<p class="title"><b>The Dormouse's story</b></p>`, html)
}
if nodes := soup.FindAll(0, A, Class(func(class string, node Node) bool {
return node.HasAttr("class") && len(class) == 6
})); len(nodes) != 3 {
t.Errorf("expected nodes %d; got %d", 3, len(nodes))
} else {
expected := []string{
`<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>`,
`<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>`,
`<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>`,
}
expected := []string{elsie, lacie, tillie}
for i, node := range nodes {
if html := node.Readable(); html != expected[i] {
t.Errorf("expected html #%d %q; got %q", i, expected[i], html)
Expand All @@ -46,11 +36,7 @@ func TestClass(t *testing.T) {
if nodes := soup.FindAll(0, A, Attr("class", "sister")); len(nodes) != 3 {
t.Errorf("expected nodes %d; got %d", 3, len(nodes))
} else {
expected := []string{
`<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>`,
`<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>`,
`<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>`,
}
expected := []string{elsie, lacie, tillie}
for i, node := range nodes {
if html := node.Readable(); html != expected[i] {
t.Errorf("expected html #%d %q; got %q", i, expected[i], html)
Expand Down
8 changes: 7 additions & 1 deletion example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,20 @@ func ExampleAttr() {
if err != nil {
log.Fatal(err)
}
if nodes := node.FindAll(0, nil, Attr("name", "email")); len(nodes) != 1 {
if nodes := node.SelectAll(`[name="email"]`); len(nodes) != 1 {
log.Fatalf("expected nodes %d; got %d", 1, len(nodes))
} else {
fmt.Println(nodes[0].Readable())
}
if nodes := node.XPath(`//*[@name="email"]`); len(nodes) != 1 {
log.Fatalf("expected nodes %d; got %d", 1, len(nodes))
} else {
fmt.Println(nodes[0].Readable())
}
// Output:
// <div data-foo="value">foo!</div>
// <input name="email"/>
// <input name="email"/>
}

func ExampleClass() {
Expand Down
18 changes: 4 additions & 14 deletions filter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,8 @@ import (
func TestFilter(t *testing.T) {
if nodes := soup.FindAll(0, B); len(nodes) != 1 {
t.Errorf("expected b %d; got %d", 1, len(nodes))
} else {
if html := nodes[0].Readable(); html != "<b>The Dormouse's story</b>" {
t.Errorf("expected html %q; got %q", "<b>The Dormouse's story</b>", html)
}
} else if html := nodes[0].Readable(); html != "<b>The Dormouse's story</b>" {
t.Errorf("expected html %q; got %q", "<b>The Dormouse's story</b>", html)
}
if nodes := soup.FindAll(0, Tag(regexp.MustCompile("^b"))); len(nodes) != 2 {
t.Errorf("expected ^b %d; got %d", 2, len(nodes))
Expand All @@ -39,12 +37,7 @@ func TestFilter(t *testing.T) {
if nodes := soup.FindAll(0, Tags("a", "b")); len(nodes) != 4 {
t.Errorf("expected nodes %d; got %d", 4, len(nodes))
} else {
expected := []string{
"<b>The Dormouse's story</b>",
`<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>`,
`<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>`,
`<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>`,
}
expected := []string{"<b>The Dormouse's story</b>", elsie, lacie, tillie}
for i, node := range nodes {
if html := node.Readable(); html != expected[i] {
t.Errorf("expected html #%d %q; got %q", i, expected[i], html)
Expand Down Expand Up @@ -82,10 +75,7 @@ func TestFilter(t *testing.T) {
})); len(nodes) != 2 {
t.Errorf("expected nodes %d; got %d", 2, len(nodes))
} else {
expected := []string{
`<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>`,
`<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>`,
}
expected := []string{elsie, tillie}
for i, node := range nodes {
if html := node.Readable(); !strings.HasPrefix(html, expected[i]) {
t.Errorf("expected html #%d %q; got %q", i, expected[i], html)
Expand Down
56 changes: 54 additions & 2 deletions finder.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@ package node
import (
"context"

"github.com/antchfx/htmlquery"
"github.com/antchfx/xpath"
"github.com/ericchiang/css"
"golang.org/x/net/html"
)

// Finder represents a set of methods for finding nodes.
type Finder interface {
// Find searches for a single node in the parse tree based on the specified find method and filters.
// Find searches for the first matched node in the parse tree based on the specified find method and filters.
Find(FindMethod, TagFilter, ...Filter) Node

// FindN searches for up to n nodes in the parse tree based on the specified find method and filters.
Expand All @@ -17,14 +20,33 @@ type Finder interface {
// FindAll searches for all nodes in the parse tree based on the specified find method and filters.
FindAll(FindMethod, TagFilter, ...Filter) []Node

// FindString searches for a single text node in the parse tree based on the specified find method and filters.
// FindString searches for the first matched text node in the parse tree based on the specified find method and filters.
FindString(FindMethod, StringFilter) TextNode

// FindStringN searches for up to n text nodes in the parse tree based on the specified find method and filters.
FindStringN(FindMethod, int, StringFilter) []TextNode

// FindAllString searches for all text nodes in the parse tree based on the specified find method and filters.
FindAllString(FindMethod, StringFilter) []TextNode

// CSS selectors support

// Select searches for the first matched node in the parse tree based on the css selector.
// Panics if the selector cannot be parsed.
Select(string) Node

// SelectAll searches for all nodes in the parse tree based on the css selector.
// Panics if the selector cannot be parsed.
SelectAll(string) []Node

// xpath support

// XPath searches for all nodes that match the specified XPath expression. Panics if the expression cannot be parsed.
XPath(string) []Node

// Evaluate returns the result of the xpath expression.
// The result type of the expression is one of the following: bool, float64, string, *xpath.NodeIterator.
Evaluate(string) (any, error)
}

// FindMethod represents the method used to search for nodes in the parse tree.
Expand Down Expand Up @@ -177,3 +199,33 @@ func (n *htmlNode) FindAllString(method FindMethod, filter StringFilter) (res []
}
return
}

// Select returns the first node matched by the CSS selector sel, or nil when
// the selector matches nothing.
//
// It delegates to SelectAll and therefore panics if sel cannot be parsed.
func (n *htmlNode) Select(sel string) Node {
	if matched := n.SelectAll(sel); len(matched) > 0 {
		return matched[0]
	}
	return nil
}

// SelectAll runs the CSS selector sel against the node returned by n.Raw()
// and returns every match wrapped with NewNode, in the order the selector
// reported them. The result is nil when nothing matches.
//
// The selector is parsed with css.MustParse, so SelectAll panics if sel
// cannot be parsed.
func (n *htmlNode) SelectAll(sel string) (res []Node) {
	selector := css.MustParse(sel)
	matched := selector.Select(n.Raw())
	for idx := range matched {
		res = append(res, NewNode(matched[idx]))
	}
	return res
}

// XPath passes the XPath expression expr and the node returned by n.Raw() to
// htmlquery.Find and returns each found node wrapped with NewNode, preserving
// the order of the results. The result is nil when nothing is found.
//
// Per the Finder contract, XPath panics if expr cannot be parsed.
func (n *htmlNode) XPath(expr string) (res []Node) {
	found := htmlquery.Find(n.Raw(), expr)
	for idx := range found {
		res = append(res, NewNode(found[idx]))
	}
	return res
}

// Evaluate compiles expr as an XPath expression and evaluates it using a
// navigator created from the node returned by n.Raw().
//
// If expr fails to compile, the compile error is returned with a nil result.
// Otherwise the evaluation result is returned with a nil error; per the Finder
// contract its dynamic type is one of bool, float64, string or
// *xpath.NodeIterator.
func (n *htmlNode) Evaluate(expr string) (any, error) {
	compiled, err := xpath.Compile(expr)
	if err != nil {
		return nil, err
	}
	nav := htmlquery.CreateXPathNavigator(n.Raw())
	return compiled.Evaluate(nav), nil
}
Loading

0 comments on commit 4ae58b8

Please sign in to comment.