From ef59a259386a99949002fcc5e4720085f8ddb6ad Mon Sep 17 00:00:00 2001 From: Udan Jayanith Jayakody Date: Sat, 16 Aug 2025 14:32:41 +0530 Subject: [PATCH 01/25] Add closest --- node-tree.go | 22 +++++++++++++++++++++- node-tree_test.go | 20 ++++++++++++++++++++ querying.go | 1 + test-files/4.html | 4 ++-- 4 files changed, 44 insertions(+), 3 deletions(-) diff --git a/node-tree.go b/node-tree.go index 759a0a7..858e7dc 100644 --- a/node-tree.go +++ b/node-tree.go @@ -2,6 +2,7 @@ package GoHtml import ( "strings" + "golang.org/x/net/html" ) @@ -75,7 +76,7 @@ func (node *Node) GetAttribute(attributeName string) (string, bool) { // RemoveAttribute remove or delete the specified attribute. func (node *Node) RemoveAttribute(attributeName string) { delete(node.attributes, strings.TrimSpace(strings.ToLower(attributeName))) - + } // IterateAttributes calls callback at every attribute in the node by passing attribute and value of the node. @@ -203,3 +204,22 @@ func (node *Node) RemoveNode() { func (node *Node) IsTextNode() bool { return node.GetTagName() == "" } + +// Closest traverses the node tree and its parents (heading toward the root node) until it finds a node that matches the specified query. 
+// Adapted from [https://developer.mozilla.org/en-US/docs/Web/API/Element/closest](MDN Element: closest() method) +func (node *Node) Closest(query string) *Node { + queryTokens := TokenizeQuery(query) + traverser := NewTraverser(node) + for traverser.GetCurrentNode() != nil { + if matchQueryTokens(traverser.GetCurrentNode(), queryTokens) { + break + } + + if traverser.GetCurrentNode().GetPreviousNode() == nil { + traverser.SetCurrentNodeTo(traverser.GetCurrentNode().GetParent()) + }else{ + traverser.Previous() + } + } + return traverser.GetCurrentNode() +} diff --git a/node-tree_test.go b/node-tree_test.go index 3ab4a01..500b33a 100644 --- a/node-tree_test.go +++ b/node-tree_test.go @@ -112,4 +112,24 @@ func TestRemoveNode(t *testing.T){ //p.RemoveNode() //t.Log(GoHtml.NodeTreeToHTML(article)) +} + +func TestClosest(t *testing.T){ + node, err := testFile4NodeTree() + if err != nil{ + t.Fatal(err) + } + node = node.GetElementByClassName("ordered-item") + if node == nil { + t.Fatal("Node is nil.") + } + + node = node.Closest("ol .ordered-list") + if node == nil { + t.Fatal("Node is nil") + }else if node.GetTagName() != "ol"{ + t.Fatal("Unexpected element.") + } + + } \ No newline at end of file diff --git a/querying.go b/querying.go index 8f00960..79092af 100644 --- a/querying.go +++ b/querying.go @@ -146,6 +146,7 @@ func TokenizeQuery(query string) []QueryToken { return slice } +// matchQueryTokens returns wether the queryTokens match given the node. func matchQueryTokens(node *Node, queryTokens []QueryToken) bool { if len(queryTokens) == 0 { return false diff --git a/test-files/4.html b/test-files/4.html index 966886b..18a385f 100644 --- a/test-files/4.html +++ b/test-files/4.html @@ -10,7 +10,7 @@

Document

List 1

-
    +
    1. Apple
    2. Orange
    3. Mango
    4. @@ -18,7 +18,7 @@

      List 1

      List 2

      Lorem

      -
        +
        • Cake
        • Pizza
        • Kottue
        • From 8c107e4593c6ece7b2da443f3efe7ff6dc174a72 Mon Sep 17 00:00:00 2001 From: Udan Jayanith Jayakody Date: Sat, 16 Aug 2025 19:04:30 +0530 Subject: [PATCH 02/25] Add QuerySelector and it need to be tested furthermore. --- querying.go | 172 ++++++++++++++++++++++++++--------------------- querying_test.go | 17 ++++- tokenizer.go | 74 ++++++++++++++++++++ 3 files changed, 184 insertions(+), 79 deletions(-) diff --git a/querying.go b/querying.go index 79092af..d1550ad 100644 --- a/querying.go +++ b/querying.go @@ -2,6 +2,8 @@ package GoHtml import ( "strings" + + "github.com/emirpasic/gods/stacks/linkedliststack" ) // GetElementByTagName returns the first node that match with the given tagName by advancing from the node. @@ -99,80 +101,6 @@ func (node *Node) GetElementsById(idName string) NodeList { return nodeList } -// Selector types -const ( - Id int = iota - Tag - Class -) - -// QueryToken store data about basic css selectors(ids, classes, tags). -type QueryToken struct { - Type int - SelectorName string - Selector string -} - -// TokenizeQuery tokenizes the query and returns a list of QueryToken. -func TokenizeQuery(query string) []QueryToken { - slice := make([]QueryToken, 0, 1) - if strings.TrimSpace(query) == "" { - return slice - } - - iter := strings.SplitSeq(query, " ") - for sec := range iter { - token := QueryToken{} - switch sec { - case "", " ", ".", "#": - continue - } - - switch string(sec[0]) { - case ".": - token.Type = Class - token.SelectorName = sec[1:] - case "#": - token.Type = Id - token.SelectorName = sec[1:] - default: - token.Type = Tag - token.SelectorName = sec - } - token.Selector = sec - slice = append(slice, token) - } - - return slice -} - -// matchQueryTokens returns wether the queryTokens match given the node. 
-func matchQueryTokens(node *Node, queryTokens []QueryToken) bool { - if len(queryTokens) == 0 { - return false - } - classList := NewClassList() - classList.DecodeFrom(node) - for _, token := range queryTokens { - switch token.Type { - case Id: - idName, _ := node.GetAttribute("id") - if token.SelectorName != idName { - return false - } - case Tag: - if node.GetTagName() != token.SelectorName { - return false - } - case Class: - if !classList.Contains(token.SelectorName) { - return false - } - } - } - return true -} - // Query returns the first node that matches with the give query. func (node *Node) Query(query string) *Node { queryTokens := TokenizeQuery(query) @@ -190,15 +118,105 @@ func (node *Node) Query(query string) *Node { } // QueryAll returns a NodeList containing nodes that matched with the given query. -func (node *Node) QueryAll(query string) NodeList{ +func (node *Node) QueryAll(query string) NodeList { nodeList := NewNodeList() queryTokens := TokenizeQuery(query) traverser := NewTraverser(node) - for node := range traverser.Walkthrough{ + for node := range traverser.Walkthrough { if matchQueryTokens(node, queryTokens) { nodeList.Append(node) } } return nodeList -} \ No newline at end of file +} + +func (node *Node) QuerySelector(query string) *Node { + queryTokens := TokenizeQuery(query) + + parentNodeStack := make([]*Node, 0, 2) + stack := linkedliststack.New() + type stackFrame struct { + //len should contain the length of the parentNodeStack at the stack push time. 
+ len int + node *Node + } + stack.Push(stackFrame{ + len: len(parentNodeStack), + node: node, + }) + + for stack.Size() > 0 { + val, _ := stack.Pop() + sf := val.(stackFrame) + + for diff := len(parentNodeStack) - sf.len; len(parentNodeStack) > 0 && diff > 0; diff-- { + pop(parentNodeStack) + } + + classList := NewClassList() + classList.DecodeFrom(sf.node) + i := matchFromRightMostQueryToken(sf.node, classList, queryTokens, len(queryTokens)-1) + if i < len(queryTokens)-1 { + for j := len(parentNodeStack) - 1; j >= 0; j-- { + node := parentNodeStack[j] + classList := NewClassList() + classList.DecodeFrom(node) + i = matchFromRightMostQueryToken(node, classList, queryTokens, i-1) + } + if i <= 0{ + return sf.node + } + } + + if sf.node.GetNextNode() != nil { + stack.Push(stackFrame{ + len: len(parentNodeStack), + node: sf.node.GetNextNode(), + }) + } + + if sf.node.GetChildNode() != nil { + childNode := sf.node.GetChildNode() + parentNodeStack = append(parentNodeStack, childNode) + stack.Push(stackFrame{ + len: len(parentNodeStack), + node: childNode, + }) + } + } + return nil +} + +func pop(slice []*Node) *Node { + if len(slice) > 0 { + res := slice[len(slice)-1] + slice = slice[:len(slice)-1] + return res + } + return nil +} + +// matchFromRightMostQueryToken tries to match query tokens from right to left and return the index at which point query token last matched. 
+func matchFromRightMostQueryToken(node *Node, classList ClassList, queryTokens []QueryToken, i int) int { + outer : for i >= 0 { + token := queryTokens[i] + switch token.Type { + case Id: + idName, _ := node.GetAttribute("id") + if token.SelectorName != idName { + break outer + } + case Class: + if !classList.Contains(token.SelectorName) { + break outer + } + case Tag: + if node.GetTagName() != token.SelectorName { + break outer + } + } + i-- + } + return i +} diff --git a/querying_test.go b/querying_test.go index 2c22331..ee7afcb 100644 --- a/querying_test.go +++ b/querying_test.go @@ -166,7 +166,7 @@ func TestSelectorTokenizer(t *testing.T) { } } -func TestQuerySelector(t *testing.T) { +func TestQuery(t *testing.T) { node, err := testFile4NodeTree() if err != nil { t.Fatal(err) @@ -180,7 +180,7 @@ func TestQuerySelector(t *testing.T) { } } -func TestQuerySelectorAll(t *testing.T) { +func TestQueryAll(t *testing.T) { node, err := testFile4NodeTree() if err != nil { t.Fatal(err) @@ -192,3 +192,16 @@ func TestQuerySelectorAll(t *testing.T) { t.Fatal("") } } + +func TestQuerySelector(t *testing.T){ + node, err := testFile4NodeTree() + if err != nil { + t.Fatal(err) + return + } + node = node.QuerySelector("html .ordered-list ol li .ordered-item") + if node == nil { + t.Fatal("Node is nill after QuerySelector") + } + t.Log(node.GetInnerText()) +} diff --git a/tokenizer.go b/tokenizer.go index fde080b..a6fee42 100644 --- a/tokenizer.go +++ b/tokenizer.go @@ -121,3 +121,77 @@ func isTopNode(node *Node, stack *linkedliststack.Stack) bool { topNode := val.(*Node) return topNode == node } + +// QueryToken types +const ( + Id int = iota + Tag + Class +) + +// QueryToken store data about basic css selectors(ids, classes, tags). +type QueryToken struct { + Type int + SelectorName string + Selector string +} + +// TokenizeQuery tokenizes the query and returns a list of QueryToken. 
+func TokenizeQuery(query string) []QueryToken { + slice := make([]QueryToken, 0, 1) + if strings.TrimSpace(query) == "" { + return slice + } + + iter := strings.SplitSeq(query, " ") + for sec := range iter { + token := QueryToken{} + switch sec { + case "", " ", ".", "#": + continue + } + + switch string(sec[0]) { + case ".": + token.Type = Class + token.SelectorName = sec[1:] + case "#": + token.Type = Id + token.SelectorName = sec[1:] + default: + token.Type = Tag + token.SelectorName = sec + } + token.Selector = sec + slice = append(slice, token) + } + + return slice +} + +// matchQueryTokens returns wether the queryTokens match given the node. +func matchQueryTokens(node *Node, queryTokens []QueryToken) bool { + if len(queryTokens) == 0 { + return false + } + classList := NewClassList() + classList.DecodeFrom(node) + for _, token := range queryTokens { + switch token.Type { + case Id: + idName, _ := node.GetAttribute("id") + if token.SelectorName != idName { + return false + } + case Tag: + if node.GetTagName() != token.SelectorName { + return false + } + case Class: + if !classList.Contains(token.SelectorName) { + return false + } + } + } + return true +} \ No newline at end of file From dfd0e60e8d6b2e31662807bb3be18d5f2344fcbe Mon Sep 17 00:00:00 2001 From: Udan Jayanith Jayakody Date: Sat, 16 Aug 2025 19:28:57 +0530 Subject: [PATCH 03/25] pop function bug fixed --- querying.go | 13 ++++++++----- querying_test.go | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/querying.go b/querying.go index d1550ad..0f16ec2 100644 --- a/querying.go +++ b/querying.go @@ -1,6 +1,7 @@ package GoHtml import ( + "fmt" "strings" "github.com/emirpasic/gods/stacks/linkedliststack" @@ -151,18 +152,20 @@ func (node *Node) QuerySelector(query string) *Node { sf := val.(stackFrame) for diff := len(parentNodeStack) - sf.len; len(parentNodeStack) > 0 && diff > 0; diff-- { - pop(parentNodeStack) + _, parentNodeStack = pop(parentNodeStack) } classList := NewClassList() 
classList.DecodeFrom(sf.node) i := matchFromRightMostQueryToken(sf.node, classList, queryTokens, len(queryTokens)-1) if i < len(queryTokens)-1 { - for j := len(parentNodeStack) - 1; j >= 0; j-- { + fmt.Println(sf.node) + for j := len(parentNodeStack) - 1; j >= 0 && i >= 0; j-- { node := parentNodeStack[j] classList := NewClassList() classList.DecodeFrom(node) i = matchFromRightMostQueryToken(node, classList, queryTokens, i-1) + fmt.Println(node, i) } if i <= 0{ return sf.node @@ -188,13 +191,13 @@ func (node *Node) QuerySelector(query string) *Node { return nil } -func pop(slice []*Node) *Node { +func pop(slice []*Node) (*Node, []*Node) { if len(slice) > 0 { res := slice[len(slice)-1] slice = slice[:len(slice)-1] - return res + return res, slice } - return nil + return nil, slice } // matchFromRightMostQueryToken tries to match query tokens from right to left and return the index at which point query token last matched. diff --git a/querying_test.go b/querying_test.go index ee7afcb..6ee91c9 100644 --- a/querying_test.go +++ b/querying_test.go @@ -199,7 +199,7 @@ func TestQuerySelector(t *testing.T){ t.Fatal(err) return } - node = node.QuerySelector("html .ordered-list ol li .ordered-item") + node = node.QuerySelector(".ordered-list li .ordered-item") if node == nil { t.Fatal("Node is nill after QuerySelector") } From a3d00d42c0f3ef71cab022b4fea23bf0111d4300 Mon Sep 17 00:00:00 2001 From: Udan Jayanith Jayakody Date: Sat, 16 Aug 2025 21:52:37 +0530 Subject: [PATCH 04/25] Bug fixed and add QuerySearch. 
QuerySearch still have to be tested btw --- querying.go | 116 +++++++++++++++++++++++++++-------------------- querying_test.go | 2 +- 2 files changed, 67 insertions(+), 51 deletions(-) diff --git a/querying.go b/querying.go index 0f16ec2..d3f0802 100644 --- a/querying.go +++ b/querying.go @@ -1,7 +1,7 @@ package GoHtml import ( - "fmt" + "iter" "strings" "github.com/emirpasic/gods/stacks/linkedliststack" @@ -132,63 +132,62 @@ func (node *Node) QueryAll(query string) NodeList { return nodeList } -func (node *Node) QuerySelector(query string) *Node { - queryTokens := TokenizeQuery(query) - - parentNodeStack := make([]*Node, 0, 2) - stack := linkedliststack.New() - type stackFrame struct { - //len should contain the length of the parentNodeStack at the stack push time. - len int - node *Node - } - stack.Push(stackFrame{ - len: len(parentNodeStack), - node: node, - }) - - for stack.Size() > 0 { - val, _ := stack.Pop() - sf := val.(stackFrame) +func QuerySearch(node *Node, query string) iter.Seq[*Node] { + return func(yield func(node *Node) bool) { + queryTokens := TokenizeQuery(query) - for diff := len(parentNodeStack) - sf.len; len(parentNodeStack) > 0 && diff > 0; diff-- { - _, parentNodeStack = pop(parentNodeStack) + parentNodeStack := make([]*Node, 0, 2) + stack := linkedliststack.New() + type stackFrame struct { + //len should contain the length of the parentNodeStack at the stack push time. 
+ len int + node *Node } + stack.Push(stackFrame{ + len: len(parentNodeStack), + node: node, + }) - classList := NewClassList() - classList.DecodeFrom(sf.node) - i := matchFromRightMostQueryToken(sf.node, classList, queryTokens, len(queryTokens)-1) - if i < len(queryTokens)-1 { - fmt.Println(sf.node) - for j := len(parentNodeStack) - 1; j >= 0 && i >= 0; j-- { - node := parentNodeStack[j] - classList := NewClassList() - classList.DecodeFrom(node) - i = matchFromRightMostQueryToken(node, classList, queryTokens, i-1) - fmt.Println(node, i) + for stack.Size() > 0 { + val, _ := stack.Pop() + sf := val.(stackFrame) + + for diff := len(parentNodeStack) - sf.len; len(parentNodeStack) > 0 && diff > 0; diff-- { + _, parentNodeStack = pop(parentNodeStack) } - if i <= 0{ - return sf.node + + classList := NewClassList() + classList.DecodeFrom(sf.node) + i := matchFromRightMostQueryToken(sf.node, classList, queryTokens, len(queryTokens)-1) + if i < len(queryTokens)-1 { + for j := len(parentNodeStack) - 1; j >= 0 && i >= 0; j-- { + node := parentNodeStack[j] + classList := NewClassList() + classList.DecodeFrom(node) + i = matchFromRightMostQueryToken(node, classList, queryTokens, i-1) + } + if i < 0 && !yield(sf.node){ + return + } } - } - if sf.node.GetNextNode() != nil { - stack.Push(stackFrame{ - len: len(parentNodeStack), - node: sf.node.GetNextNode(), - }) - } + if sf.node.GetNextNode() != nil { + stack.Push(stackFrame{ + len: len(parentNodeStack), + node: sf.node.GetNextNode(), + }) + } - if sf.node.GetChildNode() != nil { - childNode := sf.node.GetChildNode() - parentNodeStack = append(parentNodeStack, childNode) - stack.Push(stackFrame{ - len: len(parentNodeStack), - node: childNode, - }) + if sf.node.GetChildNode() != nil { + childNode := sf.node.GetChildNode() + parentNodeStack = append(parentNodeStack, childNode) + stack.Push(stackFrame{ + len: len(parentNodeStack), + node: childNode, + }) + } } } - return nil } func pop(slice []*Node) (*Node, []*Node) { @@ -202,8 
+201,17 @@ func pop(slice []*Node) (*Node, []*Node) { // matchFromRightMostQueryToken tries to match query tokens from right to left and return the index at which point query token last matched. func matchFromRightMostQueryToken(node *Node, classList ClassList, queryTokens []QueryToken, i int) int { - outer : for i >= 0 { + checked := make(map[string]struct{}) +outer: + for i >= 0 { token := queryTokens[i] + _, ok := checked[token.Selector] + if ok{ + break + }else{ + checked[token.Selector] = struct{}{} + } + switch token.Type { case Id: idName, _ := node.GetAttribute("id") @@ -223,3 +231,11 @@ func matchFromRightMostQueryToken(node *Node, classList ClassList, queryTokens [ } return i } + +func (node *Node) QuerySelector(query string) *Node { + iter := QuerySearch(node, query) + for node := range iter{ + return node + } + return nil +} diff --git a/querying_test.go b/querying_test.go index 6ee91c9..ee7afcb 100644 --- a/querying_test.go +++ b/querying_test.go @@ -199,7 +199,7 @@ func TestQuerySelector(t *testing.T){ t.Fatal(err) return } - node = node.QuerySelector(".ordered-list li .ordered-item") + node = node.QuerySelector("html .ordered-list ol li .ordered-item") if node == nil { t.Fatal("Node is nill after QuerySelector") } From 3a4713a694442e829639364931926175ee776720 Mon Sep 17 00:00:00 2001 From: Udan Jayanith Jayakody Date: Sat, 16 Aug 2025 21:55:40 +0530 Subject: [PATCH 05/25] tokanizer CurrentNode to GetCurrentNode to match with the library schema --- tokenizer.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tokenizer.go b/tokenizer.go index a6fee42..e6b200d 100644 --- a/tokenizer.go +++ b/tokenizer.go @@ -28,7 +28,7 @@ func (t *Tokenizer) Advanced() html.TokenType { // CurrentNode returns the current node. // Returned value can be nil regardless of tt. 
-func (t *Tokenizer) CurrentNode() *Node { +func (t *Tokenizer) GetCurrentNode() *Node { currentToken := t.z.Token() if strings.TrimSpace(currentToken.Data) == "" { return nil From 771f70c8a9cd6e85979e9ae9e677961cced9be59 Mon Sep 17 00:00:00 2001 From: Udan Jayanith Jayakody Date: Sat, 16 Aug 2025 21:56:58 +0530 Subject: [PATCH 06/25] Fixed deprecated function use error in the parser.go --- parser.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parser.go b/parser.go index 251a7d2..93edf8f 100644 --- a/parser.go +++ b/parser.go @@ -17,7 +17,7 @@ func Decode(r io.Reader) (*Node, error) { break } - nodeTreeBuilder.WriteNodeTree(t.CurrentNode(), tt) + nodeTreeBuilder.WriteNodeTree(t.GetCurrentNode(), tt) } return nodeTreeBuilder.GetRootNode(), nil From ee66653729ca5adfc88de8bcb56b503fe5c87dc3 Mon Sep 17 00:00:00 2001 From: Udan Jayanith Jayakody Date: Sat, 16 Aug 2025 22:30:46 +0530 Subject: [PATCH 07/25] Add QuerySelectorAll and redocumented the library. QuerySearch, QuerySelector and QuerySelectorAll still not tested and have tests to write. --- .vscode/settings.json | 1 + node-tree.go | 11 ++++++++++- querying.go | 15 +++++++++++++++ tokenizer.go | 7 ++++++- 4 files changed, 32 insertions(+), 2 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 5e89b44..92d3d14 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,6 +1,7 @@ { "cSpell.words": [ "autoplay", + "Combinators", "DOCTYPEDTD", "emirpasic", "gohtml", diff --git a/node-tree.go b/node-tree.go index 858e7dc..aaa129f 100644 --- a/node-tree.go +++ b/node-tree.go @@ -205,7 +205,16 @@ func (node *Node) IsTextNode() bool { return node.GetTagName() == "" } -// Closest traverses the node tree and its parents (heading toward the root node) until it finds a node that matches the specified query. +// Closest traverses the node tree and its parents (heading toward the root node) until it finds a node that matches the full query. 
+/* +Ex: +query = "div .video-container #div" + +
          + +As shown elements with every described specifiers will only match. +But this is not the case for QuerySearch, QuerySelector and QuerySelectorAll. +*/ // Adapted from [https://developer.mozilla.org/en-US/docs/Web/API/Element/closest](MDN Element: closest() method) func (node *Node) Closest(query string) *Node { queryTokens := TokenizeQuery(query) diff --git a/querying.go b/querying.go index d3f0802..1072652 100644 --- a/querying.go +++ b/querying.go @@ -132,6 +132,9 @@ func (node *Node) QueryAll(query string) NodeList { return nodeList } +/* +QuerySearch tokenizes the query string and search for nodes that matches with the right most query token. After matching right most query it proceeds to match nodes parents nodes for left over tokens and then passed that node to (yield/range). QuerySearch search the whole node tree for matches unless yield get canceled or range iterator get cancel. +*/ func QuerySearch(node *Node, query string) iter.Seq[*Node] { return func(yield func(node *Node) bool) { queryTokens := TokenizeQuery(query) @@ -232,6 +235,7 @@ outer: return i } +//QuerySelector only returns the first node that matches with the QuerySearch. func (node *Node) QuerySelector(query string) *Node { iter := QuerySearch(node, query) for node := range iter{ @@ -239,3 +243,14 @@ func (node *Node) QuerySelector(query string) *Node { } return nil } + +//QuerySelectorAll stores nodes passed down by QuerySearch in a nodeList and returns the nodeList. +func (node *Node) QuerySelectorAll(query string) NodeList{ + iter := QuerySearch(node, query) + nodeList := NewNodeList() + + for node := range iter{ + nodeList.Append(node) + } + return nodeList +} diff --git a/tokenizer.go b/tokenizer.go index e6b200d..d0905cc 100644 --- a/tokenizer.go +++ b/tokenizer.go @@ -136,7 +136,12 @@ type QueryToken struct { Selector string } -// TokenizeQuery tokenizes the query and returns a list of QueryToken. + /* +TokenizeQuery tokenizes the query and returns a list of QueryToken. 
+ +query should be of only consists of class, tag and/or id. This applies to every function that accepts a parameter name query. +query should not consists of css selectors, Combinators and separators. +*/ func TokenizeQuery(query string) []QueryToken { slice := make([]QueryToken, 0, 1) if strings.TrimSpace(query) == "" { From 39f9bbc29c7400dac96a4bd940f1c87b49ec5669 Mon Sep 17 00:00:00 2001 From: Udan Jayanith Jayakody Date: Sat, 16 Aug 2025 22:33:19 +0530 Subject: [PATCH 08/25] Updated the readme.md --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index dafb5af..d491a5c 100644 --- a/README.md +++ b/README.md @@ -50,10 +50,11 @@ Heres an example of fetching a website and parsing and then using querying metho ## Changelog Changes, bug fixes and new features in this version. -- add: Tokenizer -- add: NodeTreeBuilder -- renamed: QuerySelector to Query -- renamed: QuerySelectorAll to QueryAll +- add: Closest +- add: QuerySearch +- add: QuerySelector +- add: QuerySelectorAll +- renamed: Tokenizer CurrentNode method to GetCurrentNode. ## Documentation From fa601d319ff350e7e6332628feace8bea323bdcf Mon Sep 17 00:00:00 2001 From: Udan Jayanith Jayakody Date: Sat, 16 Aug 2025 22:36:25 +0530 Subject: [PATCH 09/25] Fixed extra tabs in readme.md example go code section. --- README.md | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index d491a5c..7ab641e 100644 --- a/README.md +++ b/README.md @@ -28,23 +28,23 @@ import ( ## Example Heres an example of fetching a website and parsing and then using querying methods. ```go - res, err := http.Get("https://www.metalsucks.net/") - if err != nil { - t.Fatal(err) - } - defer res.Body.Close() - - //Parses the given html reader and then returns the root node and an error. 
- node, err := GoHtml.Decode(res.Body) - if err != nil { - t.Fatal(err) - } - - nodeList := node.GetElementsByClassName("post-title") - iter := nodeList.IterNodeList() - for node := range iter{ - print(node.GetInnerText()) - } +res, err := http.Get("https://www.metalsucks.net/") +if err != nil { + t.Fatal(err) +} +defer res.Body.Close() + +//Parses the given html reader and then returns the root node and an error. +node, err := GoHtml.Decode(res.Body) +if err != nil { + t.Fatal(err) +} + +nodeList := node.GetElementsByClassName("post-title") +iter := nodeList.IterNodeList() +for node := range iter{ + print(node.GetInnerText()) +} ``` ## Changelog From 447b63750251ad6037c02153ba5a238d127d2021 Mon Sep 17 00:00:00 2001 From: Udan Jayanith Jayakody Date: Sun, 17 Aug 2025 11:08:50 +0530 Subject: [PATCH 10/25] Updated benchmark_test.go --- benchmarks/benchmark_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_test.go b/benchmarks/benchmark_test.go index 77af6ab..d8886f2 100644 --- a/benchmarks/benchmark_test.go +++ b/benchmarks/benchmark_test.go @@ -20,7 +20,7 @@ func TestFetchPostCovers(t *testing.T){ t.Fatal(err) } - nodeList := node.QueryAll(".sm-feat .clearfix article") + nodeList := node.QuerySelectorAll(".left-content article .post-title") t.Log("Got ", nodeList.Len(), " post titles.") iter := nodeList.IterNodeList() for node := range iter{ From bdef2bbfba8c7d569e5349b9a47997c011105471 Mon Sep 17 00:00:00 2001 From: Udan Jayanith Jayakody Date: Sun, 17 Aug 2025 13:34:14 +0530 Subject: [PATCH 11/25] Made GetParent fast. GetParent need be tested manually. GetParent and Closest test function passed. 
--- node-tree.go | 4 +++- querying.go | 27 ++++++++++++++------------- querying_test.go | 37 ++++++++++++++++++++++++++++++++++--- test-files/5.html | 39 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 90 insertions(+), 17 deletions(-) create mode 100644 test-files/5.html diff --git a/node-tree.go b/node-tree.go index aaa129f..fc12549 100644 --- a/node-tree.go +++ b/node-tree.go @@ -115,6 +115,7 @@ func (node *Node) AppendChild(childNode *Node) { lastNode := node.GetChildNode().GetLastNode() childNode.SetPreviousNode(lastNode) + childNode.setParentNode(lastNode.GetParent()) lastNode.SetNextNode(childNode) } @@ -122,12 +123,13 @@ func (node *Node) AppendChild(childNode *Node) { func (node *Node) Append(newNode *Node) { lastNode := node.GetLastNode() newNode.SetPreviousNode(lastNode) + newNode.setParentNode(lastNode.GetParent()) lastNode.SetNextNode(newNode) } // GetParent returns a pointer to the parent node. func (node *Node) GetParent() *Node { - return node.GetFirstNode().getParentNode() + return node.parentNode } // GetLastNode returns the last node in the node branch. diff --git a/querying.go b/querying.go index 1072652..bc67bc0 100644 --- a/querying.go +++ b/querying.go @@ -133,7 +133,7 @@ func (node *Node) QueryAll(query string) NodeList { } /* -QuerySearch tokenizes the query string and search for nodes that matches with the right most query token. After matching right most query it proceeds to match nodes parents nodes for left over tokens and then passed that node to (yield/range). QuerySearch search the whole node tree for matches unless yield get canceled or range iterator get cancel. +QuerySearch tokenizes the query string and search for nodes that matches with the right most query token. After matching right most query it proceeds to match nodes parents nodes for left over tokens and then passed that node to (yield/range). QuerySearch search the whole node tree for matches unless yield get canceled or range iterator get cancel. 
*/ func QuerySearch(node *Node, query string) iter.Seq[*Node] { return func(yield func(node *Node) bool) { @@ -155,8 +155,8 @@ func QuerySearch(node *Node, query string) iter.Seq[*Node] { val, _ := stack.Pop() sf := val.(stackFrame) - for diff := len(parentNodeStack) - sf.len; len(parentNodeStack) > 0 && diff > 0; diff-- { - _, parentNodeStack = pop(parentNodeStack) + if sf.len <= len(parentNodeStack) { + parentNodeStack = parentNodeStack[:sf.len] } classList := NewClassList() @@ -169,7 +169,8 @@ func QuerySearch(node *Node, query string) iter.Seq[*Node] { classList.DecodeFrom(node) i = matchFromRightMostQueryToken(node, classList, queryTokens, i-1) } - if i < 0 && !yield(sf.node){ + + if i < 0 && !yield(sf.node) { return } } @@ -183,7 +184,7 @@ func QuerySearch(node *Node, query string) iter.Seq[*Node] { if sf.node.GetChildNode() != nil { childNode := sf.node.GetChildNode() - parentNodeStack = append(parentNodeStack, childNode) + parentNodeStack = append(parentNodeStack, sf.node) stack.Push(stackFrame{ len: len(parentNodeStack), node: childNode, @@ -209,9 +210,9 @@ outer: for i >= 0 { token := queryTokens[i] _, ok := checked[token.Selector] - if ok{ + if ok { break - }else{ + } else { checked[token.Selector] = struct{}{} } @@ -235,21 +236,21 @@ outer: return i } -//QuerySelector only returns the first node that matches with the QuerySearch. +// QuerySelector only returns the first node that matches with the QuerySearch. func (node *Node) QuerySelector(query string) *Node { iter := QuerySearch(node, query) - for node := range iter{ + for node := range iter { return node } return nil } -//QuerySelectorAll stores nodes passed down by QuerySearch in a nodeList and returns the nodeList. -func (node *Node) QuerySelectorAll(query string) NodeList{ +// QuerySelectorAll stores nodes passed down by QuerySearch in a nodeList and returns the nodeList. 
+func (node *Node) QuerySelectorAll(query string) NodeList { iter := QuerySearch(node, query) nodeList := NewNodeList() - - for node := range iter{ + + for node := range iter { nodeList.Append(node) } return nodeList diff --git a/querying_test.go b/querying_test.go index ee7afcb..34b0d30 100644 --- a/querying_test.go +++ b/querying_test.go @@ -188,12 +188,12 @@ func TestQueryAll(t *testing.T) { } nodeList := node.QueryAll("h2") - if nodeList.Len() != 2{ + if nodeList.Len() != 2 { t.Fatal("") } } -func TestQuerySelector(t *testing.T){ +func TestQuerySelector(t *testing.T) { node, err := testFile4NodeTree() if err != nil { t.Fatal(err) @@ -202,6 +202,37 @@ func TestQuerySelector(t *testing.T){ node = node.QuerySelector("html .ordered-list ol li .ordered-item") if node == nil { t.Fatal("Node is nill after QuerySelector") + } else if node.GetInnerText() != "Apple" { + t.Fatal("Unexpected text") + } +} + +func TestQuerySelectorAll(t *testing.T) { + node, err := testFile4NodeTree() + if err != nil { + t.Fatal(err) + return + } + + nodeList := node.QuerySelectorAll(".unordered-list li") + if nodeList.Len() == 0 { + t.Fatal("Node list is empty") + }else if nodeList.Len() != 3{ + t.Fatal("Extra node in the node list.", nodeList.Len()) + } + stack := linkedliststack.New() + stack.Push("Kottue") + stack.Push("Pizza") + stack.Push("Cake") + + iter := nodeList.IterNodeList() + for node := range iter{ + val, _ := stack.Pop() + str := val.(string) + if node.GetInnerText() != str{ + t.Fatal("Got unexpected text.", "Expected", str, "But got", node.GetInnerText()) + }else{ + t.Log(node.GetInnerText()) + } } - t.Log(node.GetInnerText()) } diff --git a/test-files/5.html b/test-files/5.html new file mode 100644 index 0000000..68dca45 --- /dev/null +++ b/test-files/5.html @@ -0,0 +1,39 @@ + + + + Tricky HTML + + +
          +
          +
          +

          First Post

          +

          This is some tricky HTML code.

          + Read More +
          + + +
          + +
            +
          • One
          • +
          • Two extra
          • +
          • Three
          • +
          + +
          + + + +
          +
          + +
          +

          © 2025 Example

          +
          + + From 423ceead6c9e71ffb1090ae8487127cd6f2e2038 Mon Sep 17 00:00:00 2001 From: Udan Jayanith Jayakody Date: Sun, 17 Aug 2025 14:55:59 +0530 Subject: [PATCH 12/25] Bug fixed --- querying.go | 75 +++++++++++------------------------------------------ 1 file changed, 15 insertions(+), 60 deletions(-) diff --git a/querying.go b/querying.go index bc67bc0..7992a8d 100644 --- a/querying.go +++ b/querying.go @@ -3,8 +3,6 @@ package GoHtml import ( "iter" "strings" - - "github.com/emirpasic/gods/stacks/linkedliststack" ) // GetElementByTagName returns the first node that match with the given tagName by advancing from the node. @@ -136,75 +134,32 @@ func (node *Node) QueryAll(query string) NodeList { QuerySearch tokenizes the query string and search for nodes that matches with the right most query token. After matching right most query it proceeds to match nodes parents nodes for left over tokens and then passed that node to (yield/range). QuerySearch search the whole node tree for matches unless yield get canceled or range iterator get cancel. */ func QuerySearch(node *Node, query string) iter.Seq[*Node] { + traverser := NewTraverser(node) return func(yield func(node *Node) bool) { queryTokens := TokenizeQuery(query) - - parentNodeStack := make([]*Node, 0, 2) - stack := linkedliststack.New() - type stackFrame struct { - //len should contain the length of the parentNodeStack at the stack push time. 
- len int - node *Node - } - stack.Push(stackFrame{ - len: len(parentNodeStack), - node: node, - }) - - for stack.Size() > 0 { - val, _ := stack.Pop() - sf := val.(stackFrame) - - if sf.len <= len(parentNodeStack) { - parentNodeStack = parentNodeStack[:sf.len] + iter := traverser.Walkthrough + for node := range iter { + i := matchFromRightMostQueryToken(node, queryTokens, len(queryTokens)-1) + if i == len(queryTokens)-1{ + continue } - - classList := NewClassList() - classList.DecodeFrom(sf.node) - i := matchFromRightMostQueryToken(sf.node, classList, queryTokens, len(queryTokens)-1) - if i < len(queryTokens)-1 { - for j := len(parentNodeStack) - 1; j >= 0 && i >= 0; j-- { - node := parentNodeStack[j] - classList := NewClassList() - classList.DecodeFrom(node) - i = matchFromRightMostQueryToken(node, classList, queryTokens, i-1) - } - - if i < 0 && !yield(sf.node) { - return - } - } - - if sf.node.GetNextNode() != nil { - stack.Push(stackFrame{ - len: len(parentNodeStack), - node: sf.node.GetNextNode(), - }) + parentNode := node.GetParent() + for parentNode != nil && i>=0 { + i = matchFromRightMostQueryToken(parentNode, queryTokens, i) + parentNode = parentNode.GetParent() } - - if sf.node.GetChildNode() != nil { - childNode := sf.node.GetChildNode() - parentNodeStack = append(parentNodeStack, sf.node) - stack.Push(stackFrame{ - len: len(parentNodeStack), - node: childNode, - }) + if i < 0 && !yield(node){ + return } } - } -} -func pop(slice []*Node) (*Node, []*Node) { - if len(slice) > 0 { - res := slice[len(slice)-1] - slice = slice[:len(slice)-1] - return res, slice } - return nil, slice } // matchFromRightMostQueryToken tries to match query tokens from right to left and return the index at which point query token last matched. 
-func matchFromRightMostQueryToken(node *Node, classList ClassList, queryTokens []QueryToken, i int) int { +func matchFromRightMostQueryToken(node *Node, queryTokens []QueryToken, i int) int { + classList := NewClassList() + classList.DecodeFrom(node) checked := make(map[string]struct{}) outer: for i >= 0 { From caaab5fcf4d02f6f5d8cae74dd802e860d3e2ce0 Mon Sep 17 00:00:00 2001 From: Udan Jayanith Date: Mon, 18 Aug 2025 18:02:03 +0530 Subject: [PATCH 13/25] Removed QueryAll and Query methods --- querying.go | 30 ------------------------------ querying_test.go | 27 --------------------------- 2 files changed, 57 deletions(-) diff --git a/querying.go b/querying.go index 7992a8d..642173e 100644 --- a/querying.go +++ b/querying.go @@ -100,36 +100,6 @@ func (node *Node) GetElementsById(idName string) NodeList { return nodeList } -// Query returns the first node that matches with the give query. -func (node *Node) Query(query string) *Node { - queryTokens := TokenizeQuery(query) - - traverser := NewTraverser(node) - var res *Node - traverser.Walkthrough(func(node *Node) TraverseCondition { - if matchQueryTokens(node, queryTokens) { - res = node - return StopWalkthrough - } - return ContinueWalkthrough - }) - return res -} - -// QueryAll returns a NodeList containing nodes that matched with the given query. -func (node *Node) QueryAll(query string) NodeList { - nodeList := NewNodeList() - queryTokens := TokenizeQuery(query) - traverser := NewTraverser(node) - - for node := range traverser.Walkthrough { - if matchQueryTokens(node, queryTokens) { - nodeList.Append(node) - } - } - return nodeList -} - /* QuerySearch tokenizes the query string and search for nodes that matches with the right most query token. After matching right most query it proceeds to match nodes parents nodes for left over tokens and then passed that node to (yield/range). QuerySearch search the whole node tree for matches unless yield get canceled or range iterator get cancel. 
*/ diff --git a/querying_test.go b/querying_test.go index 34b0d30..fcba1ca 100644 --- a/querying_test.go +++ b/querying_test.go @@ -166,33 +166,6 @@ func TestSelectorTokenizer(t *testing.T) { } } -func TestQuery(t *testing.T) { - node, err := testFile4NodeTree() - if err != nil { - t.Fatal(err) - return - } - imgEl := node.Query("img #idElement") - imgSrc, _ := imgEl.GetAttribute("src") - imgAlt, _ := imgEl.GetAttribute("alt") - if imgSrc != "" || imgAlt != "" { - t.Fatal("") - } -} - -func TestQueryAll(t *testing.T) { - node, err := testFile4NodeTree() - if err != nil { - t.Fatal(err) - return - } - - nodeList := node.QueryAll("h2") - if nodeList.Len() != 2 { - t.Fatal("") - } -} - func TestQuerySelector(t *testing.T) { node, err := testFile4NodeTree() if err != nil { From 1a79a084c923c7f9a4a1d220d9200329aa60b416 Mon Sep 17 00:00:00 2001 From: Udan Jayanith Date: Fri, 22 Aug 2025 16:57:54 +0530 Subject: [PATCH 14/25] Half finished selectors --- benchmarks/benchmark_test.go | 4 +- node-tree.go | 13 +-- node-tree_test.go | 4 +- querying.go | 5 +- querying_test.go | 3 + selectors.go | 175 +++++++++++++++++++++++++++++++++++ selectors_test.go | 13 +++ tokenizer.go | 84 +---------------- 8 files changed, 204 insertions(+), 97 deletions(-) create mode 100644 selectors.go create mode 100644 selectors_test.go diff --git a/benchmarks/benchmark_test.go b/benchmarks/benchmark_test.go index d8886f2..394c5d0 100644 --- a/benchmarks/benchmark_test.go +++ b/benchmarks/benchmark_test.go @@ -6,7 +6,9 @@ import( "net/http" "time" ) - +/* +Adapted from [GoQuery example](https://github.com/PuerkitoBio/goquery?tab=readme-ov-file#examples) +*/ func TestFetchPostCovers(t *testing.T){ res, err := http.Get("https://www.metalsucks.net/") if err != nil { diff --git a/node-tree.go b/node-tree.go index fc12549..65ec888 100644 --- a/node-tree.go +++ b/node-tree.go @@ -219,18 +219,7 @@ But this is not the case for QuerySearch, QuerySelector and QuerySelectorAll. 
*/ // Adapted from [https://developer.mozilla.org/en-US/docs/Web/API/Element/closest](MDN Element: closest() method) func (node *Node) Closest(query string) *Node { - queryTokens := TokenizeQuery(query) traverser := NewTraverser(node) - for traverser.GetCurrentNode() != nil { - if matchQueryTokens(traverser.GetCurrentNode(), queryTokens) { - break - } - - if traverser.GetCurrentNode().GetPreviousNode() == nil { - traverser.SetCurrentNodeTo(traverser.GetCurrentNode().GetParent()) - }else{ - traverser.Previous() - } - } + return traverser.GetCurrentNode() } diff --git a/node-tree_test.go b/node-tree_test.go index 500b33a..69aaffb 100644 --- a/node-tree_test.go +++ b/node-tree_test.go @@ -114,6 +114,7 @@ func TestRemoveNode(t *testing.T){ //t.Log(GoHtml.NodeTreeToHTML(article)) } +/* func TestClosest(t *testing.T){ node, err := testFile4NodeTree() if err != nil{ @@ -132,4 +133,5 @@ func TestClosest(t *testing.T){ } -} \ No newline at end of file +} +*/ \ No newline at end of file diff --git a/querying.go b/querying.go index 642173e..c823995 100644 --- a/querying.go +++ b/querying.go @@ -1,7 +1,7 @@ package GoHtml import ( - "iter" + //"iter" "strings" ) @@ -103,6 +103,7 @@ func (node *Node) GetElementsById(idName string) NodeList { /* QuerySearch tokenizes the query string and search for nodes that matches with the right most query token. After matching right most query it proceeds to match nodes parents nodes for left over tokens and then passed that node to (yield/range). QuerySearch search the whole node tree for matches unless yield get canceled or range iterator get cancel. 
*/ +/* func QuerySearch(node *Node, query string) iter.Seq[*Node] { traverser := NewTraverser(node) return func(yield func(node *Node) bool) { @@ -180,3 +181,5 @@ func (node *Node) QuerySelectorAll(query string) NodeList { } return nodeList } + +*/ \ No newline at end of file diff --git a/querying_test.go b/querying_test.go index fcba1ca..feb84ea 100644 --- a/querying_test.go +++ b/querying_test.go @@ -138,6 +138,7 @@ func TestGetElementsById(t *testing.T) { } } +/* func TestSelectorTokenizer(t *testing.T) { stack := linkedliststack.New() stack.Push("article .content") @@ -209,3 +210,5 @@ func TestQuerySelectorAll(t *testing.T) { } } } + +*/ \ No newline at end of file diff --git a/selectors.go b/selectors.go new file mode 100644 index 0000000..32aa7ea --- /dev/null +++ b/selectors.go @@ -0,0 +1,175 @@ +package GoHtml + +import ( + "strings" +) + +type BasicSelector int + +const ( + Id BasicSelector = iota + Class + Tag +) + +type Selector struct { + selector string + selectorName string + selectorType BasicSelector +} + +func matchNode(node *Node, basicSelectorName string, basicSelectorType BasicSelector) bool { + if basicSelectorName == ""{ + return true + }else if node == nil { + return false + } + + switch basicSelectorType { + case Id: + idName, _ := node.GetAttribute("id") + return idName == basicSelectorName + case Class: + classList := NewClassList() + classList.DecodeFrom(node) + return classList.Contains(basicSelectorName) + case Tag: + return node.GetTagName() == basicSelectorName + } + return false +} + +func NewSelector(selector string) Selector { + selector = strings.TrimSpace(selector) + selectorStruct := Selector{} + if len(selector) == 0 || (selector[0] == '.' 
|| selector[0] == '#') && len(selector) <= 1 { + return selectorStruct + } + + switch selector[0] { + case '.': + selectorStruct.selectorType = Class + case '#': + selectorStruct.selectorType = Id + default: + selectorStruct.selectorType = Tag + } + + selectorStruct.selector = selector + if selectorStruct.selectorType != Tag { + selectorStruct.selectorName = selector[1:] + } else { + selectorStruct.selectorName = selector + } + return selectorStruct +} + +type Combinator int + +const ( + Descendant Combinator = iota + Child + NextSibling + SubsequentSibling + //if no combinator + NoneCombinator +) + +type CombinatorEl struct { + Type Combinator + Selector1 Selector + Selector2 Selector +} + +func TokenizeSelectorsAndCombinators(selector string) []CombinatorEl { + list := make([]CombinatorEl, 0, 1) + slice := strings.SplitSeq(selector, " ") + currentCombinator := *new(CombinatorEl) + currentCombinator.Selector1 = NewSelector("") + for str := range slice { + if strings.TrimSpace(str) == "" { + continue + } + + switch str { + case "+": + currentCombinator.Type = NextSibling + case ">": + currentCombinator.Type = Child + case "~": + currentCombinator.Type = SubsequentSibling + default: + newSelector := NewSelector(str) + currentCombinator.Selector2 = newSelector + list = append(list, currentCombinator) + currentCombinator = *new(CombinatorEl) + currentCombinator.Selector1 = newSelector + } + + } + + if len(list) == 1 { + list[0].Type = NoneCombinator + } + + return list +} + +func (ce *CombinatorEl) IsMatchingNode(node *Node) bool { + switch ce.Type { + case Descendant: + return ce.isDescended(node) + case Child: + return ce.isDirectChild(node) + case NextSibling: + return ce.isNextSibling(node) + case SubsequentSibling: + return ce.isSubsequentSibling(node) + case NoneCombinator: + return matchNode(node, ce.Selector2.selectorName, ce.Selector2.selectorType) + } + return false +} + +// isDescended returns wether the given node is a ce.Selector2 and descended of 
ce.Selector1. +func (ce *CombinatorEl) isDescended(node *Node) bool { + if !matchNode(node, ce.Selector2.selectorName, ce.Selector2.selectorType) { + return false + } + + parentNode := node.GetParent() + for parentNode != nil && !matchNode(parentNode, ce.Selector1.selectorName, ce.Selector1.selectorType) { + parentNode = parentNode.GetParent() + } + return parentNode != nil +} + +// isDirectChild returns whether the given node is a direct child of ce.Selector1 and node is of ce.Selector2 +func (ce *CombinatorEl) isDirectChild(node *Node) bool { + if node == nil { + return false + } + + return matchNode(node, ce.Selector2.selectorName, ce.Selector2.selectorType) && matchNode(node.GetParent(), ce.Selector1.selectorName, ce.Selector1.selectorType) +} + +// isNextSibling return whether the given node is of ce.Selector2 and next sibling of ce.Selector1 +func (ce *CombinatorEl) isNextSibling(node *Node) bool { + if node == nil { + return false + } + + return matchNode(node, ce.Selector2.selectorName, ce.Selector2.selectorType) && matchNode(node.GetPreviousNode(), ce.Selector1.selectorName, ce.Selector1.selectorType) +} + +func (ce *CombinatorEl) isSubsequentSibling(node *Node) bool { + if !matchNode(node, ce.Selector2.selector, ce.Selector2.selectorType) { + return false + } + + traverser := NewTraverser(node) + for traverser.GetCurrentNode() != nil && !matchNode(traverser.GetCurrentNode(), ce.Selector1.selector, ce.Selector1.selectorType) { + traverser.Previous() + } + return matchNode(traverser.GetCurrentNode(), ce.Selector1.selector, ce.Selector1.selectorType) +} diff --git a/selectors_test.go b/selectors_test.go new file mode 100644 index 0000000..ec573f8 --- /dev/null +++ b/selectors_test.go @@ -0,0 +1,13 @@ +package GoHtml_test + +import( + "testing" + "github.com/udan-jayanith/GoHTML" +) + +func TestTokenizeSelector(t *testing.T){ + slice := GoHtml.TokenizeSelectorsAndCombinators(".class-1 > .class-2 + .class-3 a") + for _, el := range slice{ + t.Log(el) + } +} \ 
No newline at end of file diff --git a/tokenizer.go b/tokenizer.go index d0905cc..050ef47 100644 --- a/tokenizer.go +++ b/tokenizer.go @@ -8,7 +8,6 @@ import ( "golang.org/x/net/html" ) - // Tokenizer contains a *html.Tokenizer. type Tokenizer struct { z *html.Tokenizer @@ -26,7 +25,7 @@ func (t *Tokenizer) Advanced() html.TokenType { return t.z.Next() } -// CurrentNode returns the current node. +// CurrentNode returns the current node. // Returned value can be nil regardless of tt. func (t *Tokenizer) GetCurrentNode() *Node { currentToken := t.z.Token() @@ -85,7 +84,7 @@ func (ntb *NodeTreeBuilder) WriteNodeTree(node *Node, tt html.TokenType) { if node == nil { return } - + if isTopNode(ntb.currentNode, ntb.stack) { ntb.currentNode.AppendChild(node) } else { @@ -120,83 +119,4 @@ func isTopNode(node *Node, stack *linkedliststack.Stack) bool { topNode := val.(*Node) return topNode == node -} - -// QueryToken types -const ( - Id int = iota - Tag - Class -) - -// QueryToken store data about basic css selectors(ids, classes, tags). -type QueryToken struct { - Type int - SelectorName string - Selector string -} - - /* -TokenizeQuery tokenizes the query and returns a list of QueryToken. - -query should be of only consists of class, tag and/or id. This applies to every function that accepts a parameter name query. -query should not consists of css selectors, Combinators and separators. 
-*/ -func TokenizeQuery(query string) []QueryToken { - slice := make([]QueryToken, 0, 1) - if strings.TrimSpace(query) == "" { - return slice - } - - iter := strings.SplitSeq(query, " ") - for sec := range iter { - token := QueryToken{} - switch sec { - case "", " ", ".", "#": - continue - } - - switch string(sec[0]) { - case ".": - token.Type = Class - token.SelectorName = sec[1:] - case "#": - token.Type = Id - token.SelectorName = sec[1:] - default: - token.Type = Tag - token.SelectorName = sec - } - token.Selector = sec - slice = append(slice, token) - } - - return slice -} - -// matchQueryTokens returns wether the queryTokens match given the node. -func matchQueryTokens(node *Node, queryTokens []QueryToken) bool { - if len(queryTokens) == 0 { - return false - } - classList := NewClassList() - classList.DecodeFrom(node) - for _, token := range queryTokens { - switch token.Type { - case Id: - idName, _ := node.GetAttribute("id") - if token.SelectorName != idName { - return false - } - case Tag: - if node.GetTagName() != token.SelectorName { - return false - } - case Class: - if !classList.Contains(token.SelectorName) { - return false - } - } - } - return true } \ No newline at end of file From 985b84d01ce58a3452a1c996b8c38e828e55d6c6 Mon Sep 17 00:00:00 2001 From: Udan Jayanith Jayakody Date: Sun, 17 Aug 2025 16:02:21 +0530 Subject: [PATCH 15/25] Changed future changelog --- FUTURE-CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FUTURE-CHANGELOG.md b/FUTURE-CHANGELOG.md index 39cb589..91c9ad6 100644 --- a/FUTURE-CHANGELOG.md +++ b/FUTURE-CHANGELOG.md @@ -1,2 +1,2 @@ ## v0.0.3 -- Closest +- Combinators From c16f5fcb012d05e70b8291b832e2e7ca0c2d61f8 Mon Sep 17 00:00:00 2001 From: Udan Jayanith Jayakody Date: Sat, 23 Aug 2025 20:49:26 +0530 Subject: [PATCH 16/25] Add new selectors. Need manual testing. Closest is temparorly commented for devolopment of selectors and combinators. 
Once selectors and combinatore are done testing closest will work again. --- querying.go | 55 ++++++++-------------------------------- querying_test.go | 33 ++---------------------- selectors.go | 65 ++++++++++++++++++++++++++++++------------------ 3 files changed, 54 insertions(+), 99 deletions(-) diff --git a/querying.go b/querying.go index c823995..b3d5039 100644 --- a/querying.go +++ b/querying.go @@ -1,7 +1,7 @@ package GoHtml import ( - //"iter" + "iter" "strings" ) @@ -103,23 +103,14 @@ func (node *Node) GetElementsById(idName string) NodeList { /* QuerySearch tokenizes the query string and search for nodes that matches with the right most query token. After matching right most query it proceeds to match nodes parents nodes for left over tokens and then passed that node to (yield/range). QuerySearch search the whole node tree for matches unless yield get canceled or range iterator get cancel. */ -/* -func QuerySearch(node *Node, query string) iter.Seq[*Node] { + +func QuerySearch(node *Node, selector string) iter.Seq[*Node] { traverser := NewTraverser(node) return func(yield func(node *Node) bool) { - queryTokens := TokenizeQuery(query) + selectorTokens := TokenizeSelectorsAndCombinators(selector) iter := traverser.Walkthrough for node := range iter { - i := matchFromRightMostQueryToken(node, queryTokens, len(queryTokens)-1) - if i == len(queryTokens)-1{ - continue - } - parentNode := node.GetParent() - for parentNode != nil && i>=0 { - i = matchFromRightMostQueryToken(parentNode, queryTokens, i) - parentNode = parentNode.GetParent() - } - if i < 0 && !yield(node){ + if matchFromRightMostSelectors(node, selectorTokens) && !yield(node) { return } } @@ -128,40 +119,17 @@ func QuerySearch(node *Node, query string) iter.Seq[*Node] { } // matchFromRightMostQueryToken tries to match query tokens from right to left and return the index at which point query token last matched. 
-func matchFromRightMostQueryToken(node *Node, queryTokens []QueryToken, i int) int { - classList := NewClassList() - classList.DecodeFrom(node) - checked := make(map[string]struct{}) -outer: - for i >= 0 { - token := queryTokens[i] - _, ok := checked[token.Selector] - if ok { +func matchFromRightMostSelectors(node *Node, selectorTokens []CombinatorEl) bool { + for i := len(selectorTokens) - 1; i >= 0; i-- { + if node == nil { break - } else { - checked[token.Selector] = struct{}{} - } - - switch token.Type { - case Id: - idName, _ := node.GetAttribute("id") - if token.SelectorName != idName { - break outer - } - case Class: - if !classList.Contains(token.SelectorName) { - break outer - } - case Tag: - if node.GetTagName() != token.SelectorName { - break outer - } } - i-- + node = selectorTokens[i].getMatchingNode(node) } - return i + return node != nil } + // QuerySelector only returns the first node that matches with the QuerySearch. func (node *Node) QuerySelector(query string) *Node { iter := QuerySearch(node, query) @@ -182,4 +150,3 @@ func (node *Node) QuerySelectorAll(query string) NodeList { return nodeList } -*/ \ No newline at end of file diff --git a/querying_test.go b/querying_test.go index feb84ea..52a08d0 100644 --- a/querying_test.go +++ b/querying_test.go @@ -138,45 +138,17 @@ func TestGetElementsById(t *testing.T) { } } -/* -func TestSelectorTokenizer(t *testing.T) { - stack := linkedliststack.New() - stack.Push("article .content") - stack.Push("article p h1") - stack.Push("article p") - stack.Push(".title #user") - stack.Push("#user title .title-1") - - for stack.Size() > 0 { - val, _ := stack.Pop() - selector := val.(string) - - tokens := GoHtml.TokenizeQuery(selector) - s := "" - for _, token := range tokens { - if s == "" { - s += token.Selector - } else { - s += " " + token.Selector - } - } - - if s != selector { - t.Fatal("Expected ", selector, "but got", s) - } - } -} - func TestQuerySelector(t *testing.T) { node, err := testFile4NodeTree() 
if err != nil { t.Fatal(err) return } - node = node.QuerySelector("html .ordered-list ol li .ordered-item") + node = node.QuerySelector("html ol li") if node == nil { t.Fatal("Node is nill after QuerySelector") } else if node.GetInnerText() != "Apple" { + t.Log(node) t.Fatal("Unexpected text") } } @@ -211,4 +183,3 @@ func TestQuerySelectorAll(t *testing.T) { } } -*/ \ No newline at end of file diff --git a/selectors.go b/selectors.go index 32aa7ea..4132cd8 100644 --- a/selectors.go +++ b/selectors.go @@ -19,9 +19,9 @@ type Selector struct { } func matchNode(node *Node, basicSelectorName string, basicSelectorType BasicSelector) bool { - if basicSelectorName == ""{ + if basicSelectorName == "" { return true - }else if node == nil { + } else if node == nil { return false } @@ -86,6 +86,7 @@ func TokenizeSelectorsAndCombinators(selector string) []CombinatorEl { slice := strings.SplitSeq(selector, " ") currentCombinator := *new(CombinatorEl) currentCombinator.Selector1 = NewSelector("") + currentCombinator.Type = NoneCombinator for str := range slice { if strings.TrimSpace(str) == "" { continue @@ -115,61 +116,77 @@ func TokenizeSelectorsAndCombinators(selector string) []CombinatorEl { return list } -func (ce *CombinatorEl) IsMatchingNode(node *Node) bool { +func (ce *CombinatorEl) getMatchingNode(node *Node) *Node { switch ce.Type { case Descendant: - return ce.isDescended(node) + return ce.getDescended(node) case Child: - return ce.isDirectChild(node) + return ce.getDirectChild(node) case NextSibling: - return ce.isNextSibling(node) + return ce.getNextSibling(node) case SubsequentSibling: - return ce.isSubsequentSibling(node) + return ce.getSubsequentSibling(node) case NoneCombinator: - return matchNode(node, ce.Selector2.selectorName, ce.Selector2.selectorType) + if matchNode(node, ce.Selector2.selectorName, ce.Selector2.selectorType) { + return node + } } - return false + return nil } // isDescended returns wether the given node is a ce.Selector2 and descended of 
ce.Selector1. -func (ce *CombinatorEl) isDescended(node *Node) bool { +func (ce *CombinatorEl) getDescended(node *Node) *Node { if !matchNode(node, ce.Selector2.selectorName, ce.Selector2.selectorType) { - return false + return nil } parentNode := node.GetParent() - for parentNode != nil && !matchNode(parentNode, ce.Selector1.selectorName, ce.Selector1.selectorType) { + for parentNode != nil { + if matchNode(parentNode, ce.Selector1.selectorName, ce.Selector1.selectorType) { + return parentNode + } parentNode = parentNode.GetParent() } - return parentNode != nil + return nil } // isDirectChild returns whether the given node is a direct child of ce.Selector1 and node is of ce.Selector2 -func (ce *CombinatorEl) isDirectChild(node *Node) bool { +func (ce *CombinatorEl) getDirectChild(node *Node) *Node { if node == nil { - return false + return nil } - return matchNode(node, ce.Selector2.selectorName, ce.Selector2.selectorType) && matchNode(node.GetParent(), ce.Selector1.selectorName, ce.Selector1.selectorType) + if matchNode(node, ce.Selector2.selectorName, ce.Selector2.selectorType) && + matchNode(node.GetParent(), ce.Selector1.selectorName, ce.Selector1.selectorType) { + return node.GetParent() + } + return nil } // isNextSibling return whether the given node is of ce.Selector2 and next sibling of ce.Selector1 -func (ce *CombinatorEl) isNextSibling(node *Node) bool { +func (ce *CombinatorEl) getNextSibling(node *Node) *Node { if node == nil { - return false + return nil } - return matchNode(node, ce.Selector2.selectorName, ce.Selector2.selectorType) && matchNode(node.GetPreviousNode(), ce.Selector1.selectorName, ce.Selector1.selectorType) + if matchNode(node, ce.Selector2.selectorName, ce.Selector2.selectorType) && + matchNode(node.GetPreviousNode(), ce.Selector1.selectorName, ce.Selector1.selectorType) { + return node.GetPreviousNode() + } + return nil } -func (ce *CombinatorEl) isSubsequentSibling(node *Node) bool { - if !matchNode(node, ce.Selector2.selector, 
ce.Selector2.selectorType) { - return false +func (ce *CombinatorEl) getSubsequentSibling(node *Node) *Node { + if node == nil || !matchNode(node, ce.Selector2.selector, ce.Selector2.selectorType) { + return nil } traverser := NewTraverser(node) - for traverser.GetCurrentNode() != nil && !matchNode(traverser.GetCurrentNode(), ce.Selector1.selector, ce.Selector1.selectorType) { + for traverser.GetCurrentNode() != nil { + if matchNode(traverser.GetCurrentNode(), ce.Selector1.selector, ce.Selector1.selectorType){ + return traverser.GetCurrentNode() + } traverser.Previous() } - return matchNode(traverser.GetCurrentNode(), ce.Selector1.selector, ce.Selector1.selectorType) + return nil } From 5b58d2e5027d76d10d4e28fa5b0504ee58faff94 Mon Sep 17 00:00:00 2001 From: Udan Jayanith Jayakody Date: Sun, 24 Aug 2025 01:28:55 +0530 Subject: [PATCH 17/25] Fixed selectos does not ignore html tag case --- selectors.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/selectors.go b/selectors.go index 4132cd8..2158ff6 100644 --- a/selectors.go +++ b/selectors.go @@ -55,7 +55,7 @@ func NewSelector(selector string) Selector { selectorStruct.selectorType = Tag } - selectorStruct.selector = selector + selectorStruct.selector = strings.ToLower(selector) if selectorStruct.selectorType != Tag { selectorStruct.selectorName = selector[1:] } else { From 3286bce606eda32e7873a0493b07f84668ce7e8e Mon Sep 17 00:00:00 2001 From: Udan Jayanith Jayakody Date: Mon, 25 Aug 2025 19:21:18 +0530 Subject: [PATCH 18/25] Fixed a bug in selectors and combinators tokenizer --- selectors.go | 27 ++++++++++++++++++++++----- serializer.go | 15 +-------------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/selectors.go b/selectors.go index 2158ff6..2b112ff 100644 --- a/selectors.go +++ b/selectors.go @@ -2,6 +2,8 @@ package GoHtml import ( "strings" + + "golang.org/x/net/html" ) type BasicSelector int @@ -40,7 +42,7 @@ func matchNode(node *Node, basicSelectorName string, 
basicSelectorType BasicSele } func NewSelector(selector string) Selector { - selector = strings.TrimSpace(selector) + selector = strings.TrimSpace(html.EscapeString(selector)) selectorStruct := Selector{} if len(selector) == 0 || (selector[0] == '.' || selector[0] == '#') && len(selector) <= 1 { return selectorStruct @@ -82,12 +84,27 @@ type CombinatorEl struct { } func TokenizeSelectorsAndCombinators(selector string) []CombinatorEl { + iter := func(yield func(string) bool) { + currentStr := "" + for _, char := range selector { + switch char { + case ' ', '>', '+', '~': + if !yield(currentStr) || !yield(string(char)){ + return + } + currentStr = "" + default: + currentStr+=string(char) + } + } + yield(currentStr) + } + list := make([]CombinatorEl, 0, 1) - slice := strings.SplitSeq(selector, " ") currentCombinator := *new(CombinatorEl) currentCombinator.Selector1 = NewSelector("") currentCombinator.Type = NoneCombinator - for str := range slice { + for str := range iter { if strings.TrimSpace(str) == "" { continue } @@ -164,7 +181,7 @@ func (ce *CombinatorEl) getDirectChild(node *Node) *Node { } // isNextSibling return whether the given node is of ce.Selector2 and next sibling of ce.Selector1 -func (ce *CombinatorEl) getNextSibling(node *Node) *Node { +func (ce *CombinatorEl) getNextSibling(node *Node) *Node { if node == nil { return nil } @@ -183,7 +200,7 @@ func (ce *CombinatorEl) getSubsequentSibling(node *Node) *Node { traverser := NewTraverser(node) for traverser.GetCurrentNode() != nil { - if matchNode(traverser.GetCurrentNode(), ce.Selector1.selector, ce.Selector1.selectorType){ + if matchNode(traverser.GetCurrentNode(), ce.Selector1.selector, ce.Selector1.selectorType) { return traverser.GetCurrentNode() } traverser.Previous() diff --git a/serializer.go b/serializer.go index e618142..f0e2c1e 100644 --- a/serializer.go +++ b/serializer.go @@ -30,26 +30,13 @@ func encodeListAttributes(node *Node) string { return w.String() } -// Encode writes to w encoding of 
rootNode +// Encode writes to w encoding of the node tree from rootNode. func Encode(w io.Writer, rootNode *Node) { type stackFrame struct { node *Node openedTag bool } - /* - traverser := NewTraverser(rootNode) - traverser.Walkthrough(func(node *Node) TraverseCondition { - fmt.Println("+++++++++++++++++++++++++++") - if node.IsTextNode() { - fmt.Println(node.text) - } else { - fmt.Println(node.GetTagName()) - } - return ContinueWalkthrough - }) - */ - stack := linkedliststack.New() stack.Push(stackFrame{node: rootNode, openedTag: false}) From 56bb8fbe07bc930d72331221577d14952473b8da Mon Sep 17 00:00:00 2001 From: Udan Jayanith Jayakody Date: Thu, 28 Aug 2025 20:39:29 +0530 Subject: [PATCH 19/25] Bug fixes --- .vscode/settings.json | 1 + FUTURE-CHANGELOG.md | 3 +- node-tree.go | 13 ++----- querying.go | 17 +++++---- querying_test.go | 80 +++++++++++++++++++++++++------------------ selectors.go | 5 ++- selectors_test.go | 13 +++---- test-files/5.html | 3 +- 8 files changed, 71 insertions(+), 64 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 92d3d14..0a55ac1 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,6 @@ { "cSpell.words": [ + "arraystack", "autoplay", "Combinators", "DOCTYPEDTD", diff --git a/FUTURE-CHANGELOG.md b/FUTURE-CHANGELOG.md index 91c9ad6..4c09990 100644 --- a/FUTURE-CHANGELOG.md +++ b/FUTURE-CHANGELOG.md @@ -1,2 +1,3 @@ ## v0.0.3 -- Combinators +- Write test for (closest, QuerySelector, QuerySelectorAll) +- Serializer bug fix diff --git a/node-tree.go b/node-tree.go index 65ec888..d55450f 100644 --- a/node-tree.go +++ b/node-tree.go @@ -207,18 +207,9 @@ func (node *Node) IsTextNode() bool { return node.GetTagName() == "" } -// Closest traverses the node tree and its parents (heading toward the root node) until it finds a node that matches the full query. -/* -Ex: -query = "div .video-container #div" - -
          - -As shown elements with every described specifiers will only match. -But this is not the case for QuerySearch, QuerySelector and QuerySelectorAll. -*/ +// Closest traverses the node tree and its parents (heading toward the root node) until it finds a node that matches the selector and returns that node. // Adapted from [https://developer.mozilla.org/en-US/docs/Web/API/Element/closest](MDN Element: closest() method) -func (node *Node) Closest(query string) *Node { +func (node *Node) Closest(selector string) *Node { traverser := NewTraverser(node) return traverser.GetCurrentNode() diff --git a/querying.go b/querying.go index b3d5039..ea271f5 100644 --- a/querying.go +++ b/querying.go @@ -101,9 +101,8 @@ func (node *Node) GetElementsById(idName string) NodeList { } /* -QuerySearch tokenizes the query string and search for nodes that matches with the right most query token. After matching right most query it proceeds to match nodes parents nodes for left over tokens and then passed that node to (yield/range). QuerySearch search the whole node tree for matches unless yield get canceled or range iterator get cancel. -*/ - +QuerySearch search returns a iterator that traverse through the node tree from given node and passes nodes that matches the given selector. +*/ func QuerySearch(node *Node, selector string) iter.Seq[*Node] { traverser := NewTraverser(node) return func(yield func(node *Node) bool) { @@ -130,18 +129,18 @@ func matchFromRightMostSelectors(node *Node, selectorTokens []CombinatorEl) bool } -// QuerySelector only returns the first node that matches with the QuerySearch. -func (node *Node) QuerySelector(query string) *Node { - iter := QuerySearch(node, query) +// QuerySelector returns the first node that matches with the selector from the node. 
+func (node *Node) QuerySelector(selector string) *Node { + iter := QuerySearch(node, selector) for node := range iter { return node } return nil } -// QuerySelectorAll stores nodes passed down by QuerySearch in a nodeList and returns the nodeList. -func (node *Node) QuerySelectorAll(query string) NodeList { - iter := QuerySearch(node, query) +// QuerySelectorAll returns a NodeList of the nodes that match the selector from the node. +func (node *Node) QuerySelectorAll(selector string) NodeList { + iter := QuerySearch(node, selector) nodeList := NewNodeList() for node := range iter { diff --git a/querying_test.go b/querying_test.go index 52a08d0..d7add6c 100644 --- a/querying_test.go +++ b/querying_test.go @@ -4,7 +4,7 @@ import ( "os" "testing" - "github.com/emirpasic/gods/stacks/linkedliststack" + Stack "github.com/emirpasic/gods/stacks/arraystack" GoHtml "github.com/udan-jayanith/GoHTML" ) @@ -80,7 +80,7 @@ func TestGetElementsByClassName(t *testing.T) { nodeList := node.GetElementsByClassName("ordered-item") iterator := nodeList.IterNodeList() - stack := linkedliststack.New() + stack := Stack.New() stack.Push("Mango") stack.Push("Orange") stack.Push("Apple") @@ -122,7 +122,7 @@ func TestGetElementsById(t *testing.T) { nodeList := node.GetElementsById("idElement") iter := nodeList.IterNodeList() - stack := linkedliststack.New() + stack := Stack.New() stack.Push("Lorem") stack.Push("") @@ -138,48 +138,62 @@ func TestGetElementsById(t *testing.T) { } } -func TestQuerySelector(t *testing.T) { - node, err := testFile4NodeTree() +func testFile5NodeTree() (*GoHtml.Node, error) { + file, err := os.Open("test-files/5.html") if err != nil { - t.Fatal(err) - return + return nil, err } - node = node.QuerySelector("html ol li") - if node == nil { - t.Fatal("Node is nill after QuerySelector") - } else if node.GetInnerText() != "Apple" { - t.Log(node) - t.Fatal("Unexpected text") + + node, _ := GoHtml.Decode(file) + return node, nil +} + +func TestQuerySelector(t 
*testing.T) { + rootNode, _ := testFile5NodeTree() + if rootNode == nil { + t.Fatal("Node is nil") } + + node := rootNode.QuerySelector("#list .item") + if node == nil { + t.Fatal("Node is nil after querying.") + } else if node.GetInnerText() != "One" { + t.Fatal("Node contains unexpected inner text. Expected One but got", node.GetInnerText()) + } + //TODO: write test for testcases below. + /* + t.Log(rootNode.QuerySelector("body p")) + t.Log(rootNode.QuerySelector("html head > title")) + t.Log(rootNode.QuerySelector("section+ul")) + t.Log(rootNode.QuerySelector(".item~.last-item")) + */ } + func TestQuerySelectorAll(t *testing.T) { - node, err := testFile4NodeTree() + rootNode, err := testFile5NodeTree() if err != nil { t.Fatal(err) - return } - - nodeList := node.QuerySelectorAll(".unordered-list li") - if nodeList.Len() == 0 { - t.Fatal("Node list is empty") - }else if nodeList.Len() != 3{ - t.Fatal("Extra node in the node list.", nodeList.Len()) + nodeList := rootNode.QuerySelectorAll("article h2") + if nodeList.Len() != 2 { + t.Fatal("Expected node list length of 2 but got", nodeList.Len()) } - stack := linkedliststack.New() - stack.Push("Kottue") - stack.Push("Pizza") - stack.Push("Cake") + + stack := Stack.New() + stack.Push("Second Post (Draft)") + stack.Push("First Post") iter := nodeList.IterNodeList() - for node := range iter{ - val, _ := stack.Pop() - str := val.(string) - if node.GetInnerText() != str{ - t.Fatal("Got unexpected text.", "Expected", str, "But got", node.GetInnerText()) - }else{ - t.Log(node.GetInnerText()) + for node := range iter { + if stack.Size() == 0 { + break + } + v, _ := stack.Pop() + str := v.(string) + if str != node.GetInnerText() { + t.Fatal("Unexpected inner text from the node. 
Expected", str, "but got", node.GetInnerText()) } } -} +} diff --git a/selectors.go b/selectors.go index 2b112ff..949daf0 100644 --- a/selectors.go +++ b/selectors.go @@ -2,7 +2,6 @@ package GoHtml import ( "strings" - "golang.org/x/net/html" ) @@ -194,13 +193,13 @@ func (ce *CombinatorEl) getNextSibling(node *Node) *Node { } func (ce *CombinatorEl) getSubsequentSibling(node *Node) *Node { - if node == nil || !matchNode(node, ce.Selector2.selector, ce.Selector2.selectorType) { + if node == nil || !matchNode(node, ce.Selector2.selectorName, ce.Selector2.selectorType) { return nil } traverser := NewTraverser(node) for traverser.GetCurrentNode() != nil { - if matchNode(traverser.GetCurrentNode(), ce.Selector1.selector, ce.Selector1.selectorType) { + if matchNode(traverser.GetCurrentNode(), ce.Selector1.selectorName, ce.Selector1.selectorType) { return traverser.GetCurrentNode() } traverser.Previous() diff --git a/selectors_test.go b/selectors_test.go index ec573f8..d842cf8 100644 --- a/selectors_test.go +++ b/selectors_test.go @@ -1,13 +1,14 @@ package GoHtml_test -import( +import ( "testing" - "github.com/udan-jayanith/GoHTML" + + GoHtml "github.com/udan-jayanith/GoHTML" ) -func TestTokenizeSelector(t *testing.T){ +func TestTokenizeSelector(t *testing.T) { slice := GoHtml.TokenizeSelectorsAndCombinators(".class-1 > .class-2 + .class-3 a") - for _, el := range slice{ - t.Log(el) + if len(slice) != 4 { + t.Fatal("Exacted slice length of", 4, "but got", len(slice)) } -} \ No newline at end of file +} diff --git a/test-files/5.html b/test-files/5.html index 68dca45..b0605bc 100644 --- a/test-files/5.html +++ b/test-files/5.html @@ -12,6 +12,7 @@

          First Post

          Read More +