From 41212d749f6d0878661443d360cb15f9ac040dcb Mon Sep 17 00:00:00 2001 From: Udan Jayanith Date: Fri, 15 Aug 2025 13:06:21 +0530 Subject: [PATCH 1/6] Renamed QuerySelector and QuerySelectorAll to Query and QueryAll and add a wraper arround html tokanizer --- FUTURE-CHANGELOG.md | 14 +-- benchmarks/benchmark_test.go | 2 +- parser.go | 163 +++++++++++++++++++++++------------ querying.go | 8 +- querying_test.go | 4 +- serializer_test.go | 14 ++- 6 files changed, 132 insertions(+), 73 deletions(-) diff --git a/FUTURE-CHANGELOG.md b/FUTURE-CHANGELOG.md index fd94fb4..6513a6e 100644 --- a/FUTURE-CHANGELOG.md +++ b/FUTURE-CHANGELOG.md @@ -1,12 +1,2 @@ -## v0.0.0-beta.3 <- current - -## v0.0.1 -* GetElementByClassName -* GetElementByTagName -* GetElementById -* NodeList -* GetElementsByClassName -* GetElementsByTagName -* GetElementsById -* QuerySelector -* QuerySelectorAll \ No newline at end of file +## v0.0.2 +UTF encoding problem. \ No newline at end of file diff --git a/benchmarks/benchmark_test.go b/benchmarks/benchmark_test.go index f11e896..77af6ab 100644 --- a/benchmarks/benchmark_test.go +++ b/benchmarks/benchmark_test.go @@ -20,7 +20,7 @@ func TestFetchPostCovers(t *testing.T){ t.Fatal(err) } - nodeList := node.GetElementsByClassName("post-title") + nodeList := node.QueryAll(".sm-feat .clearfix article") t.Log("Got ", nodeList.Len(), " post titles.") iter := nodeList.IterNodeList() for node := range iter{ diff --git a/parser.go b/parser.go index 7d958c0..3f67382 100644 --- a/parser.go +++ b/parser.go @@ -8,71 +8,106 @@ import ( "golang.org/x/net/html" ) -// Decode reads from rd and create a node-tree. Then returns the root node and an error. If error were to occur it would be SyntaxError. -func Decode(r io.Reader) (*Node, error) { - rootNode := CreateTextNode("") - stack := linkedliststack.New() - currentNode := rootNode +// Tokenizer contains a *html.Tokenizer. +type Tokenizer struct { + z *html.Tokenizer +} - z := html.NewTokenizer(r) - for { - tt := z.Next() - if tt == html.ErrorToken { - break - } +// NewTokenizer returns a new Tokenizer. +func NewTokenizer(r io.Reader) Tokenizer { + return Tokenizer{ + z: html.NewTokenizer(r), + } +} - currentToken := z.Token() - if strings.TrimSpace(currentToken.Data) == "" { - continue - } +// Advanced scans the next token and returns its type. +func (t *Tokenizer) Advanced() html.TokenType { + return t.z.Next() +} - // token data depend on the token type. +// CurrentNode returns the current node. +func (t *Tokenizer) CurrentNode() *Node { + currentToken := t.z.Token() + if strings.TrimSpace(currentToken.Data) == "" { + return nil + } + + // token data depend on the token type. + switch currentToken.Type { + case html.DoctypeToken, html.StartTagToken, html.SelfClosingTagToken, html.TextToken: + var node *Node switch currentToken.Type { - case html.EndTagToken: - val, ok := stack.Pop() - if !ok || val == nil { - continue - } - currentNode = val.(*Node) - case html.DoctypeToken, html.StartTagToken, html.SelfClosingTagToken, html.TextToken: - var node *Node - switch currentToken.Type { - case html.TextToken: - node = CreateTextNode(currentToken.Data) - case html.DoctypeToken: - node = CreateNode(DOCTYPEDTD) - node.SetAttribute(currentToken.Data, "") - default: - node = CreateNode(currentToken.Data) - for _, v := range currentToken.Attr { - node.SetAttribute(v.Key, v.Val) - } + case html.TextToken: + node = CreateTextNode(currentToken.Data) + case html.DoctypeToken: + node = CreateNode(DOCTYPEDTD) + node.SetAttribute(currentToken.Data, "") + default: + node = CreateNode(currentToken.Data) + for _, v := range currentToken.Attr { + node.SetAttribute(v.Key, v.Val) } + } + return node + } + return nil +} - if isTopNode(currentNode, stack){ - currentNode.AppendChild(node) - }else{ - currentNode.Append(node) - } +// NodeTreeBuilder is used to build a node tree given a node and it's type. +type NodeTreeBuilder struct { + rootNode *Node + stack *linkedliststack.Stack + currentNode *Node +} - if !node.IsTextNode() && !IsVoidTag(node.GetTagName()){ - stack.Push(node) - } - currentNode = node - } +// NewNodeTreeBuilder returns a new NodeTreeBuilder. +func NewNodeTreeBuilder() NodeTreeBuilder { + rootNode := CreateTextNode("") + return NodeTreeBuilder{ + rootNode: rootNode, + currentNode: rootNode, + stack: linkedliststack.New(), } +} - node := rootNode.GetNextNode() - rootNode.RemoveNode() +// WriteNodeTree append the node given html.TokenType +func (ntb *NodeTreeBuilder) WriteNodeTree(node *Node, tt html.TokenType) { + switch tt { + case html.EndTagToken: + val, ok := ntb.stack.Pop() + if !ok || val == nil { + return + } + ntb.currentNode = val.(*Node) + case html.DoctypeToken, html.StartTagToken, html.SelfClosingTagToken, html.TextToken: + if node == nil { + return + } + + if isTopNode(ntb.currentNode, ntb.stack) { + ntb.currentNode.AppendChild(node) + } else { + ntb.currentNode.Append(node) + } - return node, nil + if !node.IsTextNode() && !IsVoidTag(node.GetTagName()) { + ntb.stack.Push(node) + } + ntb.currentNode = node + } } -// HTMLToNodeTree return html code as a node-tree. If error were to occur it would be SyntaxError. -func HTMLToNodeTree(html string) (*Node, error) { - rd := strings.NewReader(html) - node, err := Decode(rd) - return node, err +// GetRootNode returns the root node of the accumulated node tree and resets the NodeTreeBuilder. +func (ntb *NodeTreeBuilder) GetRootNode() *Node { + node := ntb.rootNode.GetNextNode() + ntb.rootNode.RemoveNode() + + rootNode := CreateTextNode("") + ntb.rootNode = rootNode + ntb.currentNode = rootNode + ntb.stack = linkedliststack.New() + + return node } func isTopNode(node *Node, stack *linkedliststack.Stack) bool { @@ -84,3 +119,25 @@ func isTopNode(node *Node, stack *linkedliststack.Stack) bool { topNode := val.(*Node) return topNode == node } + +// Decode reads from rd and create a node-tree. Then returns the root node and nil. +func Decode(r io.Reader) (*Node, error) { + t := NewTokenizer(r) + nodeTreeBuilder := NewNodeTreeBuilder() + for { + tt := t.Advanced() + if tt == html.ErrorToken{ + break + } + + nodeTreeBuilder.WriteNodeTree(t.CurrentNode(), tt) + } + return nodeTreeBuilder.GetRootNode(), nil +} + +// HTMLToNodeTree return html code as a node-tree. If error were to occur it would be SyntaxError. +func HTMLToNodeTree(html string) (*Node, error) { + rd := strings.NewReader(html) + node, err := Decode(rd) + return node, err +} diff --git a/querying.go b/querying.go index 88a6409..8f00960 100644 --- a/querying.go +++ b/querying.go @@ -172,8 +172,8 @@ func matchQueryTokens(node *Node, queryTokens []QueryToken) bool { return true } -// QuerySelector returns the first node that matches with the give node. -func (node *Node) QuerySelector(query string) *Node { +// Query returns the first node that matches with the give query. +func (node *Node) Query(query string) *Node { queryTokens := TokenizeQuery(query) traverser := NewTraverser(node) @@ -188,8 +188,8 @@ func (node *Node) QuerySelector(query string) *Node { return res } -// QuerySelectorAll returns a NodeList containing nodes that matched with the given query. -func (node *Node) QuerySelectorAll(query string) NodeList{ +// QueryAll returns a NodeList containing nodes that matched with the given query. +func (node *Node) QueryAll(query string) NodeList{ nodeList := NewNodeList() queryTokens := TokenizeQuery(query) traverser := NewTraverser(node) diff --git a/querying_test.go b/querying_test.go index 26afac7..2c22331 100644 --- a/querying_test.go +++ b/querying_test.go @@ -172,7 +172,7 @@ func TestQuerySelector(t *testing.T) { t.Fatal(err) return } - imgEl := node.QuerySelector("img #idElement") + imgEl := node.Query("img #idElement") imgSrc, _ := imgEl.GetAttribute("src") imgAlt, _ := imgEl.GetAttribute("alt") if imgSrc != "" || imgAlt != "" { @@ -187,7 +187,7 @@ func TestQuerySelectorAll(t *testing.T) { return } - nodeList := node.QuerySelectorAll("h2") + nodeList := node.QueryAll("h2") if nodeList.Len() != 2{ t.Fatal("") } diff --git a/serializer_test.go b/serializer_test.go index 97290da..7d39df5 100644 --- a/serializer_test.go +++ b/serializer_test.go @@ -3,10 +3,11 @@ package GoHtml_test import ( "strings" "testing" + "os" GoHtml "github.com/udan-jayanith/GoHTML" ) -func TestEncode(t *testing.T) { +func TestEncode1(t *testing.T) { body := GoHtml.CreateNode("body") h1 := GoHtml.CreateNode("h1") h1.AppendText("This is a heading") @@ -20,4 +21,15 @@ func TestEncode(t *testing.T) { builder1 := &strings.Builder{} GoHtml.Encode(builder1, body) //It's hard compare exacted output. Because strings, prettier formats html code. htmlFormatter and prettier add extra stuffs to the html codes like dash in void tags. Exacted output is in the ./test-files/2.html. +} + +func TestEncode2(t *testing.T) { + file, err := os.Open("./test-files/1.html") + if err != nil { + t.Fatal("1.html does not exists.") + } + node, _ := GoHtml.Decode(file) + var builder strings.Builder + GoHtml.Encode(&builder, node) + //It's hard compare exacted output. Because strings, prettier formats html code. htmlFormatter and prettier add extra stuffs to the html codes like dash in void tags. Exacted output is in the ./test-files/2.html. } \ No newline at end of file From cd3f80479767371c581d3a54c99730bf09b5e9e5 Mon Sep 17 00:00:00 2001 From: Udan Jayanith Date: Fri, 15 Aug 2025 13:10:29 +0530 Subject: [PATCH 2/6] Updated readme --- README.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 212f53e..ee2c34a 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # GoHTML A powerful and comprehensive HTML parser and DOM manipulation library for Go, bringing JavaScript-like DOM operations to the Go ecosystem. +Note: GoHTML only support UTF-8. It's users responsibility to make sure input is UTF-8. ## Installation @@ -50,12 +51,10 @@ Heres an example of fetching a website and parsing and then using querying metho ## Changelog Changes, bug fixes and new features in this version. - -- add: NodeList -- add: Querying helper functions -- add: ClassList -- bug fix: Empty attribute value parsing bug fixed -- changed: Renamed GetTraverser to NewTraverser +- add: Tokenizer +- add: NodeTreeBuilder +- renamed: QuerySelector to Query +- renamed: QuerySelectorAll to QueryAll ## Documentation From 3416ffcfd72748436d85209b682456f12e43df91 Mon Sep 17 00:00:00 2001 From: Udan Jayanith Date: Fri, 15 Aug 2025 13:11:50 +0530 Subject: [PATCH 3/6] Updated future-changelog --- FUTURE-CHANGELOG.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/FUTURE-CHANGELOG.md b/FUTURE-CHANGELOG.md index 6513a6e..021e37f 100644 --- a/FUTURE-CHANGELOG.md +++ b/FUTURE-CHANGELOG.md @@ -1,2 +1,4 @@ -## v0.0.2 -UTF encoding problem. \ No newline at end of file +## v0.0.3 +- Parse only +- Decode head +- Closest From 96fffe07e80e984463744af2f56b17317577fbf9 Mon Sep 17 00:00:00 2001 From: Udan Jayanith Date: Fri, 15 Aug 2025 18:09:53 +0530 Subject: [PATCH 4/6] Updated the readme --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index ee2c34a..221176d 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ # GoHTML -A powerful and comprehensive HTML parser and DOM manipulation library for Go, bringing JavaScript-like DOM operations to the Go ecosystem. -Note: GoHTML only support UTF-8. It's users responsibility to make sure input is UTF-8. +A HTML parse and a serializer for Go. GoHTML tries to keep semantic similar to JS-DOM API while trying to keep the API simple by not forcing JS-DOM model into GoHTML. Because of this GoHTML has node tree model. GoHTML tokenizer uses std net/html module for tokenizing in underlining layer. There for it's users responsibility to make sure inputs to GoHTML is UTF-8 encoded. GoHTML allows direct access to the node tree. ## Installation From cc823d6b838ed943e19c632a45e254e23f7188ed Mon Sep 17 00:00:00 2001 From: Udan Jayanith Date: Fri, 15 Aug 2025 18:13:13 +0530 Subject: [PATCH 5/6] Updated main.go file description of GoHTML --- main.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/main.go b/main.go index 9f61c65..5f44d67 100644 --- a/main.go +++ b/main.go @@ -1,6 +1,5 @@ /* -A powerful and comprehensive HTML parser and DOM manipulation library for Go, -bringing JavaScript-like DOM operations to the Go ecosystem. +A HTML parse and a serializer for Go. GoHTML tries to keep semantic similar to JS-DOM API while trying to keep the API simple by not forcing JS-DOM model into GoHTML. Because of this GoHTML has node tree model. GoHTML tokenizer uses std net/html module for tokenizing in underlining layer. There for it's users responsibility to make sure inputs to GoHTML is UTF-8 encoded. GoHTML allows direct access to the node tree. */ package GoHtml From 7147dda00c926add04db113a369dbe12af08532c Mon Sep 17 00:00:00 2001 From: Udan Jayanith Jayakody Date: Fri, 15 Aug 2025 20:51:55 +0530 Subject: [PATCH 6/6] Few changes --- FUTURE-CHANGELOG.md | 2 - parser.go | 116 ++--------------------------------------- tokenizer.go | 123 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 126 insertions(+), 115 deletions(-) create mode 100644 tokenizer.go diff --git a/FUTURE-CHANGELOG.md b/FUTURE-CHANGELOG.md index 021e37f..39cb589 100644 --- a/FUTURE-CHANGELOG.md +++ b/FUTURE-CHANGELOG.md @@ -1,4 +1,2 @@ ## v0.0.3 -- Parse only -- Decode head - Closest diff --git a/parser.go b/parser.go index 3f67382..251a7d2 100644 --- a/parser.go +++ b/parser.go @@ -3,122 +3,9 @@ package GoHtml import ( "io" "strings" - - "github.com/emirpasic/gods/stacks/linkedliststack" "golang.org/x/net/html" ) -// Tokenizer contains a *html.Tokenizer. -type Tokenizer struct { - z *html.Tokenizer -} - -// NewTokenizer returns a new Tokenizer. -func NewTokenizer(r io.Reader) Tokenizer { - return Tokenizer{ - z: html.NewTokenizer(r), - } -} - -// Advanced scans the next token and returns its type. -func (t *Tokenizer) Advanced() html.TokenType { - return t.z.Next() -} - -// CurrentNode returns the current node. -func (t *Tokenizer) CurrentNode() *Node { - currentToken := t.z.Token() - if strings.TrimSpace(currentToken.Data) == "" { - return nil - } - - // token data depend on the token type. - switch currentToken.Type { - case html.DoctypeToken, html.StartTagToken, html.SelfClosingTagToken, html.TextToken: - var node *Node - switch currentToken.Type { - case html.TextToken: - node = CreateTextNode(currentToken.Data) - case html.DoctypeToken: - node = CreateNode(DOCTYPEDTD) - node.SetAttribute(currentToken.Data, "") - default: - node = CreateNode(currentToken.Data) - for _, v := range currentToken.Attr { - node.SetAttribute(v.Key, v.Val) - } - } - return node - } - return nil -} - -// NodeTreeBuilder is used to build a node tree given a node and it's type. -type NodeTreeBuilder struct { - rootNode *Node - stack *linkedliststack.Stack - currentNode *Node -} - -// NewNodeTreeBuilder returns a new NodeTreeBuilder. -func NewNodeTreeBuilder() NodeTreeBuilder { - rootNode := CreateTextNode("") - return NodeTreeBuilder{ - rootNode: rootNode, - currentNode: rootNode, - stack: linkedliststack.New(), - } -} - -// WriteNodeTree append the node given html.TokenType -func (ntb *NodeTreeBuilder) WriteNodeTree(node *Node, tt html.TokenType) { - switch tt { - case html.EndTagToken: - val, ok := ntb.stack.Pop() - if !ok || val == nil { - return - } - ntb.currentNode = val.(*Node) - case html.DoctypeToken, html.StartTagToken, html.SelfClosingTagToken, html.TextToken: - if node == nil { - return - } - - if isTopNode(ntb.currentNode, ntb.stack) { - ntb.currentNode.AppendChild(node) - } else { - ntb.currentNode.Append(node) - } - - if !node.IsTextNode() && !IsVoidTag(node.GetTagName()) { - ntb.stack.Push(node) - } - ntb.currentNode = node - } -} - -// GetRootNode returns the root node of the accumulated node tree and resets the NodeTreeBuilder. -func (ntb *NodeTreeBuilder) GetRootNode() *Node { - node := ntb.rootNode.GetNextNode() - ntb.rootNode.RemoveNode() - - rootNode := CreateTextNode("") - ntb.rootNode = rootNode - ntb.currentNode = rootNode - ntb.stack = linkedliststack.New() - - return node -} - -func isTopNode(node *Node, stack *linkedliststack.Stack) bool { - val, ok := stack.Peek() - if !ok || val == nil { - return false - } - - topNode := val.(*Node) - return topNode == node -} // Decode reads from rd and create a node-tree. Then returns the root node and nil. func Decode(r io.Reader) (*Node, error) { @@ -133,6 +20,7 @@ func Decode(r io.Reader) (*Node, error) { nodeTreeBuilder.WriteNodeTree(t.CurrentNode(), tt) } return nodeTreeBuilder.GetRootNode(), nil + } // HTMLToNodeTree return html code as a node-tree. If error were to occur it would be SyntaxError. @@ -141,3 +29,5 @@ func HTMLToNodeTree(html string) (*Node, error) { node, err := Decode(rd) return node, err } + + diff --git a/tokenizer.go b/tokenizer.go new file mode 100644 index 0000000..fde080b --- /dev/null +++ b/tokenizer.go @@ -0,0 +1,123 @@ +package GoHtml + +import ( + "io" + "strings" + + "github.com/emirpasic/gods/stacks/linkedliststack" + "golang.org/x/net/html" +) + + +// Tokenizer contains a *html.Tokenizer. +type Tokenizer struct { + z *html.Tokenizer +} + +// NewTokenizer returns a new Tokenizer. +func NewTokenizer(r io.Reader) Tokenizer { + return Tokenizer{ + z: html.NewTokenizer(r), + } +} + +// Advanced scans the next token and returns its type. +func (t *Tokenizer) Advanced() html.TokenType { + return t.z.Next() +} + +// CurrentNode returns the current node. +// Returned value can be nil regardless of tt. +func (t *Tokenizer) CurrentNode() *Node { + currentToken := t.z.Token() + if strings.TrimSpace(currentToken.Data) == "" { + return nil + } + + // token data depend on the token type. + switch currentToken.Type { + case html.DoctypeToken, html.StartTagToken, html.SelfClosingTagToken, html.TextToken: + var node *Node + switch currentToken.Type { + case html.TextToken: + node = CreateTextNode(currentToken.Data) + case html.DoctypeToken: + node = CreateNode(DOCTYPEDTD) + node.SetAttribute(currentToken.Data, "") + default: + node = CreateNode(currentToken.Data) + for _, v := range currentToken.Attr { + node.SetAttribute(v.Key, v.Val) + } + } + return node + } + return nil +} + +// NodeTreeBuilder is used to build a node tree given a node and it's type. +type NodeTreeBuilder struct { + rootNode *Node + stack *linkedliststack.Stack + currentNode *Node +} + +// NewNodeTreeBuilder returns a new NodeTreeBuilder. +func NewNodeTreeBuilder() NodeTreeBuilder { + rootNode := CreateTextNode("") + return NodeTreeBuilder{ + rootNode: rootNode, + currentNode: rootNode, + stack: linkedliststack.New(), + } +} + +// WriteNodeTree append the node given html.TokenType +func (ntb *NodeTreeBuilder) WriteNodeTree(node *Node, tt html.TokenType) { + switch tt { + case html.EndTagToken: + val, ok := ntb.stack.Pop() + if !ok || val == nil { + return + } + ntb.currentNode = val.(*Node) + case html.DoctypeToken, html.StartTagToken, html.SelfClosingTagToken, html.TextToken: + if node == nil { + return + } + + if isTopNode(ntb.currentNode, ntb.stack) { + ntb.currentNode.AppendChild(node) + } else { + ntb.currentNode.Append(node) + } + + if !node.IsTextNode() && !IsVoidTag(node.GetTagName()) { + ntb.stack.Push(node) + } + ntb.currentNode = node + } +} + +// GetRootNode returns the root node of the accumulated node tree and resets the NodeTreeBuilder. +func (ntb *NodeTreeBuilder) GetRootNode() *Node { + node := ntb.rootNode.GetNextNode() + ntb.rootNode.RemoveNode() + + rootNode := CreateTextNode("") + ntb.rootNode = rootNode + ntb.currentNode = rootNode + ntb.stack = linkedliststack.New() + + return node +} + +func isTopNode(node *Node, stack *linkedliststack.Stack) bool { + val, ok := stack.Peek() + if !ok || val == nil { + return false + } + + topNode := val.(*Node) + return topNode == node +}