diff --git a/FUTURE-CHANGELOG.md b/FUTURE-CHANGELOG.md index fd94fb4..39cb589 100644 --- a/FUTURE-CHANGELOG.md +++ b/FUTURE-CHANGELOG.md @@ -1,12 +1,2 @@ -## v0.0.0-beta.3 <- current - -## v0.0.1 -* GetElementByClassName -* GetElementByTagName -* GetElementById -* NodeList -* GetElementsByClassName -* GetElementsByTagName -* GetElementsById -* QuerySelector -* QuerySelectorAll \ No newline at end of file +## v0.0.3 +- Closest diff --git a/README.md b/README.md index 212f53e..221176d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # GoHTML -A powerful and comprehensive HTML parser and DOM manipulation library for Go, bringing JavaScript-like DOM operations to the Go ecosystem. +An HTML parser and serializer for Go. GoHTML tries to keep its semantics similar to the JS-DOM API while keeping the API simple by not forcing the JS-DOM model into GoHTML. Because of this, GoHTML has a node tree model. The GoHTML tokenizer uses the golang.org/x/net/html package for tokenizing in the underlying layer. Therefore, it is the user's responsibility to make sure inputs to GoHTML are UTF-8 encoded. GoHTML allows direct access to the node tree. ## Installation @@ -50,12 +50,10 @@ Heres an example of fetching a website and parsing and then using querying metho ## Changelog Changes, bug fixes and new features in this version. 
- -- add: NodeList -- add: Querying helper functions -- add: ClassList -- bug fix: Empty attribute value parsing bug fixed -- changed: Renamed GetTraverser to NewTraverser +- add: Tokenizer +- add: NodeTreeBuilder +- renamed: QuerySelector to Query +- renamed: QuerySelectorAll to QueryAll ## Documentation diff --git a/benchmarks/benchmark_test.go b/benchmarks/benchmark_test.go index f11e896..77af6ab 100644 --- a/benchmarks/benchmark_test.go +++ b/benchmarks/benchmark_test.go @@ -20,7 +20,7 @@ func TestFetchPostCovers(t *testing.T){ t.Fatal(err) } - nodeList := node.GetElementsByClassName("post-title") + nodeList := node.QueryAll(".sm-feat .clearfix article") t.Log("Got ", nodeList.Len(), " post titles.") iter := nodeList.IterNodeList() for node := range iter{ diff --git a/main.go b/main.go index 9f61c65..5f44d67 100644 --- a/main.go +++ b/main.go @@ -1,6 +1,5 @@ /* -A powerful and comprehensive HTML parser and DOM manipulation library for Go, -bringing JavaScript-like DOM operations to the Go ecosystem. +An HTML parser and serializer for Go. GoHTML tries to keep its semantics similar to the JS-DOM API while keeping the API simple by not forcing the JS-DOM model into GoHTML. Because of this, GoHTML has a node tree model. The GoHTML tokenizer uses the golang.org/x/net/html package for tokenizing in the underlying layer. Therefore, it is the user's responsibility to make sure inputs to GoHTML are UTF-8 encoded. GoHTML allows direct access to the node tree. */ package GoHtml diff --git a/parser.go b/parser.go index 7d958c0..251a7d2 100644 --- a/parser.go +++ b/parser.go @@ -3,69 +3,24 @@ package GoHtml import ( "io" "strings" - - "github.com/emirpasic/gods/stacks/linkedliststack" "golang.org/x/net/html" ) -// Decode reads from rd and create a node-tree. Then returns the root node and an error. If error were to occur it would be SyntaxError. 
-func Decode(r io.Reader) (*Node, error) { - rootNode := CreateTextNode("") - stack := linkedliststack.New() - currentNode := rootNode - z := html.NewTokenizer(r) +// Decode reads from r and creates a node tree. It returns the root node and a nil error. +func Decode(r io.Reader) (*Node, error) { + t := NewTokenizer(r) + nodeTreeBuilder := NewNodeTreeBuilder() for { - tt := z.Next() - if tt == html.ErrorToken { + tt := t.Advanced() + if tt == html.ErrorToken{ break } - currentToken := z.Token() - if strings.TrimSpace(currentToken.Data) == "" { - continue - } - - // token data depend on the token type. - switch currentToken.Type { - case html.EndTagToken: - val, ok := stack.Pop() - if !ok || val == nil { - continue - } - currentNode = val.(*Node) - case html.DoctypeToken, html.StartTagToken, html.SelfClosingTagToken, html.TextToken: - var node *Node - switch currentToken.Type { - case html.TextToken: - node = CreateTextNode(currentToken.Data) - case html.DoctypeToken: - node = CreateNode(DOCTYPEDTD) - node.SetAttribute(currentToken.Data, "") - default: - node = CreateNode(currentToken.Data) - for _, v := range currentToken.Attr { - node.SetAttribute(v.Key, v.Val) - } - } - - if isTopNode(currentNode, stack){ - currentNode.AppendChild(node) - }else{ - currentNode.Append(node) - } - - if !node.IsTextNode() && !IsVoidTag(node.GetTagName()){ - stack.Push(node) - } - currentNode = node - } + nodeTreeBuilder.WriteNodeTree(t.CurrentNode(), tt) } + return nodeTreeBuilder.GetRootNode(), nil - node := rootNode.GetNextNode() - rootNode.RemoveNode() - - return node, nil } // HTMLToNodeTree return html code as a node-tree. If error were to occur it would be SyntaxError. 
@@ -75,12 +30,4 @@ func HTMLToNodeTree(html string) (*Node, error) { return node, err } -func isTopNode(node *Node, stack *linkedliststack.Stack) bool { - val, ok := stack.Peek() - if !ok || val == nil { - return false - } - topNode := val.(*Node) - return topNode == node -} diff --git a/querying.go b/querying.go index 88a6409..8f00960 100644 --- a/querying.go +++ b/querying.go @@ -172,8 +172,8 @@ func matchQueryTokens(node *Node, queryTokens []QueryToken) bool { return true } -// QuerySelector returns the first node that matches with the give node. -func (node *Node) QuerySelector(query string) *Node { +// Query returns the first node that matches the given query. +func (node *Node) Query(query string) *Node { queryTokens := TokenizeQuery(query) traverser := NewTraverser(node) @@ -188,8 +188,8 @@ func (node *Node) QuerySelector(query string) *Node { return res } -// QuerySelectorAll returns a NodeList containing nodes that matched with the given query. -func (node *Node) QuerySelectorAll(query string) NodeList{ +// QueryAll returns a NodeList containing nodes that matched with the given query. 
+func (node *Node) QueryAll(query string) NodeList{ nodeList := NewNodeList() queryTokens := TokenizeQuery(query) traverser := NewTraverser(node) diff --git a/querying_test.go b/querying_test.go index 26afac7..2c22331 100644 --- a/querying_test.go +++ b/querying_test.go @@ -172,7 +172,7 @@ func TestQuerySelector(t *testing.T) { t.Fatal(err) return } - imgEl := node.QuerySelector("img #idElement") + imgEl := node.Query("img #idElement") imgSrc, _ := imgEl.GetAttribute("src") imgAlt, _ := imgEl.GetAttribute("alt") if imgSrc != "" || imgAlt != "" { @@ -187,7 +187,7 @@ func TestQuerySelectorAll(t *testing.T) { return } - nodeList := node.QuerySelectorAll("h2") + nodeList := node.QueryAll("h2") if nodeList.Len() != 2{ t.Fatal("") } diff --git a/serializer_test.go b/serializer_test.go index 97290da..7d39df5 100644 --- a/serializer_test.go +++ b/serializer_test.go @@ -3,10 +3,11 @@ package GoHtml_test import ( "strings" "testing" + "os" GoHtml "github.com/udan-jayanith/GoHTML" ) -func TestEncode(t *testing.T) { +func TestEncode1(t *testing.T) { body := GoHtml.CreateNode("body") h1 := GoHtml.CreateNode("h1") h1.AppendText("This is a heading") @@ -20,4 +21,15 @@ func TestEncode(t *testing.T) { builder1 := &strings.Builder{} GoHtml.Encode(builder1, body) //It's hard compare exacted output. Because strings, prettier formats html code. htmlFormatter and prettier add extra stuffs to the html codes like dash in void tags. Exacted output is in the ./test-files/2.html. +} + +func TestEncode2(t *testing.T) { + file, err := os.Open("./test-files/1.html") + if err != nil { + t.Fatal("1.html does not exists.") + } + node, _ := GoHtml.Decode(file) + var builder strings.Builder + GoHtml.Encode(&builder, node) + //It's hard compare exacted output. Because strings, prettier formats html code. htmlFormatter and prettier add extra stuffs to the html codes like dash in void tags. Exacted output is in the ./test-files/2.html. 
} \ No newline at end of file diff --git a/tokenizer.go b/tokenizer.go new file mode 100644 index 0000000..fde080b --- /dev/null +++ b/tokenizer.go @@ -0,0 +1,123 @@ +package GoHtml + +import ( + "io" + "strings" + + "github.com/emirpasic/gods/stacks/linkedliststack" + "golang.org/x/net/html" +) + + +// Tokenizer contains a *html.Tokenizer. +type Tokenizer struct { + z *html.Tokenizer +} + +// NewTokenizer returns a new Tokenizer. +func NewTokenizer(r io.Reader) Tokenizer { + return Tokenizer{ + z: html.NewTokenizer(r), + } +} + +// Advanced scans the next token and returns its type. +func (t *Tokenizer) Advanced() html.TokenType { + return t.z.Next() +} + +// CurrentNode returns a node built from the current token. +// The returned value can be nil regardless of the token type. +func (t *Tokenizer) CurrentNode() *Node { + currentToken := t.z.Token() + if strings.TrimSpace(currentToken.Data) == "" { + return nil + } + + // Token data depends on the token type. + switch currentToken.Type { + case html.DoctypeToken, html.StartTagToken, html.SelfClosingTagToken, html.TextToken: + var node *Node + switch currentToken.Type { + case html.TextToken: + node = CreateTextNode(currentToken.Data) + case html.DoctypeToken: + node = CreateNode(DOCTYPEDTD) + node.SetAttribute(currentToken.Data, "") + default: + node = CreateNode(currentToken.Data) + for _, v := range currentToken.Attr { + node.SetAttribute(v.Key, v.Val) + } + } + return node + } + return nil +} + +// NodeTreeBuilder is used to build a node tree given a node and its token type. +type NodeTreeBuilder struct { + rootNode *Node + stack *linkedliststack.Stack + currentNode *Node +} + +// NewNodeTreeBuilder returns a new NodeTreeBuilder. 
+func NewNodeTreeBuilder() NodeTreeBuilder { + rootNode := CreateTextNode("") + return NodeTreeBuilder{ + rootNode: rootNode, + currentNode: rootNode, + stack: linkedliststack.New(), + } +} + +// WriteNodeTree appends node to the node tree according to the token type tt. +func (ntb *NodeTreeBuilder) WriteNodeTree(node *Node, tt html.TokenType) { + switch tt { + case html.EndTagToken: + val, ok := ntb.stack.Pop() + if !ok || val == nil { + return + } + ntb.currentNode = val.(*Node) + case html.DoctypeToken, html.StartTagToken, html.SelfClosingTagToken, html.TextToken: + if node == nil { + return + } + + if isTopNode(ntb.currentNode, ntb.stack) { + ntb.currentNode.AppendChild(node) + } else { + ntb.currentNode.Append(node) + } + + if !node.IsTextNode() && !IsVoidTag(node.GetTagName()) { + ntb.stack.Push(node) + } + ntb.currentNode = node + } +} + +// GetRootNode returns the root node of the accumulated node tree and resets the NodeTreeBuilder. +func (ntb *NodeTreeBuilder) GetRootNode() *Node { + node := ntb.rootNode.GetNextNode() + ntb.rootNode.RemoveNode() + + rootNode := CreateTextNode("") + ntb.rootNode = rootNode + ntb.currentNode = rootNode + ntb.stack = linkedliststack.New() + + return node +} + +func isTopNode(node *Node, stack *linkedliststack.Stack) bool { + val, ok := stack.Peek() + if !ok || val == nil { + return false + } + + topNode := val.(*Node) + return topNode == node +}