First Post
+This is some tricky HTML code.
+ Read More +diff --git a/.vscode/settings.json b/.vscode/settings.json index 5e89b44..4b7feb1 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,11 +1,14 @@ { "cSpell.words": [ + "arraystack", "autoplay", + "Combinators", "DOCTYPEDTD", "emirpasic", "gohtml", "Kottue", "linkedliststack", + "println", "yosssi" ] } diff --git a/FUTURE-CHANGELOG.md b/FUTURE-CHANGELOG.md index 39cb589..8b13789 100644 --- a/FUTURE-CHANGELOG.md +++ b/FUTURE-CHANGELOG.md @@ -1,2 +1 @@ -## v0.0.3 -- Closest + diff --git a/README.md b/README.md index dafb5af..3f0f89d 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,9 @@ import ( - Querying ## Example + Heres an example of fetching a website and parsing and then using querying methods. + ```go res, err := http.Get("https://www.metalsucks.net/") if err != nil { @@ -34,27 +36,17 @@ Heres an example of fetching a website and parsing and then using querying metho } defer res.Body.Close() - //Parses the given html reader and then returns the root node and an error. node, err := GoHtml.Decode(res.Body) if err != nil { t.Fatal(err) } - nodeList := node.GetElementsByClassName("post-title") - iter := nodeList.IterNodeList() - for node := range iter{ - print(node.GetInnerText()) + nodeList := node.QuerySelectorAll(".left-content article .post-title") + for node := range nodeList.IterNodeList(){ + println(node.GetInnerText()) } ``` -## Changelog - -Changes, bug fixes and new features in this version. -- add: Tokenizer -- add: NodeTreeBuilder -- renamed: QuerySelector to Query -- renamed: QuerySelectorAll to QueryAll - ## Documentation Fully fledged [documentation](https://pkg.go.dev/github.com/udan-jayanith/GoHTML) is available at [go.pkg](https://pkg.go.dev/) diff --git a/benchmarks/benchmark_test.go b/benchmarks/benchmark_test.go index 77af6ab..394c5d0 100644 --- a/benchmarks/benchmark_test.go +++ b/benchmarks/benchmark_test.go @@ -6,7 +6,9 @@ import( "net/http" "time" ) - +/* +Adapted from [GoQuery example](https://github.com/PuerkitoBio/goquery?tab=readme-ov-file#examples) +*/ func TestFetchPostCovers(t *testing.T){ res, err := http.Get("https://www.metalsucks.net/") if err != nil { @@ -20,7 +22,7 @@ func TestFetchPostCovers(t *testing.T){ t.Fatal(err) } - nodeList := node.QueryAll(".sm-feat .clearfix article") + nodeList := node.QuerySelectorAll(".left-content article .post-title") t.Log("Got ", nodeList.Len(), " post titles.") iter := nodeList.IterNodeList() for node := range iter{ diff --git a/classList.go b/classList.go index ff31544..daa0858 100644 --- a/classList.go +++ b/classList.go @@ -68,7 +68,7 @@ func (classList ClassList) Encode() string { return classes } -// EncodeTo encode className for the node. +// EncodeTo encodes classNames for the node. // If node is nil EncodeTo does nothing. func (classList ClassList) EncodeTo(node *Node){ if node == nil { diff --git a/classList_test.go b/classList_test.go index 3568bf0..7c86534 100644 --- a/classList_test.go +++ b/classList_test.go @@ -1,31 +1,63 @@ package GoHtml_test -import( +import ( + "fmt" "testing" - "github.com/udan-jayanith/GoHTML" + + GoHtml "github.com/udan-jayanith/GoHTML" ) -func TestClasses(t *testing.T){ +func TestClasses(t *testing.T) { node := GoHtml.CreateNode("div") node.SetAttribute("class", "div-container main") classList := GoHtml.NewClassList() classList.DecodeFrom(node) - if !classList.Contains("main"){ + if !classList.Contains("main") { t.Fatal("") return } classList.DeleteClass("main") - if classList.Contains("main"){ + if classList.Contains("main") { t.Fatal("") return } classList.AppendClass("main-div") - if !classList.Contains("main-div"){ + if !classList.Contains("main-div") { t.Fatal("") return } classList.EncodeTo(node) +} + +func ExampleClassList_Contains() { + //Creates a div that has classes video-container and main-contents + div := GoHtml.CreateNode("div") + div.SetAttribute("class", "video-container main-contents") + + classList := GoHtml.NewClassList() + //Add the classes in the div to the class list + classList.DecodeFrom(div) + + //Checks wether the following classes exists in the classList + fmt.Println(classList.Contains("container")) + fmt.Println(classList.Contains("video-container")) + + //Output: + //false + //true +} + +func ExampleClassList_Encode(){ + classList := GoHtml.NewClassList() + + //Add classes to the class list + classList.AppendClass("container") + classList.AppendClass("warper") + classList.AppendClass("main-content") + + //This would output something like this "warper container main-content". Order of the output is not guaranteed. + fmt.Println(classList.Encode()) } \ No newline at end of file diff --git a/node-list.go b/node-list.go index 3cb4d29..c232677 100644 --- a/node-list.go +++ b/node-list.go @@ -5,7 +5,7 @@ import ( "iter" ) -//NodeList can store nodes by appended order. +//NodeList can store nodes by appended order and can iterate over the node list by invoking IterNodeList method. type NodeList struct { list *list.List currentEl *list.Element diff --git a/node-list_test.go b/node-list_test.go index e121e24..1a8cf89 100644 --- a/node-list_test.go +++ b/node-list_test.go @@ -1,6 +1,7 @@ package GoHtml_test import ( + "fmt" "os" "testing" @@ -13,6 +14,7 @@ func TestIterNodeList1(t *testing.T) { t.Fatal(err) return } + defer file.Close() node, err := GoHtml.Decode(file) if err != nil { @@ -39,4 +41,20 @@ func TestIterNodeList2(t *testing.T){ for node := range iter{ t.Log(node) } +} + +func ExampleNodeList(){ + nodeList := GoHtml.NewNodeList() + nodeList.Append(GoHtml.CreateNode("br")) + nodeList.Append(GoHtml.CreateNode("hr")) + nodeList.Append(GoHtml.CreateNode("div")) + + iter := nodeList.IterNodeList() + for node := range iter{ + fmt.Println(node.GetTagName()) + } + //Output: + //br + //hr + //div } \ No newline at end of file diff --git a/node-tree.go b/node-tree.go index 759a0a7..1ddfb88 100644 --- a/node-tree.go +++ b/node-tree.go @@ -2,6 +2,7 @@ package GoHtml import ( "strings" + "golang.org/x/net/html" ) @@ -38,7 +39,7 @@ func (node *Node) SetPreviousNode(previousNode *Node) { node.previousNode = previousNode } -// GetChildNode returns the first child elements of this node. +// GetChildNode returns the first child node of this node. func (node *Node) GetChildNode() *Node { return node.childNode } @@ -75,7 +76,7 @@ func (node *Node) GetAttribute(attributeName string) (string, bool) { // RemoveAttribute remove or delete the specified attribute. func (node *Node) RemoveAttribute(attributeName string) { delete(node.attributes, strings.TrimSpace(strings.ToLower(attributeName))) - + } // IterateAttributes calls callback at every attribute in the node by passing attribute and value of the node. @@ -114,6 +115,7 @@ func (node *Node) AppendChild(childNode *Node) { lastNode := node.GetChildNode().GetLastNode() childNode.SetPreviousNode(lastNode) + childNode.setParentNode(lastNode.GetParent()) lastNode.SetNextNode(childNode) } @@ -121,12 +123,13 @@ func (node *Node) AppendChild(childNode *Node) { func (node *Node) Append(newNode *Node) { lastNode := node.GetLastNode() newNode.SetPreviousNode(lastNode) + newNode.setParentNode(lastNode.GetParent()) lastNode.SetNextNode(newNode) } // GetParent returns a pointer to the parent node. func (node *Node) GetParent() *Node { - return node.GetFirstNode().getParentNode() + return node.parentNode } // GetLastNode returns the last node in the node branch. @@ -203,3 +206,21 @@ func (node *Node) RemoveNode() { func (node *Node) IsTextNode() bool { return node.GetTagName() == "" } + +// Closest traverses the node tree and its parents (heading toward the root node) until it finds a node that matches the selector and returns that node. +// Adapted from [https://developer.mozilla.org/en-US/docs/Web/API/Element/closest](MDN Element: closest() method) +func (node *Node) Closest(selector string) *Node { + traverser := NewTraverser(node) + selectors := TokenizeSelectorsAndCombinators(selector) + + for traverser.GetCurrentNode() != nil { + if matchFromRightMostSelectors(traverser.GetCurrentNode(), selectors) { + break + } else if traverser.GetCurrentNode().GetPreviousNode() == nil { + traverser.SetCurrentNodeTo(traverser.GetCurrentNode().GetParent()) + } else { + traverser.Previous() + } + } + return traverser.GetCurrentNode() +} diff --git a/node-tree_test.go b/node-tree_test.go index 3ab4a01..973c0e5 100644 --- a/node-tree_test.go +++ b/node-tree_test.go @@ -112,4 +112,24 @@ func TestRemoveNode(t *testing.T){ //p.RemoveNode() //t.Log(GoHtml.NodeTreeToHTML(article)) +} + +func TestClosest(t *testing.T){ + node, err := testFile4NodeTree() + if err != nil{ + t.Fatal(err) + } + node = node.GetElementByClassName("ordered-item") + if node == nil { + t.Fatal("Node is nil.") + } + + node = node.Closest("img+.ordered-list") + if node == nil { + t.Fatal("Node is nil") + }else if node.GetTagName() != "ol"{ + t.Fatal("Unexpected element.") + } + + } \ No newline at end of file diff --git a/parser.go b/parser.go index 251a7d2..11a72c3 100644 --- a/parser.go +++ b/parser.go @@ -3,24 +3,23 @@ package GoHtml import ( "io" "strings" + "golang.org/x/net/html" ) - // Decode reads from rd and create a node-tree. Then returns the root node and nil. func Decode(r io.Reader) (*Node, error) { t := NewTokenizer(r) nodeTreeBuilder := NewNodeTreeBuilder() for { tt := t.Advanced() - if tt == html.ErrorToken{ + if tt == html.ErrorToken { break } - nodeTreeBuilder.WriteNodeTree(t.CurrentNode(), tt) + nodeTreeBuilder.WriteNodeTree(t.GetCurrentNode(), tt) } return nodeTreeBuilder.GetRootNode(), nil - } // HTMLToNodeTree return html code as a node-tree. If error were to occur it would be SyntaxError. @@ -30,4 +29,3 @@ func HTMLToNodeTree(html string) (*Node, error) { return node, err } - diff --git a/parser_test.go b/parser_test.go index 9f2d38f..33c054e 100644 --- a/parser_test.go +++ b/parser_test.go @@ -1,6 +1,7 @@ package GoHtml_test import ( + "fmt" "os" "strings" "testing" @@ -14,6 +15,7 @@ func TestDecode(t *testing.T) { t.Fatal(err) return } + defer file.Close() node, err := GoHtml.Decode(file) if err != nil { @@ -24,3 +26,31 @@ func TestDecode(t *testing.T) { var builder strings.Builder GoHtml.Encode(&builder, node) } + +func ExampleDecode() { + r := strings.NewReader(` + + +
+ + +udanjayanith@gmail.com
+Joined: 01/08/2024
+ + + `) + + rootNode, _ := GoHtml.Decode(r) + + titleNode := rootNode.QuerySelector("title") + title := "" + if titleNode != nil { + title = titleNode.GetInnerText() + } + fmt.Println(title) + //Output: User Profile +} diff --git a/querying.go b/querying.go index 8f00960..ea271f5 100644 --- a/querying.go +++ b/querying.go @@ -1,6 +1,7 @@ package GoHtml import ( + "iter" "strings" ) @@ -99,105 +100,52 @@ func (node *Node) GetElementsById(idName string) NodeList { return nodeList } -// Selector types -const ( - Id int = iota - Tag - Class -) - -// QueryToken store data about basic css selectors(ids, classes, tags). -type QueryToken struct { - Type int - SelectorName string - Selector string -} - -// TokenizeQuery tokenizes the query and returns a list of QueryToken. -func TokenizeQuery(query string) []QueryToken { - slice := make([]QueryToken, 0, 1) - if strings.TrimSpace(query) == "" { - return slice - } - - iter := strings.SplitSeq(query, " ") - for sec := range iter { - token := QueryToken{} - switch sec { - case "", " ", ".", "#": - continue +/* +QuerySearch search returns a iterator that traverse through the node tree from given node and passes nodes that matches the given selector. +*/ +func QuerySearch(node *Node, selector string) iter.Seq[*Node] { + traverser := NewTraverser(node) + return func(yield func(node *Node) bool) { + selectorTokens := TokenizeSelectorsAndCombinators(selector) + iter := traverser.Walkthrough + for node := range iter { + if matchFromRightMostSelectors(node, selectorTokens) && !yield(node) { + return + } } - switch string(sec[0]) { - case ".": - token.Type = Class - token.SelectorName = sec[1:] - case "#": - token.Type = Id - token.SelectorName = sec[1:] - default: - token.Type = Tag - token.SelectorName = sec - } - token.Selector = sec - slice = append(slice, token) } - - return slice } -func matchQueryTokens(node *Node, queryTokens []QueryToken) bool { - if len(queryTokens) == 0 { - return false - } - classList := NewClassList() - classList.DecodeFrom(node) - for _, token := range queryTokens { - switch token.Type { - case Id: - idName, _ := node.GetAttribute("id") - if token.SelectorName != idName { - return false - } - case Tag: - if node.GetTagName() != token.SelectorName { - return false - } - case Class: - if !classList.Contains(token.SelectorName) { - return false - } +// matchFromRightMostQueryToken tries to match query tokens from right to left and return the index at which point query token last matched. +func matchFromRightMostSelectors(node *Node, selectorTokens []CombinatorEl) bool { + for i := len(selectorTokens) - 1; i >= 0; i-- { + if node == nil { + break } + node = selectorTokens[i].getMatchingNode(node) } - return true + return node != nil } -// Query returns the first node that matches with the give query. -func (node *Node) Query(query string) *Node { - queryTokens := TokenizeQuery(query) - traverser := NewTraverser(node) - var res *Node - traverser.Walkthrough(func(node *Node) TraverseCondition { - if matchQueryTokens(node, queryTokens) { - res = node - return StopWalkthrough - } - return ContinueWalkthrough - }) - return res +// QuerySelector returns the first node that matches with the selector from the node. +func (node *Node) QuerySelector(selector string) *Node { + iter := QuerySearch(node, selector) + for node := range iter { + return node + } + return nil } -// QueryAll returns a NodeList containing nodes that matched with the given query. -func (node *Node) QueryAll(query string) NodeList{ +// QuerySelectorAll returns a NodeList that has node that matches the selector form the node. +func (node *Node) QuerySelectorAll(selector string) NodeList { + iter := QuerySearch(node, selector) nodeList := NewNodeList() - queryTokens := TokenizeQuery(query) - traverser := NewTraverser(node) - for node := range traverser.Walkthrough{ - if matchQueryTokens(node, queryTokens) { - nodeList.Append(node) - } + for node := range iter { + nodeList.Append(node) } return nodeList -} \ No newline at end of file +} + diff --git a/querying_test.go b/querying_test.go index 2c22331..f678ad3 100644 --- a/querying_test.go +++ b/querying_test.go @@ -1,10 +1,12 @@ package GoHtml_test import ( + "fmt" + "net/http" "os" "testing" - "github.com/emirpasic/gods/stacks/linkedliststack" + Stack "github.com/emirpasic/gods/stacks/arraystack" GoHtml "github.com/udan-jayanith/GoHTML" ) @@ -13,6 +15,7 @@ func testFile4NodeTree() (*GoHtml.Node, error) { if err != nil { return nil, err } + defer file.Close() node, err := GoHtml.Decode(file) return node, err @@ -24,6 +27,7 @@ func TestGetElementByID(t *testing.T) { t.Fatal(err) return } + defer file.Close() node, err := GoHtml.Decode(file) if err != nil { @@ -80,7 +84,7 @@ func TestGetElementsByClassName(t *testing.T) { nodeList := node.GetElementsByClassName("ordered-item") iterator := nodeList.IterNodeList() - stack := linkedliststack.New() + stack := Stack.New() stack.Push("Mango") stack.Push("Orange") stack.Push("Apple") @@ -122,7 +126,7 @@ func TestGetElementsById(t *testing.T) { nodeList := node.GetElementsById("idElement") iter := nodeList.IterNodeList() - stack := linkedliststack.New() + stack := Stack.New() stack.Push("Lorem") stack.Push("") @@ -138,57 +142,97 @@ func TestGetElementsById(t *testing.T) { } } -func TestSelectorTokenizer(t *testing.T) { - stack := linkedliststack.New() - stack.Push("article .content") - stack.Push("article p h1") - stack.Push("article p") - stack.Push(".title #user") - stack.Push("#user title .title-1") - - for stack.Size() > 0 { - val, _ := stack.Pop() - selector := val.(string) - - tokens := GoHtml.TokenizeQuery(selector) - s := "" - for _, token := range tokens { - if s == "" { - s += token.Selector - } else { - s += " " + token.Selector - } - } - - if s != selector { - t.Fatal("Expected ", selector, "but got", s) - } +func testFile5NodeTree() (*GoHtml.Node, error) { + file, err := os.Open("test-files/5.html") + if err != nil { + return nil, err } + defer file.Close() + + node, _ := GoHtml.Decode(file) + return node, nil } func TestQuerySelector(t *testing.T) { - node, err := testFile4NodeTree() + rootNode, _ := testFile5NodeTree() + if rootNode == nil { + t.Fatal("Node is nil") + } + + node := rootNode.QuerySelector("#list .item") + if node == nil { + t.Fatal("Node is nil after querying.") + } else if node.GetInnerText() != "One" { + t.Fatal("Node contains unexpected inner text. Expected One but got", node.GetInnerText()) + } + //TODO: write test for testcases below. + /* + t.Log(rootNode.QuerySelector("body p")) + t.Log(rootNode.QuerySelector("html head > title")) + t.Log(rootNode.QuerySelector("section+ul")) + t.Log(rootNode.QuerySelector(".item~.last-item")) + */ +} + +func TestQuerySelectorAll(t *testing.T) { + rootNode, err := testFile5NodeTree() if err != nil { t.Fatal(err) + } + nodeList := rootNode.QuerySelectorAll("article h2") + if nodeList.Len() != 2 { + t.Fatal("Expected node list length of 2 but got", nodeList.Len()) + } + + stack := Stack.New() + stack.Push("Second Post (Draft)") + stack.Push("First Post") + + iter := nodeList.IterNodeList() + for node := range iter { + if stack.Size() == 0 { + break + } + v, _ := stack.Pop() + str := v.(string) + if str != node.GetInnerText() { + t.Fatal("Unexpected inner text from the node. Expected", str, "but got", node.GetInnerText()) + } + } + +} + +func ExampleNode_QuerySelector() { + res, err := http.Get("https://example.com/") + if err != nil || res.StatusCode != http.StatusOK { return } - imgEl := node.Query("img #idElement") - imgSrc, _ := imgEl.GetAttribute("src") - imgAlt, _ := imgEl.GetAttribute("alt") - if imgSrc != "" || imgAlt != "" { - t.Fatal("") + defer res.Body.Close() + + rootNode, _ := GoHtml.Decode(res.Body) + res.Body.Close() + + title := rootNode.QuerySelector("title") + if title != nil { + fmt.Println(title.GetInnerText()) + //Example Domain } } -func TestQuerySelectorAll(t *testing.T) { - node, err := testFile4NodeTree() - if err != nil { - t.Fatal(err) +func ExampleQuerySearch() { + //Request the html + res, err := http.Get("https://example.com/") + if err != nil || res.StatusCode != http.StatusOK { return } + defer res.Body.Close() + + //Decode the html + rootNode, _ := GoHtml.Decode(res.Body) - nodeList := node.QueryAll("h2") - if nodeList.Len() != 2{ - t.Fatal("") + //Iterate over every node that matches the query. + for node := range GoHtml.QuerySearch(rootNode, ".event-columns .column .event-block h4 a") { + //Convert the node and it's children nodes to text html and print it. + fmt.Println(GoHtml.NodeTreeToHTML(node)) } } diff --git a/selectors.go b/selectors.go new file mode 100644 index 0000000..7f317ec --- /dev/null +++ b/selectors.go @@ -0,0 +1,214 @@ +package GoHtml + +import ( + "strings" + "golang.org/x/net/html" +) + +type BasicSelector int + +const ( + Id BasicSelector = iota + Class + Tag +) + +//Selector struct represents a single css selector +//Ex: .my-class, #video, div +type Selector struct { + selector string + selectorName string + selectorType BasicSelector +} + +func matchNode(node *Node, basicSelectorName string, basicSelectorType BasicSelector) bool { + if basicSelectorName == "" { + return true + } else if node == nil { + return false + } + + switch basicSelectorType { + case Id: + idName, _ := node.GetAttribute("id") + return idName == basicSelectorName + case Class: + classList := NewClassList() + classList.DecodeFrom(node) + return classList.Contains(basicSelectorName) + case Tag: + return node.GetTagName() == basicSelectorName + } + return false +} + +//NewSelector takes a single css selector and returns a Selector struct. +//Selector string should be only of basic selector. +func NewSelector(selector string) Selector { + selector = strings.TrimSpace(html.EscapeString(selector)) + selectorStruct := Selector{} + if len(selector) == 0 || (selector[0] == '.' || selector[0] == '#') && len(selector) <= 1 { + return selectorStruct + } + + switch selector[0] { + case '.': + selectorStruct.selectorType = Class + case '#': + selectorStruct.selectorType = Id + default: + selectorStruct.selectorType = Tag + } + + selectorStruct.selector = strings.ToLower(selector) + if selectorStruct.selectorType != Tag { + selectorStruct.selectorName = selector[1:] + } else { + selectorStruct.selectorName = selector + } + return selectorStruct +} + +type Combinator int + +const ( + Descendant Combinator = iota + Child + NextSibling + SubsequentSibling + //if no combinator + NoneCombinator +) + +//CombinatorEl is used to represent selectors that are around a combinator. +type CombinatorEl struct { + Type Combinator + Selector1 Selector + Selector2 Selector +} + +//This takes a selector or combinators and selectors and then returns a slice of CombinatorEl. +func TokenizeSelectorsAndCombinators(selector string) []CombinatorEl { + iter := func(yield func(string) bool) { + currentStr := "" + for _, char := range selector { + switch char { + case ' ', '>', '+', '~': + if !yield(currentStr) || !yield(string(char)){ + return + } + currentStr = "" + default: + currentStr+=string(char) + } + } + yield(currentStr) + } + + list := make([]CombinatorEl, 0, 1) + currentCombinator := *new(CombinatorEl) + currentCombinator.Selector1 = NewSelector("") + currentCombinator.Type = NoneCombinator + for str := range iter { + if strings.TrimSpace(str) == "" { + continue + } + + switch str { + case "+": + currentCombinator.Type = NextSibling + case ">": + currentCombinator.Type = Child + case "~": + currentCombinator.Type = SubsequentSibling + default: + newSelector := NewSelector(str) + currentCombinator.Selector2 = newSelector + list = append(list, currentCombinator) + currentCombinator = *new(CombinatorEl) + currentCombinator.Selector1 = newSelector + } + + } + + if len(list) == 1 { + list[0].Type = NoneCombinator + } + + return list +} + +func (ce *CombinatorEl) getMatchingNode(node *Node) *Node { + switch ce.Type { + case Descendant: + return ce.getDescended(node) + case Child: + return ce.getDirectChild(node) + case NextSibling: + return ce.getNextSibling(node) + case SubsequentSibling: + return ce.getSubsequentSibling(node) + case NoneCombinator: + if matchNode(node, ce.Selector2.selectorName, ce.Selector2.selectorType) { + return node + } + } + return nil +} + +// isDescended returns wether the given node is a ce.Selector2 and descended of ce.Selector1. +func (ce *CombinatorEl) getDescended(node *Node) *Node { + if !matchNode(node, ce.Selector2.selectorName, ce.Selector2.selectorType) { + return nil + } + + parentNode := node.GetParent() + for parentNode != nil { + if matchNode(parentNode, ce.Selector1.selectorName, ce.Selector1.selectorType) { + return parentNode + } + parentNode = parentNode.GetParent() + } + return nil +} + +// isDirectChild returns whether the given node is a direct child of ce.Selector1 and node is of ce.Selector2 +func (ce *CombinatorEl) getDirectChild(node *Node) *Node { + if node == nil { + return nil + } + + if matchNode(node, ce.Selector2.selectorName, ce.Selector2.selectorType) && + matchNode(node.GetParent(), ce.Selector1.selectorName, ce.Selector1.selectorType) { + return node.GetParent() + } + return nil +} + +// isNextSibling return whether the given node is of ce.Selector2 and next sibling of ce.Selector1 +func (ce *CombinatorEl) getNextSibling(node *Node) *Node { + if node == nil { + return nil + } + + if matchNode(node, ce.Selector2.selectorName, ce.Selector2.selectorType) && + matchNode(node.GetPreviousNode(), ce.Selector1.selectorName, ce.Selector1.selectorType) { + return node.GetPreviousNode() + } + return nil +} + +func (ce *CombinatorEl) getSubsequentSibling(node *Node) *Node { + if node == nil || !matchNode(node, ce.Selector2.selectorName, ce.Selector2.selectorType) { + return nil + } + + traverser := NewTraverser(node) + for traverser.GetCurrentNode() != nil { + if matchNode(traverser.GetCurrentNode(), ce.Selector1.selectorName, ce.Selector1.selectorType) { + return traverser.GetCurrentNode() + } + traverser.Previous() + } + return nil +} diff --git a/selectors_test.go b/selectors_test.go new file mode 100644 index 0000000..d842cf8 --- /dev/null +++ b/selectors_test.go @@ -0,0 +1,14 @@ +package GoHtml_test + +import ( + "testing" + + GoHtml "github.com/udan-jayanith/GoHTML" +) + +func TestTokenizeSelector(t *testing.T) { + slice := GoHtml.TokenizeSelectorsAndCombinators(".class-1 > .class-2 + .class-3 a") + if len(slice) != 4 { + t.Fatal("Exacted slice length of", 4, "but got", len(slice)) + } +} diff --git a/serializer.go b/serializer.go index e618142..25d8e98 100644 --- a/serializer.go +++ b/serializer.go @@ -9,79 +9,70 @@ import ( "golang.org/x/net/html" ) -func wrapAttributeValue(value string) string { - if isDigit(value) { - return value - } - - return `"` + strings.ReplaceAll(value, `"`, """) + `"` -} - func encodeListAttributes(node *Node) string { w := strings.Builder{} node.IterateAttributes(func(attribute, value string) { - if strings.TrimSpace(attribute) == "" { + if strings.TrimSpace(value) == "" { w.Write(fmt.Appendf(nil, " %s", attribute)) } else { - w.Write(fmt.Appendf(nil, " %s=%s", attribute, wrapAttributeValue(value))) + w.Write(fmt.Appendf(nil, " %s=%s", attribute, `"`+html.EscapeString(value)+`"`)) } }) return w.String() } -// Encode writes to w encoding of rootNode +// Encode writes to w encoding of the node tree from rootNode. func Encode(w io.Writer, rootNode *Node) { - type stackFrame struct { - node *Node - openedTag bool + if rootNode == nil { + return } - /* - traverser := NewTraverser(rootNode) - traverser.Walkthrough(func(node *Node) TraverseCondition { - fmt.Println("+++++++++++++++++++++++++++") - if node.IsTextNode() { - fmt.Println(node.text) - } else { - fmt.Println(node.GetTagName()) - } - return ContinueWalkthrough - }) - */ + type stackFrame struct { + node *Node + isClosingTag bool + } stack := linkedliststack.New() - stack.Push(stackFrame{node: rootNode, openedTag: false}) + stack.Push(stackFrame{ + node: rootNode, + }) for stack.Size() > 0 { - t, _ := stack.Pop() - top := t.(stackFrame) - current := top.node + v, _ := stack.Pop() + currentStackFrame := v.(stackFrame) - if current == nil { + if currentStackFrame.isClosingTag { + fmt.Fprintf(w, "%s>", currentStackFrame.node.GetTagName()) continue + } else if currentStackFrame.node.IsTextNode() { + fmt.Fprint(w, html.EscapeString(currentStackFrame.node.GetText())) + } else { + fmt.Fprintf(w, "<%s%s>", func() string { + tagName := currentStackFrame.node.GetTagName() + tagNameUpperCased := strings.ToUpper(tagName) + if tagNameUpperCased == DOCTYPEDTD { + tagName = tagNameUpperCased + } + return tagName + }(), encodeListAttributes(currentStackFrame.node)) } - tagName := current.GetTagName() - if tagName == "" { - w.Write([]byte(html.EscapeString(current.GetText()))) - } else if IsVoidTag(tagName) { - fmt.Fprintf(w, "<%s%s>", tagName, encodeListAttributes(current)) - if current.GetNextNode() != nil { - stack.Push(stackFrame{node: current.GetNextNode(), openedTag: false}) - } - } else if !top.openedTag { - fmt.Fprintf(w, "<%s%s>", tagName, encodeListAttributes(current)) - stack.Push(stackFrame{node: current, openedTag: true}) - - if current.GetChildNode() != nil { - stack.Push(stackFrame{node: current.GetChildNode(), openedTag: false}) - } - } else { - fmt.Fprintf(w, "%s>", tagName) - if current.GetNextNode() != nil { - stack.Push(stackFrame{node: current.GetNextNode(), openedTag: false}) - } + if currentStackFrame.node.GetNextNode() != nil { + stack.Push(stackFrame{ + node: currentStackFrame.node.GetNextNode(), + }) + } + if !IsVoidTag(currentStackFrame.node.GetTagName()) && !currentStackFrame.node.IsTextNode(){ + stack.Push(stackFrame{ + node: currentStackFrame.node, + isClosingTag: true, + }) + } + if currentStackFrame.node.GetChildNode() != nil { + stack.Push(stackFrame{ + node: currentStackFrame.node.GetChildNode(), + }) } } } diff --git a/serializer_test.go b/serializer_test.go index 7d39df5..795e392 100644 --- a/serializer_test.go +++ b/serializer_test.go @@ -28,6 +28,8 @@ func TestEncode2(t *testing.T) { if err != nil { t.Fatal("1.html does not exists.") } + defer file.Close() + node, _ := GoHtml.Decode(file) var builder strings.Builder GoHtml.Encode(&builder, node) diff --git a/test-files/4.html b/test-files/4.html index 966886b..18a385f 100644 --- a/test-files/4.html +++ b/test-files/4.html @@ -10,7 +10,7 @@Lorem
-This is some tricky HTML code.
+ Read More +tag is a fundamental element used for creating paragraphs in web development. It helps structure content, separating text into distinct blocks. When you wrap text within
...
tags, you tell browsers to treat the enclosed content as a paragraph.") body.AppendChild(p) - traverser := GoHtml.NewTraverser(body) resList := make([]*GoHtml.Node, 0) @@ -26,14 +26,37 @@ func TestWalkthrough(t *testing.T) { testList := []*GoHtml.Node{ body, - h1, + h1, h1.GetChildNode(), p, p.GetChildNode(), } for i := range testList { - if testList[i] != resList[i]{ - t.Fatal("Expected ", testList[i], "but got ", resList[i], "in index ", i ) + if testList[i] != resList[i] { + t.Fatal("Expected ", testList[i], "but got ", resList[i], "in index ", i) } } } + +func ExampleTraverser_Walkthrough() { + //Creation of the node tree. + body := GoHtml.CreateNode("body") + h1 := GoHtml.CreateNode("h1") + h1.AppendText("This is a heading") + body.AppendChild(h1) + p := GoHtml.CreateNode("p") + p.AppendText("The HTMLtag is a fundamental element used for creating paragraphs in web development. It helps structure content, separating text into distinct blocks. When you wrap text within
...
tags, you tell browsers to treat the enclosed content as a paragraph.") + body.AppendChild(p) + + traverser := GoHtml.NewTraverser(body) + + for node := range traverser.Walkthrough { + fmt.Println(node) + } + //or + traverser.Walkthrough(func(node *GoHtml.Node) GoHtml.TraverseCondition { + fmt.Println(node) + return true + }) + +}