diff --git a/.vscode/settings.json b/.vscode/settings.json index 5e89b44..4b7feb1 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,11 +1,14 @@ { "cSpell.words": [ + "arraystack", "autoplay", + "Combinators", "DOCTYPEDTD", "emirpasic", "gohtml", "Kottue", "linkedliststack", + "println", "yosssi" ] } diff --git a/FUTURE-CHANGELOG.md b/FUTURE-CHANGELOG.md index 39cb589..8b13789 100644 --- a/FUTURE-CHANGELOG.md +++ b/FUTURE-CHANGELOG.md @@ -1,2 +1 @@ -## v0.0.3 -- Closest + diff --git a/README.md b/README.md index dafb5af..3f0f89d 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,9 @@ import ( - Querying ## Example + Heres an example of fetching a website and parsing and then using querying methods. + ```go res, err := http.Get("https://www.metalsucks.net/") if err != nil { @@ -34,27 +36,17 @@ Heres an example of fetching a website and parsing and then using querying metho } defer res.Body.Close() - //Parses the given html reader and then returns the root node and an error. node, err := GoHtml.Decode(res.Body) if err != nil { t.Fatal(err) } - nodeList := node.GetElementsByClassName("post-title") - iter := nodeList.IterNodeList() - for node := range iter{ - print(node.GetInnerText()) + nodeList := node.QuerySelectorAll(".left-content article .post-title") + for node := range nodeList.IterNodeList(){ + println(node.GetInnerText()) } ``` -## Changelog - -Changes, bug fixes and new features in this version. 
-- add: Tokenizer -- add: NodeTreeBuilder -- renamed: QuerySelector to Query -- renamed: QuerySelectorAll to QueryAll - ## Documentation Fully fledged [documentation](https://pkg.go.dev/github.com/udan-jayanith/GoHTML) is available at [go.pkg](https://pkg.go.dev/) diff --git a/benchmarks/benchmark_test.go b/benchmarks/benchmark_test.go index 77af6ab..394c5d0 100644 --- a/benchmarks/benchmark_test.go +++ b/benchmarks/benchmark_test.go @@ -6,7 +6,9 @@ import( "net/http" "time" ) - +/* +Adapted from [GoQuery example](https://github.com/PuerkitoBio/goquery?tab=readme-ov-file#examples) +*/ func TestFetchPostCovers(t *testing.T){ res, err := http.Get("https://www.metalsucks.net/") if err != nil { @@ -20,7 +22,7 @@ func TestFetchPostCovers(t *testing.T){ t.Fatal(err) } - nodeList := node.QueryAll(".sm-feat .clearfix article") + nodeList := node.QuerySelectorAll(".left-content article .post-title") t.Log("Got ", nodeList.Len(), " post titles.") iter := nodeList.IterNodeList() for node := range iter{ diff --git a/classList.go b/classList.go index ff31544..daa0858 100644 --- a/classList.go +++ b/classList.go @@ -68,7 +68,7 @@ func (classList ClassList) Encode() string { return classes } -// EncodeTo encode className for the node. +// EncodeTo encodes classNames for the node. // If node is nil EncodeTo does nothing. 
func (classList ClassList) EncodeTo(node *Node){ if node == nil { diff --git a/classList_test.go b/classList_test.go index 3568bf0..7c86534 100644 --- a/classList_test.go +++ b/classList_test.go @@ -1,31 +1,63 @@ package GoHtml_test -import( +import ( + "fmt" "testing" - "github.com/udan-jayanith/GoHTML" + + GoHtml "github.com/udan-jayanith/GoHTML" ) -func TestClasses(t *testing.T){ +func TestClasses(t *testing.T) { node := GoHtml.CreateNode("div") node.SetAttribute("class", "div-container main") classList := GoHtml.NewClassList() classList.DecodeFrom(node) - if !classList.Contains("main"){ + if !classList.Contains("main") { t.Fatal("") return } classList.DeleteClass("main") - if classList.Contains("main"){ + if classList.Contains("main") { t.Fatal("") return } classList.AppendClass("main-div") - if !classList.Contains("main-div"){ + if !classList.Contains("main-div") { t.Fatal("") return } classList.EncodeTo(node) +} + +func ExampleClassList_Contains() { + //Creates a div that has classes video-container and main-contents + div := GoHtml.CreateNode("div") + div.SetAttribute("class", "video-container main-contents") + + classList := GoHtml.NewClassList() + //Add the classes in the div to the class list + classList.DecodeFrom(div) + + //Checks wether the following classes exists in the classList + fmt.Println(classList.Contains("container")) + fmt.Println(classList.Contains("video-container")) + + //Output: + //false + //true +} + +func ExampleClassList_Encode(){ + classList := GoHtml.NewClassList() + + //Add classes to the class list + classList.AppendClass("container") + classList.AppendClass("warper") + classList.AppendClass("main-content") + + //This would output something like this "warper container main-content". Order of the output is not guaranteed. 
+ fmt.Println(classList.Encode()) } \ No newline at end of file diff --git a/node-list.go b/node-list.go index 3cb4d29..c232677 100644 --- a/node-list.go +++ b/node-list.go @@ -5,7 +5,7 @@ import ( "iter" ) -//NodeList can store nodes by appended order. +//NodeList can store nodes by appended order and can iterate over the node list by invoking IterNodeList method. type NodeList struct { list *list.List currentEl *list.Element diff --git a/node-list_test.go b/node-list_test.go index e121e24..1a8cf89 100644 --- a/node-list_test.go +++ b/node-list_test.go @@ -1,6 +1,7 @@ package GoHtml_test import ( + "fmt" "os" "testing" @@ -13,6 +14,7 @@ func TestIterNodeList1(t *testing.T) { t.Fatal(err) return } + defer file.Close() node, err := GoHtml.Decode(file) if err != nil { @@ -39,4 +41,20 @@ func TestIterNodeList2(t *testing.T){ for node := range iter{ t.Log(node) } +} + +func ExampleNodeList(){ + nodeList := GoHtml.NewNodeList() + nodeList.Append(GoHtml.CreateNode("br")) + nodeList.Append(GoHtml.CreateNode("hr")) + nodeList.Append(GoHtml.CreateNode("div")) + + iter := nodeList.IterNodeList() + for node := range iter{ + fmt.Println(node.GetTagName()) + } + //Output: + //br + //hr + //div } \ No newline at end of file diff --git a/node-tree.go b/node-tree.go index 759a0a7..1ddfb88 100644 --- a/node-tree.go +++ b/node-tree.go @@ -2,6 +2,7 @@ package GoHtml import ( "strings" + "golang.org/x/net/html" ) @@ -38,7 +39,7 @@ func (node *Node) SetPreviousNode(previousNode *Node) { node.previousNode = previousNode } -// GetChildNode returns the first child elements of this node. +// GetChildNode returns the first child node of this node. func (node *Node) GetChildNode() *Node { return node.childNode } @@ -75,7 +76,7 @@ func (node *Node) GetAttribute(attributeName string) (string, bool) { // RemoveAttribute remove or delete the specified attribute. 
func (node *Node) RemoveAttribute(attributeName string) { delete(node.attributes, strings.TrimSpace(strings.ToLower(attributeName))) - + } // IterateAttributes calls callback at every attribute in the node by passing attribute and value of the node. @@ -114,6 +115,7 @@ func (node *Node) AppendChild(childNode *Node) { lastNode := node.GetChildNode().GetLastNode() childNode.SetPreviousNode(lastNode) + childNode.setParentNode(lastNode.GetParent()) lastNode.SetNextNode(childNode) } @@ -121,12 +123,13 @@ func (node *Node) AppendChild(childNode *Node) { func (node *Node) Append(newNode *Node) { lastNode := node.GetLastNode() newNode.SetPreviousNode(lastNode) + newNode.setParentNode(lastNode.GetParent()) lastNode.SetNextNode(newNode) } // GetParent returns a pointer to the parent node. func (node *Node) GetParent() *Node { - return node.GetFirstNode().getParentNode() + return node.parentNode } // GetLastNode returns the last node in the node branch. @@ -203,3 +206,21 @@ func (node *Node) RemoveNode() { func (node *Node) IsTextNode() bool { return node.GetTagName() == "" } + +// Closest traverses the node tree and its parents (heading toward the root node) until it finds a node that matches the selector and returns that node. 
+// Adapted from [MDN Element: closest() method](https://developer.mozilla.org/en-US/docs/Web/API/Element/closest) +func (node *Node) Closest(selector string) *Node { + traverser := NewTraverser(node) + selectors := TokenizeSelectorsAndCombinators(selector) + + for traverser.GetCurrentNode() != nil { + if matchFromRightMostSelectors(traverser.GetCurrentNode(), selectors) { + break + } else if traverser.GetCurrentNode().GetPreviousNode() == nil { + traverser.SetCurrentNodeTo(traverser.GetCurrentNode().GetParent()) + } else { + traverser.Previous() + } + } + return traverser.GetCurrentNode() +} diff --git a/node-tree_test.go b/node-tree_test.go index 3ab4a01..973c0e5 100644 --- a/node-tree_test.go +++ b/node-tree_test.go @@ -112,4 +112,24 @@ func TestRemoveNode(t *testing.T){ //p.RemoveNode() //t.Log(GoHtml.NodeTreeToHTML(article)) +} + +func TestClosest(t *testing.T){ + node, err := testFile4NodeTree() + if err != nil{ + t.Fatal(err) + } + node = node.GetElementByClassName("ordered-item") + if node == nil { + t.Fatal("Node is nil.") + } + + node = node.Closest("img+.ordered-list") + if node == nil { + t.Fatal("Node is nil") + }else if node.GetTagName() != "ol"{ + t.Fatal("Unexpected element.") + } + + } \ No newline at end of file diff --git a/parser.go b/parser.go index 251a7d2..11a72c3 100644 --- a/parser.go +++ b/parser.go @@ -3,24 +3,23 @@ package GoHtml import ( "io" "strings" + "golang.org/x/net/html" ) - // Decode reads from rd and create a node-tree. Then returns the root node and nil. func Decode(r io.Reader) (*Node, error) { t := NewTokenizer(r) nodeTreeBuilder := NewNodeTreeBuilder() for { tt := t.Advanced() - if tt == html.ErrorToken{ + if tt == html.ErrorToken { break } - nodeTreeBuilder.WriteNodeTree(t.CurrentNode(), tt) + nodeTreeBuilder.WriteNodeTree(t.GetCurrentNode(), tt) } return nodeTreeBuilder.GetRootNode(), nil - } // HTMLToNodeTree return html code as a node-tree. If error were to occur it would be SyntaxError. 
@@ -30,4 +29,3 @@ func HTMLToNodeTree(html string) (*Node, error) { return node, err } - diff --git a/parser_test.go b/parser_test.go index 9f2d38f..33c054e 100644 --- a/parser_test.go +++ b/parser_test.go @@ -1,6 +1,7 @@ package GoHtml_test import ( + "fmt" "os" "strings" "testing" @@ -14,6 +15,7 @@ func TestDecode(t *testing.T) { t.Fatal(err) return } + defer file.Close() node, err := GoHtml.Decode(file) if err != nil { @@ -24,3 +26,31 @@ func TestDecode(t *testing.T) { var builder strings.Builder GoHtml.Encode(&builder, node) } + +func ExampleDecode() { + r := strings.NewReader(` + + + + + + User Profile + + +

Udan

+

udanjayanith@gmail.com

+

Joined: 01/08/2024

+ + + `) + + rootNode, _ := GoHtml.Decode(r) + + titleNode := rootNode.QuerySelector("title") + title := "" + if titleNode != nil { + title = titleNode.GetInnerText() + } + fmt.Println(title) + //Output: User Profile +} diff --git a/querying.go b/querying.go index 8f00960..ea271f5 100644 --- a/querying.go +++ b/querying.go @@ -1,6 +1,7 @@ package GoHtml import ( + "iter" "strings" ) @@ -99,105 +100,52 @@ func (node *Node) GetElementsById(idName string) NodeList { return nodeList } -// Selector types -const ( - Id int = iota - Tag - Class -) - -// QueryToken store data about basic css selectors(ids, classes, tags). -type QueryToken struct { - Type int - SelectorName string - Selector string -} - -// TokenizeQuery tokenizes the query and returns a list of QueryToken. -func TokenizeQuery(query string) []QueryToken { - slice := make([]QueryToken, 0, 1) - if strings.TrimSpace(query) == "" { - return slice - } - - iter := strings.SplitSeq(query, " ") - for sec := range iter { - token := QueryToken{} - switch sec { - case "", " ", ".", "#": - continue +/* +QuerySearch returns an iterator that traverses the node tree from the given node and yields the nodes that match the given selector. 
+*/ +func QuerySearch(node *Node, selector string) iter.Seq[*Node] { + traverser := NewTraverser(node) + return func(yield func(node *Node) bool) { + selectorTokens := TokenizeSelectorsAndCombinators(selector) + iter := traverser.Walkthrough + for node := range iter { + if matchFromRightMostSelectors(node, selectorTokens) && !yield(node) { + return + } + } - switch string(sec[0]) { - case ".": - token.Type = Class - token.SelectorName = sec[1:] - case "#": - token.Type = Id - token.SelectorName = sec[1:] - default: - token.Type = Tag - token.SelectorName = sec - } - token.Selector = sec - slice = append(slice, token) } - - return slice } -func matchQueryTokens(node *Node, queryTokens []QueryToken) bool { - if len(queryTokens) == 0 { - return false - } - classList := NewClassList() - classList.DecodeFrom(node) - for _, token := range queryTokens { - switch token.Type { - case Id: - idName, _ := node.GetAttribute("id") - if token.SelectorName != idName { - return false - } - case Tag: - if node.GetTagName() != token.SelectorName { - return false - } - case Class: - if !classList.Contains(token.SelectorName) { - return false - } +// matchFromRightMostSelectors walks the selector tokens from right to left and reports whether the node satisfies every one of them. +func matchFromRightMostSelectors(node *Node, selectorTokens []CombinatorEl) bool { + for i := len(selectorTokens) - 1; i >= 0; i-- { + if node == nil { + break } + node = selectorTokens[i].getMatchingNode(node) } - return true + return node != nil } -// Query returns the first node that matches with the give query. 
-func (node *Node) Query(query string) *Node { - queryTokens := TokenizeQuery(query) - traverser := NewTraverser(node) - var res *Node - traverser.Walkthrough(func(node *Node) TraverseCondition { - if matchQueryTokens(node, queryTokens) { - res = node - return StopWalkthrough - } - return ContinueWalkthrough - }) - return res +// QuerySelector returns the first node that matches with the selector from the node. +func (node *Node) QuerySelector(selector string) *Node { + iter := QuerySearch(node, selector) + for node := range iter { + return node + } + return nil } -// QueryAll returns a NodeList containing nodes that matched with the given query. -func (node *Node) QueryAll(query string) NodeList{ +// QuerySelectorAll returns a NodeList containing every node that matches the selector, searching from the node. +func (node *Node) QuerySelectorAll(selector string) NodeList { + iter := QuerySearch(node, selector) nodeList := NewNodeList() - queryTokens := TokenizeQuery(query) - traverser := NewTraverser(node) - for node := range traverser.Walkthrough{ - if matchQueryTokens(node, queryTokens) { - nodeList.Append(node) - } + for node := range iter { + nodeList.Append(node) } return nodeList -} \ No newline at end of file +} + diff --git a/querying_test.go b/querying_test.go index 2c22331..f678ad3 100644 --- a/querying_test.go +++ b/querying_test.go @@ -1,10 +1,12 @@ package GoHtml_test import ( + "fmt" + "net/http" "os" "testing" - "github.com/emirpasic/gods/stacks/linkedliststack" + Stack "github.com/emirpasic/gods/stacks/arraystack" GoHtml "github.com/udan-jayanith/GoHTML" ) @@ -13,6 +15,7 @@ func testFile4NodeTree() (*GoHtml.Node, error) { if err != nil { return nil, err } + defer file.Close() node, err := GoHtml.Decode(file) return node, err @@ -24,6 +27,7 @@ func TestGetElementByID(t *testing.T) { t.Fatal(err) return } + defer file.Close() node, err := GoHtml.Decode(file) if err != nil { @@ -80,7 +84,7 @@ func TestGetElementsByClassName(t *testing.T) { nodeList := 
node.GetElementsByClassName("ordered-item") iterator := nodeList.IterNodeList() - stack := linkedliststack.New() + stack := Stack.New() stack.Push("Mango") stack.Push("Orange") stack.Push("Apple") @@ -122,7 +126,7 @@ func TestGetElementsById(t *testing.T) { nodeList := node.GetElementsById("idElement") iter := nodeList.IterNodeList() - stack := linkedliststack.New() + stack := Stack.New() stack.Push("Lorem") stack.Push("") @@ -138,57 +142,97 @@ func TestGetElementsById(t *testing.T) { } } -func TestSelectorTokenizer(t *testing.T) { - stack := linkedliststack.New() - stack.Push("article .content") - stack.Push("article p h1") - stack.Push("article p") - stack.Push(".title #user") - stack.Push("#user title .title-1") - - for stack.Size() > 0 { - val, _ := stack.Pop() - selector := val.(string) - - tokens := GoHtml.TokenizeQuery(selector) - s := "" - for _, token := range tokens { - if s == "" { - s += token.Selector - } else { - s += " " + token.Selector - } - } - - if s != selector { - t.Fatal("Expected ", selector, "but got", s) - } +func testFile5NodeTree() (*GoHtml.Node, error) { + file, err := os.Open("test-files/5.html") + if err != nil { + return nil, err } + defer file.Close() + + node, _ := GoHtml.Decode(file) + return node, nil } func TestQuerySelector(t *testing.T) { - node, err := testFile4NodeTree() + rootNode, _ := testFile5NodeTree() + if rootNode == nil { + t.Fatal("Node is nil") + } + + node := rootNode.QuerySelector("#list .item") + if node == nil { + t.Fatal("Node is nil after querying.") + } else if node.GetInnerText() != "One" { + t.Fatal("Node contains unexpected inner text. Expected One but got", node.GetInnerText()) + } + //TODO: write test for testcases below. 
+ /* + t.Log(rootNode.QuerySelector("body p")) + t.Log(rootNode.QuerySelector("html head > title")) + t.Log(rootNode.QuerySelector("section+ul")) + t.Log(rootNode.QuerySelector(".item~.last-item")) + */ +} + +func TestQuerySelectorAll(t *testing.T) { + rootNode, err := testFile5NodeTree() if err != nil { t.Fatal(err) + } + nodeList := rootNode.QuerySelectorAll("article h2") + if nodeList.Len() != 2 { + t.Fatal("Expected node list length of 2 but got", nodeList.Len()) + } + + stack := Stack.New() + stack.Push("Second Post (Draft)") + stack.Push("First Post") + + iter := nodeList.IterNodeList() + for node := range iter { + if stack.Size() == 0 { + break + } + v, _ := stack.Pop() + str := v.(string) + if str != node.GetInnerText() { + t.Fatal("Unexpected inner text from the node. Expected", str, "but got", node.GetInnerText()) + } + } + +} + +func ExampleNode_QuerySelector() { + res, err := http.Get("https://example.com/") + if err != nil || res.StatusCode != http.StatusOK { return } - imgEl := node.Query("img #idElement") - imgSrc, _ := imgEl.GetAttribute("src") - imgAlt, _ := imgEl.GetAttribute("alt") - if imgSrc != "" || imgAlt != "" { - t.Fatal("") + defer res.Body.Close() + + rootNode, _ := GoHtml.Decode(res.Body) + res.Body.Close() + + title := rootNode.QuerySelector("title") + if title != nil { + fmt.Println(title.GetInnerText()) + //Example Domain } } -func TestQuerySelectorAll(t *testing.T) { - node, err := testFile4NodeTree() - if err != nil { - t.Fatal(err) +func ExampleQuerySearch() { + //Request the html + res, err := http.Get("https://example.com/") + if err != nil || res.StatusCode != http.StatusOK { return } + defer res.Body.Close() + + //Decode the html + rootNode, _ := GoHtml.Decode(res.Body) - nodeList := node.QueryAll("h2") - if nodeList.Len() != 2{ - t.Fatal("") + //Iterate over every node that matches the query. 
+ for node := range GoHtml.QuerySearch(rootNode, ".event-columns .column .event-block h4 a") { + //Convert the node and it's children nodes to text html and print it. + fmt.Println(GoHtml.NodeTreeToHTML(node)) } } diff --git a/selectors.go b/selectors.go new file mode 100644 index 0000000..7f317ec --- /dev/null +++ b/selectors.go @@ -0,0 +1,214 @@ +package GoHtml + +import ( + "strings" + "golang.org/x/net/html" +) + +type BasicSelector int + +const ( + Id BasicSelector = iota + Class + Tag +) + +//Selector struct represents a single css selector +//Ex: .my-class, #video, div +type Selector struct { + selector string + selectorName string + selectorType BasicSelector +} + +func matchNode(node *Node, basicSelectorName string, basicSelectorType BasicSelector) bool { + if basicSelectorName == "" { + return true + } else if node == nil { + return false + } + + switch basicSelectorType { + case Id: + idName, _ := node.GetAttribute("id") + return idName == basicSelectorName + case Class: + classList := NewClassList() + classList.DecodeFrom(node) + return classList.Contains(basicSelectorName) + case Tag: + return node.GetTagName() == basicSelectorName + } + return false +} + +//NewSelector takes a single css selector and returns a Selector struct. +//Selector string should be only of basic selector. +func NewSelector(selector string) Selector { + selector = strings.TrimSpace(html.EscapeString(selector)) + selectorStruct := Selector{} + if len(selector) == 0 || (selector[0] == '.' 
|| selector[0] == '#') && len(selector) <= 1 { + return selectorStruct + } + + switch selector[0] { + case '.': + selectorStruct.selectorType = Class + case '#': + selectorStruct.selectorType = Id + default: + selectorStruct.selectorType = Tag + } + + selectorStruct.selector = strings.ToLower(selector) + if selectorStruct.selectorType != Tag { + selectorStruct.selectorName = selector[1:] + } else { + selectorStruct.selectorName = selector + } + return selectorStruct +} + +type Combinator int + +const ( + Descendant Combinator = iota + Child + NextSibling + SubsequentSibling + //if no combinator + NoneCombinator +) + +//CombinatorEl is used to represent selectors that are around a combinator. +type CombinatorEl struct { + Type Combinator + Selector1 Selector + Selector2 Selector +} + +//This takes a selector or combinators and selectors and then returns a slice of CombinatorEl. +func TokenizeSelectorsAndCombinators(selector string) []CombinatorEl { + iter := func(yield func(string) bool) { + currentStr := "" + for _, char := range selector { + switch char { + case ' ', '>', '+', '~': + if !yield(currentStr) || !yield(string(char)){ + return + } + currentStr = "" + default: + currentStr+=string(char) + } + } + yield(currentStr) + } + + list := make([]CombinatorEl, 0, 1) + currentCombinator := *new(CombinatorEl) + currentCombinator.Selector1 = NewSelector("") + currentCombinator.Type = NoneCombinator + for str := range iter { + if strings.TrimSpace(str) == "" { + continue + } + + switch str { + case "+": + currentCombinator.Type = NextSibling + case ">": + currentCombinator.Type = Child + case "~": + currentCombinator.Type = SubsequentSibling + default: + newSelector := NewSelector(str) + currentCombinator.Selector2 = newSelector + list = append(list, currentCombinator) + currentCombinator = *new(CombinatorEl) + currentCombinator.Selector1 = newSelector + } + + } + + if len(list) == 1 { + list[0].Type = NoneCombinator + } + + return list +} + +func (ce *CombinatorEl) 
getMatchingNode(node *Node) *Node { + switch ce.Type { + case Descendant: + return ce.getDescended(node) + case Child: + return ce.getDirectChild(node) + case NextSibling: + return ce.getNextSibling(node) + case SubsequentSibling: + return ce.getSubsequentSibling(node) + case NoneCombinator: + if matchNode(node, ce.Selector2.selectorName, ce.Selector2.selectorType) { + return node + } + } + return nil +} + +// getDescended returns the closest ancestor of node that matches ce.Selector1 when node itself matches ce.Selector2; otherwise it returns nil. +func (ce *CombinatorEl) getDescended(node *Node) *Node { + if !matchNode(node, ce.Selector2.selectorName, ce.Selector2.selectorType) { + return nil + } + + parentNode := node.GetParent() + for parentNode != nil { + if matchNode(parentNode, ce.Selector1.selectorName, ce.Selector1.selectorType) { + return parentNode + } + parentNode = parentNode.GetParent() + } + return nil +} + +// getDirectChild returns node's parent when node matches ce.Selector2 and that parent matches ce.Selector1; otherwise it returns nil. +func (ce *CombinatorEl) getDirectChild(node *Node) *Node { + if node == nil { + return nil + } + + if matchNode(node, ce.Selector2.selectorName, ce.Selector2.selectorType) && + matchNode(node.GetParent(), ce.Selector1.selectorName, ce.Selector1.selectorType) { + return node.GetParent() + } + return nil +} + +// getNextSibling returns node's previous node when node matches ce.Selector2 and that previous node matches ce.Selector1; otherwise it returns nil. +func (ce *CombinatorEl) getNextSibling(node *Node) *Node { + if node == nil { + return nil + } + + if matchNode(node, ce.Selector2.selectorName, ce.Selector2.selectorType) && + matchNode(node.GetPreviousNode(), ce.Selector1.selectorName, ce.Selector1.selectorType) { + return node.GetPreviousNode() + } + return nil +} + +func (ce *CombinatorEl) getSubsequentSibling(node *Node) *Node { + if node == nil || !matchNode(node, ce.Selector2.selectorName, ce.Selector2.selectorType) { + return nil + } + + traverser := NewTraverser(node) + for traverser.GetCurrentNode() != nil { + if 
matchNode(traverser.GetCurrentNode(), ce.Selector1.selectorName, ce.Selector1.selectorType) { + return traverser.GetCurrentNode() + } + traverser.Previous() + } + return nil +} diff --git a/selectors_test.go b/selectors_test.go new file mode 100644 index 0000000..d842cf8 --- /dev/null +++ b/selectors_test.go @@ -0,0 +1,14 @@ +package GoHtml_test + +import ( + "testing" + + GoHtml "github.com/udan-jayanith/GoHTML" +) + +func TestTokenizeSelector(t *testing.T) { + slice := GoHtml.TokenizeSelectorsAndCombinators(".class-1 > .class-2 + .class-3 a") + if len(slice) != 4 { + t.Fatal("Exacted slice length of", 4, "but got", len(slice)) + } +} diff --git a/serializer.go b/serializer.go index e618142..25d8e98 100644 --- a/serializer.go +++ b/serializer.go @@ -9,79 +9,70 @@ import ( "golang.org/x/net/html" ) -func wrapAttributeValue(value string) string { - if isDigit(value) { - return value - } - - return `"` + strings.ReplaceAll(value, `"`, """) + `"` -} - func encodeListAttributes(node *Node) string { w := strings.Builder{} node.IterateAttributes(func(attribute, value string) { - if strings.TrimSpace(attribute) == "" { + if strings.TrimSpace(value) == "" { w.Write(fmt.Appendf(nil, " %s", attribute)) } else { - w.Write(fmt.Appendf(nil, " %s=%s", attribute, wrapAttributeValue(value))) + w.Write(fmt.Appendf(nil, " %s=%s", attribute, `"`+html.EscapeString(value)+`"`)) } }) return w.String() } -// Encode writes to w encoding of rootNode +// Encode writes to w encoding of the node tree from rootNode. 
func Encode(w io.Writer, rootNode *Node) { - type stackFrame struct { - node *Node - openedTag bool + if rootNode == nil { + return } - /* - traverser := NewTraverser(rootNode) - traverser.Walkthrough(func(node *Node) TraverseCondition { - fmt.Println("+++++++++++++++++++++++++++") - if node.IsTextNode() { - fmt.Println(node.text) - } else { - fmt.Println(node.GetTagName()) - } - return ContinueWalkthrough - }) - */ + type stackFrame struct { + node *Node + isClosingTag bool + } stack := linkedliststack.New() - stack.Push(stackFrame{node: rootNode, openedTag: false}) + stack.Push(stackFrame{ + node: rootNode, + }) for stack.Size() > 0 { - t, _ := stack.Pop() - top := t.(stackFrame) - current := top.node + v, _ := stack.Pop() + currentStackFrame := v.(stackFrame) - if current == nil { + if currentStackFrame.isClosingTag { + fmt.Fprintf(w, "", currentStackFrame.node.GetTagName()) continue + } else if currentStackFrame.node.IsTextNode() { + fmt.Fprint(w, html.EscapeString(currentStackFrame.node.GetText())) + } else { + fmt.Fprintf(w, "<%s%s>", func() string { + tagName := currentStackFrame.node.GetTagName() + tagNameUpperCased := strings.ToUpper(tagName) + if tagNameUpperCased == DOCTYPEDTD { + tagName = tagNameUpperCased + } + return tagName + }(), encodeListAttributes(currentStackFrame.node)) } - tagName := current.GetTagName() - if tagName == "" { - w.Write([]byte(html.EscapeString(current.GetText()))) - } else if IsVoidTag(tagName) { - fmt.Fprintf(w, "<%s%s>", tagName, encodeListAttributes(current)) - if current.GetNextNode() != nil { - stack.Push(stackFrame{node: current.GetNextNode(), openedTag: false}) - } - } else if !top.openedTag { - fmt.Fprintf(w, "<%s%s>", tagName, encodeListAttributes(current)) - stack.Push(stackFrame{node: current, openedTag: true}) - - if current.GetChildNode() != nil { - stack.Push(stackFrame{node: current.GetChildNode(), openedTag: false}) - } - } else { - fmt.Fprintf(w, "", tagName) - if current.GetNextNode() != nil { - 
stack.Push(stackFrame{node: current.GetNextNode(), openedTag: false}) - } + if currentStackFrame.node.GetNextNode() != nil { + stack.Push(stackFrame{ + node: currentStackFrame.node.GetNextNode(), + }) + } + if !IsVoidTag(currentStackFrame.node.GetTagName()) && !currentStackFrame.node.IsTextNode(){ + stack.Push(stackFrame{ + node: currentStackFrame.node, + isClosingTag: true, + }) + } + if currentStackFrame.node.GetChildNode() != nil { + stack.Push(stackFrame{ + node: currentStackFrame.node.GetChildNode(), + }) } } } diff --git a/serializer_test.go b/serializer_test.go index 7d39df5..795e392 100644 --- a/serializer_test.go +++ b/serializer_test.go @@ -28,6 +28,8 @@ func TestEncode2(t *testing.T) { if err != nil { t.Fatal("1.html does not exists.") } + defer file.Close() + node, _ := GoHtml.Decode(file) var builder strings.Builder GoHtml.Encode(&builder, node) diff --git a/test-files/4.html b/test-files/4.html index 966886b..18a385f 100644 --- a/test-files/4.html +++ b/test-files/4.html @@ -10,7 +10,7 @@

Document

List 1

-
    +
    1. Apple
    2. Orange
    3. Mango
    4. @@ -18,7 +18,7 @@

      List 1

      List 2

      Lorem

      -