diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 6403cdc..9e923dd 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1 +1 @@ -* udan-jayanith +* @udan-jayanith diff --git a/FUTURE-CHANGELOG.md b/FUTURE-CHANGELOG.md index df24a8a..32c0e02 100644 --- a/FUTURE-CHANGELOG.md +++ b/FUTURE-CHANGELOG.md @@ -1,11 +1,9 @@ -## v0.0.1-beta.1 -- bug fix: style attribute not get serialized correctly. +## v0.0.0-beta.3 <- current -## v0.0.1 -DecodeHeader only serializes only up to head. And return a node with only head and it's child nodes. -* DecodeHeader +## v0.0.0 +Delete deletes the branch without connecting sibling nodes. +* Delete -## v0.0.2 QuerySelector takes attribute name and regexp for the value and returns the first node that matches the regexp. * QuerySelector @@ -15,7 +13,7 @@ QuerySelectorAll takes two regexps and returns all nodes that matches the regexp Closest returns the closest node that matches the className. * Closest -## v0.0.3 +## v0.0.1 AddClass add the given class name to the node. * AddClass @@ -28,10 +26,16 @@ HasClass returns a boolean value specifying whether the node has the specified c GetClassList returns a map of class names in the specified node. * GetClassList -## v0.0.4 +## v0.0.2 * GetElementById * GetElementByClassName * GetElementByTagName * GetElementsById * GetElementsByClassName -* GetElementsByTagName \ No newline at end of file +* GetElementsByTagName + +## v0.0.3 +DecodeHeader only serializes only up to head. And return a node with only head and it's child nodes. +* DecodeOnly +* DecodeOnlyByClassName +* DecodeHeader \ No newline at end of file diff --git a/README.md b/README.md index 6d0b5c0..b72c34e 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ - + # GoHTML A powerful and comprehensive HTML parser and DOM manipulation library for Go, bringing JavaScript-like DOM operations to the Go ecosystem. diff --git a/assets/media/Black-text version.png b/assets/media/Black-text version.png new file mode 100644 index 0000000..5e609fa Binary files /dev/null and b/assets/media/Black-text version.png differ diff --git a/assets/media/Black-text version.svg b/assets/media/Black-text version.svg new file mode 100644 index 0000000..e6a5aa1 --- /dev/null +++ b/assets/media/Black-text version.svg @@ -0,0 +1,6 @@ + diff --git a/assets/media/Favicon.png b/assets/media/Favicon.png new file mode 100644 index 0000000..95105e2 Binary files /dev/null and b/assets/media/Favicon.png differ diff --git a/assets/media/Favicon.svg b/assets/media/Favicon.svg new file mode 100644 index 0000000..68ee392 --- /dev/null +++ b/assets/media/Favicon.svg @@ -0,0 +1,4 @@ + diff --git a/assets/media/Transparent Black version.png b/assets/media/Transparent Black version.png deleted file mode 100644 index 242d3fd..0000000 Binary files a/assets/media/Transparent Black version.png and /dev/null differ diff --git a/assets/media/Transparent Black version.svg b/assets/media/Transparent Black version.svg deleted file mode 100644 index faa4336..0000000 --- a/assets/media/Transparent Black version.svg +++ /dev/null @@ -1,10 +0,0 @@ - diff --git a/assets/media/White version.png b/assets/media/White version.png deleted file mode 100644 index 34e51fb..0000000 Binary files a/assets/media/White version.png and /dev/null differ diff --git a/assets/media/White version.svg b/assets/media/White version.svg deleted file mode 100644 index 3ce4e77..0000000 --- a/assets/media/White version.svg +++ /dev/null @@ -1,11 +0,0 @@ - diff --git a/assets/media/White-text version.png b/assets/media/White-text version.png new file mode 100644 index 0000000..b08c0ce Binary files /dev/null and b/assets/media/White-text version.png differ diff --git a/assets/media/White-text version.svg b/assets/media/White-text version.svg new file mode 100644 index 0000000..8ce4df8 --- /dev/null +++ b/assets/media/White-text version.svg @@ -0,0 +1,7 @@ + diff --git a/main.go b/main.go index 9ccf475..de202ee 100644 --- a/main.go +++ b/main.go @@ -1,14 +1,20 @@ /* -A powerful and comprehensive HTML parser and DOM manipulation library for Go, +A powerful and comprehensive HTML parser and DOM manipulation library for Go, bringing JavaScript-like DOM operations to the Go ecosystem. */ package GoHtml import ( + "fmt" + "regexp" "strings" "sync" ) +var ( + SyntaxError error = fmt.Errorf("Syntax error") +) + //CreateNode returns a initialized new node. func CreateNode(tagName string) *Node { return &Node{ @@ -67,3 +73,12 @@ func ApplySaveChanges(node *Node){ parentNode.rwMutex.Unlock() } } + +func isQuote(chr string) bool { + return chr == `"` || chr == `'` || chr == "`" +} + +func isDigit(value string) bool { + reg := regexp.MustCompile(`^[\d\.]+$`) + return reg.Match([]byte(value)) +} \ No newline at end of file diff --git a/node-tree.go b/node-tree.go index 1057908..af80c81 100644 --- a/node-tree.go +++ b/node-tree.go @@ -1,6 +1,7 @@ package GoHtml import ( + "strings" "sync" ) @@ -78,6 +79,10 @@ func (node *Node) GetTagName() string{ node.rwMutex.Lock() defer node.rwMutex.Unlock() + if strings.ToUpper(node.tagName) == DOCTYPEDTD{ + return strings.ToUpper(node.tagName) + } + return node.tagName } @@ -86,15 +91,16 @@ func (node *Node) SetTagName(tagName string){ node.rwMutex.Lock() defer node.rwMutex.Unlock() - node.tagName = tagName + node.tagName = strings.TrimSpace(strings.ToLower(tagName)) } -//GetAttribute returns the specified attribute form the node. -func (node *Node) GetAttribute(attributeName string) string{ +//GetAttribute returns the specified attribute value form the node. +func (node *Node) GetAttribute(attributeName string) (string, bool){ node.rwMutex.Lock() defer node.rwMutex.Unlock() - return node.attributes[attributeName] + v, ok := node.attributes[attributeName] + return v, ok } //RemoveAttribute remove or delete the specified attribute. @@ -120,7 +126,7 @@ func (node *Node) SetAttribute(attribute, value string){ node.rwMutex.Lock() defer node.rwMutex.Unlock() - node.attributes[attribute] = value + node.attributes[strings.TrimSpace(attribute)] = strings.TrimSpace(value) } //GetText returns text on the node. This does not returns text on it's child nodes. If you also wants child nodes text use GetInnerText method on the node. diff --git a/parser.go b/parser.go new file mode 100644 index 0000000..f434a29 --- /dev/null +++ b/parser.go @@ -0,0 +1,312 @@ +package GoHtml + +import ( + "bufio" + "io" + "regexp" + "strings" + + "github.com/emirpasic/gods/stacks/linkedliststack" +) + +var( + closingTagReg = regexp.MustCompile(`(?s)^\s*<\/.*>\s*$`) + openingTagReg = regexp.MustCompile(`(?s)^\s*<.*>\s*$`) +) + +// Decode reads from rd and create a node-tree. Then returns the root node and an error. If error were to occur it would be SyntaxError. +func Decode(rd io.Reader) (*Node, error) { + newRd := bufio.NewReader(rd) + rootNode := CreateNode("") + currentNode := rootNode + stack := linkedliststack.New() + + str := "" + readingQuote := "" + readingComment := "" + for { + byt, err := newRd.ReadByte() + if err != nil { + node := rootNode.GetNextNode() + rootNode.RemoveNode() + return node, nil + } + str += string(byt) + + if readingComment == "" && isStartingComment(currentNode, str) && readingQuote == "" { + readingComment = getStartingComment(currentNode, str) + } else if readingComment != "" && isEndingComment(currentNode, readingComment, str) { + readingComment = "" + str = escapeComments(str, currentNode) + } + + if readingComment != "" { + continue + } + + if isQuote(string(byt)) && (currentNode.GetTagName() == "script" || currentNode.GetTagName() == "style" || isReadingTag(str)) && + readingQuote == "" { + readingQuote = string(byt) + continue + } else if readingQuote == string(byt) { + readingQuote = "" + } + + if readingQuote != "" { + continue + } + + if closingTagReg.MatchString(str) { + //closing tag + str = "" + currentNode, err = getFirstOpenNode(currentNode, stack) + if err != nil { + return currentNode, err + } + } else if openingTagReg.MatchString(str) { + node, err := serializeHTMLTag(str) + if err != nil { + node := rootNode.GetNextNode() + rootNode.RemoveNode() + return node, err + } + str = "" + + if isOpen(currentNode, stack) { + currentNode.AppendChild(node) + } else { + currentNode.Append(node) + } + currentNode = node + if !isSelfClosingNode(node) { + stack.Push(currentNode) + } + } else if string(byt) == "<" && strings.TrimSpace(str) != "<" { + // text + str = str[:len(str)-1] + node := serializeTextNode(str) + str = "<" + + if isOpen(currentNode, stack) { + currentNode.AppendChild(node) + } else { + currentNode.Append(node) + } + currentNode = node + } + } +} + +var ( + startingBlockCommentReg = regexp.MustCompile(`\s*(\/\*)\s*$`) + endingBlockCommentReg = regexp.MustCompile(`\s*(\*\/)\s*$`) + doubleSlashReg = regexp.MustCompile(`\s*(//)\s*$`) + htmlCommentStarterReg = regexp.MustCompile(`\s*()\s*$`) +) + +func isEndingComment(currentNode *Node, startingComment string, str string) bool { + if currentNode.GetTagName() == "script" { + return (endingNewLineReg.MatchString(str) && startingComment == "//") || (endingBlockCommentReg.MatchString(str) && startingComment == "/*") + } else if currentNode.GetTagName() == "style" { + return endingBlockCommentReg.MatchString(str) && startingComment == "/*" + } + + return htmlCommentEndReg.MatchString(str) && startingComment == "`) + blockCommentReg = regexp.MustCompile(`(?s)\/\*.*\*\/`) + inlineCommentReg = regexp.MustCompile(`(?s)\/\/.*\n`) +) + +func escapeComments(str string, currentNode *Node) string{ + if currentNode.GetTagName() == "script"{ + str = blockCommentReg.ReplaceAllLiteralString(str, "") + str = inlineCommentReg.ReplaceAllLiteralString(str, "") + }else if currentNode.GetTagName() == "style"{ + str = blockCommentReg.ReplaceAllLiteralString(str, "") + }else{ + str = htmlCommentReg.ReplaceAllLiteralString(str, "") + } + return str +} + +func getFirstOpenNode(currentNode *Node, stack *linkedliststack.Stack) (*Node, error) { + traverser := GetTraverser(currentNode) + for traverser.GetCurrentNode() != nil { + n, ok := stack.Peek() + if !ok { + return traverser.GetCurrentNode(), SyntaxError + } + node := n.(*Node) + + if traverser.GetCurrentNode() == node { + stack.Pop() + return node, nil + } + + if traverser.GetCurrentNode().GetPreviousNode() == nil { + traverser.SetCurrentNodeTo(traverser.GetCurrentNode().GetParent()) + } else { + traverser.Previous() + } + } + + return traverser.GetCurrentNode(), SyntaxError +} + +func isOpen(currentNode *Node, stack *linkedliststack.Stack) bool { + if stack.Size() < 1 || isSelfClosingNode(currentNode) { + return false + } + + n, _ := stack.Peek() + node := n.(*Node) + return node == currentNode +} + +func isSelfClosingNode(node *Node) bool { + return node.GetTagName() == "" || IsVoidTag(node.GetTagName()) +} + +var ( + tagNameRegex = regexp.MustCompile(`^\s*([\w\-_!]*)`) + afterTagNameReg = regexp.MustCompile(`^\s*[\w\-_!]*\s*(.*)`) + attributeNameReg = regexp.MustCompile(`^\s*([\w\-_!]*)\s*`) + afterAttributeNameReg = regexp.MustCompile(`^\s*[\w\-_!]*\s*(.*)`) + isDefinedValueReg = regexp.MustCompile(`(?s)^\s*=.*`) + afterEqualSignReg = regexp.MustCompile(`(?s)^\s*=(.*)`) + definedValueReg = regexp.MustCompile(`(?s)\s*(\s*('.*?'|".*?"|\s*[\S]*).*).*`) + afterDefinedValueReg = regexp.MustCompile(`(?s)\s*('.*?'|".*?"|\s*[\S]*)\s*(.*)\s*`) +) + +func serializeHTMLTag(tag string) (*Node, error) { + tag = strings.TrimSpace(tag) + tag = strings.TrimRight(strings.TrimRight(strings.TrimLeft(tag, "<"), ">"), `/`) + node := CreateNode("") + + //extract the html tag name + tagName := tagNameRegex.FindString(tag) + if tagName == "" { + return node, SyntaxError + } + node.SetTagName(strings.TrimSpace(tagName)) + + //Cut the tag name from tag. + tag = strings.TrimSpace(getRightMostString(afterTagNameReg.FindStringSubmatch(tag))) + if strings.TrimSpace(tag) == "" || tag == strings.TrimSpace(tagName) { + return node, nil + } + + for { + if tag == "" { + return node, nil + } + + //This parses attribute name. + attributeName := strings.TrimSpace(getRightMostString(attributeNameReg.FindStringSubmatch(tag))) + if attributeName == "" { + return node, SyntaxError + } + + //This removes attribute name from the tag. + tag = strings.TrimSpace(getRightMostString(afterAttributeNameReg.FindStringSubmatch(tag))) + if tag == "" { + return node, nil + } + + if !isDefinedValueReg.MatchString(tag) { + node.SetAttribute(attributeName, "") + if attributeName == strings.TrimSpace(tag){ + return node, nil + } + continue + } + + tag = strings.TrimSpace(getRightMostString(afterEqualSignReg.FindStringSubmatch(tag))) + if tag == "" { + return node, SyntaxError + } + + attributeValue := strings.TrimSpace(getRightMostString(definedValueReg.FindStringSubmatch(tag))) + node.SetAttribute(attributeName, escapeQuotes(attributeValue)) + + tag = strings.TrimSpace(getRightMostString(afterDefinedValueReg.FindStringSubmatch(tag))) + if tag == "" || attributeValue == tag{ + return node, nil + } + + } +} + +func serializeTextNode(s string) *Node { + node := CreateTextNode(s) + return node +} + +var ( + firstCharLesserReg *regexp.Regexp = regexp.MustCompile(`^<.*`) +) + +func isReadingTag(strBuf string) bool { + return firstCharLesserReg.MatchString(strBuf) +} + +// HTMLToNodeTree return html code as a node-tree. If error were to occur it would be SyntaxError. +func HTMLToNodeTree(html string) (*Node, error) { + rd := strings.NewReader(html) + node, err := Decode(rd) + return node, err +} + +var ( + escapeQuotesReg *regexp.Regexp = regexp.MustCompile(`(?s)^\s*('(.*)'|"(.*)"|([\d+\.]*)|.*)\s*$`) +) + +func escapeQuotes(str string) string { + matches := escapeQuotesReg.FindStringSubmatch(str) + for i := len(matches) - 1; i >= 0; i-- { + if strings.TrimSpace(matches[i]) != "" { + return matches[i] + } + } + return "" +} + +func getRightMostString(slice []string) string { + for i := len(slice) - 1; i >= 0; i-- { + if strings.TrimSpace(slice[i]) != "" { + return slice[i] + } + } + return "" +} diff --git a/parser_test.go b/parser_test.go new file mode 100644 index 0000000..6d5f5e8 --- /dev/null +++ b/parser_test.go @@ -0,0 +1,37 @@ +package GoHtml_test + +import ( + "os" + "strings" + "testing" + + GoHtml "github.com/udan-jayanith/GoHTML" +) + +func TestDecode(t *testing.T) { + file, err := os.Open("./test-files/1.html") + if err != nil { + t.Fatal(err) + return + } + + node, err := GoHtml.Decode(file) + if err != nil { + t.Fatal(err) + return + } + + traverser := GoHtml.GetTraverser(node) + traverser.Walkthrough(func(node *GoHtml.Node) GoHtml.TraverseCondition { + attributeValue, ok := node.GetAttribute("inner-text") + if attributeValue == "js" && ok { + t.Fatal("js comment got parsed", node) + return GoHtml.StopWalkthrough + } + return GoHtml.ContinueWalkthrough + }) + + builder1 := &strings.Builder{} + GoHtml.Encode(builder1, node) + //It's hard compare exacted output. Because strings, prettier formats html code. htmlFormatter and prettier add extra stuffs to the html codes like dash in void tags. Exacted output is in the ./test-files/2.html. +} diff --git a/serialize.go b/serialize.go deleted file mode 100644 index 4354cbc..0000000 --- a/serialize.go +++ /dev/null @@ -1,299 +0,0 @@ -package GoHtml - -import ( - "bufio" - "fmt" - "io" - "regexp" - "strings" - - "github.com/emirpasic/gods/stacks/linkedliststack" -) - -var ( - SyntaxError error = fmt.Errorf("Syntax error") -) - -// Decode reads from rd and create a node-tree. Then returns the root node and an error. If error were to occur it would be SyntaxError. -func Decode(rd io.Reader) (*Node, error) { - newRd := bufio.NewReader(rd) - rootNode := CreateNode("") - currentNode := rootNode - stack := linkedliststack.New() - - str := "" - readingQuote := "" - readingComment := "" - for { - byt, err := newRd.ReadByte() - if err != nil { - node := rootNode.GetNextNode() - rootNode.RemoveNode() - return node, nil - } - str += string(byt) - - last4Char := getLast4Char(str) - if readingComment == "" && isStartingComment(currentNode, last4Char) { - readingComment = getStartingComment(currentNode, last4Char) - } else if readingComment != "" && isEndingComment(currentNode, readingComment, last4Char) { - readingComment = "" - str = "" - } - - if readingComment != "" { - continue - } - - if isQuote(string(byt)) && (currentNode.GetTagName() == "script" || currentNode.GetTagName() == "style" || isReadingTag(str)) && - readingQuote == "" { - readingQuote = string(byt) - continue - } else if readingQuote == string(byt) { - readingQuote = "" - } - - if readingQuote != "" { - continue - } - - if regexp.MustCompile(`^\s*<\/.*>\s*$`).MatchString(str) { - //closing tag - str = "" - currentNode, err = getFirstOpenNode(currentNode, stack) - if err != nil { - return currentNode, err - } - } else if regexp.MustCompile(`^\s*<.*>\s*$`).MatchString(str) { - //opening and void tags - node, err := serializeHTMLTag(str) - if err != nil { - node := rootNode.GetNextNode() - rootNode.RemoveNode() - return node, err - } - str = "" - - if isOpen(currentNode, stack) { - currentNode.AppendChild(node) - } else { - currentNode.Append(node) - } - currentNode = node - if !isSelfClosingNode(node) { - stack.Push(currentNode) - } - } else if string(byt) == "<" && !regexp.MustCompile(`^\s*<$`).MatchString(str) { - // text - str = str[:len(str)-1] - node := serializeTextNode(str) - str = "<" - - if isOpen(currentNode, stack) { - currentNode.AppendChild(node) - } else { - currentNode.Append(node) - } - currentNode = node - } - } -} - -func isStartingComment(currentNode *Node, last4Char string) bool { - if currentNode.GetTagName() == "script" { - return regexp.MustCompile(`//$`).MatchString(last4Char) || regexp.MustCompile(`/\*$`).MatchString(last4Char) - } else if currentNode.GetTagName() == "style" { - return regexp.MustCompile(`/\*$`).MatchString(last4Char) - } - - return regexp.MustCompile(`$`).MatchString(last4Char) && startingComment == " - - -
- - -This paragraph shows off CSS and inline styling.
- - -