Skip to content

Commit

Permalink
improvements to implied property parsing
Browse files Browse the repository at this point in the history
- add missing check for no nested microformats or explicit u-*
  properties before parsing for implied photo

- add missing check for no h-* class on grandchild abbr elements when
  parsing for implied name

- refactor implied name checks to simplify code (no functional changes)

- fix subtle bug in :only-of-type checks for implied photo and URL.

  When `:only-of-type` is combined with an attribute selector (such as
  `.h-x>a[href]:only-of-type:not[.h-*]`), it does not mean "only of type
  with the attribute" as it was previously implemented, but rather "only
  of type irrespective of attributes" and also "has this attribute".

  This has the effect of being able to remove the
  getOnlyChildAtomWithAttr func entirely, since it's never actually
  needed.  This was not causing any problems in the wild that I'm aware
  of, I just noticed it when re-reading the parsing spec.
  • Loading branch information
willnorris committed Jun 19, 2021
1 parent 48b639f commit 3daa7ff
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 128 deletions.
125 changes: 44 additions & 81 deletions microformats.go
Original file line number Diff line number Diff line change
Expand Up @@ -302,14 +302,16 @@ func (p *parser) walk(node *html.Node) {
}
}
if _, ok := curItem.Properties["photo"]; !ok {
photo, alt := getImpliedPhoto(node, p.base)
if alt != "" {
curItem.Properties["photo"] = append(curItem.Properties["photo"], map[string]string{
"alt": alt,
"value": photo,
})
} else if photo != "" {
curItem.Properties["photo"] = append(curItem.Properties["photo"], photo)
if !curItem.hasNestedMicroformats && !curItem.hasUProperties {
photo, alt := getImpliedPhoto(node, p.base)
if alt != "" {
curItem.Properties["photo"] = append(curItem.Properties["photo"], map[string]string{
"alt": alt,
"value": photo,
})
} else if photo != "" {
curItem.Properties["photo"] = append(curItem.Properties["photo"], photo)
}
}
}
if _, ok := curItem.Properties["url"]; !ok {
Expand Down Expand Up @@ -626,91 +628,51 @@ func getOnlyChildAtom(node *html.Node, atom atom.Atom) *html.Node {
return n
}

// getOnlyChild returns the sole child of node with the specified atom and
// attribute. Returns nil if node has zero or more than one child with that
// atom and attribute.
func getOnlyChildAtomWithAttr(node *html.Node, atom atom.Atom, attr string) *html.Node {
if node == nil {
return nil
}
var n *html.Node
for c := node.FirstChild; c != nil; c = c.NextSibling {
if c.Type == html.ElementNode && c.DataAtom == atom && hasAttr(c, attr) {
if n == nil {
n = c
} else {
return nil
}
}
}
return n
}

// getImpliedName gets the implied name value for node.
//
// See http://microformats.org/wiki/microformats2-parsing
//
//nolint:gocyclo,funlen // getImpliedName is necessarily complex due to parsing rules
func getImpliedName(node *html.Node) string {
var name *string
if isAtom(node, atom.Img, atom.Area) {

switch {
case isAtom(node, atom.Img, atom.Area):
name = getAttrPtr(node, "alt")
}
if name == nil && isAtom(node, atom.Abbr) {
case isAtom(node, atom.Abbr):
name = getAttrPtr(node, "title")
}

if name == nil {
subnode := getOnlyChild(node)
if subnode != nil && subnode.DataAtom == atom.Img && !hasMatchingClass(subnode, rootClassNames) {
name = getAttrPtr(subnode, "alt")
}
}
if name == nil {
subnode := getOnlyChild(node)
if subnode != nil && subnode.DataAtom == atom.Area && !hasMatchingClass(subnode, rootClassNames) {
name = getAttrPtr(subnode, "alt")
}
}
if name == nil {
subnode := getOnlyChild(node)
if subnode != nil && subnode.DataAtom == atom.Abbr && !hasMatchingClass(subnode, rootClassNames) {
name = getAttrPtr(subnode, "title")
}
}

if name == nil {
subnode := getOnlyChild(node)
if subnode != nil && !hasMatchingClass(subnode, rootClassNames) {
subsubnode := getOnlyChild(subnode)
if subsubnode != nil && subsubnode.DataAtom == atom.Img && !hasMatchingClass(subsubnode, rootClassNames) {
name = getAttrPtr(subsubnode, "alt")
}
}
}
if name == nil {
subnode := getOnlyChild(node)
if subnode != nil && !hasMatchingClass(subnode, rootClassNames) {
subsubnode := getOnlyChild(subnode)
if subsubnode != nil && subsubnode.DataAtom == atom.Area && !hasMatchingClass(subsubnode, rootClassNames) {
name = getAttrPtr(subsubnode, "alt")
switch {
case isAtom(subnode, atom.Img, atom.Area):
name = getAttrPtr(subnode, "alt")
case isAtom(subnode, atom.Abbr):
name = getAttrPtr(subnode, "title")
}
}
}

if name == nil {
subnode := getOnlyChild(node)
if subnode != nil && !hasMatchingClass(subnode, rootClassNames) {
subsubnode := getOnlyChild(subnode)
if subsubnode != nil && subsubnode.DataAtom == atom.Abbr {
name = getAttrPtr(subsubnode, "title")
if subsubnode != nil && !hasMatchingClass(subsubnode, rootClassNames) {
switch {
case isAtom(subsubnode, atom.Img, atom.Area):
name = getAttrPtr(subsubnode, "alt")
case isAtom(subsubnode, atom.Abbr):
name = getAttrPtr(subsubnode, "title")
}
}
}
}

if name == nil {
name = new(string)
*name = strings.TrimSpace(getTextContent(node, imageAltValue))
*name = getTextContent(node, imageAltValue)
}

return strings.TrimSpace(*name)
}

Expand All @@ -719,23 +681,24 @@ func getImpliedName(node *html.Node) string {
// See http://microformats.org/wiki/microformats2-parsing
func getImpliedPhoto(node *html.Node, baseURL *url.URL) (src, alt string) {
var photo *string
if photo == nil && isAtom(node, atom.Img) {

switch {
case isAtom(node, atom.Img):
photo = getAttrPtr(node, "src")
alt = getAttr(node, "alt")
}
if photo == nil && isAtom(node, atom.Object) {
case isAtom(node, atom.Object):
photo = getAttrPtr(node, "data")
}

if photo == nil {
subnode := getOnlyChildAtomWithAttr(node, atom.Img, "src")
if subnode != nil && !hasMatchingClass(subnode, rootClassNames) {
subnode := getOnlyChildAtom(node, atom.Img)
if subnode != nil && hasAttr(subnode, "src") && !hasMatchingClass(subnode, rootClassNames) {
photo = getAttrPtr(subnode, "src")
alt = getAttr(subnode, "alt")
}
}
if photo == nil {
subnode := getOnlyChildAtomWithAttr(node, atom.Object, "data")
subnode := getOnlyChildAtom(node, atom.Object)
if subnode != nil && !hasMatchingClass(subnode, rootClassNames) {
photo = getAttrPtr(subnode, "data")
}
Expand All @@ -744,8 +707,8 @@ func getImpliedPhoto(node *html.Node, baseURL *url.URL) (src, alt string) {
if photo == nil {
subnode := getOnlyChild(node)
if subnode != nil && !hasMatchingClass(subnode, rootClassNames) {
subsubnode := getOnlyChildAtomWithAttr(subnode, atom.Img, "src")
if subsubnode != nil && !hasMatchingClass(subsubnode, rootClassNames) {
subsubnode := getOnlyChildAtom(subnode, atom.Img)
if subsubnode != nil && hasAttr(subsubnode, "src") && !hasMatchingClass(subsubnode, rootClassNames) {
photo = getAttrPtr(subsubnode, "src")
alt = getAttr(subsubnode, "alt")
}
Expand All @@ -754,7 +717,7 @@ func getImpliedPhoto(node *html.Node, baseURL *url.URL) (src, alt string) {
if photo == nil {
subnode := getOnlyChild(node)
if subnode != nil && !hasMatchingClass(subnode, rootClassNames) {
subsubnode := getOnlyChildAtomWithAttr(subnode, atom.Object, "data")
subsubnode := getOnlyChildAtom(subnode, atom.Object)
if subsubnode != nil && !hasMatchingClass(subsubnode, rootClassNames) {
photo = getAttrPtr(subsubnode, "data")
}
Expand All @@ -767,7 +730,7 @@ func getImpliedPhoto(node *html.Node, baseURL *url.URL) (src, alt string) {
return expandURL(*photo, baseURL), alt
}

// getImpliedName gets the implied url value for node.
// getImpliedURL gets the implied url value for node.
//
// See http://microformats.org/wiki/microformats2-parsing
func getImpliedURL(node *html.Node, baseURL *url.URL) string {
Expand All @@ -777,13 +740,13 @@ func getImpliedURL(node *html.Node, baseURL *url.URL) string {
}

if value == nil {
subnode := getOnlyChildAtomWithAttr(node, atom.A, "href")
subnode := getOnlyChildAtom(node, atom.A)
if subnode != nil && !hasMatchingClass(subnode, rootClassNames) {
value = getAttrPtr(subnode, "href")
}
}
if value == nil {
subnode := getOnlyChildAtomWithAttr(node, atom.Area, "href")
subnode := getOnlyChildAtom(node, atom.Area)
if subnode != nil && !hasMatchingClass(subnode, rootClassNames) {
value = getAttrPtr(subnode, "href")
}
Expand All @@ -792,7 +755,7 @@ func getImpliedURL(node *html.Node, baseURL *url.URL) string {
if value == nil {
subnode := getOnlyChild(node)
if subnode != nil && !hasMatchingClass(subnode, rootClassNames) {
subsubnode := getOnlyChildAtomWithAttr(subnode, atom.A, "href")
subsubnode := getOnlyChildAtom(subnode, atom.A)
if subsubnode != nil && !hasMatchingClass(subsubnode, rootClassNames) {
value = getAttrPtr(subsubnode, "href")
}
Expand All @@ -801,7 +764,7 @@ func getImpliedURL(node *html.Node, baseURL *url.URL) string {
if value == nil {
subnode := getOnlyChild(node)
if subnode != nil && !hasMatchingClass(subnode, rootClassNames) {
subsubnode := getOnlyChildAtomWithAttr(subnode, atom.Area, "href")
subsubnode := getOnlyChildAtom(subnode, atom.Area)
if subsubnode != nil && !hasMatchingClass(subsubnode, rootClassNames) {
value = getAttrPtr(subsubnode, "href")
}
Expand Down
47 changes: 0 additions & 47 deletions microformats_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -365,53 +365,6 @@ func Test_GetOnlyChildAtom(t *testing.T) {
}
}

func Test_GetOnlyChildAtomWithAttr(t *testing.T) {
tests := []struct {
html, atom, attr, child string
}{
{"", "", "", ""},
{`<a><img></a>`, "img", "", ""},

{`<a><img src></a>`, "img", "src", "<img src>"},
{`<a><img src=""></a>`, "img", "src", `<img src="">`},
{`<a><img src=""><img></a>`, "img", "src", `<img src="">`},
{`<a>foo<img src>bar</a>`, "img", "src", "<img src>"},
{`<a><b class><img></b></a>`, "b", "class", "<b class><img></b>"},

// wrong atom
{`<a><img src></a>`, "b", "", ""},
// wrong attr
{`<a><img src></a>`, "img", "class", ""},
// too many children
{`<a><img src><img src></a>`, "img", "src", ""},
// child too deep
{`<a><b><img src></b></a>`, "img", "src", ""},
}

for _, tt := range tests {
n, err := parseNode(tt.html)
if err != nil {
t.Fatalf("Error parsing HTML: %v", err)
}

want, err := parseNode(tt.child)
if err != nil {
t.Fatalf("Error parsing HTML: %v", err)
}

got := getOnlyChildAtomWithAttr(n, atom.Lookup([]byte(tt.atom)), tt.attr)
if got != nil {
// for the purposes of comparison, adjacent nodes don't matter
got.Parent = nil
got.PrevSibling = nil
got.NextSibling = nil
}
if !cmp.Equal(got, want) {
t.Errorf("getOnlyChildAtomWithAttr(%q, %q, %q) returned %#v, want %#v", tt.html, tt.atom, tt.attr, got, want)
}
}
}

func Test_GetImpliedName(t *testing.T) {
tests := []struct {
html, name string
Expand Down

0 comments on commit 3daa7ff

Please sign in to comment.