Skip to content

Commit

Permalink
Normalize unicode when rendering hashtags
Browse files Browse the repository at this point in the history
  • Loading branch information
autumnull committed Feb 2, 2023
1 parent a3f0fe2 commit f210f44
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 6 deletions.
2 changes: 1 addition & 1 deletion internal/text/goldmark_extension.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ func (p *hashtagParser) Parse(parent ast.Node, block text.Reader, pc parser.Cont
case r == '#' && i == 0:
// ignore initial #
continue
case !util.IsPermittedInHashtag(r) && !util.IsMentionOrHashtagBoundary(r):
case !util.IsPlausiblyInHashtag(r) && !util.IsMentionOrHashtagBoundary(r):
// Fake hashtag, don't trust it
return nil
case util.IsMentionOrHashtagBoundary(r):
Expand Down
9 changes: 9 additions & 0 deletions internal/text/markdown_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@ const (
mdItalicHashtagExpected = "<p><em><a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>hashtag</span></a></em></p>"
mdItalicHashtags = "_#hashtag #hashtag #hashtag_"
mdItalicHashtagsExpected = "<p><em><a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>hashtag</span></a> <a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>hashtag</span></a> <a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>hashtag</span></a></em></p>"
// BEWARE: sneaky unicode business going on.
// the first ö is one rune, the second ö is an o with a combining diacritic.
mdUnnormalizedHashtag = "#hellöthere #hellöthere"
mdUnnormalizedHashtagExpected = "<p><a href=\"http://localhost:8080/tags/hell%C3%B6there\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>hellöthere</span></a> <a href=\"http://localhost:8080/tags/hell%C3%B6there\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>hellöthere</span></a></p>"
)

type MarkdownTestSuite struct {
Expand Down Expand Up @@ -191,6 +195,11 @@ func (suite *MarkdownTestSuite) TestParseItalicHashtags() {
suite.Equal(mdItalicHashtagsExpected, formatted.HTML)
}

// TestParseUnnormalizedHashtag feeds mdUnnormalizedHashtag through FromMarkdown
// and compares the resulting HTML against mdUnnormalizedHashtagExpected.
func (suite *MarkdownTestSuite) TestParseUnnormalizedHashtag() {
	got := suite.FromMarkdown(mdUnnormalizedHashtag).HTML
	suite.Equal(mdUnnormalizedHashtagExpected, got)
}

func TestMarkdownTestSuite(t *testing.T) {
suite.Run(t, new(MarkdownTestSuite))
}
7 changes: 7 additions & 0 deletions internal/text/plain_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,13 @@ func (suite *PlainTestSuite) TestDeriveMultiple() {
assert.Len(suite.T(), f.Emojis, 0)
}

// TestZalgoHashtag formats a status containing one ordinary hashtag and one
// hashtag stacked with combining marks, and expects only the ordinary one
// ("praying") to be reported in the formatted result's Tags.
func (suite *PlainTestSuite) TestZalgoHashtag() {
	statusText := `yo who else loves #praying to #z̸͉̅a̸͚͋l̵͈̊g̸̫͌ỏ̷̪?`
	f := suite.FromPlain(statusText)

	// assert.Len only records the failure and keeps going; bail out here so
	// that an empty Tags slice produces a test failure rather than an
	// index-out-of-range panic on f.Tags[0] below.
	if !assert.Len(suite.T(), f.Tags, 1) {
		return
	}
	assert.Equal(suite.T(), "praying", f.Tags[0].Name)
}

func TestPlainTestSuite(t *testing.T) {
suite.Run(t, new(PlainTestSuite))
}
22 changes: 17 additions & 5 deletions internal/text/replace.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ import (
"errors"
"github.com/superseriousbusiness/gotosocial/internal/db"
"github.com/superseriousbusiness/gotosocial/internal/log"
"github.com/superseriousbusiness/gotosocial/internal/util"
"golang.org/x/text/unicode/norm"
"strings"
)

Expand Down Expand Up @@ -85,13 +87,23 @@ func (r *customRenderer) replaceMention(text string) string {
return b.String()
}

// replaceMention takes a string in the form #HashedTag
// replaceHashtag takes a string in the form #HashedTag, and will normalize it before
// adding it to the db and turning it into HTML.
func (r *customRenderer) replaceHashtag(text string) string {
if len(text)-1 > maximumHashtagLength {
return text
// this normalization is specifically to avoid cases where visually-identical
// hashtags are stored with different unicode representations (e.g. with combining
// diacritics). It allows a tasteful number of combining diacritics to be used,
// as long as they can be combined with parent characters to form regular letter
// symbols.
normalized := norm.NFC.String(text[1:])

for i, r := range normalized {
if i >= maximumHashtagLength || !util.IsPermittedInHashtag(r) {
return text
}
}

tag, err := r.f.db.TagStringToTag(r.ctx, text[1:], r.accountID)
tag, err := r.f.db.TagStringToTag(r.ctx, normalized, r.accountID)
if err != nil {
log.Errorf("error generating hashtags from status: %s", err)
return text
Expand Down Expand Up @@ -122,7 +134,7 @@ func (r *customRenderer) replaceHashtag(text string) string {
b.WriteString(`<a href="`)
b.WriteString(tag.URL)
b.WriteString(`" class="mention hashtag" rel="tag">#<span>`)
b.WriteString(text[1:])
b.WriteString(normalized)
b.WriteString(`</span></a>`)

return b.String()
Expand Down
6 changes: 6 additions & 0 deletions internal/util/statustools.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ import (
"unicode"
)

// IsPlausiblyInHashtag reports whether r is a letter, a number, or a mark.
//
// Marks are accepted here because this check is meant for parsing, before
// normalization: a mark may still be combined into a letter when the text is
// normalized. After normalization, marks are no longer allowed.
func IsPlausiblyInHashtag(r rune) bool {
	switch {
	case unicode.IsLetter(r), unicode.IsNumber(r), unicode.IsMark(r):
		return true
	default:
		return false
	}
}

// IsPermittedInHashtag reports whether r is a letter or a number. Any other
// rune, combining marks included, is rejected.
func IsPermittedInHashtag(r rune) bool {
	if unicode.IsLetter(r) {
		return true
	}
	return unicode.IsNumber(r)
}
Expand Down

0 comments on commit f210f44

Please sign in to comment.