Normalize unicode when rendering hashtags

superseriousbusiness · Feb 2, 2023 · f210f44 · f210f44
1 parent a3f0fe2
commit f210f44
Show file tree

Hide file tree

Showing 5 changed files with 40 additions and 6 deletions.
diff --git a/internal/text/goldmark_extension.go b/internal/text/goldmark_extension.go
@@ -162,7 +162,7 @@ func (p *hashtagParser) Parse(parent ast.Node, block text.Reader, pc parser.Cont
 		case r == '#' && i == 0:
 			// ignore initial #
 			continue
-		case !util.IsPermittedInHashtag(r) && !util.IsMentionOrHashtagBoundary(r):
+		case !util.IsPlausiblyInHashtag(r) && !util.IsMentionOrHashtagBoundary(r):
 			// Fake hashtag, don't trust it
 			return nil
 		case util.IsMentionOrHashtagBoundary(r):

diff --git a/internal/text/markdown_test.go b/internal/text/markdown_test.go
@@ -81,6 +81,10 @@ const (
 	mdItalicHashtagExpected         = "<p><em><a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>hashtag</span></a></em></p>"
 	mdItalicHashtags                = "_#hashtag #hashtag #hashtag_"
 	mdItalicHashtagsExpected        = "<p><em><a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>hashtag</span></a> <a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>hashtag</span></a> <a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>hashtag</span></a></em></p>"
+	// BEWARE: sneaky unicode business going on.
+	// the first ö is one rune, the second ö is an o with a combining diacritic.
+	mdUnnormalizedHashtag         = "#hellöthere #hellöthere"
+	mdUnnormalizedHashtagExpected = "<p><a href=\"http://localhost:8080/tags/hell%C3%B6there\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>hellöthere</span></a> <a href=\"http://localhost:8080/tags/hell%C3%B6there\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>hellöthere</span></a></p>"
 )
 
 type MarkdownTestSuite struct {
@@ -191,6 +195,11 @@ func (suite *MarkdownTestSuite) TestParseItalicHashtags() {
 	suite.Equal(mdItalicHashtagsExpected, formatted.HTML)
 }
 
+func (suite *MarkdownTestSuite) TestParseUnnormalizedHashtag() {
+	formatted := suite.FromMarkdown(mdUnnormalizedHashtag)
+	suite.Equal(mdUnnormalizedHashtagExpected, formatted.HTML)
+}
+
 func TestMarkdownTestSuite(t *testing.T) {
 	suite.Run(t, new(MarkdownTestSuite))
 }
diff --git a/internal/text/plain_test.go b/internal/text/plain_test.go
@@ -135,6 +135,13 @@ func (suite *PlainTestSuite) TestDeriveMultiple() {
 	assert.Len(suite.T(), f.Emojis, 0)
 }
 
+func (suite *PlainTestSuite) TestZalgoHashtag() {
+	statusText := `yo who else loves #praying to #z̸͉̅a̸͚͋l̵͈̊g̸̫͌ỏ̷̪?`
+	f := suite.FromPlain(statusText)
+	assert.Len(suite.T(), f.Tags, 1)
+	assert.Equal(suite.T(), "praying", f.Tags[0].Name)
+}
+
 func TestPlainTestSuite(t *testing.T) {
 	suite.Run(t, new(PlainTestSuite))
 }
diff --git a/internal/text/replace.go b/internal/text/replace.go
@@ -22,6 +22,8 @@ import (
 	"errors"
 	"github.com/superseriousbusiness/gotosocial/internal/db"
 	"github.com/superseriousbusiness/gotosocial/internal/log"
+	"github.com/superseriousbusiness/gotosocial/internal/util"
+	"golang.org/x/text/unicode/norm"
 	"strings"
 )
 
@@ -85,13 +87,23 @@ func (r *customRenderer) replaceMention(text string) string {
 	return b.String()
 }
 
-// replaceMention takes a string in the form #HashedTag
+// replaceHashtag takes a string in the form #HashedTag, and will normalize it before
+// adding it to the db and turning it into HTML.
 func (r *customRenderer) replaceHashtag(text string) string {
-	if len(text)-1 > maximumHashtagLength {
-		return text
+	// this normalization is specifically to avoid cases where visually-identical
+	// hashtags are stored with different unicode representations (e.g. with combining
+	// diacritics). It allows a tasteful number of combining diacritics to be used,
+	// as long as they can be combined with parent characters to form regular letter
+	// symbols.
+	normalized := norm.NFC.String(text[1:])
+
+	for i, r := range normalized {
+		if i >= maximumHashtagLength || !util.IsPermittedInHashtag(r) {
+			return text
+		}
 	}
 
-	tag, err := r.f.db.TagStringToTag(r.ctx, text[1:], r.accountID)
+	tag, err := r.f.db.TagStringToTag(r.ctx, normalized, r.accountID)
 	if err != nil {
 		log.Errorf("error generating hashtags from status: %s", err)
 		return text
@@ -122,7 +134,7 @@ func (r *customRenderer) replaceHashtag(text string) string {
 	b.WriteString(`<a href="`)
 	b.WriteString(tag.URL)
 	b.WriteString(`" class="mention hashtag" rel="tag">#<span>`)
-	b.WriteString(text[1:])
+	b.WriteString(normalized)
 	b.WriteString(`</span></a>`)
 
 	return b.String()

diff --git a/internal/util/statustools.go b/internal/util/statustools.go
@@ -22,6 +22,12 @@ import (
 	"unicode"
 )
 
+func IsPlausiblyInHashtag(r rune) bool {
+	// Marks are accepted while parsing because normalization may combine them with
+	// their base characters into letters; after normalization they are not permitted.
+	return unicode.IsLetter(r) || unicode.IsNumber(r) || unicode.IsMark(r)
+}
+
 func IsPermittedInHashtag(r rune) bool {
 	return unicode.IsLetter(r) || unicode.IsNumber(r)
 }