From 37e5eb95c3e0cf01f9f4b5a80ac53e03b0438b9e Mon Sep 17 00:00:00 2001 From: Richard Gomez Date: Sat, 16 Mar 2024 11:51:14 -0400 Subject: [PATCH] fix(git): decode unicode paths --- pkg/gitparse/gitparse.go | 83 ++++++++++++++++++++++++++++------- pkg/gitparse/gitparse_test.go | 67 +++++++++++++++++++++++++--- 2 files changed, 128 insertions(+), 22 deletions(-) diff --git a/pkg/gitparse/gitparse.go b/pkg/gitparse/gitparse.go index 3c364644aa84..4ee7c899eaaf 100644 --- a/pkg/gitparse/gitparse.go +++ b/pkg/gitparse/gitparse.go @@ -424,10 +424,10 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, diffChan chan case isBinaryLine(latestState, line): latestState = BinaryFileLine - path, ok := pathFromBinaryLine(line) + path, ok := pathFromBinaryLine(ctx, line) if !ok { err = fmt.Errorf(`expected line to match 'Binary files a/fileA and b/fileB differ', got "%s"`, line) - ctx.Logger().Error(err, "Failed to parse binary file line") + ctx.Logger().Error(err, "Failed to parse BinaryFileLine") latestState = ParseFailure continue } @@ -443,8 +443,15 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, diffChan chan case isToFileLine(latestState, line): latestState = ToFileLine - // TODO: Is this fix still required? - currentDiff.PathB = strings.TrimRight(strings.TrimRight(string(line[6:]), "\n"), "\t") // Trim the newline and tab characters. https://github.com/trufflesecurity/trufflehog/issues/1060 + path, ok := pathFromToFileLine(ctx, line) + if !ok { + err = fmt.Errorf(`expected line to match format '+++ b/path/to/file.go', got "%s"`, line) + ctx.Logger().Error(err, "Failed to parse ToFileLine") + latestState = ParseFailure + continue + } + + currentDiff.PathB = path case isHunkLineNumberLine(latestState, line): latestState = HunkLineNumberLine @@ -681,27 +688,35 @@ func isBinaryLine(latestState ParseState, line []byte) bool { } // Get the b/ file path. Ignoring the edge case of files having `and /b` in the name for simplicity. -func pathFromBinaryLine(line []byte) (string, bool) { +func pathFromBinaryLine(ctx context.Context, line []byte) (string, bool) { if bytes.Contains(line, []byte("and /dev/null")) { return "", true } - _, after, ok := bytes.Cut(line, []byte(" and b/")) - if ok { + var path string + if _, after, ok := bytes.Cut(line, []byte(" and b/")); ok { // drop the " differ\n" - return string(after[:len(after)-8]), true - } + path = string(after[:len(after)-8]) + } else if _, after, ok = bytes.Cut(line, []byte(` and "b/`)); ok { + // Edge case where the path is quoted. + // https://github.com/trufflesecurity/trufflehog/issues/2384 - // Edge case where the path is quoted. - // https://github.com/trufflesecurity/trufflehog/issues/2384 - _, after, ok = bytes.Cut(line, []byte(` and "b/`)) - if ok { // drop the `" differ\n` - return string(after[:len(after)-9]), true + path = string(after[:len(after)-9]) + } else { + // Unknown format. + return "", false + } + + // Handle escaped characters in the path, such as "\342\200\224" instead of "—". + // See https://github.com/trufflesecurity/trufflehog/issues/2418 + unicodePath, err := strconv.Unquote(`"` + path + `"`) + if err != nil { + ctx.Logger().Error(err, "failed to decode path", "path", path) + return path, true } - // Unknown format. - return "", false + return unicodePath, true } // --- a/internal/addrs/move_endpoint_module.go @@ -727,6 +742,42 @@ func isToFileLine(latestState ParseState, line []byte) bool { return false } +// Get the b/ file path. +func pathFromToFileLine(ctx context.Context, line []byte) (string, bool) { + // Normalize paths, as they can end in `\n`, `\t\n`, etc. + // See https://github.com/trufflesecurity/trufflehog/issues/1060 + line = bytes.TrimSpace(line) + + // File was deleted. + if bytes.Equal(line, []byte("+++ /dev/null")) { + return "", true + } + + var path string + if _, after, ok := bytes.Cut(line, []byte("+++ b/")); ok { + path = string(after) + } else if _, after, ok = bytes.Cut(line, []byte(`+++ "b/`)); ok { + // Edge case where the path is quoted. + // e.g., `+++ "b/C++/1 \320\243\321\200\320\276\320\272/B.c"` + + // drop the trailing `"` + path = string(after[:len(after)-1]) + } else { + // Unknown format. + return "", false + } + + // Handle escaped characters in the path, such as "\342\200\224" instead of "—". + // See https://github.com/trufflesecurity/trufflehog/issues/2418 + unicodePath, err := strconv.Unquote(`"` + path + `"`) + if err != nil { + ctx.Logger().Error(err, "failed to decode path", "path", path) + return path, true + } + + return unicodePath, true +} + // @@ -298 +298 @@ func maxRetryErrorHandler(resp *http.Response, err error, numTries int) func isHunkLineNumberLine(latestState ParseState, line []byte) bool { if !(latestState == ToFileLine || latestState == HunkContentLine) { diff --git a/pkg/gitparse/gitparse_test.go b/pkg/gitparse/gitparse_test.go index 45f8196e2ecd..fe7ab25bd444 100644 --- a/pkg/gitparse/gitparse_test.go +++ b/pkg/gitparse/gitparse_test.go @@ -589,15 +589,36 @@ func TestLineChecksNoStaged(t *testing.T) { } func TestBinaryPathParse(t *testing.T) { + ctx := context.Background() + cases := map[string]string{ + "Binary files a/trufflehog_3.42.0_linux_arm64.tar.gz and /dev/null differ\n": "", + "Binary files /dev/null and b/plugin.sig differ\n": "plugin.sig", + "Binary files /dev/null and b/ Lunch and Learn - HCDiag.pdf differ\n": " Lunch and Learn - HCDiag.pdf", + "Binary files /dev/null and \"b/assets/retailers/ON-ikony-Platforma-ecom \\342\\200\\224 kopia.png\" differ\n": "assets/retailers/ON-ikony-Platforma-ecom — kopia.png", + "Binary files /dev/null and \"b/\\346\\267\\261\\345\\272\\246\\345\\255\\246\\344\\271\\240500\\351\\227\\256-Tan-00\\347\\233\\256\\345\\275\\225.docx\" differ\n": "深度学习500问-Tan-00目录.docx", + } + + for name, expected := range cases { + filename, ok := pathFromBinaryLine(ctx, []byte(name)) + if !ok { + t.Errorf("Failed to get path: %s", name) + } + if filename != expected { + t.Errorf("Expected: %s, Got: %s", expected, filename) + } + } +} + +func TestToFileLinePathParse(t *testing.T) { + ctx := context.Background() cases := map[string]string{ - "Binary files a/trufflehog_3.42.0_linux_arm64.tar.gz and /dev/null differ\n": "", - "Binary files /dev/null and b/plugin.sig differ\n": "plugin.sig", - "Binary files /dev/null and b/ Lunch and Learn - HCDiag.pdf differ\n": " Lunch and Learn - HCDiag.pdf", - "Binary files /dev/null and \"b/assets/retailers/ON-ikony-Platforma-ecom \\342\\200\\224 kopia.png\" differ\n": "assets/retailers/ON-ikony-Platforma-ecom \\342\\200\\224 kopia.png", + "+++ /dev/null\n": "", + "+++ b/embeds.xml\t\n": "embeds.xml", + "+++ \"b/C++/1 \\320\\243\\321\\200\\320\\276\\320\\272/B.c\"\t\n": "C++/1 Урок/B.c", } for name, expected := range cases { - filename, ok := pathFromBinaryLine([]byte(name)) + filename, ok := pathFromToFileLine(ctx, []byte(name)) if !ok { t.Errorf("Failed to get path: %s", name) } @@ -1336,7 +1357,29 @@ func TestMaxCommitSize(t *testing.T) { } -const commitLog = `commit fd6e99e7a80199b76a694603be57c5ade1de18e7 +const commitLog = `commit e50b135fd29e91b2fbb25923797f5ecffe59f359 +Author: lionzxy +Date: Wed Mar 1 18:20:04 2017 +0300 + + Все работает, но он не принимает :( + +diff --git "a/C++/1 \320\243\321\200\320\276\320\272/.idea/workspace.xml" "b/C++/1 \320\243\321\200\320\276\320\272/.idea/workspace.xml" +index 85bfb17..89b08b5 100644 +--- "a/C++/1 \320\243\321\200\320\276\320\272/.idea/workspace.xml" ++++ "b/C++/1 \320\243\321\200\320\276\320\272/.idea/workspace.xml" +@@ -29,8 +29,8 @@ + + + +- +- ++ ++ + + + + +commit fd6e99e7a80199b76a694603be57c5ade1de18e7 Author: Jaliborc Date: Mon Apr 25 16:28:06 2011 +0100 @@ -1776,6 +1819,18 @@ func newStringBuilderValue(value string) strings.Builder { // This throws a nasty panic if it's a top-level var. func expectedDiffs() []*Diff { return []*Diff{ + { + PathB: "C++/1 \320\243\321\200\320\276\320\272/.idea/workspace.xml", + LineStart: 29, + Commit: &Commit{ + Hash: "e50b135fd29e91b2fbb25923797f5ecffe59f359", + Author: "lionzxy ", + Date: newTime("Wed Mar 1 18:20:04 2017 +0300"), + Message: newStringBuilderValue("Все работает, но он не принимает :(\n"), + }, + contentWriter: newBufferWithContent([]byte("\n\n\n \n \n\n\n\n")), + IsBinary: false, + }, { PathB: "components/item.lua", LineStart: 9,