Skip to content

Commit

Permalink
fix(git): decode unicode paths
Browse files Browse the repository at this point in the history
  • Loading branch information
rgmz authored and Richard Gomez committed Mar 19, 2024
1 parent 7e164d4 commit 43dcff0
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 22 deletions.
84 changes: 68 additions & 16 deletions pkg/gitparse/gitparse.go
Original file line number Diff line number Diff line change
Expand Up @@ -424,10 +424,10 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, diffChan chan
case isBinaryLine(latestState, line):
latestState = BinaryFileLine

path, ok := pathFromBinaryLine(line)
path, ok := pathFromBinaryLine(ctx, line)
if !ok {
err = fmt.Errorf(`expected line to match 'Binary files a/fileA and b/fileB differ', got "%s"`, line)
ctx.Logger().Error(err, "Failed to parse binary file line")
ctx.Logger().Error(err, "Failed to parse BinaryFileLine")
latestState = ParseFailure
continue
}
Expand All @@ -443,8 +443,15 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, diffChan chan
case isToFileLine(latestState, line):
latestState = ToFileLine

// TODO: Is this fix still required?
currentDiff.PathB = strings.TrimRight(strings.TrimRight(string(line[6:]), "\n"), "\t") // Trim the newline and tab characters. https://github.com/trufflesecurity/trufflehog/issues/1060
path, ok := pathFromToFileLine(ctx, line)
if !ok {
err = fmt.Errorf(`expected line to match format '+++ b/path/to/file.go', got "%s"`, line)
ctx.Logger().Error(err, "Failed to parse ToFileLine")
latestState = ParseFailure
continue
}

currentDiff.PathB = path
case isHunkLineNumberLine(latestState, line):
latestState = HunkLineNumberLine

Expand Down Expand Up @@ -681,27 +688,35 @@ func isBinaryLine(latestState ParseState, line []byte) bool {
}

// Get the b/ file path. Ignoring the edge case of files having ` and b/` in the name for simplicity.
func pathFromBinaryLine(line []byte) (string, bool) {
func pathFromBinaryLine(ctx context.Context, line []byte) (string, bool) {
if bytes.Contains(line, []byte("and /dev/null")) {
return "", true
}

_, after, ok := bytes.Cut(line, []byte(" and b/"))
if ok {
var path string
if _, after, ok := bytes.Cut(line, []byte(" and b/")); ok {
// drop the " differ\n"
return string(after[:len(after)-8]), true
}
path = string(after[:len(after)-8])
} else if _, after, ok = bytes.Cut(line, []byte(` and "b/`)); ok {
// Edge case where the path is quoted.
// https://github.com/trufflesecurity/trufflehog/issues/2384

// Edge case where the path is quoted.
// https://github.com/trufflesecurity/trufflehog/issues/2384
_, after, ok = bytes.Cut(line, []byte(` and "b/`))
if ok {
// drop the `" differ\n`
return string(after[:len(after)-9]), true
path = string(after[:len(after)-9])
} else {
// Unknown format.
return "", false
}

// Handle escaped characters in the path, such as "\342\200\224" instead of "—".
// See https://github.com/trufflesecurity/trufflehog/issues/2418
unicodePath, err := strconv.Unquote(`"` + path + `"`)
if err != nil {
ctx.Logger().Error(err, "failed to decode path", "path", path)
return path, true
}

// Unknown format.
return "", false
return unicodePath, true
}

// --- a/internal/addrs/move_endpoint_module.go
Expand All @@ -727,6 +742,43 @@ func isToFileLine(latestState ParseState, line []byte) bool {
return false
}

// Get the b/ file path.
func pathFromToFileLine(ctx context.Context, line []byte) (string, bool) {
// Normalize paths, as they can end in `\n`, `\t\n`, etc.
// See https://github.com/trufflesecurity/trufflehog/issues/1060
line = bytes.TrimSpace(line)

// File was deleted.
if bytes.Equal(line, []byte("+++ /dev/null")) {
return "", true
}

var path string
if _, after, ok := bytes.Cut(line, []byte("+++ b/")); ok {
// drop the `\n`
path = string(after[:len(after)])

Check failure on line 759 in pkg/gitparse/gitparse.go

View workflow job for this annotation

GitHub Actions / golangci-lint

S1010: should omit second index in slice, s[a:len(s)] is identical to s[a:] (gosimple)
} else if _, after, ok = bytes.Cut(line, []byte(`+++ "b/`)); ok {
// Edge case where the path is quoted.
// e.g., `+++ "b/C++/1 \320\243\321\200\320\276\320\272/B.c"`

// drop the `"\n`
path = string(after[:len(after)-1])
} else {
// Unknown format.
return "", false
}

// Handle escaped characters in the path, such as "\342\200\224" instead of "—".
// See https://github.com/trufflesecurity/trufflehog/issues/2418
unicodePath, err := strconv.Unquote(`"` + path + `"`)
if err != nil {
ctx.Logger().Error(err, "failed to decode path", "path", path)
return path, true
}

return unicodePath, true
}

// @@ -298 +298 @@ func maxRetryErrorHandler(resp *http.Response, err error, numTries int)
func isHunkLineNumberLine(latestState ParseState, line []byte) bool {
if !(latestState == ToFileLine || latestState == HunkContentLine) {
Expand Down
67 changes: 61 additions & 6 deletions pkg/gitparse/gitparse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -589,15 +589,36 @@ func TestLineChecksNoStaged(t *testing.T) {
}

// TestBinaryPathParse feeds "Binary files ... differ" lines to
// pathFromBinaryLine and checks the returned path. The cases cover a
// /dev/null destination, a plain path, a name containing " and ", and quoted
// paths with octal-escaped bytes.
func TestBinaryPathParse(t *testing.T) {
	// Each key is a raw diff line; the value is the path expected from it.
	expectedByLine := map[string]string{
		"Binary files a/trufflehog_3.42.0_linux_arm64.tar.gz and /dev/null differ\n":                                                                                  "",
		"Binary files /dev/null and b/plugin.sig differ\n":                                                                                                            "plugin.sig",
		"Binary files /dev/null and b/ Lunch and Learn - HCDiag.pdf differ\n":                                                                                         " Lunch and Learn - HCDiag.pdf",
		"Binary files /dev/null and \"b/assets/retailers/ON-ikony-Platforma-ecom \\342\\200\\224 kopia.png\" differ\n":                                                "assets/retailers/ON-ikony-Platforma-ecom — kopia.png",
		"Binary files /dev/null and \"b/\\346\\267\\261\\345\\272\\246\\345\\255\\246\\344\\271\\240500\\351\\227\\256-Tan-00\\347\\233\\256\\345\\275\\225.docx\" differ\n": "深度学习500问-Tan-00目录.docx",
	}

	ctx := context.Background()
	for line, want := range expectedByLine {
		got, parsed := pathFromBinaryLine(ctx, []byte(line))
		if !parsed {
			t.Errorf("Failed to get path: %s", line)
		}
		if got != want {
			t.Errorf("Expected: %s, Got: %s", want, got)
		}
	}
}

func TestToFileLinePathParse(t *testing.T) {
ctx := context.Background()
cases := map[string]string{
"Binary files a/trufflehog_3.42.0_linux_arm64.tar.gz and /dev/null differ\n": "",
"Binary files /dev/null and b/plugin.sig differ\n": "plugin.sig",
"Binary files /dev/null and b/ Lunch and Learn - HCDiag.pdf differ\n": " Lunch and Learn - HCDiag.pdf",
"Binary files /dev/null and \"b/assets/retailers/ON-ikony-Platforma-ecom \\342\\200\\224 kopia.png\" differ\n": "assets/retailers/ON-ikony-Platforma-ecom \\342\\200\\224 kopia.png",
"+++ /dev/null\n": "",
"+++ b/embeds.xml\t\n": "embeds.xml",
"+++ \"b/C++/1 \\320\\243\\321\\200\\320\\276\\320\\272/B.c\"\t\n": "C++/1 Урок/B.c",
}

for name, expected := range cases {
filename, ok := pathFromBinaryLine([]byte(name))
filename, ok := pathFromToFileLine(ctx, []byte(name))
if !ok {
t.Errorf("Failed to get path: %s", name)
}
Expand Down Expand Up @@ -1336,7 +1357,29 @@ func TestMaxCommitSize(t *testing.T) {

}

const commitLog = `commit fd6e99e7a80199b76a694603be57c5ade1de18e7
const commitLog = `commit e50b135fd29e91b2fbb25923797f5ecffe59f359
Author: lionzxy <nikita@kulikof.ru>
Date: Wed Mar 1 18:20:04 2017 +0300
Все работает, но он не принимает :(
diff --git "a/C++/1 \320\243\321\200\320\276\320\272/.idea/workspace.xml" "b/C++/1 \320\243\321\200\320\276\320\272/.idea/workspace.xml"
index 85bfb17..89b08b5 100644
--- "a/C++/1 \320\243\321\200\320\276\320\272/.idea/workspace.xml"
+++ "b/C++/1 \320\243\321\200\320\276\320\272/.idea/workspace.xml"
@@ -29,8 +29,8 @@
<file leaf-file-name="CMakeLists.txt" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/CMakeLists.txt">
<provider selected="true" editor-type-id="text-editor">
- <state relative-caret-position="0">
- <caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
+ <state relative-caret-position="72">
+ <caret line="4" column="0" lean-forward="false" selection-start-line="4" selection-start-column="0" selection-end-line="4" selection-end-column="0" />
<folding />
</state>
</provider>
commit fd6e99e7a80199b76a694603be57c5ade1de18e7
Author: Jaliborc <jaliborc@gmail.com>
Date: Mon Apr 25 16:28:06 2011 +0100
Expand Down Expand Up @@ -1776,6 +1819,18 @@ func newStringBuilderValue(value string) strings.Builder {
// This throws a nasty panic if it's a top-level var.
func expectedDiffs() []*Diff {
return []*Diff{
{
PathB: "C++/1 \320\243\321\200\320\276\320\272/.idea/workspace.xml",
LineStart: 29,
Commit: &Commit{
Hash: "e50b135fd29e91b2fbb25923797f5ecffe59f359",
Author: "lionzxy <nikita@kulikof.ru>",
Date: newTime("Wed Mar 1 18:20:04 2017 +0300"),
Message: newStringBuilderValue("Все работает, но он не принимает :(\n"),
},
contentWriter: newBufferWithContent([]byte("\n\n\n <state relative-caret-position=\"72\">\n <caret line=\"4\" column=\"0\" lean-forward=\"false\" selection-start-line=\"4\" selection-start-column=\"0\" selection-end-line=\"4\" selection-end-column=\"0\" />\n\n\n\n")),
IsBinary: false,
},
{
PathB: "components/item.lua",
LineStart: 9,
Expand Down

0 comments on commit 43dcff0

Please sign in to comment.