Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make Git work with escaped unicode characters #2585

Merged
merged 1 commit into from
Mar 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 67 additions & 16 deletions pkg/gitparse/gitparse.go
Original file line number Diff line number Diff line change
Expand Up @@ -424,10 +424,10 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, diffChan chan
case isBinaryLine(latestState, line):
latestState = BinaryFileLine

path, ok := pathFromBinaryLine(line)
path, ok := pathFromBinaryLine(ctx, line)
if !ok {
err = fmt.Errorf(`expected line to match 'Binary files a/fileA and b/fileB differ', got "%s"`, line)
ctx.Logger().Error(err, "Failed to parse binary file line")
ctx.Logger().Error(err, "Failed to parse BinaryFileLine")
latestState = ParseFailure
continue
}
Expand All @@ -443,8 +443,15 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, diffChan chan
case isToFileLine(latestState, line):
latestState = ToFileLine

// TODO: Is this fix still required?
currentDiff.PathB = strings.TrimRight(strings.TrimRight(string(line[6:]), "\n"), "\t") // Trim the newline and tab characters. https://github.com/trufflesecurity/trufflehog/issues/1060
path, ok := pathFromToFileLine(ctx, line)
if !ok {
err = fmt.Errorf(`expected line to match format '+++ b/path/to/file.go', got "%s"`, line)
ctx.Logger().Error(err, "Failed to parse ToFileLine")
latestState = ParseFailure
continue
}

currentDiff.PathB = path
case isHunkLineNumberLine(latestState, line):
latestState = HunkLineNumberLine

Expand Down Expand Up @@ -681,27 +688,35 @@ func isBinaryLine(latestState ParseState, line []byte) bool {
}

// Get the b/ file path. Ignoring the edge case of files having ` and b/` in the name for simplicity.
func pathFromBinaryLine(line []byte) (string, bool) {
func pathFromBinaryLine(ctx context.Context, line []byte) (string, bool) {
if bytes.Contains(line, []byte("and /dev/null")) {
return "", true
}

_, after, ok := bytes.Cut(line, []byte(" and b/"))
if ok {
var path string
if _, after, ok := bytes.Cut(line, []byte(" and b/")); ok {
// drop the " differ\n"
return string(after[:len(after)-8]), true
}
path = string(after[:len(after)-8])
} else if _, after, ok = bytes.Cut(line, []byte(` and "b/`)); ok {
// Edge case where the path is quoted.
// https://github.com/trufflesecurity/trufflehog/issues/2384

// Edge case where the path is quoted.
// https://github.com/trufflesecurity/trufflehog/issues/2384
_, after, ok = bytes.Cut(line, []byte(` and "b/`))
if ok {
// drop the `" differ\n`
return string(after[:len(after)-9]), true
path = string(after[:len(after)-9])
} else {
// Unknown format.
return "", false
}

// Handle escaped characters in the path, such as "\342\200\224" instead of "—".
// See https://github.com/trufflesecurity/trufflehog/issues/2418
unicodePath, err := strconv.Unquote(`"` + path + `"`)
rgmz marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
ctx.Logger().Error(err, "failed to decode path", "path", path)
return path, true
}

// Unknown format.
return "", false
return unicodePath, true
}

// --- a/internal/addrs/move_endpoint_module.go
Expand All @@ -727,6 +742,42 @@ func isToFileLine(latestState ParseState, line []byte) bool {
return false
}

// pathFromToFileLine extracts the destination ("b/") path from a "+++" diff
// header line.
//
// It returns:
//   - ("", true) when the line is "+++ /dev/null" (the file was deleted);
//   - (path, true) when the line has the form `+++ b/<path>` or the quoted
//     form `+++ "b/<path>"`;
//   - ("", false) for any other format.
//
// Escape sequences in the path (e.g. "\342\200\224") are decoded with
// strconv.Unquote. If decoding fails, the error is logged and the path is
// returned as-is, still escaped.
func pathFromToFileLine(ctx context.Context, line []byte) (string, bool) {
	const (
		plainPrefix  = "+++ b/"
		quotedPrefix = `+++ "b/`
	)

	// Normalize paths, as they can end in `\n`, `\t\n`, etc.
	// See https://github.com/trufflesecurity/trufflehog/issues/1060
	line = bytes.TrimSpace(line)

	// File was deleted.
	if string(line) == "+++ /dev/null" {
		return "", true
	}

	var path string
	switch {
	case bytes.HasPrefix(line, []byte(plainPrefix)):
		// Anchor the match at the start of the line so that a path which
		// merely contains "+++ b/" is not split in the wrong place.
		path = string(line[len(plainPrefix):])
	case bytes.HasPrefix(line, []byte(quotedPrefix)):
		// Edge case where the path is quoted.
		// e.g., `+++ "b/C++/1 \320\243\321\200\320\276\320\272/B.c"`
		//
		// Drop the trailing `"` only when it is actually present. This also
		// avoids an out-of-range slice on a truncated line such as `+++ "b/`.
		path = string(bytes.TrimSuffix(line[len(quotedPrefix):], []byte(`"`)))
	default:
		// Unknown format.
		return "", false
	}

	// Handle escaped characters in the path, such as "\342\200\224" instead of "—".
	// See https://github.com/trufflesecurity/trufflehog/issues/2418
	unicodePath, err := strconv.Unquote(`"` + path + `"`)
	if err != nil {
		// Best effort: keep the escaped path rather than failing the parse.
		ctx.Logger().Error(err, "failed to decode path", "path", path)
		return path, true
	}

	return unicodePath, true
}

// @@ -298 +298 @@ func maxRetryErrorHandler(resp *http.Response, err error, numTries int)
func isHunkLineNumberLine(latestState ParseState, line []byte) bool {
if !(latestState == ToFileLine || latestState == HunkContentLine) {
Expand Down
67 changes: 61 additions & 6 deletions pkg/gitparse/gitparse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -589,15 +589,36 @@ func TestLineChecksNoStaged(t *testing.T) {
}

// TestBinaryPathParse verifies that pathFromBinaryLine returns the expected
// b/ path for "Binary files ... differ" lines: a /dev/null target, plain
// paths, a path containing " and ", and quoted paths with octal-escaped
// characters.
func TestBinaryPathParse(t *testing.T) {
	type binaryLineCase struct {
		line string
		want string
	}

	tests := []binaryLineCase{
		{
			line: "Binary files a/trufflehog_3.42.0_linux_arm64.tar.gz and /dev/null differ\n",
			want: "",
		},
		{
			line: "Binary files /dev/null and b/plugin.sig differ\n",
			want: "plugin.sig",
		},
		{
			line: "Binary files /dev/null and b/ Lunch and Learn - HCDiag.pdf differ\n",
			want: " Lunch and Learn - HCDiag.pdf",
		},
		{
			line: "Binary files /dev/null and \"b/assets/retailers/ON-ikony-Platforma-ecom \\342\\200\\224 kopia.png\" differ\n",
			want: "assets/retailers/ON-ikony-Platforma-ecom — kopia.png",
		},
		{
			line: "Binary files /dev/null and \"b/\\346\\267\\261\\345\\272\\246\\345\\255\\246\\344\\271\\240500\\351\\227\\256-Tan-00\\347\\233\\256\\345\\275\\225.docx\" differ\n",
			want: "深度学习500问-Tan-00目录.docx",
		},
	}

	ctx := context.Background()
	for _, tc := range tests {
		got, ok := pathFromBinaryLine(ctx, []byte(tc.line))
		if !ok {
			t.Errorf("Failed to get path: %s", tc.line)
		}
		if got != tc.want {
			t.Errorf("Expected: %s, Got: %s", tc.want, got)
		}
	}
}

func TestToFileLinePathParse(t *testing.T) {
ctx := context.Background()
cases := map[string]string{
"Binary files a/trufflehog_3.42.0_linux_arm64.tar.gz and /dev/null differ\n": "",
"Binary files /dev/null and b/plugin.sig differ\n": "plugin.sig",
"Binary files /dev/null and b/ Lunch and Learn - HCDiag.pdf differ\n": " Lunch and Learn - HCDiag.pdf",
"Binary files /dev/null and \"b/assets/retailers/ON-ikony-Platforma-ecom \\342\\200\\224 kopia.png\" differ\n": "assets/retailers/ON-ikony-Platforma-ecom \\342\\200\\224 kopia.png",
"+++ /dev/null\n": "",
"+++ b/embeds.xml\t\n": "embeds.xml",
"+++ \"b/C++/1 \\320\\243\\321\\200\\320\\276\\320\\272/B.c\"\t\n": "C++/1 Урок/B.c",
}

for name, expected := range cases {
filename, ok := pathFromBinaryLine([]byte(name))
filename, ok := pathFromToFileLine(ctx, []byte(name))
if !ok {
t.Errorf("Failed to get path: %s", name)
}
Expand Down Expand Up @@ -1336,7 +1357,29 @@ func TestMaxCommitSize(t *testing.T) {

}

const commitLog = `commit fd6e99e7a80199b76a694603be57c5ade1de18e7
const commitLog = `commit e50b135fd29e91b2fbb25923797f5ecffe59f359
Author: lionzxy <nikita@kulikof.ru>
Date: Wed Mar 1 18:20:04 2017 +0300

Все работает, но он не принимает :(

diff --git "a/C++/1 \320\243\321\200\320\276\320\272/.idea/workspace.xml" "b/C++/1 \320\243\321\200\320\276\320\272/.idea/workspace.xml"
index 85bfb17..89b08b5 100644
--- "a/C++/1 \320\243\321\200\320\276\320\272/.idea/workspace.xml"
+++ "b/C++/1 \320\243\321\200\320\276\320\272/.idea/workspace.xml"
@@ -29,8 +29,8 @@
<file leaf-file-name="CMakeLists.txt" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/CMakeLists.txt">
<provider selected="true" editor-type-id="text-editor">
- <state relative-caret-position="0">
- <caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
+ <state relative-caret-position="72">
+ <caret line="4" column="0" lean-forward="false" selection-start-line="4" selection-start-column="0" selection-end-line="4" selection-end-column="0" />
<folding />
</state>
</provider>

commit fd6e99e7a80199b76a694603be57c5ade1de18e7
Author: Jaliborc <jaliborc@gmail.com>
Date: Mon Apr 25 16:28:06 2011 +0100

Expand Down Expand Up @@ -1776,6 +1819,18 @@ func newStringBuilderValue(value string) strings.Builder {
// This throws a nasty panic if it's a top-level var.
func expectedDiffs() []*Diff {
return []*Diff{
{
PathB: "C++/1 \320\243\321\200\320\276\320\272/.idea/workspace.xml",
LineStart: 29,
Commit: &Commit{
Hash: "e50b135fd29e91b2fbb25923797f5ecffe59f359",
Author: "lionzxy <nikita@kulikof.ru>",
Date: newTime("Wed Mar 1 18:20:04 2017 +0300"),
Message: newStringBuilderValue("Все работает, но он не принимает :(\n"),
},
contentWriter: newBufferWithContent([]byte("\n\n\n <state relative-caret-position=\"72\">\n <caret line=\"4\" column=\"0\" lean-forward=\"false\" selection-start-line=\"4\" selection-start-column=\"0\" selection-end-line=\"4\" selection-end-column=\"0\" />\n\n\n\n")),
IsBinary: false,
},
{
PathB: "components/item.lua",
LineStart: 9,
Expand Down
Loading