Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scan commit metadata #2713

Merged
merged 1 commit into from
Apr 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 116 additions & 21 deletions pkg/gitparse/gitparse.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ import (

const (
// defaultDateFormat is the standard date format for git.
defaultDateFormat = "Mon Jan 02 15:04:05 2006 -0700"
defaultDateFormat = "Mon Jan 2 15:04:05 2006 -0700"
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this is a bug as git log appears to use d and not dd formatting. It's unclear if or how frequently this failure occurred, as the logging in isDateLine/isAuthorDateLine was level 2.


// defaultMaxDiffSize is the maximum size for a diff. Larger diffs will be cut off.
defaultMaxDiffSize = 2 * 1024 * 1024 * 1024 // 2GB
Expand Down Expand Up @@ -106,11 +106,12 @@ func (d *Diff) finalize() error {

// Commit contains commit header info and diffs.
type Commit struct {
Hash string
Author string
Date time.Time
Message strings.Builder
Size int // in bytes
Hash string
Author string
Committer string
Date time.Time
Message strings.Builder
Size int // in bytes

hasDiffs bool
}
Expand All @@ -131,10 +132,15 @@ const (
CommitLine
MergeLine
AuthorLine
DateLine
AuthorDateLine
CommitterLine
CommitterDateLine
MessageStartLine
MessageLine
MessageEndLine
NotesStartLine
NotesLine
NotesEndLine
DiffLine
ModeLine
IndexLine
Expand All @@ -152,10 +158,15 @@ func (state ParseState) String() string {
"CommitLine",
"MergeLine",
"AuthorLine",
"DateLine",
"AuthorDateLine",
"CommitterLine",
"CommitterDateLine",
"MessageStartLine",
"MessageLine",
"MessageEndLine",
"NotesStartLine",
"NotesLine",
"NotesEndLine",
"DiffLine",
"ModeLine",
"IndexLine",
Expand Down Expand Up @@ -209,7 +220,15 @@ func NewParser(options ...Option) *Parser {
// RepoPath parses the output of the `git log` command for the `source` path.
// The Diff chan will return diffs in the order they are parsed from the log.
func (c *Parser) RepoPath(ctx context.Context, source string, head string, abbreviatedLog bool, excludedGlobs []string, isBare bool) (chan *Diff, error) {
args := []string{"-C", source, "log", "-p", "--full-history", "--date=format:%a %b %d %H:%M:%S %Y %z"}
args := []string{
"-C", source,
"log",
"--patch", // https://git-scm.com/docs/git-log#Documentation/git-log.txt---patch
"--full-history",
"--date=format:%a %b %d %H:%M:%S %Y %z",
"--pretty=fuller", // https://git-scm.com/docs/git-log#_pretty_formats
rgmz marked this conversation as resolved.
Show resolved Hide resolved
"--notes", // https://git-scm.com/docs/git-log#Documentation/git-log.txt---notesltrefgt
}
if abbreviatedLog {
args = append(args, "--diff-filter=AM")
}
Expand Down Expand Up @@ -373,16 +392,23 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, diffChan chan
latestState = MergeLine
case isAuthorLine(isStaged, latestState, line):
latestState = AuthorLine
currentCommit.Author = strings.TrimRight(string(line[8:]), "\n")

case isDateLine(isStaged, latestState, line):
latestState = DateLine
currentCommit.Author = strings.TrimSpace(string(line[8:]))
case isAuthorDateLine(isStaged, latestState, line):
latestState = AuthorDateLine

date, err := time.Parse(c.dateFormat, strings.TrimSpace(string(line[6:])))
date, err := time.Parse(c.dateFormat, strings.TrimSpace(string(line[12:])))
if err != nil {
ctx.Logger().V(2).Info("Could not parse date from git stream.", "error", err)
ctx.Logger().Error(err, "failed to parse commit date", "commit", currentCommit.Hash, "latestState", latestState.String())
latestState = ParseFailure
continue
}
currentCommit.Date = date
case isCommitterLine(isStaged, latestState, line):
latestState = CommitterLine
currentCommit.Committer = strings.TrimSpace(string(line[8:]))
case isCommitterDateLine(isStaged, latestState, line):
latestState = CommitterDateLine
rgmz marked this conversation as resolved.
Show resolved Hide resolved
// NoOp
case isMessageStartLine(isStaged, latestState, line):
latestState = MessageStartLine
// NoOp
Expand All @@ -393,6 +419,17 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, diffChan chan
case isMessageEndLine(isStaged, latestState, line):
latestState = MessageEndLine
// NoOp
case isNotesStartLine(isStaged, latestState, line):
latestState = NotesStartLine

currentCommit.Message.WriteString("\n")
currentCommit.Message.Write(line)
case isNotesLine(isStaged, latestState, line):
latestState = NotesLine
currentCommit.Message.Write(line[4:]) // Notes are indented by 4 spaces.
case isNotesEndLine(isStaged, latestState, line):
latestState = NotesEndLine
// NoOp
case isDiffLine(isStaged, latestState, line):
latestState = DiffLine

Expand Down Expand Up @@ -577,20 +614,42 @@ func isAuthorLine(isStaged bool, latestState ParseState, line []byte) bool {
return false
}

// Date: Tue Aug 10 15:20:40 2021 +0100
func isDateLine(isStaged bool, latestState ParseState, line []byte) bool {
// AuthorDate: Tue Aug 10 15:20:40 2021 +0100
func isAuthorDateLine(isStaged bool, latestState ParseState, line []byte) bool {
if isStaged || latestState != AuthorLine {
return false
}
if len(line) > 7 && bytes.Equal(line[:5], []byte("Date:")) {
if len(line) > 10 && bytes.Equal(line[:11], []byte("AuthorDate:")) {
return true
}
return false
}

// Commit: Bill Rich <bill.rich@trufflesec.com>
func isCommitterLine(isStaged bool, latestState ParseState, line []byte) bool {
if isStaged || latestState != AuthorDateLine {
return false
}
if len(line) > 8 && bytes.Equal(line[:7], []byte("Commit:")) {
return true
}
return false
}

// Line directly after Date with only a newline.
// CommitDate: Wed Apr 17 19:59:28 2024 -0400
func isCommitterDateLine(isStaged bool, latestState ParseState, line []byte) bool {
if isStaged || latestState != CommitterLine {
return false
}
if len(line) > 10 && bytes.Equal(line[:11], []byte("CommitDate:")) {
return true
}
return false
}

// Line directly after CommitterDate with only a newline.
func isMessageStartLine(isStaged bool, latestState ParseState, line []byte) bool {
if isStaged || latestState != DateLine {
if isStaged || latestState != CommitterDateLine {
return false
}
// TODO: Improve the implementation of this and isMessageEndLine
Expand Down Expand Up @@ -622,15 +681,51 @@ func isMessageEndLine(isStaged bool, latestState ParseState, line []byte) bool {
return false
}

// `Notes:` or `Notes (context):`
// See https://tylercipriani.com/blog/2022/11/19/git-notes-gits-coolest-most-unloved-feature/
func isNotesStartLine(isStaged bool, latestState ParseState, line []byte) bool {
if isStaged || latestState != MessageEndLine {
return false
}
if len(line) > 5 && bytes.Equal(line[:5], []byte("Notes")) {
return true
}
return false
}

// Line after NotesStartLine that starts with 4 spaces
func isNotesLine(isStaged bool, latestState ParseState, line []byte) bool {
if isStaged || !(latestState == NotesStartLine || latestState == NotesLine) {
return false
}
if len(line) > 4 && bytes.Equal(line[:4], []byte(" ")) {
return true
}
return false
}

// Line directly after NotesLine with only a newline.
func isNotesEndLine(isStaged bool, latestState ParseState, line []byte) bool {
if isStaged || latestState != NotesLine {
return false
}
if len(strings.TrimRight(string(line[:]), "\r\n")) == 0 {
return true
}
return false
}

// diff --git a/internal/addrs/move_endpoint_module.go b/internal/addrs/move_endpoint_module.go
func isDiffLine(isStaged bool, latestState ParseState, line []byte) bool {
if !(latestState == MessageStartLine || // Empty commit messages can go from MessageStart->Diff
latestState == MessageEndLine ||
latestState == NotesEndLine ||
latestState == BinaryFileLine ||
latestState == ModeLine ||
latestState == IndexLine ||
latestState == HunkContentLine ||
latestState == ParseFailure) {
if latestState == Initial && !isStaged {
if !(isStaged && latestState == Initial) {
rgmz marked this conversation as resolved.
Show resolved Hide resolved
return false
}
}
Expand Down
Loading
Loading