Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[not-fixup] - Reduce memory consumption for Buffered File Writer #2377

Merged
merged 30 commits into from
Feb 6, 2024
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
5e5a57a
correctly use the buffered file writer
ahrav Feb 2, 2024
3b21ea3
use value from source
ahrav Feb 3, 2024
9aaf0c8
reorder fields
ahrav Feb 3, 2024
f7f890a
Merge branch 'main' into fixup-use-git-parser
ahrav Feb 3, 2024
1edc0a3
use only the DetectorKey as a map field
ahrav Feb 3, 2024
d530798
correctly use the buffered file writer
ahrav Feb 2, 2024
ea8bc7f
use value from source
ahrav Feb 3, 2024
3bc4482
reorder fields
ahrav Feb 3, 2024
510ce03
add tests and update
ahrav Feb 3, 2024
f6f1f44
Fix issue with buffer slices growing
ahrav Feb 4, 2024
7659980
fix test
ahrav Feb 4, 2024
b0dbbbf
Merge branch 'bug-unhashable-key' into fixup-use-git-parser
ahrav Feb 4, 2024
96ef60a
Merge branch 'fixup-use-git-parser' into optimize-buffered-file-writer
ahrav Feb 4, 2024
a07c39b
fix
ahrav Feb 4, 2024
e41cced
add singleton
ahrav Feb 4, 2024
8e3d56c
use shared pool
ahrav Feb 4, 2024
3e67392
optimize
ahrav Feb 5, 2024
d8e5589
rename and cleanup
ahrav Feb 5, 2024
3894900
use correct calculation to grow buffer
ahrav Feb 5, 2024
999d662
only grow if needed
ahrav Feb 5, 2024
54c8a65
merge
ahrav Feb 5, 2024
57a2305
merge main
ahrav Feb 5, 2024
faa423b
address comments
ahrav Feb 5, 2024
c8b4b02
remove unused
ahrav Feb 5, 2024
8190669
remove
ahrav Feb 5, 2024
c1cf67c
rip out Grow
ahrav Feb 5, 2024
2e74a4c
address comment
ahrav Feb 6, 2024
a2d23ef
use 2k default buffer
ahrav Feb 6, 2024
10532d5
Merge branch 'main' into optimize-buffered-file-writer
ahrav Feb 6, 2024
f1bd25d
update comment allow large buffers to be garbage collected
ahrav Feb 6, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions pkg/engine/ahocorasick/ahocorasickcore.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ type DetectorKey struct {
customDetectorName string
}

// Type reports the detectorspb.DetectorType this key identifies.
func (k DetectorKey) Type() detectorspb.DetectorType {
	return k.detectorType
}

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These detector changes seem irrelevant to this PR - is this other work leaking in?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

forgot to rebase

// AhoCorasickCore encapsulates the operations and data structures used for keyword matching via the
// Aho-Corasick algorithm. It is responsible for constructing and managing the trie for efficient
// substring searches, as well as mapping keywords to their associated detectors for rapid lookups.
Expand Down
8 changes: 4 additions & 4 deletions pkg/engine/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -598,8 +598,8 @@ func (e *Engine) detectorWorker(ctx context.Context) {
// by the same detector in the chunk. Exact matches on lookup indicate a duplicate secret for a detector
// in that chunk - which is expected and not malicious. Those intra-detector dupes are still verified.
type chunkSecretKey struct {
secret string
detectorInfo ahocorasick.DetectorInfo
secret string
detectorKey ahocorasick.DetectorKey
}

func likelyDuplicate(ctx context.Context, val chunkSecretKey, dupes map[chunkSecretKey]struct{}) bool {
Expand All @@ -615,7 +615,7 @@ func likelyDuplicate(ctx context.Context, val chunkSecretKey, dupes map[chunkSec

// If the detector type is the same, we don't need to compare the strings.
// These are not duplicates, and should be verified.
if val.detectorInfo.Type() == dupeKey.detectorInfo.Type() {
if val.detectorKey.Type() == dupeKey.detectorKey.Type() {
continue
}

Expand Down Expand Up @@ -674,7 +674,7 @@ func (e *Engine) verificationOverlapWorker(ctx context.Context) {
// Ex:
// - postman api key: PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r
// - malicious detector "api key": qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r
key := chunkSecretKey{secret: string(val), detectorInfo: detector}
key := chunkSecretKey{secret: string(val), detectorKey: detector.Key}
if _, ok := chunkSecrets[key]; ok {
continue
}
Expand Down
24 changes: 12 additions & 12 deletions pkg/engine/engine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -562,47 +562,47 @@ func TestLikelyDuplicate(t *testing.T) {
}{
{
name: "exact duplicate different detector",
val: chunkSecretKey{"PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorA},
val: chunkSecretKey{"PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorA.Key},
dupes: map[chunkSecretKey]struct{}{
{"PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorB}: {},
{"PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorB.Key}: {},
},
expected: true,
},
{
name: "non-duplicate length outside range",
val: chunkSecretKey{"short", detectorA},
val: chunkSecretKey{"short", detectorA.Key},
dupes: map[chunkSecretKey]struct{}{
{"muchlongerthanthevalstring", detectorB}: {},
{"muchlongerthanthevalstring", detectorB.Key}: {},
},
expected: false,
},
{
name: "similar within threshold",
val: chunkSecretKey{"PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorA},
val: chunkSecretKey{"PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorA.Key},
dupes: map[chunkSecretKey]struct{}{
{"qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorB}: {},
{"qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorB.Key}: {},
},
expected: true,
},
{
name: "similar outside threshold",
val: chunkSecretKey{"anotherkey", detectorA},
val: chunkSecretKey{"anotherkey", detectorA.Key},
dupes: map[chunkSecretKey]struct{}{
{"completelydifferent", detectorB}: {},
{"completelydifferent", detectorB.Key}: {},
},
expected: false,
},
{
name: "empty strings",
val: chunkSecretKey{"", detectorA},
dupes: map[chunkSecretKey]struct{}{{"", detectorB}: {}},
val: chunkSecretKey{"", detectorA.Key},
dupes: map[chunkSecretKey]struct{}{{"", detectorB.Key}: {}},
expected: true,
},
{
name: "similar within threshold same detector",
val: chunkSecretKey{"PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorA},
val: chunkSecretKey{"PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorA.Key},
dupes: map[chunkSecretKey]struct{}{
{"qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorA}: {},
{"qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorA.Key}: {},
},
expected: false,
},
Expand Down
43 changes: 28 additions & 15 deletions pkg/gitparse/gitparse.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (

"github.com/trufflesecurity/trufflehog/v3/pkg/common"
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
bufferedfilewriter "github.com/trufflesecurity/trufflehog/v3/pkg/writers/buffered_file_writer"
)

const (
Expand Down Expand Up @@ -100,21 +101,26 @@ func (b *buffer) String() (string, error) { return b.Buffer.String(), nil }
// The use of contentWriter enables the management of diff data either in memory or on disk,
// based on its size, optimizing resource usage and performance.
type Diff struct {
PathB string
LineStart int
PathB string
LineStart int
IsBinary bool

contentWriter contentWriter
IsBinary bool
}

type diffOption func(*Diff)

// withPathB returns an option that records the target file path (PathB) on the Diff.
func withPathB(p string) diffOption {
	return func(d *Diff) {
		d.PathB = p
	}
}

// withCustomContentWriter sets the contentWriter the Diff uses to store its
// content (e.g. an in-memory buffer or a buffered file writer).
func withCustomContentWriter(cr contentWriter) diffOption {
return func(d *Diff) { d.contentWriter = cr }
}

// NewDiff creates a new Diff with a threshold.
func NewDiff(opts ...diffOption) *Diff {
diff := new(Diff)
diff.contentWriter = newBuffer()
for _, opt := range opts {
opt(diff)
}
Expand Down Expand Up @@ -203,7 +209,8 @@ type Parser struct {
maxDiffSize int
maxCommitSize int
dateFormat string
contentWriter contentWriter

useCustomContentWriter bool
}

type ParseState int
Expand Down Expand Up @@ -250,11 +257,9 @@ func (state ParseState) String() string {
}[state]
}

// WithContentWriter sets the ContentWriter for the Parser.
func WithContentWriter(writer contentWriter) Option {
return func(parser *Parser) {
parser.contentWriter = writer
}
// UseCustomContentWriter enables the useCustomContentWriter option, causing the
// parser to store diff content with a custom (buffered file) writer instead of
// an in-memory buffer.
func UseCustomContentWriter() Option {
	return func(p *Parser) {
		p.useCustomContentWriter = true
	}
}

// WithMaxDiffSize sets maxDiffSize option. Diffs larger than maxDiffSize will
Expand Down Expand Up @@ -283,7 +288,6 @@ func NewParser(options ...Option) *Parser {
dateFormat: defaultDateFormat,
maxDiffSize: defaultMaxDiffSize,
maxCommitSize: defaultMaxCommitSize,
contentWriter: newBuffer(),
}
for _, option := range options {
option(parser)
Expand Down Expand Up @@ -387,7 +391,9 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch
totalLogSize int
)
var latestState = Initial
currentDiff := NewDiff()

writer := c.contentWriter()
currentDiff := NewDiff(withCustomContentWriter(writer))

defer common.RecoverWithExit(ctx)
defer close(commitChan)
Expand Down Expand Up @@ -430,7 +436,7 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch
totalLogSize += currentCommit.Size
}
// Create a new currentDiff and currentCommit
currentDiff = NewDiff()
currentDiff = NewDiff(withCustomContentWriter(c.contentWriter()))
currentCommit = &Commit{Message: strings.Builder{}}
// Check that the commit line contains a hash and set it.
if len(line) >= 47 {
Expand Down Expand Up @@ -498,7 +504,7 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch
currentCommit.Message.WriteString(oldCommit.Message.String())
}
}
currentDiff = NewDiff()
currentDiff = NewDiff(withCustomContentWriter(c.contentWriter()))
case isModeLine(isStaged, latestState, line):
latestState = ModeLine
// NoOp
Expand Down Expand Up @@ -538,7 +544,7 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch
}
currentCommit.Diffs = append(currentCommit.Diffs, *currentDiff)
}
currentDiff = NewDiff(withPathB(currentDiff.PathB))
currentDiff = NewDiff(withCustomContentWriter(c.contentWriter()), withPathB(currentDiff.PathB))

words := bytes.Split(line, []byte(" "))
if len(words) >= 3 {
Expand Down Expand Up @@ -606,6 +612,13 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch
ctx.Logger().V(2).Info("finished parsing git log.", "total_log_size", totalLogSize)
}

// contentWriter returns the writer used to hold diff content: a buffered file
// writer when the custom-writer option is enabled, otherwise an in-memory buffer.
func (c *Parser) contentWriter() contentWriter {
	if !c.useCustomContentWriter {
		return newBuffer()
	}
	return bufferedfilewriter.New()
}

func isMergeLine(isStaged bool, latestState ParseState, line []byte) bool {
if isStaged || latestState != CommitLine {
return false
Expand Down
5 changes: 2 additions & 3 deletions pkg/sources/git/git.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ import (
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/sourcespb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sanitizer"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
bufferedfilewriter "github.com/trufflesecurity/trufflehog/v3/pkg/writers/buffered_file_writer"
)

const SourceType = sourcespb.SourceType_SOURCE_TYPE_GIT
Expand Down Expand Up @@ -99,7 +98,7 @@ type Config struct {
func NewGit(config *Config) *Git {
var parser *gitparse.Parser
if config.UseCustomContentWriter {
parser = gitparse.NewParser(gitparse.WithContentWriter(bufferedfilewriter.New()))
parser = gitparse.NewParser(gitparse.UseCustomContentWriter())
} else {
parser = gitparse.NewParser()
}
Expand Down Expand Up @@ -522,7 +521,7 @@ func (s *Git) ScanCommits(ctx context.Context, repo *git.Repository, path string
repoCtx = context.WithValue(ctx, "repo", path)
}

commitChan, err := gitparse.NewParser().RepoPath(repoCtx, path, scanOptions.HeadHash, scanOptions.BaseHash == "", scanOptions.ExcludeGlobs, scanOptions.Bare)
commitChan, err := s.parser.RepoPath(repoCtx, path, scanOptions.HeadHash, scanOptions.BaseHash == "", scanOptions.ExcludeGlobs, scanOptions.Bare)
if err != nil {
return err
}
Expand Down
Loading
Loading