From eca32b27e26e82fb46a3e32f801afa52c4e0037c Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Thu, 25 Jan 2024 08:00:45 -0800 Subject: [PATCH 01/49] draft reverify chunks --- pkg/engine/ahocorasickcore.go | 10 ++- pkg/engine/engine.go | 140 ++++++++++++++++++++++++++++++++-- 2 files changed, 141 insertions(+), 9 deletions(-) diff --git a/pkg/engine/ahocorasickcore.go b/pkg/engine/ahocorasickcore.go index 19fda9d90f35..25055211cd16 100644 --- a/pkg/engine/ahocorasickcore.go +++ b/pkg/engine/ahocorasickcore.go @@ -4,6 +4,7 @@ import ( "strings" ahocorasick "github.com/BobuSumisu/aho-corasick" + "github.com/trufflesecurity/trufflehog/v3/pkg/custom_detectors" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors" "github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb" @@ -65,12 +66,17 @@ func NewAhoCorasickCore(allDetectors []detectors.Detector) *AhoCorasickCore { // PopulateMatchingDetectors populates the given detector slice with all the detectors matching the // provided input. This method populates an existing map rather than allocating a new one because // it will be called once per chunk and that many allocations has a noticeable performance cost. -func (ac *AhoCorasickCore) PopulateMatchingDetectors(chunkData string, detectors map[DetectorKey]detectors.Detector) { +func (ac *AhoCorasickCore) PopulateMatchingDetectors(chunkData string, dts map[DetectorKey]detectors.Detector) []detectors.Detector { + matches := ac.prefilter.MatchString(strings.ToLower(chunkData)) + d := make([]detectors.Detector, 0, len(matches)) for _, m := range ac.prefilter.MatchString(strings.ToLower(chunkData)) { for _, k := range ac.keywordsToDetectors[m.MatchString()] { - detectors[k] = ac.detectorsByKey[k] + dts[k] = ac.detectorsByKey[k] + d = append(d, ac.detectorsByKey[k]) } } + + return d } // createDetectorKey creates a unique key for each detector from its type, version, and, for diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 38f0da44371a..eeca994c135a 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -68,12 +68,14 @@ type Engine struct { ahoCorasickCore *AhoCorasickCore // Engine synchronization primitives. - sourceManager *sources.SourceManager - results chan detectors.ResultWithMetadata - detectableChunksChan chan detectableChunk - workersWg sync.WaitGroup - wgDetectorWorkers sync.WaitGroup - WgNotifier sync.WaitGroup + sourceManager *sources.SourceManager + results chan detectors.ResultWithMetadata + detectableChunksChan chan detectableChunk + reverifiableChunksChan chan reVerifiableChunk + workersWg sync.WaitGroup + reverifiersWg sync.WaitGroup + wgDetectorWorkers sync.WaitGroup + WgNotifier sync.WaitGroup // Runtime information. metrics runtimeMetrics @@ -303,6 +305,7 @@ func (e *Engine) initialize(ctx context.Context, options ...Option) error { // Channels are used for communication between different parts of the engine, // ensuring that data flows smoothly without race conditions. e.detectableChunksChan = make(chan detectableChunk, defaultChannelBuffer) + e.reverifiableChunksChan = make(chan reVerifiableChunk, defaultChannelBuffer) e.results = make(chan detectors.ResultWithMetadata, defaultChannelBuffer) e.dedupeCache = cache e.printer = new(output.PlainPrinter) @@ -392,6 +395,18 @@ func (e *Engine) startWorkers(ctx context.Context) { }() } + // reverifiers... + ctx.Logger().V(2).Info("starting reverifier workers", "count", e.concurrency) + for worker := uint64(0); worker < uint64(e.concurrency); worker++ { + e.reverifiersWg.Add(1) + go func() { + ctx := context.WithValue(ctx, "secret_worker_id", common.RandomID(5)) + defer common.Recover(ctx) + defer e.reverifiersWg.Done() + e.reverifierWorker(ctx) + }() + } + // Notifier workers communicate detected issues to the user or any downstream systems. // We want 1/4th of the notifier workers as the number of scanner workers. const notifierWorkerRatio = 4 @@ -420,12 +435,20 @@ func (e *Engine) Finish(ctx context.Context) error { err := e.sourceManager.Wait() e.workersWg.Wait() // Wait for the workers to finish scanning chunks. + + close(e.reverifiableChunksChan) + e.reverifiersWg.Wait() + close(e.detectableChunksChan) e.wgDetectorWorkers.Wait() // Wait for the detector workers to finish detecting chunks. close(e.results) // Detector workers are done, close the results channel and call it a day. e.WgNotifier.Wait() // Wait for the notifier workers to finish notifying results. + fmt.Printf("counter: %d\n", counter) + // fmt.Printf("total: %d\n", total) + // fmt.Printf("max: %d\n", max) + if err := cleantemp.CleanTempArtifacts(ctx); err != nil { ctx.Logger().Error(err, "error cleaning temp artifacts") } @@ -458,8 +481,20 @@ type detectableChunk struct { wgDoneFn func() } +type reVerifiableChunk struct { + chunk sources.Chunk + decoder detectorspb.DecoderType + detectors []detectors.Detector + reverifyWgDoneFn func() +} + +var counter uint64 +var total uint64 +var max uint64 + func (e *Engine) detectorWorker(ctx context.Context) { var wgDetect sync.WaitGroup + var wgReverify sync.WaitGroup // Reuse the same map to avoid allocations. const avgDetectorsPerChunk = 2 @@ -474,7 +509,41 @@ func (e *Engine) detectorWorker(ctx context.Context) { continue } - e.ahoCorasickCore.PopulateMatchingDetectors(string(decoded.Chunk.Data), chunkSpecificDetectors) + matchingDetectors := e.ahoCorasickCore.PopulateMatchingDetectors(string(decoded.Chunk.Data), chunkSpecificDetectors) + // newValue := uint64(len(chunkSpecificDetectors)) + // for { + // currentMax := atomic.LoadUint64(&max) // Read current max value atomically + // if newValue <= currentMax { + // // New value is not greater, no need to swap, exit loop + // break + // } + // // Attempt to swap if currentMax is still the actual max value + // if atomic.CompareAndSwapUint64(&max, currentMax, newValue) { + // // Swap was successful, exit loop + // break + // } + // // If swap failed, it means another goroutine updated max in the meantime. + // // Loop will re-attempt to load the new max and compare again. + // } + // if newValue > 1 { + // atomic.AddUint64(&counter, 1) + // atomic.AddUint64(&total, uint64(len(chunkSpecificDetectors))) + // atomic.CompareAndSwapUint64(&max, atomic.LoadUint64(&max), uint64(len(chunkSpecificDetectors))) + // } + if len(chunkSpecificDetectors) > 1 { + wgReverify.Add(1) + e.reverifiableChunksChan <- reVerifiableChunk{ + chunk: *decoded.Chunk, + detectors: matchingDetectors, + decoder: decoded.DecoderType, + reverifyWgDoneFn: wgReverify.Done, + } + // Empty the map. + for k := range chunkSpecificDetectors { + delete(chunkSpecificDetectors, k) + } + continue + } for k, detector := range chunkSpecificDetectors { decoded.Chunk.Verify = e.verify @@ -491,10 +560,67 @@ func (e *Engine) detectorWorker(ctx context.Context) { } atomic.AddUint64(&e.metrics.ChunksScanned, 1) } + + wgReverify.Wait() wgDetect.Wait() ctx.Logger().V(4).Info("finished scanning chunks") } +func (e *Engine) reverifierWorker(ctx context.Context) { + var wgDetect sync.WaitGroup + // Reuse the same map to avoid allocations. + const avg = 20 + dupes := make(map[string]struct{}, avg) + +nextChunk: + for chunk := range e.reverifiableChunksChan { + for _, detector := range chunk.detectors { + // DO NOT VERIFY at this stage of the pipeline. + results, err := detector.FromData(ctx, false, chunk.chunk.Data) + if err != nil { + ctx.Logger().Error(err, "error verifying chunk") + } + for _, res := range results { + var val []byte + if res.RawV2 != nil { + val = res.RawV2 + } else { + val = res.Raw + } + + if _, ok := dupes[string(val)]; ok { + // This indicates that the same secret was found by multiple detectors. + // We should NOT continue to process this chunk. + atomic.AddUint64(&counter, 1) + chunk.reverifyWgDoneFn() + continue nextChunk + } + dupes[string(res.Raw)] = struct{}{} + } + } + + for _, detector := range chunk.detectors { + wgDetect.Add(1) + chunk.chunk.Verify = e.verify + e.detectableChunksChan <- detectableChunk{ + chunk: chunk.chunk, + detector: detector, + decoder: chunk.decoder, + wgDoneFn: wgDetect.Done, + } + } + + // Empty the dupes map. + for k := range dupes { + delete(dupes, k) + } + chunk.reverifyWgDoneFn() + } + + wgDetect.Wait() + ctx.Logger().V(4).Info("finished reverifying chunks") +} + func (e *Engine) detectChunks(ctx context.Context) { for data := range e.detectableChunksChan { e.detectChunk(ctx, data) From c7691aee84df455e6af6c58603e88ec5c9ade87f Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Thu, 25 Jan 2024 08:16:29 -0800 Subject: [PATCH 02/49] remove --- pkg/engine/engine.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index eeca994c135a..f332c5bb57c5 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -446,8 +446,6 @@ func (e *Engine) Finish(ctx context.Context) error { e.WgNotifier.Wait() // Wait for the notifier workers to finish notifying results. fmt.Printf("counter: %d\n", counter) - // fmt.Printf("total: %d\n", total) - // fmt.Printf("max: %d\n", max) if err := cleantemp.CleanTempArtifacts(ctx); err != nil { ctx.Logger().Error(err, "error cleaning temp artifacts") @@ -489,8 +487,6 @@ type reVerifiableChunk struct { } var counter uint64 -var total uint64 -var max uint64 func (e *Engine) detectorWorker(ctx context.Context) { var wgDetect sync.WaitGroup From 3a7a308c65188832b7823de5543c3cc38adc8f98 Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Thu, 25 Jan 2024 08:17:07 -0800 Subject: [PATCH 03/49] remove --- pkg/engine/engine.go | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index f332c5bb57c5..b7bb43c094e4 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -506,26 +506,6 @@ func (e *Engine) detectorWorker(ctx context.Context) { } matchingDetectors := e.ahoCorasickCore.PopulateMatchingDetectors(string(decoded.Chunk.Data), chunkSpecificDetectors) - // newValue := uint64(len(chunkSpecificDetectors)) - // for { - // currentMax := atomic.LoadUint64(&max) // Read current max value atomically - // if newValue <= currentMax { - // // New value is not greater, no need to swap, exit loop - // break - // } - // // Attempt to swap if currentMax is still the actual max value - // if atomic.CompareAndSwapUint64(&max, currentMax, newValue) { - // // Swap was successful, exit loop - // break - // } - // // If swap failed, it means another goroutine updated max in the meantime. - // // Loop will re-attempt to load the new max and compare again. - // } - // if newValue > 1 { - // atomic.AddUint64(&counter, 1) - // atomic.AddUint64(&total, uint64(len(chunkSpecificDetectors))) - // atomic.CompareAndSwapUint64(&max, atomic.LoadUint64(&max), uint64(len(chunkSpecificDetectors))) - // } if len(chunkSpecificDetectors) > 1 { wgReverify.Add(1) e.reverifiableChunksChan <- reVerifiableChunk{ From b4deb04e6a06b668ded4e2068f4ed499c5ac2755 Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Thu, 25 Jan 2024 08:46:44 -0800 Subject: [PATCH 04/49] reduce dupe map cap --- pkg/engine/engine.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index b7bb43c094e4..74414eab478e 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -545,7 +545,7 @@ func (e *Engine) detectorWorker(ctx context.Context) { func (e *Engine) reverifierWorker(ctx context.Context) { var wgDetect sync.WaitGroup // Reuse the same map to avoid allocations. - const avg = 20 + const avg = 8 dupes := make(map[string]struct{}, avg) nextChunk: From 89e0330bf92de413d21cb3d44ae548d37184355f Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Thu, 25 Jan 2024 08:55:24 -0800 Subject: [PATCH 05/49] do not verify chunk --- pkg/engine/engine.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 74414eab478e..258102fe5893 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -566,9 +566,17 @@ nextChunk: if _, ok := dupes[string(val)]; ok { // This indicates that the same secret was found by multiple detectors. - // We should NOT continue to process this chunk. + // We should NOT VERIFY this chunk's data. atomic.AddUint64(&counter, 1) chunk.reverifyWgDoneFn() + wgDetect.Add(1) + chunk.chunk.Verify = false // DO NOT VERIFY + e.detectableChunksChan <- detectableChunk{ + chunk: chunk.chunk, + detector: detector, + decoder: chunk.decoder, + wgDoneFn: wgDetect.Done, + } continue nextChunk } dupes[string(res.Raw)] = struct{}{} From 59421a1d1cd41d4f7c0d4cd43902b24ce8139272 Mon Sep 17 00:00:00 2001 From: Zachary Rice Date: Thu, 25 Jan 2024 14:36:58 -0600 Subject: [PATCH 06/49] cli arg and use val for dupe lut --- main.go | 2 ++ pkg/engine/engine.go | 25 +++++++++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/main.go b/main.go index ff1651c78ff8..2efff3b0cae2 100644 --- a/main.go +++ b/main.go @@ -49,6 +49,7 @@ var ( concurrency = cli.Flag("concurrency", "Number of concurrent workers.").Default(strconv.Itoa(runtime.NumCPU())).Int() noVerification = cli.Flag("no-verification", "Don't verify the results.").Bool() onlyVerified = cli.Flag("only-verified", "Only output verified results.").Bool() + forceReverification = cli.Flag("force-reverification", "Verify credentials when multiple similar credentials are found across detectors.").Bool() filterUnverified = cli.Flag("filter-unverified", "Only output first unverified result per chunk per detector if there are more than one results.").Bool() filterEntropy = cli.Flag("filter-entropy", "Filter unverified results with Shannon entropy. Start with 3.0.").Float64() configFilename = cli.Flag("config", "Path to configuration file.").ExistingFile() @@ -409,6 +410,7 @@ func run(state overseer.State) { engine.WithPrintAvgDetectorTime(*printAvgDetectorTime), engine.WithPrinter(printer), engine.WithFilterEntropy(*filterEntropy), + engine.WithForceReverification(*forceReverification), ) if err != nil { logFatal(err, "error initializing engine") diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 258102fe5893..bf5676120fdb 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -62,6 +62,7 @@ type Engine struct { // entropyFilter is used to filter out unverified results using Shannon entropy. filterEntropy *float64 onlyVerified bool + forceReverification bool printAvgDetectorTime bool // ahoCorasickHandler manages the Aho-Corasick trie and related keyword lookups. @@ -183,6 +184,13 @@ func WithVerify(verify bool) Option { } } +// WithForceReverification TODO comment +func WithForceReverification(forceReverification bool) Option { + return func(e *Engine) { + e.forceReverification = forceReverification + } +} + func filterDetectors(filterFunc func(detectors.Detector) bool, input []detectors.Detector) []detectors.Detector { var out []detectors.Detector for _, detector := range input { @@ -506,7 +514,7 @@ func (e *Engine) detectorWorker(ctx context.Context) { } matchingDetectors := e.ahoCorasickCore.PopulateMatchingDetectors(string(decoded.Chunk.Data), chunkSpecificDetectors) - if len(chunkSpecificDetectors) > 1 { + if len(chunkSpecificDetectors) > 1 && !e.forceReverification { wgReverify.Add(1) e.reverifiableChunksChan <- reVerifiableChunk{ chunk: *decoded.Chunk, @@ -544,6 +552,7 @@ func (e *Engine) detectorWorker(ctx context.Context) { func (e *Engine) reverifierWorker(ctx context.Context) { var wgDetect sync.WaitGroup + // Reuse the same map to avoid allocations. const avg = 8 dupes := make(map[string]struct{}, avg) @@ -564,6 +573,18 @@ nextChunk: val = res.Raw } + // TODO: use leveinshtein distance to compare similar tokens + // Below is a hack to remove the first len(val)/8 characters from the token. + // We do this to detect similar credentials regardless of unique prefix. + // Ex: + // - postman api key: PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r + // - malicious detector "api key": qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r + // + // if len(val) < 20 it likely doesn't have a unique prefix. + if len(val) > 20 { + val = val[len(val)/10:] + } + if _, ok := dupes[string(val)]; ok { // This indicates that the same secret was found by multiple detectors. // We should NOT VERIFY this chunk's data. @@ -579,7 +600,7 @@ nextChunk: } continue nextChunk } - dupes[string(res.Raw)] = struct{}{} + dupes[string(val)] = struct{}{} } } From cd615f00a489f29506caa82bb1726bd1c849dc32 Mon Sep 17 00:00:00 2001 From: Zachary Rice Date: Thu, 25 Jan 2024 15:20:55 -0600 Subject: [PATCH 07/49] remove counter --- pkg/engine/engine.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index bf5676120fdb..ab843d67218e 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -453,8 +453,6 @@ func (e *Engine) Finish(ctx context.Context) error { close(e.results) // Detector workers are done, close the results channel and call it a day. e.WgNotifier.Wait() // Wait for the notifier workers to finish notifying results. - fmt.Printf("counter: %d\n", counter) - if err := cleantemp.CleanTempArtifacts(ctx); err != nil { ctx.Logger().Error(err, "error cleaning temp artifacts") } @@ -494,8 +492,6 @@ type reVerifiableChunk struct { reverifyWgDoneFn func() } -var counter uint64 - func (e *Engine) detectorWorker(ctx context.Context) { var wgDetect sync.WaitGroup var wgReverify sync.WaitGroup @@ -588,7 +584,6 @@ nextChunk: if _, ok := dupes[string(val)]; ok { // This indicates that the same secret was found by multiple detectors. // We should NOT VERIFY this chunk's data. - atomic.AddUint64(&counter, 1) chunk.reverifyWgDoneFn() wgDetect.Add(1) chunk.chunk.Verify = false // DO NOT VERIFY From 83e6c8d4b83fa57714eab24fe04a7e6cbecd572e Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Thu, 25 Jan 2024 14:33:16 -0800 Subject: [PATCH 08/49] skipp empty results] --- pkg/engine/engine.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index ab843d67218e..14042d9bc2d6 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -552,6 +552,7 @@ func (e *Engine) reverifierWorker(ctx context.Context) { // Reuse the same map to avoid allocations. const avg = 8 dupes := make(map[string]struct{}, avg) + detectorsWithResult := make(map[detectors.Detector]struct{}, avg) nextChunk: for chunk := range e.reverifiableChunksChan { @@ -561,6 +562,12 @@ nextChunk: if err != nil { ctx.Logger().Error(err, "error verifying chunk") } + + if len(results) == 0 { + continue + } + detectorsWithResult[detector] = struct{}{} + for _, res := range results { var val []byte if res.RawV2 != nil { @@ -570,7 +577,7 @@ nextChunk: } // TODO: use leveinshtein distance to compare similar tokens - // Below is a hack to remove the first len(val)/8 characters from the token. + // Below is a hack to remove the first len()/8 characters from the token. // We do this to detect similar credentials regardless of unique prefix. // Ex: // - postman api key: PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r @@ -599,7 +606,7 @@ nextChunk: } } - for _, detector := range chunk.detectors { + for detector := range detectorsWithResult { wgDetect.Add(1) chunk.chunk.Verify = e.verify e.detectableChunksChan <- detectableChunk{ From 6809ed5b9850a9133190c6cd844eccbc65d1756d Mon Sep 17 00:00:00 2001 From: Zachary Rice Date: Thu, 25 Jan 2024 16:36:56 -0600 Subject: [PATCH 09/49] working on test and normalizing val for comparison --- pkg/engine/engine.go | 53 ++++++++++++++++--- pkg/engine/engine_test.go | 39 ++++++++++++++ .../testdata/reverification_detectors.yaml | 13 +++++ .../testdata/reverification_secrets.txt | 2 + pkg/engine/testdata/secrets.txt | 2 +- 5 files changed, 101 insertions(+), 8 deletions(-) create mode 100644 pkg/engine/testdata/reverification_detectors.yaml create mode 100644 pkg/engine/testdata/reverification_secrets.txt diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index ab843d67218e..6279abe31250 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -94,6 +94,14 @@ type Engine struct { // verify determines whether the scanner will attempt to verify candidate secrets verify bool + + // Note: bad hack only used for testing + reverificationTracking *reverificationTracking +} + +type reverificationTracking struct { + reverificationDuplicateCount int + mu sync.Mutex } // Option is used to configure the engine during initialization using functional options. @@ -184,6 +192,14 @@ func WithVerify(verify bool) Option { } } +func withReverificationTracking() Option { + return func(e *Engine) { + e.reverificationTracking = &reverificationTracking{ + reverificationDuplicateCount: 0, + } + } +} + // WithForceReverification TODO comment func WithForceReverification(forceReverification bool) Option { return func(e *Engine) { @@ -546,6 +562,27 @@ func (e *Engine) detectorWorker(ctx context.Context) { ctx.Logger().V(4).Info("finished scanning chunks") } +// There has got to be a better way, my brain is fried +func normalizeVal(s string) string { + var n int + length := len(s) + switch { + case length <= 20: + n = 10 + case length <= 40: + n = 30 + case length <= 70: + n = 50 + default: + n = 60 + } + + if n > length { + return s + } + return s[len(s)-n:] +} + func (e *Engine) reverifierWorker(ctx context.Context) { var wgDetect sync.WaitGroup @@ -570,20 +607,21 @@ nextChunk: } // TODO: use leveinshtein distance to compare similar tokens - // Below is a hack to remove the first len(val)/8 characters from the token. - // We do this to detect similar credentials regardless of unique prefix. // Ex: // - postman api key: PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r // - malicious detector "api key": qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r - // - // if len(val) < 20 it likely doesn't have a unique prefix. - if len(val) > 20 { - val = val[len(val)/10:] - } + // normalizeVal is a hack to only look at the last n characters of the secret. _ideally_ this normalizes + // the secret enough to compare similar secrets + val = []byte(normalizeVal(string(val))) if _, ok := dupes[string(val)]; ok { // This indicates that the same secret was found by multiple detectors. // We should NOT VERIFY this chunk's data. + if e.reverificationTracking != nil { + e.reverificationTracking.mu.Lock() + e.reverificationTracking.reverificationDuplicateCount++ + e.reverificationTracking.mu.Unlock() + } chunk.reverifyWgDoneFn() wgDetect.Add(1) chunk.chunk.Verify = false // DO NOT VERIFY @@ -593,6 +631,7 @@ nextChunk: decoder: chunk.decoder, wgDoneFn: wgDetect.Done, } + continue nextChunk } dupes[string(val)] = struct{}{} diff --git a/pkg/engine/engine_test.go b/pkg/engine/engine_test.go index f7273e0d270e..bc059f64ca5b 100644 --- a/pkg/engine/engine_test.go +++ b/pkg/engine/engine_test.go @@ -8,6 +8,7 @@ import ( "github.com/stretchr/testify/assert" + "github.com/trufflesecurity/trufflehog/v3/pkg/config" "github.com/trufflesecurity/trufflehog/v3/pkg/context" "github.com/trufflesecurity/trufflehog/v3/pkg/decoders" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors" @@ -214,6 +215,44 @@ func TestEngine_DuplicatSecrets(t *testing.T) { assert.Equal(t, want, e.GetMetrics().UnverifiedSecretsFound) } +func TestReverifcationChunk(t *testing.T) { + ctx := context.Background() + + absPath, err := filepath.Abs("./testdata/reverification_secrets.txt") + assert.Nil(t, err) + + ctx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + + confPath, err := filepath.Abs("./testdata/reverification_detectors.yaml") + assert.Nil(t, err) + conf, err := config.Read(confPath) + assert.Nil(t, err) + + e, err := Start(ctx, + WithConcurrency(1), + WithDecoders(decoders.DefaultDecoders()...), + WithDetectors(conf.Detectors...), + WithVerify(true), + WithPrinter(new(discardPrinter)), + withReverificationTracking(), + ) + assert.Nil(t, err) + + cfg := sources.FilesystemConfig{Paths: []string{absPath}} + if err := e.ScanFileSystem(ctx, cfg); err != nil { + return + } + + // Wait for all the chunks to be processed. + assert.Nil(t, e.Finish(ctx)) + want := uint64(1) + assert.Equal(t, want, e.GetMetrics().UnverifiedSecretsFound) + + wantDupe := 1 + assert.Equal(t, wantDupe, e.reverificationTracking.reverificationDuplicateCount) +} + func TestFragmentFirstLineAndLink(t *testing.T) { tests := []struct { name string diff --git a/pkg/engine/testdata/reverification_detectors.yaml b/pkg/engine/testdata/reverification_detectors.yaml new file mode 100644 index 000000000000..583a12f69d36 --- /dev/null +++ b/pkg/engine/testdata/reverification_detectors.yaml @@ -0,0 +1,13 @@ +# config.yaml +detectors: + - name: detector1 + keywords: + - PMAK + regex: + api_key: \b(PMAK-[a-zA-Z-0-9]{59})\b + + - name: detector2 + keywords: + - ost + regex: + api_key: \b([a-zA-Z-0-9]{59})\b \ No newline at end of file diff --git a/pkg/engine/testdata/reverification_secrets.txt b/pkg/engine/testdata/reverification_secrets.txt new file mode 100644 index 000000000000..30eff11c9f33 --- /dev/null +++ b/pkg/engine/testdata/reverification_secrets.txt @@ -0,0 +1,2 @@ + +POSTMAN_API_KEY="PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r" diff --git a/pkg/engine/testdata/secrets.txt b/pkg/engine/testdata/secrets.txt index a675dda0227e..ed277e51f408 100644 --- a/pkg/engine/testdata/secrets.txt +++ b/pkg/engine/testdata/secrets.txt @@ -3,4 +3,4 @@ sentry 27ac84f4bcdb4fca9701f4d6f6f58cd7d96b69c9d9754d40800645a51d668f90 sentry 27ac84f4bcdb4fca9701f4d6f6f58cd7d96b69c9d9754d40800645a51d668f90 sentry 27ac84f4bcdb4fca9701f4d6f6f58cd7d96b69c9d9754d40800645a51d668f90 - sentry 27ac84f4bcdb4fca9701f4d6f6f58cd7d96b69c9d9754d40800645a51d668f90 + sentry 27ac84f4bcdb4fca9701f4d6f6f58cd7d96b69c9d9754d40800645a51d668f90 \ No newline at end of file From eaa0e7c2722a2ba5f7c96c3135bb692dabab774d Mon Sep 17 00:00:00 2001 From: Zachary Rice Date: Thu, 25 Jan 2024 16:41:25 -0600 Subject: [PATCH 10/49] forgot to save file --- pkg/engine/engine.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 6f26048c5ee6..394d0cff136e 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -614,11 +614,6 @@ nextChunk: } // TODO: use leveinshtein distance to compare similar tokens -<<<<<<< HEAD -======= - // Below is a hack to remove the first len()/8 characters from the token. - // We do this to detect similar credentials regardless of unique prefix. ->>>>>>> 83e6c8d4b83fa57714eab24fe04a7e6cbecd572e // Ex: // - postman api key: PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r // - malicious detector "api key": qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r From 43cf8d0f6b1fbacfc7c8cd88546732bb5a194f04 Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Thu, 25 Jan 2024 14:54:41 -0800 Subject: [PATCH 11/49] optimize normalize --- pkg/engine/engine.go | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 394d0cff136e..85c734b3d39c 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -563,9 +563,9 @@ func (e *Engine) detectorWorker(ctx context.Context) { } // There has got to be a better way, my brain is fried -func normalizeVal(s string) string { +func normalizeVal(b []byte) []byte { + length := len(b) var n int - length := len(s) switch { case length <= 20: n = 10 @@ -578,9 +578,9 @@ func normalizeVal(s string) string { } if n > length { - return s + return b } - return s[len(s)-n:] + return b[len(b)-n:] } func (e *Engine) reverifierWorker(ctx context.Context) { @@ -619,9 +619,7 @@ nextChunk: // - malicious detector "api key": qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r // normalizeVal is a hack to only look at the last n characters of the secret. _ideally_ this normalizes // the secret enough to compare similar secrets - val = []byte(normalizeVal(string(val))) - - if _, ok := dupes[string(val)]; ok { + if _, ok := dupes[string(normalizeVal(val))]; ok { // This indicates that the same secret was found by multiple detectors. // We should NOT VERIFY this chunk's data. if e.reverificationTracking != nil { From 0aee157cc55c7ab40b18b49ecb3a3c81607870cf Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Fri, 26 Jan 2024 06:07:39 -0800 Subject: [PATCH 12/49] reuse map --- pkg/engine/engine.go | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 85c734b3d39c..cbf57f928c1f 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -468,6 +468,7 @@ func (e *Engine) Finish(ctx context.Context) error { close(e.results) // Detector workers are done, close the results channel and call it a day. e.WgNotifier.Wait() // Wait for the notifier workers to finish notifying results. + fmt.Printf("max %d\n", max) if err := cleantemp.CleanTempArtifacts(ctx); err != nil { ctx.Logger().Error(err, "error cleaning temp artifacts") @@ -583,13 +584,15 @@ func normalizeVal(b []byte) []byte { return b[len(b)-n:] } +var max uint64 + func (e *Engine) reverifierWorker(ctx context.Context) { var wgDetect sync.WaitGroup // Reuse the same map to avoid allocations. - const avg = 8 - dupes := make(map[string]struct{}, avg) - detectorsWithResult := make(map[detectors.Detector]struct{}, avg) + const avgSecretsPerDetector = 8 + detectorSecrets := make(map[string]struct{}, avgSecretsPerDetector) + detectorsWithResult := make(map[detectors.Detector]struct{}, avgSecretsPerDetector) nextChunk: for chunk := range e.reverifiableChunksChan { @@ -619,7 +622,7 @@ nextChunk: // - malicious detector "api key": qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r // normalizeVal is a hack to only look at the last n characters of the secret. _ideally_ this normalizes // the secret enough to compare similar secrets - if _, ok := dupes[string(normalizeVal(val))]; ok { + if _, ok := detectorSecrets[string(normalizeVal(val))]; ok { // This indicates that the same secret was found by multiple detectors. // We should NOT VERIFY this chunk's data. if e.reverificationTracking != nil { @@ -639,7 +642,7 @@ nextChunk: continue nextChunk } - dupes[string(val)] = struct{}{} + detectorSecrets[string(val)] = struct{}{} } } @@ -654,10 +657,14 @@ nextChunk: } } - // Empty the dupes map. - for k := range dupes { - delete(dupes, k) + // Empty the dupes map and the detectorsWithResult map. + for k := range detectorSecrets { + delete(detectorSecrets, k) + } + for k := range detectorsWithResult { + delete(detectorsWithResult, k) } + chunk.reverifyWgDoneFn() } From cfa9b558abdb1f3c30d0d017a9f6a1869c2d7af4 Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Fri, 26 Jan 2024 06:08:14 -0800 Subject: [PATCH 13/49] remove print --- pkg/engine/engine.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index cbf57f928c1f..f9fd832795ef 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -468,7 +468,6 @@ func (e *Engine) Finish(ctx context.Context) error { close(e.results) // Detector workers are done, close the results channel and call it a day. e.WgNotifier.Wait() // Wait for the notifier workers to finish notifying results. - fmt.Printf("max %d\n", max) if err := cleantemp.CleanTempArtifacts(ctx); err != nil { ctx.Logger().Error(err, "error cleaning temp artifacts") @@ -584,8 +583,6 @@ func normalizeVal(b []byte) []byte { return b[len(b)-n:] } -var max uint64 - func (e *Engine) reverifierWorker(ctx context.Context) { var wgDetect sync.WaitGroup From c38107b8c8c3d5f16b45d2ad440617abc66d3004 Mon Sep 17 00:00:00 2001 From: Zachary Rice Date: Fri, 26 Jan 2024 10:26:46 -0600 Subject: [PATCH 14/49] use levenshtein distance to check dupes --- go.mod | 3 ++- go.sum | 50 ++++++-------------------------------------- pkg/engine/engine.go | 39 +++++++++++++++++++++------------- 3 files changed, 32 insertions(+), 60 deletions(-) diff --git a/go.mod b/go.mod index fc4a9b3bf3e9..62266125b483 100644 --- a/go.mod +++ b/go.mod @@ -11,6 +11,7 @@ require ( github.com/AzureAD/microsoft-authentication-library-for-go v1.2.1 github.com/BobuSumisu/aho-corasick v1.0.3 github.com/TheZeroSlave/zapsentry v1.19.0 + github.com/adrg/strutil v0.3.1 github.com/alecthomas/kingpin/v2 v2.4.0 github.com/aws/aws-sdk-go v1.50.0 github.com/aymanbagabas/go-osc52 v1.2.2 @@ -72,6 +73,7 @@ require ( github.com/stretchr/testify v1.8.4 github.com/tailscale/depaware v0.0.0-20210622194025-720c4b409502 github.com/trufflesecurity/disk-buffer-reader v0.2.1 + github.com/wasilibs/go-re2 v1.4.1 github.com/xanzy/go-gitlab v0.94.0 go.mongodb.org/mongo-driver v1.12.1 go.uber.org/mock v0.3.0 @@ -239,7 +241,6 @@ require ( github.com/therootcompany/xz v1.0.1 // indirect github.com/ulikunitz/xz v0.5.11 // indirect github.com/vbatts/tar-split v0.11.3 // indirect - github.com/wasilibs/go-re2 v1.4.1 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect github.com/xdg-go/pbkdf2 v1.0.0 // indirect github.com/xdg-go/scram v1.1.2 // indirect diff --git a/go.sum b/go.sum index 0f344b33d2e9..6c9dbde9dec1 100644 --- a/go.sum +++ b/go.sum @@ -7,8 +7,6 @@ cloud.google.com/go v0.45.1/go.mod h1:RpBamKRgapWJb87xiFSdk4g1CME7QZg3uwTez+TSTj cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg0= cloud.google.com/go v0.50.0/go.mod h1:r9sluTvynVuxRIOHXQEHMFffphuXHOMZMycpNR5e6To= cloud.google.com/go v0.53.0/go.mod h1:fp/UouUEsRkN6ryDKNW/Upv/JBKnv6WDthjR6+vze6M= -cloud.google.com/go v0.110.10 h1:LXy9GEO+timppncPIAZoOj3l58LIU9k+kn48AN7IO3Y= -cloud.google.com/go v0.110.10/go.mod h1:v1OoFqYxiBkUrruItNM3eT4lLByNjxmJSV/xDKJNnic= cloud.google.com/go v0.111.0 h1:YHLKNupSD1KqjDbQ3+LVdQ81h/UJbJyZG203cEfnQgM= cloud.google.com/go v0.111.0/go.mod h1:0mibmpKP1TyOOFYQY5izo0LnT+ecvOQ0Sg3OdmMiNRU= cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= @@ -84,6 +82,8 @@ github.com/ProtonMail/go-crypto v0.0.0-20230828082145-3c4c8a2d2371 h1:kkhsdkhsCv github.com/ProtonMail/go-crypto v0.0.0-20230828082145-3c4c8a2d2371/go.mod h1:EjAoLdwvbIOoOQr3ihjnSoLZRtE8azugULFRteWMNc0= github.com/TheZeroSlave/zapsentry v1.19.0 h1:/FVdMrq/w7bYt98m49ImZgmCTybXWbGc8/hOT0nLmyc= github.com/TheZeroSlave/zapsentry v1.19.0/go.mod h1:D1YMfSuu6xnkhwFXxrronesmsiyDhIqo+86I3Ok+r64= +github.com/adrg/strutil v0.3.1 h1:OLvSS7CSJO8lBii4YmBt8jiK9QOtB9CzCzwl4Ic/Fz4= +github.com/adrg/strutil v0.3.1/go.mod h1:8h90y18QLrs11IBffcGX3NW/GFBXCMcNg4M7H6MspPA= github.com/alecthomas/chroma v0.10.0 h1:7XDcGkCQopCNKjZHfYrNLraA+M7e0fMiJ/Mfikbfjek= github.com/alecthomas/chroma v0.10.0/go.mod h1:jtJATyUxlIORhUOFNA9NZDWGAQ8wpxQQqNSB4rjA/1s= github.com/alecthomas/kingpin/v2 v2.4.0 h1:f48lwail6p8zpO1bC4TxtqACaGqHYA22qkHjHpqDjYY= @@ -104,8 +104,6 @@ github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPd github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/atotto/clipboard v0.1.4 h1:EH0zSVneZPSuFR11BlR9YppQTVDbh5+16AmcJi4g1z4= github.com/atotto/clipboard v0.1.4/go.mod h1:ZY9tmq7sm5xIbd9bOK4onWV4S6X0u6GY7Vn0Yu86PYI= -github.com/aws/aws-sdk-go v1.49.19 h1:oZryiqeQpeJsIcAmZlp86duMu/s/DJ43qyfwa51qmLg= -github.com/aws/aws-sdk-go v1.49.19/go.mod h1:LF8svs817+Nz+DmiMQKTO3ubZ/6IaTpq3TjupRn3Eqk= github.com/aws/aws-sdk-go v1.50.0 h1:HBtrLeO+QyDKnc3t1+5DR1RxodOHCGr8ZcrHudpv7jI= github.com/aws/aws-sdk-go v1.50.0/go.mod h1:LF8svs817+Nz+DmiMQKTO3ubZ/6IaTpq3TjupRn3Eqk= github.com/aws/aws-sdk-go-v2 v1.17.7 h1:CLSjnhJSTSogvqUGhIC6LqFKATMRexcxLZ0i/Nzk9Eg= @@ -194,20 +192,12 @@ github.com/containerd/console v1.0.4-0.20230313162750-1ae8d489ac81 h1:q2hJAaP1k2 github.com/containerd/console v1.0.4-0.20230313162750-1ae8d489ac81/go.mod h1:YynlIjWYF8myEu6sdkwKIvGQq+cOckRm6So2avqoYAk= github.com/containerd/stargz-snapshotter/estargz v0.14.3 h1:OqlDCK3ZVUO6C3B/5FSkDwbkEETK84kQgEeFwDC+62k= github.com/containerd/stargz-snapshotter/estargz v0.14.3/go.mod h1:KY//uOCIkSuNAHhJogcZtrNHdKrA99/FCCRjE3HD36o= -github.com/couchbase/gocb/v2 v2.7.0 h1:zU/Eh9+RIS1TvQFiEF4JBajMm9VTjkeQssE9ov7F87c= -github.com/couchbase/gocb/v2 v2.7.0/go.mod h1:IHq/c3cnrqKq9scFQJ8OyD/xhqZ0b4mHYVH6VEMnsnw= github.com/couchbase/gocb/v2 v2.7.1 h1:Wy5IufpGWDStErhe9bNxXdiHpXf4LIhEpWnR7gJcme0= github.com/couchbase/gocb/v2 v2.7.1/go.mod h1:tn/jNMSMGwEB2Dd1uHW/aTwScx1lXZqb9oM0zyWeEUg= -github.com/couchbase/gocbcore/v10 v10.3.0 h1:cu5KWP5Yq9cANw0UitpKWmb8mv9NDhC0ApIf9rMrVq8= -github.com/couchbase/gocbcore/v10 v10.3.0/go.mod h1:lYQIIk+tzoMcwtwU5GzPbDdqEkwkH3isI2rkSpfL0oM= github.com/couchbase/gocbcore/v10 v10.3.1 h1:dx+lub02eDYiQXavtF0EwYMppVUcbjCxAAqa6/nQldg= github.com/couchbase/gocbcore/v10 v10.3.1/go.mod h1:lYQIIk+tzoMcwtwU5GzPbDdqEkwkH3isI2rkSpfL0oM= -github.com/couchbase/gocbcoreps v0.1.0 h1:9+Qq+H/YXYn+H6f5A5MndUv40qdCwPwoJjinHolxq2g= -github.com/couchbase/gocbcoreps v0.1.0/go.mod h1:LjH33s/LNVBAwVU1Ka/YU3cLkuAyFC2dzGGiValJ5oY= github.com/couchbase/gocbcoreps v0.1.1 h1:H5Q/TVmRqEpcdTDlepwAmLW7cemP9Di6Lp91Qa9oz1A= github.com/couchbase/gocbcoreps v0.1.1/go.mod h1:tpbHglpBO7DZZmr8XhHe7INj5VJcJ3i+41Ktep9lejI= -github.com/couchbase/goprotostellar v1.0.0 h1:umfH4hOxrUS/0QY1AkdoVcpp9rg7Jl+UNWzNJ3KxIHc= -github.com/couchbase/goprotostellar v1.0.0/go.mod h1:gs1eioLVOHETTFWxDY4v7Q/kRPMgqmX6t/TPcI429ls= github.com/couchbase/goprotostellar v1.0.1 h1:mtDVYTgnnDSQ3t7mQRG6jl/tOXKOuuFM9PakqC1qhCY= github.com/couchbase/goprotostellar v1.0.1/go.mod h1:gs1eioLVOHETTFWxDY4v7Q/kRPMgqmX6t/TPcI429ls= github.com/couchbaselabs/gocaves/client v0.0.0-20230307083111-cc3960c624b1/go.mod h1:AVekAZwIY2stsJOMWLAS/0uA/+qdp7pjO8EHnl61QkY= @@ -254,8 +244,6 @@ github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymF github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= -github.com/envoyproxy/protoc-gen-validate v1.0.2 h1:QkIBuU5k+x7/QXPvPPnWXWlCdaBFApVqftFV6k087DA= -github.com/envoyproxy/protoc-gen-validate v1.0.2/go.mod h1:GpiZQP3dDbg4JouG/NNS7QWXpgx6x8QiMKdmN72jogE= github.com/envoyproxy/protoc-gen-validate v1.0.4 h1:gVPz/FMfvh57HdSJQyvBtF00j8JU4zdyUgIUNhlgg0A= github.com/envoyproxy/protoc-gen-validate v1.0.4/go.mod h1:qys6tmnRsYrQqIhm2bvKZH4Blx/1gTIZ2UKVY1M+Yew= github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= @@ -400,8 +388,6 @@ github.com/google/s2a-go v0.1.7/go.mod h1:50CgR4k1jNlWBu4UfS4AcfhVe1r6pdZPygJ3R8 github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.3.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/google/uuid v1.4.0 h1:MtMxsa51/r9yyhkyLsVeVt0B+BGQZzpQiTQ4eHZ8bc4= -github.com/google/uuid v1.4.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.5.0 h1:1p67kYwdtXjb0gL0BPiP1Av9wiZPo5A8z2cWkTZ+eyU= github.com/google/uuid v1.5.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/enterprise-certificate-proxy v0.3.2 h1:Vie5ybvEvT75RniqhfFxPRy3Bf7vr3h0cechB90XaQs= @@ -438,8 +424,6 @@ github.com/hashicorp/go-retryablehttp v0.7.5/go.mod h1:Jy/gPYAdjqffZ/yFGCFV2doI5 github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.1 h1:0hERBMJE1eitiLkihrMvRVBYAkpHzc/J3QdDN+dAcgU= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= -github.com/hashicorp/golang-lru v0.6.0 h1:uL2shRDx7RTrOrTCUZEGP/wJUFiUI8QT6E7z5o8jga4= -github.com/hashicorp/golang-lru v0.6.0/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= @@ -684,6 +668,8 @@ github.com/vbatts/tar-split v0.11.3 h1:hLFqsOLQ1SsppQNTMpkpPXClLDfC2A3Zgy9OUU+RV github.com/vbatts/tar-split v0.11.3/go.mod h1:9QlHN18E+fEH7RdG+QAJJcuya3rqT7eXSTY7wGrAokY= github.com/wasilibs/go-re2 v1.4.1 h1:E5+9O1M8UoGeqLB2A9omeoaWImqpuYDs9cKwvTJq/Oo= github.com/wasilibs/go-re2 v1.4.1/go.mod h1:ynB8eCwd9JsqUnsk8WlPDk6cEeme8BguZmnqOSURE4Y= +github.com/wasilibs/nottinygc v0.4.0 h1:h1TJMihMC4neN6Zq+WKpLxgd9xCFMw7O9ETLwY2exJQ= +github.com/wasilibs/nottinygc v0.4.0/go.mod h1:oDcIotskuYNMpqMF23l7Z8uzD4TC0WXHK8jetlB3HIo= github.com/wsxiaoys/terminal v0.0.0-20160513160801-0940f3fc43a0 h1:3UeQBvD0TFrlVjOeLOBz+CPAI8dnbqNSVwUwRrkp7vQ= github.com/wsxiaoys/terminal v0.0.0-20160513160801-0940f3fc43a0/go.mod h1:IXCdmsXIht47RaVFLEdVnh1t+pgYtTAhQGj73kz+2DM= github.com/xanzy/go-gitlab v0.94.0 h1:GmBl2T5zqUHqyjkxFSvsT7CbelGdAH/dmBqUBqS+4BE= @@ -727,6 +713,8 @@ go.opentelemetry.io/otel v1.19.0 h1:MuS/TNf4/j4IXsZuJegVzI1cwut7Qc00344rgH7p8bs= go.opentelemetry.io/otel v1.19.0/go.mod h1:i0QyjOq3UPoTzff0PJB2N66fb4S0+rSbSB15/oyH9fY= go.opentelemetry.io/otel/metric v1.19.0 h1:aTzpGtV0ar9wlV4Sna9sdJyII5jTVJEvKETPiOKwvpE= go.opentelemetry.io/otel/metric v1.19.0/go.mod h1:L5rUsV9kM1IxCj1MmSdS+JQAcVm319EUrDVLrt7jqt8= +go.opentelemetry.io/otel/sdk v1.19.0 h1:6USY6zH+L8uMH8L3t1enZPR3WFEmSTADlqldyHtJi3o= +go.opentelemetry.io/otel/sdk v1.19.0/go.mod h1:NedEbbS4w3C6zElbLdPJKOpJQOrGUJ+GfzpjUvI0v1A= go.opentelemetry.io/otel/trace v1.19.0 h1:DFVQmlVbfVeOuBRrwdtaehRrWiL1JoVs9CPIQ1Dzxpg= go.opentelemetry.io/otel/trace v1.19.0/go.mod h1:mfaSyvGyEJEI0nyV2I4qhNQnbBOUUmYZpYojqMnX2vo= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= @@ -755,8 +743,6 @@ golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0 golang.org/x/crypto v0.3.1-0.20221117191849-2c476679df9a/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= golang.org/x/crypto v0.7.0/go.mod h1:pYwdfH91IfpZVANVyUOhSIPZaFoJGxTFbZhFTx+dXZU= golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= -golang.org/x/crypto v0.17.0 h1:r8bRNjWL3GshPW3gkd+RpvzWrZAwPS49OmTGZ/uhM4k= -golang.org/x/crypto v0.17.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= golang.org/x/crypto v0.18.0 h1:PGVlW0xEltQnzFZ55hkuX5+KLyrMYhHld1YHO4AKcdc= golang.org/x/crypto v0.18.0/go.mod h1:R0j02AL6hcrfOiy9T4ZYp/rcWeMxM3L6QYxlOuEG1mg= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -767,8 +753,6 @@ golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE golang.org/x/exp v0.0.0-20191129062945-2f5052295587/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= -golang.org/x/exp v0.0.0-20240110193028-0dcbfd608b1e h1:723BNChdd0c2Wk6WOE320qGBiPtYx0F0Bbm1kriShfE= -golang.org/x/exp v0.0.0-20240110193028-0dcbfd608b1e/go.mod h1:iRJReGqOEeBhDZGkGbynYwcHlctCvnjTYIamk7uXpHI= golang.org/x/exp v0.0.0-20240119083558-1b970713d09a h1:Q8/wZp0KX97QFTc2ywcOE0YRjZPVIx+MXInMzdvQqcA= golang.org/x/exp v0.0.0-20240119083558-1b970713d09a/go.mod h1:idGWGoKP1toJGkd5/ig9ZLuPcZBC3ewk7SzmH0uou08= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= @@ -822,8 +806,6 @@ golang.org/x/net v0.2.0/go.mod h1:KqCZLdyyvdV855qA2rE3GC2aiw5xGR5TEjj8smXukLY= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= -golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c= -golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U= golang.org/x/net v0.20.0 h1:aCL9BSgETF1k+blQaYUBx9hJ9LOGP3gAVemcZlf1Kpo= golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= @@ -842,8 +824,6 @@ golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE= -golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.6.0 h1:5BMeUDZ7vkXGfEr1x9B4bRcTH4lpkTkpdh0T/J+qjbQ= golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -892,8 +872,6 @@ golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc= -golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU= golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= @@ -903,8 +881,6 @@ golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= -golang.org/x/term v0.15.0 h1:y/Oo/a/q3IXu26lQgl04j/gjuBDOBlx7X6Om1j2CPW4= -golang.org/x/term v0.15.0/go.mod h1:BDl952bC7+uMoWR75FIrCDx79TPU9oHkTZ9yRbYOrX0= golang.org/x/term v0.16.0 h1:m+B6fahuftsE9qjo0VWp2FW0mB3MTJvR0BaMQrq0pmE= golang.org/x/term v0.16.0/go.mod h1:yn7UURbUtPyrVJPGPq404EukNFxcm/foM+bV/bfcDsY= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -957,8 +933,6 @@ golang.org/x/tools v0.0.0-20201224043029-2b0845dc783e/go.mod h1:emZCQorbCU4vsT4f golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= -golang.org/x/tools v0.16.0 h1:GO788SKMRunPIBCXiQyo2AaexLstOrVhuAL5YwsckQM= -golang.org/x/tools v0.16.0/go.mod h1:kYVVN6I1mBNoB1OX+noeBjbRk4IUEPa7JJ+TJMEooJ0= golang.org/x/tools v0.17.0 h1:FvmRgNOcs3kOa+T20R1uhfP9F6HgG2mfxDv1vrx1Htc= golang.org/x/tools v0.17.0/go.mod h1:xsh6VxdV005rRVaS6SSAf9oiAqljS7UZUacMZ8Bnsps= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -984,8 +958,6 @@ google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7 google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0= google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= -google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= -google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM= google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= @@ -1003,16 +975,10 @@ google.golang.org/genproto v0.0.0-20191230161307-f3c370f40bfb/go.mod h1:n3cpQtvx google.golang.org/genproto v0.0.0-20200212174721-66ed5ce911ce/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200423170343-7949de9c1215/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= -google.golang.org/genproto v0.0.0-20231106174013-bbf56f31fb17 h1:wpZ8pe2x1Q3f2KyT5f8oP/fa9rHAKgFPr/HZdNuS+PQ= -google.golang.org/genproto v0.0.0-20231106174013-bbf56f31fb17/go.mod h1:J7XzRzVy1+IPwWHZUzoD0IccYZIrXILAQpc+Qy9CMhY= google.golang.org/genproto v0.0.0-20240102182953-50ed04b92917 h1:nz5NESFLZbJGPFxDT/HCn+V1mZ8JGNoY4nUpmW/Y2eg= google.golang.org/genproto v0.0.0-20240102182953-50ed04b92917/go.mod h1:pZqR+glSb11aJ+JQcczCvgf47+duRuzNSKqE8YAQnV0= -google.golang.org/genproto/googleapis/api v0.0.0-20231106174013-bbf56f31fb17 h1:JpwMPBpFN3uKhdaekDpiNlImDdkUAyiJ6ez/uxGaUSo= -google.golang.org/genproto/googleapis/api v0.0.0-20231106174013-bbf56f31fb17/go.mod h1:0xJLfVdJqpAPl8tDg1ujOCGzx6LFLttXT5NhllGOXY4= google.golang.org/genproto/googleapis/api v0.0.0-20231212172506-995d672761c0 h1:s1w3X6gQxwrLEpxnLd/qXTVLgQE2yXwaOaoa6IlY/+o= google.golang.org/genproto/googleapis/api v0.0.0-20231212172506-995d672761c0/go.mod h1:CAny0tYF+0/9rmDB9fahA9YLzX3+AEVl1qXbv5hhj6c= -google.golang.org/genproto/googleapis/rpc v0.0.0-20231120223509-83a465c0220f h1:ultW7fxlIvee4HYrtnaRPon9HpEgFk5zYpmfMgtKB5I= -google.golang.org/genproto/googleapis/rpc v0.0.0-20231120223509-83a465c0220f/go.mod h1:L9KNLi232K1/xB6f7AlSX692koaRnKaWSR0stBki0Yc= google.golang.org/genproto/googleapis/rpc v0.0.0-20240108191215-35c7eff3a6b1 h1:gphdwh0npgs8elJ4T6J+DQJHPVF7RsuJHCfwztUb4J4= google.golang.org/genproto/googleapis/rpc v0.0.0-20240108191215-35c7eff3a6b1/go.mod h1:daQN87bsDqDoe316QbbvX60nMoJQa4r6Ds0ZuoAe5yA= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= @@ -1025,8 +991,6 @@ google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8 google.golang.org/grpc v1.27.1/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.29.1/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk= google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc= -google.golang.org/grpc v1.59.0 h1:Z5Iec2pjwb+LEOqzpB2MR12/eKFhDPhuqW91O+4bwUk= -google.golang.org/grpc v1.59.0/go.mod h1:aUPDwccQo6OTjy7Hct4AfBPD1GptF4fyUjIkQ9YtF98= google.golang.org/grpc v1.60.1 h1:26+wFr+cNqSGFcOXcabYC0lUVJVRa2Sb2ortSK7VrEU= google.golang.org/grpc v1.60.1/go.mod h1:OlCHIeLYqSSsLi6i49B5QGdzaMZK9+M7LXN2FKz4eGM= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= @@ -1040,8 +1004,6 @@ google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpAD google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= -google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= google.golang.org/protobuf v1.32.0 h1:pPC6BG5ex8PDFnkbrGU3EixyhKcQ2aDuBS36lqK/C7I= google.golang.org/protobuf v1.32.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 394d0cff136e..db78b8c16f7d 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -8,6 +8,8 @@ import ( "sync/atomic" "time" + "github.com/adrg/strutil" + "github.com/adrg/strutil/metrics" lru "github.com/hashicorp/golang-lru" "google.golang.org/protobuf/proto" @@ -104,6 +106,12 @@ type reverificationTracking struct { mu sync.Mutex } +func (r *reverificationTracking) increment() { + r.mu.Lock() + r.reverificationDuplicateCount++ + r.mu.Unlock() +} + // Option is used to configure the engine during initialization using functional options. type Option func(*Engine) @@ -583,16 +591,27 @@ func normalizeVal(s string) string { return s[len(s)-n:] } +func likelyDuplicate(val []byte, dupesSlice []string) bool { + for _, v := range dupesSlice { + similarity := strutil.Similarity(string(val), v, metrics.NewLevenshtein()) + // close enough + if similarity > 0.9 { + return true + } + } + return false +} + func (e *Engine) reverifierWorker(ctx context.Context) { var wgDetect sync.WaitGroup // Reuse the same map to avoid allocations. const avg = 8 - dupes := make(map[string]struct{}, avg) detectorsWithResult := make(map[detectors.Detector]struct{}, avg) nextChunk: for chunk := range e.reverifiableChunksChan { + dupes := make([]string, 0, avg) for _, detector := range chunk.detectors { // DO NOT VERIFY at this stage of the pipeline. results, err := detector.FromData(ctx, false, chunk.chunk.Data) @@ -613,21 +632,15 @@ nextChunk: val = res.Raw } - // TODO: use leveinshtein distance to compare similar tokens + // Use levenstein distance to determine if the secret is likely the same. // Ex: // - postman api key: PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r // - malicious detector "api key": qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r - // normalizeVal is a hack to only look at the last n characters of the secret. _ideally_ this normalizes - // the secret enough to compare similar secrets - val = []byte(normalizeVal(string(val))) - - if _, ok := dupes[string(val)]; ok { + if likelyDuplicate(val, dupes) { // This indicates that the same secret was found by multiple detectors. // We should NOT VERIFY this chunk's data. if e.reverificationTracking != nil { - e.reverificationTracking.mu.Lock() - e.reverificationTracking.reverificationDuplicateCount++ - e.reverificationTracking.mu.Unlock() + e.reverificationTracking.increment() } chunk.reverifyWgDoneFn() wgDetect.Add(1) @@ -641,7 +654,7 @@ nextChunk: continue nextChunk } - dupes[string(val)] = struct{}{} + dupes = append(dupes, string(val)) } } @@ -656,10 +669,6 @@ nextChunk: } } - // Empty the dupes map. - for k := range dupes { - delete(dupes, k) - } chunk.reverifyWgDoneFn() } From ef4861c5f7805b431bd241d6b7755f0ded54b468 Mon Sep 17 00:00:00 2001 From: Zachary Rice Date: Fri, 26 Jan 2024 10:46:51 -0600 Subject: [PATCH 15/49] forgot to leave in emptying map --- pkg/engine/engine.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 48e88fc7509e..3a2d420523d8 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -584,7 +584,7 @@ func likelyDuplicate(val []byte, dupesSlice []string) bool { func (e *Engine) reverifierWorker(ctx context.Context) { var wgDetect sync.WaitGroup - // Reuse the same map to avoid allocations. + // Reuse the same map and slice to avoid allocations. const avgSecretsPerDetector = 8 detectorsWithResult := make(map[detectors.Detector]struct{}, avgSecretsPerDetector) chunkSecrets := make([]string, 0, avgSecretsPerDetector) @@ -637,9 +637,6 @@ nextChunk: } } - // reset the slice - chunkSecrets = chunkSecrets[:0] - for detector := range detectorsWithResult { wgDetect.Add(1) chunk.chunk.Verify = e.verify @@ -651,6 +648,13 @@ nextChunk: } } + // Empty the dupes sliace and the detectorsWithResult map. + chunkSecrets = chunkSecrets[:0] + + for k := range detectorsWithResult { + delete(detectorsWithResult, k) + } + chunk.reverifyWgDoneFn() } From da9093880de993399c15b6feba9cf33cee83da54 Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Fri, 26 Jan 2024 08:50:39 -0800 Subject: [PATCH 16/49] use slice --- pkg/engine/engine.go | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 3a2d420523d8..baf439584ed1 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -586,7 +586,7 @@ func (e *Engine) reverifierWorker(ctx context.Context) { // Reuse the same map and slice to avoid allocations. const avgSecretsPerDetector = 8 - detectorsWithResult := make(map[detectors.Detector]struct{}, avgSecretsPerDetector) + detectorsWithResult := make([]detectors.Detector, 0, avgSecretsPerDetector) chunkSecrets := make([]string, 0, avgSecretsPerDetector) nextChunk: @@ -601,7 +601,7 @@ nextChunk: if len(results) == 0 { continue } - detectorsWithResult[detector] = struct{}{} + detectorsWithResult = append(detectorsWithResult, detector) for _, res := range results { var val []byte @@ -637,7 +637,7 @@ nextChunk: } } - for detector := range detectorsWithResult { + for _, detector := range detectorsWithResult { wgDetect.Add(1) chunk.chunk.Verify = e.verify e.detectableChunksChan <- detectableChunk{ @@ -650,10 +650,7 @@ nextChunk: // Empty the dupes sliace and the detectorsWithResult map. chunkSecrets = chunkSecrets[:0] - - for k := range detectorsWithResult { - delete(detectorsWithResult, k) - } + detectorsWithResult = detectorsWithResult[:0] chunk.reverifyWgDoneFn() } From 61fd8cbc90ae0820c5fc745c8ac8ed87471ab5f0 Mon Sep 17 00:00:00 2001 From: Zachary Rice Date: Fri, 26 Jan 2024 10:54:10 -0600 Subject: [PATCH 17/49] small tweak --- pkg/engine/engine.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 3a2d420523d8..06894167e09f 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -570,9 +570,13 @@ func (e *Engine) detectorWorker(ctx context.Context) { ctx.Logger().V(4).Info("finished scanning chunks") } -func likelyDuplicate(val []byte, dupesSlice []string) bool { +func likelyDuplicate(val string, dupesSlice []string) bool { for _, v := range dupesSlice { - similarity := strutil.Similarity(string(val), v, metrics.NewLevenshtein()) + if v == val { + return true + } + similarity := strutil.Similarity(val, v, metrics.NewLevenshtein()) + // close enough if similarity > 0.9 { return true @@ -615,7 +619,7 @@ nextChunk: // Ex: // - postman api key: PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r // - malicious detector "api key": qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r - if likelyDuplicate(val, chunkSecrets) { + if likelyDuplicate(string(val), chunkSecrets) { // This indicates that the same secret was found by multiple detectors. // We should NOT VERIFY this chunk's data. if e.reverificationTracking != nil { From 744a4011a8641977e936f5e65a28a008b7acae73 Mon Sep 17 00:00:00 2001 From: Zachary Rice Date: Fri, 26 Jan 2024 10:55:44 -0600 Subject: [PATCH 18/49] comment --- pkg/engine/engine.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 6140cceb37e9..88d353444378 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -652,7 +652,7 @@ nextChunk: } } - // Empty the dupes sliace and the detectorsWithResult map. + // Empty the dupes and detectors slice chunkSecrets = chunkSecrets[:0] detectorsWithResult = detectorsWithResult[:0] From 9d52160be2fd4fcd371693e99a5e380a4885647f Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Fri, 26 Jan 2024 08:56:18 -0800 Subject: [PATCH 19/49] use bytes --- pkg/engine/engine.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index baf439584ed1..1a12c902fe48 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -570,9 +570,9 @@ func (e *Engine) detectorWorker(ctx context.Context) { ctx.Logger().V(4).Info("finished scanning chunks") } -func likelyDuplicate(val []byte, dupesSlice []string) bool { +func likelyDuplicate(val []byte, dupesSlice [][]byte) bool { for _, v := range dupesSlice { - similarity := strutil.Similarity(string(val), v, metrics.NewLevenshtein()) + similarity := strutil.Similarity(string(val), string(v), metrics.NewLevenshtein()) // close enough if similarity > 0.9 { return true @@ -587,7 +587,7 @@ func (e *Engine) reverifierWorker(ctx context.Context) { // Reuse the same map and slice to avoid allocations. const avgSecretsPerDetector = 8 detectorsWithResult := make([]detectors.Detector, 0, avgSecretsPerDetector) - chunkSecrets := make([]string, 0, avgSecretsPerDetector) + chunkSecrets := make([][]byte, 0, avgSecretsPerDetector) nextChunk: for chunk := range e.reverifiableChunksChan { @@ -633,7 +633,7 @@ nextChunk: continue nextChunk } - chunkSecrets = append(chunkSecrets, string(val)) + chunkSecrets = append(chunkSecrets, val) } } From 6d8c3095fcae4d6cd7e81c09f2cea081f379ec04 Mon Sep 17 00:00:00 2001 From: Zachary Rice Date: Fri, 26 Jan 2024 11:13:08 -0600 Subject: [PATCH 20/49] praise --- pkg/engine/engine.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 88d353444378..cd30d19425bd 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -573,12 +573,14 @@ func (e *Engine) detectorWorker(ctx context.Context) { func likelyDuplicate(val string, dupesSlice []string) bool { for _, v := range dupesSlice { if v == val { + fmt.Println("found exact duplicate", val, v) return true } similarity := strutil.Similarity(val, v, metrics.NewLevenshtein()) // close enough if similarity > 0.9 { + fmt.Println("found similar duplicate", val, v, similarity) return true } } @@ -635,6 +637,9 @@ nextChunk: wgDoneFn: wgDetect.Done, } + // Empty the dupes and detectors slice + chunkSecrets = chunkSecrets[:0] + detectorsWithResult = detectorsWithResult[:0] continue nextChunk } chunkSecrets = append(chunkSecrets, string(val)) From c6bfc1c7379ecf06b9b802f7c78d53096b99e246 Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Fri, 26 Jan 2024 09:49:59 -0800 Subject: [PATCH 21/49] use ctx logger --- pkg/engine/engine.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 8d67385a8e7f..e93528b6baad 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -570,17 +570,17 @@ func (e *Engine) detectorWorker(ctx context.Context) { ctx.Logger().V(4).Info("finished scanning chunks") } -func likelyDuplicate(val string, dupesSlice []string) bool { +func likelyDuplicate(ctx context.Context, val string, dupesSlice []string) bool { for _, v := range dupesSlice { if v == val { - fmt.Println("found exact duplicate", val, v) + ctx.Logger().V(2).Info("found exact duplicate", "val", val, "v", v) return true } - similarity := strutil.Similarity(string(val), string(v), metrics.NewLevenshtein()) + similarity := strutil.Similarity(val, v, metrics.NewLevenshtein()) // close enough if similarity > 0.9 { - fmt.Println("found similar duplicate", val, v, similarity) + ctx.Logger().V(2).Info("found similar duplicate", "val", val, "v", v, "similarity", similarity) return true } } @@ -622,7 +622,7 @@ nextChunk: // - postman api key: PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r // - malicious detector "api key": qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r valStr := string(val) - if likelyDuplicate(valStr, chunkSecrets) { + if likelyDuplicate(ctx, valStr, chunkSecrets) { // This indicates that the same secret was found by multiple detectors. // We should NOT VERIFY this chunk's data. if e.reverificationTracking != nil { From 398dfb90d168ea1dae3dcea59d9c34255a0ee22e Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Fri, 26 Jan 2024 10:44:24 -0800 Subject: [PATCH 22/49] add len check --- pkg/engine/engine.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index e93528b6baad..32e6c2532ea0 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -576,6 +576,11 @@ func likelyDuplicate(ctx context.Context, val string, dupesSlice []string) bool ctx.Logger().V(2).Info("found exact duplicate", "val", val, "v", v) return true } + // Avoid comparing strings of vastly different lengths. + if len(v)*10 < len(val)*9 || len(v)*10 > len(val)*11 { + continue + } + similarity := strutil.Similarity(val, v, metrics.NewLevenshtein()) // close enough From c415880eb875b53578704a5269b6c2b32c071c79 Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Fri, 26 Jan 2024 10:49:43 -0800 Subject: [PATCH 23/49] add comments --- pkg/engine/engine.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 32e6c2532ea0..2513b2693bbc 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -427,7 +427,8 @@ func (e *Engine) startWorkers(ctx context.Context) { }() } - // reverifiers... + // Reverifier workers handle verification of chunks that have been detected by multiple detectors. + // They ensure that verification is disabled for any secrets that have been detected by multiple detectors. ctx.Logger().V(2).Info("starting reverifier workers", "count", e.concurrency) for worker := uint64(0); worker < uint64(e.concurrency); worker++ { e.reverifiersWg.Add(1) @@ -509,6 +510,9 @@ type detectableChunk struct { wgDoneFn func() } +// reVerifiableChunk is a decoded chunk that has multiple detectors that match it. +// It will be initially processed with verification disabled, and then reprocessed with verification +// enabled if the same secret was not found by multiple detectors. type reVerifiableChunk struct { chunk sources.Chunk decoder detectorspb.DecoderType @@ -643,7 +647,7 @@ nextChunk: wgDoneFn: wgDetect.Done, } - // Empty the dupes and detectors slice + // Empty the dupes and detectors slice. chunkSecrets = chunkSecrets[:0] detectorsWithResult = detectorsWithResult[:0] continue nextChunk From 8b21a7199a4e1ac37ae8650f1ca8cea5850b295e Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Fri, 26 Jan 2024 14:07:58 -0800 Subject: [PATCH 24/49] use 8x concurrency for reverifier workers --- pkg/engine/engine.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 2513b2693bbc..df1bf9912469 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -430,7 +430,7 @@ func (e *Engine) startWorkers(ctx context.Context) { // Reverifier workers handle verification of chunks that have been detected by multiple detectors. // They ensure that verification is disabled for any secrets that have been detected by multiple detectors. ctx.Logger().V(2).Info("starting reverifier workers", "count", e.concurrency) - for worker := uint64(0); worker < uint64(e.concurrency); worker++ { + for worker := uint64(0); worker < uint64(e.concurrency)*100; worker++ { e.reverifiersWg.Add(1) go func() { ctx := context.WithValue(ctx, "secret_worker_id", common.RandomID(5)) From f15f9ab44e82c144a73bfa46be6846c9b9d5820e Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Tue, 30 Jan 2024 14:25:38 -0800 Subject: [PATCH 25/49] revert worker count --- pkg/engine/engine.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index df1bf9912469..2513b2693bbc 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -430,7 +430,7 @@ func (e *Engine) startWorkers(ctx context.Context) { // Reverifier workers handle verification of chunks that have been detected by multiple detectors. // They ensure that verification is disabled for any secrets that have been detected by multiple detectors. ctx.Logger().V(2).Info("starting reverifier workers", "count", e.concurrency) - for worker := uint64(0); worker < uint64(e.concurrency)*100; worker++ { + for worker := uint64(0); worker < uint64(e.concurrency); worker++ { e.reverifiersWg.Add(1) go func() { ctx := context.WithValue(ctx, "secret_worker_id", common.RandomID(5)) From 0c56de564c7dcdbbb22bf5b1a582b11ed08aac0e Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Tue, 30 Jan 2024 14:46:13 -0800 Subject: [PATCH 26/49] use more workers --- pkg/engine/engine.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 0f710b780d9a..247b120a6f47 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -13,13 +13,14 @@ import ( lru "github.com/hashicorp/golang-lru" "google.golang.org/protobuf/proto" + "github.com/trufflesecurity/trufflehog/v3/pkg/engine/ahocorasick" + "github.com/trufflesecurity/trufflehog/v3/pkg/cleantemp" "github.com/trufflesecurity/trufflehog/v3/pkg/common" "github.com/trufflesecurity/trufflehog/v3/pkg/config" "github.com/trufflesecurity/trufflehog/v3/pkg/context" "github.com/trufflesecurity/trufflehog/v3/pkg/decoders" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors" - "github.com/trufflesecurity/trufflehog/v3/pkg/engine/ahocorasick" "github.com/trufflesecurity/trufflehog/v3/pkg/giturl" "github.com/trufflesecurity/trufflehog/v3/pkg/output" "github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb" @@ -431,7 +432,7 @@ func (e *Engine) startWorkers(ctx context.Context) { // Reverifier workers handle verification of chunks that have been detected by multiple detectors. // They ensure that verification is disabled for any secrets that have been detected by multiple detectors. ctx.Logger().V(2).Info("starting reverifier workers", "count", e.concurrency) - for worker := uint64(0); worker < uint64(e.concurrency); worker++ { + for worker := uint64(0); worker < uint64(e.concurrency*100); worker++ { e.reverifiersWg.Add(1) go func() { ctx := context.WithValue(ctx, "secret_worker_id", common.RandomID(5)) From c98bff1db533afad4bf8d0192b95f87c3a5b0d6b Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Tue, 30 Jan 2024 15:21:23 -0800 Subject: [PATCH 27/49] process result directly for any collisions --- pkg/engine/engine.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 247b120a6f47..6e04483861f5 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -640,14 +640,13 @@ nextChunk: e.reverificationTracking.increment() } chunk.reverifyWgDoneFn() - wgDetect.Add(1) - chunk.chunk.Verify = false // DO NOT VERIFY - e.detectableChunksChan <- detectableChunk{ + + e.processResult(ctx, detectableChunk{ chunk: chunk.chunk, detector: detector, decoder: chunk.decoder, wgDoneFn: wgDetect.Done, - } + }, res) // Empty the dupes and detectors slice. chunkSecrets = chunkSecrets[:0] From 428b051f0db91259b655b45f1c2c1ced2d3e5d1e Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Tue, 30 Jan 2024 15:46:17 -0800 Subject: [PATCH 28/49] continue after decoder match for reverifying --- pkg/engine/engine.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 6e04483861f5..1d66ceca62c6 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -530,6 +530,7 @@ func (e *Engine) detectorWorker(ctx context.Context) { const avgDetectorsPerChunk = 2 chunkSpecificDetectors := make(map[ahocorasick.DetectorKey]detectors.Detector, avgDetectorsPerChunk) for originalChunk := range e.ChunksChan() { + nextChunk: for chunk := range sources.Chunker(originalChunk) { atomic.AddUint64(&e.metrics.BytesScanned, uint64(len(chunk.Data))) for _, decoder := range e.decoders { @@ -552,7 +553,7 @@ func (e *Engine) detectorWorker(ctx context.Context) { for k := range chunkSpecificDetectors { delete(chunkSpecificDetectors, k) } - continue + continue nextChunk } for k, detector := range chunkSpecificDetectors { From 4ea40b08d4ba8cce7f225bc1353bcb5a083b12db Mon Sep 17 00:00:00 2001 From: Dustin Decker Date: Tue, 30 Jan 2024 15:52:28 -0800 Subject: [PATCH 29/49] use map --- pkg/engine/engine.go | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 6e04483861f5..c2057910d72f 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -576,22 +576,17 @@ func (e *Engine) detectorWorker(ctx context.Context) { ctx.Logger().V(4).Info("finished scanning chunks") } -func likelyDuplicate(ctx context.Context, val string, dupesSlice []string) bool { - for _, v := range dupesSlice { - if v == val { - ctx.Logger().V(2).Info("found exact duplicate", "val", val, "v", v) - return true - } - // Avoid comparing strings of vastly different lengths. - if len(v)*10 < len(val)*9 || len(v)*10 > len(val)*11 { - continue - } +func likelyDuplicate(ctx context.Context, val string, dupes map[string]struct{}) bool { + if _, ok := dupes[val]; ok { + return true + } - similarity := strutil.Similarity(val, v, metrics.NewLevenshtein()) + for k := range dupes { + similarity := strutil.Similarity(val, k, metrics.NewLevenshtein()) // close enough if similarity > 0.9 { - ctx.Logger().V(2).Info("found similar duplicate", "val", val, "v", v, "similarity", similarity) + ctx.Logger().V(2).Info("found similar duplicate", "val", val, "similarity", similarity) return true } } @@ -604,7 +599,7 @@ func (e *Engine) reverifierWorker(ctx context.Context) { // Reuse the same map and slice to avoid allocations. const avgSecretsPerDetector = 8 detectorsWithResult := make([]detectors.Detector, 0, avgSecretsPerDetector) - chunkSecrets := make([]string, 0, avgSecretsPerDetector) + chunkSecrets := make(map[string]struct{}, avgSecretsPerDetector) nextChunk: for chunk := range e.reverifiableChunksChan { @@ -649,11 +644,11 @@ nextChunk: }, res) // Empty the dupes and detectors slice. - chunkSecrets = chunkSecrets[:0] + chunkSecrets = make(map[string]struct{}, avgSecretsPerDetector) detectorsWithResult = detectorsWithResult[:0] continue nextChunk } - chunkSecrets = append(chunkSecrets, valStr) + chunkSecrets[valStr] = struct{}{} } } @@ -669,7 +664,7 @@ nextChunk: } // Empty the dupes and detectors slice - chunkSecrets = chunkSecrets[:0] + chunkSecrets = make(map[string]struct{}, avgSecretsPerDetector) detectorsWithResult = detectorsWithResult[:0] chunk.reverifyWgDoneFn() From 6ca05efea8a3bb263bb6fd7e191899e55a2d98d7 Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Tue, 30 Jan 2024 16:02:42 -0800 Subject: [PATCH 30/49] use map --- pkg/engine/engine.go | 48 ++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 1d66ceca62c6..c2182187309d 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -530,7 +530,6 @@ func (e *Engine) detectorWorker(ctx context.Context) { const avgDetectorsPerChunk = 2 chunkSpecificDetectors := make(map[ahocorasick.DetectorKey]detectors.Detector, avgDetectorsPerChunk) for originalChunk := range e.ChunksChan() { - nextChunk: for chunk := range sources.Chunker(originalChunk) { atomic.AddUint64(&e.metrics.BytesScanned, uint64(len(chunk.Data))) for _, decoder := range e.decoders { @@ -553,7 +552,7 @@ func (e *Engine) detectorWorker(ctx context.Context) { for k := range chunkSpecificDetectors { delete(chunkSpecificDetectors, k) } - continue nextChunk + continue } for k, detector := range chunkSpecificDetectors { @@ -577,22 +576,21 @@ func (e *Engine) detectorWorker(ctx context.Context) { ctx.Logger().V(4).Info("finished scanning chunks") } -func likelyDuplicate(ctx context.Context, val string, dupesSlice []string) bool { - for _, v := range dupesSlice { - if v == val { - ctx.Logger().V(2).Info("found exact duplicate", "val", val, "v", v) - return true - } +func likelyDuplicate(ctx context.Context, val string, dupes map[string]struct{}) bool { + if _, ok := dupes[val]; ok { + return true + } + for k := range dupes { // Avoid comparing strings of vastly different lengths. - if len(v)*10 < len(val)*9 || len(v)*10 > len(val)*11 { + if len(k)*10 < len(val)*9 || len(k)*10 > len(val)*11 { continue } - similarity := strutil.Similarity(val, v, metrics.NewLevenshtein()) + similarity := strutil.Similarity(val, k, metrics.NewLevenshtein()) // close enough if similarity > 0.9 { - ctx.Logger().V(2).Info("found similar duplicate", "val", val, "v", v, "similarity", similarity) + ctx.Logger().V(2).Info("found similar duplicate", "val", val, "k", k, "similarity", similarity) return true } } @@ -604,8 +602,8 @@ func (e *Engine) reverifierWorker(ctx context.Context) { // Reuse the same map and slice to avoid allocations. const avgSecretsPerDetector = 8 - detectorsWithResult := make([]detectors.Detector, 0, avgSecretsPerDetector) - chunkSecrets := make([]string, 0, avgSecretsPerDetector) + detectorsWithResult := make(map[detectors.Detector]struct{}, avgSecretsPerDetector) + chunkSecrets := make(map[string]struct{}, avgSecretsPerDetector) nextChunk: for chunk := range e.reverifiableChunksChan { @@ -619,7 +617,9 @@ nextChunk: if len(results) == 0 { continue } - detectorsWithResult = append(detectorsWithResult, detector) + if _, ok := detectorsWithResult[detector]; !ok { + detectorsWithResult[detector] = struct{}{} + } for _, res := range results { var val []byte @@ -650,15 +650,19 @@ nextChunk: }, res) // Empty the dupes and detectors slice. - chunkSecrets = chunkSecrets[:0] - detectorsWithResult = detectorsWithResult[:0] + for k := range chunkSecrets { + delete(chunkSecrets, k) + } + for k := range detectorsWithResult { + delete(detectorsWithResult, k) + } continue nextChunk } - chunkSecrets = append(chunkSecrets, valStr) + chunkSecrets[valStr] = struct{}{} } } - for _, detector := range detectorsWithResult { + for detector := range detectorsWithResult { wgDetect.Add(1) chunk.chunk.Verify = e.verify e.detectableChunksChan <- detectableChunk{ @@ -670,8 +674,12 @@ nextChunk: } // Empty the dupes and detectors slice - chunkSecrets = chunkSecrets[:0] - detectorsWithResult = detectorsWithResult[:0] + for k := range chunkSecrets { + delete(chunkSecrets, k) + } + for k := range detectorsWithResult { + delete(detectorsWithResult, k) + } chunk.reverifyWgDoneFn() } From 76c2b20b394229faf5366be4c800225b10fbb7c9 Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Tue, 30 Jan 2024 16:07:45 -0800 Subject: [PATCH 31/49] otimization and fix the bug. --- pkg/engine/ahocorasick/ahocorasickcore.go | 22 ++++++-- pkg/engine/engine.go | 67 ++++++++++++++--------- pkg/engine/engine_test.go | 7 ++- 3 files changed, 62 insertions(+), 34 deletions(-) diff --git a/pkg/engine/ahocorasick/ahocorasickcore.go b/pkg/engine/ahocorasick/ahocorasickcore.go index 47656d45797c..c62ad2557153 100644 --- a/pkg/engine/ahocorasick/ahocorasickcore.go +++ b/pkg/engine/ahocorasick/ahocorasickcore.go @@ -68,15 +68,27 @@ func NewAhoCorasickCore(allDetectors []detectors.Detector) *AhoCorasickCore { // it will be called once per chunk and that many allocations has a noticeable performance cost. func (ac *AhoCorasickCore) PopulateMatchingDetectors(chunkData string, dts map[DetectorKey]detectors.Detector) []detectors.Detector { matches := ac.prefilter.MatchString(strings.ToLower(chunkData)) - d := make([]detectors.Detector, 0, len(matches)) - for _, m := range ac.prefilter.MatchString(strings.ToLower(chunkData)) { + + // Use a map to avoid adding duplicate detectors to the slice. + addedDetectors := make(map[DetectorKey]struct{}) + uniqueDetectors := make([]detectors.Detector, 0, len(matches)) + + for _, m := range matches { for _, k := range ac.keywordsToDetectors[m.MatchString()] { - dts[k] = ac.detectorsByKey[k] - d = append(d, ac.detectorsByKey[k]) + if _, exists := addedDetectors[k]; exists { + continue + } + // Add to the map to track already added detectors. + addedDetectors[k] = struct{}{} + + // Add the detector to the map and slice. + detector := ac.detectorsByKey[k] + dts[k] = detector + uniqueDetectors = append(uniqueDetectors, detector) } } - return d + return uniqueDetectors } // createDetectorKey creates a unique key for each detector from its type, version, and, for diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 61d3e772a321..111fcd3e39c0 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -321,7 +321,7 @@ func Start(ctx context.Context, options ...Option) (*Engine, error) { return e, nil } -const defaultChannelBuffer = 1 +var defaultChannelBuffer = runtime.NumCPU() // initialize prepares the engine's internal structures. The LRU cache optimizes // deduplication efforts, allowing the engine to quickly check if a chunk has @@ -334,11 +334,25 @@ func (e *Engine) initialize(ctx context.Context, options ...Option) error { if err != nil { return fmt.Errorf("failed to initialize LRU cache: %w", err) } + const ( + // detectableChunksChanMultiplier is set to accommodate a high number of concurrent worker goroutines. + // This multiplier ensures that the detectableChunksChan channel has sufficient buffer capacity + // to hold messages from multiple worker groups (detector workers/ reverifier workers) without blocking. + // A large buffer helps accommodate for the fact workers are producing data at a faster rate + // than it can be consumed. + detectableChunksChanMultiplier = 50 + // reverifiableChunksChanMultiplier uses a smaller buffer compared to detectableChunksChanMultiplier. + // This reflects the anticipated lower volume of data that needs re-verification. + // The buffer size is a trade-off between memory usage and the need to prevent blocking. + reverifiableChunksChanMultiplier = 25 + ) // Channels are used for communication between different parts of the engine, // ensuring that data flows smoothly without race conditions. - e.detectableChunksChan = make(chan detectableChunk, defaultChannelBuffer) - e.reverifiableChunksChan = make(chan reVerifiableChunk, defaultChannelBuffer) + // The buffer sizes for these channels are set to multiples of defaultChannelBuffer, + // considering the expected concurrency and workload in the system. + e.detectableChunksChan = make(chan detectableChunk, defaultChannelBuffer*detectableChunksChanMultiplier) + e.reverifiableChunksChan = make(chan reVerifiableChunk, defaultChannelBuffer*reverifiableChunksChanMultiplier) e.results = make(chan detectors.ResultWithMetadata, defaultChannelBuffer) e.dedupeCache = cache e.printer = new(output.PlainPrinter) @@ -430,8 +444,9 @@ func (e *Engine) startWorkers(ctx context.Context) { // Reverifier workers handle verification of chunks that have been detected by multiple detectors. // They ensure that verification is disabled for any secrets that have been detected by multiple detectors. + const reverifierWorkerMultiplier = detectorWorkerMultiplier / 2 ctx.Logger().V(2).Info("starting reverifier workers", "count", e.concurrency) - for worker := uint64(0); worker < uint64(e.concurrency*100); worker++ { + for worker := uint64(0); worker < uint64(e.concurrency*reverifierWorkerMultiplier); worker++ { e.reverifiersWg.Add(1) go func() { ctx := context.WithValue(ctx, "secret_worker_id", common.RandomID(5)) @@ -526,7 +541,7 @@ func (e *Engine) detectorWorker(ctx context.Context) { var wgReverify sync.WaitGroup // Reuse the same map to avoid allocations. - const avgDetectorsPerChunk = 2 + const avgDetectorsPerChunk = 8 chunkSpecificDetectors := make(map[ahocorasick.DetectorKey]detectors.Detector, avgDetectorsPerChunk) for originalChunk := range e.ChunksChan() { for chunk := range sources.Chunker(originalChunk) { @@ -575,21 +590,31 @@ func (e *Engine) detectorWorker(ctx context.Context) { ctx.Logger().V(4).Info("finished scanning chunks") } -func likelyDuplicate(ctx context.Context, val string, dupes map[string]struct{}) bool { - if _, ok := dupes[val]; ok { +func likelyDuplicate(ctx context.Context, val []byte, dupes map[string]struct{}) bool { + if _, ok := dupes[string(val)]; ok { return true } - for k := range dupes { + + // The string conversion is purposefully placed after the dupes check to avoid the allocation. + // []byte -> string conversion within a map lookup does not allocate. (due to compiler optimizations) + valStr := string(val) + const similarityThreshold = 0.9 + for dupe := range dupes { // Avoid comparing strings of vastly different lengths. - if len(k)*10 < len(val)*9 || len(k)*10 > len(val)*11 { + if len(dupe)*10 < len(valStr)*9 || len(dupe)*10 > len(valStr)*11 { continue } - similarity := strutil.Similarity(val, k, metrics.NewLevenshtein()) + similarity := strutil.Similarity(valStr, dupe, metrics.NewLevenshtein()) // close enough - if similarity > 0.9 { - ctx.Logger().V(2).Info("found similar duplicate", "val", val, "k", k, "similarity", similarity) + if similarity > similarityThreshold { + ctx.Logger().V(2).Info( + "found similar duplicate", + "val", val, + "dupe", dupe, + "similarity", similarity, + ) return true } } @@ -604,7 +629,6 @@ func (e *Engine) reverifierWorker(ctx context.Context) { detectorsWithResult := make(map[detectors.Detector]struct{}, avgSecretsPerDetector) chunkSecrets := make(map[string]struct{}, avgSecretsPerDetector) -nextChunk: for chunk := range e.reverifiableChunksChan { for _, detector := range chunk.detectors { // DO NOT VERIFY at this stage of the pipeline. @@ -632,15 +656,12 @@ nextChunk: // Ex: // - postman api key: PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r // - malicious detector "api key": qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r - valStr := string(val) - if likelyDuplicate(ctx, valStr, chunkSecrets) { + if likelyDuplicate(ctx, val, chunkSecrets) { // This indicates that the same secret was found by multiple detectors. // We should NOT VERIFY this chunk's data. if e.reverificationTracking != nil { e.reverificationTracking.increment() } - chunk.reverifyWgDoneFn() - e.processResult(ctx, detectableChunk{ chunk: chunk.chunk, detector: detector, @@ -648,16 +669,10 @@ nextChunk: wgDoneFn: wgDetect.Done, }, res) - // Empty the dupes and detectors slice. - for k := range chunkSecrets { - delete(chunkSecrets, k) - } - for k := range detectorsWithResult { - delete(detectorsWithResult, k) - } - continue nextChunk + // Remove the detector and secret from the maps + delete(detectorsWithResult, detector) } - chunkSecrets[valStr] = struct{}{} + chunkSecrets[string(val)] = struct{}{} } } diff --git a/pkg/engine/engine_test.go b/pkg/engine/engine_test.go index bc059f64ca5b..8f67c11229fc 100644 --- a/pkg/engine/engine_test.go +++ b/pkg/engine/engine_test.go @@ -199,7 +199,7 @@ func TestEngine_DuplicatSecrets(t *testing.T) { WithConcurrency(1), WithDecoders(decoders.DefaultDecoders()...), WithDetectors(DefaultDetectors()...), - WithVerify(true), + WithVerify(false), WithPrinter(new(discardPrinter)), ) assert.Nil(t, err) @@ -233,7 +233,7 @@ func TestReverifcationChunk(t *testing.T) { WithConcurrency(1), WithDecoders(decoders.DefaultDecoders()...), WithDetectors(conf.Detectors...), - WithVerify(true), + WithVerify(false), WithPrinter(new(discardPrinter)), withReverificationTracking(), ) @@ -246,7 +246,8 @@ func TestReverifcationChunk(t *testing.T) { // Wait for all the chunks to be processed. assert.Nil(t, e.Finish(ctx)) - want := uint64(1) + // We want TWO secrets that match both the custom regexes. + want := uint64(2) assert.Equal(t, want, e.GetMetrics().UnverifiedSecretsFound) wantDupe := 1 From 76bcb515372412d42b6b40b162afc8413fd626c2 Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Wed, 31 Jan 2024 06:46:41 -0800 Subject: [PATCH 32/49] revert worker count --- pkg/engine/engine.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 111fcd3e39c0..daf53bf982b6 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -444,7 +444,7 @@ func (e *Engine) startWorkers(ctx context.Context) { // Reverifier workers handle verification of chunks that have been detected by multiple detectors. // They ensure that verification is disabled for any secrets that have been detected by multiple detectors. - const reverifierWorkerMultiplier = detectorWorkerMultiplier / 2 + const reverifierWorkerMultiplier = detectorWorkerMultiplier ctx.Logger().V(2).Info("starting reverifier workers", "count", e.concurrency) for worker := uint64(0); worker < uint64(e.concurrency*reverifierWorkerMultiplier); worker++ { e.reverifiersWg.Add(1) From 5bf61da351353b07479e637ec3592a8be2a583dd Mon Sep 17 00:00:00 2001 From: Zachary Rice Date: Wed, 31 Jan 2024 11:50:42 -0600 Subject: [PATCH 33/49] better option naming --- main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.go b/main.go index 2efff3b0cae2..8c2a62b8175b 100644 --- a/main.go +++ b/main.go @@ -49,7 +49,7 @@ var ( concurrency = cli.Flag("concurrency", "Number of concurrent workers.").Default(strconv.Itoa(runtime.NumCPU())).Int() noVerification = cli.Flag("no-verification", "Don't verify the results.").Bool() onlyVerified = cli.Flag("only-verified", "Only output verified results.").Bool() - forceReverification = cli.Flag("force-reverification", "Verify credentials when multiple similar credentials are found across detectors.").Bool() + forceReverification = cli.Flag("allow-verification-overlap", "Allow verification of similar credentials across detectors").Bool() filterUnverified = cli.Flag("filter-unverified", "Only output first unverified result per chunk per detector if there are more than one results.").Bool() filterEntropy = cli.Flag("filter-entropy", "Filter unverified results with Shannon entropy. Start with 3.0.").Float64() configFilename = cli.Flag("config", "Path to configuration file.").ExistingFile() From 96c6cd518d8cc8a28f77ce5e0caebc6692bb2b57 Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Wed, 31 Jan 2024 11:57:14 -0800 Subject: [PATCH 34/49] handle identical secrets in chunks --- pkg/engine/engine.go | 31 ++++++---- pkg/engine/engine_test.go | 60 +++++++++++++++++++ pkg/engine/testdata/mixed_secrets.txt | 24 ++++++++ .../testdata/verified_canary_secrets.txt | 20 +++++++ 4 files changed, 123 insertions(+), 12 deletions(-) create mode 100644 pkg/engine/testdata/mixed_secrets.txt create mode 100644 pkg/engine/testdata/verified_canary_secrets.txt diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index daf53bf982b6..d8b2dea68eb4 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -590,16 +590,17 @@ func (e *Engine) detectorWorker(ctx context.Context) { ctx.Logger().V(4).Info("finished scanning chunks") } -func likelyDuplicate(ctx context.Context, val []byte, dupes map[string]struct{}) bool { - if _, ok := dupes[string(val)]; ok { - return true - } +type chunkSecretKey struct { + secret string + detectorID int32 +} - // The string conversion is purposefully placed after the dupes check to avoid the allocation. - // []byte -> string conversion within a map lookup does not allocate. (due to compiler optimizations) - valStr := string(val) +func likelyDuplicate(ctx context.Context, val chunkSecretKey, dupes map[chunkSecretKey]struct{}) bool { const similarityThreshold = 0.9 - for dupe := range dupes { + + valStr := val.secret + for dupeKey := range dupes { + dupe := dupeKey.secret // Avoid comparing strings of vastly different lengths. if len(dupe)*10 < len(valStr)*9 || len(dupe)*10 > len(valStr)*11 { continue @@ -627,7 +628,7 @@ func (e *Engine) reverifierWorker(ctx context.Context) { // Reuse the same map and slice to avoid allocations. const avgSecretsPerDetector = 8 detectorsWithResult := make(map[detectors.Detector]struct{}, avgSecretsPerDetector) - chunkSecrets := make(map[string]struct{}, avgSecretsPerDetector) + chunkSecrets := make(map[chunkSecretKey]struct{}, avgSecretsPerDetector) for chunk := range e.reverifiableChunksChan { for _, detector := range chunk.detectors { @@ -656,7 +657,13 @@ func (e *Engine) reverifierWorker(ctx context.Context) { // Ex: // - postman api key: PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r // - malicious detector "api key": qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r - if likelyDuplicate(ctx, val, chunkSecrets) { + key := chunkSecretKey{secret: string(val), detectorID: int32(res.DetectorType)} + if _, ok := chunkSecrets[key]; ok { + chunkSecrets[key] = struct{}{} + continue + } + + if likelyDuplicate(ctx, key, chunkSecrets) { // This indicates that the same secret was found by multiple detectors. // We should NOT VERIFY this chunk's data. if e.reverificationTracking != nil { @@ -669,10 +676,10 @@ func (e *Engine) reverifierWorker(ctx context.Context) { wgDoneFn: wgDetect.Done, }, res) - // Remove the detector and secret from the maps + // Remove the detector from the list of detectors with results. delete(detectorsWithResult, detector) } - chunkSecrets[string(val)] = struct{}{} + chunkSecrets[key] = struct{}{} } } diff --git a/pkg/engine/engine_test.go b/pkg/engine/engine_test.go index 8f67c11229fc..2d6932f2e811 100644 --- a/pkg/engine/engine_test.go +++ b/pkg/engine/engine_test.go @@ -215,6 +215,66 @@ func TestEngine_DuplicatSecrets(t *testing.T) { assert.Equal(t, want, e.GetMetrics().UnverifiedSecretsFound) } +func TestEngine_DuplicatVerifiedSecrets(t *testing.T) { + ctx := context.Background() + + absPath, err := filepath.Abs("./testdata/verified_canary_secrets.txt") + assert.Nil(t, err) + + ctx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + + e, err := Start(ctx, + WithConcurrency(1), + WithDecoders(decoders.DefaultDecoders()...), + WithDetectors(DefaultDetectors()...), + WithVerify(true), + WithPrinter(new(discardPrinter)), + ) + assert.Nil(t, err) + + cfg := sources.FilesystemConfig{Paths: []string{absPath}} + if err := e.ScanFileSystem(ctx, cfg); err != nil { + return + } + + // Wait for all the chunks to be processed. + assert.Nil(t, e.Finish(ctx)) + want := uint64(4) + assert.Equal(t, want, e.GetMetrics().VerifiedSecretsFound) +} + +func TestEngine_DuplicatVerifiedSecretsMultipleDetectors(t *testing.T) { + ctx := context.Background() + + absPath, err := filepath.Abs("./testdata/mixed_secrets.txt") + assert.Nil(t, err) + + ctx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + + e, err := Start(ctx, + WithConcurrency(1), + WithDecoders(decoders.DefaultDecoders()...), + WithDetectors(DefaultDetectors()...), + WithVerify(true), + WithPrinter(new(discardPrinter)), + ) + assert.Nil(t, err) + + cfg := sources.FilesystemConfig{Paths: []string{absPath}} + if err := e.ScanFileSystem(ctx, cfg); err != nil { + return + } + + // Wait for all the chunks to be processed. + assert.Nil(t, e.Finish(ctx)) + wantVerified := uint64(4) + wantUnverified := uint64(4) + assert.Equal(t, wantVerified, e.GetMetrics().VerifiedSecretsFound) + assert.Equal(t, wantUnverified, e.GetMetrics().UnverifiedSecretsFound) +} + func TestReverifcationChunk(t *testing.T) { ctx := context.Background() diff --git a/pkg/engine/testdata/mixed_secrets.txt b/pkg/engine/testdata/mixed_secrets.txt new file mode 100644 index 000000000000..57e07e62a33a --- /dev/null +++ b/pkg/engine/testdata/mixed_secrets.txt @@ -0,0 +1,24 @@ +[default] +aws_access_key_id = AKIAQYLPMN5HFC4N7W75 +aws_secret_access_key = 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 +output = json +region = us-east-2 +[default] +aws_access_key_id = AKIAQYLPMN5HFC4N7W75 +aws_secret_access_key = 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 +output = json +region = us-east-2 +[default] +aws_access_key_id = AKIAQYLPMN5HFC4N7W75 +aws_secret_access_key = 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 +output = json +region = us-east-2 +[default] +aws_access_key_id = AKIAQYLPMN5HFC4N7W75 +aws_secret_access_key = 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 +output = json +region = us-east-2 + sentry 27ac84f4bcdb4fca9701f4d6f6f58cd7d96b69c9d9754d40800645a51d668f90 + sentry 27ac84f4bcdb4fca9701f4d6f6f58cd7d96b69c9d9754d40800645a51d668f90 + sentry 27ac84f4bcdb4fca9701f4d6f6f58cd7d96b69c9d9754d40800645a51d668f90 + sentry 27ac84f4bcdb4fca9701f4d6f6f58cd7d96b69c9d9754d40800645a51d668f90 diff --git a/pkg/engine/testdata/verified_canary_secrets.txt b/pkg/engine/testdata/verified_canary_secrets.txt new file mode 100644 index 000000000000..942d04ffd222 --- /dev/null +++ b/pkg/engine/testdata/verified_canary_secrets.txt @@ -0,0 +1,20 @@ +[default] +aws_access_key_id = AKIAQYLPMN5HFC4N7W75 +aws_secret_access_key = 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 +output = json +region = us-east-2 +[default] +aws_access_key_id = AKIAQYLPMN5HFC4N7W75 +aws_secret_access_key = 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 +output = json +region = us-east-2 +[default] +aws_access_key_id = AKIAQYLPMN5HFC4N7W75 +aws_secret_access_key = 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 +output = json +region = us-east-2 +[default] +aws_access_key_id = AKIAQYLPMN5HFC4N7W75 +aws_secret_access_key = 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 +output = json +region = us-east-2 \ No newline at end of file From a72201c7268da2f054a4e1a5d2b4fa99ef82c142 Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Wed, 31 Jan 2024 12:08:32 -0800 Subject: [PATCH 35/49] update comment --- pkg/engine/engine.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index d8b2dea68eb4..912b2f8ffc38 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -590,6 +590,10 @@ func (e *Engine) detectorWorker(ctx context.Context) { ctx.Logger().V(4).Info("finished scanning chunks") } +// chunkSecretKey ties secrets to the specific detector that found them. This allows identifying identical +// credentials extracted by multiple different detectors processing the same chunk. Or duplicates found by the +// same detector in the chunk. Exact matches on lookup indicate a duplicate secret for a detector in +// that chunk - which is expected and not malicious. Those intra-detector dupes are still verified. type chunkSecretKey struct { secret string detectorID int32 From 8ad9859464aa2a1fd36c261fc65174a0fcf08346 Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Wed, 31 Jan 2024 12:09:40 -0800 Subject: [PATCH 36/49] update comment --- pkg/engine/engine.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 912b2f8ffc38..8616e270aec8 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -591,9 +591,9 @@ func (e *Engine) detectorWorker(ctx context.Context) { } // chunkSecretKey ties secrets to the specific detector that found them. This allows identifying identical -// credentials extracted by multiple different detectors processing the same chunk. Or duplicates found by the -// same detector in the chunk. Exact matches on lookup indicate a duplicate secret for a detector in -// that chunk - which is expected and not malicious. Those intra-detector dupes are still verified. +// credentials extracted by multiple different detectors processing the same chunk. Or duplicates found +// by the same detector in the chunk. Exact matches on lookup indicate a duplicate secret for a detector +// in that chunk - which is expected and not malicious. Those intra-detector dupes are still verified. type chunkSecretKey struct { secret string detectorID int32 From c098a6da684c307bf4042885f1118ae5650bca0b Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Wed, 31 Jan 2024 12:17:34 -0800 Subject: [PATCH 37/49] fix test --- pkg/engine/engine_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/engine/engine_test.go b/pkg/engine/engine_test.go index 2d6932f2e811..9dd78f799f3d 100644 --- a/pkg/engine/engine_test.go +++ b/pkg/engine/engine_test.go @@ -189,7 +189,7 @@ func BenchmarkSupportsLineNumbersLoop(b *testing.B) { func TestEngine_DuplicatSecrets(t *testing.T) { ctx := context.Background() - absPath, err := filepath.Abs("./testdata") + absPath, err := filepath.Abs("./testdata/secrets.txt") assert.Nil(t, err) ctx, cancel := context.WithTimeout(ctx, 10*time.Second) From 46a58281038c6fed8ab7a3d9f7bd501fdab82793 Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Wed, 31 Jan 2024 12:27:17 -0800 Subject: [PATCH 38/49] use DetecotrKey --- pkg/engine/ahocorasick/ahocorasickcore.go | 15 ++++++++++++--- pkg/engine/engine.go | 8 ++++---- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/pkg/engine/ahocorasick/ahocorasickcore.go b/pkg/engine/ahocorasick/ahocorasickcore.go index c62ad2557153..5a4c7c1bbbd9 100644 --- a/pkg/engine/ahocorasick/ahocorasickcore.go +++ b/pkg/engine/ahocorasick/ahocorasickcore.go @@ -63,15 +63,24 @@ func NewAhoCorasickCore(allDetectors []detectors.Detector) *AhoCorasickCore { } } +// DetectorInfo represents a detected pattern's metadata in a data chunk. +// It encapsulates the key identifying a specific detector and the detector instance itself. +type DetectorInfo struct { + Key DetectorKey + detectors.Detector +} + // PopulateMatchingDetectors populates the given detector slice with all the detectors matching the // provided input. This method populates an existing map rather than allocating a new one because // it will be called once per chunk and that many allocations has a noticeable performance cost. -func (ac *AhoCorasickCore) PopulateMatchingDetectors(chunkData string, dts map[DetectorKey]detectors.Detector) []detectors.Detector { +// It returns a slice of unique 'DetectorInfo' corresponding to the matched detectors. This slice is +// constructed to prevent duplications by utilizing an internal map to track already processed detectors. +func (ac *AhoCorasickCore) PopulateMatchingDetectors(chunkData string, dts map[DetectorKey]detectors.Detector) []DetectorInfo { matches := ac.prefilter.MatchString(strings.ToLower(chunkData)) // Use a map to avoid adding duplicate detectors to the slice. addedDetectors := make(map[DetectorKey]struct{}) - uniqueDetectors := make([]detectors.Detector, 0, len(matches)) + uniqueDetectors := make([]DetectorInfo, 0, len(matches)) for _, m := range matches { for _, k := range ac.keywordsToDetectors[m.MatchString()] { @@ -84,7 +93,7 @@ func (ac *AhoCorasickCore) PopulateMatchingDetectors(chunkData string, dts map[D // Add the detector to the map and slice. detector := ac.detectorsByKey[k] dts[k] = detector - uniqueDetectors = append(uniqueDetectors, detector) + uniqueDetectors = append(uniqueDetectors, DetectorInfo{Key: k, Detector: detector}) } } diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 8616e270aec8..c5b002505d0c 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -532,7 +532,7 @@ type detectableChunk struct { type reVerifiableChunk struct { chunk sources.Chunk decoder detectorspb.DecoderType - detectors []detectors.Detector + detectors []ahocorasick.DetectorInfo reverifyWgDoneFn func() } @@ -595,8 +595,8 @@ func (e *Engine) detectorWorker(ctx context.Context) { // by the same detector in the chunk. Exact matches on lookup indicate a duplicate secret for a detector // in that chunk - which is expected and not malicious. Those intra-detector dupes are still verified. type chunkSecretKey struct { - secret string - detectorID int32 + secret string + detectorInfo ahocorasick.DetectorInfo } func likelyDuplicate(ctx context.Context, val chunkSecretKey, dupes map[chunkSecretKey]struct{}) bool { @@ -661,7 +661,7 @@ func (e *Engine) reverifierWorker(ctx context.Context) { // Ex: // - postman api key: PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r // - malicious detector "api key": qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r - key := chunkSecretKey{secret: string(val), detectorID: int32(res.DetectorType)} + key := chunkSecretKey{secret: string(val), detectorInfo: detector} if _, ok := chunkSecrets[key]; ok { chunkSecrets[key] = struct{}{} continue From c3125f36c399afbccdefb126b2d8b7c329455fc1 Mon Sep 17 00:00:00 2001 From: Zachary Rice Date: Wed, 31 Jan 2024 15:09:22 -0600 Subject: [PATCH 39/49] rm out of scope tests and testdata --- pkg/engine/engine_test.go | 60 ------------------- pkg/engine/testdata/mixed_secrets.txt | 24 -------- .../testdata/verified_canary_secrets.txt | 20 ------- 3 files changed, 104 deletions(-) delete mode 100644 pkg/engine/testdata/mixed_secrets.txt delete mode 100644 pkg/engine/testdata/verified_canary_secrets.txt diff --git a/pkg/engine/engine_test.go b/pkg/engine/engine_test.go index 9dd78f799f3d..2898ed0c7c88 100644 --- a/pkg/engine/engine_test.go +++ b/pkg/engine/engine_test.go @@ -215,66 +215,6 @@ func TestEngine_DuplicatSecrets(t *testing.T) { assert.Equal(t, want, e.GetMetrics().UnverifiedSecretsFound) } -func TestEngine_DuplicatVerifiedSecrets(t *testing.T) { - ctx := context.Background() - - absPath, err := filepath.Abs("./testdata/verified_canary_secrets.txt") - assert.Nil(t, err) - - ctx, cancel := context.WithTimeout(ctx, 10*time.Second) - defer cancel() - - e, err := Start(ctx, - WithConcurrency(1), - WithDecoders(decoders.DefaultDecoders()...), - WithDetectors(DefaultDetectors()...), - WithVerify(true), - WithPrinter(new(discardPrinter)), - ) - assert.Nil(t, err) - - cfg := sources.FilesystemConfig{Paths: []string{absPath}} - if err := e.ScanFileSystem(ctx, cfg); err != nil { - return - } - - // Wait for all the chunks to be processed. - assert.Nil(t, e.Finish(ctx)) - want := uint64(4) - assert.Equal(t, want, e.GetMetrics().VerifiedSecretsFound) -} - -func TestEngine_DuplicatVerifiedSecretsMultipleDetectors(t *testing.T) { - ctx := context.Background() - - absPath, err := filepath.Abs("./testdata/mixed_secrets.txt") - assert.Nil(t, err) - - ctx, cancel := context.WithTimeout(ctx, 10*time.Second) - defer cancel() - - e, err := Start(ctx, - WithConcurrency(1), - WithDecoders(decoders.DefaultDecoders()...), - WithDetectors(DefaultDetectors()...), - WithVerify(true), - WithPrinter(new(discardPrinter)), - ) - assert.Nil(t, err) - - cfg := sources.FilesystemConfig{Paths: []string{absPath}} - if err := e.ScanFileSystem(ctx, cfg); err != nil { - return - } - - // Wait for all the chunks to be processed. - assert.Nil(t, e.Finish(ctx)) - wantVerified := uint64(4) - wantUnverified := uint64(4) - assert.Equal(t, wantVerified, e.GetMetrics().VerifiedSecretsFound) - assert.Equal(t, wantUnverified, e.GetMetrics().UnverifiedSecretsFound) -} - func TestReverifcationChunk(t *testing.T) { ctx := context.Background() diff --git a/pkg/engine/testdata/mixed_secrets.txt b/pkg/engine/testdata/mixed_secrets.txt deleted file mode 100644 index 57e07e62a33a..000000000000 --- a/pkg/engine/testdata/mixed_secrets.txt +++ /dev/null @@ -1,24 +0,0 @@ -[default] -aws_access_key_id = AKIAQYLPMN5HFC4N7W75 -aws_secret_access_key = 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 -output = json -region = us-east-2 -[default] -aws_access_key_id = AKIAQYLPMN5HFC4N7W75 -aws_secret_access_key = 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 -output = json -region = us-east-2 -[default] -aws_access_key_id = AKIAQYLPMN5HFC4N7W75 -aws_secret_access_key = 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 -output = json -region = us-east-2 -[default] -aws_access_key_id = AKIAQYLPMN5HFC4N7W75 -aws_secret_access_key = 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 -output = json -region = us-east-2 - sentry 27ac84f4bcdb4fca9701f4d6f6f58cd7d96b69c9d9754d40800645a51d668f90 - sentry 27ac84f4bcdb4fca9701f4d6f6f58cd7d96b69c9d9754d40800645a51d668f90 - sentry 27ac84f4bcdb4fca9701f4d6f6f58cd7d96b69c9d9754d40800645a51d668f90 - sentry 27ac84f4bcdb4fca9701f4d6f6f58cd7d96b69c9d9754d40800645a51d668f90 diff --git a/pkg/engine/testdata/verified_canary_secrets.txt b/pkg/engine/testdata/verified_canary_secrets.txt deleted file mode 100644 index 942d04ffd222..000000000000 --- a/pkg/engine/testdata/verified_canary_secrets.txt +++ /dev/null @@ -1,20 +0,0 @@ -[default] -aws_access_key_id = AKIAQYLPMN5HFC4N7W75 -aws_secret_access_key = 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 -output = json -region = us-east-2 -[default] -aws_access_key_id = AKIAQYLPMN5HFC4N7W75 -aws_secret_access_key = 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 -output = json -region = us-east-2 -[default] -aws_access_key_id = AKIAQYLPMN5HFC4N7W75 -aws_secret_access_key = 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 -output = json -region = us-east-2 -[default] -aws_access_key_id = AKIAQYLPMN5HFC4N7W75 -aws_secret_access_key = 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 -output = json -region = us-east-2 \ No newline at end of file From a89933a7d4df46ddd2aa9b6e7b3aeeb545e2ea52 Mon Sep 17 00:00:00 2001 From: Zachary Rice Date: Wed, 31 Jan 2024 16:14:47 -0600 Subject: [PATCH 40/49] rename all reverification elements --- main.go | 34 +++--- pkg/engine/engine.go | 110 +++++++++--------- pkg/engine/engine_test.go | 10 +- ...aml => verificationoverlap_detectors.yaml} | 0 ...ts.txt => verificationoverlap_secrets.txt} | 0 5 files changed, 77 insertions(+), 77 deletions(-) rename pkg/engine/testdata/{reverification_detectors.yaml => verificationoverlap_detectors.yaml} (100%) rename pkg/engine/testdata/{reverification_secrets.txt => verificationoverlap_secrets.txt} (100%) diff --git a/main.go b/main.go index a17f83fd15bd..26362f93e496 100644 --- a/main.go +++ b/main.go @@ -37,22 +37,22 @@ import ( ) var ( - cli = kingpin.New("TruffleHog", "TruffleHog is a tool for finding credentials.") - cmd string - debug = cli.Flag("debug", "Run in debug mode.").Bool() - trace = cli.Flag("trace", "Run in trace mode.").Bool() - profile = cli.Flag("profile", "Enables profiling and sets a pprof and fgprof server on :18066.").Bool() - localDev = cli.Flag("local-dev", "Hidden feature to disable overseer for local dev.").Hidden().Bool() - jsonOut = cli.Flag("json", "Output in JSON format.").Short('j').Bool() - jsonLegacy = cli.Flag("json-legacy", "Use the pre-v3.0 JSON format. Only works with git, gitlab, and github sources.").Bool() - gitHubActionsFormat = cli.Flag("github-actions", "Output in GitHub Actions format.").Bool() - concurrency = cli.Flag("concurrency", "Number of concurrent workers.").Default(strconv.Itoa(runtime.NumCPU())).Int() - noVerification = cli.Flag("no-verification", "Don't verify the results.").Bool() - onlyVerified = cli.Flag("only-verified", "Only output verified results.").Bool() - forceReverification = cli.Flag("allow-verification-overlap", "Allow verification of similar credentials across detectors").Bool() - filterUnverified = cli.Flag("filter-unverified", "Only output first unverified result per chunk per detector if there are more than one results.").Bool() - filterEntropy = cli.Flag("filter-entropy", "Filter unverified results with Shannon entropy. Start with 3.0.").Float64() - configFilename = cli.Flag("config", "Path to configuration file.").ExistingFile() + cli = kingpin.New("TruffleHog", "TruffleHog is a tool for finding credentials.") + cmd string + debug = cli.Flag("debug", "Run in debug mode.").Bool() + trace = cli.Flag("trace", "Run in trace mode.").Bool() + profile = cli.Flag("profile", "Enables profiling and sets a pprof and fgprof server on :18066.").Bool() + localDev = cli.Flag("local-dev", "Hidden feature to disable overseer for local dev.").Hidden().Bool() + jsonOut = cli.Flag("json", "Output in JSON format.").Short('j').Bool() + jsonLegacy = cli.Flag("json-legacy", "Use the pre-v3.0 JSON format. Only works with git, gitlab, and github sources.").Bool() + gitHubActionsFormat = cli.Flag("github-actions", "Output in GitHub Actions format.").Bool() + concurrency = cli.Flag("concurrency", "Number of concurrent workers.").Default(strconv.Itoa(runtime.NumCPU())).Int() + noVerification = cli.Flag("no-verification", "Don't verify the results.").Bool() + onlyVerified = cli.Flag("only-verified", "Only output verified results.").Bool() + allowVerificationOverlap = cli.Flag("allow-verification-overlap", "Allow verification of similar credentials across detectors").Bool() + filterUnverified = cli.Flag("filter-unverified", "Only output first unverified result per chunk per detector if there are more than one results.").Bool() + filterEntropy = cli.Flag("filter-entropy", "Filter unverified results with Shannon entropy. Start with 3.0.").Float64() + configFilename = cli.Flag("config", "Path to configuration file.").ExistingFile() // rules = cli.Flag("rules", "Path to file with custom rules.").String() printAvgDetectorTime = cli.Flag("print-avg-detector-time", "Print the average time spent on each detector.").Bool() noUpdate = cli.Flag("no-update", "Don't check for updates.").Bool() @@ -412,7 +412,7 @@ func run(state overseer.State) { engine.WithPrintAvgDetectorTime(*printAvgDetectorTime), engine.WithPrinter(printer), engine.WithFilterEntropy(*filterEntropy), - engine.WithForceReverification(*forceReverification), + engine.WithVerificationOverlap(*allowVerificationOverlap), ) if err != nil { logFatal(err, "error initializing engine") diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index c5b002505d0c..629f71240662 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -65,21 +65,21 @@ type Engine struct { // entropyFilter is used to filter out unverified results using Shannon entropy. filterEntropy *float64 onlyVerified bool - forceReverification bool + verificationOverlap bool printAvgDetectorTime bool // ahoCorasickHandler manages the Aho-Corasick trie and related keyword lookups. ahoCorasickCore *ahocorasick.AhoCorasickCore // Engine synchronization primitives. - sourceManager *sources.SourceManager - results chan detectors.ResultWithMetadata - detectableChunksChan chan detectableChunk - reverifiableChunksChan chan reVerifiableChunk - workersWg sync.WaitGroup - reverifiersWg sync.WaitGroup - wgDetectorWorkers sync.WaitGroup - WgNotifier sync.WaitGroup + sourceManager *sources.SourceManager + results chan detectors.ResultWithMetadata + detectableChunksChan chan detectableChunk + verificationOverlapChunksChan chan verificationOverlapChunk + workersWg sync.WaitGroup + verificationOverlapWg sync.WaitGroup + wgDetectorWorkers sync.WaitGroup + WgNotifier sync.WaitGroup // Runtime information. metrics runtimeMetrics @@ -99,17 +99,17 @@ type Engine struct { verify bool // Note: bad hack only used for testing - reverificationTracking *reverificationTracking + verificationOverlapTracker *verificationOverlapTracker } -type reverificationTracking struct { - reverificationDuplicateCount int - mu sync.Mutex +type verificationOverlapTracker struct { + verificationOverlapDuplicateCount int + mu sync.Mutex } -func (r *reverificationTracking) increment() { +func (r *verificationOverlapTracker) increment() { r.mu.Lock() - r.reverificationDuplicateCount++ + r.verificationOverlapDuplicateCount++ r.mu.Unlock() } @@ -201,18 +201,18 @@ func WithVerify(verify bool) Option { } } -func withReverificationTracking() Option { +func withVerificationOverlapTracking() Option { return func(e *Engine) { - e.reverificationTracking = &reverificationTracking{ - reverificationDuplicateCount: 0, + e.verificationOverlapTracker = &verificationOverlapTracker{ + verificationOverlapDuplicateCount: 0, } } } -// WithForceReverification TODO comment -func WithForceReverification(forceReverification bool) Option { +// WithVerificationOverlap +func WithVerificationOverlap(verificationOverlap bool) Option { return func(e *Engine) { - e.forceReverification = forceReverification + e.verificationOverlap = verificationOverlap } } @@ -337,14 +337,14 @@ func (e *Engine) initialize(ctx context.Context, options ...Option) error { const ( // detectableChunksChanMultiplier is set to accommodate a high number of concurrent worker goroutines. // This multiplier ensures that the detectableChunksChan channel has sufficient buffer capacity - // to hold messages from multiple worker groups (detector workers/ reverifier workers) without blocking. + // to hold messages from multiple worker groups (detector workers/ verificationOverlap workers) without blocking. // A large buffer helps accommodate for the fact workers are producing data at a faster rate // than it can be consumed. detectableChunksChanMultiplier = 50 - // reverifiableChunksChanMultiplier uses a smaller buffer compared to detectableChunksChanMultiplier. + // verificationOverlapChunksChanMultiplier uses a smaller buffer compared to detectableChunksChanMultiplier. // This reflects the anticipated lower volume of data that needs re-verification. // The buffer size is a trade-off between memory usage and the need to prevent blocking. - reverifiableChunksChanMultiplier = 25 + verificationOverlapChunksChanMultiplier = 25 ) // Channels are used for communication between different parts of the engine, @@ -352,7 +352,7 @@ func (e *Engine) initialize(ctx context.Context, options ...Option) error { // The buffer sizes for these channels are set to multiples of defaultChannelBuffer, // considering the expected concurrency and workload in the system. e.detectableChunksChan = make(chan detectableChunk, defaultChannelBuffer*detectableChunksChanMultiplier) - e.reverifiableChunksChan = make(chan reVerifiableChunk, defaultChannelBuffer*reverifiableChunksChanMultiplier) + e.verificationOverlapChunksChan = make(chan verificationOverlapChunk, defaultChannelBuffer*verificationOverlapChunksChanMultiplier) e.results = make(chan detectors.ResultWithMetadata, defaultChannelBuffer) e.dedupeCache = cache e.printer = new(output.PlainPrinter) @@ -442,17 +442,17 @@ func (e *Engine) startWorkers(ctx context.Context) { }() } - // Reverifier workers handle verification of chunks that have been detected by multiple detectors. + // verificationOverlap workers handle verification of chunks that have been detected by multiple detectors. // They ensure that verification is disabled for any secrets that have been detected by multiple detectors. - const reverifierWorkerMultiplier = detectorWorkerMultiplier - ctx.Logger().V(2).Info("starting reverifier workers", "count", e.concurrency) - for worker := uint64(0); worker < uint64(e.concurrency*reverifierWorkerMultiplier); worker++ { - e.reverifiersWg.Add(1) + const verificationOverlapWorkerMultiplier = detectorWorkerMultiplier + ctx.Logger().V(2).Info("starting verificationOverlap workers", "count", e.concurrency) + for worker := uint64(0); worker < uint64(e.concurrency*verificationOverlapWorkerMultiplier); worker++ { + e.verificationOverlapWg.Add(1) go func() { ctx := context.WithValue(ctx, "secret_worker_id", common.RandomID(5)) defer common.Recover(ctx) - defer e.reverifiersWg.Done() - e.reverifierWorker(ctx) + defer e.verificationOverlapWg.Done() + e.verificationOverlapWorker(ctx) }() } @@ -485,8 +485,8 @@ func (e *Engine) Finish(ctx context.Context) error { e.workersWg.Wait() // Wait for the workers to finish scanning chunks. - close(e.reverifiableChunksChan) - e.reverifiersWg.Wait() + close(e.verificationOverlapChunksChan) + e.verificationOverlapWg.Wait() close(e.detectableChunksChan) e.wgDetectorWorkers.Wait() // Wait for the detector workers to finish detecting chunks. @@ -526,19 +526,19 @@ type detectableChunk struct { wgDoneFn func() } -// reVerifiableChunk is a decoded chunk that has multiple detectors that match it. +// verificationOverlapChunk is a decoded chunk that has multiple detectors that match it. // It will be initially processed with verification disabled, and then reprocessed with verification // enabled if the same secret was not found by multiple detectors. -type reVerifiableChunk struct { - chunk sources.Chunk - decoder detectorspb.DecoderType - detectors []ahocorasick.DetectorInfo - reverifyWgDoneFn func() +type verificationOverlapChunk struct { + chunk sources.Chunk + decoder detectorspb.DecoderType + detectors []ahocorasick.DetectorInfo + verificationOverlapWgDoneFn func() } func (e *Engine) detectorWorker(ctx context.Context) { var wgDetect sync.WaitGroup - var wgReverify sync.WaitGroup + var wgVerificationOverlap sync.WaitGroup // Reuse the same map to avoid allocations. const avgDetectorsPerChunk = 8 @@ -554,13 +554,13 @@ func (e *Engine) detectorWorker(ctx context.Context) { } matchingDetectors := e.ahoCorasickCore.PopulateMatchingDetectors(string(decoded.Chunk.Data), chunkSpecificDetectors) - if len(chunkSpecificDetectors) > 1 && !e.forceReverification { - wgReverify.Add(1) - e.reverifiableChunksChan <- reVerifiableChunk{ - chunk: *decoded.Chunk, - detectors: matchingDetectors, - decoder: decoded.DecoderType, - reverifyWgDoneFn: wgReverify.Done, + if len(chunkSpecificDetectors) > 1 && !e.verificationOverlap { + wgVerificationOverlap.Add(1) + e.verificationOverlapChunksChan <- verificationOverlapChunk{ + chunk: *decoded.Chunk, + detectors: matchingDetectors, + decoder: decoded.DecoderType, + verificationOverlapWgDoneFn: wgVerificationOverlap.Done, } // Empty the map. for k := range chunkSpecificDetectors { @@ -585,7 +585,7 @@ func (e *Engine) detectorWorker(ctx context.Context) { atomic.AddUint64(&e.metrics.ChunksScanned, 1) } - wgReverify.Wait() + wgVerificationOverlap.Wait() wgDetect.Wait() ctx.Logger().V(4).Info("finished scanning chunks") } @@ -626,7 +626,7 @@ func likelyDuplicate(ctx context.Context, val chunkSecretKey, dupes map[chunkSec return false } -func (e *Engine) reverifierWorker(ctx context.Context) { +func (e *Engine) verificationOverlapWorker(ctx context.Context) { var wgDetect sync.WaitGroup // Reuse the same map and slice to avoid allocations. @@ -634,7 +634,7 @@ func (e *Engine) reverifierWorker(ctx context.Context) { detectorsWithResult := make(map[detectors.Detector]struct{}, avgSecretsPerDetector) chunkSecrets := make(map[chunkSecretKey]struct{}, avgSecretsPerDetector) - for chunk := range e.reverifiableChunksChan { + for chunk := range e.verificationOverlapChunksChan { for _, detector := range chunk.detectors { // DO NOT VERIFY at this stage of the pipeline. results, err := detector.FromData(ctx, false, chunk.chunk.Data) @@ -670,8 +670,8 @@ func (e *Engine) reverifierWorker(ctx context.Context) { if likelyDuplicate(ctx, key, chunkSecrets) { // This indicates that the same secret was found by multiple detectors. // We should NOT VERIFY this chunk's data. - if e.reverificationTracking != nil { - e.reverificationTracking.increment() + if e.verificationOverlapTracker != nil { + e.verificationOverlapTracker.increment() } e.processResult(ctx, detectableChunk{ chunk: chunk.chunk, @@ -706,11 +706,11 @@ func (e *Engine) reverifierWorker(ctx context.Context) { delete(detectorsWithResult, k) } - chunk.reverifyWgDoneFn() + chunk.verificationOverlapWgDoneFn() } wgDetect.Wait() - ctx.Logger().V(4).Info("finished reverifying chunks") + ctx.Logger().V(4).Info("finished verificationOverlap chunks") } func (e *Engine) detectChunks(ctx context.Context) { diff --git a/pkg/engine/engine_test.go b/pkg/engine/engine_test.go index 2898ed0c7c88..fb2f3b79f583 100644 --- a/pkg/engine/engine_test.go +++ b/pkg/engine/engine_test.go @@ -215,16 +215,16 @@ func TestEngine_DuplicatSecrets(t *testing.T) { assert.Equal(t, want, e.GetMetrics().UnverifiedSecretsFound) } -func TestReverifcationChunk(t *testing.T) { +func TestVerificationOverlapChunk(t *testing.T) { ctx := context.Background() - absPath, err := filepath.Abs("./testdata/reverification_secrets.txt") + absPath, err := filepath.Abs("./testdata/verificationoverlap_secrets.txt") assert.Nil(t, err) ctx, cancel := context.WithTimeout(ctx, 10*time.Second) defer cancel() - confPath, err := filepath.Abs("./testdata/reverification_detectors.yaml") + confPath, err := filepath.Abs("./testdata/verificationoverlap_detectors.yaml") assert.Nil(t, err) conf, err := config.Read(confPath) assert.Nil(t, err) @@ -235,7 +235,7 @@ func TestReverifcationChunk(t *testing.T) { WithDetectors(conf.Detectors...), WithVerify(false), WithPrinter(new(discardPrinter)), - withReverificationTracking(), + withVerificationOverlapTracking(), ) assert.Nil(t, err) @@ -251,7 +251,7 @@ func TestReverifcationChunk(t *testing.T) { assert.Equal(t, want, e.GetMetrics().UnverifiedSecretsFound) wantDupe := 1 - assert.Equal(t, wantDupe, e.reverificationTracking.reverificationDuplicateCount) + assert.Equal(t, wantDupe, e.verificationOverlapTracker.verificationOverlapDuplicateCount) } func TestFragmentFirstLineAndLink(t *testing.T) { diff --git a/pkg/engine/testdata/reverification_detectors.yaml b/pkg/engine/testdata/verificationoverlap_detectors.yaml similarity index 100% rename from pkg/engine/testdata/reverification_detectors.yaml rename to pkg/engine/testdata/verificationoverlap_detectors.yaml diff --git a/pkg/engine/testdata/reverification_secrets.txt b/pkg/engine/testdata/verificationoverlap_secrets.txt similarity index 100% rename from pkg/engine/testdata/reverification_secrets.txt rename to pkg/engine/testdata/verificationoverlap_secrets.txt From 2cc3f29d12763d52910e166e1ac2d50d907557d0 Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Thu, 1 Feb 2024 07:28:36 -0800 Subject: [PATCH 41/49] don't re-write map entry --- pkg/engine/engine.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 629f71240662..85baa1320f43 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -663,7 +663,6 @@ func (e *Engine) verificationOverlapWorker(ctx context.Context) { // - malicious detector "api key": qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r key := chunkSecretKey{secret: string(val), detectorInfo: detector} if _, ok := chunkSecrets[key]; ok { - chunkSecrets[key] = struct{}{} continue } From c7c66781e18cb8c63bb4dd34532ed1b1f0c99a1e Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Thu, 1 Feb 2024 07:45:20 -0800 Subject: [PATCH 42/49] use correct key --- pkg/engine/engine.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 85baa1320f43..28c3ce8bda57 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -631,7 +631,7 @@ func (e *Engine) verificationOverlapWorker(ctx context.Context) { // Reuse the same map and slice to avoid allocations. const avgSecretsPerDetector = 8 - detectorsWithResult := make(map[detectors.Detector]struct{}, avgSecretsPerDetector) + detectorsWithResult := make(map[ahocorasick.DetectorInfo]struct{}, avgSecretsPerDetector) chunkSecrets := make(map[chunkSecretKey]struct{}, avgSecretsPerDetector) for chunk := range e.verificationOverlapChunksChan { From 9d2c2b2f1638c4d20aad7686a6bd0d3903897564 Mon Sep 17 00:00:00 2001 From: Zachary Rice Date: Thu, 1 Feb 2024 13:22:19 -0600 Subject: [PATCH 43/49] rename worker, remove log val --- pkg/engine/engine.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 28c3ce8bda57..3257d53d8a7b 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -449,7 +449,7 @@ func (e *Engine) startWorkers(ctx context.Context) { for worker := uint64(0); worker < uint64(e.concurrency*verificationOverlapWorkerMultiplier); worker++ { e.verificationOverlapWg.Add(1) go func() { - ctx := context.WithValue(ctx, "secret_worker_id", common.RandomID(5)) + ctx := context.WithValue(ctx, "verification_overlap_worker_id", common.RandomID(5)) defer common.Recover(ctx) defer e.verificationOverlapWg.Done() e.verificationOverlapWorker(ctx) @@ -616,9 +616,6 @@ func likelyDuplicate(ctx context.Context, val chunkSecretKey, dupes map[chunkSec if similarity > similarityThreshold { ctx.Logger().V(2).Info( "found similar duplicate", - "val", val, - "dupe", dupe, - "similarity", similarity, ) return true } From bf31cabb13ac347e478132202ca59aeeb0edfaae Mon Sep 17 00:00:00 2001 From: Zachary Rice Date: Thu, 1 Feb 2024 15:07:16 -0600 Subject: [PATCH 44/49] test likelydupe, add eq detector check in loop --- pkg/engine/ahocorasick/ahocorasickcore.go | 6 +- pkg/engine/engine.go | 12 ++++ pkg/engine/engine_test.go | 73 +++++++++++++++++++++++ 3 files changed, 88 insertions(+), 3 deletions(-) diff --git a/pkg/engine/ahocorasick/ahocorasickcore.go b/pkg/engine/ahocorasick/ahocorasickcore.go index 5a4c7c1bbbd9..f54f723353ea 100644 --- a/pkg/engine/ahocorasick/ahocorasickcore.go +++ b/pkg/engine/ahocorasick/ahocorasickcore.go @@ -47,7 +47,7 @@ func NewAhoCorasickCore(allDetectors []detectors.Detector) *AhoCorasickCore { detectorsByKey := make(map[DetectorKey]detectors.Detector, len(allDetectors)) var keywords []string for _, d := range allDetectors { - key := createDetectorKey(d) + key := CreateDetectorKey(d) detectorsByKey[key] = d for _, kw := range d.Keywords() { kwLower := strings.ToLower(kw) @@ -100,9 +100,9 @@ func (ac *AhoCorasickCore) PopulateMatchingDetectors(chunkData string, dts map[D return uniqueDetectors } -// createDetectorKey creates a unique key for each detector from its type, version, and, for +// CreateDetectorKey creates a unique key for each detector from its type, version, and, for // custom regex detectors, its name. -func createDetectorKey(d detectors.Detector) DetectorKey { +func CreateDetectorKey(d detectors.Detector) DetectorKey { detectorType := d.Type() var version int if v, ok := d.(detectors.Versioner); ok { diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 3257d53d8a7b..7b2c26741308 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -610,6 +610,18 @@ func likelyDuplicate(ctx context.Context, val chunkSecretKey, dupes map[chunkSec continue } + // If the detectors are the same, we don't need to compare the secrets. + if val.detectorInfo == dupeKey.detectorInfo { + continue + } + + if valStr == dupe { + ctx.Logger().V(2).Info( + "found exact duplicate", + ) + return true + } + similarity := strutil.Similarity(valStr, dupe, metrics.NewLevenshtein()) // close enough diff --git a/pkg/engine/engine_test.go b/pkg/engine/engine_test.go index fb2f3b79f583..e922a4f47646 100644 --- a/pkg/engine/engine_test.go +++ b/pkg/engine/engine_test.go @@ -12,6 +12,7 @@ import ( "github.com/trufflesecurity/trufflehog/v3/pkg/context" "github.com/trufflesecurity/trufflehog/v3/pkg/decoders" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors" + "github.com/trufflesecurity/trufflehog/v3/pkg/engine/ahocorasick" "github.com/trufflesecurity/trufflehog/v3/pkg/pb/source_metadatapb" "github.com/trufflesecurity/trufflehog/v3/pkg/pb/sourcespb" "github.com/trufflesecurity/trufflehog/v3/pkg/sources" @@ -438,3 +439,75 @@ func TestSetLink(t *testing.T) { }) } } + +func TestLikelyDuplicate(t *testing.T) { + // Initialize detectors + // (not actually calling detector FromData or anything, just using detector struct for key creation) + detectorA := ahocorasick.DetectorInfo{Key: ahocorasick.CreateDetectorKey(DefaultDetectors()[0])} + detectorB := ahocorasick.DetectorInfo{Key: ahocorasick.CreateDetectorKey(DefaultDetectors()[1])} + + // Define test cases + tests := []struct { + name string + val chunkSecretKey + dupes map[chunkSecretKey]struct{} + expected bool + }{ + { + name: "exact duplicate different detector", + val: chunkSecretKey{"PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorA}, + dupes: map[chunkSecretKey]struct{}{ + {"PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorB}: {}, + }, + expected: true, + }, + { + name: "non-duplicate length outside range", + val: chunkSecretKey{"short", detectorA}, + dupes: map[chunkSecretKey]struct{}{ + {"muchlongerthanthevalstring", detectorB}: {}, + }, + expected: false, + }, + { + name: "similar within threshold", + val: chunkSecretKey{"PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorA}, + dupes: map[chunkSecretKey]struct{}{ + {"qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorB}: {}, + }, + expected: true, + }, + { + name: "similar outside threshold", + val: chunkSecretKey{"anotherkey", detectorA}, + dupes: map[chunkSecretKey]struct{}{ + {"completelydifferent", detectorB}: {}, + }, + expected: false, + }, + { + name: "empty strings", + val: chunkSecretKey{"", detectorA}, + dupes: map[chunkSecretKey]struct{}{{"", detectorB}: {}}, + expected: true, + }, + { + name: "similar within threshold same detector", + val: chunkSecretKey{"PMAK-qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorA}, + dupes: map[chunkSecretKey]struct{}{ + {"qnwfsLyRSyfCwfpHaQP1UzDhrgpWvHjbYzjpRCMshjt417zWcrzyHUArs7r", detectorA}: {}, + }, + expected: false, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + ctx := context.Background() + result := likelyDuplicate(ctx, tc.val, tc.dupes) + if result != tc.expected { + t.Errorf("expected %v, got %v", tc.expected, result) + } + }) + } +} From 1e5b773ac4efd81ed2893e0b763a2a4b2c3d9770 Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Thu, 1 Feb 2024 18:02:06 -0800 Subject: [PATCH 45/49] add test --- pkg/engine/engine_test.go | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/pkg/engine/engine_test.go b/pkg/engine/engine_test.go index e922a4f47646..b653f0cb73e4 100644 --- a/pkg/engine/engine_test.go +++ b/pkg/engine/engine_test.go @@ -2,12 +2,14 @@ package engine import ( "fmt" + "os" "path/filepath" "testing" "time" "github.com/stretchr/testify/assert" + "github.com/trufflesecurity/trufflehog/v3/pkg/common" "github.com/trufflesecurity/trufflehog/v3/pkg/config" "github.com/trufflesecurity/trufflehog/v3/pkg/context" "github.com/trufflesecurity/trufflehog/v3/pkg/decoders" @@ -216,6 +218,42 @@ func TestEngine_DuplicatSecrets(t *testing.T) { assert.Equal(t, want, e.GetMetrics().UnverifiedSecretsFound) } +func TestEngine_VersionedDetectorsVerifiedSecrets(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), time.Second*10) + defer cancel() + testSecrets, err := common.GetSecret(ctx, "trufflehog-testing", "detectors4") + assert.NoError(t, err) + secretV2 := testSecrets.MustGetField("GITLABV2") + secretV1 := testSecrets.MustGetField("GITLAB") + fmt.Printf("Secrets: %s %s\n", secretV2, secretV1) + + tmpFile, err := os.CreateTemp("", "testfile") + assert.Nil(t, err) + defer tmpFile.Close() + defer os.Remove(tmpFile.Name()) + + _, err = tmpFile.WriteString(fmt.Sprintf("You can find a gitlab secrets %s and another gitlab secret %s within", secretV2, secretV1)) + assert.Nil(t, err) + + e, err := Start(ctx, + WithConcurrency(1), + WithDecoders(decoders.DefaultDecoders()...), + WithDetectors(DefaultDetectors()...), + WithVerify(true), + WithPrinter(new(discardPrinter)), + ) + assert.Nil(t, err) + + cfg := sources.FilesystemConfig{Paths: []string{tmpFile.Name()}} + if err := e.ScanFileSystem(ctx, cfg); err != nil { + return + } + + assert.Nil(t, e.Finish(ctx)) + want := uint64(2) + assert.Equal(t, want, e.GetMetrics().VerifiedSecretsFound) +} + func TestVerificationOverlapChunk(t *testing.T) { ctx := context.Background() From 482c005de1bfbd0295254535469d8185f47b9783 Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Thu, 1 Feb 2024 18:03:41 -0800 Subject: [PATCH 46/49] add comment --- pkg/engine/engine_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/engine/engine_test.go b/pkg/engine/engine_test.go index b653f0cb73e4..446fd20ffb13 100644 --- a/pkg/engine/engine_test.go +++ b/pkg/engine/engine_test.go @@ -218,6 +218,8 @@ func TestEngine_DuplicatSecrets(t *testing.T) { assert.Equal(t, want, e.GetMetrics().UnverifiedSecretsFound) } +// TestEngine_VersionedDetectorsVerifiedSecrets is a test that detects ALL verified secrets across +// versioned detectors. func TestEngine_VersionedDetectorsVerifiedSecrets(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), time.Second*10) defer cancel() @@ -225,7 +227,6 @@ func TestEngine_VersionedDetectorsVerifiedSecrets(t *testing.T) { assert.NoError(t, err) secretV2 := testSecrets.MustGetField("GITLABV2") secretV1 := testSecrets.MustGetField("GITLAB") - fmt.Printf("Secrets: %s %s\n", secretV2, secretV1) tmpFile, err := os.CreateTemp("", "testfile") assert.Nil(t, err) From c82850ab462335010944e36a6f94230e8a5cdee4 Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Thu, 1 Feb 2024 18:58:57 -0800 Subject: [PATCH 47/49] add test --- pkg/engine/engine.go | 5 ++-- pkg/engine/engine_test.go | 61 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 7b2c26741308..8b50bf6727b8 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -610,8 +610,9 @@ func likelyDuplicate(ctx context.Context, val chunkSecretKey, dupes map[chunkSec continue } - // If the detectors are the same, we don't need to compare the secrets. - if val.detectorInfo == dupeKey.detectorInfo { + // If the detector type is the same, we don't need to compare the strings. + // These are not duplicates, and should be verified. + if val.detectorInfo.Type() == dupeKey.detectorInfo.Type() { continue } diff --git a/pkg/engine/engine_test.go b/pkg/engine/engine_test.go index 446fd20ffb13..a54291e8b13e 100644 --- a/pkg/engine/engine_test.go +++ b/pkg/engine/engine_test.go @@ -2,6 +2,8 @@ package engine import ( "fmt" + "net/http" + "net/http/httptest" "os" "path/filepath" "testing" @@ -12,9 +14,11 @@ import ( "github.com/trufflesecurity/trufflehog/v3/pkg/common" "github.com/trufflesecurity/trufflehog/v3/pkg/config" "github.com/trufflesecurity/trufflehog/v3/pkg/context" + "github.com/trufflesecurity/trufflehog/v3/pkg/custom_detectors" "github.com/trufflesecurity/trufflehog/v3/pkg/decoders" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors" "github.com/trufflesecurity/trufflehog/v3/pkg/engine/ahocorasick" + "github.com/trufflesecurity/trufflehog/v3/pkg/pb/custom_detectorspb" "github.com/trufflesecurity/trufflehog/v3/pkg/pb/source_metadatapb" "github.com/trufflesecurity/trufflehog/v3/pkg/pb/sourcespb" "github.com/trufflesecurity/trufflehog/v3/pkg/sources" @@ -255,6 +259,63 @@ func TestEngine_VersionedDetectorsVerifiedSecrets(t *testing.T) { assert.Equal(t, want, e.GetMetrics().VerifiedSecretsFound) } +// TestEngine_CustomDetectorsDetectorsVerifiedSecrets is a test that covers an edge case where there are +// multiple detectors with the same type, keywords and regex that match the same secret. +// This ensures that those secrets get verified. +func TestEngine_CustomDetectorsDetectorsVerifiedSecrets(t *testing.T) { + tmpFile, err := os.CreateTemp("", "testfile") + assert.Nil(t, err) + defer tmpFile.Close() + defer os.Remove(tmpFile.Name()) + + _, err = tmpFile.WriteString("test stuff") + assert.Nil(t, err) + + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer ts.Close() + + customDetector1, err := custom_detectors.NewWebhookCustomRegex(&custom_detectorspb.CustomRegex{ + Name: "custom detector 1", + Keywords: []string{"test"}, + Regex: map[string]string{"test": "\\w+"}, + Verify: []*custom_detectorspb.VerifierConfig{{Endpoint: ts.URL, Unsafe: true, SuccessRanges: []string{"200"}}}, + }) + assert.Nil(t, err) + + customDetector2, err := custom_detectors.NewWebhookCustomRegex(&custom_detectorspb.CustomRegex{ + Name: "custom detector 2", + Keywords: []string{"test"}, + Regex: map[string]string{"test": "\\w+"}, + Verify: []*custom_detectorspb.VerifierConfig{{Endpoint: ts.URL, Unsafe: true, SuccessRanges: []string{"200"}}}, + }) + assert.Nil(t, err) + + allDetectors := []detectors.Detector{customDetector1, customDetector2} + + ctx, cancel := context.WithTimeout(context.Background(), time.Second*5) + defer cancel() + e, err := Start(ctx, + WithConcurrency(1), + WithDecoders(decoders.DefaultDecoders()...), + WithDetectors(allDetectors...), + WithVerify(true), + WithPrinter(new(discardPrinter)), + ) + assert.Nil(t, err) + + cfg := sources.FilesystemConfig{Paths: []string{tmpFile.Name()}} + if err := e.ScanFileSystem(ctx, cfg); err != nil { + return + } + + assert.Nil(t, e.Finish(ctx)) + // We should have 4 verified secrets, 2 for each custom detector. + want := uint64(4) + assert.Equal(t, want, e.GetMetrics().VerifiedSecretsFound) +} + func TestVerificationOverlapChunk(t *testing.T) { ctx := context.Background() From 893a8b83d329f5a7a305035661105895d0210119 Mon Sep 17 00:00:00 2001 From: Dustin Decker Date: Fri, 2 Feb 2024 08:57:57 -0800 Subject: [PATCH 48/49] Set verification error --- pkg/engine/engine.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 8b50bf6727b8..de3001352233 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -2,6 +2,7 @@ package engine import ( "bytes" + "errors" "fmt" "runtime" "sync" @@ -28,6 +29,8 @@ import ( "github.com/trufflesecurity/trufflehog/v3/pkg/sources" ) +var overlapError = errors.New("More than one detector has found this result. For your safety, verification has been disabled. You can override this behavior by using the --allow-verification-overlap flag.") + // Metrics for the scan engine for external consumption. type Metrics struct { BytesScanned uint64 @@ -682,6 +685,7 @@ func (e *Engine) verificationOverlapWorker(ctx context.Context) { if e.verificationOverlapTracker != nil { e.verificationOverlapTracker.increment() } + res.SetVerificationError(overlapError) e.processResult(ctx, detectableChunk{ chunk: chunk.chunk, detector: detector, From 4bd6450bb66e6e8181771372fc83ed7cd323d243 Mon Sep 17 00:00:00 2001 From: Dustin Decker Date: Fri, 2 Feb 2024 09:18:44 -0800 Subject: [PATCH 49/49] Update tests --- pkg/engine/engine_test.go | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pkg/engine/engine_test.go b/pkg/engine/engine_test.go index a54291e8b13e..ea4b5807b4f5 100644 --- a/pkg/engine/engine_test.go +++ b/pkg/engine/engine_test.go @@ -351,7 +351,8 @@ func TestVerificationOverlapChunk(t *testing.T) { want := uint64(2) assert.Equal(t, want, e.GetMetrics().UnverifiedSecretsFound) - wantDupe := 1 + // We want 0 because these are custom detectors and verification should still occur. + wantDupe := 0 assert.Equal(t, wantDupe, e.verificationOverlapTracker.verificationOverlapDuplicateCount) } @@ -543,8 +544,14 @@ func TestSetLink(t *testing.T) { func TestLikelyDuplicate(t *testing.T) { // Initialize detectors // (not actually calling detector FromData or anything, just using detector struct for key creation) - detectorA := ahocorasick.DetectorInfo{Key: ahocorasick.CreateDetectorKey(DefaultDetectors()[0])} - detectorB := ahocorasick.DetectorInfo{Key: ahocorasick.CreateDetectorKey(DefaultDetectors()[1])} + detectorA := ahocorasick.DetectorInfo{ + Key: ahocorasick.CreateDetectorKey(DefaultDetectors()[0]), + Detector: DefaultDetectors()[0], + } + detectorB := ahocorasick.DetectorInfo{ + Key: ahocorasick.CreateDetectorKey(DefaultDetectors()[1]), + Detector: DefaultDetectors()[1], + } // Define test cases tests := []struct {