adding archive entry paths #3638

Open

wants to merge 13 commits into base: main
11 changes: 11 additions & 0 deletions pkg/engine/engine.go
@@ -1155,6 +1155,17 @@ func (e *Engine) processResult(
return
}

// Add in handler metadata. Existing extra data is not overwritten.
if res.ExtraData == nil && data.chunk.HandleMetadata != nil {
res.ExtraData = data.chunk.HandleMetadata
} else {
for k, v := range data.chunk.HandleMetadata {
if _, ok := res.ExtraData[k]; !ok {
res.ExtraData[k] = v
}
}
}

secret := detectors.CopyMetadata(&data.chunk, res)
secret.DecoderType = data.decoder
secret.DetectorDescription = data.detector.Detector.Description()
4 changes: 3 additions & 1 deletion pkg/handlers/apk.go
@@ -204,7 +204,9 @@ func (h *apkHandler) handleAPKFileContent(
ctx,
"filename", fileName,
)
return h.handleNonArchiveContent(ctx, mimeReader, apkChan)
// Note: the *zip.File.Name attribute (passed into this function as fileName)
// includes the full path within the apk.
return h.handleNonArchiveContent(ctx, fileName, mimeReader, apkChan)
}

// createZipReader creates a new ZIP reader from the input fileReader.
3 changes: 2 additions & 1 deletion pkg/handlers/ar.go
@@ -104,7 +104,8 @@ func (h *arHandler) processARFiles(ctx logContext.Context, reader *deb.Ar, dataO
continue
}

if err := h.handleNonArchiveContent(fileCtx, rdr, dataOrErrChan); err != nil {
// Note: emptyFilePath is used as the archiveEntryPath value b/c the `ar` format is a flat archive format (no nested dirs).
if err := h.handleNonArchiveContent(fileCtx, emptyFilePath, rdr, dataOrErrChan); err != nil {
dataOrErrChan <- DataOrErr{
Err: fmt.Errorf("%w: error handling archive content in AR: %v", ErrProcessingWarning, err),
}
36 changes: 17 additions & 19 deletions pkg/handlers/archive.go
@@ -4,6 +4,7 @@ import (
"context"
"errors"
"fmt"
"path/filepath"
"time"

"github.com/mholt/archiver/v4"
@@ -86,7 +87,7 @@ func (h *archiveHandler) HandleFile(ctx logContext.Context, input fileReader) ch
}()

start := time.Now()
err := h.openArchive(ctx, 0, input, dataOrErrChan)
err := h.openArchive(ctx, []string{}, input, dataOrErrChan)
if err == nil {
h.metrics.incFilesProcessed()
}
@@ -101,37 +102,37 @@ func (h *archiveHandler) HandleFile(ctx logContext.Context, input fileReader) ch
var ErrMaxDepthReached = errors.New("max archive depth reached")

// openArchive recursively extracts content from an archive up to a maximum depth, handling nested archives if necessary.
// It takes a reader from which it attempts to identify and process the archive format. Depending on the archive type,
// it either decompresses or extracts the contents directly, sending data to the provided channel.
// It takes a string representing the path to the archive and a reader from which it attempts to identify and process the archive format.
Collaborator

This comment reads "a string" but the actual function takes a string slice. Which is correct?

Contributor Author

I'll fix that. That comment was from a previous version.

// Depending on the archive type, it either decompresses or extracts the contents directly, sending data to the provided channel.
// Returns an error if the archive cannot be processed due to issues like exceeding maximum depth or unsupported formats.
func (h *archiveHandler) openArchive(
ctx logContext.Context,
depth int,
archiveEntryPaths []string,
reader fileReader,
dataOrErrChan chan DataOrErr,
) error {
ctx.Logger().V(4).Info("Starting archive processing", "depth", depth)
defer ctx.Logger().V(4).Info("Finished archive processing", "depth", depth)
ctx.Logger().V(4).Info("Starting archive processing", "depth", len(archiveEntryPaths))
Collaborator

imo just attach the paths here now that you have them

Contributor Author

Do you mean change depth to paths in the logs?
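
If so, a hypothetical version of that log call with the paths attached (a sketch only, not part of the diff):

ctx.Logger().V(4).Info("Starting archive processing", "depth", len(archiveEntryPaths), "paths", archiveEntryPaths)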

defer ctx.Logger().V(4).Info("Finished archive processing", "depth", len(archiveEntryPaths))

if common.IsDone(ctx) {
return ctx.Err()
}

if depth >= maxDepth {
if len(archiveEntryPaths) >= maxDepth {
h.metrics.incMaxArchiveDepthCount()
return ErrMaxDepthReached
}

if reader.format == nil {
if depth > 0 {
return h.handleNonArchiveContent(ctx, newMimeTypeReaderFromFileReader(reader), dataOrErrChan)
if len(archiveEntryPaths) > 0 {
return h.handleNonArchiveContent(ctx, filepath.Join(archiveEntryPaths...), newMimeTypeReaderFromFileReader(reader), dataOrErrChan)
}
return fmt.Errorf("unknown archive format")
}

switch archive := reader.format.(type) {
case archiver.Decompressor:
// Decompress tha archive and feed the decompressed data back into the archive handler to extract any nested archives.
// Decompress the archive and feed the decompressed data back into the archive handler to extract any nested archives.
compReader, err := archive.OpenReader(reader)
if err != nil {
return fmt.Errorf("error opening decompressor with format: %s %w", reader.format.Name(), err)
@@ -152,9 +153,11 @@ func (h *archiveHandler) openArchive(
}
defer rdr.Close()

return h.openArchive(ctx, depth+1, rdr, dataOrErrChan)
// Note: We're limited in our ability to add file names to the archiveEntryPath here, as the decompressor doesn't have access to a fileName value.
// We add an empty string so we can keep track of the archive depth.
return h.openArchive(ctx, append(archiveEntryPaths, ""), rdr, dataOrErrChan)
Collaborator

If we ever end up stringifying the slice of path parts, an empty string isn't going to be maximally visually obvious as another level of depth, compared to something like a ?. (Compare: some/path///file.txt to e.g. some/path/?/?/file.txt.) What do you think of using a non-empty marker like ? instead?

Contributor Author (@joeleonjr, Nov 25, 2024)

Commit 36bdef0 fixes an oversight from an earlier version and is relevant to this discussion.

The main change is using file.NameInArchive instead of file.Name(). The difference is that file.NameInArchive contains the relative path inside the archive being extracted. This does a few things:

  1. Previously, I mistakenly treated all files during an extraction operation as if they were in a flat directory and didn't preserve the actual directory structure. I didn't realize the file.Name() operation just grabbed the filename, not the path. That's fixed and we now have complete relative file paths in all supported archives.
  2. This change makes it abundantly clear how a user would navigate an archive to get to the actual file. For example: if an archive contains another archive named archive.tar.gz, which in turn holds a file named secret.txt, the archive entry path comes back as archive.tar.gz/archive/secret.txt. When you manually double-click to unarchive archive.tar.gz, it tosses all of the contents into a new folder named archive, where you'd see secret.txt. It's super clean and clear. This method still uses "" to track depth during decompression, but filepath.Join() ignores empty strings, which is exactly what we want when reconstructing the relative file path (see the sketch after this list). If we switched to ? or anything else, we'd need a custom function to strip those markers out before the filepath.Join(). Since we're able to keep accurate track of both depth and the relative file path, I'd suggest just leaving it as is.
  3. One other note: I only updated the error logs to include file.NameInArchive b/c I didn't want to bloat our non-error logs with the entire file path. This means the log at the very beginning of the extractorHandler function only has file.Name(). Not sure if that was the correct call or not.
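
A minimal standalone sketch (not part of this PR) of the filepath.Join() behavior the comment relies on: empty segments are dropped, while a non-empty marker like ? would survive the join.

package main

import (
	"fmt"
	"path/filepath"
)

func main() {
	// The "" entries used to track decompression depth vanish in the join.
	parts := []string{"archive.tar.gz", "", "archive", "secret.txt"}
	fmt.Println(filepath.Join(parts...)) // archive.tar.gz/archive/secret.txt

	// A ? marker would survive and need stripping afterwards.
	fmt.Println(filepath.Join("archive.tar.gz", "?", "secret.txt")) // archive.tar.gz/?/secret.txt
}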

case archiver.Extractor:
err := archive.Extract(logContext.WithValue(ctx, depthKey, depth+1), reader, nil, h.extractorHandler(dataOrErrChan))
err := archive.Extract(ctx, reader, nil, h.extractorHandler(archiveEntryPaths, dataOrErrChan))
if err != nil {
return fmt.Errorf("error extracting archive with format: %s: %w", reader.format.Name(), err)
}
@@ -168,7 +171,7 @@ func (h *archiveHandler) openArchive(
// It logs the extraction, checks for cancellation, and decides whether to skip the file based on its name or type,
// particularly for binary files if configured to skip. If the file is not skipped, it recursively calls openArchive
// to handle nested archives or to continue processing based on the file's content and depth in the archive structure.
func (h *archiveHandler) extractorHandler(dataOrErrChan chan DataOrErr) func(context.Context, archiver.File) error {
func (h *archiveHandler) extractorHandler(archiveEntryPaths []string, dataOrErrChan chan DataOrErr) func(context.Context, archiver.File) error {
return func(ctx context.Context, file archiver.File) error {
lCtx := logContext.WithValues(
logContext.AddLogger(ctx),
@@ -186,11 +189,6 @@ func (h *archiveHandler) extractorHandler(dataOrErrChan chan DataOrErr) func(con
return ctx.Err()
}

depth := 0
if ctxDepth, ok := ctx.Value(depthKey).(int); ok {
depth = ctxDepth
}

fileSize := file.Size()
if int(fileSize) > maxSize {
lCtx.Logger().V(2).Info("skipping file: size exceeds max allowed", "size", fileSize, "limit", maxSize)
@@ -240,6 +238,6 @@ func (h *archiveHandler) extractorHandler(dataOrErrChan chan DataOrErr) func(con
h.metrics.observeFileSize(fileSize)

lCtx.Logger().V(4).Info("Processed file successfully", "filename", file.Name(), "size", file.Size())
return h.openArchive(lCtx, depth, rdr, dataOrErrChan)
return h.openArchive(lCtx, append(archiveEntryPaths, file.Name()), rdr, dataOrErrChan)
}
}
14 changes: 13 additions & 1 deletion pkg/handlers/archive_test.go
@@ -19,60 +19,71 @@ func TestArchiveHandler(t *testing.T) {
expectedChunks int
matchString string
expectErr bool
matchFileName string
}{
"gzip-single": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/one-zip.gz",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"",
},
"gzip-nested": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/double-zip.gz",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
// This is b/c we can't get the file path from a nested archiver.OpenReader()
"",
},
"gzip-too-deep": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/six-zip.gz",
0,
"",
true,
"",
},
"tar-single": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/one.tar",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"aws-canary-creds",
},
"tar-nested": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/two.tar",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"one.tar/aws-canary-creds",
},
"tar-too-deep": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/six.tar",
0,
"",
true,
"",
},
"targz-single": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/tar-archive.tar.gz",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"aws-canary-creds",
},
"gzip-large": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/FifteenMB.gz",
1543,
"AKIAYVP4CIPPH5TNP3SW",
false,
"",
},
"zip-single": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/aws-canary-creds.zip",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"aws-canary-creds",
},
}

@@ -104,6 +115,7 @@ func TestArchiveHandler(t *testing.T) {
count++
if re.Match(chunk.Data) {
matched = true
assert.Equal(t, chunk.ArchiveEntryPath, testCase.matchFileName)
}
}

@@ -125,6 +137,6 @@ func TestOpenInvalidArchive(t *testing.T) {

dataOrErrChan := make(chan DataOrErr)

err = handler.openArchive(ctx, 0, rdr, dataOrErrChan)
err = handler.openArchive(ctx, []string{}, rdr, dataOrErrChan)
assert.Error(t, err)
}
6 changes: 5 additions & 1 deletion pkg/handlers/default.go
@@ -44,7 +44,7 @@ func (h *defaultHandler) HandleFile(ctx logContext.Context, input fileReader) ch
defer close(dataOrErrChan)

start := time.Now()
err := h.handleNonArchiveContent(ctx, newMimeTypeReaderFromFileReader(input), dataOrErrChan)
err := h.handleNonArchiveContent(ctx, emptyFilePath, newMimeTypeReaderFromFileReader(input), dataOrErrChan)
if err == nil {
h.metrics.incFilesProcessed()
}
@@ -93,6 +93,7 @@ func (h *defaultHandler) measureLatencyAndHandleErrors(
// file content, regardless of being an archive or not, is handled appropriately.
func (h *defaultHandler) handleNonArchiveContent(
ctx logContext.Context,
archiveEntryPath string,
reader mimeTypeReader,
dataOrErrChan chan DataOrErr,
) error {
@@ -109,6 +110,9 @@ func (h *defaultHandler) handleNonArchiveContent(
chunkReader := sources.NewChunkReader()
for data := range chunkReader(ctx, reader) {
dataOrErr := DataOrErr{}
if archiveEntryPath != "" {
dataOrErr.ArchiveEntryPath = archiveEntryPath
}
if err := data.Error(); err != nil {
h.metrics.incErrors()
dataOrErr.Err = fmt.Errorf("%w: error reading chunk: %v", ErrProcessingWarning, err)
12 changes: 10 additions & 2 deletions pkg/handlers/handlers.go
@@ -52,6 +52,9 @@ var (
// ErrProcessingWarning indicates a recoverable error that can be logged,
// allowing processing to continue.
ErrProcessingWarning = errors.New("error processing file")

// emptyFilePath is passed as the archiveEntryPath when a handler has no entry path to report.
emptyFilePath = ""
)

type readerConfig struct{ fileExtension string }
@@ -183,8 +186,9 @@ func newFileReader(r io.Reader, options ...readerOption) (fReader fileReader, er
// efficient streaming of file contents while also providing a way to propagate errors
// that may occur during the file handling process.
type DataOrErr struct {
Data []byte
Err error
Data []byte
Err error
ArchiveEntryPath string // optional, only for archived files
}

// FileHandler represents a handler for files.
@@ -402,6 +406,7 @@ func HandleFile(
// - If it contains an error, the function handles it based on severity:
// - Fatal errors (context cancellation, deadline exceeded, ErrProcessingFatal) cause immediate termination
// - Non-fatal errors (ErrProcessingWarning and others) are logged and processing continues
//
// The function also listens for context cancellation to gracefully terminate processing if the context is done.
// It returns nil upon successful processing of all data, or the first encountered fatal error.
func handleChunksWithError(
@@ -428,6 +433,9 @@ func handleChunksWithError(
if len(dataOrErr.Data) > 0 {
chunk := *chunkSkel
chunk.Data = dataOrErr.Data
if dataOrErr.ArchiveEntryPath != "" {
chunk.HandleMetadata = map[string]string{"Archive Entry Path": dataOrErr.ArchiveEntryPath}
}
if err := reporter.ChunkOk(ctx, chunk); err != nil {
return fmt.Errorf("error reporting chunk: %w", err)
}
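
Taken together with the engine.go change above, the entry path travels from DataOrErr.ArchiveEntryPath into chunk.HandleMetadata and finally into res.ExtraData. A condensed, self-contained sketch of that flow, using simplified stand-in types rather than the real trufflehog structs:

package main

import "fmt"

// Simplified stand-ins for the real Chunk and Result types.
type chunk struct {
	Data           []byte
	HandleMetadata map[string]string
}

type result struct{ ExtraData map[string]string }

// Mirrors handleChunksWithError: promote a non-empty entry path into chunk metadata.
func toChunk(data []byte, archiveEntryPath string) chunk {
	c := chunk{Data: data}
	if archiveEntryPath != "" {
		c.HandleMetadata = map[string]string{"Archive Entry Path": archiveEntryPath}
	}
	return c
}

// Mirrors processResult: merge handler metadata without overwriting existing keys.
func attachMetadata(res *result, c chunk) {
	if res.ExtraData == nil && c.HandleMetadata != nil {
		res.ExtraData = c.HandleMetadata
		return
	}
	for k, v := range c.HandleMetadata {
		if _, ok := res.ExtraData[k]; !ok {
			res.ExtraData[k] = v
		}
	}
}

func main() {
	c := toChunk([]byte("AKIA..."), "one.tar/aws-canary-creds")
	res := &result{}
	attachMetadata(res, c)
	fmt.Println(res.ExtraData["Archive Entry Path"]) // one.tar/aws-canary-creds
}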
4 changes: 3 additions & 1 deletion pkg/handlers/rpm.go
@@ -115,7 +115,9 @@ func (h *rpmHandler) processRPMFiles(
return fmt.Errorf("error creating mime-type reader: %w", err)
}

if err := h.handleNonArchiveContent(fileCtx, rdr, dataOrErrChan); err != nil {
// ToDo: Update processRPMFiles to accommodate nested archives. Once completed,
// adjust the emptyFilePath value to reflect the actual file path.
if err := h.handleNonArchiveContent(fileCtx, emptyFilePath, rdr, dataOrErrChan); err != nil {
dataOrErrChan <- DataOrErr{
Err: fmt.Errorf("%w: error processing RPM archive: %v", ErrProcessingWarning, err),
}
2 changes: 2 additions & 0 deletions pkg/sources/sources.go
@@ -40,6 +40,8 @@ type Chunk struct {
SourceMetadata *source_metadatapb.MetaData
// SourceType is the type of Source that produced the chunk.
SourceType sourcespb.SourceType
// HandleMetadata holds the metadata from a handler if one was used.
HandleMetadata map[string]string

// Verify specifies whether any secrets in the Chunk should be verified.
Verify bool