Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adding archive entry paths #3638

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
Prev Previous commit
Next Next commit
updated to combine archive paths + depth
  • Loading branch information
joeleonjr committed Nov 20, 2024
commit fc528d23a300acd060736c2281a9f391ecec5e6c
2 changes: 1 addition & 1 deletion pkg/engine/engine.go
Original file line number Diff line number Diff line change
@@ -1155,7 +1155,7 @@ func (e *Engine) processResult(
return
}

// Add in handler metadata
// Add in handler metadata. Existing extra data is not overwritten.
if res.ExtraData == nil && data.chunk.HandleMetadata != nil {
res.ExtraData = data.chunk.HandleMetadata
} else {
32 changes: 12 additions & 20 deletions pkg/handlers/archive.go
Original file line number Diff line number Diff line change
@@ -4,6 +4,7 @@ import (
"context"
"errors"
"fmt"
"path/filepath"
"time"

"github.com/mholt/archiver/v4"
@@ -86,7 +87,7 @@ func (h *archiveHandler) HandleFile(ctx logContext.Context, input fileReader) ch
}()

start := time.Now()
err := h.openArchive(ctx, 0, emptyFilePath, input, dataOrErrChan)
err := h.openArchive(ctx, []string{}, input, dataOrErrChan)
if err == nil {
h.metrics.incFilesProcessed()
}
@@ -106,26 +107,25 @@ var ErrMaxDepthReached = errors.New("max archive depth reached")
// Returns an error if the archive cannot be processed due to issues like exceeding maximum depth or unsupported formats.
func (h *archiveHandler) openArchive(
ctx logContext.Context,
depth int,
archiveEntryPath string,
archiveEntryPaths []string,
reader fileReader,
dataOrErrChan chan DataOrErr,
) error {
ctx.Logger().V(4).Info("Starting archive processing", "depth", depth)
defer ctx.Logger().V(4).Info("Finished archive processing", "depth", depth)
ctx.Logger().V(4).Info("Starting archive processing", "depth", len(archiveEntryPaths))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

imo just attach the paths here now that you have them

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean change depth to paths in the logs?

defer ctx.Logger().V(4).Info("Finished archive processing", "depth", len(archiveEntryPaths))

if common.IsDone(ctx) {
return ctx.Err()
}

if depth >= maxDepth {
if len(archiveEntryPaths) >= maxDepth {
h.metrics.incMaxArchiveDepthCount()
return ErrMaxDepthReached
}

if reader.format == nil {
if depth > 0 {
return h.handleNonArchiveContent(ctx, archiveEntryPath, newMimeTypeReaderFromFileReader(reader), dataOrErrChan)
if len(archiveEntryPaths) > 0 {
return h.handleNonArchiveContent(ctx, filepath.Join(archiveEntryPaths...), newMimeTypeReaderFromFileReader(reader), dataOrErrChan)
}
return fmt.Errorf("unknown archive format")
}
@@ -154,12 +154,9 @@ func (h *archiveHandler) openArchive(
defer rdr.Close()

// Note: We're limited in our ability to add file names to the archiveEntryPaths here, as the decompressor doesn't have access to a fileName value. Instead, we append a generic string indicating that the file was decompressed. This could be improved.
if depth > 0 {
archiveEntryPath = archiveEntryPath + "(decompressed " + reader.format.Name() + " file)"
}
return h.openArchive(ctx, depth+1, archiveEntryPath, rdr, dataOrErrChan)
return h.openArchive(ctx, append(archiveEntryPaths, "(decompressed "+reader.format.Name()+" file)"), rdr, dataOrErrChan)
case archiver.Extractor:
err := archive.Extract(logContext.WithValue(ctx, depthKey, depth+1), reader, nil, h.extractorHandler(archiveEntryPath, dataOrErrChan))
err := archive.Extract(ctx, reader, nil, h.extractorHandler(archiveEntryPaths, dataOrErrChan))
if err != nil {
return fmt.Errorf("error extracting archive with format: %s: %w", reader.format.Name(), err)
}
@@ -173,7 +170,7 @@ func (h *archiveHandler) openArchive(
// It logs the extraction, checks for cancellation, and decides whether to skip the file based on its name or type,
// particularly for binary files if configured to skip. If the file is not skipped, it recursively calls openArchive
// to handle nested archives or to continue processing based on the file's content and depth in the archive structure.
func (h *archiveHandler) extractorHandler(archiveEntryPath string, dataOrErrChan chan DataOrErr) func(context.Context, archiver.File) error {
func (h *archiveHandler) extractorHandler(archiveEntryPaths []string, dataOrErrChan chan DataOrErr) func(context.Context, archiver.File) error {
return func(ctx context.Context, file archiver.File) error {
lCtx := logContext.WithValues(
logContext.AddLogger(ctx),
@@ -191,11 +188,6 @@ func (h *archiveHandler) extractorHandler(archiveEntryPath string, dataOrErrChan
return ctx.Err()
}

depth := 0
if ctxDepth, ok := ctx.Value(depthKey).(int); ok {
depth = ctxDepth
}

fileSize := file.Size()
if int(fileSize) > maxSize {
lCtx.Logger().V(2).Info("skipping file: size exceeds max allowed", "size", fileSize, "limit", maxSize)
@@ -245,6 +237,6 @@ func (h *archiveHandler) extractorHandler(archiveEntryPath string, dataOrErrChan
h.metrics.observeFileSize(fileSize)

lCtx.Logger().V(4).Info("Processed file successfully", "filename", file.Name(), "size", file.Size())
return h.openArchive(lCtx, depth, (archiveEntryPath + "/" + file.Name()), rdr, dataOrErrChan)
return h.openArchive(lCtx, append(archiveEntryPaths, file.Name()), rdr, dataOrErrChan)
}
}
16 changes: 8 additions & 8 deletions pkg/handlers/archive_test.go
Original file line number Diff line number Diff line change
@@ -26,15 +26,15 @@ func TestArchiveHandler(t *testing.T) {
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"",
"(decompressed .gz file)",
},
"gzip-nested": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/double-zip.gz",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
// This is because we can't get the file path from a nested archiver.OpenReader()
"(decompressed .gz file)",
"(decompressed .gz file)/(decompressed .gz file)",
},
"gzip-too-deep": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/six-zip.gz",
@@ -48,14 +48,14 @@ func TestArchiveHandler(t *testing.T) {
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"/aws-canary-creds",
"aws-canary-creds",
},
"tar-nested": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/two.tar",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"/one.tar/aws-canary-creds",
"one.tar/aws-canary-creds",
},
"tar-too-deep": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/six.tar",
@@ -69,21 +69,21 @@ func TestArchiveHandler(t *testing.T) {
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"/aws-canary-creds",
"(decompressed .tar.gz file)/aws-canary-creds",
},
"gzip-large": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/FifteenMB.gz",
1543,
"AKIAYVP4CIPPH5TNP3SW",
false,
"",
"(decompressed .gz file)",
},
"zip-single": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/aws-canary-creds.zip",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"/aws-canary-creds",
"aws-canary-creds",
},
}

@@ -137,6 +137,6 @@ func TestOpenInvalidArchive(t *testing.T) {

dataOrErrChan := make(chan DataOrErr)

err = handler.openArchive(ctx, 0, emptyFilePath, rdr, dataOrErrChan)
err = handler.openArchive(ctx, []string{}, rdr, dataOrErrChan)
assert.Error(t, err)
}
2 changes: 1 addition & 1 deletion pkg/handlers/rpm.go
Original file line number Diff line number Diff line change
@@ -115,7 +115,7 @@ func (h *rpmHandler) processRPMFiles(
return fmt.Errorf("error creating mime-type reader: %w", err)
}

// ToDo: Update processRPMFiles to accomdate nested archives. Once completed,
// ToDo: Update processRPMFiles to accommodate nested archives. Once completed,
// adjust the emptyFilePath value to reflect the actual file path.
if err := h.handleNonArchiveContent(fileCtx, emptyFilePath, rdr, dataOrErrChan); err != nil {
dataOrErrChan <- DataOrErr{
Loading
Oops, something went wrong.