adding archive entry paths #3638

Open

wants to merge 13 commits into base: main
11 changes: 11 additions & 0 deletions pkg/engine/engine.go
@@ -1155,6 +1155,17 @@ func (e *Engine) processResult(
return
}

// Add in handler metadata. Existing extra data is not overwritten.
if res.ExtraData == nil && data.chunk.HandleMetadata != nil {
res.ExtraData = data.chunk.HandleMetadata
} else {
for k, v := range data.chunk.HandleMetadata {
if _, ok := res.ExtraData[k]; !ok {
res.ExtraData[k] = v
}
}
}

secret := detectors.CopyMetadata(&data.chunk, res)
secret.DecoderType = data.decoder
secret.DetectorDescription = data.detector.Detector.Description()
4 changes: 3 additions & 1 deletion pkg/handlers/apk.go
@@ -204,7 +204,9 @@ func (h *apkHandler) handleAPKFileContent(
ctx,
"filename", fileName,
)
return h.handleNonArchiveContent(ctx, mimeReader, apkChan)
// Note: the *zip.File.Name attribute (passed into this function as fileName)
// includes the full path within the apk.
return h.handleNonArchiveContent(ctx, fileName, mimeReader, apkChan)
}

// createZipReader creates a new ZIP reader from the input fileReader.
3 changes: 2 additions & 1 deletion pkg/handlers/ar.go
@@ -104,7 +104,8 @@ func (h *arHandler) processARFiles(ctx logContext.Context, reader *deb.Ar, dataO
continue
}

if err := h.handleNonArchiveContent(fileCtx, rdr, dataOrErrChan); err != nil {
// Note: emptyFilePath is used as the archiveEntryPath value b/c the `ar` format is a flat archive format (no nested dirs).
if err := h.handleNonArchiveContent(fileCtx, emptyFilePath, rdr, dataOrErrChan); err != nil {
dataOrErrChan <- DataOrErr{
Err: fmt.Errorf("%w: error handling archive content in AR: %v", ErrProcessingWarning, err),
}
36 changes: 17 additions & 19 deletions pkg/handlers/archive.go
@@ -4,6 +4,7 @@ import (
"context"
"errors"
"fmt"
"path/filepath"
"time"

"github.com/mholt/archiver/v4"
@@ -86,7 +87,7 @@ func (h *archiveHandler) HandleFile(ctx logContext.Context, input fileReader) ch
}()

start := time.Now()
err := h.openArchive(ctx, 0, input, dataOrErrChan)
err := h.openArchive(ctx, []string{}, input, dataOrErrChan)
if err == nil {
h.metrics.incFilesProcessed()
}
@@ -101,37 +102,37 @@ func (h *archiveHandler) HandleFile(ctx logContext.Context, input fileReader) ch
var ErrMaxDepthReached = errors.New("max archive depth reached")

// openArchive recursively extracts content from an archive up to a maximum depth, handling nested archives if necessary.
// It takes a reader from which it attempts to identify and process the archive format. Depending on the archive type,
// it either decompresses or extracts the contents directly, sending data to the provided channel.
// It takes a string representing the path to the archive and a reader from which it attempts to identify and process the archive format.
Collaborator

This comment reads "a string" but the actual function takes a string slice. Which is correct?

Contributor Author

I'll fix that. That comment was from a previous version.

// Depending on the archive type, it either decompresses or extracts the contents directly, sending data to the provided channel.
// Returns an error if the archive cannot be processed due to issues like exceeding maximum depth or unsupported formats.
func (h *archiveHandler) openArchive(
ctx logContext.Context,
depth int,
archiveEntryPaths []string,
reader fileReader,
dataOrErrChan chan DataOrErr,
) error {
ctx.Logger().V(4).Info("Starting archive processing", "depth", depth)
defer ctx.Logger().V(4).Info("Finished archive processing", "depth", depth)
ctx.Logger().V(4).Info("Starting archive processing", "depth", len(archiveEntryPaths))
Collaborator

imo just attach the paths here now that you have them

Contributor Author

Do you mean change depth to paths in the logs?
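
If so, a hypothetical version of that log call with the paths attached (a sketch only, not part of the diff):

ctx.Logger().V(4).Info("Starting archive processing", "depth", len(archiveEntryPaths), "paths", archiveEntryPaths)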

defer ctx.Logger().V(4).Info("Finished archive processing", "depth", len(archiveEntryPaths))

if common.IsDone(ctx) {
return ctx.Err()
}

if depth >= maxDepth {
if len(archiveEntryPaths) >= maxDepth {
h.metrics.incMaxArchiveDepthCount()
return ErrMaxDepthReached
}

if reader.format == nil {
if depth > 0 {
return h.handleNonArchiveContent(ctx, newMimeTypeReaderFromFileReader(reader), dataOrErrChan)
if len(archiveEntryPaths) > 0 {
return h.handleNonArchiveContent(ctx, filepath.Join(archiveEntryPaths...), newMimeTypeReaderFromFileReader(reader), dataOrErrChan)
}
return fmt.Errorf("unknown archive format")
}

switch archive := reader.format.(type) {
case archiver.Decompressor:
// Decompress tha archive and feed the decompressed data back into the archive handler to extract any nested archives.
// Decompress the archive and feed the decompressed data back into the archive handler to extract any nested archives.
compReader, err := archive.OpenReader(reader)
if err != nil {
return fmt.Errorf("error opening decompressor with format: %s %w", reader.format.Name(), err)
@@ -152,9 +153,11 @@ func (h *archiveHandler) openArchive(
}
defer rdr.Close()

return h.openArchive(ctx, depth+1, rdr, dataOrErrChan)
// Note: We're limited in our ability to add file names to the archiveEntryPath here, as the decompressor doesn't have access to a fileName value.
// We add an empty string so we can keep track of the archive depth.
return h.openArchive(ctx, append(archiveEntryPaths, ""), rdr, dataOrErrChan)
Collaborator

If we ever end up stringifying the slice of path parts, an empty string isn't going to be maximally visually obvious as another level of depth, compared to something like a ?. (Compare: some/path///file.txt to e.g. some/path/?/?/file.txt.) What do you think of using a non-empty marker like ? instead?

Contributor Author (@joeleonjr, Nov 25, 2024)

Commit 36bdef0 fixes an oversight from an earlier version and is relevant to this discussion.

The main change is using file.NameInArchive instead of file.Name(). The difference is that file.NameInArchive contains the relative path inside the archive being extracted. This does a few things:

  1. Previously, I mistakenly treated all files during an extraction operation as if they were in a flat directory and didn't preserve the actual directory structure. I didn't realize the file.Name() operation just grabbed the filename, not the path. That's fixed and we now have complete relative file paths in all supported archives.
  2. This change makes it abundantly clear how a user would navigate an archive to get to the actual file. For example: if an archive contains another archive named archive.tar.gz, which in turn holds a file named secret.txt, the archive entry path comes back as archive.tar.gz/archive/secret.txt. When you manually double-click to unarchive archive.tar.gz, it tosses all of the contents into a new folder named archive, where you'd see secret.txt. It's super clean and clear. This method still uses "" to track depth during decompression, but filepath.Join() ignores empty strings, which is exactly what we want when reconstructing the relative file path (see the sketch after this list). If we switched to ? or anything else, we'd need a custom function to strip those markers out before the filepath.Join(). Since we're able to keep accurate track of both depth and the relative file path, I'd suggest just leaving it as is.
  3. One other note: I only updated the error logs to include file.NameInArchive b/c I didn't want to bloat our non-error logs with the entire file path. This means the log at the very beginning of the extractorHandler function only has file.Name(). Not sure if that was the correct call or not.
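
A minimal standalone sketch (not part of this PR) of the filepath.Join() behavior the comment relies on: empty segments are dropped, while a non-empty marker like ? would survive the join.

package main

import (
	"fmt"
	"path/filepath"
)

func main() {
	// The "" entries used to track decompression depth vanish in the join.
	parts := []string{"archive.tar.gz", "", "archive", "secret.txt"}
	fmt.Println(filepath.Join(parts...)) // archive.tar.gz/archive/secret.txt

	// A ? marker would survive and need stripping afterwards.
	fmt.Println(filepath.Join("archive.tar.gz", "?", "secret.txt")) // archive.tar.gz/?/secret.txt
}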

case archiver.Extractor:
err := archive.Extract(logContext.WithValue(ctx, depthKey, depth+1), reader, nil, h.extractorHandler(dataOrErrChan))
err := archive.Extract(ctx, reader, nil, h.extractorHandler(archiveEntryPaths, dataOrErrChan))
if err != nil {
return fmt.Errorf("error extracting archive with format: %s: %w", reader.format.Name(), err)
}
@@ -168,7 +171,7 @@ func (h *archiveHandler) openArchive(
// It logs the extraction, checks for cancellation, and decides whether to skip the file based on its name or type,
// particularly for binary files if configured to skip. If the file is not skipped, it recursively calls openArchive
// to handle nested archives or to continue processing based on the file's content and depth in the archive structure.
func (h *archiveHandler) extractorHandler(dataOrErrChan chan DataOrErr) func(context.Context, archiver.File) error {
func (h *archiveHandler) extractorHandler(archiveEntryPaths []string, dataOrErrChan chan DataOrErr) func(context.Context, archiver.File) error {
return func(ctx context.Context, file archiver.File) error {
lCtx := logContext.WithValues(
logContext.AddLogger(ctx),
@@ -186,11 +189,6 @@ func (h *archiveHandler) extractorHandler(dataOrErrChan chan DataOrErr) func(con
return ctx.Err()
}

depth := 0
if ctxDepth, ok := ctx.Value(depthKey).(int); ok {
depth = ctxDepth
}

fileSize := file.Size()
if int(fileSize) > maxSize {
lCtx.Logger().V(2).Info("skipping file: size exceeds max allowed", "size", fileSize, "limit", maxSize)
@@ -240,6 +238,6 @@ func (h *archiveHandler) extractorHandler(dataOrErrChan chan DataOrErr) func(con
h.metrics.observeFileSize(fileSize)

lCtx.Logger().V(4).Info("Processed file successfully", "filename", file.Name(), "size", file.Size())
return h.openArchive(lCtx, depth, rdr, dataOrErrChan)
return h.openArchive(lCtx, append(archiveEntryPaths, file.Name()), rdr, dataOrErrChan)
}
}
14 changes: 13 additions & 1 deletion pkg/handlers/archive_test.go
@@ -19,60 +19,71 @@ func TestArchiveHandler(t *testing.T) {
expectedChunks int
matchString string
expectErr bool
matchFileName string
}{
"gzip-single": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/one-zip.gz",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"",
},
"gzip-nested": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/double-zip.gz",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
// This is b/c we can't get the file path from a nested archiver.OpenReader()
"",
},
"gzip-too-deep": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/six-zip.gz",
0,
"",
true,
"",
},
"tar-single": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/one.tar",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"aws-canary-creds",
},
"tar-nested": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/two.tar",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"one.tar/aws-canary-creds",
},
"tar-too-deep": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/six.tar",
0,
"",
true,
"",
},
"targz-single": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/tar-archive.tar.gz",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"aws-canary-creds",
},
"gzip-large": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/FifteenMB.gz",
1543,
"AKIAYVP4CIPPH5TNP3SW",
false,
"",
},
"zip-single": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/aws-canary-creds.zip",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"aws-canary-creds",
},
}

@@ -104,6 +115,7 @@ func TestArchiveHandler(t *testing.T) {
count++
if re.Match(chunk.Data) {
matched = true
assert.Equal(t, chunk.ArchiveEntryPath, testCase.matchFileName)
}
}

@@ -125,6 +137,6 @@ func TestOpenInvalidArchive(t *testing.T) {

dataOrErrChan := make(chan DataOrErr)

err = handler.openArchive(ctx, 0, rdr, dataOrErrChan)
err = handler.openArchive(ctx, []string{}, rdr, dataOrErrChan)
assert.Error(t, err)
}
6 changes: 5 additions & 1 deletion pkg/handlers/default.go
@@ -44,7 +44,7 @@ func (h *defaultHandler) HandleFile(ctx logContext.Context, input fileReader) ch
defer close(dataOrErrChan)

start := time.Now()
err := h.handleNonArchiveContent(ctx, newMimeTypeReaderFromFileReader(input), dataOrErrChan)
err := h.handleNonArchiveContent(ctx, emptyFilePath, newMimeTypeReaderFromFileReader(input), dataOrErrChan)
if err == nil {
h.metrics.incFilesProcessed()
}
@@ -93,6 +93,7 @@ func (h *defaultHandler) measureLatencyAndHandleErrors(
// file content, regardless of being an archive or not, is handled appropriately.
func (h *defaultHandler) handleNonArchiveContent(
ctx logContext.Context,
archiveEntryPath string,
reader mimeTypeReader,
dataOrErrChan chan DataOrErr,
) error {
@@ -109,6 +110,9 @@ func (h *defaultHandler) handleNonArchiveContent(
chunkReader := sources.NewChunkReader()
for data := range chunkReader(ctx, reader) {
dataOrErr := DataOrErr{}
if archiveEntryPath != "" {
dataOrErr.ArchiveEntryPath = archiveEntryPath
}
if err := data.Error(); err != nil {
h.metrics.incErrors()
dataOrErr.Err = fmt.Errorf("%w: error reading chunk: %v", ErrProcessingWarning, err)
12 changes: 10 additions & 2 deletions pkg/handlers/handlers.go
@@ -52,6 +52,9 @@ var (
// ErrProcessingWarning indicates a recoverable error that can be logged,
// allowing processing to continue.
ErrProcessingWarning = errors.New("error processing file")

// emptyFilePath is passed as the archiveEntryPath when a handler has no entry path to report.
emptyFilePath = ""
)

type readerConfig struct{ fileExtension string }
@@ -183,8 +186,9 @@ func newFileReader(r io.Reader, options ...readerOption) (fReader fileReader, er
// efficient streaming of file contents while also providing a way to propagate errors
// that may occur during the file handling process.
type DataOrErr struct {
Data []byte
Err error
Data []byte
Err error
ArchiveEntryPath string // optional, only for archived files
}

// FileHandler represents a handler for files.
@@ -402,6 +406,7 @@ func HandleFile(
// - If it contains an error, the function handles it based on severity:
// - Fatal errors (context cancellation, deadline exceeded, ErrProcessingFatal) cause immediate termination
// - Non-fatal errors (ErrProcessingWarning and others) are logged and processing continues
//
// The function also listens for context cancellation to gracefully terminate processing if the context is done.
// It returns nil upon successful processing of all data, or the first encountered fatal error.
func handleChunksWithError(
@@ -428,6 +433,9 @@ func handleChunksWithError(
if len(dataOrErr.Data) > 0 {
chunk := *chunkSkel
chunk.Data = dataOrErr.Data
if dataOrErr.ArchiveEntryPath != "" {
chunk.HandleMetadata = map[string]string{"Archive Entry Path": dataOrErr.ArchiveEntryPath}
}
if err := reporter.ChunkOk(ctx, chunk); err != nil {
return fmt.Errorf("error reporting chunk: %w", err)
}
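
Taken together with the engine.go change above, the entry path travels from DataOrErr.ArchiveEntryPath into chunk.HandleMetadata and finally into res.ExtraData. A condensed, self-contained sketch of that flow, using simplified stand-in types rather than the real trufflehog structs:

package main

import "fmt"

// Simplified stand-ins for the real Chunk and Result types.
type chunk struct {
	Data           []byte
	HandleMetadata map[string]string
}

type result struct{ ExtraData map[string]string }

// Mirrors handleChunksWithError: promote a non-empty entry path into chunk metadata.
func toChunk(data []byte, archiveEntryPath string) chunk {
	c := chunk{Data: data}
	if archiveEntryPath != "" {
		c.HandleMetadata = map[string]string{"Archive Entry Path": archiveEntryPath}
	}
	return c
}

// Mirrors processResult: merge handler metadata without overwriting existing keys.
func attachMetadata(res *result, c chunk) {
	if res.ExtraData == nil && c.HandleMetadata != nil {
		res.ExtraData = c.HandleMetadata
		return
	}
	for k, v := range c.HandleMetadata {
		if _, ok := res.ExtraData[k]; !ok {
			res.ExtraData[k] = v
		}
	}
}

func main() {
	c := toChunk([]byte("AKIA..."), "one.tar/aws-canary-creds")
	res := &result{}
	attachMetadata(res, c)
	fmt.Println(res.ExtraData["Archive Entry Path"]) // one.tar/aws-canary-creds
}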
4 changes: 3 additions & 1 deletion pkg/handlers/rpm.go
@@ -115,7 +115,9 @@ func (h *rpmHandler) processRPMFiles(
return fmt.Errorf("error creating mime-type reader: %w", err)
}

if err := h.handleNonArchiveContent(fileCtx, rdr, dataOrErrChan); err != nil {
// ToDo: Update processRPMFiles to accommodate nested archives. Once completed,
// adjust the emptyFilePath value to reflect the actual file path.
if err := h.handleNonArchiveContent(fileCtx, emptyFilePath, rdr, dataOrErrChan); err != nil {
dataOrErrChan <- DataOrErr{
Err: fmt.Errorf("%w: error processing RPM archive: %v", ErrProcessingWarning, err),
}
2 changes: 2 additions & 0 deletions pkg/sources/sources.go
@@ -40,6 +40,8 @@ type Chunk struct {
SourceMetadata *source_metadatapb.MetaData
// SourceType is the type of Source that produced the chunk.
SourceType sourcespb.SourceType
// HandleMetadata holds the metadata from a handler if one was used.
HandleMetadata map[string]string

// Verify specifies whether any secrets in the Chunk should be verified.
Verify bool