
adding archive entry paths #3638

Open · wants to merge 13 commits into main

initial pass at adding archive entry paths
joeleonjr committed Nov 20, 2024
commit 8c4c83b87894347d018e7a1b0489aff1468ef27c
11 changes: 11 additions & 0 deletions pkg/engine/engine.go
@@ -1155,6 +1155,17 @@ func (e *Engine) processResult(
return
}

// Add in handler metadata
if res.ExtraData == nil && data.chunk.HandleMetadata != nil {
res.ExtraData = data.chunk.HandleMetadata
} else {
for k, v := range data.chunk.HandleMetadata {
if _, ok := res.ExtraData[k]; !ok {
res.ExtraData[k] = v
}
}
}

secret := detectors.CopyMetadata(&data.chunk, res)
secret.DecoderType = data.decoder
secret.DetectorDescription = data.detector.Detector.Description()
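
For illustration, a minimal standalone sketch of the precedence rule this hunk introduces (simplified types; the map contents below are made up): keys already present in the detector result's ExtraData win, and handler metadata only fills in keys that are missing.

// mergeHandlerMetadata mirrors the logic added to processResult above:
// existing ExtraData keys are preserved; handler metadata only fills gaps.
func mergeHandlerMetadata(extraData, handleMetadata map[string]string) map[string]string {
	if extraData == nil && handleMetadata != nil {
		return handleMetadata
	}
	for k, v := range handleMetadata {
		if _, ok := extraData[k]; !ok {
			extraData[k] = v
		}
	}
	return extraData
}

// Example (hypothetical values):
//   extraData      = {"rotation_guide": "https://example.com/rotate"}
//   handleMetadata = {"Archive Entry Path": "/one.tar/aws-canary-creds"}
//   merged         = {"rotation_guide": "https://example.com/rotate", "Archive Entry Path": "/one.tar/aws-canary-creds"}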
4 changes: 3 additions & 1 deletion pkg/handlers/apk.go
@@ -204,7 +204,9 @@ func (h *apkHandler) handleAPKFileContent(
ctx,
"filename", fileName,
)
return h.handleNonArchiveContent(ctx, mimeReader, apkChan)
// Note: the *zip.File.Name attribute (passed into this function as fileName)
// includes the full path within the apk.
return h.handleNonArchiveContent(ctx, fileName, mimeReader, apkChan)
}

// createZipReader creates a new ZIP reader from the input fileReader.
3 changes: 2 additions & 1 deletion pkg/handlers/ar.go
@@ -104,7 +104,8 @@ func (h *arHandler) processARFiles(ctx logContext.Context, reader *deb.Ar, dataO
continue
}

if err := h.handleNonArchiveContent(fileCtx, rdr, dataOrErrChan); err != nil {
// Note: emptyFilePath is used as the archiveEntryPath value b/c the `ar` format is a flat archive format (no nested dirs).
if err := h.handleNonArchiveContent(fileCtx, emptyFilePath, rdr, dataOrErrChan); err != nil {
dataOrErrChan <- DataOrErr{
Err: fmt.Errorf("%w: error handling archive content in AR: %v", ErrProcessingWarning, err),
}
23 changes: 14 additions & 9 deletions pkg/handlers/archive.go
@@ -86,7 +86,7 @@ func (h *archiveHandler) HandleFile(ctx logContext.Context, input fileReader) ch
}()

start := time.Now()
err := h.openArchive(ctx, 0, input, dataOrErrChan)
err := h.openArchive(ctx, 0, emptyFilePath, input, dataOrErrChan)
if err == nil {
h.metrics.incFilesProcessed()
}
@@ -101,12 +101,13 @@ func (h *archiveHandler) HandleFile(ctx logContext.Context, input fileReader) ch
var ErrMaxDepthReached = errors.New("max archive depth reached")

// openArchive recursively extracts content from an archive up to a maximum depth, handling nested archives if necessary.
// It takes a reader from which it attempts to identify and process the archive format. Depending on the archive type,
// it either decompresses or extracts the contents directly, sending data to the provided channel.
// It takes a string representing the path to the archive and a reader from which it attempts to identify and process the archive format.
Collaborator:
This comment reads "a string" but the actual function takes a string slice. Which is correct?

Contributor Author:
I'll fix that. That comment was from a previous version.

// Depending on the archive type, it either decompresses or extracts the contents directly, sending data to the provided channel.
// Returns an error if the archive cannot be processed due to issues like exceeding maximum depth or unsupported formats.
func (h *archiveHandler) openArchive(
ctx logContext.Context,
depth int,
archiveEntryPath string,
reader fileReader,
dataOrErrChan chan DataOrErr,
) error {
@@ -124,14 +125,14 @@ func (h *archiveHandler) openArchive(

if reader.format == nil {
if depth > 0 {
return h.handleNonArchiveContent(ctx, newMimeTypeReaderFromFileReader(reader), dataOrErrChan)
return h.handleNonArchiveContent(ctx, archiveEntryPath, newMimeTypeReaderFromFileReader(reader), dataOrErrChan)
}
return fmt.Errorf("unknown archive format")
}

switch archive := reader.format.(type) {
case archiver.Decompressor:
// Decompress tha archive and feed the decompressed data back into the archive handler to extract any nested archives.
// Decompress the archive and feed the decompressed data back into the archive handler to extract any nested archives.
compReader, err := archive.OpenReader(reader)
if err != nil {
return fmt.Errorf("error opening decompressor with format: %s %w", reader.format.Name(), err)
@@ -152,9 +153,13 @@ func (h *archiveHandler) openArchive(
}
defer rdr.Close()

return h.openArchive(ctx, depth+1, rdr, dataOrErrChan)
// Note: We're limited in our ability to add file names to the archiveEntryPath here, as the decompressor doesn't have access to a fileName value. Instead, we're adding a generic string to indicate that the file is decompressed. This could be improved.
if depth > 0 {
archiveEntryPath = archiveEntryPath + "(decompressed " + reader.format.Name() + " file)"
}
return h.openArchive(ctx, depth+1, archiveEntryPath, rdr, dataOrErrChan)
case archiver.Extractor:
err := archive.Extract(logContext.WithValue(ctx, depthKey, depth+1), reader, nil, h.extractorHandler(dataOrErrChan))
err := archive.Extract(logContext.WithValue(ctx, depthKey, depth+1), reader, nil, h.extractorHandler(archiveEntryPath, dataOrErrChan))
if err != nil {
return fmt.Errorf("error extracting archive with format: %s: %w", reader.format.Name(), err)
}
@@ -168,7 +173,7 @@ func (h *archiveHandler) openArchive(
// It logs the extraction, checks for cancellation, and decides whether to skip the file based on its name or type,
// particularly for binary files if configured to skip. If the file is not skipped, it recursively calls openArchive
// to handle nested archives or to continue processing based on the file's content and depth in the archive structure.
func (h *archiveHandler) extractorHandler(dataOrErrChan chan DataOrErr) func(context.Context, archiver.File) error {
func (h *archiveHandler) extractorHandler(archiveEntryPath string, dataOrErrChan chan DataOrErr) func(context.Context, archiver.File) error {
return func(ctx context.Context, file archiver.File) error {
lCtx := logContext.WithValues(
logContext.AddLogger(ctx),
@@ -240,6 +245,6 @@ func (h *archiveHandler) extractorHandler(dataOrErrChan chan DataOrErr) func(con
h.metrics.observeFileSize(fileSize)

lCtx.Logger().V(4).Info("Processed file successfully", "filename", file.Name(), "size", file.Size())
return h.openArchive(lCtx, depth, rdr, dataOrErrChan)
return h.openArchive(lCtx, depth, (archiveEntryPath + "/" + file.Name()), rdr, dataOrErrChan)
}
}
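
As a rough sketch of how the entry path accumulates under this scheme (the names below come from the test cases that follow, not from new behavior): extracted entries append "/" plus the entry name, while decompression appends a generic marker because the decompressor has no file name to offer.

// entryPath sketches the two branches above.
func entryPath(parent, fileName, formatName string, decompressed bool, depth int) string {
	if decompressed {
		// Decompressors expose no file name, so only a generic marker can be added.
		if depth > 0 {
			return parent + "(decompressed " + formatName + " file)"
		}
		return parent
	}
	// Extractors know the entry name, so it is appended path-style.
	return parent + "/" + fileName
}

// two.tar -> one.tar -> aws-canary-creds yields "/one.tar/aws-canary-creds";
// a nested .gz entry yields "(decompressed .gz file)".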
14 changes: 13 additions & 1 deletion pkg/handlers/archive_test.go
@@ -19,60 +19,71 @@ func TestArchiveHandler(t *testing.T) {
expectedChunks int
matchString string
expectErr bool
matchFileName string
}{
"gzip-single": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/one-zip.gz",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"",
},
"gzip-nested": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/double-zip.gz",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
// This is b/c we can't get file path from nested archiver.OpenReader()
"(decompressed .gz file)",
},
"gzip-too-deep": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/six-zip.gz",
0,
"",
true,
"",
},
"tar-single": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/one.tar",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"/aws-canary-creds",
},
"tar-nested": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/two.tar",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"/one.tar/aws-canary-creds",
},
"tar-too-deep": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/six.tar",
0,
"",
true,
"",
},
"targz-single": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/tar-archive.tar.gz",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"/aws-canary-creds",
},
"gzip-large": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/FifteenMB.gz",
1543,
"AKIAYVP4CIPPH5TNP3SW",
false,
"",
},
"zip-single": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/aws-canary-creds.zip",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"/aws-canary-creds",
},
}

@@ -104,6 +115,7 @@ func TestArchiveHandler(t *testing.T) {
count++
if re.Match(chunk.Data) {
matched = true
assert.Equal(t, chunk.ArchiveEntryPath, testCase.matchFileName)
}
}

@@ -125,6 +137,6 @@ func TestOpenInvalidArchive(t *testing.T) {

dataOrErrChan := make(chan DataOrErr)

err = handler.openArchive(ctx, 0, rdr, dataOrErrChan)
err = handler.openArchive(ctx, 0, emptyFilePath, rdr, dataOrErrChan)
assert.Error(t, err)
}
6 changes: 5 additions & 1 deletion pkg/handlers/default.go
@@ -44,7 +44,7 @@ func (h *defaultHandler) HandleFile(ctx logContext.Context, input fileReader) ch
defer close(dataOrErrChan)

start := time.Now()
err := h.handleNonArchiveContent(ctx, newMimeTypeReaderFromFileReader(input), dataOrErrChan)
err := h.handleNonArchiveContent(ctx, emptyFilePath, newMimeTypeReaderFromFileReader(input), dataOrErrChan)
if err == nil {
h.metrics.incFilesProcessed()
}
@@ -93,6 +93,7 @@ func (h *defaultHandler) measureLatencyAndHandleErrors(
// file content, regardless of being an archive or not, is handled appropriately.
func (h *defaultHandler) handleNonArchiveContent(
ctx logContext.Context,
archiveEntryPath string,
reader mimeTypeReader,
dataOrErrChan chan DataOrErr,
) error {
@@ -109,6 +110,9 @@ func (h *defaultHandler) handleNonArchiveContent(
chunkReader := sources.NewChunkReader()
for data := range chunkReader(ctx, reader) {
dataOrErr := DataOrErr{}
if archiveEntryPath != "" {
dataOrErr.ArchiveEntryPath = archiveEntryPath
}
if err := data.Error(); err != nil {
h.metrics.incErrors()
dataOrErr.Err = fmt.Errorf("%w: error reading chunk: %v", ErrProcessingWarning, err)
12 changes: 10 additions & 2 deletions pkg/handlers/handlers.go
@@ -52,6 +52,9 @@ var (
// ErrProcessingWarning indicates a recoverable error that can be logged,
// allowing processing to continue.
ErrProcessingWarning = errors.New("error processing file")

// emptyFilePath is used to represent an empty file path.
emptyFilePath = ""
)

type readerConfig struct{ fileExtension string }
@@ -183,8 +186,9 @@ func newFileReader(r io.Reader, options ...readerOption) (fReader fileReader, er
// efficient streaming of file contents while also providing a way to propagate errors
// that may occur during the file handling process.
type DataOrErr struct {
Data []byte
Err error
Data []byte
Err error
ArchiveEntryPath string // optional; only set for archived files
}

// FileHandler represents a handler for files.
@@ -402,6 +406,7 @@ func HandleFile(
// - If it contains an error, the function handles it based on severity:
// - Fatal errors (context cancellation, deadline exceeded, ErrProcessingFatal) cause immediate termination
// - Non-fatal errors (ErrProcessingWarning and others) are logged and processing continues
//
// The function also listens for context cancellation to gracefully terminate processing if the context is done.
// It returns nil upon successful processing of all data, or the first encountered fatal error.
func handleChunksWithError(
@@ -428,6 +433,9 @@ func handleChunksWithError(
if len(dataOrErr.Data) > 0 {
chunk := *chunkSkel
chunk.Data = dataOrErr.Data
if dataOrErr.ArchiveEntryPath != "" {
chunk.HandleMetadata = map[string]string{"Archive Entry Path": dataOrErr.ArchiveEntryPath}
}
if err := reporter.ChunkOk(ctx, chunk); err != nil {
return fmt.Errorf("error reporting chunk: %w", err)
}
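
Tying the handler side to the engine change above, a hedged end-to-end sketch (types trimmed to the fields this PR touches): a non-empty ArchiveEntryPath on DataOrErr becomes a one-entry HandleMetadata map on the chunk, keyed "Archive Entry Path", while chunks from non-archived content carry no handler metadata.

// chunkMetadataFromDataOrErr sketches the mapping performed in handleChunksWithError.
func chunkMetadataFromDataOrErr(d DataOrErr) map[string]string {
	if d.ArchiveEntryPath == "" {
		// Top-level and flat-archive content carries no handler metadata.
		return nil
	}
	return map[string]string{"Archive Entry Path": d.ArchiveEntryPath}
}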
4 changes: 3 additions & 1 deletion pkg/handlers/rpm.go
@@ -115,7 +115,9 @@
return fmt.Errorf("error creating mime-type reader: %w", err)
}

if err := h.handleNonArchiveContent(fileCtx, rdr, dataOrErrChan); err != nil {
// TODO: Update processRPMFiles to accommodate nested archives. Once completed,
// adjust the emptyFilePath value to reflect the actual file path.
if err := h.handleNonArchiveContent(fileCtx, emptyFilePath, rdr, dataOrErrChan); err != nil {
dataOrErrChan <- DataOrErr{
Err: fmt.Errorf("%w: error processing RPM archive: %v", ErrProcessingWarning, err),
}
2 changes: 2 additions & 0 deletions pkg/sources/sources.go
@@ -40,6 +40,8 @@ type Chunk struct {
SourceMetadata *source_metadatapb.MetaData
// SourceType is the type of Source that produced the chunk.
SourceType sourcespb.SourceType
// HandleMetadata holds the metadata from a handler if one was used.
HandleMetadata map[string]string

// Verify specifies whether any secrets in the Chunk should be verified.
Verify bool
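
A hypothetical downstream consumer of the new field might read it back along these lines (the helper is illustrative and not part of this PR):

// describeChunkOrigin is a hypothetical helper showing how HandleMetadata
// could be surfaced when reporting a finding.
func describeChunkOrigin(c Chunk) string {
	if p, ok := c.HandleMetadata["Archive Entry Path"]; ok {
		return "found inside archive entry " + p
	}
	return "found in non-archived content"
}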