Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[refactor] - base64 decoder #3762

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
updates
  • Loading branch information
ahrav committed Dec 7, 2024
commit 0948f1556015876199506c7d96f05d11af5636d9
103 changes: 77 additions & 26 deletions pkg/decoders/base64.go
Original file line number Diff line number Diff line change
@@ -4,49 +4,87 @@ import (
"bytes"
"encoding/base64"
"unicode"
"unicode/utf8"
"unsafe"

"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)

// Base64 is a decoder that identifies and decodes base64-encoded strings within text.
// It decodes both standard and URL-safe base64 strings.
type (
Base64 struct{}
)

var (
b64Charset = []byte("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/-_=")
// b64Charset contains all valid base64 characters including padding and URL-safe variants.
b64Charset = []byte("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/-_=")
// b64EndChars are characters that can appear at the end of base64 strings (padding and URL-safe chars)
b64EndChars = "+/-_="

b64CharsetMapping [128]bool
b64EndCharsMapping [128]bool
// Pre-computed lookup sets for efficient character membership testing.
b64CharsetSet asciiSet
b64EndCharsSet asciiSet
)

func init() {
for _, char := range b64Charset {
if char < 128 {
b64CharsetMapping[char] = true
// asciiSet is a 256-bit value (8 * 32 bits), but we only use the lower 128 bits for ASCII.
// Each bit represents whether a given ASCII character is in the set.
// The lower 16 bytes represent all ASCII chars (0-127).
// Non-ASCII chars will map outside the 128-bit range and will be effectively "not in the set."
// This provides very efficient O(1) character membership testing using bitwise operations.
type asciiSet [8]uint32

// makeASCIISet creates a set of ASCII characters and reports whether all
// characters in chars are ASCII. It uses bit manipulation to create an efficient
// lookup table where each bit represents presence/absence of a character.
func makeASCIISet(chars string) (as asciiSet, ok bool) {
for i := 0; i < len(chars); i++ {
c := chars[i]
if c >= utf8.RuneSelf { // non-ASCII char
return as, false
}
// For each character, set the corresponding bit in the correct uint32.
// c/32 determines which uint32 in the array.
// c%32 determines which bit within that uint32.
as[c/32] |= 1 << (c % 32)
}
for _, char := range b64EndChars {
if char < 128 {
b64EndCharsMapping[char] = true
}
return as, true
}

// contains reports whether c is inside the set by using bitwise operations
// to check if the corresponding bit is set in the lookup table.
func (as *asciiSet) contains(c byte) bool {
return (as[c/32] & (1 << (c % 32))) != 0
}

func init() {
var ok bool
if b64CharsetSet, ok = makeASCIISet(string(b64Charset)); !ok {
panic("b64Charset contains non-ASCII characters")
}
if b64EndCharsSet, ok = makeASCIISet(b64EndChars); !ok {
panic("b64EndChars contains non-ASCII characters")
}
}

// Type returns the decoder type for the Base64 decoder.
func (d *Base64) Type() detectorspb.DecoderType {
return detectorspb.DecoderType_BASE64
}

// FromChunk attempts to identify and decode base64-encoded substrings within the given chunk of data.
// It returns a new chunk with any found base64 strings decoded, or nil if no valid base64 was found.
func (d *Base64) FromChunk(chunk *sources.Chunk) *DecodableChunk {
decodableChunk := &DecodableChunk{Chunk: chunk, DecoderType: d.Type()}
candidates := getSubstringsOfCharacterSet(chunk.Data, 20, b64CharsetMapping, b64EndCharsMapping)
// Find potential base64 substrings that are at least 20 chars long
candidates := getSubstringsOfCharacterSet(chunk.Data, 20, b64CharsetSet, b64EndCharsSet)

if len(candidates) == 0 {
return nil
}

// Try to decode each candidate substring.
var decodedCandidates []decodedCandidate
for _, c := range candidates {
data := chunk.Data[c.start:c.end]
@@ -59,7 +97,7 @@ func (d *Base64) FromChunk(chunk *sources.Chunk) *DecodableChunk {
// both decodings would produce identical output (they only differ in
// how they encode '+/' vs '-_')
// 3. Therefore, if we successfully decode with our first attempt, we can
// skip trying the other encoding
// skip trying the other encoding.
var dec []byte
if bytes.Contains(data, []byte("=")) {
dec, _ = base64.StdEncoding.DecodeString(substring)
@@ -73,6 +111,7 @@ func (d *Base64) FromChunk(chunk *sources.Chunk) *DecodableChunk {
}
}

// Only keep successfully decoded strings that are ASCII
if len(dec) > 0 && isASCII(dec) {
decodedCandidates = append(decodedCandidates, decodedCandidate{
start: c.start,
@@ -86,7 +125,7 @@ func (d *Base64) FromChunk(chunk *sources.Chunk) *DecodableChunk {
return nil
}

// Rebuild the chunk data
// Rebuild the chunk data by replacing base64 strings with their decoded values
var result bytes.Buffer
result.Grow(len(chunk.Data))

@@ -107,6 +146,8 @@ func (d *Base64) FromChunk(chunk *sources.Chunk) *DecodableChunk {
return decodableChunk
}

// bytesToString converts a byte slice to a string without copying the underlying data.
// Since the underlying byte slice is not being modified, we can safely use unsafe.Pointer.
func bytesToString(b []byte) string { return *(*string)(unsafe.Pointer(&b)) }

func isASCII(b []byte) bool {
@@ -118,26 +159,31 @@ func isASCII(b []byte) bool {
return true
}

// candidate represents a potential base64-encoded substring's position in the original data.
type candidate struct {
start int
end int
}

// decodedCandidate represents a successfully decoded base64 substring and its position.
type decodedCandidate struct {
start int
end int
decoded []byte
}

func getSubstringsOfCharacterSet(data []byte, threshold int, charsetMapping [128]bool, endCharsMapping [128]bool) []candidate {
// getSubstringsOfCharacterSet finds substrings that consist primarily of base64 characters
// and are longer than the threshold.
func getSubstringsOfCharacterSet(data []byte, threshold int, charsetMapping asciiSet, endCharsMapping asciiSet) []candidate {
if len(data) == 0 {
return nil
}

// First pass: count potential base64 substrings to allocate correct slice size.
count := 0
substringsCount := 0
for _, char := range data {
if char < 128 && charsetMapping[char] {
if char < 128 && charsetMapping.contains(byte(char)) {
count++
} else {
if count > threshold {
@@ -154,12 +200,13 @@ func getSubstringsOfCharacterSet(data []byte, threshold int, charsetMapping [128
return nil
}

// Second pass: collect the actual substrings.
candidates := make([]candidate, 0, substringsCount)

count = 0
start := 0
for i, char := range data {
if char < 128 && charsetMapping[char] {
if char < 128 && charsetMapping.contains(byte(char)) {
if count == 0 {
start = i
}
@@ -171,46 +218,50 @@ func getSubstringsOfCharacterSet(data []byte, threshold int, charsetMapping [128
count = 0
}
}

// Handle trailing substring if needed.
if count > threshold {
candidates = appendB64Substring(data, start, count, candidates, endCharsMapping)
}
return candidates
}

func appendB64Substring(data []byte, start, count int, candidates []candidate, endCharsMapping [128]bool) []candidate {
sub := data[start : start+count] // Original slice before trimming
// appendB64Substring processes a potential base64 substring by trimming padding characters
// and handling special cases with '=' padding. It adds valid candidates to the slice.
func appendB64Substring(data []byte, start, count int, candidates []candidate, endCharsMapping asciiSet) []candidate {
sub := data[start : start+count] // Original slice before trimming.

// Manually trim left.
// Trim padding chars from the left.
left := 0
for left < len(sub) && sub[left] < 128 && endCharsMapping[sub[left]] {
for left < len(sub) && sub[left] < 128 && endCharsMapping.contains(sub[left]) {
left++
}
substring := sub[left:] // substring after left trim
substringLength := len(substring)

// Manually trim right on 'substring'.
// Trim padding chars from the right.
right := substringLength - 1
for right >= 0 && substring[right] < 128 && endCharsMapping[substring[right]] {
for right >= 0 && substring[right] < 128 && endCharsMapping.contains(substring[right]) {
right--
}

// If right < 0, everything got trimmed out.
// If everything was trimmed, skip this candidate.
if right < 0 {
// This matches the original behavior: if nothing remains after trimming, we don't add a candidate.
return candidates
}

trimmedRight := substring[:right+1]
idx := bytes.IndexByte(trimmedRight, '=')

// Handle special case where '=' is found mid-string.
if idx != -1 {
// Substring after '='.
// Add substring after the '=' character
candidates = append(candidates, candidate{
start: start + (count - substringLength) + idx + 1,
end: start + (count - substringLength) + substringLength,
})
} else {
// Add the entire trimmed substring.
candidates = append(candidates, candidate{
start: start + (count - substringLength),
end: start + (count - substringLength) + substringLength,