Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement CustomRegex detector #950

Merged
merged 9 commits into from
Dec 14, 2022
281 changes: 178 additions & 103 deletions pkg/custom_detectors/custom_detectors.go
Original file line number Diff line number Diff line change
@@ -1,151 +1,226 @@
package custom_detectors

import (
"fmt"
"bytes"
"context"
"encoding/json"
"net/http"
"regexp"
"strconv"
"strings"

"github.com/trufflesecurity/trufflehog/v3/pkg/common"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/custom_detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
)

// customRegex is a CustomRegex that is guaranteed to be valid.
type customRegex *custom_detectorspb.CustomRegex
// The maximum number of matches from one chunk. This const is used when
// permutating each regex match to protect the scanner from doing too much work
// for poorly defined regexps.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Very useful comment 🙌

const maxTotalMatches = 100

func ValidateKeywords(keywords []string) error {
if len(keywords) == 0 {
return fmt.Errorf("no keywords")
}

for _, keyword := range keywords {
if len(keyword) == 0 {
return fmt.Errorf("empty keyword")
}
}
return nil
// customRegexWebhook is a CustomRegex with webhook validation that is
// guaranteed to be valid (assuming the data is not changed after
// initialization).
type customRegexWebhook struct {
// Embedded so the protobuf getters called by this type's methods
// (GetName, GetRegex, GetKeywords, GetVerify) are promoted onto it.
*custom_detectorspb.CustomRegex
}

// ValidateRegex checks that at least one named regex is configured and that
// every pattern compiles with the regexp package.
//
// regex maps a name to its pattern source. An error is returned when the map
// is empty or when any pattern fails to compile. The compile error is wrapped
// (not discarded) so the caller can see why the pattern was rejected and can
// inspect it with errors.Is / errors.As. Map iteration order is unspecified,
// so when several patterns are invalid the one reported may vary between
// calls.
func ValidateRegex(regex map[string]string) error {
	if len(regex) == 0 {
		return fmt.Errorf("no regex")
	}

	for _, r := range regex {
		if _, err := regexp.Compile(r); err != nil {
			// Keep the original message prefix and append the cause.
			return fmt.Errorf("invalid regex %q: %w", r, err)
		}
	}

	return nil
}
// Ensure customRegexWebhook satisfies the detectors.Detector interface at compile time.
var _ detectors.Detector = (*customRegexWebhook)(nil)

func ValidateVerifyEndpoint(endpoint string, unsafe bool) error {
if len(endpoint) == 0 {
return fmt.Errorf("no endpoint")
// NewWebhookCustomRegex initializes and validates a customRegexWebhook. An
// unexported type is intentionally returned here to ensure the values have
// been validated.
func NewWebhookCustomRegex(pb *custom_detectorspb.CustomRegex) (*customRegexWebhook, error) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: is this used anywhere? I don't see any reference to it. Also it looks like we are returning an unexported type from an exported function, is that on purpose? Or could we maybe make them both exported?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not used anywhere yet. The intention is to use it in the engine once the functionality is implemented here.

I did intentionally return an unexported type from the exported function to control initialization. The idea being if a variable exists as that type, the values must have been validated.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll add some comments to make that clear for the function.

// TODO: Return all validation errors.
if err := ValidateKeywords(pb.Keywords); err != nil {
return nil, err
}

if strings.HasPrefix(endpoint, "http://") && !unsafe {
return fmt.Errorf("http endpoint must have unsafe=true")
if err := ValidateRegex(pb.Regex); err != nil {
return nil, err
}
return nil
}

func ValidateVerifyHeaders(headers []string) error {
for _, header := range headers {
if !strings.Contains(header, ":") {
return fmt.Errorf("header %q must contain a colon", header)
for _, verify := range pb.Verify {
if err := ValidateVerifyEndpoint(verify.Endpoint, verify.Unsafe); err != nil {
return nil, err
}
if err := ValidateVerifyHeaders(verify.Headers); err != nil {
return nil, err
}
}
return nil
}

func ValidateVerifyRanges(ranges []string) error {
const httpLowerBound = 100
const httpUpperBound = 599
// TODO: Copy only necessary data out of pb.
return &customRegexWebhook{pb}, nil
}

for _, successRange := range ranges {
if !strings.Contains(successRange, "-") {
httpCode, err := strconv.Atoi(successRange)
if err != nil {
return fmt.Errorf("unable to convert http code to int %q", successRange)
}
var httpClient = common.SaneHttpClient()

if httpCode < httpLowerBound || httpCode > httpUpperBound {
return fmt.Errorf("invalid http status code %q", successRange)
}
func (c *customRegexWebhook) FromData(ctx context.Context, verify bool, data []byte) (results []detectors.Result, err error) {
dataStr := string(data)
regexMatches := make(map[string][][]string, len(c.GetRegex()))

// Find all submatches for each regex.
for name, regex := range c.GetRegex() {
regex, err := regexp.Compile(regex)
if err != nil {
// TODO: Log error.
// This should never happen due to validation.
continue
}
regexMatches[name] = regex.FindAllStringSubmatch(dataStr, -1)
}

httpRange := strings.Split(successRange, "-")
if len(httpRange) != 2 {
return fmt.Errorf("invalid range format %q", successRange)
// Permutate each individual match.
// {
// "foo": [["match1"]]
// "bar": [["match2"], ["match3"]]
// }
// becomes
// [
// {"foo": ["match1"], "bar": ["match2"]},
// {"foo": ["match1"], "bar": ["match3"]},
// ]
matches := permutateMatches(regexMatches)

// Create result object and test for verification.
for _, match := range matches {
if common.IsDone(ctx) {
// TODO: Log we're possibly leaving out results.
return results, nil
}

lowerBound, err := strconv.Atoi(httpRange[0])
if err != nil {
return fmt.Errorf("unable to convert lower bound to int %q", successRange)
var raw string
for _, values := range match {
// values[0] contains the entire regex match.
raw += values[0]
}

upperBound, err := strconv.Atoi(httpRange[1])
if err != nil {
return fmt.Errorf("unable to convert upper bound to int %q", successRange)
result := detectors.Result{
DetectorType: detectorspb.DetectorType_CustomRegex,
Raw: []byte(raw),
}

if lowerBound > upperBound {
return fmt.Errorf("lower bound greater than upper bound on range %q", successRange)
if isKnownFalsePositive(match) {
continue
}

if lowerBound < httpLowerBound || upperBound > httpUpperBound {
return fmt.Errorf("invalid http status code range %q", successRange)
if !verify {
results = append(results, result)
continue
}
}
return nil
}

func ValidateRegexVars(regex map[string]string, body ...string) error {
for _, b := range body {
matches := NewRegexVarString(b).variables

for match := range matches {
if _, ok := regex[match]; !ok {
return fmt.Errorf("body %q contains an unknown variable", b)
// Verify via webhook.
jsonBody, err := json.Marshal(map[string]map[string][]string{
c.GetName(): match,
})
if err != nil {
continue
}
// Try each config until we successfully verify.
for _, verifyConfig := range c.GetVerify() {
if common.IsDone(ctx) {
// TODO: Log we're possibly leaving out results.
return results, nil
}
req, err := http.NewRequestWithContext(ctx, "POST", verifyConfig.GetEndpoint(), bytes.NewReader(jsonBody))
if err != nil {
continue
}
for _, header := range verifyConfig.GetHeaders() {
key, value, found := strings.Cut(header, ":")
if !found {
// Should be unreachable due to validation.
continue
}
req.Header.Add(key, strings.TrimLeft(value, "\t\n\v\f\r "))
}
res, err := httpClient.Do(req)
if err != nil {
continue
}
// TODO: Read response body.
res.Body.Close()
if res.StatusCode == http.StatusOK {
result.Verified = true
break
}
}
results = append(results, result)
}

return nil
return results, nil
}

func NewCustomRegex(pb *custom_detectorspb.CustomRegex) (customRegex, error) {
// TODO: Return all validation errors.
if err := ValidateKeywords(pb.Keywords); err != nil {
return nil, err
}
// Keywords returns the keywords configured on the embedded CustomRegex
// definition.
func (c *customRegexWebhook) Keywords() []string {
	return c.CustomRegex.GetKeywords()
}

if err := ValidateRegex(pb.Regex); err != nil {
return nil, err
// productIndices produces the Cartesian product of the index ranges
// [0, length) for each given length, with the first index varying fastest.
// Example: productIndices(3, 2) -> [[0 0] [1 0] [2 0] [0 1] [1 1] [2 1]].
// It returns a slice of length no larger than maxTotalMatches.
func productIndices(lengths ...int) [][]int {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion: If we do some precomputations up front we get a nice little speed up.

// Find the max length and total number of permutations.
	maxLength := 0
	t := 1
	for _, l := range lengths {
		t *= l
		if l > maxLength {
			maxLength = l
		}
	}
	result := make([][]int, 0, t)
	for r := range result {
		result[r] = make([]int, 0, maxLength)
	}

	for _, length := range lengths {
		var nextResult [][]int
		for i := 0; i < length; i++ {
			// Append index to all existing results.
			for _, curResult := range result {
				nextResult = append(nextResult, append(curResult, i))
				if len(nextResult) >= maxTotalMatches {
					return nextResult
				}
			}
		}
		result = nextResult
	}
	return result

Using:

func BenchmarkProductIndices(b *testing.B) {
	for i := 0; i < b.N; i++ {
		_ = productIndices(3, 2, 6)
	}
}

Screen Shot 2022-12-13 at 10 23 24 AM
Screen Shot 2022-12-13 at 10 23 34 AM

With just 2,3,2 as the input:
Screen Shot 2022-12-13 at 10 05 01 AM
Screen Shot 2022-12-13 at 10 27 28 AM

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice, that's way better!

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I ended up implementing a different algorithm that's a bit faster and still passes the tests:

» go test -bench . ./pkg/custom_detectors
BenchmarkProductIndices-8         512116              2325 ns/op

» go test -bench . ./pkg/custom_detectors
BenchmarkProductIndices-8        1355827               890.8 ns/op

I think the 126 ns/op you were getting was because it was always returning an empty result (which also explains the 1 alloc).

count := 1
for _, l := range lengths {
count *= l
}
if count == 0 {
return nil
}
if count > maxTotalMatches {
count = maxTotalMatches
}

for _, verify := range pb.Verify {

if err := ValidateVerifyEndpoint(verify.Endpoint, verify.Unsafe); err != nil {
return nil, err
results := make([][]int, count)
for i := 0; i < count; i++ {
j := 1
result := make([]int, 0, len(lengths))
for _, l := range lengths {
result = append(result, (i/j)%l)
j *= l
}
results[i] = result
}
return results
}

if err := ValidateVerifyHeaders(verify.Headers); err != nil {
return nil, err
}
// permutateMatches converts the list of all regex matches into all possible
// permutations selecting one from each named entry in the map. For example:
// {"foo": [matchA, matchB], "bar": [matchC]} becomes
// [{"foo": matchA, "bar": matchC}, {"foo": matchB, "bar": matchC}]
func permutateMatches(regexMatches map[string][][]string) []map[string][]string {
// Get a consistent order for names and their matching lengths.
// The lengths are used in calculating the permutation so order matters.
names := make([]string, 0, len(regexMatches))
lengths := make([]int, 0, len(regexMatches))
for key, value := range regexMatches {
names = append(names, key)
lengths = append(lengths, len(value))
}

if err := ValidateVerifyRanges(verify.SuccessRanges); err != nil {
return nil, err
// Permutate all the indices for each match. For example, if "foo" has
// [matchA, matchB] and "bar" has [matchC], we will get indices [0 0] [1 0].
permutationIndices := productIndices(lengths...)

// Build {"foo": matchA, "bar": matchC} and {"foo": matchB, "bar": matchC}
// from the indices.
var matches []map[string][]string
for _, permutation := range permutationIndices {
candidate := make(map[string][]string, len(permutationIndices))
for i, name := range names {
candidate[name] = regexMatches[name][permutation[i]]
}
matches = append(matches, candidate)
}

if err := ValidateRegexVars(pb.Regex, append(verify.Headers, verify.Endpoint)...); err != nil {
return nil, err
}
return matches
}

// isKnownFalsePositive checks every matched value for common test words and
// also makes sure the key appears 'random' enough to be a real key.
func isKnownFalsePositive(match map[string][]string) bool {
for _, values := range match {
for _, value := range values {
if detectors.IsKnownFalsePositive(value, detectors.DefaultFalsePositives, true) {
return true
}
}
}

return pb, nil
return false
}
Loading