-
Notifications
You must be signed in to change notification settings - Fork 1.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Overhaul npm detector #2264
base: main
Are you sure you want to change the base?
Overhaul npm detector #2264
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
package npm | ||
|
||
import ( | ||
"context" | ||
"crypto/tls" | ||
"encoding/json" | ||
"errors" | ||
"fmt" | ||
"io" | ||
"net/http" | ||
"strings" | ||
|
||
"github.com/trufflesecurity/trufflehog/v3/pkg/common" | ||
) | ||
|
||
var defaultClient = common.SaneHttpClient() | ||
|
||
type npmScanner struct { | ||
client *http.Client | ||
} | ||
|
||
// verifyToken attempts to verify a |token| by finding the associated registry URL in |data|. | ||
// It returns three values: | ||
// 1. whether the token is valid | ||
// 2. data associated with the token | ||
// 3. any errors encountered during verification | ||
func (s npmScanner) verifyToken(ctx context.Context, data string, token string) (bool, map[string]string, error) { | ||
if s.client == nil { | ||
s.client = defaultClient | ||
} | ||
|
||
registry := findTokenRegistry(data, token) | ||
if registry != nil { | ||
// A high confidence match was found, attempt to verify the token against it. | ||
// e.g., |token|="s3cret" and |data| contains "//npm.company.com/:_authToken=s3cret". | ||
// TODO: Handle multiple high confidence matches | ||
return doVerification(ctx, s.client, registry, token) | ||
} else { | ||
// A high confidence match was not found. | ||
// Attempt to verify the token against any registries we can find. | ||
var ( | ||
registries = findAllRegistryURLs(data) | ||
errs = make([]error, 0, len(registries)) | ||
|
||
verified bool | ||
extraData map[string]string | ||
err error | ||
) | ||
for _, registry := range registries { | ||
verified, extraData, err = doVerification(ctx, s.client, registry, token) | ||
if verified { | ||
return true, extraData, err | ||
} | ||
if err != nil { | ||
errs = append(errs, err) | ||
} | ||
} | ||
return false, nil, errors.Join(errs...) | ||
} | ||
} | ||
|
||
// Most repositories implement a "whoami" endpoint | ||
// that returns the username of the authenticated user. | ||
type whoamiResponse struct { | ||
Username string `json:"username"` | ||
} | ||
|
||
// doVerification checks whether |token| is valid for the given |registry|. | ||
func doVerification(ctx context.Context, client *http.Client, registry *registryInfo, token string) (bool, map[string]string, error) { | ||
// Construct and send request. | ||
scheme := registry.Scheme.Prefix() | ||
if registry.Scheme == unknown { | ||
scheme = isHttps.Prefix() | ||
} | ||
reqUrl := fmt.Sprintf("%s%s/-/whoami", scheme, registry.Uri) | ||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, reqUrl, nil) | ||
if err != nil { | ||
return false, nil, fmt.Errorf("failed to construct request: %s", err) | ||
} | ||
|
||
req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", token)) | ||
res, err := client.Do(req) | ||
if err != nil { | ||
// A |tls.RecordHeaderError| likely means that the server is using HTTP, not HTTPS. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Am I paranoid to be concerned by an automatic fallback here? It seems like we should be trying to the scheme in the original URI and failing if https is expected but not present. Am I thinking about this right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In some circumstances it isn't possible to determine the scheme of the registry URI. For example,
If that information ( We could limit the automation fallback to cases where the scheme is |
||
// TODO: Is it possible to handle the reverse case? | ||
var tlsErr tls.RecordHeaderError | ||
if errors.As(err, &tlsErr) && registry.Scheme == isHttps { | ||
r := *registry | ||
r.Scheme = isHttp | ||
return doVerification(ctx, client, &r, token) | ||
} | ||
return false, nil, fmt.Errorf("request to %s failed: %w", reqUrl, err) | ||
} | ||
defer func() { | ||
_, _ = io.Copy(io.Discard, res.Body) | ||
_ = res.Body.Close() | ||
}() | ||
|
||
// Handle the response. | ||
if res.StatusCode == http.StatusOK { | ||
body, _ := io.ReadAll(res.Body) | ||
whoamiRes := whoamiResponse{} | ||
if err := json.Unmarshal(body, &whoamiRes); err != nil { | ||
if json.Valid(body) { | ||
return false, nil, fmt.Errorf("failed to decode response %s: %w", reqUrl, err) | ||
} else { | ||
// If the response isn't JSON it's highly unlikely to be valid. | ||
return false, nil, nil | ||
} | ||
} | ||
|
||
// It is possible for the response to be `{"username": null}`, `{"username":""}`, etc. | ||
// While a valid token _can_ return an empty username, the registry is likely returning 200 for invalid auth. | ||
// TODO: Write a test for this. | ||
if whoamiRes.Username == "" || | ||
(registry.RegistryType == nexusRepo3 && strings.HasPrefix(whoamiRes.Username, "anonymous")) || | ||
(registry.RegistryType == jetbrains && whoamiRes.Username == "internal") { | ||
req.Header.Del("Authorization") | ||
res2, err := client.Do(req) | ||
if err != nil { | ||
return false, nil, fmt.Errorf("request failed for %s: %w", reqUrl, err) | ||
} | ||
_, _ = io.Copy(io.Discard, res.Body) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should this be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Absolutely. 😅 |
||
_ = res2.Body.Close() | ||
|
||
if res2.StatusCode == http.StatusOK { | ||
return false, nil, nil | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tell me if I've got this right:
If I got that correctly, then it took me a bit to understand it. (And if I didn't, then I definitely didn't understand it!) I think I would have figured it out faster if this logic were factored out into a named function, but maybe there's another approach that would clarify it as well. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Your understanding is correct, but the verification logic is probably going to get even more convoluted, so it would make sense to extract this into its own function. Some registries do not implement the whoami endpoint and return a 404. Others, like e.g., I have encountered dozens of Azure and Google registry URLs but they all return 404. I don't know if that means the tokens are invalid, the registry doesn't implement the endpoint, or the registry no longer exists. |
||
} | ||
} | ||
|
||
data := map[string]string{ | ||
"registry_type": registry.RegistryType.String(), | ||
"registry_url": registry.Uri, | ||
rosecodym marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"username": whoamiRes.Username, | ||
"rotation_guide": "https://howtorotate.com/docs/tutorials/npm/", | ||
} | ||
return true, data, nil | ||
} else if res.StatusCode == http.StatusUnauthorized || | ||
(registry.RegistryType == github && res.StatusCode == http.StatusForbidden) { | ||
// Token is not valid. | ||
return false, nil, nil | ||
} else { | ||
// Here be dragons. | ||
return false, nil, fmt.Errorf("unexpected response status %d for %s", res.StatusCode, reqUrl) | ||
} | ||
} | ||
|
||
// firstNonEmptyMatch returns the index and value of the first non-empty match. | ||
// If no non-empty match is found, it will return: 0, "". | ||
func firstNonEmptyMatch(matches []string, skip int) (int, string) { | ||
if len(matches) < skip { | ||
return 0, "" | ||
} | ||
// The first index is the entire matched string. | ||
for i, val := range matches[skip:] { | ||
if val != "" { | ||
return i + skip, val | ||
} | ||
} | ||
return 0, "" | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
package npm | ||
|
||
import ( | ||
"context" | ||
"testing" | ||
|
||
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors" | ||
"github.com/trufflesecurity/trufflehog/v3/pkg/engine/ahocorasick" | ||
) | ||
|
||
type npmPatternTestCase struct { | ||
input string | ||
expected string | ||
} | ||
|
||
func testPattern(t *testing.T, d detectors.Detector, tests map[string]npmPatternTestCase) { | ||
ahoCorasickCore := ahocorasick.NewAhoCorasickCore([]detectors.Detector{d}) | ||
|
||
for name, test := range tests { | ||
t.Run(name, func(t *testing.T) { | ||
chunkSpecificDetectors := make(map[ahocorasick.DetectorKey]detectors.Detector, 2) | ||
ahoCorasickCore.PopulateMatchingDetectors(test.input, chunkSpecificDetectors) | ||
if len(chunkSpecificDetectors) == 0 { | ||
t.Errorf("keywords '%v' not matched by %s", d.Keywords(), test.input) | ||
return | ||
} | ||
|
||
results, err := d.FromData(context.Background(), false, []byte(test.input)) | ||
if err != nil { | ||
t.Errorf("error = %v", err) | ||
return | ||
} | ||
|
||
if len(results) == 0 { | ||
if test.expected != "" { | ||
t.Error("did not receive result") | ||
} | ||
return | ||
} | ||
|
||
actual := string(results[0].Raw) | ||
if test.expected != actual { | ||
t.Errorf("expected '%s' != actual '%s'", test.expected, actual) | ||
} | ||
}) | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
package npm | ||
|
||
import ( | ||
"context" | ||
"regexp" | ||
"strings" | ||
|
||
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors" | ||
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb" | ||
) | ||
|
||
type ScannerGeneric struct { | ||
npmScanner | ||
} | ||
|
||
// Ensure the Scanner satisfies the interfaces at compile time. | ||
var _ interface { | ||
detectors.Detector | ||
detectors.Versioner | ||
} = (*ScannerGeneric)(nil) | ||
|
||
func (s ScannerGeneric) Version() int { return 0 } | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Version 0 might not be the most idiomatic way to approach this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @mcastorina is there any reason not to use an explicit version 0 when also using versioned detectors? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. iirc the way we implemented filtering depends on the version being non-zero, so you wouldn't be able to exclude/ignore just the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've added a test here to codify that assumption. |
||
|
||
// genericKeyPat should match all possible values for .npmrc auth tokens. | ||
// TODO: Ensure this works with Yarn and UPM configurations. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could this PR degrade any existing functionality? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've done some further testing and having dedicated scanners for each pattern type ( i.e., is it confusing to have one detector type with numerical versions that are completely unrelated (npm version 1 & 2 are siblings, version 3 is yarn, version 4 is rennovate, version 5 is a hypothetical update to the yarn config, etc.) |
||
var genericKeyPat = regexp.MustCompile(`(?:_authToken|(?i:npm[_\-.]?token))['"]?[ \t]*[=:]?(?:[ \t]*['"]?)?([a-zA-Z0-9\-_.+=/]{5,})`) | ||
|
||
// Keywords are used for efficiently pre-filtering chunks. | ||
// Use identifiers in the secret preferably, or the provider name. | ||
func (s ScannerGeneric) Keywords() []string { | ||
return []string{"_authToken", "npm_token", "npm-token", "npm.token"} | ||
} | ||
|
||
// FromData will find and optionally verify secrets in a given set of bytes. | ||
func (s ScannerGeneric) FromData(ctx context.Context, verify bool, data []byte) (results []detectors.Result, err error) { | ||
dataStr := string(data) | ||
|
||
// Deduplicate results for more efficient handling. | ||
tokens := make(map[string]struct{}) | ||
for _, match := range genericKeyPat.FindAllStringSubmatch(dataStr, -1) { | ||
t := match[1] | ||
// Ignore results that can be handled by the v1 or v2 detectors. | ||
if strings.HasPrefix(t, "NpmToken.") || strings.HasPrefix(t, "npm_") { | ||
continue | ||
} | ||
tokens[t] = struct{}{} | ||
} | ||
if len(tokens) == 0 { | ||
return | ||
} | ||
|
||
// Iterate through results. | ||
for token := range tokens { | ||
s1 := detectors.Result{ | ||
DetectorType: s.Type(), | ||
Raw: []byte(token), | ||
} | ||
|
||
if verify { | ||
verified, extraData, vErr := s.verifyToken(ctx, dataStr, token) | ||
s1.Verified = verified | ||
s1.ExtraData = extraData | ||
s1.SetVerificationError(vErr) | ||
} | ||
|
||
// This function will check false positives for common test words, but also it will make sure the key appears 'random' enough to be a real key. | ||
if !s1.Verified && detectors.IsKnownFalsePositive(token, detectors.DefaultFalsePositives, true) { | ||
continue | ||
} | ||
|
||
results = append(results, s1) | ||
} | ||
return | ||
} | ||
|
||
func (s ScannerGeneric) Type() detectorspb.DetectorType { | ||
return detectorspb.DetectorType_NpmToken | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Tell me if I'm reading this right: Any verification errors returned from any registry will propagate to the final result, even if any (other) individual registry returns a determinate failure (i.e. no error.) This means that if any registry reports indeterminate verification, then either the token will be verified or indeterminately unverified.
If I'm right, this is an issue, because it means that tokens can plausibly be impossible to determinately revoke. (A similar problem exists with the JDBC detector.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sort of. The 'happy path' is that a token is associated with a single known registry(1) and we get either a determinate verification or failure. If the token is associated with a single registry but it isn't a 'known' pattern(2), it's possible to encounter an indeterminate failure caused by implementation-specific differences.
The worst-case scenario is that a token cannot be correlated with a single registry, and there are multiple potential registries1. In that case, it can be difficult to tell whether something determinately or indeterminately failed. Some implementations are extremely sensitive, with things as small as a trailing slash being the difference between a 200 API response and a generic 400 HTML error page.
So you can either report failures for all attempted URLs, or report none of them and just mark the token as unverified2. Realistically, this is a problem when any multi-part secret has multiple potential matches. For example, Azure is a combination of secret (distinct) + clientId (uuid) + tenantId (uuid); if it fails and you have multiple UUIDs is it better to report ALL possible combinations or none of them?
Footnotes
Honestly, the detector that's most likely to trigger this right now is the existing npm v1 one that just searches for
npm.+{uuid}
. It's prone to false positives — even GitHub Advanced Security flags random UUIDs as npm tokens. ↩prior versions had more logic to handle invalid registries (e.g., being redirected to https://landing.jfrog.com/reactivate-server/test or a HTML 404 page). That was a bit complicated to refactor so I ended up dropping it. ↩