Skip to content

Python: remove the imprecise container taint steps #17030

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 17 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,16 @@ predicate synthDictSplatParameterNodeReadStep(
)
}

private import semmle.python.Concepts

predicate decoderReadStep(Node nodeFrom, ContentSet c, Node nodeTo) {
exists(Decoding decoding |
nodeFrom = decoding.getAnInput() and
nodeTo = decoding.getOutput() and
c instanceof DictionaryElementContent
)
}

// =============================================================================
// PostUpdateNode
// =============================================================================
Expand Down Expand Up @@ -928,6 +938,8 @@ predicate readStep(Node nodeFrom, ContentSet c, Node nodeTo) {
synthDictSplatParameterNodeReadStep(nodeFrom, c, nodeTo)
or
VariableCapture::readStep(nodeFrom, c, nodeTo)
or
decoderReadStep(nodeFrom, c, nodeTo)
}

/** Data flows from a sequence to a subscript of the sequence. */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,16 @@ predicate defaultTaintSanitizer(DataFlow::Node node) { none() }
* of `c` at sinks and inputs to additional taint steps.
*/
bindingset[node]
predicate defaultImplicitTaintRead(DataFlow::Node node, DataFlow::ContentSet c) { none() }
predicate defaultImplicitTaintRead(DataFlow::Node node, DataFlow::ContentSet c) {
// We allow implicit reads of precise content
// imprecise content has already bubled up.
exists(node) and
(
c instanceof DataFlow::TupleElementContent
or
c instanceof DataFlow::DictionaryElementContent
)
}

private module Cached {
/**
Expand Down Expand Up @@ -146,7 +155,7 @@ predicate stringManipulation(DataFlow::CfgNode nodeFrom, DataFlow::CfgNode nodeT
(
fmt.getLeft() = nodeFrom.getNode()
or
fmt.getRight() = nodeFrom.getNode()
fmt.getRight().getAChild*() = nodeFrom.getNode()
)
)
or
Expand All @@ -164,9 +173,7 @@ predicate stringManipulation(DataFlow::CfgNode nodeFrom, DataFlow::CfgNode nodeT

/**
* Holds if taint can flow from `nodeFrom` to `nodeTo` with a step related to containers
* (lists/sets/dictionaries): literals, constructor invocation, methods. Note that this
* is currently very imprecise, as an example, since we model `dict.get`, we treat any
* `<tainted object>.get(<arg>)` will be tainted, whether it's true or not.
* where we do not track content precisely anyway (lists/sets/sequences): literals, constructor invocation, methods
*/
predicate containerStep(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
// construction by literal
Expand All @@ -178,10 +185,6 @@ predicate containerStep(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
or
DataFlowPrivate::setStoreStep(nodeFrom, _, nodeTo)
or
DataFlowPrivate::tupleStoreStep(nodeFrom, _, nodeTo)
or
DataFlowPrivate::dictStoreStep(nodeFrom, _, nodeTo)
or
// comprehension, so there is taint-flow from `x` in `[x for x in xs]` to the
// resulting list of the list-comprehension.
//
Expand Down
71 changes: 67 additions & 4 deletions python/ql/lib/semmle/python/frameworks/Stdlib.qll
Original file line number Diff line number Diff line change
Expand Up @@ -4260,6 +4260,15 @@
output = "ReturnValue.ListElement" and
preservesValue = true
or
// in case we create a list from a container with precise elementt,
// we need to taint the list.
exists(DataFlow::TupleElementContent tc, int i | i = tc.getIndex() |
input = "Argument[0].TupleElement[" + i.toString() + "]"
) and
// TODO: Once we have DictKeyContent, we need to transform that into taint
output = "ReturnValue" and
preservesValue = false
or
input = "Argument[0]" and
output = "ReturnValue" and
preservesValue = false
Expand Down Expand Up @@ -4708,8 +4717,13 @@
override predicate propagatesFlow(string input, string output, boolean preservesValue) {
exists(DataFlow::DictionaryElementContent dc, string key | key = dc.getKey() |
input = "Argument[self].DictionaryElement[" + key + "]" and
output = "ReturnValue.ListElement" and
preservesValue = true
(
output = "ReturnValue.ListElement" and
preservesValue = true
or
output = "ReturnValue" and
preservesValue = false
)
)
or
input = "Argument[self]" and
Expand Down Expand Up @@ -4759,8 +4773,13 @@
override predicate propagatesFlow(string input, string output, boolean preservesValue) {
exists(DataFlow::DictionaryElementContent dc, string key | key = dc.getKey() |
input = "Argument[self].DictionaryElement[" + key + "]" and
output = "ReturnValue.ListElement.TupleElement[1]" and
preservesValue = true
(
output = "ReturnValue.ListElement.TupleElement[1]" and
preservesValue = true
or
output = "ReturnValue" and
preservesValue = false
)
)
or
// TODO: Add the keys to output list
Expand Down Expand Up @@ -4826,6 +4845,50 @@
}
}

/**
* Flow summaries for string manipulation methods.
*/
Comment on lines +4848 to +4850

Check warning

Code scanning / CodeQL

Class QLDoc style. Warning

The QLDoc for a class should start with 'A', 'An', or 'The'.
class StringManipulation extends SummarizedCallable {
string method_name;

StringManipulation() {
this = "string." + method_name and
method_name in [
"capitalize", "casefold", "center", "expandtabs", "format", "format_map", "join", "ljust",
"lstrip", "lower", "replace", "rjust", "rstrip", "strip", "swapcase", "title", "upper",
"zfill", "encode", "decode"
]
}

override DataFlow::CallCfgNode getACall() {
result.(DataFlow::MethodCallNode).calls(_, method_name)
}

override DataFlow::ArgumentNode getACallback() {
result.(DataFlow::AttrRead).getAttributeName() = method_name
}

override predicate propagatesFlow(string input, string output, boolean preservesValue) {
input = "Argument[self]" and
output = "ReturnValue" and
preservesValue = false
or
method_name = "join" and
exists(DataFlow::TupleElementContent tc, int i | i = tc.getIndex() |
input = "Argument[0].TupleElement[" + i + "]"
) and
output = "ReturnValue" and
preservesValue = false
or
method_name = "format_map" and
exists(DataFlow::DictionaryElementContent dc, string key | key = dc.getKey() |
input = "Argument[0].DictionaryElement[" + key + "]"
) and
output = "ReturnValue" and
preservesValue = false
}
}

/**
* A flow summary for `os.getenv` / `os.getenvb`
*
Expand Down
96 changes: 89 additions & 7 deletions python/ql/test/experimental/meta/InlineTaintTest.qll
Original file line number Diff line number Diff line change
@@ -1,20 +1,26 @@
/**
* Defines a InlineExpectationsTest for checking whether any arguments in
* `ensure_tainted` and `ensure_not_tainted` calls are tainted.
* These allow us to monitor the progressive flow of taint through the program.
* To test whether taint would be observable at a sink, we can use `ensure_tainted_with_reads` and `ensure_not_tainted_with_reads`.
* Note that this function should only be called at the end of test functions, as it will
* allow read steps that could otherwise be observed via use/use-flow at later sinks.
*
* Also defines query predicates to ensure that:
* - if any arguments to `ensure_not_tainted` are tainted, their annotation is marked with `SPURIOUS`.
* - if any arguments to `ensure_tainted` are not tainted, their annotation is marked with `MISSING`.
* - if any arguments to `ensure_not_tainted`(`_with_reads`) are tainted, their annotation is marked with `SPURIOUS`.
* - if any arguments to `ensure_tainted`(`_with_reads`) are not tainted, their annotation is marked with `MISSING`.
* - there is no possible spurious read step from a reading sink to a later sink.
*
* The functionality of this module is tested in `ql/test/experimental/meta/inline-taint-test-demo`.
*/

import python
import semmle.python.dataflow.new.DataFlow
import semmle.python.dataflow.new.TaintTracking
import ProgressiveTaintTrackingTest
import semmle.python.dataflow.new.RemoteFlowSources
import TestUtilities.InlineExpectationsTest
private import semmle.python.dataflow.new.internal.PrintNode
private import semmle.python.dataflow.new.internal.TaintTrackingPrivate as RealTaintTracking

DataFlow::Node shouldBeTainted() {
exists(DataFlow::CallCfgNode call |
Expand All @@ -30,6 +36,26 @@ DataFlow::Node shouldNotBeTainted() {
)
}

DataFlow::Node shouldBeTaintedWithReads() {
exists(DataFlow::CallCfgNode call |
call.getFunction().asCfgNode().(NameNode).getId() = "ensure_tainted_with_reads" and
result in [call.getArg(_), call.getArgByName(_)]
)
}

DataFlow::Node shouldNotBeTaintedWithReads() {
exists(DataFlow::CallCfgNode call |
call.getFunction().asCfgNode().(NameNode).getId() = "ensure_not_tainted_with_reads" and
result in [call.getArg(_), call.getArgByName(_)]
)
}

DataFlow::Node nonReadingSink() { result in [shouldBeTainted(), shouldNotBeTainted()] }

DataFlow::Node readingSink() {
result in [shouldBeTaintedWithReads(), shouldNotBeTaintedWithReads()]
}

// this module allows the configuration to be imported in other `.ql` files without the
// top level query predicates of this file coming into scope.
module Conf {
Expand All @@ -48,10 +74,29 @@ module Conf {
source instanceof RemoteFlowSource
}

predicate isSink(DataFlow::Node sink) {
sink = shouldBeTainted()
or
sink = shouldNotBeTainted()
predicate isSink(DataFlow::Node sink) { sink in [nonReadingSink(), readingSink()] }

// This should emulate the implicit reads that happen during
// normal non-testing taint tracking; except that for sinks,
// we only want implicit reads for the ones we explicitly marked
// (with `ensure_tainted_with_reads` or `ensure_not_tainted_with_reads`).
//
// See `shared/dataflow/codeql/dataflow/TaintTracking.qll` and
// `AddTaintDefaults::allowImplicitRead`
predicate allowImplicitRead(DataFlow::Node node, DataFlow::ContentSet c) {
(
node = readingSink()
// or
// TODO: This is what we need to add to the real configurations
// any(TaintTracking::AdditionalTaintStep a).hasStep(node, _, _)
) and
// For our testing taint tracking, `defaultImplicitTaintRead` is `none`
RealTaintTracking::defaultImplicitTaintRead(node, c)
}

predicate test(DataFlow::Node node) {
allowImplicitRead(node, _) and
exists(node.getLocation().getFile().getRelativePath())
}
}
}
Expand Down Expand Up @@ -107,4 +152,41 @@ module MakeInlineTaintTest<DataFlow::ConfigSig Config> {
)
)
}

query predicate argumentToEnsureNotTaintedWithReadsNotMarkedAsSpurious(
Location location, string error, string element
) {
error = "ERROR, you should add `SPURIOUS:` to this annotation" and
location = shouldNotBeTaintedWithReads().getLocation() and
InlineTaintTest::hasActualResult(location, element, "tainted", _) and
exists(GoodTestExpectation good, ActualTestResult actualResult |
good.matchesActualResult(actualResult) and
actualResult.getLocation() = location and
actualResult.toString() = element
)
}

query predicate untaintedArgumentToEnsureTaintedWithReadsNotMarkedAsMissing(
Location location, string error, string element
) {
error = "ERROR, you should add `# $ MISSING: tainted` annotation" and
exists(DataFlow::Node sink |
sink = shouldBeTaintedWithReads() and
element = prettyExpr(sink.asExpr()) and
not Flow::flowTo(sink) and
location = sink.getLocation() and
not exists(FalseNegativeTestExpectation missingResult |
missingResult.getTag() = "tainted" and
missingResult.getLocation().getFile() = location.getFile() and
missingResult.getLocation().getStartLine() = location.getStartLine()
)
)
}

query predicate spuriousReadStepsPossible(DataFlow::Node readSink, DataFlow::Node sink) {
readSink = readingSink() and
sink in [readingSink(), nonReadingSink()] and
readSink != sink and
TaintTracking::localTaint(readSink, sink)
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/**
* Provides classes for testing local (intra-procedural) and
* global (inter-procedural) taint-tracking analyses.
*
* This module intentionally turns off implicit read steps at sinks.
* This makes it possible to do successive tests of taint propagation,
* using sinks (such as arguments to `ensure_tainted`) as markers, and get a picture of where taint flows
* on its way from a source to a sink.
*
* To use global (interprocedural) taint tracking, extend the class
* `TaintTracking::Configuration` as documented on that class. To use local
* (intraprocedural) taint tracking, call `TaintTracking::localTaint` or
* `TaintTracking::localTaintStep` with arguments of type `DataFlow::Node`.
*/

private import python

/**
* Provides classes for performing local (intra-procedural) and
* global (inter-procedural) taint-tracking analyses.
*/
module TaintTracking {
import semmle.python.dataflow.new.internal.tainttracking1.TaintTrackingParameter::Public

Check warning

Code scanning / CodeQL

Redundant import Warning test

Redundant import, the module is already imported inside
semmle.python.dataflow.new.internal.tainttracking1.TaintTrackingImpl
.
private import semmle.python.dataflow.new.internal.DataFlowImplSpecific
private import codeql.dataflow.TaintTracking
private import semmle.python.Files

module PythonTaintTracking implements InputSig<Location, PythonDataFlow> {
private import semmle.python.dataflow.new.internal.TaintTrackingPrivate as TP

/**
* Holds if `node` should be a sanitizer in all global taint flow configurations
* but not in local taint.
*/
predicate defaultTaintSanitizer(PythonDataFlow::Node node) { TP::defaultTaintSanitizer(node) }

/**
* Holds if default `TaintTracking::Configuration`s should allow implicit reads
* of `c` at sinks and inputs to additional taint steps.
*/
bindingset[node]
predicate defaultImplicitTaintRead(PythonDataFlow::Node node, PythonDataFlow::ContentSet c) {
// Normally, we allow implicit reads of precise content,
// but for taint tests, we turn this off.
none()
}

predicate defaultAdditionalTaintStep(
PythonDataFlow::Node nodeFrom, PythonDataFlow::Node nodeTo, string model
) {
TP::defaultAdditionalTaintStep(nodeFrom, nodeTo, model)
}
}

import TaintFlowMake<Location, PythonDataFlow, PythonTaintTracking>
import semmle.python.dataflow.new.internal.tainttracking1.TaintTrackingImpl
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import experimental.meta.InlineTaintTest::Conf
module Config implements DataFlow::ConfigSig {
predicate isSource(DataFlow::Node source) { TestTaintTrackingConfig::isSource(source) }

predicate isSink(DataFlow::Node source) { TestTaintTrackingConfig::isSink(source) }
predicate isSink(DataFlow::Node sink) { TestTaintTrackingConfig::isSink(sink) }
}

module Flows = TaintTracking::Global<Config>;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@ argumentToEnsureNotTaintedNotMarkedAsSpurious
untaintedArgumentToEnsureTaintedNotMarkedAsMissing
| taint_test.py:32:9:32:25 | taint_test.py:32 | ERROR, you should add `# $ MISSING: tainted` annotation | should_be_tainted |
| taint_test.py:37:24:37:40 | taint_test.py:37 | ERROR, you should add `# $ MISSING: tainted` annotation | should_be_tainted |
argumentToEnsureNotTaintedWithReadsNotMarkedAsSpurious
untaintedArgumentToEnsureTaintedWithReadsNotMarkedAsMissing
spuriousReadStepsPossible
| taint_test.py:68:9:68:10 | ControlFlowNode for tt | taint_test.py:77:9:77:13 | ControlFlowNode for Subscript |
testFailures
| taint_test.py:41:20:41:21 | ts | Fixed missing result:tainted= |
failures
Loading
Loading