Skip to content
Permalink
Browse files Browse the repository at this point in the history
Fix tf.io.decode_raw bugs and update documentation.
Fixes cases where specifying `fixed_length` resulted in data loss and even segfault and corruption of the Python interpreter. The fix is subtle but needed due to pointer arithmetic rules.

Makes sure that `fixed_length` does not change the output when present but not needed.

Eliminates needless copy and cast in the main codepath.

PiperOrigin-RevId: 371322725
Change-Id: I514ef67a2961c86422f69d05122d31615e87896c
  • Loading branch information
mihaimaruseac authored and tensorflower-gardener committed Apr 30, 2021
1 parent 1d8903e commit 698e015
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 27 deletions.
21 changes: 12 additions & 9 deletions tensorflow/core/kernels/decode_padded_raw_op.cc
Expand Up @@ -19,6 +19,7 @@ limitations under the License.
#include "tensorflow/core/framework/common_shape_fns.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/op_requires.h"
#include "tensorflow/core/framework/shape_inference.h"

namespace tensorflow {
Expand Down Expand Up @@ -83,14 +84,13 @@ class DecodePaddedRawOp : public OpKernel {
// can copy the memory directly.
if (!convert_data_endianness_ || sizeof(T) == 1) {
for (int64 i = 0; i < flat_in.size(); ++i) {
const T* in_data = reinterpret_cast<const T*>(flat_in(i).data());

if (flat_in(i).size() > fixed_length) {
memcpy(out_data, in_data, fixed_length);
} else {
memcpy(out_data, in_data, flat_in(i).size());
}
out_data += fixed_length;
const auto to_copy =
std::min(flat_in(i).size(), static_cast<size_t>(fixed_length));
memcpy(out_data, flat_in(i).data(), to_copy);
// Note: increase out_data by width since it's already of type T* so
// each shift amount is implicitly multiplied by sizeof(T) according to
// pointer arithmetic rules.
out_data += width;
}
} else {
// Otherwise, the data is not in the host's byte order, and rather than a
Expand All @@ -105,7 +105,10 @@ class DecodePaddedRawOp : public OpKernel {
p_in += sizeof(T), p_out += sizeof(T)) {
std::reverse_copy(p_in, p_in + sizeof(T), p_out);
}
out_data += fixed_length;
// Note: increase out_data by width since it's already of type T* so
// each shift amount is implicitly multiplied by sizeof(T) according to
// pointer arithmetic rules.
out_data += width;
}
}
}
Expand Down
49 changes: 31 additions & 18 deletions tensorflow/python/ops/parsing_ops.py
Expand Up @@ -850,8 +850,8 @@ def decode_raw(input_bytes,
name=None):
r"""Convert raw bytes from input tensor into numeric tensors.
The input tensor is interpreted as a sequence of bytes. These bytes are then
decoded as numbers in the format specified by `out_type`.
Every component of the input tensor is interpreted as a sequence of bytes.
These bytes are then decoded as numbers in the format specified by `out_type`.
>>> tf.io.decode_raw(tf.constant("1"), tf.uint8)
<tf.Tensor: shape=(1,), dtype=uint8, numpy=array([49], dtype=uint8)>
Expand Down Expand Up @@ -909,22 +909,35 @@ def decode_raw(input_bytes,
>>> tf.io.decode_raw(tf.constant(["1212"]), tf.uint16, fixed_length=4)
<tf.Tensor: shape=(1, 2), dtype=uint16, numpy=array([[12849, 12849]], ...
Note: There is currently a bug in `fixed_length` that can result in data loss:
>>> # truncated to length of type as it matches fixed_length
>>> tf.io.decode_raw(tf.constant(["1212"]), tf.uint16, fixed_length=2)
<tf.Tensor: shape=(1, 1), dtype=uint16, numpy=array([[12849]], dtype=uint16)>
>>> # ignores the second component
>>> tf.io.decode_raw(tf.constant(["12","34"]), tf.uint16, fixed_length=2)
<tf.Tensor: shape=(2, 1), dtype=uint16, numpy=
array([[12849],
[ 0]], dtype=uint16)>
>>> tf.io.decode_raw(tf.constant(["12","34"]), tf.uint16, fixed_length=4)
<tf.Tensor: shape=(2, 2), dtype=uint16, numpy=
array([[12849, 0],
[ 0, 0]], dtype=uint16)>
This will be fixed on a future release of TensorFlow.
If the input value is larger than `fixed_length`, it is truncated:
>>> x=''.join([chr(1), chr(2), chr(3), chr(4)])
>>> tf.io.decode_raw(x, tf.uint16, fixed_length=2)
<tf.Tensor: shape=(1,), dtype=uint16, numpy=array([513], dtype=uint16)>
>>> hex(513)
'0x201'
If `little_endian` and `fixed_length` are specified, truncation to the fixed
length occurs before endianness conversion:
>>> x=''.join([chr(1), chr(2), chr(3), chr(4)])
>>> tf.io.decode_raw(x, tf.uint16, fixed_length=2, little_endian=False)
<tf.Tensor: shape=(1,), dtype=uint16, numpy=array([258], dtype=uint16)>
>>> hex(258)
'0x102'
If input values all have the same length, then specifying `fixed_length`
equal to the size of the strings should not change output:
>>> x = ["12345678", "87654321"]
>>> tf.io.decode_raw(x, tf.int16)
<tf.Tensor: shape=(2, 4), dtype=int16, numpy=
array([[12849, 13363, 13877, 14391],
[14136, 13622, 13108, 12594]], dtype=int16)>
>>> tf.io.decode_raw(x, tf.int16, fixed_length=len(x[0]))
<tf.Tensor: shape=(2, 4), dtype=int16, numpy=
array([[12849, 13363, 13877, 14391],
[14136, 13622, 13108, 12594]], dtype=int16)>
Args:
input_bytes:
Expand Down

0 comments on commit 698e015

Please sign in to comment.