Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
"""Implementation of Neural Net (NN) functions."""
import math
from tensorflow.python.distribute import distribute_lib
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import array_ops_stack
from tensorflow.python.ops import candidate_sampling_ops
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import cond as tf_cond
from tensorflow.python.ops import custom_gradient
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import gen_array_ops # pylint: disable=unused-import
from tensorflow.python.ops import gen_nn_ops
from tensorflow.python.ops import gen_sparse_ops
from tensorflow.python.ops import linalg_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import variables
from tensorflow.python.ops.losses import util as losses_util
from tensorflow.python.platform import device_context
from tensorflow.python.util import dispatch
from tensorflow.python.util.deprecation import deprecated_args
from tensorflow.python.util.deprecation import deprecated_argument_lookup
from tensorflow.python.util.tf_export import tf_export
@tf_export("nn.log_poisson_loss")
@dispatch.add_dispatch_support
def log_poisson_loss(targets, log_input, compute_full_loss=False, name=None):
  """Computes log Poisson loss given `log_input`.

  Gives the log-likelihood loss between the prediction and the target under the
  assumption that the target has a Poisson distribution.

  Caveat: By default, this is not the exact loss, but the loss minus a
  constant term [log(z!)]. That has no effect for optimization, but
  does not play well with relative loss comparisons. To compute an
  approximation of the log factorial term, specify
  `compute_full_loss=True` to enable Stirling's Approximation.

  For brevity, let `c = log(x) = log_input`, `z = targets`.  The log Poisson
  loss is

        -log(exp(-x) * (x^z) / z!)
      = -log(exp(-x) * (x^z)) + log(z!)
      ~ -log(exp(-x)) - log(x^z) [+ z * log(z) - z + 0.5 * log(2 * pi * z)]
          [ Note the second term is the Stirling's Approximation for log(z!).
            It is invariant to x and does not affect optimization, though
            important for correct relative loss comparisons. It is only
            computed when compute_full_loss == True. ]
      = x - z * log(x) [+ z * log(z) - z + 0.5 * log(2 * pi * z)]
      = exp(c) - z * c [+ z * log(z) - z + 0.5 * log(2 * pi * z)]

  When the full loss is requested, the Stirling term is replaced by zero for
  every element whose target lies in the closed interval `[0, 1]`.

  Args:
    targets: A `Tensor` of the same type and shape as `log_input`.
    log_input: A `Tensor` of type `float32` or `float64`.
    compute_full_loss: whether to compute the full loss. If false, a constant
      term is dropped in favor of more efficient optimization.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of the same shape as `log_input` with the componentwise
    log Poisson losses.

  Raises:
    ValueError: If `log_input` and `targets` do not have the same shape.
  """
  with ops.name_scope(name, "log_poisson_loss", [log_input, targets]):
    log_input = ops.convert_to_tensor(log_input, name="log_input")
    targets = ops.convert_to_tensor(targets, name="targets")
    try:
      targets.get_shape().assert_is_compatible_with(log_input.get_shape())
    except ValueError:
      raise ValueError(
          "`log_input` and `targets` must have the same shape, received "
          f"({log_input.get_shape()} vs {targets.get_shape()}).")

    # exp(c) - z * c: the loss without the log(z!) constant.
    loss = math_ops.exp(log_input) - log_input * targets
    if not compute_full_loss:
      return loss

    # The constants are built as tensors so that their dtype follows the
    # dtype of `targets`.
    target_dtype = targets.dtype
    half = constant_op.constant(0.5, dtype=target_dtype)
    two_pi = constant_op.constant(2 * math.pi, dtype=target_dtype)
    log_factorial = (targets * math_ops.log(targets)) - targets + (
        half * math_ops.log(two_pi * targets))
    zeros = array_ops.zeros_like(targets, dtype=target_dtype)
    ones = array_ops.ones_like(targets, dtype=target_dtype)
    # No correction is added where 0 <= z <= 1.
    in_unit_interval = math_ops.logical_and(targets >= zeros, targets <= ones)
    return loss + array_ops.where(in_unit_interval, zeros, log_factorial)
@tf_export(v1=["nn.sigmoid_cross_entropy_with_logits"])
@dispatch.add_dispatch_support
def sigmoid_cross_entropy_with_logits(
    labels=None,
    logits=None,
    name=None):
  """See sigmoid_cross_entropy_with_logits_v2."""
  # pylint: disable=protected-access
  nn_ops._ensure_xent_args("sigmoid_cross_entropy_with_logits", labels, logits)
  # pylint: enable=protected-access

  with ops.name_scope(name, "logistic_loss", [logits, labels]) as name:
    logits = ops.convert_to_tensor(logits, name="logits")
    labels = ops.convert_to_tensor(labels, name="labels")
    try:
      labels.get_shape().assert_is_compatible_with(logits.get_shape())
    except ValueError:
      raise ValueError("`logits` and `labels` must have the same shape, "
                       f"received ({logits.get_shape()} vs "
                       f"{labels.get_shape()}).")

    # With x = logits and z = labels, the logistic loss is
    #   x - x * z + log(1 + exp(-x))
    # and, for x < 0, the numerically safer equivalent is
    #   -x * z + log(1 + exp(x)).
    # Both cases are covered by the single expression
    #   max(x, 0) - x * z + log(1 + exp(-abs(x))).
    # max and abs are spelled out with `where` so that gradients can be
    # computed at zero.
    zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
    is_non_negative = (logits >= zeros)
    positive_part = array_ops.where(is_non_negative, logits, zeros)
    minus_abs = array_ops.where(is_non_negative, -logits, logits)  # pylint: disable=invalid-unary-operand-type
    linear_term = positive_part - logits * labels
    log_term = math_ops.log1p(math_ops.exp(minus_abs))
    return math_ops.add(linear_term, log_term, name=name)
# Note: intentionally calling this v2 to not allow existing code with indirect
# imports to ignore the sentinel behavior.
@tf_export("nn.sigmoid_cross_entropy_with_logits", v1=[])
@dispatch.register_binary_elementwise_api
@dispatch.add_dispatch_support
def sigmoid_cross_entropy_with_logits_v2(  # pylint: disable=invalid-name
    labels=None,
    logits=None,
    name=None):
  r"""Computes sigmoid cross entropy given `logits`.

  Measures the probability error in tasks with two outcomes in which each
  outcome is independent and need not have a fully certain label. For instance,
  one could perform a regression where the probability of an event happening is
  known and used as a label. This loss may also be used for binary
  classification, where labels are either zero or one.

  For brevity, let `x = logits`, `z = labels`.  The logistic loss is

        z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
      = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
      = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
      = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x)))
      = (1 - z) * x + log(1 + exp(-x))
      = x - x * z + log(1 + exp(-x))

  For x < 0, to avoid overflow in exp(-x), we reformulate the above

        x - x * z + log(1 + exp(-x))
      = log(exp(x)) - x * z + log(1 + exp(-x))
      = - x * z + log(1 + exp(x))

  Hence, to ensure stability and avoid overflow, the implementation uses this
  equivalent formulation

      max(x, 0) - x * z + log(1 + exp(-abs(x)))

  `logits` and `labels` must have the same type and shape.

  >>> logits = tf.constant([1., -1., 0., 1., -1., 0., 0.])
  >>> labels = tf.constant([0., 0., 0., 1., 1., 1., 0.5])
  >>> tf.nn.sigmoid_cross_entropy_with_logits(
  ...     labels=labels, logits=logits).numpy()
  array([1.3132617, 0.3132617, 0.6931472, 0.3132617, 1.3132617, 0.6931472,
         0.6931472], dtype=float32)

  Compared to the losses which handle multiple outcomes,
  `tf.nn.softmax_cross_entropy_with_logits` for general multi-class
  classification and `tf.nn.sparse_softmax_cross_entropy_with_logits` for more
  efficient multi-class classification with hard labels,
  `sigmoid_cross_entropy_with_logits` is a slight simplification for binary
  classification:

        sigmoid(x) = softmax([x, 0])[0]

  $$\frac{1}{1 + e^{-x}} = \frac{e^x}{e^x + e^0}$$

  While `sigmoid_cross_entropy_with_logits` works for soft binary labels
  (probabilities between 0 and 1), it can also be used for binary classification
  where the labels are hard. There is an equivalence between all three symbols
  in this case, with a probability 0 indicating the second class or 1 indicating
  the first class:

  >>> sigmoid_logits = tf.constant([1., -1., 0.])
  >>> softmax_logits = tf.stack([sigmoid_logits, tf.zeros_like(sigmoid_logits)],
  ...                           axis=-1)
  >>> soft_binary_labels = tf.constant([1., 1., 0.])
  >>> soft_multiclass_labels = tf.stack(
  ...     [soft_binary_labels, 1. - soft_binary_labels], axis=-1)
  >>> hard_labels = tf.constant([0, 0, 1])
  >>> tf.nn.sparse_softmax_cross_entropy_with_logits(
  ...     labels=hard_labels, logits=softmax_logits).numpy()
  array([0.31326166, 1.3132616 , 0.6931472 ], dtype=float32)
  >>> tf.nn.softmax_cross_entropy_with_logits(
  ...     labels=soft_multiclass_labels, logits=softmax_logits).numpy()
  array([0.31326166, 1.3132616, 0.6931472], dtype=float32)
  >>> tf.nn.sigmoid_cross_entropy_with_logits(
  ...     labels=soft_binary_labels, logits=sigmoid_logits).numpy()
  array([0.31326166, 1.3132616, 0.6931472], dtype=float32)

  Args:
    labels: A `Tensor` of the same type and shape as `logits`. Between 0 and 1,
      inclusive.
    logits: A `Tensor` of type `float32` or `float64`. Any real number.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of the same shape as `logits` with the componentwise
    logistic losses.

  Raises:
    ValueError: If `logits` and `labels` do not have the same shape.
  """
  # All of the work happens in the v1 entry point; this wrapper only exists to
  # carry the v2 export name and the documentation.
  return sigmoid_cross_entropy_with_logits(
      labels=labels, logits=logits, name=name)


# The v1 symbol shares the full documentation written for the v2 symbol.
sigmoid_cross_entropy_with_logits.__doc__ = (
    sigmoid_cross_entropy_with_logits_v2.__doc__)
@tf_export("nn.weighted_cross_entropy_with_logits", v1=[])
@dispatch.add_dispatch_support
def weighted_cross_entropy_with_logits_v2(labels, logits, pos_weight,
                                          name=None):
  """Computes a weighted cross entropy.

  This is like `sigmoid_cross_entropy_with_logits()` except that `pos_weight`,
  allows one to trade off recall and precision by up- or down-weighting the
  cost of a positive error relative to a negative error.

  The usual cross-entropy cost is defined as:

      labels * -log(sigmoid(logits)) +
          (1 - labels) * -log(1 - sigmoid(logits))

  A value `pos_weight > 1` decreases the false negative count, hence increasing
  the recall.
  Conversely setting `pos_weight < 1` decreases the false positive count and
  increases the precision.
  This can be seen from the fact that `pos_weight` is introduced as a
  multiplicative coefficient for the positive labels term
  in the loss expression:

      labels * -log(sigmoid(logits)) * pos_weight +
          (1 - labels) * -log(1 - sigmoid(logits))

  For brevity, let `x = logits`, `z = labels`, `q = pos_weight`.
  The loss is:

        qz * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
      = qz * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
      = qz * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
      = qz * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x)))
      = (1 - z) * x + (qz +  1 - z) * log(1 + exp(-x))
      = (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(-x))

  Setting `l = (1 + (q - 1) * z)`, to ensure stability and avoid overflow,
  the implementation uses

      (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0))

  `logits` and `labels` must have the same type and shape.

  >>> labels = tf.constant([1., 0.5, 0.])
  >>> logits = tf.constant([1.5, -0.1, -10.])
  >>> tf.nn.weighted_cross_entropy_with_logits(
  ...     labels=labels, logits=logits, pos_weight=tf.constant(1.5)).numpy()
  array([3.0211994e-01, 8.8049585e-01, 4.5776367e-05], dtype=float32)
  >>> tf.nn.weighted_cross_entropy_with_logits(
  ...     labels=labels, logits=logits, pos_weight=tf.constant(0.5)).numpy()
  array([1.00706644e-01, 5.08297503e-01, 4.57763672e-05], dtype=float32)

  Args:
    labels: A `Tensor` of the same type and shape as `logits`, with values
      between 0 and 1 inclusive.
    logits: A `Tensor` of type `float32` or `float64`, any real numbers.
    pos_weight: A coefficient to use on the positive examples, typically a
      scalar but otherwise broadcastable to the shape of `logits`. Its value
      should be non-negative.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of the same shape as `logits` with the componentwise
    weighted logistic losses.

  Raises:
    ValueError: If `logits` and `labels` do not have the same shape.
  """
  with ops.name_scope(name, "logistic_loss", [logits, labels]) as name:
    logits = ops.convert_to_tensor(logits, name="logits")
    labels = ops.convert_to_tensor(labels, name="labels")
    try:
      labels.get_shape().assert_is_compatible_with(logits.get_shape())
    except ValueError:
      raise ValueError("`logits` and `labels` must have the same shape, "
                       f"received ({logits.get_shape()} vs "
                       f"{labels.get_shape()}).")

    # With l = 1 + (q - 1) * z the loss reads
    #   (1 - z) * x + l * log(1 + exp(-x)).
    # For x < 0 a numerically safer form is
    #   (1 - z) * x + l * log(1 + exp(x)) - l * x.
    # A single branch-free expression covers both cases:
    #   (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0)).
    log_weight = 1 + (pos_weight - 1) * labels
    negative_class_term = (1 - labels) * logits
    stable_softplus = (
        math_ops.log1p(math_ops.exp(-math_ops.abs(logits))) +
        nn_ops.relu(-logits))  # pylint: disable=invalid-unary-operand-type
    return math_ops.add(
        negative_class_term, log_weight * stable_softplus, name=name)
@tf_export(v1=["nn.weighted_cross_entropy_with_logits"])
@dispatch.add_dispatch_support
@deprecated_args(None, "targets is deprecated, use labels instead", "targets")
def weighted_cross_entropy_with_logits(labels=None,
                                       logits=None,
                                       pos_weight=None,
                                       name=None,
                                       targets=None):
  """Computes a weighted cross entropy.

  This is like `sigmoid_cross_entropy_with_logits()` except that `pos_weight`,
  allows one to trade off recall and precision by up- or down-weighting the
  cost of a positive error relative to a negative error.

  The usual cross-entropy cost is defined as:

      labels * -log(sigmoid(logits)) +
          (1 - labels) * -log(1 - sigmoid(logits))

  A value `pos_weight > 1` decreases the false negative count, hence increasing
  the recall.
  Conversely setting `pos_weight < 1` decreases the false positive count and
  increases the precision.
  This can be seen from the fact that `pos_weight` is introduced as a
  multiplicative coefficient for the positive labels term
  in the loss expression:

      labels * -log(sigmoid(logits)) * pos_weight +
          (1 - labels) * -log(1 - sigmoid(logits))

  For brevity, let `x = logits`, `z = labels`, `q = pos_weight`.
  The loss is:

        qz * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
      = qz * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
      = qz * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
      = qz * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x)))
      = (1 - z) * x + (qz +  1 - z) * log(1 + exp(-x))
      = (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(-x))

  Setting `l = (1 + (q - 1) * z)`, to ensure stability and avoid overflow,
  the implementation uses

      (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0))

  `logits` and `labels` must have the same type and shape.

  Args:
    labels: A `Tensor` of the same type and shape as `logits`.
    logits: A `Tensor` of type `float32` or `float64`.
    pos_weight: A coefficient to use on the positive examples.
    name: A name for the operation (optional).
    targets: Deprecated alias for labels.

  Returns:
    A `Tensor` of the same shape as `logits` with the componentwise
    weighted logistic losses.

  Raises:
    ValueError: If `logits` and `labels` do not have the same shape.
  """
  # Resolve the deprecated `targets` alias first, then hand everything to the
  # v2 implementation.
  resolved_labels = deprecated_argument_lookup(
      "labels", labels, "targets", targets)
  return weighted_cross_entropy_with_logits_v2(
      resolved_labels, logits, pos_weight, name)
@tf_export("nn.compute_average_loss")
@dispatch.add_dispatch_support
def compute_average_loss(per_example_loss,
                         sample_weight=None,
                         global_batch_size=None):
  """Scales per-example losses with sample_weights and computes their average.

  Usage with distribution strategy and custom training loop:

  ```python
  with strategy.scope():
    def compute_loss(labels, predictions, sample_weight=None):

      # If you are using a `Loss` class instead, set reduction to `NONE` so that
      # we can do the reduction afterwards and divide by global batch size.
      per_example_loss = tf.keras.losses.sparse_categorical_crossentropy(
          labels, predictions)

      # Compute loss that is scaled by sample_weight and by global batch size.
      return tf.nn.compute_average_loss(
          per_example_loss,
          sample_weight=sample_weight,
          global_batch_size=GLOBAL_BATCH_SIZE)
  ```

  Args:
    per_example_loss: Per-example loss.
    sample_weight: Optional weighting for each example.
    global_batch_size: Optional global batch size value. Defaults to (size of
      first dimension of `per_example_loss`) * (number of replicas).

  Returns:
    Scalar loss value, obtained by summing the `per_example_loss` and dividing
    by `global_batch_size`. If `global_batch_size` is zero, the result is zero.
  """  # pylint: disable=g-doc-exception
  losses = ops.convert_to_tensor(per_example_loss)
  # Remembered up front so the result comes back in the caller's dtype even
  # after the sample-weight scaling.
  input_dtype = losses.dtype

  with losses_util.check_per_example_loss_rank(losses):
    if sample_weight is not None:
      weights = ops.convert_to_tensor(sample_weight)
      losses = losses_util.scale_losses_by_sample_weight(losses, weights)
    losses = math_ops.cast(losses, input_dtype)

    if global_batch_size is None:
      # The batch size can only be derived from inside a replica.
      if (distribute_lib.has_strategy()
          and distribute_lib.in_cross_replica_context()):
        raise RuntimeError(
            "You are calling `compute_average_loss` in cross replica context, "
            "while it was expected to be called in replica context.")

      replica_count = distribute_lib.get_strategy().num_replicas_in_sync
      local_batch_size = array_ops.shape_v2(losses)[0]
      global_batch_size = local_batch_size * replica_count

    check_ops.assert_scalar_v2(
        global_batch_size, message="global_batch_size must be scalar.")
    check_ops.assert_integer_v2(
        global_batch_size,
        message="global_batch_size must be an integer.")
    check_ops.assert_non_negative_v2(
        global_batch_size, message="global_batch_size must be non-negative.")

    total_loss = math_ops.reduce_sum(losses)
    divisor = math_ops.cast(global_batch_size, input_dtype)
    # div_no_nan yields zero rather than NaN/Inf for a zero batch size.
    return math_ops.div_no_nan(total_loss, divisor)
@tf_export("nn.scale_regularization_loss")
@dispatch.add_dispatch_support
def scale_regularization_loss(regularization_loss):
  """Scales the sum of the given regularization losses by number of replicas.

  Usage with distribution strategy and custom training loop:

  ```python
  with strategy.scope():
    def compute_loss(labels, predictions, sample_weight=None):
      per_example_loss = tf.keras.losses.sparse_categorical_crossentropy(
          labels, predictions)

      # Compute loss that is scaled by sample_weight and by global batch size.
      loss = tf.nn.compute_average_loss(
          per_example_loss,
          sample_weight=sample_weight,
          global_batch_size=GLOBAL_BATCH_SIZE)

      # Add scaled regularization losses.
      loss += tf.nn.scale_regularization_loss(tf.nn.l2_loss(weights))
      return loss
  ```

  Args:
    regularization_loss: Regularization loss.

  Returns:
    Scalar loss value.
  """  # pylint: disable=g-doc-exception
  in_strategy = distribute_lib.has_strategy()
  if in_strategy and distribute_lib.in_cross_replica_context():
    raise RuntimeError(
        "You are calling `scale_regularization_loss` in cross replica context, "
        "while it was expected to be called in replica context.")

  replica_count = distribute_lib.get_strategy().num_replicas_in_sync
  summed_loss = math_ops.reduce_sum(regularization_loss)
  return summed_loss / replica_count
@tf_export(v1=["nn.relu_layer"])
@dispatch.add_dispatch_support
def relu_layer(x, weights, biases, name=None):
  """Computes Relu(x * weight + biases).

  Args:
    x: a 2D tensor.  Dimensions typically: batch, in_units
    weights: a 2D tensor.  Dimensions typically: in_units, out_units
    biases: a 1D tensor.  Dimensions: out_units
    name: A name for the operation (optional).  If not specified
      "relu_layer" is used.

  Returns:
    A 2-D Tensor computing relu(matmul(x, weights) + biases).
    Dimensions typically: batch, out_units.
  """
  with ops.name_scope(name, "relu_layer", [x, weights, biases]) as name:
    x = ops.convert_to_tensor(x, name="x")
    weights = ops.convert_to_tensor(weights, name="weights")
    biases = ops.convert_to_tensor(biases, name="biases")
    projected = math_ops.matmul(x, weights)
    pre_activation = nn_ops.bias_add(projected, biases)
    return nn_ops.relu(pre_activation, name=name)
@tf_export("nn.silu", "nn.swish")
@dispatch.register_unary_elementwise_api
@dispatch.add_dispatch_support
def swish(features, beta=1.0):
  # pylint: disable=g-doc-args
  """Computes the SiLU or Swish activation function: `x * sigmoid(beta * x)`.

  The SiLU activation function was introduced in "Gaussian Error Linear Units
  (GELUs)" [Hendrycks et al. 2016](https://arxiv.org/abs/1606.08415) and
  "Sigmoid-Weighted Linear Units for Neural Network Function Approximation in
  Reinforcement Learning"
  [Elfwing et al. 2017](https://arxiv.org/abs/1702.03118) and was independently
  discovered (and called swish) in "Searching for Activation Functions"
  [Ramachandran et al. 2017](https://arxiv.org/abs/1710.05941)

  Args:
    features: A `Tensor` representing preactivation values.
    beta: A `Tensor` representing the value of the beta hyperparameter of the
      Swish activation function. Default value 1.0. It is cast to the dtype
      of `features`.

  Returns:
    The activation value.
  """
  # pylint: enable=g-doc-args
  features = ops.convert_to_tensor(features, name="features")
  beta = ops.convert_to_tensor(beta, name="beta")
  beta = math_ops.cast(beta, features.dtype)

  @custom_gradient.custom_gradient
  def _swish_with_gradient(x, scale):
    """Forward pass plus a hand-written gradient for both inputs."""

    def grad(dy):
      """Gradient for the Swish activation function."""
      # A naive x * sigmoid(x) keeps both x and sigmoid(x) alive for backprop,
      # which roughly doubles the memory held for this tensor. Tying the
      # sigmoid to `dy` through a control dependency forces it to be
      # recomputed in the backward pass (it cannot be de-duplicated with the
      # forward op), so the forward sigmoid can be released right after use.
      with ops.control_dependencies([dy]):
        gate = math_ops.sigmoid(scale * x)

      # d/dx [x * sigmoid(b * x)] = s * (1 + b * x * (1 - s))
      features_grad = gate * (1.0 + (scale * x) * (1.0 - gate))
      # d/db [x * sigmoid(b * x)] = x^2 * s * (1 - s), summed to one value.
      scale_grad = math_ops.reduce_sum(
          dy * math_ops.square(x) * gate * (1.0 - gate))
      return dy * features_grad, scale_grad

    return x * math_ops.sigmoid(scale * x), grad

  return _swish_with_gradient(features, beta)
# pylint: disable=redefined-builtin
@tf_export("linalg.normalize")
@dispatch.add_dispatch_support
def normalize(tensor, ord="euclidean", axis=None, name=None):
  """Normalizes `tensor` along dimension `axis` using specified norm.

  This uses `tf.linalg.norm` to compute the norm along `axis`.

  This function can compute several different vector norms (the 1-norm, the
  Euclidean or 2-norm, the inf-norm, and in general the p-norm for p > 0) and
  matrix norms (Frobenius, 1-norm, 2-norm and inf-norm).

  Args:
    tensor: `Tensor` of types `float32`, `float64`, `complex64`, `complex128`
    ord: Order of the norm. Supported values are `'fro'`, `'euclidean'`, `1`,
      `2`, `np.inf` and any positive real number yielding the corresponding
      p-norm. Default is `'euclidean'` which is equivalent to Frobenius norm if
      `tensor` is a matrix and equivalent to 2-norm for vectors.
      Some restrictions apply: a) The Frobenius norm `'fro'` is not defined for
        vectors, b) If axis is a 2-tuple (matrix norm), only `'euclidean'`,
        `'fro'`, `1`, `2`, `np.inf` are supported. See the description of `axis`
        on how to compute norms for a batch of vectors or matrices stored in a
        tensor.
    axis: If `axis` is `None` (the default), the input is considered a vector
      and a single vector norm is computed over the entire set of values in the
      tensor, i.e. `norm(tensor, ord=ord)` is equivalent to
      `norm(reshape(tensor, [-1]), ord=ord)`. If `axis` is a Python integer, the
      input is considered a batch of vectors, and `axis` determines the axis in
      `tensor` over which to compute vector norms. If `axis` is a 2-tuple of
      Python integers it is considered a batch of matrices and `axis` determines
      the axes in `tensor` over which to compute a matrix norm.
      Negative indices are supported. Example: If you are passing a tensor that
        can be either a matrix or a batch of matrices at runtime, pass
        `axis=[-2,-1]` instead of `axis=None` to make sure that matrix norms are
        computed.
    name: The name of the op.

  Returns:
    normalized: A normalized `Tensor` with the same shape as `tensor`.
    norm: The computed norms with the same shape and dtype `tensor` but the
      final axis is 1 instead. Same as running
      `tf.cast(tf.linalg.norm(tensor, ord, axis, keepdims=True), tensor.dtype)`.

  Raises:
    ValueError: If `ord` or `axis` is invalid.
  """
  with ops.name_scope(name, "normalize", [tensor]):
    tensor = ops.convert_to_tensor(tensor)
    # keepdims=True keeps the norm broadcastable against `tensor`.
    raw_norm = linalg_ops.norm(tensor, ord, axis, keepdims=True)
    norm = math_ops.cast(raw_norm, tensor.dtype)
    return tensor / norm, norm
@tf_export("math.l2_normalize", "linalg.l2_normalize", "nn.l2_normalize",
           v1=["math.l2_normalize", "linalg.l2_normalize", "nn.l2_normalize"])
@dispatch.add_dispatch_support
@deprecated_args(None, "dim is deprecated, use axis instead", "dim")
def l2_normalize(x, axis=None, epsilon=1e-12, name=None, dim=None):
  """Normalizes along dimension `axis` using an L2 norm.

  For a 1-D tensor with `axis = 0`, computes

      output = x / sqrt(max(sum(x**2), epsilon))

  For `x` with more dimensions, independently normalizes each 1-D slice along
  dimension `axis`.

  1-D tensor example:
  >>> x = tf.constant([3.0, 4.0])
  >>> tf.math.l2_normalize(x).numpy()
  array([0.6, 0.8], dtype=float32)

  2-D tensor example:
  >>> x = tf.constant([[3.0], [4.0]])
  >>> tf.math.l2_normalize(x, 0).numpy()
  array([[0.6],
       [0.8]], dtype=float32)

  >>> x = tf.constant([[3.0], [4.0]])
  >>> tf.math.l2_normalize(x, 1).numpy()
  array([[1.],
       [1.]], dtype=float32)

  Args:
    x: A `Tensor`.
    axis: Dimension along which to normalize.  A scalar or a vector of
      integers.
    epsilon: A lower bound value for the norm. Will use `sqrt(epsilon)` as the
      divisor if `norm < sqrt(epsilon)`.
    name: A name for this operation (optional).
    dim: Deprecated, do not use.

  Returns:
    A `Tensor` with the same shape as `x`.
  """
  axis = deprecated_argument_lookup("axis", axis, "dim", dim)
  with ops.name_scope(name, "l2_normalize", [x]) as name:
    x = ops.convert_to_tensor(x, name="x")
    is_complex = x.dtype.is_complex

    # Squared L2 norm; for complex input it is |re|^2 + |im|^2.
    if is_complex:
      squared_norm = math_ops.real(
          math_ops.reduce_sum(
              math_ops.square(math_ops.real(x)) +
              math_ops.square(math_ops.imag(x)),
              axis,
              keepdims=True))
    else:
      squared_norm = math_ops.reduce_sum(
          math_ops.square(x), axis, keepdims=True)

    # Clamping by epsilon bounds the reciprocal norm.
    inverse_norm = math_ops.rsqrt(math_ops.maximum(squared_norm, epsilon))

    if is_complex:
      # Scale the real and imaginary parts separately, then recombine.
      scaled_real = math_ops.multiply(math_ops.real(x), inverse_norm)
      scaled_imag = math_ops.multiply(math_ops.imag(x), inverse_norm)
      return math_ops.complex(scaled_real, scaled_imag, name=name)
    return math_ops.multiply(x, inverse_norm, name=name)
def _count_nonzero(input_tensor, dtype=dtypes.int64):
  """Same as math_ops.count_nonzero.

  The reduction is done in dtype, which can be faster for 32-bit dtypes.

  Args:
      input_tensor: numeric tensor
      dtype: reduction dtype

  Returns:
      number of nonzero values with type dtype
  """
  with ops.name_scope("count_nonzero", values=[input_tensor]):
    zero = array_ops.zeros([], dtype=input_tensor.dtype)
    # Mark the nonzero entries, then sum the marks in the requested dtype.
    is_nonzero = math_ops.not_equal(input_tensor, zero)
    marks = math_ops.cast(is_nonzero, dtype=dtype)
    return math_ops.reduce_sum(marks, name="nonzero_count")
@tf_export("math.zero_fraction", "nn.zero_fraction")
@dispatch.add_dispatch_support
def zero_fraction(value, name=None):
  """Returns the fraction of zeros in `value`.

  If `value` is empty, the result is `nan`.

  This is useful in summaries to measure and report sparsity.  For example,

  ```python
      z = tf.nn.relu(...)
      summ = tf.compat.v1.summary.scalar('sparsity', tf.nn.zero_fraction(z))
  ```

  Args:
    value: A tensor of numeric type.
    name: A name for the operation (optional).

  Returns:
    The fraction of zeros in `value`, with type `float32`.
  """
  with ops.name_scope(name, "zero_fraction", [value]):
    value = ops.convert_to_tensor(value, name="value")
    size = array_ops.size(value, out_type=dtypes.int64)

    def _count_in_int32():
      # Reduce in int32, then widen so both branches return int64.
      return math_ops.cast(
          _count_nonzero(value, dtype=dtypes.int32),
          dtype=dtypes.int64)

    def _count_in_int64():
      return _count_nonzero(value, dtype=dtypes.int64)

    # If the count is small, we can save memory/CPU with an int32 reduction.
    num_nonzero = tf_cond.cond(
        size <= dtypes.int32.max,
        true_fn=_count_in_int32,
        false_fn=_count_in_int64)

    with ops.name_scope("counts_to_fraction"):
      zero_count = size - num_nonzero
      zero_count_f32 = math_ops.cast(zero_count, dtype=dtypes.float32)
      size_f32 = math_ops.cast(size, dtype=dtypes.float32)
      fraction = zero_count_f32 / size_f32

    return array_ops.identity(fraction, "fraction")
# pylint: disable=redefined-builtin
@tf_export(v1=["nn.depthwise_conv2d"])
@dispatch.add_dispatch_support
def depthwise_conv2d(input,
                     filter,
                     strides,
                     padding,
                     rate=None,
                     name=None,
                     data_format=None,
                     dilations=None):
  """Depthwise 2-D convolution.

  Given a 4D input tensor ('NHWC' or 'NCHW' data formats)
  and a filter tensor of shape
  `[filter_height, filter_width, in_channels, channel_multiplier]`
  containing `in_channels` convolutional filters of depth 1, `depthwise_conv2d`
  applies a different filter to each input channel (expanding from 1 channel
  to `channel_multiplier` channels for each), then concatenates the results
  together.  The output has `in_channels * channel_multiplier` channels.

  In detail, with the default NHWC format,

      output[b, i, j, k * channel_multiplier + q] = sum_{di, dj}
           filter[di, dj, k, q] * input[b, strides[1] * i + rate[0] * di,
                                           strides[2] * j + rate[1] * dj, k]

  Must have `strides[0] = strides[3] = 1`.  For the most common case of the
  same horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
  If any value in `rate` is greater than 1, we perform atrous depthwise
  convolution, in which case all values in the `strides` tensor must be equal
  to 1.

  Usage Example:

  >>> x = np.array([
  ...     [1., 2.],
  ...     [3., 4.],
  ...     [5., 6.]
  ... ], dtype=np.float32).reshape((1, 3, 2, 1))
  >>> kernel = np.array([
  ...     [1., 2.],
  ...     [3., 4]
  ... ], dtype=np.float32).reshape((2, 1, 1, 2))
  >>> tf.compat.v1.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
  ...                                  padding='VALID').numpy()
  array([[[[10., 14.],
           [14., 20.]],
          [[18., 26.],
           [22., 32.]]]], dtype=float32)

  >>> tf.compat.v1.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
  ...                                  padding=[[0, 0], [1, 0], [1, 0], [0, 0]]
  ...                                 ).numpy()
  array([[[[ 0.,  0.],
           [ 3.,  4.],
           [ 6.,  8.]],
          [[ 0.,  0.],
           [10., 14.],
           [14., 20.]],
          [[ 0.,  0.],
           [18., 26.],
           [22., 32.]]]], dtype=float32)

  Args:
    input: 4-D with shape according to `data_format`.
    filter: 4-D with shape
      `[filter_height, filter_width, in_channels, channel_multiplier]`.
    strides: 1-D of size 4.  The stride of the sliding window for each
      dimension of `input`.
    padding: Controls how to pad the image before applying the convolution. Can
      be the string `"SAME"` or `"VALID"` indicating the type of padding
      algorithm to use, or a list indicating the explicit paddings at the start
      and end of each dimension. When explicit padding is used and data_format
      is `"NHWC"`, this should be in the form `[[0, 0], [pad_top, pad_bottom],
      [pad_left, pad_right], [0, 0]]`. When explicit padding used and
      data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
      [pad_top, pad_bottom], [pad_left, pad_right]]`.
    rate: 1-D of size 2. The dilation rate in which we sample input values
      across the `height` and `width` dimensions in atrous convolution. If it is
      greater than 1, then all values of strides must be 1.
    name: A name for this operation (optional).
    data_format: The data format for input. Either "NHWC" (default) or "NCHW".
    dilations: Alias of rate.

  Returns:
    A 4-D `Tensor` with shape according to `data_format`.  E.g., for
    "NHWC" format, shape is
    `[batch, out_height, out_width, in_channels * channel_multiplier]`.
  """
  # `dilations` and `rate` are two spellings of the same argument.
  rate = deprecated_argument_lookup("dilations", dilations, "rate", rate)

  with ops.name_scope(name, "depthwise", [input, filter]) as name:
    input = ops.convert_to_tensor(input, name="tensor_in")
    filter = ops.convert_to_tensor(filter, name="filter_in")
    rate = [1, 1] if rate is None else rate

    # Use depthwise_conv2d_native if executing on TPU.
    if device_context.enclosing_tpu_context() is not None:
      # Expand the 2-element rate into a 4-element dilation list whose layout
      # follows the data format.
      dilations = ([1, 1, rate[0], rate[1]] if data_format == "NCHW"
                   else [1, rate[0], rate[1], 1])
      return nn_ops.depthwise_conv2d_native(
          input=input,
          filter=filter,
          strides=strides,
          padding=padding,
          data_format=data_format,
          dilations=dilations,
          name=name)

    def _native_depthwise(input_converted, _, padding):
      """Runs the native op on the input handed over by with_space_to_batch."""
      return nn_ops.depthwise_conv2d_native(
          input=input_converted,
          filter=filter,
          strides=strides,
          padding=padding,
          data_format=data_format,
          name=name)

    # Off TPU, dilation is applied by with_space_to_batch around the native
    # op, which is called without a `dilations` argument.
    return nn_ops.with_space_to_batch(
        input=input,
        filter_shape=array_ops.shape(filter),
        dilation_rate=rate,
        padding=padding,
        data_format=data_format,
        op=_native_depthwise)
@tf_export("nn.depthwise_conv2d", v1=[])
@dispatch.add_dispatch_support
def depthwise_conv2d_v2(input,
                        filter,
                        strides,
                        padding,
                        data_format=None,
                        dilations=None,
                        name=None):
  """Depthwise 2-D convolution.

  Takes a 4-D `input` ('NHWC' or 'NCHW' data format) together with a `filter`
  of shape `[filter_height, filter_width, in_channels, channel_multiplier]`
  that holds `in_channels` convolutional filters of depth 1. Every input
  channel is convolved with its own filter, which expands that one channel
  into `channel_multiplier` channels; the per-channel results are then
  concatenated, so the output has `in_channels * channel_multiplier` channels.

  In detail, with the default NHWC format,

      output[b, i, j, k * channel_multiplier + q] =
          sum_{di, dj} filter[di, dj, k, q] *
              input[b, strides[1] * i + dilations[0] * di,
                       strides[2] * j + dilations[1] * dj, k]

  Must have `strides[0] = strides[3] = 1`. For the most common case of the
  same horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
  If any value in `dilations` is greater than 1, an atrous depthwise
  convolution is performed, in which case all values in the `strides` tensor
  must be equal to 1.

  Usage Example:

  >>> x = np.array([
  ...     [1., 2.],
  ...     [3., 4.],
  ...     [5., 6.]
  ... ], dtype=np.float32).reshape((1, 3, 2, 1))
  >>> kernel = np.array([
  ...     [1., 2.],
  ...     [3., 4]
  ... ], dtype=np.float32).reshape((2, 1, 1, 2))
  >>> tf.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
  ...                        padding='VALID').numpy()
  array([[[[10., 14.],
           [14., 20.]],
          [[18., 26.],
           [22., 32.]]]], dtype=float32)

  >>> tf.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
  ...                        padding=[[0, 0], [1, 0], [1, 0], [0, 0]]).numpy()
  array([[[[ 0.,  0.],
           [ 3.,  4.],
           [ 6.,  8.]],
          [[ 0.,  0.],
           [10., 14.],
           [14., 20.]],
          [[ 0.,  0.],
           [18., 26.],
           [22., 32.]]]], dtype=float32)

  Args:
    input: 4-D with shape according to `data_format`.
    filter: 4-D with shape
      `[filter_height, filter_width, in_channels, channel_multiplier]`.
    strides: 1-D of size 4. The stride of the sliding window for each
      dimension of `input`.
    padding: Controls how to pad the image before applying the convolution.
      Either the string `"SAME"` or `"VALID"`, naming the padding algorithm to
      use, or a list giving the explicit paddings at the start and end of each
      dimension. See
      [here](https://www.tensorflow.org/api_docs/python/tf/nn#notes_on_padding_2)
      for more information. With explicit padding and data_format `"NHWC"`,
      the list has the form `[[0, 0], [pad_top, pad_bottom],
      [pad_left, pad_right], [0, 0]]`. With explicit padding and data_format
      `"NCHW"`, it has the form `[[0, 0], [0, 0], [pad_top, pad_bottom],
      [pad_left, pad_right]]`.
    data_format: The data format for input. Either "NHWC" (default) or "NCHW".
    dilations: 1-D of size 2. The dilation rate in which we sample input values
      across the `height` and `width` dimensions in atrous convolution. If it
      is greater than 1, then all values of strides must be 1.
    name: A name for this operation (optional).

  Returns:
    A 4-D `Tensor` with shape according to `data_format`.  E.g., for
    "NHWC" format, shape is
    `[batch, out_height, out_width, in_channels * channel_multiplier].`
  """
  # The v1 implementation names the dilation argument `rate`; everything else
  # is forwarded unchanged.
  return depthwise_conv2d(
      input=input,
      filter=filter,
      strides=strides,
      padding=padding,
      data_format=data_format,
      rate=dilations,
      name=name)
# pylint: enable=redefined-builtin
# pylint: disable=redefined-builtin,line-too-long
@tf_export(v1=["nn.separable_conv2d"])
@dispatch.add_dispatch_support
def separable_conv2d(input,
                     depthwise_filter,
                     pointwise_filter,
                     strides,
                     padding,
                     rate=None,
                     name=None,
                     data_format=None,
                     dilations=None):
  """2-D convolution with separable filters.

  Runs a depthwise convolution, which acts on every channel separately, and
  then a pointwise convolution, which mixes the channels. Note that this is
  separability between dimensions `[1, 2]` and `3`, not spatial separability
  between dimensions `1` and `2`.

  In detail, with the default NHWC format,

      output[b, i, j, k] = sum_{di, dj, q, r}
          input[b, strides[1] * i + di, strides[2] * j + dj, q] *
          depthwise_filter[di, dj, q, r] *
          pointwise_filter[0, 0, q * channel_multiplier + r, k]

  `strides` applies to the depthwise convolution only; the pointwise
  convolution always uses strides of `[1, 1, 1, 1]`. Must have
  `strides[0] = strides[3] = 1`. For the most common case of the same
  horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
  If any value in `rate` is greater than 1, an atrous depthwise convolution is
  performed, in which case all values in the `strides` tensor must be equal
  to 1.

  Args:
    input: 4-D `Tensor` with shape according to `data_format`.
    depthwise_filter: 4-D `Tensor` with shape
      `[filter_height, filter_width, in_channels, channel_multiplier]`.
      Contains `in_channels` convolutional filters of depth 1.
    pointwise_filter: 4-D `Tensor` with shape
      `[1, 1, channel_multiplier * in_channels, out_channels]`.  Pointwise
      filter to mix channels after `depthwise_filter` has convolved spatially.
    strides: 1-D of size 4.  The strides for the depthwise convolution for
      each dimension of `input`.
    padding: Controls how to pad the image before applying the depthwise
      convolution. Either the string `"SAME"` or `"VALID"`, naming the padding
      algorithm to use, or a Python list giving the explicit paddings at the
      start and end of each dimension. With explicit padding and data_format
      `"NHWC"`, the list has the form `[[0, 0], [pad_top, pad_bottom],
      [pad_left, pad_right], [0, 0]]`. With explicit padding and data_format
      `"NCHW"`, it has the form `[[0, 0], [0, 0], [pad_top, pad_bottom],
      [pad_left, pad_right]]`.
    rate: 1-D of size 2. The dilation rate in which we sample input values
      across the `height` and `width` dimensions in atrous convolution. If it
      is greater than 1, then all values of strides must be 1.
    name: A name for this operation (optional).
    data_format: The data format for input. Either "NHWC" (default) or "NCHW".
    dilations: Alias of rate.

  Returns:
    A 4-D `Tensor` with shape according to 'data_format'. For
      example, with data_format="NHWC", shape is [batch, out_height,
      out_width, out_channels].
  """
  rate = deprecated_argument_lookup("dilations", dilations, "rate", rate)
  scope_values = [input, depthwise_filter, pointwise_filter]
  with ops.name_scope(name, "separable_conv2d", scope_values) as name:
    input = ops.convert_to_tensor(input, name="tensor_in")
    depthwise_filter = ops.convert_to_tensor(
        depthwise_filter, name="depthwise_filter")
    pointwise_filter = ops.convert_to_tensor(
        pointwise_filter, name="pointwise_filter")

    # The pointwise kernel is checked against rank 4, and its two leading
    # (spatial) dimensions must be compatible with 1.
    pointwise_shape = pointwise_filter.get_shape().with_rank(4)
    for spatial_dim in pointwise_shape.dims[:2]:
      spatial_dim.assert_is_compatible_with(1)

    # No dilation unless the caller asked for one.
    dilation_rate = [1, 1] if rate is None else rate

    # The layout of the ops in the graph are expected to be as follows:
    # depthwise_conv2d  // Conv2D op corresponding to native depthwise conv.
    # separable_conv2d  // Conv2D op corresponding to the pointwise conv.

    def _depthwise_op(input_converted, _, padding):
      # Callback handed to with_space_to_batch; the second argument is unused.
      return nn_ops.depthwise_conv2d_native(
          input=input_converted,
          filter=depthwise_filter,
          strides=strides,
          padding=padding,
          data_format=data_format,
          name="depthwise")

    depthwise_out = nn_ops.with_space_to_batch(
        input=input,
        filter_shape=array_ops.shape(depthwise_filter),
        dilation_rate=dilation_rate,
        padding=padding,
        data_format=data_format,
        op=_depthwise_op)

    # Pointwise step: unit strides, no extra padding.
    return nn_ops.conv2d(
        depthwise_out,
        pointwise_filter, [1, 1, 1, 1],
        padding="VALID",
        data_format=data_format,
        name=name)
@tf_export("nn.separable_conv2d", v1=[])
@dispatch.add_dispatch_support
def separable_conv2d_v2(
    input,
    depthwise_filter,
    pointwise_filter,
    strides,
    padding,
    data_format=None,
    dilations=None,
    name=None,
):
  """2-D convolution with separable filters.

  Runs a depthwise convolution, which acts on every channel separately, and
  then a pointwise convolution, which mixes the channels. Note that this is
  separability between dimensions `[1, 2]` and `3`, not spatial separability
  between dimensions `1` and `2`.

  In detail, with the default NHWC format,

      output[b, i, j, k] = sum_{di, dj, q, r}
          input[b, strides[1] * i + di, strides[2] * j + dj, q] *
          depthwise_filter[di, dj, q, r] *
          pointwise_filter[0, 0, q * channel_multiplier + r, k]

  `strides` applies to the depthwise convolution only; the pointwise
  convolution always uses strides of `[1, 1, 1, 1]`. Must have
  `strides[0] = strides[3] = 1`. For the most common case of the same
  horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
  If any value in `dilations` is greater than 1, an atrous depthwise
  convolution is performed, in which case all values in the `strides` tensor
  must be equal to 1.

  Args:
    input: 4-D `Tensor` with shape according to `data_format`.
    depthwise_filter: 4-D `Tensor` with shape `[filter_height, filter_width,
      in_channels, channel_multiplier]`. Contains `in_channels` convolutional
      filters of depth 1.
    pointwise_filter: 4-D `Tensor` with shape `[1, 1, channel_multiplier *
      in_channels, out_channels]`.  Pointwise filter to mix channels after
      `depthwise_filter` has convolved spatially.
    strides: 1-D of size 4.  The strides for the depthwise convolution for each
      dimension of `input`.
    padding: Controls how to pad the image before applying the depthwise
      convolution. Either the string `"SAME"` or `"VALID"`, naming the padding
      algorithm to use, or a Python list giving the explicit paddings at the
      start and end of each dimension. With explicit padding and data_format
      `"NHWC"`, the list has the form `[[0, 0], [pad_top, pad_bottom],
      [pad_left, pad_right], [0, 0]]`. With explicit padding and data_format
      `"NCHW"`, it has the form `[[0, 0], [0, 0], [pad_top, pad_bottom],
      [pad_left, pad_right]]`.
    data_format: The data format for input. Either "NHWC" (default) or "NCHW".
    dilations: 1-D of size 2. The dilation rate in which we sample input values
      across the `height` and `width` dimensions in atrous convolution. If it
      is greater than 1, then all values of strides must be 1.
    name: A name for this operation (optional).

  Returns:
    A 4-D `Tensor` with shape according to 'data_format'. For
      example, with data_format="NHWC", shape is [batch, out_height,
      out_width, out_channels].
  """
  # Forward to the v1 implementation, which names the dilation argument
  # `rate`.
  return separable_conv2d(
      input,
      depthwise_filter,
      pointwise_filter,
      strides,
      padding,
      data_format=data_format,
      rate=dilations,
      name=name)
# pylint: enable=redefined-builtin,line-too-long
@tf_export(v1=["nn.sufficient_statistics"])
@dispatch.add_dispatch_support
def sufficient_statistics(x, axes, shift=None, keep_dims=None, name=None,
                          keepdims=None):
  """Calculate the sufficient statistics for the mean and variance of `x`.

  The statistics are gathered with the one pass algorithm on an input that is
  optionally shifted. See:
  https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data

  For example:
  >>> t = [[1, 2, 3], [4, 5, 6]]
  >>> sufficient_statistics(t, [1])
  (<tf.Tensor: shape=(), dtype=int32, numpy=3>, <tf.Tensor: shape=(2,),
  dtype=int32, numpy=array([ 6, 15], dtype=int32)>, <tf.Tensor: shape=(2,),
  dtype=int32, numpy=array([14, 77], dtype=int32)>, None)
  >>> sufficient_statistics(t, [-1])
  (<tf.Tensor: shape=(), dtype=int32, numpy=3>, <tf.Tensor: shape=(2,),
  dtype=int32, numpy=array([ 6, 15], dtype=int32)>, <tf.Tensor: shape=(2,),
  dtype=int32, numpy=array([14, 77], dtype=int32)>, None)

  Args:
    x: A `Tensor`.
    axes: Array of ints. Axes along which to compute mean and variance.
      Repeated entries are collapsed before use. As in Python, the axes can
      also be negative numbers. A negative axis is interpreted as counting
      from the end of the rank, i.e., axis + rank(values)-th dimension.
    shift: A `Tensor` containing the value by which to shift the data for
      numerical stability, or `None` if no shift is to be performed. A shift
      close to the true mean provides the most numerically stable results.
    keep_dims: produce statistics with the same dimensionality as the input.
      Treated as `False` when left as `None`.
    name: Name used to scope the operations that compute the sufficient stats.
    keepdims: Alias for keep_dims.

  Returns:
    Four `Tensor` objects of the same type as `x`:

    * the count (number of elements to average over).
    * the (possibly shifted) sum of the elements in the array.
    * the (possibly shifted) sum of squares of the elements in the array.
    * the shift by which the mean must be corrected or None if `shift` is None.
  """
  axes = list(set(axes))
  keep_dims = deprecated_argument_lookup(
      "keepdims", keepdims, "keep_dims", keep_dims)
  keep_dims = False if keep_dims is None else keep_dims
  with ops.name_scope(name, "sufficient_statistics", [x, shift]):
    x = ops.convert_to_tensor(x, name="x")
    x_shape = x.get_shape()

    # The count can be folded into a constant only when the rank and every
    # reduced dimension are known statically.
    count_is_static = x_shape.rank is not None and all(
        x_shape.dims[axis].value is not None for axis in axes)
    if count_is_static:
      static_count = 1
      for axis in axes:
        static_count *= x_shape.dims[axis].value
      counts = constant_op.constant(static_count, dtype=x.dtype)
    else:
      # The shape has to be read at runtime. gather needs non-negative
      # indices, so negative axes are offset by the (dynamic) rank.
      rank = array_ops.rank(x)
      non_negative_axes = []
      for axis in axes:
        non_negative_axes.append(axis + rank if axis < 0 else axis)
      dim_sizes = math_ops.cast(array_ops.shape(x), x.dtype)
      x_dims = array_ops.gather(dim_sizes, non_negative_axes)
      counts = math_ops.reduce_prod(x_dims, name="count")

    if shift is None:
      # No shift: accumulate the raw values and their squares.
      m_ss = x
      v_ss = math_ops.square(x)
    else:
      shift = ops.convert_to_tensor(shift, name="shift")
      m_ss = math_ops.subtract(x, shift)
      v_ss = math_ops.squared_difference(x, shift)

    m_ss = math_ops.reduce_sum(m_ss, axes, keepdims=keep_dims, name="mean_ss")
    v_ss = math_ops.reduce_sum(v_ss, axes, keepdims=keep_dims, name="var_ss")
  return counts, m_ss, v_ss, shift
@tf_export("nn.sufficient_statistics", v1=[])
@dispatch.add_dispatch_support
def sufficient_statistics_v2(x, axes, shift=None, keepdims=False, name=None):
  """Calculate the sufficient statistics for the mean and variance of `x`.

  The statistics are gathered with the one pass algorithm on an input that is
  optionally shifted. See:
  https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data

  Args:
    x: A `Tensor`.
    axes: Array of ints. Axes along which to compute mean and variance.
    shift: A `Tensor` containing the value by which to shift the data for
      numerical stability, or `None` if no shift is to be performed. A shift
      close to the true mean provides the most numerically stable results.
    keepdims: produce statistics with the same dimensionality as the input.
    name: Name used to scope the operations that compute the sufficient stats.

  Returns:
    Four `Tensor` objects of the same type as `x`:

    * the count (number of elements to average over).
    * the (possibly shifted) sum of the elements in the array.
    * the (possibly shifted) sum of squares of the elements in the array.
    * the shift by which the mean must be corrected or None if `shift` is None.
  """
  # Forward to the v1 implementation, which spells the flag `keep_dims`.
  return sufficient_statistics(
      x=x,
      axes=axes,
      shift=shift,
      name=name,
      keep_dims=keepdims)
@tf_export("nn.normalize_moments")
@dispatch.add_dispatch_support
def normalize_moments(counts, mean_ss, variance_ss, shift, name=None):
  """Calculate the mean and variance based on the sufficient statistics.

  The mean is `mean_ss / counts` (plus `shift` when one is given) and the
  variance is `variance_ss / counts` minus the square of the shifted mean.

  Args:
    counts: A `Tensor` containing the total count of the data (one value).
    mean_ss: A `Tensor` containing the mean sufficient statistics: the (possibly
      shifted) sum of the elements to average over.
    variance_ss: A `Tensor` containing the variance sufficient statistics: the
      (possibly shifted) squared sum of the data to compute the variance over.
    shift: A `Tensor` containing the value by which the data is shifted for
      numerical stability, or `None` if no shift was performed.
    name: Name used to scope the operations that compute the moments.

  Returns:
    Two `Tensor` objects: `mean` and `variance`.
  """
  with ops.name_scope(name, "normalize", [counts, mean_ss, variance_ss, shift]):
    inv_count = math_ops.reciprocal(counts, name="divisor")
    if shift is None:
      # Without a shift the scaled sum already is the mean.
      shifted_mean = math_ops.multiply(mean_ss, inv_count, name="mean")
      mean = shifted_mean
    else:
      shifted_mean = math_ops.multiply(
          mean_ss, inv_count, name="shifted_mean")
      mean = math_ops.add(shifted_mean, shift, name="mean")
    # The variance is computed from the shifted quantities in both branches.
    mean_of_squares = math_ops.multiply(variance_ss, inv_count)
    square_of_mean = math_ops.square(shifted_mean)
    variance = math_ops.subtract(
        mean_of_squares, square_of_mean, name="variance")
  return (mean, variance)
@tf_export(v1=["nn.moments"])
@dispatch.add_dispatch_support
def moments(
    x,
    axes,
    shift=None,  # pylint: disable=unused-argument
    name=None,
    keep_dims=None,
    keepdims=None):
  """Calculate the mean and variance of `x`.

  Both moments are obtained by aggregating the contents of `x` across `axes`.
  If `x` is 1-D and `axes = [0]` this is just the mean and variance of a
  vector.

  Note: shift is currently not used; the true mean is computed and used.

  When using these moments for batch normalization (see
  `tf.nn.batch_normalization`):

   * for so-called "global normalization", used with convolutional filters with
     shape `[batch, height, width, depth]`, pass `axes=[0, 1, 2]`.
   * for simple batch normalization pass `axes=[0]` (batch only).

  Args:
    x: A `Tensor`.
    axes: Array of ints.  Axes along which to compute mean and
      variance.
    shift: Not used in the current implementation
    name: Name used to scope the operations that compute the moments.
    keep_dims: produce moments with the same dimensionality as the input.
      Treated as `False` when left as `None`.
    keepdims: Alias to keep_dims.

  Returns:
    Two `Tensor` objects: `mean` and `variance`.
  """
  keep_dims = deprecated_argument_lookup(
      "keepdims", keepdims, "keep_dims", keep_dims)
  keep_dims = False if keep_dims is None else keep_dims
  with ops.name_scope(name, "moments", [x, axes]):
    # The dynamic range of fp16 is too limited to support the collection of
    # sufficient statistics. As a workaround the arithmetic runs on 32-bit
    # floats and the results are converted back to fp16 at the end.
    is_half = x.dtype == dtypes.float16
    if is_half:
      y = math_ops.cast(x, dtypes.float32)
    else:
      y = x
    # Reduce with keepdims=True so the mean broadcasts against `y` below.
    mean = math_ops.reduce_mean(y, axes, keepdims=True, name="mean")
    # Sample variance, not unbiased variance.
    # Note: stop_gradient does not change the gradient that gets
    #       backpropagated to the mean from the variance calculation,
    #       because that gradient is zero.
    detached_mean = array_ops.stop_gradient(mean)
    squared_deviation = math_ops.squared_difference(y, detached_mean)
    variance = math_ops.reduce_mean(
        squared_deviation, axes, keepdims=True, name="variance")
    if not keep_dims:
      mean = array_ops.squeeze(mean, axes)
      variance = array_ops.squeeze(variance, axes)
    if is_half:
      return (math_ops.cast(mean, dtypes.float16),
              math_ops.cast(variance, dtypes.float16))
    return (mean, variance)
@tf_export("nn.moments", v1=[])
@dispatch.add_dispatch_support
def moments_v2(
    x,
    axes,
    shift=None,
    keepdims=False,
    name=None):
  """Calculates the mean and variance of `x`.

  Both moments are obtained by aggregating the contents of `x` across `axes`.
  If `x` is 1-D and `axes = [0]` this is just the mean and variance of a
  vector.

  Note: shift is currently not used; the true mean is computed and used.

  When using these moments for batch normalization (see
  `tf.nn.batch_normalization`):

   * for so-called "global normalization", used with convolutional filters with
     shape `[batch, height, width, depth]`, pass `axes=[0, 1, 2]`.
   * for simple batch normalization pass `axes=[0]` (batch only).

  Args:
    x: A `Tensor`.
    axes: Array of ints.  Axes along which to compute mean and
      variance.
    shift: Not used in the current implementation.
    keepdims: produce moments with the same dimensionality as the input.
    name: Name used to scope the operations that compute the moments.

  Returns:
    Two `Tensor` objects: `mean` and `variance`.
  """
  # Forward to the v1 implementation, which spells the flag `keep_dims`.
  return moments(
      x=x,
      axes=axes,
      shift=shift,
      keep_dims=keepdims,
      name=name)
@tf_export(v1=["nn.weighted_moments"])
@dispatch.add_dispatch_support
def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=None,
                     keepdims=None):
  """Returns the frequency-weighted mean and variance of `x`.

  Args:
    x: A tensor.
    axes: 1-d tensor of int32 values; these are the axes along which
      to compute mean and variance.
    frequency_weights: A tensor of positive weights which can be
      broadcast with x.
    name: Name used to scope the operation.
    keep_dims: Produce moments with the same dimensionality as the input.
      Treated as `False` when left as `None`.
    keepdims: Alias of keep_dims.

  Returns:
    Two tensors: `weighted_mean` and `weighted_variance`.
  """
  keep_dims = deprecated_argument_lookup(
      "keepdims", keepdims, "keep_dims", keep_dims)
  keep_dims = False if keep_dims is None else keep_dims
  with ops.name_scope(name, "weighted_moments", [x, frequency_weights, axes]):
    x = ops.convert_to_tensor(x, name="x")
    frequency_weights = ops.convert_to_tensor(
        frequency_weights, name="frequency_weights")

    # Unlike moments(), this just uses a simpler two-pass method.

    # See comment in moments() WRT precision; it applies here too.
    needs_cast = x.dtype == dtypes.float16
    if needs_cast:
      x = math_ops.cast(x, dtypes.float32)

    # Bring the weights to the (possibly widened) dtype of x.
    if frequency_weights.dtype != x.dtype:
      frequency_weights = math_ops.cast(frequency_weights, x.dtype)

    # Every reduction below keeps the reduced dims regardless of `keep_dims`,
    # so that the results remain broadcast-compatible with the inputs.
    weighted_x = frequency_weights * x
    weighted_input_sum = math_ops.reduce_sum(
        weighted_x, axes, name="weighted_input_sum", keepdims=True)

    # The weights only need to be broadcast-compatible with x, not the same
    # shape. Adding zeros shaped like x expands them to one weight per item
    # (the shape of frequency_weights * x), which avoids having to reason
    # through the broadcast logic to compute a correct sum_of_weights.
    zeros = array_ops.zeros_like(x)
    broadcasted_weights = frequency_weights + zeros
    sum_of_weights = math_ops.reduce_sum(
        broadcasted_weights, axes, name="sum_of_weights", keepdims=True)

    weighted_mean = math_ops.div_no_nan(weighted_input_sum, sum_of_weights)

    # Second pass: weighted squared distance from the weighted mean.
    squared_deviation = math_ops.squared_difference(x, weighted_mean)
    weighted_distsq = math_ops.reduce_sum(
        frequency_weights * squared_deviation,
        axes,
        name="weighted_distsq",
        keepdims=True)
    weighted_variance = math_ops.div_no_nan(weighted_distsq, sum_of_weights)

    if not keep_dims:
      weighted_mean = array_ops.squeeze(weighted_mean, axis=axes)
      weighted_variance = array_ops.squeeze(weighted_variance, axis=axes)

    # Undo the float32 widening for float16 inputs.
    if needs_cast:
      weighted_mean = math_ops.cast(weighted_mean, dtypes.float16)
      weighted_variance = math_ops.cast(weighted_variance, dtypes.float16)

    return weighted_mean, weighted_variance
@tf_export("nn.weighted_moments", v1=[])
@dispatch.add_dispatch_support
def weighted_moments_v2(x, axes, frequency_weights, keepdims=False, name=None):
  """Returns the frequency-weighted mean and variance of `x`.

  Args:
    x: A tensor.
    axes: 1-d tensor of int32 values; these are the axes along which
      to compute mean and variance.
    frequency_weights: A tensor of positive weights which can be
      broadcast with x.
    keepdims: Produce moments with the same dimensionality as the input.
    name: Name used to scope the operation.

  Returns:
    Two tensors: `weighted_mean` and `weighted_variance`.
  """
  # Forward to the v1 implementation, which spells the flag `keep_dims`.
  return weighted_moments(
      x=x,
      axes=axes,
      frequency_weights=frequency_weights,
      keep_dims=keepdims,
      name=name)
@tf_export("nn.batch_normalization")
@dispatch.add_dispatch_support
def batch_normalization(x,
                        mean,
                        variance,
                        offset,
                        scale,
                        variance_epsilon,
                        name=None):
  r"""Batch normalization.

  Normalizes a tensor by `mean` and `variance`, and applies (optionally) a
  `scale` \\(\gamma\\) to it, as well as an `offset` \\(\beta\\):

  \\(\frac{\gamma(x-\mu)}{\sigma}+\beta\\)

  In this implementation \\(\sigma\\) is `sqrt(variance + variance_epsilon)`.

  `mean`, `variance`, `offset` and `scale` are all expected to be of one of two
  shapes:

    * In all generality, they can have the same number of dimensions as the
      input `x`, with identical sizes as `x` for the dimensions that are not
      normalized over (the 'depth' dimension(s)), and dimension 1 for the
      others which are being normalized over.
      `mean` and `variance` in this case would typically be the outputs of
      `tf.nn.moments(..., keepdims=True)` during training, or running averages
      thereof during inference.
    * In the common case where the 'depth' dimension is the last dimension in
      the input tensor `x`, they may be one dimensional tensors of the same
      size as the 'depth' dimension.
      This is the case for example for the common `[batch, depth]` layout of
      fully-connected layers, and `[batch, height, width, depth]` for
      convolutions.
      `mean` and `variance` in this case would typically be the outputs of
      `tf.nn.moments(..., keepdims=False)` during training, or running averages
      thereof during inference.

  See equation 11 in Algorithm 2 of source:
  [Batch Normalization: Accelerating Deep Network Training by
  Reducing Internal Covariate Shift; S. Ioffe, C. Szegedy]
  (http://arxiv.org/abs/1502.03167).

  Args:
    x: Input `Tensor` of arbitrary dimensionality.
    mean: A mean `Tensor`.
    variance: A variance `Tensor`.
    offset: An offset `Tensor`, often denoted \\(\beta\\) in equations, or
      None. If present, will be added to the normalized tensor.
    scale: A scale `Tensor`, often denoted \\(\gamma\\) in equations, or
      `None`. If present, the scale is applied to the normalized tensor.
    variance_epsilon: A small float number to avoid dividing by 0.
    name: A name for this operation (optional).

  Returns:
    the normalized, scaled, offset tensor. The multiplier and the additive
    term are both cast to `x.dtype` before being combined with `x`.

  References:
    Batch Normalization - Accelerating Deep Network Training by Reducing
    Internal Covariate Shift:
      [Ioffe et al., 2015](http://arxiv.org/abs/1502.03167)
      ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
  """
  with ops.name_scope(name, "batchnorm", [x, mean, variance, scale, offset]):
    # Reciprocal square root of (variance + epsilon); when a scale is given it
    # is folded into this multiplier so it is applied only once.
    inv = math_ops.rsqrt(variance + variance_epsilon)
    if scale is not None:
      inv *= scale
    # Note: tensorflow/contrib/quantize/python/fold_batch_norms.py depends on
    # the precise order of ops that are generated by the expression below.
    # The expression is x * inv + (offset - mean * inv); without an offset the
    # additive term is just -mean * inv.
    return x * math_ops.cast(inv, x.dtype) + math_ops.cast(
        offset - mean * inv if offset is not None else -mean * inv, x.dtype)
@tf_export(v1=["nn.fused_batch_norm"])
@dispatch.add_dispatch_support
def fused_batch_norm(
    x,
    scale,
    offset,  # pylint: disable=invalid-name
    mean=None,
    variance=None,
    epsilon=0.001,
    data_format="NHWC",
    is_training=True,
    name=None,
    exponential_avg_factor=1.0):
  r"""Batch normalization.

  See Source: [Batch Normalization: Accelerating Deep Network Training by
  Reducing Internal Covariate Shift; S. Ioffe, C. Szegedy]
  (http://arxiv.org/abs/1502.03167).

  Args:
    x: Input `Tensor` of 4 or 5 dimensions.
    scale: A `Tensor` of 1 dimension for scaling.
    offset: A `Tensor` of 1 dimension for bias.
    mean: A `Tensor` of 1 dimension for population mean. The shape and meaning
          of this argument depends on the value of is_training and
          exponential_avg_factor as follows:
          is_training==False (inference):
            Mean must be a `Tensor` of the same shape as scale containing the
            estimated population mean computed during training.
          is_training==True and exponential_avg_factor == 1.0:
            Mean must be None.
          is_training==True and exponential_avg_factor != 1.0:
            Mean must be a `Tensor` of the same shape as scale containing the
            exponential running mean.
    variance: A `Tensor` of 1 dimension for population variance. The shape and
          meaning of this argument depends on the value of is_training and
          exponential_avg_factor as follows:
          is_training==False (inference):
            Variance must be a `Tensor` of the same shape as scale containing
            the estimated population variance computed during training.
          is_training==True and exponential_avg_factor == 1.0:
            Variance must be None.
          is_training==True and exponential_avg_factor != 1.0:
            Variance must be a `Tensor` of the same shape as scale containing
            the exponential running variance.
    epsilon: A small float number added to the variance of x.
    data_format: The data format for x. Support "NHWC" (default) or "NCHW" for
                 4D tensors and "NDHWC" or "NCDHW" for 5D tensors.
    is_training: A bool value to specify if the operation is used for
                 training or inference.
    name: A name for this operation (optional).
    exponential_avg_factor: A float number (usually between 0 and 1) used
                            for controlling the decay of the running
                            population average of mean and variance.
                            If set to 1.0, the current batch average is
                            returned.

  Returns:
    y: A 4D or 5D Tensor for the normalized, scaled, offset x.
    running_mean: A 1D Tensor for the exponential running mean of x.
                  The output value is ((1 - exponential_avg_factor) * mean +
                  exponential_avg_factor * batch_mean), where batch_mean
                  is the mean of the current batch in x.
    running_var: A 1D Tensor for the exponential running variance.
                 The output value is ((1 - exponential_avg_factor) * variance +
                 exponential_avg_factor * batch_variance), where batch_variance
                 is the variance of the current batch in x.

  Raises:
    ValueError: If `mean` or `variance` is None while `is_training` is False
      or `exponential_avg_factor` is not 1.0.

  References:
    Batch Normalization - Accelerating Deep Network Training by Reducing
    Internal Covariate Shift:
      [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html)
      ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
  """
  # Stored statistics are mandatory for inference and whenever the running
  # average blends with previous values (factor other than 1.0).
  stats_required = (not is_training) or exponential_avg_factor != 1.0
  stats_missing = mean is None or variance is None
  if stats_required and stats_missing:
    raise ValueError("Both `mean` and `variance` must be a 1D tensor when "
                     "`is_training` is False or `exponential_avg_factor` != "
                     f"1.0. Received: `mean` {mean!r} and `variance` "
                     f"{variance!r}")

  x = ops.convert_to_tensor(x, name="input")
  scale = ops.convert_to_tensor(scale, name="scale")
  offset = ops.convert_to_tensor(offset, name="offset")

  # The fused op always takes mean/variance inputs; absent statistics are
  # replaced by empty constants.
  if mean is None:
    mean = constant_op.constant([])
  if variance is None:
    variance = constant_op.constant([])

  # Only the first three of the op's six outputs are returned to the caller.
  y, running_mean, running_var, _, _, _ = gen_nn_ops.fused_batch_norm_v3(
      x,
      scale,
      offset,
      mean,
      variance,
      epsilon=epsilon,
      exponential_avg_factor=exponential_avg_factor,
      data_format=data_format,
      is_training=is_training,
      name=name)
  return y, running_mean, running_var
@tf_export(v1=["nn.batch_norm_with_global_normalization"])
@dispatch.add_dispatch_support
def batch_norm_with_global_normalization(t=None,
                                         m=None,
                                         v=None,
                                         beta=None,
                                         gamma=None,
                                         variance_epsilon=None,
                                         scale_after_normalization=None,
                                         name=None,
                                         input=None,  # pylint: disable=redefined-builtin
                                         mean=None,
                                         variance=None):
  """Batch normalization.

  This op is deprecated. See `tf.nn.batch_normalization`.

  Args:
    t: A 4D input Tensor.
    m: A 1D mean Tensor with size matching the last dimension of t.
      This is the first output from tf.nn.moments,
      or a saved moving average thereof.
    v: A 1D variance Tensor with size matching the last dimension of t.
      This is the second output from tf.nn.moments,
      or a saved moving average thereof.
    beta: A 1D beta Tensor with size matching the last dimension of t.
      An offset to be added to the normalized tensor.
    gamma: A 1D gamma Tensor with size matching the last dimension of t.
      If "scale_after_normalization" is true, this tensor will be multiplied
      with the normalized tensor.
    variance_epsilon: A small float number to avoid dividing by 0.
    scale_after_normalization: A bool indicating whether the resulted tensor
      needs to be multiplied with gamma.
    name: A name for this operation (optional).
    input: Alias for t.
    mean: Alias for m.
    variance: Alias for v.

  Returns:
     A batch-normalized `t`.

  References:
    Batch Normalization - Accelerating Deep Network Training by Reducing
    Internal Covariate Shift:
      [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html)
      ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
  """
  # Resolve each short v1 name against its longer alias.
  t = deprecated_argument_lookup("input", input, "t", t)
  m = deprecated_argument_lookup("mean", mean, "m", m)
  v = deprecated_argument_lookup("variance", variance, "v", v)
  # gamma is applied only when scaling after normalization is requested.
  if scale_after_normalization:
    effective_scale = gamma
  else:
    effective_scale = None
  return batch_normalization(
      t, m, v, beta, effective_scale, variance_epsilon, name)
# pylint: disable=redefined-builtin,line-too-long
@tf_export("nn.batch_norm_with_global_normalization", v1=[])
@dispatch.add_dispatch_support
def batch_norm_with_global_normalization_v2(input,
                                            mean,
                                            variance,
                                            beta,
                                            gamma,
                                            variance_epsilon,
                                            scale_after_normalization,
                                            name=None):
  """Batch normalization.

  This op is deprecated. See `tf.nn.batch_normalization`.

  Args:
    input: A 4D input Tensor.
    mean: A 1D mean Tensor with size matching the last dimension of `input`.
      This is the first output from tf.nn.moments,
      or a saved moving average thereof.
    variance: A 1D variance Tensor with size matching the last dimension of
      `input`. This is the second output from tf.nn.moments,
      or a saved moving average thereof.
    beta: A 1D beta Tensor with size matching the last dimension of `input`.
      An offset to be added to the normalized tensor.
    gamma: A 1D gamma Tensor with size matching the last dimension of `input`.
      If "scale_after_normalization" is true, this tensor will be multiplied
      with the normalized tensor.
    variance_epsilon: A small float number to avoid dividing by 0.
    scale_after_normalization: A bool indicating whether the resulted tensor
      needs to be multiplied with gamma.
    name: A name for this operation (optional).

  Returns:
     A batch-normalized `input`.

  References:
    Batch Normalization - Accelerating Deep Network Training by Reducing
    Internal Covariate Shift:
      [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html)
      ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
  """
  # Forward to the v1 implementation, which uses the short names t / m / v.
  return batch_norm_with_global_normalization(
      t=input,
      m=mean,
      v=variance,
      beta=beta,
      gamma=gamma,
      scale_after_normalization=scale_after_normalization,
      variance_epsilon=variance_epsilon,
      name=name)
# pylint: enable=redefined-builtin,line-too-long
def _sum_rows(x):
  """Returns a vector holding the sum of each row of the matrix `x`.

  For a matrix this is equivalent to `math_ops.reduce_sum(x, 1)`. It is
  implemented as a product with a column of ones because the gradient of that
  formulation is more efficient than reduce_sum's gradient in today's
  implementation, and nce_loss() is mostly used for training.

  Args:
    x: A 2-D `Tensor`.

  Returns:
    A 1-D `Tensor` with one entry per row of `x`.
  """
  num_cols = array_ops.shape(x)[1]
  # A [num_cols, 1] column of ones with the same dtype as x.
  ones_column = array_ops.ones(array_ops_stack.stack([num_cols, 1]), x.dtype)
  # x @ ones_column is [num_rows, 1]; flatten it into a vector.
  row_sums = math_ops.matmul(x, ones_column)
  return array_ops.reshape(row_sums, [-1])
def _compute_sampled_logits(weights,
                            biases,
                            labels,
                            inputs,
                            num_sampled,
                            num_classes,
                            num_true=1,
                            sampled_values=None,
                            subtract_log_q=True,
                            remove_accidental_hits=False,
                            partition_strategy="mod",
                            name=None,
                            seed=None):
  """Helper function for nce_loss and sampled_softmax_loss functions.

  Computes sampled output training logits and labels suitable for implementing
  e.g. noise-contrastive estimation (see nce_loss) or sampled softmax (see
  sampled_softmax_loss).

  Note: In the case where num_true > 1, we assign to each target class
  the target probability 1 / num_true so that the target probabilities
  sum to 1 per-example.

  Note: Looked-up weights and biases, the accidental-hit weights and the
  log expected counts are all cast to the dtype of the logits (i.e. the dtype
  of `inputs`) whenever their dtype differs from it.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
        objects whose concatenation along dimension 0 has shape
        `[num_classes, dim]`.  The (possibly-partitioned) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The (possibly-partitioned)
        class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size,
        num_true]`. The target classes.  Note that this format differs from
        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
        activations of the input network.
    num_sampled: An `int`.  The number of classes to randomly sample per batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
        (if None, we default to `log_uniform_candidate_sampler`)
    subtract_log_q: A `bool`.  whether to subtract the log expected count of
        the labels in the sample to get the logits of the true labels.
        Default is True.  Turn off for Negative Sampling.
    remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
        where a sampled class equals one of the target classes.  Default is
        False.
    partition_strategy: A string specifying the partitioning strategy, relevant
        if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
        Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
    name: A name for the operation (optional).
    seed: random seed for candidate sampling. Default to None, which doesn't set
        the op-level random seed for candidate sampling.

  Returns:
    out_logits: `Tensor` object with shape
        `[batch_size, num_true + num_sampled]`, for passing to either
        `nn.sigmoid_cross_entropy_with_logits` (NCE) or
        `nn.softmax_cross_entropy_with_logits` (sampled softmax).
    out_labels: A Tensor object with the same shape as `out_logits`.
  """

  # Normalize `weights` to a plain list of shards so it can be concatenated
  # with the other name-scope values and handed to embedding_lookup.
  if isinstance(weights, variables.PartitionedVariable):
    weights = list(weights)
  if not isinstance(weights, list):
    weights = [weights]

  with ops.name_scope(name, "compute_sampled_logits",
                      weights + [biases, inputs, labels]):
    if labels.dtype != dtypes.int64:
      labels = math_ops.cast(labels, dtypes.int64)
    labels_flat = array_ops.reshape(labels, [-1])

    # Sample the negative labels.
    #   sampled shape: [num_sampled] tensor
    #   true_expected_count shape = [batch_size, num_true] tensor
    #   sampled_expected_count shape = [num_sampled] tensor
    if sampled_values is None:
      sampled_values = candidate_sampling_ops.log_uniform_candidate_sampler(
          true_classes=labels,
          num_true=num_true,
          num_sampled=num_sampled,
          unique=True,
          range_max=num_classes,
          seed=seed)
    # NOTE: pylint cannot tell that 'sampled_values' is a sequence
    # pylint: disable=unpacking-non-sequence
    sampled, true_expected_count, sampled_expected_count = (
        array_ops.stop_gradient(s) for s in sampled_values)
    # pylint: enable=unpacking-non-sequence
    sampled = math_ops.cast(sampled, dtypes.int64)

    # labels_flat is a [batch_size * num_true] tensor
    # sampled is a [num_sampled] int tensor
    all_ids = array_ops.concat([labels_flat, sampled], 0)

    # Retrieve the true weights and the logits of the sampled weights.

    # weights shape is [num_classes, dim]
    all_w = embedding_ops.embedding_lookup(
        weights, all_ids, partition_strategy=partition_strategy)
    if all_w.dtype != inputs.dtype:
      all_w = math_ops.cast(all_w, inputs.dtype)

    # true_w shape is [batch_size * num_true, dim]
    true_w = array_ops.slice(all_w, [0, 0],
                             array_ops_stack.stack(
                                 [array_ops.shape(labels_flat)[0], -1]))

    sampled_w = array_ops.slice(
        all_w,
        array_ops_stack.stack([array_ops.shape(labels_flat)[0], 0]), [-1, -1])
    # inputs has shape [batch_size, dim]
    # sampled_w has shape [num_sampled, dim]
    # Apply X*W', which yields [batch_size, num_sampled]
    sampled_logits = math_ops.matmul(inputs, sampled_w, transpose_b=True)

    # Retrieve the true and sampled biases, compute the true logits, and
    # add the biases to the true and sampled logits.
    all_b = embedding_ops.embedding_lookup(
        biases, all_ids, partition_strategy=partition_strategy)
    if all_b.dtype != inputs.dtype:
      all_b = math_ops.cast(all_b, inputs.dtype)
    # true_b is a [batch_size * num_true] tensor
    # sampled_b is a [num_sampled] float tensor
    true_b = array_ops.slice(all_b, [0], array_ops.shape(labels_flat))
    sampled_b = array_ops.slice(all_b, array_ops.shape(labels_flat), [-1])

    # inputs shape is [batch_size, dim]
    # true_w shape is [batch_size * num_true, dim]
    # row_wise_dots is [batch_size, num_true, dim]
    dim = array_ops.shape(true_w)[1:2]
    new_true_w_shape = array_ops.concat([[-1, num_true], dim], 0)
    row_wise_dots = math_ops.multiply(
        array_ops.expand_dims(inputs, 1),
        array_ops.reshape(true_w, new_true_w_shape))
    # We want the row-wise dot plus biases which yields a
    # [batch_size, num_true] tensor of true_logits.
    dots_as_matrix = array_ops.reshape(row_wise_dots,
                                       array_ops.concat([[-1], dim], 0))
    true_logits = array_ops.reshape(_sum_rows(dots_as_matrix), [-1, num_true])
    true_b = array_ops.reshape(true_b, [-1, num_true])
    true_logits += true_b
    sampled_logits += sampled_b

    if remove_accidental_hits:
      acc_hits = candidate_sampling_ops.compute_accidental_hits(
          labels, sampled, num_true=num_true)
      acc_indices, acc_ids, acc_weights = acc_hits

      # This is how SparseToDense expects the indices.
      acc_indices_2d = array_ops.reshape(acc_indices, [-1, 1])
      acc_ids_2d_int32 = array_ops.reshape(
          math_ops.cast(acc_ids, dtypes.int32), [-1, 1])
      sparse_indices = array_ops.concat([acc_indices_2d, acc_ids_2d_int32], 1,
                                        "sparse_indices")
      # Create sampled_logits_shape = [batch_size, num_sampled]
      sampled_logits_shape = array_ops.concat(
          [array_ops.shape(labels)[:1],
           array_ops.expand_dims(num_sampled, 0)], 0)
      if sampled_logits.dtype != acc_weights.dtype:
        acc_weights = math_ops.cast(acc_weights, sampled_logits.dtype)
      sampled_logits += gen_sparse_ops.sparse_to_dense(
          sparse_indices,
          sampled_logits_shape,
          acc_weights,
          default_value=0.0,
          validate_indices=False)

    if subtract_log_q:
      # Subtract log of Q(l), prior probability that l appears in sampled.
      # The expected counts keep the sampler's dtype, which need not match the
      # logits dtype (derived from `inputs`). Take the log in the counts' own
      # dtype and cast only on mismatch, mirroring the handling of all_w,
      # all_b and acc_weights above, so the matching-dtype graph is unchanged.
      log_true_q = math_ops.log(true_expected_count)
      if log_true_q.dtype != true_logits.dtype:
        log_true_q = math_ops.cast(log_true_q, true_logits.dtype)
      true_logits -= log_true_q
      log_sampled_q = math_ops.log(sampled_expected_count)
      if log_sampled_q.dtype != sampled_logits.dtype:
        log_sampled_q = math_ops.cast(log_sampled_q, sampled_logits.dtype)
      sampled_logits -= log_sampled_q

    # Construct output logits and labels. The true labels/logits start at col 0.
    out_logits = array_ops.concat([true_logits, sampled_logits], 1)

    # true_logits is a float tensor, ones_like(true_logits) is a float
    # tensor of ones. We then divide by num_true to ensure the per-example
    # labels sum to 1.0, i.e. form a proper probability distribution.
    out_labels = array_ops.concat([
        array_ops.ones_like(true_logits) / num_true,
        array_ops.zeros_like(sampled_logits)
    ], 1)

    return out_logits, out_labels
@tf_export("nn.nce_loss", v1=[])
@dispatch.add_dispatch_support
def nce_loss_v2(weights,
                biases,
                labels,
                inputs,
                num_sampled,
                num_classes,
                num_true=1,
                sampled_values=None,
                remove_accidental_hits=False,
                name="nce_loss"):
  """Computes and returns the noise-contrastive estimation training loss.

  See [Noise-contrastive estimation: A new estimation principle for
  unnormalized statistical
  models](https://arxiv.org/abs/1806.03664).
  Also see our [Candidate Sampling Algorithms
  Reference](https://www.tensorflow.org/extras/candidate_sampling.pdf)

  A common use case is to use this method for training, and calculate the full
  sigmoid loss for evaluation or inference as in the following example:

  ```python
  if mode == "train":
    loss = tf.nn.nce_loss(
        weights=weights,
        biases=biases,
        labels=labels,
        inputs=inputs,
        ...)
  elif mode == "eval":
    logits = tf.matmul(inputs, tf.transpose(weights))
    logits = tf.nn.bias_add(logits, biases)
    labels_one_hot = tf.one_hot(labels, n_classes)
    loss = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=labels_one_hot,
        logits=logits)
    loss = tf.reduce_sum(loss, axis=1)
  ```

  Note: when doing embedding lookup on `weights` and `biases`, "div" partition
  strategy will be used. Support for other partition strategy will be added
  later.

  Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
  so your labels must be sorted in order of decreasing frequency to achieve
  good results.  For more details, see
  `tf.random.log_uniform_candidate_sampler`.

  Note: In the case where `num_true` > 1, we assign to each target class
  the target probability 1 / `num_true` so that the target probabilities
  sum to 1 per-example.

  Note: It would be useful to allow a variable number of target classes per
  example.  We hope to provide this functionality in a future release.
  For now, if you have a variable number of target classes, you can pad them
  out to a constant number by either repeating them or by padding
  with an otherwise unused class.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
      objects whose concatenation along dimension 0 has shape [num_classes,
      dim].  The (possibly-partitioned) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size, num_true]`. The
      target classes.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward activations of
      the input network.
    num_sampled: An `int`.  The number of negative classes to randomly sample
      per batch. This single sample of negative classes is evaluated for each
      element in the batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
      `sampled_expected_count`) returned by a `*_candidate_sampler` function.
      (if None, we default to `log_uniform_candidate_sampler`)
    remove_accidental_hits:  A `bool`.  Whether to remove "accidental hits"
      where a sampled class equals one of the target classes.  If set to `True`,
      this is a "Sampled Logistic" loss instead of NCE, and we are learning to
      generate log-odds instead of log probabilities.  See our [Candidate
      Sampling Algorithms Reference]
        (https://www.tensorflow.org/extras/candidate_sampling.pdf). Default is
          False.
    name: A name for the operation (optional).

  Returns:
    A `batch_size` 1-D tensor of per-example NCE losses.
  """
  # TODO(yuefengz): get partition_strategy from either variables or distribution
  # strategies.
  # Delegates to the v1 implementation with the partition strategy pinned to
  # "div"; every other argument is passed through unchanged.
  return nce_loss(
      weights=weights,
      biases=biases,
      labels=labels,
      inputs=inputs,
      num_sampled=num_sampled,
      num_classes=num_classes,
      num_true=num_true,
      sampled_values=sampled_values,
      remove_accidental_hits=remove_accidental_hits,
      partition_strategy="div",
      name=name)
@tf_export(v1=["nn.nce_loss"])
@dispatch.add_dispatch_support
def nce_loss(weights,
             biases,
             labels,
             inputs,
             num_sampled,
             num_classes,
             num_true=1,
             sampled_values=None,
             remove_accidental_hits=False,
             partition_strategy="mod",
             name="nce_loss"):
  """Computes and returns the noise-contrastive estimation training loss.

  A common use case is to use this method for training, and calculate the full
  sigmoid loss for evaluation or inference. In this case, you must set
  `partition_strategy="div"` for the two losses to be consistent, as in the
  following example:

  ```python
  if mode == "train":
    loss = tf.nn.nce_loss(
        weights=weights,
        biases=biases,
        labels=labels,
        inputs=inputs,
        ...,
        partition_strategy="div")
  elif mode == "eval":
    logits = tf.matmul(inputs, tf.transpose(weights))
    logits = tf.nn.bias_add(logits, biases)
    labels_one_hot = tf.one_hot(labels, n_classes)
    loss = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=labels_one_hot,
        logits=logits)
    loss = tf.reduce_sum(loss, axis=1)
  ```

  Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
  so your labels must be sorted in order of decreasing frequency to achieve
  good results.  For more details, see
  `tf.random.log_uniform_candidate_sampler`.

  Note: In the case where `num_true` > 1, we assign to each target class
  the target probability 1 / `num_true` so that the target probabilities
  sum to 1 per-example.

  Note: It would be useful to allow a variable number of target classes per
  example.  We hope to provide this functionality in a future release.
  For now, if you have a variable number of target classes, you can pad them
  out to a constant number by either repeating them or by padding
  with an otherwise unused class.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
        objects whose concatenation along dimension 0 has shape
        [num_classes, dim].  The (possibly-partitioned) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size,
        num_true]`. The target classes.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
        activations of the input network.
    num_sampled: An `int`.  The number of negative classes to randomly sample
        per batch. This single sample of negative classes is evaluated for each
        element in the batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
        (if None, we default to `log_uniform_candidate_sampler`)
    remove_accidental_hits:  A `bool`.  Whether to remove "accidental hits"
        where a sampled class equals one of the target classes.  If set to
        `True`, this is a "Sampled Logistic" loss instead of NCE, and we are
        learning to generate log-odds instead of log probabilities. See
        our Candidate Sampling Algorithms Reference
        ([pdf](https://www.tensorflow.org/extras/candidate_sampling.pdf)).
        Default is False.
    partition_strategy: A string specifying the partitioning strategy, relevant
        if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
        Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
    name: A name for the operation (optional).

  Returns:
    A `batch_size` 1-D tensor of per-example NCE losses.

  References:
    Noise-contrastive estimation - A new estimation principle for unnormalized
    statistical models:
      [Gutmann et al., 2010](http://proceedings.mlr.press/v9/gutmann10a)
      ([pdf](http://proceedings.mlr.press/v9/gutmann10a/gutmann10a.pdf))
  """
  # Build the [batch_size, num_true + num_sampled] logits together with their
  # matching target labels; NCE always subtracts the log expected counts.
  sampled_logits, sampled_targets = _compute_sampled_logits(
      weights=weights,
      biases=biases,
      labels=labels,
      inputs=inputs,
      num_sampled=num_sampled,
      num_classes=num_classes,
      num_true=num_true,
      sampled_values=sampled_values,
      subtract_log_q=True,
      remove_accidental_hits=remove_accidental_hits,
      partition_strategy=partition_strategy,
      name=name)

  # per_class_losses is batch_size x {true_loss, sampled_losses...}
  per_class_losses = sigmoid_cross_entropy_with_logits(
      labels=sampled_targets, logits=sampled_logits, name="sampled_losses")

  # Sum out the true and sampled losses to get one value per example.
  return _sum_rows(per_class_losses)
@tf_export("nn.sampled_softmax_loss", v1=[])
@dispatch.add_dispatch_support
def sampled_softmax_loss_v2(weights,
                            biases,
                            labels,
                            inputs,
                            num_sampled,
                            num_classes,
                            num_true=1,
                            sampled_values=None,
                            remove_accidental_hits=True,
                            seed=None,
                            name="sampled_softmax_loss"):
  """Computes and returns the sampled softmax training loss.

  This is a faster way to train a softmax classifier over a huge number of
  classes.

  This operation is for training only.  It is generally an underestimate of
  the full softmax loss.

  A common use case is to use this method for training, and calculate the full
  softmax loss for evaluation or inference as in the following example:

  ```python
  if mode == "train":
    loss = tf.nn.sampled_softmax_loss(
        weights=weights,
        biases=biases,
        labels=labels,
        inputs=inputs,
        ...)
  elif mode == "eval":
    logits = tf.matmul(inputs, tf.transpose(weights))
    logits = tf.nn.bias_add(logits, biases)
    labels_one_hot = tf.one_hot(labels, n_classes)
    loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=labels_one_hot,
        logits=logits)
  ```

  See our [Candidate Sampling Algorithms Reference]
  (https://www.tensorflow.org/extras/candidate_sampling.pdf)

  Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007)
  ([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math.

  Note: when doing embedding lookup on `weights` and `biases`, "div" partition
  strategy will be used. Support for other partition strategy will be added
  later.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
      objects whose concatenation along dimension 0 has shape [num_classes,
      dim].  The (possibly-sharded) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size, num_true]`. The
      target classes.  Note that this format differs from the `labels` argument
      of `nn.softmax_cross_entropy_with_logits`.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward activations of
      the input network.
    num_sampled: An `int`.  The number of classes to randomly sample per batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
      `sampled_expected_count`) returned by a `*_candidate_sampler` function.
      (if None, we default to `log_uniform_candidate_sampler`)
    remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
      where a sampled class equals one of the target classes.  Default is True.
    seed: random seed for candidate sampling. Default to None, which doesn't set
      the op-level random seed for candidate sampling.
    name: A name for the operation (optional).

  Returns:
    A `batch_size` 1-D tensor of per-example sampled softmax losses.

  """
  # Delegates to the v1 implementation with the partition strategy pinned to
  # "div"; every other argument is passed through unchanged.
  return sampled_softmax_loss(
      weights=weights,
      biases=biases,
      labels=labels,
      inputs=inputs,
      num_sampled=num_sampled,
      num_classes=num_classes,
      num_true=num_true,
      sampled_values=sampled_values,
      remove_accidental_hits=remove_accidental_hits,
      partition_strategy="div",
      name=name,
      seed=seed)
@tf_export(v1=["nn.sampled_softmax_loss"])
@dispatch.add_dispatch_support
def sampled_softmax_loss(weights,
                         biases,
                         labels,
                         inputs,
                         num_sampled,
                         num_classes,
                         num_true=1,
                         sampled_values=None,
                         remove_accidental_hits=True,
                         partition_strategy="mod",
                         name="sampled_softmax_loss",
                         seed=None):
  """Computes and returns the sampled softmax training loss.

  This is a faster way to train a softmax classifier over a huge number of
  classes.

  This operation is for training only.  It is generally an underestimate of
  the full softmax loss.

  A common use case is to use this method for training, and calculate the full
  softmax loss for evaluation or inference. In this case, you must set
  `partition_strategy="div"` for the two losses to be consistent, as in the
  following example:

  ```python
  if mode == "train":
    loss = tf.nn.sampled_softmax_loss(
        weights=weights,
        biases=biases,
        labels=labels,
        inputs=inputs,
        ...,
        partition_strategy="div")
  elif mode == "eval":
    logits = tf.matmul(inputs, tf.transpose(weights))
    logits = tf.nn.bias_add(logits, biases)
    labels_one_hot = tf.one_hot(labels, n_classes)
    loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=labels_one_hot,
        logits=logits)
  ```

  See our Candidate Sampling Algorithms Reference
  ([pdf](https://www.tensorflow.org/extras/candidate_sampling.pdf)).
  Also see Section 3 of (Jean et al., 2014) for the math.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
        objects whose concatenation along dimension 0 has shape
        [num_classes, dim].  The (possibly-sharded) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size,
        num_true]`. The target classes.  Note that this format differs from
        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
        activations of the input network.
    num_sampled: An `int`.  The number of classes to randomly sample per batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
        (if None, we default to `log_uniform_candidate_sampler`)
    remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
        where a sampled class equals one of the target classes.  Default is
        True.
    partition_strategy: A string specifying the partitioning strategy, relevant
        if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
        Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
    name: A name for the operation (optional).
    seed: random seed for candidate sampling. Default to None, which doesn't set
        the op-level random seed for candidate sampling.

  Returns:
    A `batch_size` 1-D tensor of per-example sampled softmax losses.

  References:
    On Using Very Large Target Vocabulary for Neural Machine Translation:
      [Jean et al., 2014]
      (https://aclanthology.coli.uni-saarland.de/papers/P15-1001/p15-1001)
      ([pdf](http://aclweb.org/anthology/P15-1001))
  """
  # Build the [batch_size, num_true + num_sampled] logits together with their
  # matching target distribution; the log expected counts are subtracted.
  sampled_logits, sampled_targets = _compute_sampled_logits(
      weights=weights,
      biases=biases,
      labels=labels,
      inputs=inputs,
      num_sampled=num_sampled,
      num_classes=num_classes,
      num_true=num_true,
      sampled_values=sampled_values,
      subtract_log_q=True,
      remove_accidental_hits=remove_accidental_hits,
      partition_strategy=partition_strategy,
      name=name,
      seed=seed)

  # The targets are wrapped in stop_gradient before being fed to the loss.
  sampled_targets = array_ops.stop_gradient(
      sampled_targets, name="labels_stop_gradient")

  # The cross entropy yields a [batch_size] tensor, returned as is.
  return nn_ops.softmax_cross_entropy_with_logits_v2(
      labels=sampled_targets, logits=sampled_logits)