tensorflow/tensorflow

Switch branches/tags
Nothing to show
282329c Dec 18, 2018
26 contributors

Users who have contributed to this file

1870 lines (1597 sloc) 74.6 KB
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================= """Implementation of Neural Net (NN) functions.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import math from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import function from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import candidate_sampling_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import embedding_ops from tensorflow.python.ops import gen_array_ops # pylint: disable=unused-import from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops from tensorflow.python.ops import sparse_ops from tensorflow.python.ops import variables from tensorflow.python.util.deprecation import deprecated_args from tensorflow.python.util.deprecation import deprecated_argument_lookup from tensorflow.python.util.tf_export import tf_export @tf_export("nn.log_poisson_loss") def log_poisson_loss(targets, log_input, compute_full_loss=False, name=None): """Computes log Poisson loss given log_input. Gives the log-likelihood loss between the prediction and the target under the assumption that the target has a Poisson distribution. Caveat: By default, this is not the exact loss, but the loss minus a constant term [log(z!)]. That has no effect for optimization, but does not play well with relative loss comparisons. To compute an approximation of the log factorial term, specify compute_full_loss=True to enable Stirling's Approximation. For brevity, let c = log(x) = log_input, z = targets. The log Poisson loss is -log(exp(-x) * (x^z) / z!) = -log(exp(-x) * (x^z)) + log(z!) ~ -log(exp(-x)) - log(x^z) [+ z * log(z) - z + 0.5 * log(2 * pi * z)] [ Note the second term is the Stirling's Approximation for log(z!). It is invariant to x and does not affect optimization, though important for correct relative loss comparisons. It is only computed when compute_full_loss == True. ] = x - z * log(x) [+ z * log(z) - z + 0.5 * log(2 * pi * z)] = exp(c) - z * c [+ z * log(z) - z + 0.5 * log(2 * pi * z)] Args: targets: A Tensor of the same type and shape as log_input. log_input: A Tensor of type float32 or float64. compute_full_loss: whether to compute the full loss. If false, a constant term is dropped in favor of more efficient optimization. name: A name for the operation (optional). Returns: A Tensor of the same shape as log_input with the componentwise logistic losses. Raises: ValueError: If log_input and targets do not have the same shape. """ with ops.name_scope(name, "log_poisson_loss", [log_input, targets]) as name: log_input = ops.convert_to_tensor(log_input, name="log_input") targets = ops.convert_to_tensor(targets, name="targets") try: targets.get_shape().merge_with(log_input.get_shape()) except ValueError: raise ValueError( "log_input and targets must have the same shape (%s vs %s)" % (log_input.get_shape(), targets.get_shape())) result = math_ops.exp(log_input) - log_input * targets if compute_full_loss: # need to create constant tensors here so that their dtypes can be matched # to that of the targets. point_five = constant_op.constant(0.5, dtype=targets.dtype) two_pi = constant_op.constant(2 * math.pi, dtype=targets.dtype) stirling_approx = (targets * math_ops.log(targets)) - targets + ( point_five * math_ops.log(two_pi * targets)) zeros = array_ops.zeros_like(targets, dtype=targets.dtype) ones = array_ops.ones_like(targets, dtype=targets.dtype) cond = math_ops.logical_and(targets >= zeros, targets <= ones) result += array_ops.where(cond, zeros, stirling_approx) return result @tf_export("nn.sigmoid_cross_entropy_with_logits") def sigmoid_cross_entropy_with_logits( # pylint: disable=invalid-name _sentinel=None, labels=None, logits=None, name=None): """Computes sigmoid cross entropy given logits. Measures the probability error in discrete classification tasks in which each class is independent and not mutually exclusive. For instance, one could perform multilabel classification where a picture can contain both an elephant and a dog at the same time. For brevity, let x = logits, z = labels. The logistic loss is z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x))) = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x))) = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x)) = (1 - z) * x + log(1 + exp(-x)) = x - x * z + log(1 + exp(-x)) For x < 0, to avoid overflow in exp(-x), we reformulate the above x - x * z + log(1 + exp(-x)) = log(exp(x)) - x * z + log(1 + exp(-x)) = - x * z + log(1 + exp(x)) Hence, to ensure stability and avoid overflow, the implementation uses this equivalent formulation max(x, 0) - x * z + log(1 + exp(-abs(x))) logits and labels must have the same type and shape. Args: _sentinel: Used to prevent positional parameters. Internal, do not use. labels: A Tensor of the same type and shape as logits. logits: A Tensor of type float32 or float64. name: A name for the operation (optional). Returns: A Tensor of the same shape as logits with the componentwise logistic losses. Raises: ValueError: If logits and labels do not have the same shape. """ # pylint: disable=protected-access nn_ops._ensure_xent_args("sigmoid_cross_entropy_with_logits", _sentinel, labels, logits) # pylint: enable=protected-access with ops.name_scope(name, "logistic_loss", [logits, labels]) as name: logits = ops.convert_to_tensor(logits, name="logits") labels = ops.convert_to_tensor(labels, name="labels") try: labels.get_shape().merge_with(logits.get_shape()) except ValueError: raise ValueError("logits and labels must have the same shape (%s vs %s)" % (logits.get_shape(), labels.get_shape())) # The logistic loss formula from above is # x - x * z + log(1 + exp(-x)) # For x < 0, a more numerically stable formula is # -x * z + log(1 + exp(x)) # Note that these two expressions can be combined into the following: # max(x, 0) - x * z + log(1 + exp(-abs(x))) # To allow computing gradients at zero, we define custom versions of max and # abs functions. zeros = array_ops.zeros_like(logits, dtype=logits.dtype) cond = (logits >= zeros) relu_logits = array_ops.where(cond, logits, zeros) neg_abs_logits = array_ops.where(cond, -logits, logits) return math_ops.add( relu_logits - logits * labels, math_ops.log1p(math_ops.exp(neg_abs_logits)), name=name) @tf_export("nn.weighted_cross_entropy_with_logits") def weighted_cross_entropy_with_logits(targets, logits, pos_weight, name=None): """Computes a weighted cross entropy. This is like sigmoid_cross_entropy_with_logits() except that pos_weight, allows one to trade off recall and precision by up- or down-weighting the cost of a positive error relative to a negative error. The usual cross-entropy cost is defined as: targets * -log(sigmoid(logits)) + (1 - targets) * -log(1 - sigmoid(logits)) A value pos_weights > 1 decreases the false negative count, hence increasing the recall. Conversely setting pos_weights < 1 decreases the false positive count and increases the precision. This can be seen from the fact that pos_weight is introduced as a multiplicative coefficient for the positive targets term in the loss expression: targets * -log(sigmoid(logits)) * pos_weight + (1 - targets) * -log(1 - sigmoid(logits)) For brevity, let x = logits, z = targets, q = pos_weight. The loss is: qz * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) = qz * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x))) = qz * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x))) = qz * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x)) = (1 - z) * x + (qz + 1 - z) * log(1 + exp(-x)) = (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(-x)) Setting l = (1 + (q - 1) * z), to ensure stability and avoid overflow, the implementation uses (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0)) logits and targets must have the same type and shape. Args: targets: A Tensor of the same type and shape as logits. logits: A Tensor of type float32 or float64. pos_weight: A coefficient to use on the positive examples. name: A name for the operation (optional). Returns: A Tensor of the same shape as logits with the componentwise weighted logistic losses. Raises: ValueError: If logits and targets do not have the same shape. """ with ops.name_scope(name, "logistic_loss", [logits, targets]) as name: logits = ops.convert_to_tensor(logits, name="logits") targets = ops.convert_to_tensor(targets, name="targets") try: targets.get_shape().merge_with(logits.get_shape()) except ValueError: raise ValueError( "logits and targets must have the same shape (%s vs %s)" % (logits.get_shape(), targets.get_shape())) # The logistic loss formula from above is # (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(-x)) # For x < 0, a more numerically stable formula is # (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(x)) - l * x # To avoid branching, we use the combined version # (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0)) log_weight = 1 + (pos_weight - 1) * targets return math_ops.add( (1 - targets) * logits, log_weight * (math_ops.log1p(math_ops.exp(-math_ops.abs(logits))) + nn_ops.relu(-logits)), name=name) @tf_export(v1=["nn.relu_layer"]) def relu_layer(x, weights, biases, name=None): """Computes Relu(x * weight + biases). Args: x: a 2D tensor. Dimensions typically: batch, in_units weights: a 2D tensor. Dimensions typically: in_units, out_units biases: a 1D tensor. Dimensions: out_units name: A name for the operation (optional). If not specified "nn_relu_layer" is used. Returns: A 2-D Tensor computing relu(matmul(x, weights) + biases). Dimensions typically: batch, out_units. """ with ops.name_scope(name, "relu_layer", [x, weights, biases]) as name: x = ops.convert_to_tensor(x, name="x") weights = ops.convert_to_tensor(weights, name="weights") biases = ops.convert_to_tensor(biases, name="biases") xw_plus_b = nn_ops.bias_add(math_ops.matmul(x, weights), biases) return nn_ops.relu(xw_plus_b, name=name) def _swish_shape(op): """Shape helper function for swish and _swish_grad function below.""" return [op.inputs[0].shape] @function.Defun(shape_func=_swish_shape, func_name="swish_grad", noinline=True) def _swish_grad(features, grad): """Gradient of Swish function defined below.""" sigmoid_features = math_ops.sigmoid(features) activation_grad = ( sigmoid_features * (1.0 + features * (1.0 - sigmoid_features))) return grad * activation_grad # Naively, x * tf.nn.sigmoid(x) requires keeping both x and sigmoid(x) around # for backprop, effectively doubling the tensor's memory consumption. We use a # @Defun decorator with noinline=True so that sigmoid(features) is re-computed # during backprop, and we can free the sigmoid(features) expression immediately # after use during the forward pass. @tf_export("nn.swish") @function.Defun( grad_func=_swish_grad, shape_func=_swish_shape, func_name="swish", noinline=True) def swish(features): # pylint: disable=g-doc-args """Computes the Swish activation function: x * sigmoid(x). Source: "Searching for Activation Functions" (Ramachandran et al. 2017) https://arxiv.org/abs/1710.05941 Args: features: A Tensor representing preactivation values. name: A name for the operation (optional). Returns: The activation value. """ # pylint: enable=g-doc-args features = ops.convert_to_tensor(features, name="features") return features * math_ops.sigmoid(features) @tf_export(v1=["math.l2_normalize", "linalg.l2_normalize", "nn.l2_normalize"]) @deprecated_args(None, "dim is deprecated, use axis instead", "dim") def l2_normalize(x, axis=None, epsilon=1e-12, name=None, dim=None): """Normalizes along dimension axis using an L2 norm. For a 1-D tensor with axis = 0, computes output = x / sqrt(max(sum(x**2), epsilon)) For x with more dimensions, independently normalizes each 1-D slice along dimension axis. Args: x: A Tensor. axis: Dimension along which to normalize. A scalar or a vector of integers. epsilon: A lower bound value for the norm. Will use sqrt(epsilon) as the divisor if norm < sqrt(epsilon). name: A name for this operation (optional). dim: Deprecated alias for axis. Returns: A Tensor with the same shape as x. """ axis = deprecated_argument_lookup("axis", axis, "dim", dim) return l2_normalize_v2(x, axis, epsilon, name) @tf_export("math.l2_normalize", "linalg.l2_normalize", "nn.l2_normalize", v1=[]) def l2_normalize_v2(x, axis=None, epsilon=1e-12, name=None): """Normalizes along dimension axis using an L2 norm. For a 1-D tensor with axis = 0, computes output = x / sqrt(max(sum(x**2), epsilon)) For x with more dimensions, independently normalizes each 1-D slice along dimension axis. Args: x: A Tensor. axis: Dimension along which to normalize. A scalar or a vector of integers. epsilon: A lower bound value for the norm. Will use sqrt(epsilon) as the divisor if norm < sqrt(epsilon). name: A name for this operation (optional). Returns: A Tensor with the same shape as x. """ with ops.name_scope(name, "l2_normalize", [x]) as name: x = ops.convert_to_tensor(x, name="x") square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keepdims=True) x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, epsilon)) return math_ops.multiply(x, x_inv_norm, name=name) def _count_nonzero(input_tensor, dtype=dtypes.int64): """Same as math_ops.count_nonzero. The reduction is done in dtype, which can be faster for 32-bit dtypes. Args: input_tensor: numeric tensor dtype: reduction dtype Returns: number of nonzero values with type dtype """ with ops.name_scope("count_nonzero", values=[input_tensor]): zero = array_ops.zeros([], dtype=input_tensor.dtype) nonzero_count = math_ops.reduce_sum( math_ops.cast( math_ops.not_equal(input_tensor, zero), dtype=dtype), name="nonzero_count") return nonzero_count @tf_export("math.zero_fraction", "nn.zero_fraction") def zero_fraction(value, name=None): """Returns the fraction of zeros in value. If value is empty, the result is nan. This is useful in summaries to measure and report sparsity. For example, python z = tf.nn.relu(...) summ = tf.summary.scalar('sparsity', tf.nn.zero_fraction(z))  Args: value: A tensor of numeric type. name: A name for the operation (optional). Returns: The fraction of zeros in value, with type float32. """ with ops.name_scope(name, "zero_fraction", [value]): value = ops.convert_to_tensor(value, name="value") size = array_ops.size(value, out_type=dtypes.int64) # If the count is small, we can save memory/CPU with an int32 reduction. num_nonzero = control_flow_ops.cond( size <= dtypes.int32.max, # pylint: disable=g-long-lambda true_fn=lambda: math_ops.cast( _count_nonzero(value, dtype=dtypes.int32), dtype=dtypes.int64), false_fn=lambda: _count_nonzero(value, dtype=dtypes.int64)) with ops.name_scope("counts_to_fraction"): num_zero = size - num_nonzero num_zero_float32 = math_ops.cast(num_zero, dtype=dtypes.float32) size_float32 = math_ops.cast(size, dtype=dtypes.float32) zero_fraction_float32 = num_zero_float32 / size_float32 return array_ops.identity(zero_fraction_float32, "fraction") # pylint: disable=redefined-builtin @tf_export(v1=["nn.depthwise_conv2d"]) def depthwise_conv2d(input, filter, strides, padding, rate=None, name=None, data_format=None): """Depthwise 2-D convolution. Given a 4D input tensor ('NHWC' or 'NCHW' data formats) and a filter tensor of shape [filter_height, filter_width, in_channels, channel_multiplier] containing in_channels convolutional filters of depth 1, depthwise_conv2d applies a different filter to each input channel (expanding from 1 channel to channel_multiplier channels for each), then concatenates the results together. The output has in_channels * channel_multiplier channels. In detail, output[b, i, j, k * channel_multiplier + q] = sum_{di, dj} filter[di, dj, k, q] * input[b, strides[1] * i + rate[0] * di, strides[2] * j + rate[1] * dj, k] Must have strides[0] = strides[3] = 1. For the most common case of the same horizontal and vertical strides, strides = [1, stride, stride, 1]. If any value in rate is greater than 1, we perform atrous depthwise convolution, in which case all values in the strides tensor must be equal to 1. Args: input: 4-D with shape according to data_format. filter: 4-D with shape [filter_height, filter_width, in_channels, channel_multiplier]. strides: 1-D of size 4. The stride of the sliding window for each dimension of input. padding: A string, either 'VALID' or 'SAME'. The padding algorithm. See the "returns" section of tf.nn.convolution for details. rate: 1-D of size 2. The dilation rate in which we sample input values across the height and width dimensions in atrous convolution. If it is greater than 1, then all values of strides must be 1. name: A name for this operation (optional). data_format: The data format for input. Either "NHWC" (default) or "NCHW". Returns: A 4-D Tensor with shape according to data_format. E.g., for "NHWC" format, shape is [batch, out_height, out_width, in_channels * channel_multiplier]. """ with ops.name_scope(name, "depthwise", [input, filter]) as name: input = ops.convert_to_tensor(input, name="tensor_in") filter = ops.convert_to_tensor(filter, name="filter_in") if rate is None: rate = [1, 1] def op(input_converted, _, padding): return nn_ops.depthwise_conv2d_native( input=input_converted, filter=filter, strides=strides, padding=padding, data_format=data_format, name=name) return nn_ops.with_space_to_batch( input=input, filter_shape=array_ops.shape(filter), dilation_rate=rate, padding=padding, data_format=data_format, op=op) @tf_export("nn.depthwise_conv2d", v1=[]) def depthwise_conv2d_v2(input, filter, strides, padding, data_format=None, dilations=None, name=None): """Depthwise 2-D convolution. Given a 4D input tensor ('NHWC' or 'NCHW' data formats) and a filter tensor of shape [filter_height, filter_width, in_channels, channel_multiplier] containing in_channels convolutional filters of depth 1, depthwise_conv2d applies a different filter to each input channel (expanding from 1 channel to channel_multiplier channels for each), then concatenates the results together. The output has in_channels * channel_multiplier channels. In detail, output[b, i, j, k * channel_multiplier + q] = sum_{di, dj} filter[di, dj, k, q] * input[b, strides[1] * i + rate[0] * di, strides[2] * j + rate[1] * dj, k] Must have strides[0] = strides[3] = 1. For the most common case of the same horizontal and vertical strides, strides = [1, stride, stride, 1]. If any value in rate is greater than 1, we perform atrous depthwise convolution, in which case all values in the strides tensor must be equal to 1. Args: input: 4-D with shape according to data_format. filter: 4-D with shape [filter_height, filter_width, in_channels, channel_multiplier]. strides: 1-D of size 4. The stride of the sliding window for each dimension of input. padding: A string, either 'VALID' or 'SAME'. The padding algorithm. See the "returns" section of tf.nn.convolution for details. data_format: The data format for input. Either "NHWC" (default) or "NCHW". dilations: 1-D of size 2. The dilation rate in which we sample input values across the height and width dimensions in atrous convolution. If it is greater than 1, then all values of strides must be 1. name: A name for this operation (optional). Returns: A 4-D Tensor with shape according to data_format. E.g., for "NHWC" format, shape is [batch, out_height, out_width, in_channels * channel_multiplier]. """ return depthwise_conv2d(input=input, filter=filter, strides=strides, padding=padding, rate=dilations, name=name, data_format=data_format) # pylint: enable=redefined-builtin # pylint: disable=redefined-builtin,line-too-long @tf_export(v1=["nn.separable_conv2d"]) def separable_conv2d(input, depthwise_filter, pointwise_filter, strides, padding, rate=None, name=None, data_format=None): """2-D convolution with separable filters. Performs a depthwise convolution that acts separately on channels followed by a pointwise convolution that mixes channels. Note that this is separability between dimensions [1, 2] and 3, not spatial separability between dimensions 1 and 2. In detail, output[b, i, j, k] = sum_{di, dj, q, r} input[b, strides[1] * i + di, strides[2] * j + dj, q] * depthwise_filter[di, dj, q, r] * pointwise_filter[0, 0, q * channel_multiplier + r, k] strides controls the strides for the depthwise convolution only, since the pointwise convolution has implicit strides of [1, 1, 1, 1]. Must have strides[0] = strides[3] = 1. For the most common case of the same horizontal and vertical strides, strides = [1, stride, stride, 1]. If any value in rate is greater than 1, we perform atrous depthwise convolution, in which case all values in the strides tensor must be equal to 1. Args: input: 4-D Tensor with shape according to data_format. depthwise_filter: 4-D Tensor with shape [filter_height, filter_width, in_channels, channel_multiplier]. Contains in_channels convolutional filters of depth 1. pointwise_filter: 4-D Tensor with shape [1, 1, channel_multiplier * in_channels, out_channels]. Pointwise filter to mix channels after depthwise_filter has convolved spatially. strides: 1-D of size 4. The strides for the depthwise convolution for each dimension of input. padding: A string, either 'VALID' or 'SAME'. The padding algorithm. See the "returns" section of tf.nn.convolution for details. rate: 1-D of size 2. The dilation rate in which we sample input values across the height and width dimensions in atrous convolution. If it is greater than 1, then all values of strides must be 1. name: A name for this operation (optional). data_format: The data format for input. Either "NHWC" (default) or "NCHW". Returns: A 4-D Tensor with shape according to 'data_format'. For example, with data_format="NHWC", shape is [batch, out_height, out_width, out_channels]. """ with ops.name_scope(name, "separable_conv2d", [input, depthwise_filter, pointwise_filter]) as name: input = ops.convert_to_tensor(input, name="tensor_in") depthwise_filter = ops.convert_to_tensor( depthwise_filter, name="depthwise_filter") pointwise_filter = ops.convert_to_tensor( pointwise_filter, name="pointwise_filter") pointwise_filter_shape = pointwise_filter.get_shape().with_rank(4) pointwise_filter_shape.dims[0].assert_is_compatible_with(1) pointwise_filter_shape.dims[1].assert_is_compatible_with(1) if rate is None: rate = [1, 1] # The layout of the ops in the graph are expected to be as follows: # depthwise_conv2d // Conv2D op corresponding to native deptwise conv. # separable_conv2d // Conv2D op corresponding to the pointwise conv. def op(input_converted, _, padding): return nn_ops.depthwise_conv2d_native( input=input_converted, filter=depthwise_filter, strides=strides, padding=padding, data_format=data_format, name="depthwise") depthwise = nn_ops.with_space_to_batch( input=input, filter_shape=array_ops.shape(depthwise_filter), dilation_rate=rate, padding=padding, data_format=data_format, op=op) return nn_ops.conv2d( depthwise, pointwise_filter, [1, 1, 1, 1], padding="VALID", data_format=data_format, name=name) @tf_export("nn.separable_conv2d", v1=[]) def separable_conv2d_v2( input, depthwise_filter, pointwise_filter, strides, padding, data_format=None, dilations=None, name=None, ): """2-D convolution with separable filters. Performs a depthwise convolution that acts separately on channels followed by a pointwise convolution that mixes channels. Note that this is separability between dimensions [1, 2] and 3, not spatial separability between dimensions 1 and 2. In detail, output[b, i, j, k] = sum_{di, dj, q, r} input[b, strides[1] * i + di, strides[2] * j + dj, q] * depthwise_filter[di, dj, q, r] * pointwise_filter[0, 0, q * channel_multiplier + r, k] strides controls the strides for the depthwise convolution only, since the pointwise convolution has implicit strides of [1, 1, 1, 1]. Must have strides[0] = strides[3] = 1. For the most common case of the same horizontal and vertical strides, strides = [1, stride, stride, 1]. If any value in rate is greater than 1, we perform atrous depthwise convolution, in which case all values in the strides tensor must be equal to 1. Args: input: 4-D Tensor with shape according to data_format. depthwise_filter: 4-D Tensor with shape [filter_height, filter_width, in_channels, channel_multiplier]. Contains in_channels convolutional filters of depth 1. pointwise_filter: 4-D Tensor with shape [1, 1, channel_multiplier * in_channels, out_channels]. Pointwise filter to mix channels after depthwise_filter has convolved spatially. strides: 1-D of size 4. The strides for the depthwise convolution for each dimension of input. padding: A string, either 'VALID' or 'SAME'. The padding algorithm. See the "returns" section of tf.nn.convolution for details. data_format: The data format for input. Either "NHWC" (default) or "NCHW". dilations: 1-D of size 2. The dilation rate in which we sample input values across the height and width dimensions in atrous convolution. If it is greater than 1, then all values of strides must be 1. name: A name for this operation (optional). Returns: A 4-D Tensor with shape according to 'data_format'. For example, with data_format="NHWC", shape is [batch, out_height, out_width, out_channels]. """ return separable_conv2d( input, depthwise_filter, pointwise_filter, strides, padding, rate=dilations, name=name, data_format=data_format) # pylint: enable=redefined-builtin,line-too-long @tf_export(v1=["nn.sufficient_statistics"]) def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None): """Calculate the sufficient statistics for the mean and variance of x. These sufficient statistics are computed using the one pass algorithm on an input that's optionally shifted. See: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data Args: x: A Tensor. axes: Array of ints. Axes along which to compute mean and variance. shift: A Tensor containing the value by which to shift the data for numerical stability, or None if no shift is to be performed. A shift close to the true mean provides the most numerically stable results. keep_dims: produce statistics with the same dimensionality as the input. name: Name used to scope the operations that compute the sufficient stats. Returns: Four Tensor objects of the same type as x: * the count (number of elements to average over). * the (possibly shifted) sum of the elements in the array. * the (possibly shifted) sum of squares of the elements in the array. * the shift by which the mean must be corrected or None if shift is None. """ axes = list(set(axes)) with ops.name_scope(name, "sufficient_statistics", [x, shift]): x = ops.convert_to_tensor(x, name="x") x_shape = x.get_shape() if all(x_shape.dims[d].value is not None for d in axes): counts = 1 for d in axes: counts *= x_shape.dims[d].value counts = constant_op.constant(counts, dtype=x.dtype) else: # shape needs to be inferred at runtime. x_dims = array_ops.gather( math_ops.cast(array_ops.shape(x), x.dtype), axes) counts = math_ops.reduce_prod(x_dims, name="count") if shift is not None: shift = ops.convert_to_tensor(shift, name="shift") m_ss = math_ops.subtract(x, shift) v_ss = math_ops.squared_difference(x, shift) else: # no shift. m_ss = x v_ss = math_ops.square(x) m_ss = math_ops.reduce_sum(m_ss, axes, keepdims=keep_dims, name="mean_ss") v_ss = math_ops.reduce_sum(v_ss, axes, keepdims=keep_dims, name="var_ss") return counts, m_ss, v_ss, shift @tf_export("nn.sufficient_statistics", v1=[]) def sufficient_statistics_v2(x, axes, shift=None, keepdims=False, name=None): """Calculate the sufficient statistics for the mean and variance of x. These sufficient statistics are computed using the one pass algorithm on an input that's optionally shifted. See: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data Args: x: A Tensor. axes: Array of ints. Axes along which to compute mean and variance. shift: A Tensor containing the value by which to shift the data for numerical stability, or None if no shift is to be performed. A shift close to the true mean provides the most numerically stable results. keepdims: produce statistics with the same dimensionality as the input. name: Name used to scope the operations that compute the sufficient stats. Returns: Four Tensor objects of the same type as x: * the count (number of elements to average over). * the (possibly shifted) sum of the elements in the array. * the (possibly shifted) sum of squares of the elements in the array. * the shift by which the mean must be corrected or None if shift is None. """ return sufficient_statistics( x=x, axes=axes, shift=shift, keep_dims=keepdims, name=name) @tf_export("nn.normalize_moments") def normalize_moments(counts, mean_ss, variance_ss, shift, name=None): """Calculate the mean and variance of based on the sufficient statistics. Args: counts: A Tensor containing the total count of the data (one value). mean_ss: A Tensor containing the mean sufficient statistics: the (possibly shifted) sum of the elements to average over. variance_ss: A Tensor containing the variance sufficient statistics: the (possibly shifted) squared sum of the data to compute the variance over. shift: A Tensor containing the value by which the data is shifted for numerical stability, or None if no shift was performed. name: Name used to scope the operations that compute the moments. Returns: Two Tensor objects: mean and variance. """ with ops.name_scope(name, "normalize", [counts, mean_ss, variance_ss, shift]): divisor = math_ops.reciprocal(counts, name="divisor") if shift is not None: shifted_mean = math_ops.multiply(mean_ss, divisor, name="shifted_mean") mean = math_ops.add(shifted_mean, shift, name="mean") else: # no shift. shifted_mean = math_ops.multiply(mean_ss, divisor, name="mean") mean = shifted_mean variance = math_ops.subtract( math_ops.multiply(variance_ss, divisor), math_ops.square(shifted_mean), name="variance") return (mean, variance) @tf_export(v1=["nn.moments"]) def moments( x, axes, shift=None, # pylint: disable=unused-argument name=None, keep_dims=False): """Calculate the mean and variance of x. The mean and variance are calculated by aggregating the contents of x across axes. If x is 1-D and axes = [0] this is just the mean and variance of a vector. Note: shift is currently not used; the true mean is computed and used. When using these moments for batch normalization (see tf.nn.batch_normalization): * for so-called "global normalization", used with convolutional filters with shape [batch, height, width, depth], pass axes=[0, 1, 2]. * for simple batch normalization pass axes=[0] (batch only). Args: x: A Tensor. axes: Array of ints. Axes along which to compute mean and variance. shift: Not used in the current implementation name: Name used to scope the operations that compute the moments. keep_dims: produce moments with the same dimensionality as the input. Returns: Two Tensor objects: mean and variance. """ with ops.name_scope(name, "moments", [x, axes]): # The dynamic range of fp16 is too limited to support the collection of # sufficient statistics. As a workaround we simply perform the operations # on 32-bit floats before converting the mean and variance back to fp16 y = math_ops.cast(x, dtypes.float32) if x.dtype == dtypes.float16 else x # Compute true mean while keeping the dims for proper broadcasting. mean = math_ops.reduce_mean(y, axes, keepdims=True, name="mean") # sample variance, not unbiased variance # Note: stop_gradient does not change the gradient that gets # backpropagated to the mean from the variance calculation, # because that gradient is zero variance = math_ops.reduce_mean( math_ops.squared_difference(y, array_ops.stop_gradient(mean)), axes, keepdims=True, name="variance") if not keep_dims: mean = array_ops.squeeze(mean, axes) variance = array_ops.squeeze(variance, axes) if x.dtype == dtypes.float16: return (math_ops.cast(mean, dtypes.float16), math_ops.cast(variance, dtypes.float16)) else: return (mean, variance) @tf_export("nn.moments", v1=[]) def moments_v2( x, axes, shift=None, keepdims=False, name=None): """Calculates the mean and variance of x. The mean and variance are calculated by aggregating the contents of x across axes. If x is 1-D and axes = [0] this is just the mean and variance of a vector. Note: shift is currently not used; the true mean is computed and used. When using these moments for batch normalization (see tf.nn.batch_normalization): * for so-called "global normalization", used with convolutional filters with shape [batch, height, width, depth], pass axes=[0, 1, 2]. * for simple batch normalization pass axes=[0] (batch only). Args: x: A Tensor. axes: Array of ints. Axes along which to compute mean and variance. shift: Not used in the current implementation. keepdims: produce moments with the same dimensionality as the input. name: Name used to scope the operations that compute the moments. Returns: Two Tensor objects: mean and variance. """ return moments(x=x, axes=axes, shift=shift, name=name, keep_dims=keepdims) @tf_export(v1=["nn.weighted_moments"]) def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False): """Returns the frequency-weighted mean and variance of x. Args: x: A tensor. axes: 1-d tensor of int32 values; these are the axes along which to compute mean and variance. frequency_weights: A tensor of positive weights which can be broadcast with x. name: Name used to scope the operation. keep_dims: Produce moments with the same dimensionality as the input. Returns: Two tensors: weighted_mean and weighted_variance. """ with ops.name_scope(name, "weighted_moments", [x, frequency_weights, axes]): x = ops.convert_to_tensor(x, name="x") frequency_weights = ops.convert_to_tensor( frequency_weights, name="frequency_weights") # Unlike moments(), this just uses a simpler two-pass method. # See comment in moments() WRT precision; it applies here too. needs_cast = x.dtype == dtypes.float16 if needs_cast: x = math_ops.cast(x, dtypes.float32) if frequency_weights.dtype != x.dtype: frequency_weights = math_ops.cast(frequency_weights, x.dtype) # Note that we use keep_dims=True for our reductions regardless of the arg; # this is so that the results remain broadcast-compatible with the inputs. weighted_input_sum = math_ops.reduce_sum( frequency_weights * x, axes, name="weighted_input_sum", keepdims=True) # The shape of the weights isn't necessarily the same as x's # shape, just broadcast-compatible with it -- so this expression # performs broadcasting to give a per-item weight, with the same # shape as (freqency_weights * x). This avoids having to reason # through all the broadcast logic to compute a correct # sum_of_weights. broadcasted_weights = frequency_weights + array_ops.zeros_like(x) sum_of_weights = math_ops.reduce_sum( broadcasted_weights, axes, name="sum_of_weights", keepdims=True) divisor = math_ops.reciprocal(sum_of_weights, name="inv_weight_sum") weighted_mean = math_ops.multiply(weighted_input_sum, divisor) # Have the weighted mean; now on to variance: weighted_distsq = math_ops.reduce_sum( frequency_weights * math_ops.squared_difference(x, weighted_mean), axes, name="weighted_distsq", keepdims=True) weighted_variance = math_ops.multiply(weighted_distsq, divisor) if not keep_dims: weighted_mean = array_ops.squeeze(weighted_mean, axis=axes) weighted_variance = array_ops.squeeze( weighted_variance, axis=axes) if needs_cast: weighted_mean = math_ops.cast(weighted_mean, dtypes.float16) weighted_variance = math_ops.cast(weighted_variance, dtypes.float16) return weighted_mean, weighted_variance @tf_export("nn.weighted_moments", v1=[]) def weighted_moments_v2(x, axes, frequency_weights, keepdims=False, name=None): """Returns the frequency-weighted mean and variance of x. Args: x: A tensor. axes: 1-d tensor of int32 values; these are the axes along which to compute mean and variance. frequency_weights: A tensor of positive weights which can be broadcast with x. keepdims: Produce moments with the same dimensionality as the input. name: Name used to scope the operation. Returns: Two tensors: weighted_mean and weighted_variance. """ return weighted_moments( x=x, axes=axes, frequency_weights=frequency_weights, name=name, keep_dims=keepdims) @tf_export("nn.batch_normalization") def batch_normalization(x, mean, variance, offset, scale, variance_epsilon, name=None): r"""Batch normalization. As described in http://arxiv.org/abs/1502.03167. Normalizes a tensor by mean and variance, and applies (optionally) a scale \$$\gamma\$$ to it, as well as an offset \$$\beta\$$: \$$\frac{\gamma(x-\mu)}{\sigma}+\beta\$$ mean, variance, offset and scale are all expected to be of one of two shapes: * In all generality, they can have the same number of dimensions as the input x, with identical sizes as x for the dimensions that are not normalized over (the 'depth' dimension(s)), and dimension 1 for the others which are being normalized over. mean and variance in this case would typically be the outputs of tf.nn.moments(..., keep_dims=True) during training, or running averages thereof during inference. * In the common case where the 'depth' dimension is the last dimension in the input tensor x, they may be one dimensional tensors of the same size as the 'depth' dimension. This is the case for example for the common [batch, depth] layout of fully-connected layers, and [batch, height, width, depth] for convolutions. mean and variance in this case would typically be the outputs of tf.nn.moments(..., keep_dims=False) during training, or running averages thereof during inference. Args: x: Input Tensor of arbitrary dimensionality. mean: A mean Tensor. variance: A variance Tensor. offset: An offset Tensor, often denoted \$$\beta\$$ in equations, or None. If present, will be added to the normalized tensor. scale: A scale Tensor, often denoted \$$\gamma\$$ in equations, or None. If present, the scale is applied to the normalized tensor. variance_epsilon: A small float number to avoid dividing by 0. name: A name for this operation (optional). Returns: the normalized, scaled, offset tensor. """ with ops.name_scope(name, "batchnorm", [x, mean, variance, scale, offset]): inv = math_ops.rsqrt(variance + variance_epsilon) if scale is not None: inv *= scale # Note: tensorflow/contrib/quantize/python/fold_batch_norms.py depends on # the precise order of ops that are generated by the expression below. return x * math_ops.cast(inv, x.dtype) + math_ops.cast( offset - mean * inv if offset is not None else -mean * inv, x.dtype) @tf_export(v1=["nn.fused_batch_norm"]) def fused_batch_norm( x, scale, offset, # pylint: disable=invalid-name mean=None, variance=None, epsilon=0.001, data_format="NHWC", is_training=True, name=None): r"""Batch normalization. As described in http://arxiv.org/abs/1502.03167. Args: x: Input Tensor of 4 dimensions. scale: A Tensor of 1 dimension for scaling. offset: A Tensor of 1 dimension for bias. mean: A Tensor of 1 dimension for population mean used for inference. variance: A Tensor of 1 dimension for population variance used for inference. epsilon: A small float number added to the variance of x. data_format: The data format for x. Either "NHWC" (default) or "NCHW". is_training: A bool value to specify if the operation is used for training or inference. name: A name for this operation (optional). Returns: y: A 4D Tensor for the normalized, scaled, offsetted x. batch_mean: A 1D Tensor for the mean of x. batch_var: A 1D Tensor for the variance of x. Raises: ValueError: If mean or variance is not None when is_training is True. """ x = ops.convert_to_tensor(x, name="input") scale = ops.convert_to_tensor(scale, name="scale") offset = ops.convert_to_tensor(offset, name="offset") if is_training: if (mean is not None) or (variance is not None): raise ValueError("Both 'mean' and 'variance' must be None " "if is_training is True.") if mean is None: mean = constant_op.constant([]) if variance is None: variance = constant_op.constant([]) # Set a minimum epsilon to 1.001e-5, which is a requirement by CUDNN to # prevent exception (see cudnn.h). min_epsilon = 1.001e-5 epsilon = epsilon if epsilon > min_epsilon else min_epsilon # TODO(reedwm): In a few weeks, switch to using the V2 version exclusively. We # currently only use the V2 version for float16 inputs, which is not supported # by the V1 version. if x.dtype == dtypes.float16 or x.dtype == dtypes.bfloat16: fused_batch_norm_func = gen_nn_ops.fused_batch_norm_v2 else: fused_batch_norm_func = gen_nn_ops._fused_batch_norm # pylint: disable=protected-access y, batch_mean, batch_var, _, _ = fused_batch_norm_func( x, scale, offset, mean, variance, epsilon=epsilon, data_format=data_format, is_training=is_training, name=name) return y, batch_mean, batch_var @tf_export(v1=["nn.batch_norm_with_global_normalization"]) def batch_norm_with_global_normalization(t, m, v, beta, gamma, variance_epsilon, scale_after_normalization, name=None): """Batch normalization. This op is deprecated. See tf.nn.batch_normalization. Args: t: A 4D input Tensor. m: A 1D mean Tensor with size matching the last dimension of t. This is the first output from tf.nn.moments, or a saved moving average thereof. v: A 1D variance Tensor with size matching the last dimension of t. This is the second output from tf.nn.moments, or a saved moving average thereof. beta: A 1D beta Tensor with size matching the last dimension of t. An offset to be added to the normalized tensor. gamma: A 1D gamma Tensor with size matching the last dimension of t. If "scale_after_normalization" is true, this tensor will be multiplied with the normalized tensor. variance_epsilon: A small float number to avoid dividing by 0. scale_after_normalization: A bool indicating whether the resulted tensor needs to be multiplied with gamma. name: A name for this operation (optional). Returns: A batch-normalized t. """ return batch_normalization(t, m, v, beta, gamma if scale_after_normalization else None, variance_epsilon, name) # pylint: disable=redefined-builtin,line-too-long @tf_export("nn.batch_norm_with_global_normalization", v1=[]) def batch_norm_with_global_normalization_v2(input, mean, variance, beta, gamma, variance_epsilon, scale_after_normalization, name=None): """Batch normalization. This op is deprecated. See tf.nn.batch_normalization. Args: input: A 4D input Tensor. mean: A 1D mean Tensor with size matching the last dimension of t. This is the first output from tf.nn.moments, or a saved moving average thereof. variance: A 1D variance Tensor with size matching the last dimension of t. This is the second output from tf.nn.moments, or a saved moving average thereof. beta: A 1D beta Tensor with size matching the last dimension of t. An offset to be added to the normalized tensor. gamma: A 1D gamma Tensor with size matching the last dimension of t. If "scale_after_normalization" is true, this tensor will be multiplied with the normalized tensor. variance_epsilon: A small float number to avoid dividing by 0. scale_after_normalization: A bool indicating whether the resulted tensor needs to be multiplied with gamma. name: A name for this operation (optional). Returns: A batch-normalized t. """ return batch_norm_with_global_normalization(t=input, m=mean, v=variance, beta=beta, gamma=gamma, variance_epsilon=variance_epsilon, scale_after_normalization=scale_after_normalization, name=name) # pylint: enable=redefined-builtin,line-too-long def _sum_rows(x): """Returns a vector summing up each row of the matrix x.""" # _sum_rows(x) is equivalent to math_ops.reduce_sum(x, 1) when x is # a matrix. The gradient of _sum_rows(x) is more efficient than # reduce_sum(x, 1)'s gradient in today's implementation. Therefore, # we use _sum_rows(x) in the nce_loss() computation since the loss # is mostly used for training. cols = array_ops.shape(x)[1] ones_shape = array_ops.stack([cols, 1]) ones = array_ops.ones(ones_shape, x.dtype) return array_ops.reshape(math_ops.matmul(x, ones), [-1]) def _compute_sampled_logits(weights, biases, labels, inputs, num_sampled, num_classes, num_true=1, sampled_values=None, subtract_log_q=True, remove_accidental_hits=False, partition_strategy="mod", name=None, seed=None): """Helper function for nce_loss and sampled_softmax_loss functions. Computes sampled output training logits and labels suitable for implementing e.g. noise-contrastive estimation (see nce_loss) or sampled softmax (see sampled_softmax_loss). Note: In the case where num_true > 1, we assign to each target class the target probability 1 / num_true so that the target probabilities sum to 1 per-example. Args: weights: A Tensor of shape [num_classes, dim], or a list of Tensor objects whose concatenation along dimension 0 has shape [num_classes, dim]. The (possibly-partitioned) class embeddings. biases: A Tensor of shape [num_classes]. The (possibly-partitioned) class biases. labels: A Tensor of type int64 and shape [batch_size, num_true]. The target classes. Note that this format differs from the labels argument of nn.softmax_cross_entropy_with_logits_v2. inputs: A Tensor of shape [batch_size, dim]. The forward activations of the input network. num_sampled: An int. The number of classes to randomly sample per batch. num_classes: An int. The number of possible classes. num_true: An int. The number of target classes per training example. sampled_values: a tuple of (sampled_candidates, true_expected_count, sampled_expected_count) returned by a *_candidate_sampler function. (if None, we default to log_uniform_candidate_sampler) subtract_log_q: A bool. whether to subtract the log expected count of the labels in the sample to get the logits of the true labels. Default is True. Turn off for Negative Sampling. remove_accidental_hits: A bool. whether to remove "accidental hits" where a sampled class equals one of the target classes. Default is False. partition_strategy: A string specifying the partitioning strategy, relevant if len(weights) > 1. Currently "div" and "mod" are supported. Default is "mod". See tf.nn.embedding_lookup for more details. name: A name for the operation (optional). seed: random seed for candidate sampling. Default to None, which doesn't set the op-level random seed for candidate sampling. Returns: out_logits: Tensor object with shape [batch_size, num_true + num_sampled], for passing to either nn.sigmoid_cross_entropy_with_logits (NCE) or nn.softmax_cross_entropy_with_logits_v2 (sampled softmax). out_labels: A Tensor object with the same shape as out_logits. """ if isinstance(weights, variables.PartitionedVariable): weights = list(weights) if not isinstance(weights, list): weights = [weights] with ops.name_scope(name, "compute_sampled_logits", weights + [biases, inputs, labels]): if labels.dtype != dtypes.int64: labels = math_ops.cast(labels, dtypes.int64) labels_flat = array_ops.reshape(labels, [-1]) # Sample the negative labels. # sampled shape: [num_sampled] tensor # true_expected_count shape = [batch_size, 1] tensor # sampled_expected_count shape = [num_sampled] tensor if sampled_values is None: sampled_values = candidate_sampling_ops.log_uniform_candidate_sampler( true_classes=labels, num_true=num_true, num_sampled=num_sampled, unique=True, range_max=num_classes, seed=seed) # NOTE: pylint cannot tell that 'sampled_values' is a sequence # pylint: disable=unpacking-non-sequence sampled, true_expected_count, sampled_expected_count = ( array_ops.stop_gradient(s) for s in sampled_values) # pylint: enable=unpacking-non-sequence sampled = math_ops.cast(sampled, dtypes.int64) # labels_flat is a [batch_size * num_true] tensor # sampled is a [num_sampled] int tensor all_ids = array_ops.concat([labels_flat, sampled], 0) # Retrieve the true weights and the logits of the sampled weights. # weights shape is [num_classes, dim] all_w = embedding_ops.embedding_lookup( weights, all_ids, partition_strategy=partition_strategy) # true_w shape is [batch_size * num_true, dim] true_w = array_ops.slice(all_w, [0, 0], array_ops.stack( [array_ops.shape(labels_flat)[0], -1])) sampled_w = array_ops.slice( all_w, array_ops.stack([array_ops.shape(labels_flat)[0], 0]), [-1, -1]) # inputs has shape [batch_size, dim] # sampled_w has shape [num_sampled, dim] # Apply X*W', which yields [batch_size, num_sampled] sampled_logits = math_ops.matmul(inputs, sampled_w, transpose_b=True) # Retrieve the true and sampled biases, compute the true logits, and # add the biases to the true and sampled logits. all_b = embedding_ops.embedding_lookup( biases, all_ids, partition_strategy=partition_strategy) # true_b is a [batch_size * num_true] tensor # sampled_b is a [num_sampled] float tensor true_b = array_ops.slice(all_b, [0], array_ops.shape(labels_flat)) sampled_b = array_ops.slice(all_b, array_ops.shape(labels_flat), [-1]) # inputs shape is [batch_size, dim] # true_w shape is [batch_size * num_true, dim] # row_wise_dots is [batch_size, num_true, dim] dim = array_ops.shape(true_w)[1:2] new_true_w_shape = array_ops.concat([[-1, num_true], dim], 0) row_wise_dots = math_ops.multiply( array_ops.expand_dims(inputs, 1), array_ops.reshape(true_w, new_true_w_shape)) # We want the row-wise dot plus biases which yields a # [batch_size, num_true] tensor of true_logits. dots_as_matrix = array_ops.reshape(row_wise_dots, array_ops.concat([[-1], dim], 0)) true_logits = array_ops.reshape(_sum_rows(dots_as_matrix), [-1, num_true]) true_b = array_ops.reshape(true_b, [-1, num_true]) true_logits += true_b sampled_logits += sampled_b if remove_accidental_hits: acc_hits = candidate_sampling_ops.compute_accidental_hits( labels, sampled, num_true=num_true) acc_indices, acc_ids, acc_weights = acc_hits # This is how SparseToDense expects the indices. acc_indices_2d = array_ops.reshape(acc_indices, [-1, 1]) acc_ids_2d_int32 = array_ops.reshape( math_ops.cast(acc_ids, dtypes.int32), [-1, 1]) sparse_indices = array_ops.concat([acc_indices_2d, acc_ids_2d_int32], 1, "sparse_indices") # Create sampled_logits_shape = [batch_size, num_sampled] sampled_logits_shape = array_ops.concat( [array_ops.shape(labels)[:1], array_ops.expand_dims(num_sampled, 0)], 0) if sampled_logits.dtype != acc_weights.dtype: acc_weights = math_ops.cast(acc_weights, sampled_logits.dtype) sampled_logits += sparse_ops.sparse_to_dense( sparse_indices, sampled_logits_shape, acc_weights, default_value=0.0, validate_indices=False) if subtract_log_q: # Subtract log of Q(l), prior probability that l appears in sampled. true_logits -= math_ops.log(true_expected_count) sampled_logits -= math_ops.log(sampled_expected_count) # Construct output logits and labels. The true labels/logits start at col 0. out_logits = array_ops.concat([true_logits, sampled_logits], 1) # true_logits is a float tensor, ones_like(true_logits) is a float # tensor of ones. We then divide by num_true to ensure the per-example # labels sum to 1.0, i.e. form a proper probability distribution. out_labels = array_ops.concat([ array_ops.ones_like(true_logits) / num_true, array_ops.zeros_like(sampled_logits) ], 1) return out_logits, out_labels @tf_export("nn.nce_loss", v1=[]) def nce_loss_v2(weights, biases, labels, inputs, num_sampled, num_classes, num_true=1, sampled_values=None, remove_accidental_hits=False, name="nce_loss"): """Computes and returns the noise-contrastive estimation training loss. See [Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf). Also see our [Candidate Sampling Algorithms Reference](https://www.tensorflow.org/extras/candidate_sampling.pdf) A common use case is to use this method for training, and calculate the full sigmoid loss for evaluation or inference as in the following example: python if mode == "train": loss = tf.nn.nce_loss( weights=weights, biases=biases, labels=labels, inputs=inputs, ...) elif mode == "eval": logits = tf.matmul(inputs, tf.transpose(weights)) logits = tf.nn.bias_add(logits, biases) labels_one_hot = tf.one_hot(labels, n_classes) loss = tf.nn.sigmoid_cross_entropy_with_logits( labels=labels_one_hot, logits=logits) loss = tf.reduce_sum(loss, axis=1)  Note: when doing embedding lookup on weights and bias, "div" partition strategy will be used. Support for other partition strategy will be added later. Note: By default this uses a log-uniform (Zipfian) distribution for sampling, so your labels must be sorted in order of decreasing frequency to achieve good results. For more details, see tf.nn.log_uniform_candidate_sampler. Note: In the case where num_true > 1, we assign to each target class the target probability 1 / num_true so that the target probabilities sum to 1 per-example. Note: It would be useful to allow a variable number of target classes per example. We hope to provide this functionality in a future release. For now, if you have a variable number of target classes, you can pad them out to a constant number by either repeating them or by padding with an otherwise unused class. Args: weights: A Tensor of shape [num_classes, dim], or a list of Tensor objects whose concatenation along dimension 0 has shape [num_classes, dim]. The (possibly-partitioned) class embeddings. biases: A Tensor of shape [num_classes]. The class biases. labels: A Tensor of type int64 and shape [batch_size, num_true]. The target classes. inputs: A Tensor of shape [batch_size, dim]. The forward activations of the input network. num_sampled: An int. The number of negative classes to randomly sample per batch. This single sample of negative classes is evaluated for each element in the batch. num_classes: An int. The number of possible classes. num_true: An int. The number of target classes per training example. sampled_values: a tuple of (sampled_candidates, true_expected_count, sampled_expected_count) returned by a *_candidate_sampler function. (if None, we default to log_uniform_candidate_sampler) remove_accidental_hits: A bool. Whether to remove "accidental hits" where a sampled class equals one of the target classes. If set to True, this is a "Sampled Logistic" loss instead of NCE, and we are learning to generate log-odds instead of log probabilities. See our [Candidate Sampling Algorithms Reference] (https://www.tensorflow.org/extras/candidate_sampling.pdf). Default is False. name: A name for the operation (optional). Returns: A batch_size 1-D tensor of per-example NCE losses. """ # TODO(yuefengz): get partition_strategy from either variables or distribution # strategies. return nce_loss( weights, biases, labels, inputs, num_sampled, num_classes, num_true=num_true, sampled_values=sampled_values, remove_accidental_hits=remove_accidental_hits, partition_strategy="div", name=name) @tf_export(v1=["nn.nce_loss"]) def nce_loss(weights, biases, labels, inputs, num_sampled, num_classes, num_true=1, sampled_values=None, remove_accidental_hits=False, partition_strategy="mod", name="nce_loss"): """Computes and returns the noise-contrastive estimation training loss. See [Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf). Also see our [Candidate Sampling Algorithms Reference](https://www.tensorflow.org/extras/candidate_sampling.pdf) A common use case is to use this method for training, and calculate the full sigmoid loss for evaluation or inference. In this case, you must set partition_strategy="div" for the two losses to be consistent, as in the following example: python if mode == "train": loss = tf.nn.nce_loss( weights=weights, biases=biases, labels=labels, inputs=inputs, ..., partition_strategy="div") elif mode == "eval": logits = tf.matmul(inputs, tf.transpose(weights)) logits = tf.nn.bias_add(logits, biases) labels_one_hot = tf.one_hot(labels, n_classes) loss = tf.nn.sigmoid_cross_entropy_with_logits( labels=labels_one_hot, logits=logits) loss = tf.reduce_sum(loss, axis=1)  Note: By default this uses a log-uniform (Zipfian) distribution for sampling, so your labels must be sorted in order of decreasing frequency to achieve good results. For more details, see tf.nn.log_uniform_candidate_sampler. Note: In the case where num_true > 1, we assign to each target class the target probability 1 / num_true so that the target probabilities sum to 1 per-example. Note: It would be useful to allow a variable number of target classes per example. We hope to provide this functionality in a future release. For now, if you have a variable number of target classes, you can pad them out to a constant number by either repeating them or by padding with an otherwise unused class. Args: weights: A Tensor of shape [num_classes, dim], or a list of Tensor objects whose concatenation along dimension 0 has shape [num_classes, dim]. The (possibly-partitioned) class embeddings. biases: A Tensor of shape [num_classes]. The class biases. labels: A Tensor of type int64 and shape [batch_size, num_true]. The target classes. inputs: A Tensor of shape [batch_size, dim]. The forward activations of the input network. num_sampled: An int. The number of negative classes to randomly sample per batch. This single sample of negative classes is evaluated for each element in the batch. num_classes: An int. The number of possible classes. num_true: An int. The number of target classes per training example. sampled_values: a tuple of (sampled_candidates, true_expected_count, sampled_expected_count) returned by a *_candidate_sampler function. (if None, we default to log_uniform_candidate_sampler) remove_accidental_hits: A bool. Whether to remove "accidental hits" where a sampled class equals one of the target classes. If set to True, this is a "Sampled Logistic" loss instead of NCE, and we are learning to generate log-odds instead of log probabilities. See our [Candidate Sampling Algorithms Reference] (https://www.tensorflow.org/extras/candidate_sampling.pdf). Default is False. partition_strategy: A string specifying the partitioning strategy, relevant if len(weights) > 1. Currently "div" and "mod" are supported. Default is "mod". See tf.nn.embedding_lookup for more details. name: A name for the operation (optional). Returns: A batch_size 1-D tensor of per-example NCE losses. """ logits, labels = _compute_sampled_logits( weights=weights, biases=biases, labels=labels, inputs=inputs, num_sampled=num_sampled, num_classes=num_classes, num_true=num_true, sampled_values=sampled_values, subtract_log_q=True, remove_accidental_hits=remove_accidental_hits, partition_strategy=partition_strategy, name=name) sampled_losses = sigmoid_cross_entropy_with_logits( labels=labels, logits=logits, name="sampled_losses") # sampled_losses is batch_size x {true_loss, sampled_losses...} # We sum out true and sampled losses. return _sum_rows(sampled_losses) @tf_export("nn.sampled_softmax_loss", v1=[]) def sampled_softmax_loss_v2(weights, biases, labels, inputs, num_sampled, num_classes, num_true=1, sampled_values=None, remove_accidental_hits=True, seed=None, name="sampled_softmax_loss"): """Computes and returns the sampled softmax training loss. This is a faster way to train a softmax classifier over a huge number of classes. This operation is for training only. It is generally an underestimate of the full softmax loss. A common use case is to use this method for training, and calculate the full sigmoid loss for evaluation or inference as in the following example: python if mode == "train": loss = tf.nn.sampled_softmax_loss( weights=weights, biases=biases, labels=labels, inputs=inputs, ...) elif mode == "eval": logits = tf.matmul(inputs, tf.transpose(weights)) logits = tf.nn.bias_add(logits, biases) labels_one_hot = tf.one_hot(labels, n_classes) loss = tf.nn.softmax_cross_entropy_with_logits_v2( labels=labels_one_hot, logits=logits)  See our [Candidate Sampling Algorithms Reference] (https://www.tensorflow.org/extras/candidate_sampling.pdf) Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007) ([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math. Note: when doing embedding lookup on weights and bias, "div" partition strategy will be used. Support for other partition strategy will be added later. Args: weights: A Tensor of shape [num_classes, dim], or a list of Tensor objects whose concatenation along dimension 0 has shape [num_classes, dim]. The (possibly-sharded) class embeddings. biases: A Tensor of shape [num_classes]. The class biases. labels: A Tensor of type int64 and shape [batch_size, num_true]. The target classes. Note that this format differs from the labels argument of nn.softmax_cross_entropy_with_logits_v2. inputs: A Tensor of shape [batch_size, dim]. The forward activations of the input network. num_sampled: An int. The number of classes to randomly sample per batch. num_classes: An int. The number of possible classes. num_true: An int. The number of target classes per training example. sampled_values: a tuple of (sampled_candidates, true_expected_count, sampled_expected_count) returned by a *_candidate_sampler function. (if None, we default to log_uniform_candidate_sampler) remove_accidental_hits: A bool. whether to remove "accidental hits" where a sampled class equals one of the target classes. Default is True. seed: random seed for candidate sampling. Default to None, which doesn't set the op-level random seed for candidate sampling. name: A name for the operation (optional). Returns: A batch_size 1-D tensor of per-example sampled softmax losses. """ return sampled_softmax_loss( weights, biases, labels, inputs, num_sampled, num_classes, num_true=num_true, sampled_values=sampled_values, remove_accidental_hits=remove_accidental_hits, partition_strategy="div", name=name, seed=seed) @tf_export(v1=["nn.sampled_softmax_loss"]) def sampled_softmax_loss(weights, biases, labels, inputs, num_sampled, num_classes, num_true=1, sampled_values=None, remove_accidental_hits=True, partition_strategy="mod", name="sampled_softmax_loss", seed=None): """Computes and returns the sampled softmax training loss. This is a faster way to train a softmax classifier over a huge number of classes. This operation is for training only. It is generally an underestimate of the full softmax loss. A common use case is to use this method for training, and calculate the full softmax loss for evaluation or inference. In this case, you must set partition_strategy="div" for the two losses to be consistent, as in the following example: python if mode == "train": loss = tf.nn.sampled_softmax_loss( weights=weights, biases=biases, labels=labels, inputs=inputs, ..., partition_strategy="div") elif mode == "eval": logits = tf.matmul(inputs, tf.transpose(weights)) logits = tf.nn.bias_add(logits, biases) labels_one_hot = tf.one_hot(labels, n_classes) loss = tf.nn.softmax_cross_entropy_with_logits_v2( labels=labels_one_hot, logits=logits)  See our [Candidate Sampling Algorithms Reference] (https://www.tensorflow.org/extras/candidate_sampling.pdf) Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007) ([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math. Args: weights: A Tensor of shape [num_classes, dim], or a list of Tensor objects whose concatenation along dimension 0 has shape [num_classes, dim]. The (possibly-sharded) class embeddings. biases: A Tensor of shape [num_classes]. The class biases. labels: A Tensor of type int64 and shape [batch_size, num_true]. The target classes. Note that this format differs from the labels argument of nn.softmax_cross_entropy_with_logits_v2. inputs: A Tensor of shape [batch_size, dim]. The forward activations of the input network. num_sampled: An int. The number of classes to randomly sample per batch. num_classes: An int. The number of possible classes. num_true: An int. The number of target classes per training example. sampled_values: a tuple of (sampled_candidates, true_expected_count, sampled_expected_count) returned by a *_candidate_sampler function. (if None, we default to log_uniform_candidate_sampler) remove_accidental_hits: A bool. whether to remove "accidental hits" where a sampled class equals one of the target classes. Default is True. partition_strategy: A string specifying the partitioning strategy, relevant if len(weights) > 1. Currently "div" and "mod" are supported. Default is "mod". See tf.nn.embedding_lookup for more details. name: A name for the operation (optional). seed: random seed for candidate sampling. Default to None, which doesn't set the op-level random seed for candidate sampling. Returns: A batch_size 1-D tensor of per-example sampled softmax losses. """ logits, labels = _compute_sampled_logits( weights=weights, biases=biases, labels=labels, inputs=inputs, num_sampled=num_sampled, num_classes=num_classes, num_true=num_true, sampled_values=sampled_values, subtract_log_q=True, remove_accidental_hits=remove_accidental_hits, partition_strategy=partition_strategy, name=name, seed=seed) labels = array_ops.stop_gradient(labels, name="labels_stop_gradient") sampled_losses = nn_ops.softmax_cross_entropy_with_logits_v2( labels=labels, logits=logits) # sampled_losses is a [batch_size] tensor. return sampled_losses