# Calculating predictive mean from BNN with Dropout layers

In [47]:
import tensorflow as tf
from tensorflow import keras

## A test network with Dropout layers

In [48]:
class CustomModel(keras.Model):
    """Small fully connected network with Dropout after the 2nd and 3rd hidden layers.

    Architecture: Dense(128) -> Dense(32) -> Dropout(0.5) -> Dense(16)
    -> Dropout(0.25) -> Dense(1).

    Args:
        output_activation: Activation passed to the final Dense(1) layer
            (None means a linear output).
        activation: Activation passed to each of the three hidden Dense layers
            (None means linear hidden layers).
        **kwargs: Forwarded unchanged to ``keras.Model.__init__``.
    """

    def __init__(self, output_activation=None, activation=None, **kwargs):
        super().__init__(**kwargs)
        self.hidden1 = keras.layers.Dense(128, activation=activation)

        self.hidden2 = keras.layers.Dense(32, activation=activation)
        self.stochastic_layer2 = keras.layers.Dropout(rate=0.5)

        self.hidden3 = keras.layers.Dense(16, activation=activation)
        self.stochastic_layer3 = keras.layers.Dropout(rate=0.25)

        self.output_layer = keras.layers.Dense(1, activation=output_activation)

    # The forward pass lives in `call`, not `__call__`: overriding `__call__`
    # bypasses the wrapper that `keras.Model.__call__` provides around `call`
    # (building the model, resolving the `training` flag, tracing, ...).
    # `model(x, training=...)` keeps working exactly as before.
    # The first parameter keeps its original name (`input`, which shadows the
    # builtin inside this method) so existing keyword callers do not break.
    def call(self, input, training=None):
        """Run the forward pass.

        Args:
            input: Batch of inputs fed to the first Dense layer.
            training: Forwarded to both Dropout layers. With ``True`` the
                Dropout layers are active and the output is stochastic; with
                ``False`` they are disabled and the output is deterministic.

        Returns:
            The output of the final Dense(1) layer.
        """
        hidden1 = self.hidden1(input)

        hidden2 = self.hidden2(hidden1)
        hidden2 = self.stochastic_layer2(hidden2, training=training)

        hidden3 = self.hidden3(hidden2)
        hidden3 = self.stochastic_layer3(hidden3, training=training)

        return self.output_layer(hidden3)


## Sample input for which we will analyse predictive distributions

In [49]:
# A single input row with three features, all set to 1.
x = tf.ones(shape=(1, 3))

## Results for a model with no non-linearities after dropout

In [50]:
# Purely linear network: no activation on the hidden layers or on the output.
model = CustomModel(output_activation=None, activation=None)

Let's look at a few samples from the predictive distribution:

In [51]:
# training=True is forwarded to the Dropout layers, so this is one stochastic sample.
model(x, training=True)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.7696812]], dtype=float32)>

In [52]:
# Same call again: a new dropout mask is drawn, so the output differs.
model(x, training=True)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.35197103]], dtype=float32)>

In [53]:
# One more stochastic sample for the same input.
model(x, training=True)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.67026234]], dtype=float32)>

In [54]:
# Monte Carlo estimate of the predictive mean: repeat the input row 1,000,000
# times so a single stochastic forward pass yields that many samples.
samples = model(tf.tile(x, multiples=[1_000_000, 1]), training=True)
tf.math.reduce_mean(samples)

<tf.Tensor: shape=(), dtype=float32, numpy=-0.31734434>

In [55]:
# Deterministic forward pass: training=False switches the Dropout layers off,
# i.e. each one passes its input through unchanged ("frozen" dropout).
model(x, training=False)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.31770444]], dtype=float32)>

**If there are no nonlinearities in the network, the mean estimated from many output samples matches (up to Monte Carlo error) the output obtained deterministically by switching the Dropout layers to inference mode (training=False), in which each layer returns its mean.**


## Results for a model with nonlinear activations

In [56]:
# Network with nonlinearities: ReLU on the hidden layers, tanh on the output.
# Try other output activations here to see how the gap between the two means changes.
model = CustomModel(activation="relu", output_activation="tanh")

Let's look at a few samples from the predictive distribution:

In [57]:
# training=True is forwarded to the Dropout layers, so this is one stochastic sample.
model(x, training=True)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.4962887]], dtype=float32)>

In [58]:
# Same call again: a new dropout mask is drawn, so the output differs.
model(x, training=True)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.20800395]], dtype=float32)>

In [59]:
# One more stochastic sample for the same input.
model(x, training=True)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.08432994]], dtype=float32)>

In [60]:
# Monte Carlo estimate of the predictive mean: repeat the input row 1,000,000
# times so a single stochastic forward pass yields that many samples.
samples = model(tf.tile(x, multiples=[1_000_000, 1]), training=True)
tf.math.reduce_mean(samples)

<tf.Tensor: shape=(), dtype=float32, numpy=-0.1665587>

In [61]:
# Deterministic forward pass: training=False switches the Dropout layers off,
# i.e. each one passes its input through unchanged ("frozen" dropout).
model(x, training=False)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.2317547]], dtype=float32)>

**If there are nonlinearities in the network, the mean estimated from many output samples differs from the output obtained deterministically by switching the Dropout layers to inference mode (training=False), in which each layer returns its mean. This happens because a nonlinearity (= activation) applied to an expectation (= layer mean) is, in general, not equal to the expectation of the nonlinearity applied to samples from the layer: $f(\mathbb{E}[h]) \neq \mathbb{E}[f(h)]$.**
