In [1]:
% matplotlib inline
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

# Set a random seed for reproduction
np.random.seed(seed=0)
tf.set_random_seed(seed=0)

In [4]:
# Load the MNIST data set (files are read from the local 'MNIST_data'
# directory) with the labels encoded as one-hot vectors.
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets(train_dir='MNIST_data', one_hot=True)

# Number of examples in the training split
n_samples = mnist.train.num_examples

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [14]:
def xavier_init(fan_in, fan_out, constant=1):
    """
    Create a weight matrix using Xavier (Glorot) uniform initialization.

    The entries are drawn uniformly from [-limit, limit) with
    limit = constant * sqrt(6 / (fan_in + fan_out)).

    @param fan_in: number of incoming connections
    @param fan_out: number of outgoing connections
    @param constant: scaling factor applied to the sampling limit

    output: a tf.Variable of shape (fan_in, fan_out) and dtype float32
    """
    # The sampling interval is symmetric around zero, so one bound suffices.
    limit = constant*np.sqrt(6.0/(fan_in+fan_out))
    initial_values = tf.random_uniform(
        (fan_in, fan_out),
        minval=-limit,
        maxval=limit,
        dtype=tf.float32,
    )
    return tf.Variable(initial_values)

In [2]:
class VAE(object):
    """
    
    """
    
    def __init__(self, net_architecture, activation_fct=tf.nn.softplus, learning_rate=.001, batch_size=100):
        """
        Store the hyper-parameters and create the input placeholder.

        @param net_architecture: dictionary with the layer sizes of the network.
            It must contain the key "n_input" and is unpacked as keyword
            arguments into _init_weights.
        @param activation_fct: activation function applied to the hidden layers
        @param learning_rate: learning rate (only stored here)
        @param batch_size: number of samples per mini-batch (only stored here)
        """

        # Network parameters
        self.net_architecture = net_architecture
        self.activation_fct = activation_fct
        self.learning_rate = learning_rate
        self.batch_size = batch_size

        # Network input: an arbitrary number of rows with n_input features each.
        # The size is read from the constructor argument `net_architecture`.
        self.x = tf.placeholder(tf.float32, shape=[None, net_architecture["n_input"]])
        
    def _create_net(self):
        """
        Create the network weights and the noise tensor used to sample z.

        NOTE: this method is still a stub. It does not yet call the encoder
        or decoder networks and does not store or return anything.
        """

        # Initialize all weights and biases of the VAE
        net_weights = self._init_weights(**self.net_architecture)

        # Use the encoder network to compute the mean and variance of a Gaussian distribution
        # From this we can sample a value of z
        # z = mu + sigma*epsilon

        # Dimensionality of the latent space; the same dictionary entry is
        # passed to _init_weights as the `n_z` keyword argument.
        n_z = self.net_architecture["n_z"]

        # One noise vector epsilon per sample in the batch
        # TODO: combine eps with the encoder output to obtain z
        eps = tf.random_normal((self.batch_size, n_z))
        
    def _init_weights(self, n_input, n_z, h1_encoder, h2_encoder, h1_decoder, h2_decoder):
        """
        Create every weight matrix and bias vector of the VAE.

        Weight matrices are created with xavier_init, bias vectors start at
        zero. The parameters are grouped into four dictionaries which are
        collected in one outer dictionary with the keys 'weights_encoder',
        'biases_encoder', 'weights_decoder' and 'biases_decoder'.

        @param n_input: dimensionality of input data
        @param n_z: dimensionality of latent space
        @param h1_encoder: output size of the first encoder layer
        @param h2_encoder: output size of the second encoder layer
        @param h1_decoder: output size of the first decoder layer
        @param h2_decoder: output size of the second decoder layer

        returns: dictionary of dictionaries
        """

        def zero_bias(size):
            # Bias vector of length `size`, initialised with zeros
            return tf.Variable(tf.zeros([size], dtype=tf.float32))

        # The variables are created in the order in which they are listed.
        return {
            'weights_encoder': {
                'h1': xavier_init(n_input, h1_encoder),
                'h2': xavier_init(h1_encoder, h2_encoder),
                'means': xavier_init(h2_encoder, n_z),
                'sigmas': xavier_init(h2_encoder, n_z),
            },
            'biases_encoder': {
                'b1': zero_bias(h1_encoder),
                'b2': zero_bias(h2_encoder),
                'means': zero_bias(n_z),
                'sigmas': zero_bias(n_z),
            },
            'weights_decoder': {
                'h1': xavier_init(n_z, h1_decoder),
                'h2': xavier_init(h1_decoder, h2_decoder),
                'means': xavier_init(h2_decoder, n_input),
                'sigmas': xavier_init(h2_decoder, n_input),
            },
            'biases_decoder': {
                'b1': zero_bias(h1_decoder),
                'b2': zero_bias(h2_decoder),
                'means': zero_bias(n_input),
                'sigmas': zero_bias(n_input),
            },
        }
    
    
    def _encoder_network(self, weights, biases):
        """
        Encoder: maps the input self.x onto the parameters of q_phi(z|x).

        The true posterior p(z|x) is approximated by the variational
        distribution q_phi(z|x), usually a Gaussian with diagonal covariance
        matrix. Its parameters are produced by a neural network with input x,
        and values of z can then be sampled from it, i.e. z ~ q_phi(z|x).

        @param weights: dictionary with the weight matrices 'h1', 'h2',
            'means' and 'sigmas'
        @param biases: dictionary with the bias vectors 'b1', 'b2',
            'means' and 'sigmas'

        returns: tuple (means, log_sigmas)
        """

        def affine(inputs, weight_key, bias_key):
            # Linear combination of the inputs and weights plus the bias
            return tf.add(tf.matmul(inputs, weights[weight_key]), biases[bias_key])

        # Two hidden layers: activation function applied to an affine map
        hidden1 = self.activation_fct(affine(self.x, 'h1', 'b1'))
        hidden2 = self.activation_fct(affine(hidden1, 'h2', 'b2'))

        # Both output heads are affine maps of the last hidden layer without
        # an activation function.
        means = affine(hidden2, 'means', 'means')
        # NOTE(review): presumably the log of the standard deviation (or of
        # the variance) of q_phi(z|x) - confirm once the sampling step exists.
        log_sigmas = affine(hidden2, 'sigmas', 'sigmas')

        return means, log_sigmas
        
  
    def _decoder_network(self, weights, biases):
        """
        The decoder network takes a latent variable z as an input and reproduces
        the input x. In our case, it maps z onto a Bernoulli distribution.

        The method reads self.z, which has to be set before it is called.

        @param weights: dictionary with the weight matrices 'h1', 'h2' and 'means'
        @param biases: dictionary with the bias vectors 'b1', 'b2' and 'means'

        returns: tensor with one value in (0, 1) per input dimension
        """

        # Again, the output of each layer is given by applying the activation
        # function to a linear combination of the input and weights.
        # The bias vectors are stored under the keys 'b1' and 'b2'.
        layer1 = self.activation_fct(tf.add(tf.matmul(self.z, weights['h1']), biases['b1']))
        layer2 = self.activation_fct(tf.add(tf.matmul(layer1, weights['h2']), biases['b2']))

        # Why is a single output enough here? A Bernoulli distribution has only
        # one parameter per dimension: the probability of a 1, which is also
        # its mean. The sigmoid squashes the output of the last hidden layer
        # (layer2) into (0, 1) so that it can be used as that probability.
        means_x = tf.nn.sigmoid(tf.add(tf.matmul(layer2, weights['means']), biases['means']))

        # The only output is therefore the vector of Bernoulli means of p(x|z).
        return means_x
        
        
    def _loss_optimizer(self):
        """
        The loss function has two terms
        
        1) The reconstruction loss: -log(p(x|z))
        
        2) The latent loss: Kullback-Leibler divergence between q_phi(z|x) and the prior
        p(z). This loss acts
        like a regularizer.
        
        """
        
        # Reconstruction loss
        reconstr_loss = -tf.reduce_sum(self.x * )
        
        
        