# NN tuning and Autoencoders

## Hyperparameter optimization with Keras

In [12]:
import keras
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from keras.utils import np_utils
from sklearn.model_selection import GridSearchCV

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras import optimizers

In [13]:
# Load the raw App Store dataset and prepare it for modelling in one chain.
app = (
    pd.read_csv("AppleStore.csv")
    # 'Unnamed: 0' is presumably a saved row index from the CSV export -- not a feature.
    .drop(columns=['Unnamed: 0'])
    .set_index('track_name')
    # Identifier / version columns are not used as predictors.
    .drop(columns=["id", "ver", "user_rating_ver", "rating_count_ver"])
    # One-hot encode every remaining object (string) column.
    .pipe(lambda frame: pd.get_dummies(
        frame,
        prefix_sep="_",
        columns=frame.select_dtypes(include=[object]).columns,
    ))
    # Round the rating to the nearest whole number (not "up"); this gives the
    # 6 levels of the target variable.
    .assign(user_rating=lambda frame: frame["user_rating"].round())
)
app.head(10)

Unnamed: 0_level_0,size_bytes,price,rating_count_tot,user_rating,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic,currency_USD,cont_rating_12+,...,prime_genre_News,prime_genre_Photo & Video,prime_genre_Productivity,prime_genre_Reference,prime_genre_Shopping,prime_genre_Social Networking,prime_genre_Sports,prime_genre_Travel,prime_genre_Utilities,prime_genre_Weather
track_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PAC-MAN Premium,100788224,3.99,21292,4.0,38,5,10,1,1,0,...,0,0,0,0,0,0,0,0,0,0
Evernote - stay organized,158578688,0.0,161065,4.0,37,5,23,1,1,0,...,0,0,1,0,0,0,0,0,0,0
"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,0.0,188583,4.0,37,5,3,1,1,0,...,0,0,0,0,0,0,0,0,0,1
"eBay: Best App to Buy, Sell, Save! Online Shopping",128512000,0.0,262241,4.0,37,5,9,1,1,1,...,0,0,0,0,1,0,0,0,0,0
Bible,92774400,0.0,985920,4.0,37,5,45,1,1,0,...,0,0,0,1,0,0,0,0,0,0
Shanghai Mahjong,10485713,0.99,8253,4.0,47,5,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
PayPal - Send and request money safely,227795968,0.0,119487,4.0,37,0,19,1,1,0,...,0,0,0,0,0,0,0,0,0,0
Pandora - Music & Radio,130242560,0.0,1126879,4.0,37,4,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
PCalc - The Best Calculator,49250304,9.99,1117,4.0,37,5,1,1,1,0,...,0,0,0,0,0,0,0,0,1,0
Ms. PAC-MAN,70023168,3.99,7885,4.0,38,0,10,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Prepare the target variable: map each rounded rating to an integer class
# index, then expand the indices into one-hot indicator columns.
encoder = LabelEncoder().fit(app["user_rating"])
y = np_utils.to_categorical(
    encoder.transform(app["user_rating"])
).astype(int)
y

array([[0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 0],
       ...,
       [0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1]])

In [15]:
# Feature matrix: every column except the target, as a plain NumPy array.
# (The train/test split happens in the next cell, once `seed` is defined.)
X = app.drop(columns=['user_rating']).values
X

array([[1.00788224e+08, 3.99000000e+00, 2.12920000e+04, ..., 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.58578688e+08, 0.00000000e+00, 1.61065000e+05, ..., 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.00524032e+08, 0.00000000e+00, 1.88583000e+05, ..., 0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       ...,
       [1.11322112e+08, 1.99000000e+00, 1.50000000e+01, ..., 0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [9.72359680e+07, 0.00000000e+00, 8.50000000e+01, ..., 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [9.08984320e+07, 0.00000000e+00, 3.00000000e+00, ..., 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [16]:
seed = 42  # random state for the split; also read by the weight initializers below

# Hold out 10% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=seed
)

# Standardization: the scaler is fitted on the training rows only, so no
# test-set statistics leak into training; both splits are then transformed.
scaler = StandardScaler().fit(X_train)
X_train, X_test = (scaler.transform(split) for split in (X_train, X_test))
X_train


array([[-0.36616427, -0.28935986, -0.15503309, ..., -0.10750387, -0.18839216, -0.10452536],
       [-0.51140646, -0.28935986, -0.1676582 , ..., -0.10750387, -0.18839216, -0.10452536],
       [ 0.96550784, -0.12505269, -0.1676582 , ..., -0.10750387, -0.18839216, -0.10452536],
       ...,
       [-0.02231683, -0.28935986, -0.12751288, ..., -0.10750387, -0.18839216, -0.10452536],
       [ 2.3095059 ,  0.87074831, -0.16722461, ..., -0.10750387, -0.18839216, -0.10452536],
       [-0.32059502, -0.28935986, -0.14234421, ..., -0.10750387, -0.18839216, -0.10452536]])

In your mini-challenge you are welcome to try out a powerful package called **talos** (https://github.com/autonomio/talos). Keep in mind that talos can take a very long time to run, so don't be too greedy with the number of parameters you want to try out.

We will do it in a slightly more old-fashioned way with Scikit Learn and will try out only two groups of hyperparameters: batches/epochs and optimizer algorithms.

In [17]:
# We reuse the architecture we had before and wrap it in a function called
# 'network' (or whatever you want to call it).

def network():
    """Build and compile the rating classifier.

    Architecture: two 128-unit ReLU hidden layers on a 35-feature input,
    followed by a 6-unit softmax output. Kernels use He-normal initialization
    seeded with the module-level `seed`. The model is compiled with SGD
    (learning rate 0.01), categorical cross-entropy loss and accuracy metric.

    Returns:
        The compiled Keras `Sequential` model.
    """
    layers = [
        Dense(128,
              input_shape=(35,),
              activation='relu',
              bias_initializer='zeros',
              kernel_initializer=keras.initializers.he_normal(seed=seed)),
        Dense(128,
              activation='relu',
              bias_initializer='zeros',
              kernel_initializer=keras.initializers.he_normal(seed=seed)),
        Dense(6,
              activation='softmax',
              kernel_initializer=keras.initializers.he_normal(seed=seed)),
    ]
    model = Sequential(layers)

    # Nothing changes here compared with the earlier, unwrapped model.
    model.compile(
        loss='categorical_crossentropy',
        optimizer=optimizers.SGD(lr=0.01),
        metrics=['accuracy'],
    )
    return model

# Plug the builder function into the Keras scikit-learn classifier wrapper.
model = KerasClassifier(build_fn=network, verbose=0)

In [18]:
# Candidate values for the two hyperparameters we tune in this search.
batchsize = [32, 64, 128, 256, 512]
epochs = [10, 30, 60]

# The grid is the search space: GridSearchCV tries every
# (batch_size, epochs) combination listed here.
param_grid = {
    "batch_size": batchsize,
    "epochs": epochs,
}
param_grid

{'batch_size': [32, 64, 128, 256, 512], 'epochs': [10, 30, 60]}

In [19]:
# Exhaustive search with 3-fold cross-validation; n_jobs=-1 parallelizes over
# all processors. With 5 batch sizes x 3 epoch settings x 3 folds this trains
# 45 networks, so expect it to take a while.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
)
# Only the training split is used: the test set must stay out of the CV.
results = grid.fit(X_train, y_train)
print("Best: %f using %s" % (
    results.best_score_,
    results.best_params_,
))

KeyboardInterrupt: 

In [None]:
#Let's now take this knowledge and apply it to our optimizers tuning

def optim(optimizer='SGD'):
    """Build the same classifier as before, compiled with a chosen optimizer.

    Args:
        optimizer: Passed straight to `model.compile(optimizer=...)`; the grid
            search supplies Keras optimizer names as strings. Defaults to 'SGD'.

    Returns:
        The compiled Keras `Sequential` model (35 inputs, two 128-unit ReLU
        layers, 6-unit softmax output).
    """
    layers = [
        Dense(128,
              input_shape=(35,),
              activation='relu',
              bias_initializer='zeros',
              kernel_initializer=keras.initializers.he_normal(seed=seed)),
        Dense(128,
              activation='relu',
              bias_initializer='zeros',
              kernel_initializer=keras.initializers.he_normal(seed=seed)),
        Dense(6,
              activation='softmax',
              kernel_initializer=keras.initializers.he_normal(seed=seed)),
    ]
    model = Sequential(layers)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy'],
    )
    return model

# Keep batch size and epochs fixed and search over the optimizer only.
model = KerasClassifier(build_fn=optim, epochs=30, batch_size=64, verbose=0)

# Define the grid search parameters: optimizers given by their Keras string names.
# NOTE: this list must not be named `optimizers` -- that name is bound to the
# imported `keras.optimizers` module, which network() uses for optimizers.SGD.
# Rebinding it to a list would break any later call to network().
optimizer_names = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
param_grid = dict(optimizer=optimizer_names)  # the search space for the grid
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
results = grid.fit(X_train, y_train)

# Our results (remember: best_score_ is the mean score on the held-out CV folds)
print("Best: %f using %s" % (results.best_score_, results.best_params_))

## Autoencoders

<img src="autoencoder_schema.jpg"  alt="loss" style="width: 500px;"/>

**Autoencoders** (AE) are one of the simpler neural network architectures that have nonetheless proven to be very useful, for example for data denoising and dimensionality reduction (think PCA). Most interestingly, AE can help with denoising and with pre-training before building another ML algorithm. 

This tutorial will build a lot on the _[Keras tutorial](https://blog.keras.io/building-autoencoders-in-keras.html)_ and this nice _[blogpost](https://ramhiser.com/post/2018-05-14-autoencoders-with-keras/)_, feel free to check it out.

An AE is a type of feedforward neural network whose output is trained to 'match' its input. You can think of autoencoders as data compressors that automatically learn a lossy compression from the data examples fed in (the compression being specific to those examples). There are several points that might seem confusing:
1. AE are data specific, so they only know how to work with data they have seen before, it's not a general model. If you taught it to compress pictures of people, it will not be able to handle numbers.
2. 'Lossy' means that decompressed outputs will be degraded compared to the original inputs (unlike lossless algorithmic compression).

AE is an unsupervised (or as some refer to it, 'self-supervised') algorithm, where labels are defined by inputs.

As you can judge from the illustration, three components are involved - they will also define the metaparameters choices we will have to make when building the NN structure:
1. Encoder
2. Decoder
3. Loss function that will calculate the information loss that happened at decoding

There are several types of AE, we will look into simple and deep AE (if curious, explore convolutional AE).

### Data
We will use the original MNIST dataset (hand-written digits) for this case.

In [20]:
from IPython.display import Image, SVG
import matplotlib.pyplot as plt
%matplotlib inline

from keras.datasets import mnist

# Load the MNIST training and test images; the class labels are discarded.
(X_train, _), (X_test, _) = mnist.load_data()

# Rescale the pixel values to the range [0, 1] by dividing by the largest
# training value (expected to be 255).
max_value = float(X_train.max())
X_train, X_test = (
    images.astype('float32') / max_value for images in (X_train, X_test)
)

# Similarly to Fashion-MNIST, the training and test sets contain 28x28 pictures.
X_train.shape, X_test.shape

((60000, 28, 28), (10000, 28, 28))

In [21]:
# Per-image shape (every axis after the sample axis). We will need to flatten
# these 2D images so that the 3D array becomes a matrix.
X_train.shape[1:]

(28, 28)

In [22]:
# First line: number of training images.
# Second line: values per image (np.prod multiplies the entries of the per-image shape).
print(len(X_train), np.prod(X_train.shape[1:]), sep="\n")


60000
784


In [23]:
# Widen NumPy's print width so that each 28-value row fits on a single line,
# then show the first image rounded to 2 decimals -- can you spot the number?
np.set_printoptions(linewidth=180)
np.round(X_train[0].reshape((28, 28)), 2)

array([[0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.01, 0.07, 0.07, 0.07, 0.49, 0.53,

In [24]:
# Flatten every image into one row vector, turning the 3D arrays into
# matrices: (n_samples, 28, 28) -> (n_samples, 784).
X_train = X_train.reshape(len(X_train), -1)
X_test = X_test.reshape(len(X_test), -1)

X_train.shape, X_test.shape

((60000, 784), (10000, 784))

In [25]:
# We will start with the simplest possible model - 1 level AE
# The encoder model reduces the dimension from the original 784-dimensional 
# vector to the encoded 32-dimensional vector. The decoder model restores the 
# dimension from the encoded 32-dimensional representation back to the original 
# 784-dimensional vector.

from keras.layers import Input, Dense
from keras.models import Model, Sequential

# Size of the encoded representation: 32 floats.
# Increasing it can improve the results, but if it grows too large the AE may
# overfit by simply copying its input. Because 32 is smaller than the input
# size, the network cannot copy the input directly and is forced to learn
# useful features.
encoding_dim = 32

# Number of values per input sample (the flattened image length).
input_dim = X_train.shape[1]

# Compression factor = input dimension / encoded dimension.
# In our case: 784 / 32 = 24.5
compression_factor = input_dim / float(encoding_dim)
print("Compression factor: %s" % compression_factor)


Compression factor: 24.5


Again, before we start building, we need to decide on several **metaparameters**:
1. Number of input nodes - defined by the number of pixels in our case
2. Number of layers for encoder and decoder parts. We will start with just 1 hidden layer and then go "deeper"
3. Number of nodes in each layer: usually it is symmetric, but not obligatory, we will experiment
4. Loss function: binary crossentropy if the input values are in the range [0, 1], otherwise MSE

In [26]:
# We keep using the Sequential API (not the functional API); the layers are
# simply handed over as a list.
autoencoder = Sequential([
    # Encoder: compress the input down to `encoding_dim` values.
    Dense(encoding_dim, activation='relu', input_shape=(input_dim,)),
    # Decoder: sigmoid, because the outputs must lie in [0, 1] like the inputs.
    Dense(input_dim, activation='sigmoid'),
])
autoencoder.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 32)                25120     
_________________________________________________________________
dense_2 (Dense)              (None, 784)               25872     
Total params: 50,992
Trainable params: 50,992
Non-trainable params: 0
_________________________________________________________________


In [27]:
# Stand-alone encoder model: a fresh input tensor is passed through the
# autoencoder's first Dense layer. The layer object itself is reused, so the
# encoder works with the very same weights the autoencoder trains.
input_img = Input(shape=(input_dim,))
encoder_layer = autoencoder.layers[0]
encoder = Model(inputs=input_img, outputs=encoder_layer(input_img))
encoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 784)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                25120     
Total params: 25,120
Trainable params: 25,120
Non-trainable params: 0
_________________________________________________________________


In [None]:
autoencoder.compile(loss='binary_crossentropy', optimizer='adam')

# Remember that the target is the same as the input: the network learns to
# reproduce what it is given. The test images serve as validation data.
autoencoder.fit(
    x=X_train,
    y=X_train,
    batch_size=256,
    epochs=10,
    shuffle=True,
    validation_data=(X_test, X_test),
)

Instructions for updating:
Use tf.cast instead.
Train on 60000 samples, validate on 10000 samples
Epoch 1/10


In [None]:
# Let's take 10 random test pictures to see how well our autoencoder did.

num_images = 10
np.random.seed(42)
random_test_images = np.random.randint(X_test.shape[0], size=num_images)

encoded_imgs = encoder.predict(X_test)
decoded_imgs = autoencoder.predict(X_test)

plt.figure(figsize=(18, 4))

# One column per sampled image, three rows:
#   row 0 - original image, row 1 - encoded representation (32 values shown
#   as an 8x4 grid), row 2 - reconstructed image.
for col, image_idx in enumerate(random_test_images):
    panels = (
        (X_test, (28, 28)),
        (encoded_imgs, (8, 4)),
        (decoded_imgs, (28, 28)),
    )
    for row, (images, shape) in enumerate(panels):
        ax = plt.subplot(3, num_images, row * num_images + col + 1)
        plt.imshow(images[image_idx].reshape(shape))
        plt.gray()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
plt.show()

In [None]:
# That was too simple, let's go "deep" -- which, as you remember, just means
# more hidden layers. The multipliers 4 and 2 are our numbers of choice.
autoencoder = Sequential([
    # Encoder layers: 4x -> 2x -> 1x encoding_dim
    Dense(4 * encoding_dim, activation='relu', input_shape=(input_dim,)),
    Dense(2 * encoding_dim, activation='relu'),
    Dense(encoding_dim, activation='relu'),
    # Decoder layers: mirror the encoder back up to the input size
    Dense(2 * encoding_dim, activation='relu'),
    Dense(4 * encoding_dim, activation='relu'),
    Dense(input_dim, activation='sigmoid'),
])

autoencoder.summary()

In [None]:
# Encoder model for the deep AE: chain the first three (encoder) layers of the
# autoencoder on a fresh input tensor. The layer objects are reused, so the
# encoder works with the weights the autoencoder trains below.
input_img = Input(shape=(input_dim,))
encoder_layer1, encoder_layer2, encoder_layer3 = autoencoder.layers[:3]

encoded = input_img
for layer in (encoder_layer1, encoder_layer2, encoder_layer3):
    encoded = layer(encoded)
encoder = Model(input_img, encoded)

autoencoder.compile(loss='binary_crossentropy', optimizer='adam')
# As before, the input doubles as the target.
autoencoder.fit(
    X_train,
    X_train,
    batch_size=256,
    epochs=50,
    validation_data=(X_test, X_test),
)

In [None]:
# Repeat the previous exercise: look at some random test images together with
# their encoded and decoded versions, this time for the deep autoencoder.
num_images = 10
np.random.seed(42)
random_test_images = np.random.randint(X_test.shape[0], size=num_images)

encoded_imgs = encoder.predict(X_test)
decoded_imgs = autoencoder.predict(X_test)

plt.figure(figsize=(18, 4))

# Rows: 0 = original image, 1 = encoded representation (8x4), 2 = reconstruction.
row_sources = [
    (X_test, (28, 28)),
    (encoded_imgs, (8, 4)),
    (decoded_imgs, (28, 28)),
]
for col, image_idx in enumerate(random_test_images):
    for row, (images, shape) in enumerate(row_sources):
        ax = plt.subplot(3, num_images, row * num_images + col + 1)
        plt.imshow(images[image_idx].reshape(shape))
        plt.gray()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
plt.show()

**COMMENT**: [JH] The autoencoder above has hidden layers, why is the AE below suddenly 'deep'?

## Denoising the images with deep AE

In [None]:
# We will start by adding some noise

X_train_noisy = X_train + np.random.normal(loc=0.0, scale=0.5, size=X_train.shape)
#loc : float or array_like of floats, mean (“centre”) of the distribution.

X_train_noisy = np.clip(X_train_noisy, 0., 1.)
#Given an interval, values outside the interval are clipped to the interval edges. 
#For example, if an interval of [0, 1] is specified, values smaller than 0 become 0, 
#and values larger than 1 become 1.

X_test_noisy = X_test + np.random.normal(loc=0.0, scale=0.5, size=X_test.shape)
X_test_noisy = np.clip(X_test_noisy, 0., 1.)

In [None]:
# Set NumPy's print line width to 140 characters, then display the first noisy
# test image as a flat vector of pixel values.
np.set_printoptions(linewidth=140)
X_test_noisy[0]

In [None]:
num_images = 10
np.random.seed(42)
random_test_images = np.random.randint(X_test.shape[0], size=num_images)

# Denoise the test images with the autoencoder trained above
X_test_denoised = autoencoder.predict(X_test_noisy)

plt.figure(figsize=(18, 4))

# Top row: noisy input image; bottom row: the autoencoder's reconstruction.
for col, image_idx in enumerate(random_test_images):
    for row, images in enumerate((X_test_noisy, X_test_denoised)):
        ax = plt.subplot(2, num_images, row * num_images + col + 1)
        plt.imshow(images[image_idx].reshape(28, 28))
        plt.gray()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
plt.show()

In [None]:
# We will complicate the model a little more: wider layers, with 8, 4 and 2
# times encoding_dim as our multipliers of choice.
autoencoder = Sequential([
    # Encoder layers
    Dense(8 * encoding_dim, activation='relu', input_shape=(input_dim,)),
    Dense(4 * encoding_dim, activation='relu'),
    Dense(2 * encoding_dim, activation='relu'),
    # Decoder layers
    Dense(2 * encoding_dim, activation='relu'),
    Dense(4 * encoding_dim, activation='relu'),
    Dense(8 * encoding_dim, activation='relu'),
    Dense(input_dim, activation='sigmoid'),
])

autoencoder.summary()

TODO: Image is missing!

<img src="AE.png"  alt="loss" style="width: 500px;"/>

In [None]:
autoencoder.compile(loss='binary_crossentropy', optimizer='adam')

# Denoising setup: the noisy images are the input, the clean images the target.
# It will take much longer to train, as you can imagine.
autoencoder.fit(
    X_train_noisy,
    X_train,
    batch_size=52,
    epochs=40,  # try 10 in class
    validation_data=(X_test_noisy, X_test),
)

In [None]:
# Let's see the results
X_test_denoised = autoencoder.predict(X_test_noisy)

plt.figure(figsize=(18, 4))

# Top row: noisy input image; bottom row: denoised reconstruction.
# (num_images and random_test_images come from the earlier plotting cell.)
for col, image_idx in enumerate(random_test_images):
    for row, images in enumerate((X_test_noisy, X_test_denoised)):
        ax = plt.subplot(2, num_images, row * num_images + col + 1)
        plt.imshow(images[image_idx].reshape(28, 28))
        plt.gray()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
plt.show()

Wow, given the small number of epochs, that's pretty impressive denoising!

In [None]:
# Little note on the two model-building styles in Keras.
# Fresh names are used so this illustration runs on its own: `model` currently
# holds a KerasClassifier wrapper (which has no .add), and the bare name
# `input` is Python's builtin function, not a Keras tensor.

# Sequential API: layers are stacked one after another with .add()
sequential_model = Sequential()
sequential_model.add(Dense(16, activation='relu', input_shape=(input_dim,)))
sequential_model.add(Dense(8, activation='relu'))

# Functional API: each layer is called on the tensor produced by the previous one
inputs = Input(shape=(input_dim,))
layer_1 = Dense(16, activation='relu')(inputs)
layer_2 = Dense(8, activation='relu')(layer_1)
functional_model = Model(inputs, layer_2)

# Just pick up the notation that works best for you

The general consensus is that pretraining each layer with an unsupervised learning algorithm can provide better initial weights. Examples of such unsupervised algorithms are Deep Belief Networks, which are based on Restricted Boltzmann Machines, and Deep Autoencoders, which are based on Autoencoders.

Variational Autoencoders (VAEs) are a more modern and complex kind of autoencoder: instead of learning an arbitrary function, as vanilla autoencoders do, they learn the parameters of a probability distribution that models the input data. By sampling points from this distribution we can also use a VAE as a generative model. That application came in handy in the field of generative adversarial networks (GANs).