# [Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2006.11239) for MNIST and CIFAR10
(J. Ho, A. Jain, P. Abbeel 2020)

![](https://raw.githubusercontent.com/dataflowr/website/master/modules/extras/diffusions/ddpm.png)


Given a schedule $\beta_1<\beta_2<\dots <\beta_T$, the **forward diffusion process** is defined by:
$q(x_t|x_{t-1}) = \mathcal{N}(x_t; \sqrt{1-\beta_t}x_{t-1},\beta_t I)$ and $q(x_{1:T}|x_0) = \prod_{t=1}^T q(x_t|x_{t-1})$.

With $\alpha_t = 1-\beta_t$ and $\overline{\alpha}_t = \prod_{i=1}^t\alpha_i$, we see that, with $\epsilon\sim\mathcal{N}(0,I)$:
\begin{align*}
x_t = \sqrt{\overline{\alpha}_t}x_0 + \sqrt{1-\overline{\alpha}_t}\epsilon.
\end{align*}
The law $q(x_{t-1}|x_t,\epsilon)$ is explicit: $q(x_{t-1}|x_t,\epsilon) = \mathcal{N}(x_{t-1};\mu(x_t,\epsilon,t), \gamma_t I)$ with,
\begin{align*}
\mu(x_t,\epsilon, t) = \frac{1}{\sqrt{\alpha_t}}\left( x_t-\frac{1-\alpha_t}{\sqrt{1-\overline{\alpha}_t}}\epsilon\right)\text{ and, }
\gamma_t = \frac{1-\overline{\alpha}_{t-1}}{1-\overline{\alpha}_{t}}\beta_t
\end{align*}


**Training**: to approximate **the reversed diffusion** $q(x_{t-1}|x_t)$ by a neural network given by $p_{\theta}(x_{t-1}|x_t) = \mathcal{N}(x_{t-1}; \mu_{\theta}(x_t,t), \beta_t I)$ and $p(x_T) \sim \mathcal{N}(0,I)$, we minimize the usual variational bound on the negative log-likelihood:
\begin{align*}
\mathbb{E}_{q(x_0)} \left[-\ln p_{\theta}(x_0)\right] &\leq L_T +\sum_{t=2}^T L_{t-1}+L_0 \text{ with, up to a constant independent of }\theta\text{, }L_{t-1} = \mathbb{E}_q\left[ \frac{1}{2\beta_t}\|\mu_\theta(x_t,t) -\mu(x_t,\epsilon,t)\|^2\right].
\end{align*}
With the change of variable:
\begin{align*}
\mu_\theta(x_t,t) = \frac{1}{\sqrt{\alpha_t}}\left( x_t-\frac{1-\alpha_t}{\sqrt{1-\overline{\alpha}_t}}\epsilon_\theta(x_t,t)\right),
\end{align*}
ignoring the prefactor and sampling $\tau$ instead of summing over all $t$, the loss is finally:
\begin{align*}
\ell(\theta) = \mathbb{E}_\tau\mathbb{E}_\epsilon \left[ \|\epsilon - \epsilon_\theta(\sqrt{\overline{\alpha}_\tau}x_0 + \sqrt{1-\overline{\alpha}_\tau}\epsilon, \tau)\|^2\right]
\end{align*}



**Sampling**: to simulate the reversed diffusion with the learned $\epsilon_\theta(x_t,t)$ starting from $x_T\sim \mathcal{N}(0,I)$, iterate for $t=T,\dots, 1$:
\begin{align*}
x_{t-1} = \frac{1}{\sqrt{\alpha_t}}\left( x_t-\frac{1-\alpha_t}{\sqrt{1-\overline{\alpha}_t}}\epsilon_\theta(x_t,t)\right)+\sqrt{\beta_t}\epsilon,\text{ with } \epsilon\sim\mathcal{N}(0,I).
\end{align*}

In [None]:
import torch
from image_data_utils import show_images, show_image_dataset, load_data
from full_unet import FullUNet
from ddpm import DDPM, generate_image
from train_utils import TrainingLoop
from main_utils import train_cifar_by_class
from metrics.utils import calc_and_save_stats, get_inception_score_and_fid

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## Load data

In [None]:
# Directory passed to load_data as the root for the MNIST data.
root_dir = './data/'
# load_data is the project helper imported from image_data_utils; here it is
# asked for the "MNIST" dataset (presumably reading or downloading it under
# root_dir — confirm against image_data_utils).
mnist_dataset = load_data("MNIST", root_dir)

In [None]:
# CIFAR10 class names, kept for reference when choosing `class_name` later on:
# cifar10_class = ["airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck"]
# CIFAR10 gets its own sub-directory, separate from the MNIST root.
cifar_root_dir = './data/CIFAR10'
# Same project helper as for MNIST, this time asked for the "CIFAR10" dataset.
cifar10_dataset = load_data("CIFAR10", cifar_root_dir)

## Load pretrained models

### MNIST

In [None]:
# Load the two pretrained MNIST diffusion models (Gaussian and Laplace noise).
#
# map_location=device remaps the stored tensors onto the device selected at
# the top of the notebook, so a checkpoint that was saved on a GPU still loads
# when the notebook has fallen back to the CPU.
#
# NOTE(review): torch.load unpickles Python objects — only load checkpoint
# files that come from a trusted source.
# NOTE(review): these files appear to hold whole model objects rather than
# state dicts (the results are passed straight to generate_image). On
# PyTorch >= 2.6, where torch.load defaults to weights_only=True, that
# presumably requires passing weights_only=False — confirm against the
# PyTorch version this notebook targets.
gaussian_ddpm = torch.load("./pretrained_models/gaussian_diffusion.pt", map_location=device)
laplace_ddpm = torch.load("./pretrained_models/laplace_diffusion.pt", map_location=device)

In [None]:
# Sample from the pretrained Gaussian model with the project's generate_image
# helper. The positional arguments are presumably (number of samples = 100,
# channels = 1, image size = 32) — confirm against ddpm.generate_image.
# Two values are returned; the second is presumably intermediate samples
# (going by its name) and is not used in the cells shown here.
g_generated, g_generated_mid = generate_image(gaussian_ddpm, 100, 1, 32, device)

In [None]:
# Pass the Gaussian-model samples to the project's show_images helper.
show_images(g_generated)

In [None]:
# Same sampling call as for the Gaussian model, now with the pretrained
# Laplace model. The arguments (100, 1, 32) are presumably sample count,
# channels and image size — confirm against ddpm.generate_image.
generated, generated_mid = generate_image(laplace_ddpm, 100, 1, 32, device)

In [None]:
# Pass the Laplace-model samples to the project's show_images helper.
show_images(generated)

### CIFAR10

In [None]:
# Load the pretrained CIFAR10 checkpoints: the diffusion model used for
# sampling, and the network that is handed to train_cifar_by_class in the
# next cell.
#
# map_location=device remaps the stored tensors onto the device selected at
# the top of the notebook, so a checkpoint that was saved on a GPU still loads
# when the notebook has fallen back to the CPU.
#
# NOTE(review): torch.load unpickles Python objects — only load checkpoint
# files that come from a trusted source.
# NOTE(review): on PyTorch >= 2.6 torch.load defaults to weights_only=True;
# if these files hold whole model objects, weights_only=False is presumably
# required — confirm against the PyTorch version this notebook targets.
base_cifar_model = torch.load("./pretrained_models/base_cifar_gaussian_diffusion.pt", map_location=device)
base_cifar_network = torch.load("./pretrained_models/base_cifar_network.pt", map_location=device)
# Laplace alternative: use the two lines below instead of the two above.
# base_cifar_model = torch.load("./pretrained_models/base_cifar_laplace_diffusion.pt", map_location=device)
# base_cifar_network = torch.load("./pretrained_models/base_cifar_laplace_network.pt", map_location=device)

In [None]:
# Train on a single CIFAR10 class, starting from the pretrained base network.
# NOTE(review): class_model / class_network are not consumed by any later cell
# shown in this notebook (sampling below uses base_cifar_model) — confirm
# whether that is intended.
class_name = "ship"
class_model, class_network = train_cifar_by_class(
    cifar10_dataset,
    class_name,
    device,
    base_cifar_network,
    batch_size=512,
    num_epochs=10,
)

In [None]:
# Sample from the pretrained base CIFAR10 model. The arguments (100, 3, 32)
# are presumably sample count, channels (3 here, versus 1 in the MNIST calls)
# and image size — confirm against ddpm.generate_image.
generated, generated_mid = generate_image(base_cifar_model, 100, 3, 32, device)

In [None]:
# Pass the CIFAR10 samples to the project's show_images helper.
show_images(generated)

## Train model

### MNIST

In [None]:
# Build a fresh network with FullUNet's default constructor arguments (the
# CIFAR10 training cell passes in_c=3, out_c=3 instead) and move it to the
# selected device.
network = FullUNet().to(device)
# Wrap the network in the project's DDPM class, telling it which device to use.
model = DDPM(network, device=device)

In [None]:
# MNIST training run.
# As written this trains the default configuration: Gaussian DDPM with an
# L2 (MSE) loss. To train the Laplace DDPM instead, uncomment the
# noise_type and loss_f arguments below.
trainer = TrainingLoop(
    diffusion_model=model,
    network=network,
    dataset=mnist_dataset,
    batch_size=1200,
    num_epochs=70,
    num_workers=5,
    # noise_type="Laplace",
    # loss_f="L1",
)
trainer.run_loop()

In [None]:
# Sample from the model trained in the MNIST cell above. The arguments
# (100, 1, 32) are presumably sample count, channels and image size — confirm
# against ddpm.generate_image.
generated, generated_mid = generate_image(model, 100, 1, 32, device)

In [None]:
# Pass the freshly generated MNIST samples to the project's show_images helper.
show_images(generated)

### CIFAR10

In [None]:
# CIFAR10 training run.
# As written this trains the default configuration: Gaussian DDPM with an
# L2 (MSE) loss. To train the Laplace DDPM instead, uncomment the
# noise_type and loss_f arguments below.

# Three input and three output channels, unlike the default-constructed
# FullUNet used for MNIST.
network = FullUNet(in_c=3, out_c=3)
network = network.to(device)

model = DDPM(network, device=device)
model.train()

trainer = TrainingLoop(
    diffusion_model=model,
    network=network,
    dataset=cifar10_dataset,
    batch_size=512,
    num_epochs=30,
    num_workers=4,
    # noise_type="Laplace",
    # loss_f="L1",
)
trainer.run_loop()

In [None]:
# Sample from the CIFAR10 model trained above and hand the result to the
# project's show_images helper. The arguments (100, 3, 32) are presumably
# sample count, channels and image size — confirm against ddpm.generate_image.
generated, generated_mid = generate_image(model, 100, 3, 32, device)
show_images(generated)