In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm
import math

#Torch-related imports
import torch
import torch.distributions as D
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Function

#Model-specific imports
from SBM_SDE import *
from obs_and_flow import *
from training import calc_log_lik

In [2]:
#Fix the PyTorch and NumPy global seeds so that random draws made after this cell are repeatable.
#Keep np.random.seed(0) as the final statement: it returns None, so the cell displays no output.
torch.manual_seed(0)
np.random.seed(0)

In [3]:
#Temperature settings passed to the forcing generator and the likelihood.
temp_ref = 283
temp_rise = 5 #High estimate of 5 celsius temperature rise by 2100.

#System parameters from deterministic CON model
u_M, a_SD, a_DS, a_M, a_MSC = 0.002, 0.33, 0.33, 0.33, 0.5
k_S_ref, k_D_ref, k_M_ref = 0.000025, 0.005, 0.0002
Ea_S, Ea_D, Ea_M = 75, 50, 50

#SCON diffusion matrix parameters
c_SOC, c_DOC, c_MBC = 1.0, 0.001, 0.01
s_SOC, s_DOC, s_MBC = 0.001, 0.001, 0.001

#Parameter dictionary using the c_* diffusion entries.
SCON_C_params_dict = {
    'u_M': u_M,
    'a_SD': a_SD,
    'a_DS': a_DS,
    'a_M': a_M,
    'a_MSC': a_MSC,
    'k_S_ref': k_S_ref,
    'k_D_ref': k_D_ref,
    'k_M_ref': k_M_ref,
    'Ea_S': Ea_S,
    'Ea_D': Ea_D,
    'Ea_M': Ea_M,
    'c_SOC': c_SOC,
    'c_DOC': c_DOC,
    'c_MBC': c_MBC,
}

#Parameter dictionary using the s_* diffusion entries: the shared (non c_*) entries are
#copied over in their original order, then the s_* entries are appended.
SCON_SS_params_dict = {
    **{name: value for name, value in SCON_C_params_dict.items() if not name.startswith('c_')},
    's_SOC': s_SOC,
    's_DOC': s_DOC,
    's_MBC': s_MBC,
}

#System parameters from deterministic AWB model
#u_Q_ref = 0.2
#Q = 0.002
#a_MSA = 0.5
#K_D = 200
#K_U = 1
#V_D_ref = 0.4
#V_U_ref = 0.02
#Ea_V_D = 75
#Ea_V_U = 50
#r_M = 0.0004
#r_E = 0.00001
#r_L = 0.0005

#SAWB diffusion matrix parameters
#c_SOC = 2
#c_DOC = 0.05
#c_MBC = 0.1
#c_EEC = 0.01
#s_SOC = 0.1
#s_DOC = 0.1
#s_MBC = 0.1
#s_EEC = 0.1

#SAWB_C_params_dict = {'u_Q_ref': u_Q_ref, 'Q': Q, 'a_MSA': a_MSA, 'K_D': K_D, 'K_U': K_U, 'V_D_ref': V_D_ref, 'V_U_ref': V_U_ref, 'Ea_V_D': Ea_V_D, 'Ea_V_U': Ea_V_U, 'r_M': r_M, 'r_E': r_E, 'r_L': r_L, 'c_SOC': c_SOC, 'c_DOC': c_DOC, 'c_MBC': c_MBC, 'c_EEC': c_EEC}
#SAWB_SS_params_dict = {'u_Q_ref': u_Q_ref, 'Q': Q, 'a_MSA': a_MSA, 'K_D': K_D, 'K_U': K_U, 'V_D_ref': V_D_ref, 'V_U_ref': V_U_ref, 'Ea_V_D': Ea_V_D, 'Ea_V_U': Ea_V_U, 'r_M': r_M, 'r_E': r_E, 'r_L': r_L, 's_SOC': s_SOC, 's_DOC': s_DOC, 's_MBC': s_MBC, 's_EEC': s_EEC}

#System parameters from deterministic AWB-ECA model
#u_Q_ref = 0.2
#Q = 0.002
#a_MSA = 0.5
#K_DE = 200
#K_UE = 1
#V_DE_ref = 0.4
#V_UE_ref = 0.02
#Ea_V_DE = 75
#Ea_V_UE = 50
#r_M = 0.0004
#r_E = 0.00001
#r_L = 0.0005

#SAWB-ECA diffusion matrix parameters
#c_SOC = 2
#c_DOC = 0.05
#c_MBC = 0.1
#c_EEC = 0.01
#s_SOC = 0.1
#s_DOC = 0.1
#s_MBC = 0.1
#s_EEC = 0.1

#SAWB_ECA_C_params_dict = {'u_Q_ref': u_Q_ref, 'Q': Q, 'a_MSA': a_MSA, 'K_DE': K_DE, 'K_UE': K_UE, 'V_DE_ref': V_DE_ref, 'V_UE_ref': V_UE_ref, 'Ea_V_DE': Ea_V_DE, 'Ea_V_UE': Ea_V_UE, 'r_M': r_M, 'r_E': r_E, 'r_L': r_L, 'c_SOC': c_SOC, 'c_DOC': c_DOC, 'c_MBC': c_MBC, 'c_EEC': c_EEC}
#SAWB_ECA_SS_params_dict = {'u_Q_ref': u_Q_ref, 'Q': Q, 'a_MSA': a_MSA, 'K_DE': K_DE, 'K_UE': K_UE, 'V_DE_ref': V_DE_ref, 'V_UE_ref': V_UE_ref, 'Ea_V_DE': Ea_V_DE, 'Ea_V_UE': Ea_V_UE, 'r_M': r_M, 'r_E': r_E, 'r_L': r_L, 's_SOC': s_SOC, 's_DOC': s_DOC, 's_MBC': s_MBC, 's_EEC': s_EEC}

In [4]:
#Set flow NN parameters.

#Device selection: a CUDA device when one is available, otherwise the CPU.
#NOTE(review): `cuda_id` is not assigned in any visible cell; presumably it is supplied by one of
#the star imports -- confirm, otherwise this line raises NameError on a machine with CUDA.
devi = torch.device(f"cuda:{cuda_id}" if torch.cuda.is_available() else "cpu")

#Time grid for the flow: n_flow evenly spaced points covering [0, t].
dt_flow = 0.1
t = 500
n_flow = int(t / dt_flow) + 1
t_span = np.linspace(0, t, num = n_flow)
#Tensor copy of the grid with shape [1, n_flow, 1]; also used to build the I_S and I_D input tensors.
t_span_tensor = torch.Tensor(t_span).reshape(1, n_flow, 1)

#Optimisation settings.
l_r = 1e-4 #Learning rate handed to the optimiser.
niter = 5001 #Total number of training iterations.
piter = 11 #Iterations with index <= piter use the pretraining norm loss.
batch_size = 3 #Number of sets of observation outputs to sample per set of parameters.

#Model and observation settings.
state_dim_SCON = 3 #Not including CO2 in STATE_DIM, because CO2 is an observation.
obs_error_scale = 0.1 #Observation error standard deviation, given as a proportion of the mean.
x0_SCON = [58, 0.08, 0.8] #Initial condition means for SCON

In [5]:
#Build the temperature forcing tensor on the flow time grid and show it.
temp_tensor = temp_gen(t_span_tensor, temp_ref, temp_rise)
print(temp_tensor)

#Build the litter input tensors used by the flow SDE functions:
#i_s gives the exogenous SOC input and i_d the exogenous DOC input.
i_s_tensor = i_s(t_span_tensor)
i_d_tensor = i_d(t_span_tensor)
print(i_s_tensor, i_d_tensor, sep = '\n')

tensor([[[283.0000],
         [283.2625],
         [283.5248],
         ...,
         [277.6021],
         [277.7247],
         [277.8533]]])
tensor([[[0.0010],
         [0.0010],
         [0.0010],
         ...,
         [0.0012],
         [0.0012],
         [0.0012]]])
tensor([[[1.0000e-04],
         [1.0000e-04],
         [1.0001e-04],
         ...,
         [1.1754e-04],
         [1.1755e-04],
         [1.1755e-04]]])


In [6]:
def train(DEVICE, L_R, NITER, PRETRAIN_ITER, BATCH_SIZE, SDEFLOW, ObsModel, csv_to_obs_df, DATA_CSV, OBS_ERROR_SCALE, STATE_DIM, T, DT, N, T_SPAN_TENSOR, I_S_TENSOR, I_D_TENSOR, TEMP_TENSOR, TEMP_REF, C0, DRIFT_DIFFUSION, PARAMS_DICT):
    """Train an SDE flow network in two phases and return it with its loss histories.

    Iterations with index ``i <= PRETRAIN_ITER`` minimise the summed absolute difference
    between the sampled paths and ``torch.mean(obs_model.mu, -1)``. All later iterations
    minimise the negative ELBO built from the initial-state prior, ``calc_log_lik``, the
    observation model and the flow's own log-probability.

    Args:
        DEVICE: torch device on which the network and tensors are placed.
        L_R: learning rate for the Adam optimiser.
        NITER: total number of training iterations.
        PRETRAIN_ITER: last iteration index that uses the pretraining norm loss.
        BATCH_SIZE: number of paths requested from the network per iteration.
        SDEFLOW: flow class (or factory) used to build the network.
        ObsModel: observation-model class (or factory).
        csv_to_obs_df: callable returning ``(obs_times, obs_means, obs_error)`` for
            ``(DATA_CSV, STATE_DIM, T, OBS_ERROR_SCALE)``.
        DATA_CSV: observation data location handed to ``csv_to_obs_df``.
        OBS_ERROR_SCALE: multiplier applied to ``C0`` to obtain the prior scale; also
            forwarded to ``csv_to_obs_df``.
        STATE_DIM, T, DT, N: forwarded to the flow / observation-model constructors.
        T_SPAN_TENSOR, I_S_TENSOR, I_D_TENSOR, TEMP_TENSOR: tensors forwarded to
            ``calc_log_lik`` (the I_S / I_D tensors also go to the flow constructor).
        TEMP_REF: reference temperature forwarded to ``calc_log_lik``.
        C0: list of initial-condition means; used as the location of the Normal prior
            on the first time point of each path.
        DRIFT_DIFFUSION: drift/diffusion callable forwarded to ``calc_log_lik``.
        PARAMS_DICT: mapping of parameter name to scalar value.

    Returns:
        Tuple ``(net, ELBO_losses, norm_losses)``: the trained network and the per-iteration
        loss values (Python floats) recorded in the ELBO and pretraining phases.

    Raises:
        Exception: if ``PRETRAIN_ITER >= NITER``.
    """
    #Validate the iteration settings first so that a bad call fails before any file
    #reading or model construction takes place.
    if PRETRAIN_ITER >= NITER:
        raise Exception("PRETRAIN_ITER must be < NITER.")
    #Read-in observation information.
    obs_times, obs_means, obs_error = csv_to_obs_df(DATA_CSV, STATE_DIM, T, OBS_ERROR_SCALE)
    obs_means = LowerBound.apply(obs_means, 1e-6)
    #Pass observation information to `ObsModel`.
    obs_model = ObsModel(DEVICE, obs_times, DT, obs_means, obs_error)
    #Build the network from the class that was passed in (previously the global `SDEFlow`
    #was used and the SDEFLOW argument was silently ignored).
    net = SDEFLOW(DEVICE, obs_model, STATE_DIM, T, DT, N, I_S_TENSOR, I_D_TENSOR, cond_inputs = 3, num_layers = 6).to(DEVICE)
    optimizer = optim.Adam(net.parameters(), lr = L_R)
    #Best losses are tracked as plain floats so no autograd graph is kept alive by them.
    best_loss_norm = 1e15
    best_loss_ELBO = 1e15
    norm_losses = []
    ELBO_losses = []
    C0_tensor = torch.tensor(C0).to(DEVICE) #Convert initial conditions from list to tensor for X0 prior object.
    #C0 = C0[(None,) * 2].repeat(BATCH_SIZE, 1, 1).to(DEVICE)
    #NOTE(review): these parameter tensors are not moved to DEVICE -- confirm that the observation model does not need them there.
    PARAMS_DICT_TENSOR = {k: torch.tensor(v).expand(BATCH_SIZE) for k, v in PARAMS_DICT.items()}
    X0_prior = D.normal.Normal(loc = C0_tensor, scale = OBS_ERROR_SCALE * C0_tensor) #Setting prior noise = observation noise for now.
    #The forcing tensors do not change between iterations, so transfer them to DEVICE once.
    t_span_dev = T_SPAN_TENSOR.to(DEVICE)
    i_s_dev = I_S_TENSOR.to(DEVICE)
    i_d_dev = I_D_TENSOR.to(DEVICE)
    temp_dev = TEMP_TENSOR.to(DEVICE)
    with tqdm(total = NITER, desc = f'Train Diffusion', position = -1) as tq:
        for i in range(NITER):
            net.train()
            optimizer.zero_grad()
            C_PATH, log_prob = net(BATCH_SIZE) #For obs_and_flow.py
            #C_PATH = torch.cat([C0, C_PATH], 1) #Learning initial conditions in this version. #Append deterministic CON initial conditions conditional on parameter values to C path.
            if i <= PRETRAIN_ITER:
                #Pretraining phase: pull the sampled paths towards the observation means.
                l1_norm_element = C_PATH - torch.mean(obs_model.mu, -1)
                l1_norm = torch.sum(torch.abs(l1_norm_element)).mean()
                l1_value = l1_norm.item()
                best_loss_norm = l1_value if l1_value < best_loss_norm else best_loss_norm
                norm_losses.append(l1_value)
                #l2_norm_element = C_PATH - torch.mean(obs_model.mu, -1)
                #l2_norm = torch.sqrt(torch.sum(torch.square(l2_norm_element))).mean()
                #best_loss_norm = l2_norm if l2_norm < best_loss_norm else best_loss_norm
                #norm_losses.append(l2_norm.item())
                if i % 10 == 0:
                    ma_norm_loss = sum(norm_losses[-10:]) / len(norm_losses[-10:])
                    #Report the loop index `i` (the builtin `iter` was interpolated here before).
                    print(f"\nMoving average norm loss at {i} iterations is: {ma_norm_loss}. Best norm loss value is: {best_loss_norm}.")
                    print('\nC_PATH mean =', C_PATH.mean(-2))
                    print('\nC_PATH =', C_PATH)
                l1_norm.backward()
                #l2_norm.backward()
            else:
                #ELBO phase: prior on the first time point + SDE likelihood + observation model - flow entropy term.
                log_lik = calc_log_lik(C_PATH, t_span_dev, DT, i_s_dev, i_d_dev, temp_dev, TEMP_REF, DRIFT_DIFFUSION, PARAMS_DICT)
                neg_ELBO = -X0_prior.log_prob(C_PATH[:, 0, :]).sum(-1).mean() - log_lik.mean() - obs_model(C_PATH, PARAMS_DICT_TENSOR) + log_prob.mean()
                elbo_value = neg_ELBO.item()
                best_loss_ELBO = elbo_value if elbo_value < best_loss_ELBO else best_loss_ELBO
                ELBO_losses.append(elbo_value)
                if i % 10 == 0:
                    ma_elbo_loss = sum(ELBO_losses[-10:]) / len(ELBO_losses[-10:])
                    print(f"\nMoving average ELBO loss at {i} iterations is: {ma_elbo_loss}. Best ELBO loss value is: {best_loss_ELBO}.")
                    print('\nC_PATH mean =', C_PATH.mean(-2))
                    print('\nC_PATH =', C_PATH)
                neg_ELBO.backward()
            #Clip the gradient norm before the optimiser step.
            torch.nn.utils.clip_grad_norm_(net.parameters(), 3.0)
            optimizer.step()
            #Divide the learning rate by 10 every 100000 iterations (never at iteration 0).
            if i % 100000 == 0 and i > 0:
                optimizer.param_groups[0]['lr'] *= 0.1
            tq.update()
    return net, ELBO_losses, norm_losses

In [None]:
#Run training for the SCON model with the c_* diffusion parameter set, using the settings defined above.
net, ELBO_losses, norm_losses = train(
    DEVICE = devi,
    L_R = l_r,
    NITER = niter,
    PRETRAIN_ITER = piter,
    BATCH_SIZE = batch_size,
    SDEFLOW = SDEFlow,
    ObsModel = ObsModel,
    csv_to_obs_df = csv_to_obs_df,
    DATA_CSV = 'y_from_x_t_1000_dt_0-01.csv',
    OBS_ERROR_SCALE = obs_error_scale,
    STATE_DIM = state_dim_SCON,
    T = t,
    DT = dt_flow,
    N = n_flow,
    T_SPAN_TENSOR = t_span_tensor,
    I_S_TENSOR = i_s_tensor,
    I_D_TENSOR = i_d_tensor,
    TEMP_TENSOR = temp_tensor,
    TEMP_REF = temp_ref,
    C0 = x0_SCON,
    DRIFT_DIFFUSION = drift_diffusion_SCON_C,
    PARAMS_DICT = SCON_C_params_dict,
)


Train Diffusion:   0%|          | 0/5001 [00:00<?, ?it/s][A


Moving average norm loss at <built-in function iter> iterations is: 349720.375. Best norm loss value is: 349720.375.

C_PATH mean = tensor([[0.8584, 0.8820, 0.8543],
        [0.8721, 0.8674, 0.8597],
        [0.8598, 0.8649, 0.8632]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[1.0218, 0.6919, 0.8867],
         [1.8316, 0.8502, 0.2385],
         [0.1398, 1.1947, 2.1356],
         ...,
         [0.6374, 0.6841, 0.5368],
         [0.6461, 0.5776, 0.6641],
         [0.5732, 0.6179, 0.5520]],

        [[0.4191, 2.0637, 0.9564],
         [0.4066, 2.0826, 2.2696],
         [0.3793, 0.3986, 1.0027],
         ...,
         [0.7034, 0.9312, 0.5633],
         [0.7227, 1.4726, 0.7203],
         [0.6977, 0.8745, 0.6807]],

        [[0.7853, 0.7400, 0.9096],
         [0.6049, 0.3987, 1.9713],
         [2.9533, 1.6398, 0.8749],
         ...,
         [0.7032, 0.6070, 0.7062],
         [1.2515, 0.5188, 0.7244],
         [1.0447, 0.5262, 0.6705]]], grad_fn=<AddBackward0>)



Train Diffusion:   0%|          | 1/5001 [00:08<11:18:59,  8.15s/it][A
Train Diffusion:   0%|          | 2/5001 [00:19<13:54:47, 10.02s/it][A
Train Diffusion:   0%|          | 3/5001 [00:26<12:05:56,  8.71s/it][A
Train Diffusion:   0%|          | 4/5001 [00:34<11:23:01,  8.20s/it][A
Train Diffusion:   0%|          | 5/5001 [00:41<10:59:29,  7.92s/it][A
Train Diffusion:   0%|          | 6/5001 [00:49<10:58:38,  7.91s/it][A
Train Diffusion:   0%|          | 7/5001 [00:59<12:01:45,  8.67s/it][A
Train Diffusion:   0%|          | 8/5001 [01:10<13:02:21,  9.40s/it][A
Train Diffusion:   0%|          | 9/5001 [01:19<12:43:37,  9.18s/it][A
Train Diffusion:   0%|          | 10/5001 [01:27<12:21:28,  8.91s/it][A


Moving average norm loss at <built-in function iter> iterations is: 347886.60625. Best norm loss value is: 347068.9375.

C_PATH mean = tensor([[0.8830, 0.8472, 0.8778],
        [0.8809, 0.8361, 0.8867],
        [0.8848, 0.8455, 0.8766]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[0.6927, 1.4634, 0.5655],
         [0.4717, 1.1919, 0.5402],
         [0.3733, 0.9304, 0.7381],
         ...,
         [1.7922, 1.5345, 0.6658],
         [0.9461, 1.8681, 1.0650],
         [1.8426, 1.0644, 1.7578]],

        [[0.5727, 0.7949, 0.9657],
         [0.8539, 0.7746, 1.1958],
         [1.0880, 1.1378, 1.3254],
         ...,
         [0.6920, 0.9547, 0.6856],
         [1.5384, 0.7290, 1.0364],
         [0.7049, 0.8775, 1.1417]],

        [[1.1351, 0.7101, 1.1230],
         [0.9844, 0.5827, 1.6516],
         [1.4695, 0.5683, 1.2795],
         ...,
         [0.9261, 0.2739, 1.5602],
         [0.6078, 0.6114, 0.5592],
         [0.6799, 0.8383, 0.3778]]], grad_fn=<AddBackward0>)



Train Diffusion:   0%|          | 11/5001 [01:38<13:08:46,  9.48s/it][A
Train Diffusion:   0%|          | 12/5001 [01:47<13:06:40,  9.46s/it][A
Train Diffusion:   0%|          | 13/5001 [01:56<12:37:20,  9.11s/it][A
Train Diffusion:   0%|          | 14/5001 [02:04<12:14:32,  8.84s/it][A
Train Diffusion:   0%|          | 15/5001 [02:13<12:29:43,  9.02s/it][A
Train Diffusion:   0%|          | 16/5001 [02:22<12:12:59,  8.82s/it][A
Train Diffusion:   0%|          | 17/5001 [02:30<12:02:41,  8.70s/it][A
Train Diffusion:   0%|          | 18/5001 [02:40<12:22:56,  8.95s/it][A
Train Diffusion:   0%|          | 19/5001 [02:47<11:38:51,  8.42s/it][A
Train Diffusion:   0%|          | 20/5001 [02:54<11:08:34,  8.05s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 4421611.0. Best ELBO loss value is: 4092925.25.

C_PATH mean = tensor([[0.8884, 0.8610, 0.8861],
        [0.8805, 0.8523, 0.8913],
        [0.8777, 0.8600, 0.8881]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[1.0057, 0.9813, 0.8984],
         [1.2410, 1.3869, 1.8016],
         [1.3417, 0.8899, 0.6894],
         ...,
         [0.6307, 1.2531, 0.9027],
         [1.4490, 0.7660, 1.2088],
         [1.5966, 0.9884, 1.5641]],

        [[0.9860, 0.6877, 1.3181],
         [0.7738, 0.7099, 0.8185],
         [1.3863, 1.1849, 0.8972],
         ...,
         [1.1020, 0.9128, 0.7485],
         [0.5639, 1.3836, 0.7277],
         [0.9724, 0.7323, 0.9489]],

        [[0.5252, 1.4926, 0.6636],
         [0.6323, 0.6338, 0.6224],
         [0.4264, 0.7404, 1.2991],
         ...,
         [1.0026, 0.6846, 0.7710],
         [1.0451, 0.5054, 0.9791],
         [0.6984, 0.7746, 0.5374]]], grad_fn=<AddBackward0>)



Train Diffusion:   0%|          | 21/5001 [03:01<10:39:34,  7.71s/it][A
Train Diffusion:   0%|          | 22/5001 [03:08<10:37:35,  7.68s/it][A
Train Diffusion:   0%|          | 23/5001 [03:16<10:33:49,  7.64s/it][A
Train Diffusion:   0%|          | 24/5001 [03:24<10:38:31,  7.70s/it][A
Train Diffusion:   0%|          | 25/5001 [03:31<10:37:22,  7.69s/it][A
Train Diffusion:   1%|          | 26/5001 [03:39<10:44:07,  7.77s/it][A
Train Diffusion:   1%|          | 27/5001 [03:48<10:56:38,  7.92s/it][A
Train Diffusion:   1%|          | 28/5001 [03:55<10:47:11,  7.81s/it][A
Train Diffusion:   1%|          | 29/5001 [04:02<10:30:20,  7.61s/it][A
Train Diffusion:   1%|          | 30/5001 [04:10<10:34:26,  7.66s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 4630884.15. Best ELBO loss value is: 4092925.25.

C_PATH mean = tensor([[0.8822, 0.8721, 0.8812],
        [0.8831, 0.8627, 0.8910],
        [0.8827, 0.8628, 0.8815]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[1.1922, 0.7179, 1.3648],
         [0.4590, 0.5904, 1.3978],
         [0.2464, 0.6945, 1.1175],
         ...,
         [1.0311, 0.8987, 0.9803],
         [0.6873, 0.7459, 0.9495],
         [0.6828, 1.0099, 1.0245]],

        [[0.6893, 1.4771, 0.6778],
         [1.1480, 0.9456, 0.5423],
         [1.4028, 1.0793, 0.8168],
         ...,
         [0.8693, 1.1156, 0.5299],
         [1.0313, 1.0484, 0.7558],
         [1.5176, 0.8006, 1.4204]],

        [[0.6544, 1.0726, 0.8556],
         [1.2595, 0.9379, 1.0622],
         [0.9761, 1.0308, 1.1605],
         ...,
         [1.0043, 0.8342, 0.8967],
         [1.2819, 0.9010, 1.0130],
         [0.6788, 0.8563, 0.5016]]], grad_fn=<AddBackward0>)



Train Diffusion:   1%|          | 31/5001 [04:18<10:39:42,  7.72s/it][A
Train Diffusion:   1%|          | 32/5001 [04:26<10:35:05,  7.67s/it][A
Train Diffusion:   1%|          | 33/5001 [04:33<10:35:36,  7.68s/it][A
Train Diffusion:   1%|          | 34/5001 [04:41<10:43:54,  7.78s/it][A
Train Diffusion:   1%|          | 35/5001 [04:48<10:29:25,  7.60s/it][A
Train Diffusion:   1%|          | 36/5001 [04:56<10:31:02,  7.63s/it][A
Train Diffusion:   1%|          | 37/5001 [05:04<10:29:58,  7.61s/it][A
Train Diffusion:   1%|          | 38/5001 [05:12<10:42:11,  7.76s/it][A
Train Diffusion:   1%|          | 39/5001 [05:19<10:31:12,  7.63s/it][A
Train Diffusion:   1%|          | 40/5001 [05:27<10:30:48,  7.63s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 4426364.3. Best ELBO loss value is: 4092925.25.

C_PATH mean = tensor([[0.8921, 0.8769, 0.9020],
        [0.8849, 0.8833, 0.8925],
        [0.8984, 0.8855, 0.8970]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[1.0867, 1.2120, 1.2307],
         [0.9543, 0.5473, 0.7298],
         [1.7225, 0.6783, 0.9732],
         ...,
         [1.1993, 1.3577, 0.9150],
         [0.8375, 1.1138, 0.9630],
         [1.6010, 0.9198, 0.4767]],

        [[0.7167, 0.7134, 0.9377],
         [0.6005, 0.9846, 0.7284],
         [0.9824, 0.9836, 0.8532],
         ...,
         [0.4934, 0.9480, 0.7120],
         [0.8546, 0.9848, 0.9581],
         [1.1981, 0.9045, 1.2523]],

        [[0.6514, 1.3182, 0.6685],
         [1.1249, 1.0530, 2.1717],
         [0.3779, 1.1669, 1.3494],
         ...,
         [1.2633, 0.5675, 1.0718],
         [1.5942, 0.6159, 0.8985],
         [0.4180, 0.7836, 1.0380]]], grad_fn=<AddBackward0>)



Train Diffusion:   1%|          | 41/5001 [05:35<10:52:07,  7.89s/it][A
Train Diffusion:   1%|          | 42/5001 [05:44<11:00:28,  7.99s/it][A
Train Diffusion:   1%|          | 43/5001 [05:51<10:53:50,  7.91s/it][A
Train Diffusion:   1%|          | 44/5001 [05:59<10:57:39,  7.96s/it][A
Train Diffusion:   1%|          | 45/5001 [06:07<10:49:05,  7.86s/it][A
Train Diffusion:   1%|          | 46/5001 [06:15<10:45:03,  7.81s/it][A
Train Diffusion:   1%|          | 47/5001 [06:23<10:59:50,  7.99s/it][A
Train Diffusion:   1%|          | 48/5001 [06:31<10:51:32,  7.89s/it][A
Train Diffusion:   1%|          | 49/5001 [06:39<10:57:46,  7.97s/it][A
Train Diffusion:   1%|          | 50/5001 [06:46<10:48:37,  7.86s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 2591138.0875. Best ELBO loss value is: 1375767.125.

C_PATH mean = tensor([[0.8362, 0.8323, 0.8277],
        [0.8375, 0.8227, 0.8362],
        [0.8304, 0.8274, 0.8272]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[0.6581, 1.2288, 0.6742],
         [0.6033, 0.8475, 0.5937],
         [1.0011, 0.9062, 0.6863],
         ...,
         [0.6180, 0.7116, 1.0446],
         [0.6236, 0.7626, 1.0072],
         [1.0411, 1.1278, 1.1675]],

        [[0.7433, 0.8830, 0.9572],
         [0.7148, 0.7779, 1.0898],
         [0.3984, 0.7842, 0.9771],
         ...,
         [1.0520, 0.9403, 0.7672],
         [1.0949, 0.6500, 0.7861],
         [1.0090, 0.8076, 0.8022]],

        [[0.8512, 0.7594, 0.8999],
         [0.9403, 0.6446, 0.9436],
         [0.9722, 0.6632, 0.8923],
         ...,
         [1.0021, 1.1027, 0.8812],
         [0.9377, 1.1304, 0.7971],
         [0.7926, 0.7590, 0.6225]]], grad_fn=<AddBackward0>)



Train Diffusion:   1%|          | 51/5001 [06:54<10:43:14,  7.80s/it][A
Train Diffusion:   1%|          | 52/5001 [07:02<10:35:58,  7.71s/it][A
Train Diffusion:   1%|          | 53/5001 [07:10<10:46:58,  7.85s/it][A
Train Diffusion:   1%|          | 54/5001 [07:18<10:46:16,  7.84s/it][A
Train Diffusion:   1%|          | 55/5001 [07:25<10:36:57,  7.73s/it][A
Train Diffusion:   1%|          | 56/5001 [07:34<11:04:36,  8.06s/it][A
Train Diffusion:   1%|          | 57/5001 [07:49<14:03:39, 10.24s/it][A
Train Diffusion:   1%|          | 58/5001 [08:00<14:03:57, 10.24s/it][A
Train Diffusion:   1%|          | 59/5001 [08:09<13:33:05,  9.87s/it][A
Train Diffusion:   1%|          | 60/5001 [08:17<12:58:06,  9.45s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 784466.090625. Best ELBO loss value is: 471306.59375.

C_PATH mean = tensor([[0.7542, 0.7482, 0.7505],
        [0.7505, 0.7514, 0.7453],
        [0.7464, 0.7481, 0.7506]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[0.7028, 0.8045, 0.7397],
         [0.7207, 0.7765, 0.6824],
         [0.7576, 0.6350, 0.6454],
         ...,
         [0.7175, 0.6325, 0.8856],
         [0.6752, 0.8071, 0.8296],
         [0.9587, 0.9005, 0.9698]],

        [[0.7033, 0.8090, 0.7635],
         [0.7246, 0.6300, 0.8669],
         [0.8519, 0.6614, 0.7253],
         ...,
         [0.8035, 0.8970, 0.6647],
         [0.8504, 0.9079, 0.7383],
         [0.7425, 0.8040, 0.7470]],

        [[0.6951, 0.6489, 0.6196],
         [0.5654, 0.6198, 0.6176],
         [0.4704, 0.7891, 0.8812],
         ...,
         [0.8993, 0.8010, 0.8872],
         [0.9916, 0.6321, 0.7830],
         [0.9396, 0.6764, 0.5745]]], grad_fn=<AddBackward0>)



Train Diffusion:   1%|          | 61/5001 [08:24<12:07:44,  8.84s/it][A
Train Diffusion:   1%|          | 62/5001 [08:32<11:35:35,  8.45s/it][A
Train Diffusion:   1%|▏         | 63/5001 [08:40<11:14:20,  8.19s/it][A
Train Diffusion:   1%|▏         | 64/5001 [08:47<10:57:43,  7.99s/it][A
Train Diffusion:   1%|▏         | 65/5001 [08:54<10:37:44,  7.75s/it][A
Train Diffusion:   1%|▏         | 66/5001 [09:02<10:25:53,  7.61s/it][A
Train Diffusion:   1%|▏         | 67/5001 [09:10<10:42:39,  7.82s/it][A
Train Diffusion:   1%|▏         | 68/5001 [09:17<10:38:15,  7.76s/it][A
Train Diffusion:   1%|▏         | 69/5001 [09:26<10:46:55,  7.87s/it][A
Train Diffusion:   1%|▏         | 70/5001 [09:33<10:38:48,  7.77s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 342593.00625. Best ELBO loss value is: 266151.21875.

C_PATH mean = tensor([[0.6901, 0.6889, 0.6880],
        [0.6919, 0.6873, 0.6889],
        [0.6908, 0.6904, 0.6890]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[0.6898, 0.6302, 0.5849],
         [0.6008, 0.6382, 0.5857],
         [0.5954, 0.6175, 0.6399],
         ...,
         [0.6217, 0.7268, 0.6168],
         [0.7059, 0.7556, 0.6719],
         [0.7472, 0.7335, 0.9514]],

        [[0.7234, 0.6370, 0.5653],
         [0.6391, 0.5828, 0.6306],
         [0.6440, 0.6029, 0.6107],
         ...,
         [0.8216, 0.5686, 0.7678],
         [0.8230, 0.5720, 0.7441],
         [0.8686, 0.6607, 0.8980]],

        [[0.6534, 0.6175, 0.6466],
         [0.5695, 0.5749, 0.6364],
         [0.5451, 0.5762, 0.6133],
         ...,
         [0.7025, 0.8150, 0.6751],
         [0.6807, 0.6972, 0.6143],
         [0.6685, 0.7159, 0.4673]]], grad_fn=<AddBackward0>)



Train Diffusion:   1%|▏         | 71/5001 [09:41<10:31:28,  7.69s/it][A
Train Diffusion:   1%|▏         | 72/5001 [09:48<10:33:06,  7.71s/it][A
Train Diffusion:   1%|▏         | 73/5001 [09:58<11:11:35,  8.18s/it][A
Train Diffusion:   1%|▏         | 74/5001 [10:05<10:48:01,  7.89s/it][A
Train Diffusion:   1%|▏         | 75/5001 [10:19<13:21:20,  9.76s/it][A
Train Diffusion:   2%|▏         | 76/5001 [10:32<14:46:08, 10.80s/it][A
Train Diffusion:   2%|▏         | 77/5001 [10:40<13:41:09, 10.01s/it][A
Train Diffusion:   2%|▏         | 78/5001 [10:49<13:14:21,  9.68s/it][A
Train Diffusion:   2%|▏         | 79/5001 [10:57<12:21:32,  9.04s/it][A
Train Diffusion:   2%|▏         | 80/5001 [11:04<11:28:06,  8.39s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 242446.734375. Best ELBO loss value is: 227753.703125.

C_PATH mean = tensor([[0.6501, 0.6486, 0.6490],
        [0.6515, 0.6504, 0.6480],
        [0.6497, 0.6501, 0.6497]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[0.6516, 0.5919, 0.5231],
         [0.5386, 0.5581, 0.5318],
         [0.5503, 0.5214, 0.5717],
         ...,
         [0.6973, 0.6365, 0.6656],
         [0.7354, 0.5842, 0.5945],
         [0.7785, 0.5915, 0.4299]],

        [[0.6584, 0.5753, 0.5261],
         [0.5297, 0.4956, 0.5342],
         [0.4946, 0.5245, 0.5806],
         ...,
         [0.5921, 0.6596, 0.5784],
         [0.5907, 0.6934, 0.6139],
         [0.6663, 0.7581, 1.0864]],

        [[0.6262, 0.5310, 0.5568],
         [0.4911, 0.5369, 0.5979],
         [0.5099, 0.5596, 0.5195],
         ...,
         [0.6310, 0.6063, 0.6355],
         [0.6126, 0.6398, 0.6447],
         [0.6291, 0.7027, 0.7197]]], grad_fn=<AddBackward0>)



Train Diffusion:   2%|▏         | 81/5001 [11:16<13:12:39,  9.67s/it][A
Train Diffusion:   2%|▏         | 82/5001 [11:25<12:52:12,  9.42s/it][A
Train Diffusion:   2%|▏         | 83/5001 [11:34<12:25:34,  9.10s/it][A
Train Diffusion:   2%|▏         | 84/5001 [11:42<12:07:19,  8.88s/it][A
Train Diffusion:   2%|▏         | 85/5001 [11:52<12:31:57,  9.18s/it][A
Train Diffusion:   2%|▏         | 86/5001 [12:04<13:45:33, 10.08s/it][A
Train Diffusion:   2%|▏         | 87/5001 [12:15<14:20:04, 10.50s/it][A
Train Diffusion:   2%|▏         | 88/5001 [12:24<13:24:31,  9.83s/it][A
Train Diffusion:   2%|▏         | 89/5001 [12:31<12:20:52,  9.05s/it][A
Train Diffusion:   2%|▏         | 90/5001 [12:39<11:53:32,  8.72s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 207314.9046875. Best ELBO loss value is: 184570.859375.

C_PATH mean = tensor([[0.6075, 0.6067, 0.6064],
        [0.6085, 0.6065, 0.6077],
        [0.6069, 0.6064, 0.6073]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[0.5993, 0.4877, 0.4749],
         [0.4627, 0.4506, 0.4445],
         [0.4555, 0.4515, 0.4622],
         ...,
         [0.5432, 0.5823, 0.5149],
         [0.6192, 0.5894, 0.5881],
         [0.6760, 0.6172, 0.8356]],

        [[0.5989, 0.5159, 0.4832],
         [0.4663, 0.4467, 0.4591],
         [0.4449, 0.4565, 0.4819],
         ...,
         [0.5399, 0.5508, 0.5642],
         [0.5154, 0.5710, 0.5280],
         [0.4988, 0.6080, 0.4501]],

        [[0.6015, 0.4834, 0.4466],
         [0.4662, 0.4685, 0.5078],
         [0.5093, 0.5036, 0.5067],
         ...,
         [0.6005, 0.5538, 0.5915],
         [0.5842, 0.5257, 0.5926],
         [0.6107, 0.5717, 0.8439]]], grad_fn=<AddBackward0>)



Train Diffusion:   2%|▏         | 91/5001 [12:46<11:23:43,  8.36s/it][A
Train Diffusion:   2%|▏         | 92/5001 [12:55<11:28:55,  8.42s/it][A
Train Diffusion:   2%|▏         | 93/5001 [13:03<11:24:32,  8.37s/it][A
Train Diffusion:   2%|▏         | 94/5001 [13:13<12:08:24,  8.91s/it][A
Train Diffusion:   2%|▏         | 95/5001 [13:21<11:49:01,  8.67s/it][A
Train Diffusion:   2%|▏         | 96/5001 [13:30<11:42:04,  8.59s/it][A
Train Diffusion:   2%|▏         | 97/5001 [13:38<11:22:22,  8.35s/it][A
Train Diffusion:   2%|▏         | 98/5001 [13:45<11:07:56,  8.17s/it][A
Train Diffusion:   2%|▏         | 99/5001 [13:53<10:45:03,  7.90s/it][A
Train Diffusion:   2%|▏         | 100/5001 [14:00<10:33:45,  7.76s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 176966.3546875. Best ELBO loss value is: 171446.765625.

C_PATH mean = tensor([[0.5830, 0.5807, 0.5823],
        [0.5829, 0.5822, 0.5834],
        [0.5818, 0.5836, 0.5810]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[0.5627, 0.4291, 0.4525],
         [0.4062, 0.4051, 0.4845],
         [0.3817, 0.3982, 0.4711],
         ...,
         [0.4819, 0.5421, 0.4740],
         [0.5195, 0.5778, 0.4817],
         [0.7157, 0.5416, 0.4494]],

        [[0.6143, 0.4415, 0.4106],
         [0.4487, 0.4205, 0.4119],
         [0.4666, 0.4306, 0.4306],
         ...,
         [0.5821, 0.4589, 0.4946],
         [0.5995, 0.4544, 0.4963],
         [0.4909, 0.4954, 0.5843]],

        [[0.5712, 0.4967, 0.4175],
         [0.4165, 0.4441, 0.4231],
         [0.4285, 0.4448, 0.4365],
         ...,
         [0.4912, 0.5013, 0.5435],
         [0.4623, 0.4931, 0.5324],
         [0.4982, 0.5668, 0.9307]]], grad_fn=<AddBackward0>)



Train Diffusion:   2%|▏         | 101/5001 [14:08<10:34:57,  7.78s/it][A
Train Diffusion:   2%|▏         | 102/5001 [14:15<10:23:21,  7.63s/it][A
Train Diffusion:   2%|▏         | 103/5001 [14:24<10:50:54,  7.97s/it][A
Train Diffusion:   2%|▏         | 104/5001 [14:32<10:47:50,  7.94s/it][A
Train Diffusion:   2%|▏         | 105/5001 [14:40<10:51:39,  7.99s/it][A
Train Diffusion:   2%|▏         | 106/5001 [14:48<10:51:21,  7.98s/it][A
Train Diffusion:   2%|▏         | 107/5001 [14:56<10:43:29,  7.89s/it][A
Train Diffusion:   2%|▏         | 108/5001 [15:03<10:41:34,  7.87s/it][A
Train Diffusion:   2%|▏         | 109/5001 [15:11<10:43:37,  7.89s/it][A
Train Diffusion:   2%|▏         | 110/5001 [15:19<10:36:19,  7.81s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 156367.9625. Best ELBO loss value is: 148394.265625.

C_PATH mean = tensor([[0.5513, 0.5520, 0.5526],
        [0.5525, 0.5524, 0.5508],
        [0.5527, 0.5506, 0.5520]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[0.5295, 0.4008, 0.3883],
         [0.3679, 0.3760, 0.3620],
         [0.4195, 0.4227, 0.3908],
         ...,
         [0.4197, 0.5108, 0.4464],
         [0.4121, 0.4798, 0.4738],
         [0.4420, 0.5366, 0.5117]],

        [[0.5515, 0.3773, 0.3846],
         [0.3707, 0.3616, 0.4282],
         [0.3582, 0.3677, 0.4187],
         ...,
         [0.4634, 0.4307, 0.4709],
         [0.4971, 0.4265, 0.4436],
         [0.5511, 0.4549, 0.9302]],

        [[0.5873, 0.4369, 0.3668],
         [0.4134, 0.4074, 0.3957],
         [0.3483, 0.3662, 0.3964],
         ...,
         [0.4862, 0.4491, 0.4298],
         [0.4474, 0.4361, 0.4293],
         [0.4556, 0.5015, 0.4737]]], grad_fn=<AddBackward0>)



Train Diffusion:   2%|▏         | 111/5001 [15:28<10:55:57,  8.05s/it][A
Train Diffusion:   2%|▏         | 112/5001 [15:36<11:12:07,  8.25s/it][A
Train Diffusion:   2%|▏         | 113/5001 [15:44<11:08:56,  8.21s/it][A
Train Diffusion:   2%|▏         | 114/5001 [15:53<11:21:08,  8.36s/it][A
Train Diffusion:   2%|▏         | 115/5001 [16:05<12:44:33,  9.39s/it][A
Train Diffusion:   2%|▏         | 116/5001 [16:16<13:22:00,  9.85s/it][A
Train Diffusion:   2%|▏         | 117/5001 [16:28<14:24:49, 10.62s/it][A
Train Diffusion:   2%|▏         | 118/5001 [16:39<14:18:50, 10.55s/it][A
Train Diffusion:   2%|▏         | 119/5001 [16:50<14:44:00, 10.86s/it][A
Train Diffusion:   2%|▏         | 120/5001 [17:01<14:44:35, 10.87s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 134855.6640625. Best ELBO loss value is: 126274.1875.

C_PATH mean = tensor([[0.5155, 0.5162, 0.5156],
        [0.5155, 0.5160, 0.5161],
        [0.5162, 0.5156, 0.5162]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[0.5265, 0.3546, 0.3308],
         [0.3318, 0.3381, 0.3236],
         [0.3253, 0.3410, 0.3423],
         ...,
         [0.3855, 0.3990, 0.3812],
         [0.3810, 0.4036, 0.3837],
         [0.3929, 0.4488, 0.6587]],

        [[0.4970, 0.3529, 0.3456],
         [0.3278, 0.3276, 0.3602],
         [0.3068, 0.3331, 0.3430],
         ...,
         [0.3930, 0.3879, 0.3894],
         [0.4152, 0.3750, 0.3862],
         [0.3894, 0.4350, 0.6986]],

        [[0.5335, 0.3359, 0.3149],
         [0.3370, 0.3194, 0.3389],
         [0.3446, 0.3356, 0.3432],
         ...,
         [0.3858, 0.3736, 0.4020],
         [0.4006, 0.3913, 0.3926],
         [0.5278, 0.4335, 0.4147]]], grad_fn=<AddBackward0>)



Train Diffusion:   2%|▏         | 121/5001 [17:11<14:14:11, 10.50s/it][A
Train Diffusion:   2%|▏         | 122/5001 [17:23<14:46:36, 10.90s/it][A
Train Diffusion:   2%|▏         | 123/5001 [17:30<13:24:51,  9.90s/it][A
Train Diffusion:   2%|▏         | 124/5001 [17:39<12:48:49,  9.46s/it][A
Train Diffusion:   2%|▏         | 125/5001 [17:46<11:57:54,  8.83s/it][A
Train Diffusion:   3%|▎         | 126/5001 [17:53<11:21:19,  8.39s/it][A
Train Diffusion:   3%|▎         | 127/5001 [18:00<10:48:31,  7.98s/it][A
Train Diffusion:   3%|▎         | 128/5001 [18:08<10:36:00,  7.83s/it][A
Train Diffusion:   3%|▎         | 129/5001 [18:15<10:22:15,  7.66s/it][A
Train Diffusion:   3%|▎         | 130/5001 [18:23<10:37:06,  7.85s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 117500.446875. Best ELBO loss value is: 110508.6875.

C_PATH mean = tensor([[0.4857, 0.4853, 0.4850],
        [0.4855, 0.4861, 0.4855],
        [0.4860, 0.4842, 0.4855]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[0.4701, 0.3063, 0.2859],
         [0.2871, 0.3059, 0.2859],
         [0.2961, 0.2797, 0.2911],
         ...,
         [0.3701, 0.3583, 0.3588],
         [0.4111, 0.3378, 0.3470],
         [0.4351, 0.3617, 0.3450]],

        [[0.5179, 0.2987, 0.2766],
         [0.3042, 0.2826, 0.2808],
         [0.3062, 0.2940, 0.2899],
         ...,
         [0.3719, 0.3206, 0.3337],
         [0.3387, 0.3255, 0.3247],
         [0.3240, 0.4025, 0.5806]],

        [[0.4893, 0.2926, 0.2940],
         [0.2713, 0.2809, 0.3237],
         [0.2398, 0.3037, 0.3412],
         ...,
         [0.3216, 0.3607, 0.3378],
         [0.3258, 0.3606, 0.3620],
         [0.3970, 0.4503, 0.7398]]], grad_fn=<AddBackward0>)



Train Diffusion:   3%|▎         | 131/5001 [18:31<10:41:12,  7.90s/it][A
Train Diffusion:   3%|▎         | 132/5001 [18:39<10:42:24,  7.92s/it][A
Train Diffusion:   3%|▎         | 133/5001 [18:48<10:55:16,  8.08s/it][A
Train Diffusion:   3%|▎         | 134/5001 [18:56<10:54:29,  8.07s/it][A
Train Diffusion:   3%|▎         | 135/5001 [19:04<10:53:38,  8.06s/it][A
Train Diffusion:   3%|▎         | 136/5001 [19:11<10:39:01,  7.88s/it][A
Train Diffusion:   3%|▎         | 137/5001 [19:19<10:44:15,  7.95s/it][A
Train Diffusion:   3%|▎         | 138/5001 [19:28<10:50:20,  8.02s/it][A
Train Diffusion:   3%|▎         | 139/5001 [19:35<10:40:03,  7.90s/it][A
Train Diffusion:   3%|▎         | 140/5001 [19:44<10:56:21,  8.10s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 101267.83125. Best ELBO loss value is: 94561.921875.

C_PATH mean = tensor([[0.4516, 0.4514, 0.4512],
        [0.4524, 0.4515, 0.4510],
        [0.4524, 0.4512, 0.4521]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[0.4623, 0.2585, 0.2422],
         [0.2556, 0.2389, 0.2436],
         [0.2509, 0.2361, 0.2417],
         ...,
         [0.3104, 0.3368, 0.3258],
         [0.2939, 0.2920, 0.3299],
         [0.2806, 0.3455, 0.6476]],

        [[0.4456, 0.2642, 0.2332],
         [0.2581, 0.2525, 0.2392],
         [0.2529, 0.2544, 0.2521],
         ...,
         [0.2703, 0.2844, 0.2931],
         [0.2812, 0.3167, 0.2843],
         [0.4098, 0.3495, 0.4331]],

        [[0.4429, 0.2434, 0.2501],
         [0.2280, 0.2495, 0.2653],
         [0.2077, 0.2616, 0.2706],
         ...,
         [0.3189, 0.2826, 0.2748],
         [0.3608, 0.2830, 0.2849],
         [0.3375, 0.3205, 0.3583]]], grad_fn=<AddBackward0>)



Train Diffusion:   3%|▎         | 141/5001 [19:52<10:59:10,  8.14s/it][A
Train Diffusion:   3%|▎         | 142/5001 [19:59<10:40:26,  7.91s/it][A
Train Diffusion:   3%|▎         | 143/5001 [20:07<10:35:30,  7.85s/it][A
Train Diffusion:   3%|▎         | 144/5001 [20:17<11:28:09,  8.50s/it][A
Train Diffusion:   3%|▎         | 145/5001 [20:25<10:59:05,  8.14s/it][A
Train Diffusion:   3%|▎         | 146/5001 [20:32<10:50:48,  8.04s/it][A
Train Diffusion:   3%|▎         | 147/5001 [20:40<10:37:44,  7.88s/it][A
Train Diffusion:   3%|▎         | 148/5001 [20:47<10:22:31,  7.70s/it][A
Train Diffusion:   3%|▎         | 149/5001 [20:55<10:28:28,  7.77s/it][A
Train Diffusion:   3%|▎         | 150/5001 [21:06<11:50:19,  8.79s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 89737.01171875. Best ELBO loss value is: 84852.40625.

C_PATH mean = tensor([[0.4281, 0.4276, 0.4275],
        [0.4282, 0.4267, 0.4275],
        [0.4280, 0.4273, 0.4273]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[0.4104, 0.2360, 0.2263],
         [0.2206, 0.2310, 0.2041],
         [0.2285, 0.2230, 0.2316],
         ...,
         [0.2593, 0.2418, 0.2494],
         [0.2716, 0.2497, 0.2398],
         [0.3325, 0.2868, 0.2973]],

        [[0.4309, 0.2195, 0.2046],
         [0.2043, 0.2102, 0.2188],
         [0.1849, 0.2187, 0.2192],
         ...,
         [0.2483, 0.2930, 0.2802],
         [0.2383, 0.2905, 0.2780],
         [0.2328, 0.3267, 0.4445]],

        [[0.4399, 0.2192, 0.2109],
         [0.2292, 0.2100, 0.2232],
         [0.2386, 0.2159, 0.2332],
         ...,
         [0.2922, 0.2642, 0.2596],
         [0.2929, 0.2507, 0.2700],
         [0.3143, 0.3442, 0.6495]]], grad_fn=<AddBackward0>)



Train Diffusion:   3%|▎         | 151/5001 [21:15<11:46:29,  8.74s/it][A
Train Diffusion:   3%|▎         | 152/5001 [21:23<11:32:04,  8.56s/it][A
Train Diffusion:   3%|▎         | 153/5001 [21:31<11:07:35,  8.26s/it][A
Train Diffusion:   3%|▎         | 154/5001 [21:38<10:43:23,  7.96s/it][A
Train Diffusion:   3%|▎         | 155/5001 [21:45<10:31:30,  7.82s/it][A
Train Diffusion:   3%|▎         | 156/5001 [21:53<10:31:06,  7.82s/it][A
Train Diffusion:   3%|▎         | 157/5001 [22:00<10:10:05,  7.56s/it][A
Train Diffusion:   3%|▎         | 158/5001 [22:07<10:01:51,  7.46s/it][A
Train Diffusion:   3%|▎         | 159/5001 [22:15<10:07:19,  7.53s/it][A
Train Diffusion:   3%|▎         | 160/5001 [22:23<10:25:06,  7.75s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 78687.87265625. Best ELBO loss value is: 74110.109375.

C_PATH mean = tensor([[0.4019, 0.4020, 0.4012],
        [0.4027, 0.4009, 0.4017],
        [0.4019, 0.4020, 0.4023]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[0.3875, 0.1989, 0.1837],
         [0.1914, 0.2089, 0.2243],
         [0.1695, 0.2151, 0.2227],
         ...,
         [0.2446, 0.2269, 0.2373],
         [0.2376, 0.2487, 0.2199],
         [0.2169, 0.2856, 0.5821]],

        [[0.4001, 0.1825, 0.1990],
         [0.1980, 0.1857, 0.1882],
         [0.2205, 0.1899, 0.1866],
         ...,
         [0.2341, 0.2199, 0.2354],
         [0.2265, 0.2082, 0.2344],
         [0.2672, 0.2443, 0.2923]],

        [[0.4083, 0.2080, 0.1811],
         [0.1763, 0.1890, 0.1815],
         [0.1898, 0.1817, 0.1990],
         ...,
         [0.2143, 0.2509, 0.2216],
         [0.2558, 0.2460, 0.2402],
         [0.3014, 0.2840, 0.4227]]], grad_fn=<AddBackward0>)



Train Diffusion:   3%|▎         | 161/5001 [22:31<10:25:13,  7.75s/it][A
Train Diffusion:   3%|▎         | 162/5001 [22:40<10:53:18,  8.10s/it][A
Train Diffusion:   3%|▎         | 163/5001 [22:47<10:34:24,  7.87s/it][A
Train Diffusion:   3%|▎         | 164/5001 [22:54<10:19:49,  7.69s/it][A
Train Diffusion:   3%|▎         | 165/5001 [23:02<10:04:57,  7.51s/it][A
Train Diffusion:   3%|▎         | 166/5001 [23:09<9:52:34,  7.35s/it] [A
Train Diffusion:   3%|▎         | 167/5001 [23:15<9:40:34,  7.21s/it][A
Train Diffusion:   3%|▎         | 168/5001 [23:23<9:52:09,  7.35s/it][A
Train Diffusion:   3%|▎         | 169/5001 [23:32<10:20:50,  7.71s/it][A
Train Diffusion:   3%|▎         | 170/5001 [23:40<10:40:58,  7.96s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 68357.77109375. Best ELBO loss value is: 63199.0546875.

C_PATH mean = tensor([[0.3715, 0.3708, 0.3715],
        [0.3712, 0.3708, 0.3704],
        [0.3701, 0.3706, 0.3709]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[0.3786, 0.1627, 0.1574],
         [0.1673, 0.1625, 0.1599],
         [0.1616, 0.1665, 0.1815],
         ...,
         [0.1794, 0.2334, 0.1941],
         [0.1778, 0.2179, 0.1916],
         [0.2790, 0.2363, 0.5201]],

        [[0.3841, 0.1687, 0.1751],
         [0.1587, 0.1702, 0.1994],
         [0.1513, 0.1692, 0.1793],
         ...,
         [0.2215, 0.1879, 0.2024],
         [0.2375, 0.1815, 0.2029],
         [0.2381, 0.2261, 0.3783]],

        [[0.3679, 0.1832, 0.1649],
         [0.1889, 0.1917, 0.1773],
         [0.1944, 0.2045, 0.1806],
         ...,
         [0.2175, 0.1971, 0.2060],
         [0.2118, 0.2191, 0.2065],
         [0.1933, 0.2574, 0.2868]]], grad_fn=<AddBackward0>)



Train Diffusion:   3%|▎         | 171/5001 [23:47<10:18:52,  7.69s/it][A
Train Diffusion:   3%|▎         | 172/5001 [23:55<10:07:53,  7.55s/it][A