In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm
import math

#Torch-related imports
import torch
import torch.distributions as D
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Function

#Model-specific imports
from SBM_SDE import *
from obs_and_flow import *
from training import calc_log_lik

In [2]:
torch.manual_seed(0)
np.random.seed(0)

In [3]:
temp_ref = 283
temp_rise = 5 #High estimate of 5 celsius temperature rise by 2100. 

#System parameters from deterministic CON model
u_M = 0.002
a_SD = 0.33
a_DS = 0.33
a_M = 0.33
a_MSC = 0.5
k_S_ref = 0.000025
k_D_ref = 0.005
k_M_ref = 0.0002
Ea_S = 75
Ea_D = 50
Ea_M = 50

#SCON diffusion matrix parameters
c_SOC = 0.1
c_DOC = 0.0001
c_MBC = 0.001
s_SOC = 0.001
s_DOC = 0.001
s_MBC = 0.001

SCON_C_params_dict = {'u_M': u_M, 'a_SD': a_SD, 'a_DS': a_DS, 'a_M': a_M, 'a_MSC': a_MSC, 'k_S_ref': k_S_ref, 'k_D_ref': k_D_ref, 'k_M_ref': k_M_ref, 'Ea_S': Ea_S, 'Ea_D': Ea_D, 'Ea_M': Ea_M, 'c_SOC': c_SOC, 'c_DOC': c_DOC, 'c_MBC': c_MBC}
SCON_SS_params_dict = {'u_M': u_M, 'a_SD': a_SD, 'a_DS': a_DS, 'a_M': a_M, 'a_MSC': a_MSC, 'k_S_ref': k_S_ref, 'k_D_ref': k_D_ref, 'k_M_ref': k_M_ref, 'Ea_S': Ea_S, 'Ea_D': Ea_D, 'Ea_M': Ea_M, 's_SOC': s_SOC, 's_DOC': s_DOC, 's_MBC': s_MBC}

#System parameters from deterministic AWB model
u_Q_ref = 0.2
Q = 0.002
a_MSA = 0.5
K_D = 200
K_U = 1
V_D_ref = 0.4
V_U_ref = 0.02
Ea_V_D = 75
Ea_V_U = 50
r_M = 0.0004
r_E = 0.00001
r_L = 0.0005

#SAWB diffusion matrix parameters
c_SOC = 2
c_DOC = 0.05
c_MBC = 0.1
c_EEC = 0.01
s_SOC = 0.1
s_DOC = 0.1
s_MBC = 0.1
s_EEC = 0.1

SAWB_C_params_dict = {'u_Q_ref': u_Q_ref, 'Q': Q, 'a_MSA': a_MSA, 'K_D': K_D, 'K_U': K_U, 'V_D_ref': V_D_ref, 'V_U_ref': V_U_ref, 'Ea_V_D': Ea_V_D, 'Ea_V_U': Ea_V_U, 'r_M': r_M, 'r_E': r_E, 'r_L': r_L, 'c_SOC': c_SOC, 'c_DOC': c_DOC, 'c_MBC': c_MBC, 'c_EEC': c_EEC}
SAWB_SS_params_dict = {'u_Q_ref': u_Q_ref, 'Q': Q, 'a_MSA': a_MSA, 'K_D': K_D, 'K_U': K_U, 'V_D_ref': V_D_ref, 'V_U_ref': V_U_ref, 'Ea_V_D': Ea_V_D, 'Ea_V_U': Ea_V_U, 'r_M': r_M, 'r_E': r_E, 'r_L': r_L, 's_SOC': s_SOC, 's_DOC': s_DOC, 's_MBC': s_MBC, 's_EEC': s_EEC}

#System parameters from deterministic AWB-ECA model
u_Q_ref = 0.2
Q = 0.002
a_MSA = 0.5
K_DE = 200
K_UE = 1
V_DE_ref = 0.4
V_UE_ref = 0.02
Ea_V_DE = 75
Ea_V_UE = 50
r_M = 0.0004
r_E = 0.00001
r_L = 0.0005

#SAWB-ECA diffusion matrix parameters
c_SOC = 2
c_DOC = 0.05
c_MBC = 0.1
c_EEC = 0.01
s_SOC = 0.1
s_DOC = 0.1
s_MBC = 0.1
s_EEC = 0.1

SAWB_ECA_C_params_dict = {'u_Q_ref': u_Q_ref, 'Q': Q, 'a_MSA': a_MSA, 'K_DE': K_DE, 'K_UE': K_UE, 'V_DE_ref': V_DE_ref, 'V_UE_ref': V_UE_ref, 'Ea_V_DE': Ea_V_DE, 'Ea_V_UE': Ea_V_UE, 'r_M': r_M, 'r_E': r_E, 'r_L': r_L, 'c_SOC': c_SOC, 'c_DOC': c_DOC, 'c_MBC': c_MBC, 'c_EEC': c_EEC}
SAWB_ECA_SS_params_dict = {'u_Q_ref': u_Q_ref, 'Q': Q, 'a_MSA': a_MSA, 'K_DE': K_DE, 'K_UE': K_UE, 'V_DE_ref': V_DE_ref, 'V_UE_ref': V_UE_ref, 'Ea_V_DE': Ea_V_DE, 'Ea_V_UE': Ea_V_UE, 'r_M': r_M, 'r_E': r_E, 'r_L': r_L, 's_SOC': s_SOC, 's_DOC': s_DOC, 's_MBC': s_MBC, 's_EEC': s_EEC}

In [4]:
#Set flow NN parameters.

devi = torch.device("".join(["cuda:",f'{cuda_id}']) if torch.cuda.is_available() else "cpu")
dt_flow = 0.2
t = 1000
n_flow = int(t / dt_flow) + 1
t_span = np.linspace(0, t, n_flow)
t_span_tensor = torch.reshape(torch.Tensor(t_span), [1, n_flow, 1]) #T_span needs to be converted to tensor object. Additionally, facilitates conversion of I_S and I_D to tensor objects.
l_r = 5e-4
niter = 5001
piter = 250
batch_size = 3 #Number of sets of observation outputs to sample per set of parameters.
state_dim_SCON = 3 #Not including CO2 in STATE_DIM, because CO2 is an observation.
obs_error_scale = 0.1 #Proportion of the mean of observation error standard deviation.

x0_SCON = [40, 0.08, 0.8] #Initial condition means for SCON

In [5]:
#Obtain temperature forcing function.
temp_tensor = temp_gen(t_span_tensor, temp_ref, temp_rise)
print(temp_tensor)

#Obtain SOC and DOC pool litter input vectors for use in flow SDE functions.
i_s_tensor = i_s(t_span_tensor) #Exogenous SOC input function
i_d_tensor = i_d(t_span_tensor) #Exogenous DOC input function
print(i_s_tensor)
print(i_d_tensor)

tensor([[[283.0000],
         [283.5248],
         [284.0482],
         ...,
         [281.4880],
         [281.1925],
         [280.9200]]])
tensor([[[0.0010],
         [0.0010],
         [0.0010],
         ...,
         [0.0013],
         [0.0013],
         [0.0013]]])
tensor([[[1.0000e-04],
         [1.0001e-04],
         [1.0001e-04],
         ...,
         [1.3286e-04],
         [1.3286e-04],
         [1.3287e-04]]])


In [6]:
def train(DEVICE, L_R, NITER, PRETRAIN_ITER, BATCH_SIZE, SDEFLOW, ObsModel, csv_to_obs_df, DATA_CSV, OBS_ERROR_SCALE, STATE_DIM, T, DT, N, T_SPAN_TENSOR, I_S_TENSOR, I_D_TENSOR, TEMP_TENSOR, TEMP_REF, C0, DRIFT_DIFFUSION, PARAMS_DICT): 
    #Read-in observation information. 
    obs_times, obs_means, obs_error = csv_to_obs_df(DATA_CSV, STATE_DIM, T, OBS_ERROR_SCALE)
    #Pass observation information to `ObsModel`.
    obs_model = ObsModel(DEVICE, obs_times, DT, obs_means, obs_error)
    net = SDEFlow(DEVICE, obs_model, STATE_DIM, T, DT, N, I_S_TENSOR, I_D_TENSOR, cond_inputs = 3, num_layers = 6).to(DEVICE)
    optimizer = optim.Adam(net.parameters(), lr = L_R)
    if PRETRAIN_ITER >= NITER:
        raise Exception("PRETRAIN_ITER must be < NITER.")
    best_loss_norm = 1e10
    best_loss_ELBO = 1e10
    norm_losses = [best_loss_norm] * 10
    ELBO_losses = [best_loss_ELBO] * 10
    C0_tensor = torch.tensor(C0).to(DEVICE) #Convert initial conditions from list to tensor for X0 prior object.
    #C0 = C0[(None,) * 2].repeat(BATCH_SIZE, 1, 1).to(DEVICE)
    PARAMS_DICT_TENSOR = {k: torch.tensor(v).expand(BATCH_SIZE) for k, v in PARAMS_DICT.items()}
    X0_prior = D.normal.Normal(loc = C0_tensor, scale = OBS_ERROR_SCALE * C0_tensor) #Setting prior noise = observation noise for now.
    with tqdm(total = NITER, desc = f'Train Diffusion', position = -1) as tq:
        for i in range(NITER):
            net.train()
            optimizer.zero_grad()
            C_PATH, log_prob = net(BATCH_SIZE) #For obs_and_flow.py
            #C_PATH = torch.cat([C0, C_PATH], 1) #Learning initial conditions in this version. #Append deterministic CON initial conditions conditional on parameter values to C path.
            if i <= PRETRAIN_ITER:
                l1_norm_element = C_PATH - torch.mean(obs_model.mu, -1)
                l1_norm = torch.sum(torch.abs(l1_norm_element)).mean()
                best_loss_norm = l1_norm if l1_norm < best_loss_norm else best_loss_norm
                norm_losses.append(l1_norm.item())
                #l2_norm_element = C_PATH - torch.mean(obs_model.mu, -1)
                #l2_norm = torch.sqrt(torch.sum(torch.square(l2_norm_element))).mean()
                #best_loss_norm = l2_norm if l2_norm < best_loss_norm else best_loss_norm
                #norm_losses.append(l2_norm.item())
                if i % 10 == 0:
                    ma_norm_loss = sum(norm_losses[-10:]) / len(norm_losses[-10:])
                    print(f"\nMoving average norm loss at {iter} iterations is: {ma_norm_loss}. Best norm loss value is: {best_loss_norm}.")
                    print('\nC_PATH mean =', C_PATH.mean(-2))
                    print('\nC_PATH =', C_PATH)
                l1_norm.backward()
                #l2_norm.backward()
            else:
                log_lik = calc_log_lik(C_PATH, T_SPAN_TENSOR.to(DEVICE), DT, I_S_TENSOR.to(DEVICE), I_D_TENSOR.to(DEVICE), TEMP_TENSOR.to(DEVICE), TEMP_REF, DRIFT_DIFFUSION, PARAMS_DICT)
                neg_ELBO = -X0_prior.log_prob(C_PATH[:, 0, :]).sum(-1).mean() - log_lik.mean() - obs_model(C_PATH, PARAMS_DICT_TENSOR) + log_prob.mean()
                best_loss_ELBO = neg_ELBO if neg_ELBO < best_loss_ELBO else best_loss_ELBO
                ELBO_losses.append(neg_ELBO.item())
                if i % 10 == 0:             
                    ma_elbo_loss = sum(ELBO_losses[-10:]) / len(ELBO_losses[-10:])
                    print(f"\nMoving average ELBO loss at {iter} iterations is: {ma_elbo_loss}. Best ELBO loss value is: {best_loss_ELBO}.")
                    print('\nC_PATH mean =', C_PATH.mean(-2))
                    print('\nC_PATH =', C_PATH)
            torch.nn.utils.clip_grad_norm_(net.parameters(), 3.0)
            optimizer.step()
            if i % 100000 == 0 and i > 0:
                optimizer.param_groups[0]['lr'] *= 0.1
            tq.update()
    return net, ELBO_losses, norm_losses

In [None]:
net, ELBO_losses, norm_losses = train(devi, l_r, niter, piter, batch_size, SDEFlow, ObsModel, csv_to_obs_df, 'y_from_x_t_1000_dt_0-01.csv', obs_error_scale, state_dim_SCON, t, dt_flow, n_flow, t_span_tensor, i_s_tensor, i_d_tensor, temp_tensor, temp_ref, x0_SCON, drift_diffusion_SCON_C, SCON_C_params_dict)


Train Diffusion:   0%|          | 0/5001 [00:00<?, ?it/s][A


Moving average norm loss at <built-in function iter> iterations is: 9000048777.76875. Best norm loss value is: 487777.6875.

C_PATH mean = tensor([[0.8412, 0.8549, 0.8230],
        [0.8406, 0.8506, 0.8430],
        [0.8338, 0.8452, 0.8374]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[0.7979, 0.4989, 0.8650],
         [1.0883, 0.3769, 2.7311],
         [0.3203, 0.7267, 1.9741],
         ...,
         [0.5538, 0.7271, 0.4697],
         [0.6251, 1.0343, 0.5232],
         [0.5830, 1.1936, 0.6196]],

        [[0.7932, 0.9848, 0.8966],
         [0.2745, 1.7503, 1.1032],
         [0.2097, 0.5167, 1.5775],
         ...,
         [0.7713, 1.1644, 0.7867],
         [0.6278, 0.5162, 0.7906],
         [0.8528, 0.6983, 0.7032]],

        [[0.5078, 1.8464, 0.8937],
         [1.3542, 0.9721, 0.2880],
         [3.1418, 1.7542, 0.5738],
         ...,
         [1.0607, 0.4409, 0.7835],
         [1.4567, 0.4050, 0.8300],
         [1.0807, 0.6015, 0.7105]]], grad_fn=<AddBackward0>)



Train Diffusion:   0%|          | 1/5001 [00:07<9:58:42,  7.18s/it][A
Train Diffusion:   0%|          | 2/5001 [00:14<9:49:22,  7.07s/it][A
Train Diffusion:   0%|          | 3/5001 [00:21<10:02:16,  7.23s/it][A
Train Diffusion:   0%|          | 4/5001 [00:29<10:39:11,  7.67s/it][A
Train Diffusion:   0%|          | 5/5001 [00:37<10:37:13,  7.65s/it][A
Train Diffusion:   0%|          | 6/5001 [00:45<10:55:55,  7.88s/it][A
Train Diffusion:   0%|          | 7/5001 [00:54<11:23:15,  8.21s/it][A
Train Diffusion:   0%|          | 8/5001 [01:03<11:36:15,  8.37s/it][A
Train Diffusion:   0%|          | 9/5001 [01:10<10:59:42,  7.93s/it][A
Train Diffusion:   0%|          | 10/5001 [01:20<11:47:24,  8.50s/it][A


Moving average norm loss at <built-in function iter> iterations is: 484292.625. Best norm loss value is: 481173.78125.

C_PATH mean = tensor([[1.1543, 0.9091, 1.0381],
        [1.1465, 0.9087, 1.0403],
        [1.1530, 0.9061, 1.0468]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[0.6522, 1.1212, 1.3921],
         [1.4355, 1.9493, 1.7745],
         [2.1817, 1.5417, 1.7370],
         ...,
         [3.0407, 0.2333, 0.4067],
         [0.8372, 0.9192, 0.8916],
         [1.6845, 0.8087, 0.8938]],

        [[0.7257, 1.5692, 1.2102],
         [1.2684, 1.1243, 1.5937],
         [1.3636, 0.9959, 1.1029],
         ...,
         [0.9383, 1.1271, 0.9637],
         [2.9841, 0.6358, 1.5744],
         [0.6144, 0.8518, 0.9835]],

        [[0.7725, 1.9250, 1.0835],
         [0.9704, 1.2086, 1.1118],
         [1.0455, 1.3292, 1.2594],
         ...,
         [0.8271, 0.9082, 1.0051],
         [0.9568, 0.5508, 0.8797],
         [1.0169, 0.9112, 1.2045]]], grad_fn=<AddBackward0>)



Train Diffusion:   0%|          | 11/5001 [01:30<12:25:17,  8.96s/it][A
Train Diffusion:   0%|          | 12/5001 [01:42<13:58:43, 10.09s/it][A
Train Diffusion:   0%|          | 13/5001 [01:50<13:01:03,  9.40s/it][A
Train Diffusion:   0%|          | 14/5001 [01:58<12:16:54,  8.87s/it][A
Train Diffusion:   0%|          | 15/5001 [02:06<11:52:14,  8.57s/it][A
Train Diffusion:   0%|          | 16/5001 [02:14<11:35:10,  8.37s/it][A
Train Diffusion:   0%|          | 17/5001 [02:22<11:37:35,  8.40s/it][A
Train Diffusion:   0%|          | 18/5001 [02:31<11:55:43,  8.62s/it][A
Train Diffusion:   0%|          | 19/5001 [02:39<11:44:08,  8.48s/it][A
Train Diffusion:   0%|          | 20/5001 [02:49<12:23:04,  8.95s/it][A


Moving average norm loss at <built-in function iter> iterations is: 472693.234375. Best norm loss value is: 460949.6875.

C_PATH mean = tensor([[2.6503, 0.9832, 1.4101],
        [2.6783, 0.9772, 1.3958],
        [2.7162, 0.9623, 1.4197]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[0.6845, 1.5477, 1.3498],
         [1.2877, 1.4343, 1.2334],
         [3.2653, 1.1825, 1.5352],
         ...,
         [1.6642, 1.4149, 0.9968],
         [5.5062, 1.2824, 1.6467],
         [1.3136, 0.8960, 3.9144]],

        [[1.1040, 1.3867, 1.3765],
         [1.4738, 1.1182, 1.3642],
         [3.1038, 0.8310, 1.3487],
         ...,
         [3.4430, 0.0181, 0.5995],
         [1.6094, 0.8840, 1.0150],
         [7.4385, 1.2926, 3.7144]],

        [[0.7172, 1.5399, 1.5112],
         [4.7422, 0.7112, 1.1579],
         [1.5840, 1.1552, 1.2511],
         ...,
         [1.4661, 0.9897, 1.4179],
         [1.6600, 0.2460, 1.0739],
         [1.4446, 0.8279, 0.5805]]], grad_fn=<AddBackward0>)



Train Diffusion:   0%|          | 21/5001 [02:57<11:53:15,  8.59s/it][A
Train Diffusion:   0%|          | 22/5001 [03:07<12:19:55,  8.92s/it][A
Train Diffusion:   0%|          | 23/5001 [03:19<13:32:47,  9.80s/it][A
Train Diffusion:   0%|          | 24/5001 [03:27<12:56:55,  9.37s/it][A
Train Diffusion:   0%|          | 25/5001 [03:34<12:01:48,  8.70s/it][A
Train Diffusion:   1%|          | 26/5001 [03:42<11:38:53,  8.43s/it][A
Train Diffusion:   1%|          | 27/5001 [03:51<11:47:45,  8.54s/it][A
Train Diffusion:   1%|          | 28/5001 [03:59<11:43:24,  8.49s/it][A
Train Diffusion:   1%|          | 29/5001 [04:08<11:44:42,  8.50s/it][A
Train Diffusion:   1%|          | 30/5001 [04:16<11:47:32,  8.54s/it][A


Moving average norm loss at <built-in function iter> iterations is: 427921.1375. Best norm loss value is: 403807.0625.

C_PATH mean = tensor([[6.6761, 0.9711, 1.2854],
        [6.5052, 0.9919, 1.2818],
        [6.6446, 0.9582, 1.2915]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[ 0.8440,  0.8017,  0.7987],
         [ 1.3761,  0.5969,  0.8278],
         [23.6338,  0.1432,  1.4931],
         ...,
         [ 2.5235,  0.3551,  1.0627],
         [30.7260,  0.8671,  0.2723],
         [ 3.0005,  0.3575,  1.5783]],

        [[ 0.8440,  1.0179,  0.6702],
         [11.0136,  1.0934,  1.1024],
         [ 1.4595,  0.6352,  0.6627],
         ...,
         [14.9440,  0.8288,  0.1340],
         [ 2.6676,  0.3511,  1.1464],
         [ 1.2944,  5.5067,  7.0452]],

        [[ 0.8440,  0.9485,  0.7933],
         [ 0.8518,  0.9370,  0.9250],
         [ 0.8794,  0.8919,  0.9367],
         ...,
         [ 0.4135,  0.9631,  1.8711],
         [ 0.5014,  2.9774,  1.1772],
         [ 0.2522,  1.7623,  1.2367


Train Diffusion:   1%|          | 31/5001 [04:24<11:27:08,  8.30s/it][A
Train Diffusion:   1%|          | 32/5001 [04:32<11:08:39,  8.07s/it][A
Train Diffusion:   1%|          | 33/5001 [04:42<12:05:05,  8.76s/it][A
Train Diffusion:   1%|          | 34/5001 [04:51<12:20:37,  8.95s/it][A
Train Diffusion:   1%|          | 35/5001 [05:00<12:20:48,  8.95s/it][A
Train Diffusion:   1%|          | 36/5001 [05:09<12:18:33,  8.93s/it][A
Train Diffusion:   1%|          | 37/5001 [05:17<11:42:05,  8.49s/it][A
Train Diffusion:   1%|          | 38/5001 [05:25<11:29:07,  8.33s/it][A
Train Diffusion:   1%|          | 39/5001 [05:33<11:32:48,  8.38s/it][A
Train Diffusion:   1%|          | 40/5001 [05:41<11:12:58,  8.14s/it][A


Moving average norm loss at <built-in function iter> iterations is: 375619.40625. Best norm loss value is: 359935.75.

C_PATH mean = tensor([[9.6557, 0.5436, 1.2972],
        [9.8715, 0.5453, 1.3050],
        [9.7430, 0.5272, 1.2918]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[ 1.1224,  0.9295,  5.3198],
         [ 1.2551,  1.0547,  2.2540],
         [ 0.6108,  0.7741,  0.7731],
         ...,
         [ 1.9366,  0.6477,  1.0566],
         [ 1.9125,  0.3659,  1.0642],
         [23.5513,  1.7610,  2.7778]],

        [[ 1.0526,  0.9657,  0.7708],
         [ 1.5881,  0.4159,  1.3071],
         [29.6813,  0.3188,  1.3026],
         ...,
         [28.2666,  0.2270,  1.2836],
         [ 4.2289,  0.2408,  0.8675],
         [ 0.9350,  1.7472,  2.8690]],

        [[ 0.7352,  1.0714,  0.9233],
         [10.0256,  1.0802,  0.8789],
         [ 2.1284,  0.4086,  1.4283],
         ...,
         [ 1.4696,  0.5044,  1.5035],
         [45.2667,  0.0818,  1.5153],
         [ 5.7756,  0.4607,  2.3183]


Train Diffusion:   1%|          | 41/5001 [05:49<11:07:18,  8.07s/it][A
Train Diffusion:   1%|          | 42/5001 [05:56<10:42:41,  7.78s/it][A
Train Diffusion:   1%|          | 43/5001 [06:03<10:25:35,  7.57s/it][A
Train Diffusion:   1%|          | 44/5001 [06:10<10:27:36,  7.60s/it][A
Train Diffusion:   1%|          | 45/5001 [06:17<10:10:38,  7.39s/it][A
Train Diffusion:   1%|          | 46/5001 [06:25<10:18:03,  7.48s/it][A
Train Diffusion:   1%|          | 47/5001 [06:33<10:17:20,  7.48s/it][A
Train Diffusion:   1%|          | 48/5001 [06:40<10:12:25,  7.42s/it][A
Train Diffusion:   1%|          | 49/5001 [06:47<10:16:10,  7.47s/it][A
Train Diffusion:   1%|          | 50/5001 [06:55<10:10:43,  7.40s/it][A


Moving average norm loss at <built-in function iter> iterations is: 348114.88125. Best norm loss value is: 337895.78125.

C_PATH mean = tensor([[11.1986,  0.4636,  1.2642],
        [10.8172,  0.4994,  1.2866],
        [11.4419,  0.4497,  1.2842]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[5.6390e-01, 7.5536e-01, 1.0456e+00],
         [1.8587e+01, 3.9103e-01, 1.1230e+00],
         [3.8795e+00, 1.7118e-01, 1.2566e+00],
         ...,
         [3.4021e+01, 7.0132e-02, 1.3577e+00],
         [8.3652e+00, 1.0759e-01, 6.5171e-01],
         [6.2838e-01, 1.0295e+00, 2.0195e+00]],

        [[1.4275e+00, 6.8272e-01, 9.7551e-01],
         [7.2062e-01, 6.4707e-01, 6.8643e-01],
         [6.1586e-01, 8.0982e-01, 1.2128e+00],
         ...,
         [7.1414e+00, 9.9681e-02, 1.0587e+00],
         [4.3255e+01, 4.6390e-02, 1.6048e+00],
         [1.0984e+01, 3.9324e-02, 5.1539e+00]],

        [[1.5578e+00, 6.7429e-01, 5.8075e-01],
         [1.1614e+00, 4.8752e-01, 1.3580e+00],
         [2.1994e+01, 6.74


Train Diffusion:   1%|          | 51/5001 [07:02<10:03:30,  7.32s/it][A
Train Diffusion:   1%|          | 52/5001 [07:09<10:06:01,  7.35s/it][A
Train Diffusion:   1%|          | 53/5001 [07:16<9:54:44,  7.21s/it] [A
Train Diffusion:   1%|          | 54/5001 [07:23<9:58:11,  7.26s/it][A
Train Diffusion:   1%|          | 55/5001 [07:30<9:46:00,  7.11s/it][A
Train Diffusion:   1%|          | 56/5001 [07:38<10:08:16,  7.38s/it][A
Train Diffusion:   1%|          | 57/5001 [07:46<10:12:21,  7.43s/it][A
Train Diffusion:   1%|          | 58/5001 [07:53<10:00:25,  7.29s/it][A
Train Diffusion:   1%|          | 59/5001 [08:00<9:48:20,  7.14s/it] [A
Train Diffusion:   1%|          | 60/5001 [08:07<9:55:42,  7.23s/it][A


Moving average norm loss at <built-in function iter> iterations is: 325430.203125. Best norm loss value is: 314833.8125.

C_PATH mean = tensor([[12.8033,  0.4021,  1.3039],
        [12.3945,  0.4422,  1.2882],
        [12.6214,  0.4346,  1.2680]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[5.9462e-01, 7.4400e-01, 1.2014e+00],
         [2.0583e+01, 1.7777e-01, 1.1876e+00],
         [6.8090e+00, 2.6509e-01, 6.3555e-01],
         ...,
         [3.6566e+00, 7.2190e-01, 6.9105e-01],
         [6.1435e-01, 3.2741e-01, 1.5882e+00],
         [2.0526e+00, 4.9633e-01, 1.0036e+00]],

        [[9.7362e-01, 7.4443e-01, 5.8593e-01],
         [6.8276e-01, 5.5748e-01, 5.0455e-01],
         [8.2412e+00, 2.1547e-01, 7.1060e-01],
         ...,
         [3.3590e+01, 3.5711e-02, 1.1753e+00],
         [1.0514e+01, 6.2997e-02, 1.2212e+00],
         [2.4210e+01, 6.5596e-01, 1.1413e+00]],

        [[3.2208e+00, 6.3341e-01, 8.7629e-01],
         [2.9821e+00, 2.0167e-01, 1.3175e+00],
         [1.2364e-06, 4.04


Train Diffusion:   1%|          | 61/5001 [08:15<10:07:26,  7.38s/it][A
Train Diffusion:   1%|          | 62/5001 [08:22<10:12:17,  7.44s/it][A
Train Diffusion:   1%|▏         | 63/5001 [08:29<10:00:53,  7.30s/it][A
Train Diffusion:   1%|▏         | 64/5001 [08:36<9:53:13,  7.21s/it] [A
Train Diffusion:   1%|▏         | 65/5001 [08:44<10:01:32,  7.31s/it][A
Train Diffusion:   1%|▏         | 66/5001 [08:51<10:02:57,  7.33s/it][A
Train Diffusion:   1%|▏         | 67/5001 [08:58<9:54:03,  7.22s/it] [A
Train Diffusion:   1%|▏         | 68/5001 [09:06<10:12:18,  7.45s/it][A
Train Diffusion:   1%|▏         | 69/5001 [09:13<10:06:55,  7.38s/it][A
Train Diffusion:   1%|▏         | 70/5001 [09:21<10:04:10,  7.35s/it][A


Moving average norm loss at <built-in function iter> iterations is: 306166.5625. Best norm loss value is: 296563.59375.

C_PATH mean = tensor([[13.6734,  0.3749,  1.3118],
        [13.6985,  0.3809,  1.3086],
        [13.5373,  0.3807,  1.3033]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[ 0.5374,  0.6610,  0.7868],
         [17.2073,  0.1327,  1.1407],
         [ 5.7282,  0.2455,  0.6294],
         ...,
         [23.5546,  0.1277,  1.3607],
         [ 8.9170,  0.2585,  0.9253],
         [26.0038,  0.9650,  0.8537]],

        [[ 2.9968,  0.5012,  0.2395],
         [ 2.9455,  0.1596,  1.7666],
         [36.7638,  0.0795,  1.1384],
         ...,
         [19.3607,  0.0934,  1.2223],
         [ 8.0158,  0.1662,  0.6240],
         [ 0.6112,  1.8144,  0.6416]],

        [[ 1.2211,  0.6277,  0.5729],
         [ 3.7778,  0.2674,  0.5219],
         [ 0.5882,  0.4835,  1.0056],
         ...,
         [11.5285,  0.0721,  0.7971],
         [39.7507,  0.1402,  1.5356],
         [14.9881,  0.143


Train Diffusion:   1%|▏         | 71/5001 [09:28<10:02:28,  7.33s/it][A
Train Diffusion:   1%|▏         | 72/5001 [09:35<9:50:57,  7.19s/it] [A
Train Diffusion:   1%|▏         | 73/5001 [09:42<9:52:19,  7.21s/it][A
Train Diffusion:   1%|▏         | 74/5001 [09:49<9:57:38,  7.28s/it][A
Train Diffusion:   1%|▏         | 75/5001 [09:57<9:54:32,  7.24s/it][A
Train Diffusion:   2%|▏         | 76/5001 [10:04<9:48:58,  7.18s/it][A
Train Diffusion:   2%|▏         | 77/5001 [10:11<9:48:43,  7.17s/it][A
Train Diffusion:   2%|▏         | 78/5001 [10:18<9:47:40,  7.16s/it][A
Train Diffusion:   2%|▏         | 79/5001 [10:25<9:48:35,  7.18s/it][A
Train Diffusion:   2%|▏         | 80/5001 [10:32<9:46:03,  7.15s/it][A


Moving average norm loss at <built-in function iter> iterations is: 285294.03125. Best norm loss value is: 277266.0.

C_PATH mean = tensor([[15.3209,  0.3523,  1.2803],
        [15.1794,  0.3559,  1.2883],
        [14.8268,  0.3895,  1.3031]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[ 0.7154,  1.1476,  0.8123],
         [16.8420,  0.1117,  1.0406],
         [ 7.8934,  0.1680,  0.5553],
         ...,
         [27.4853,  0.0579,  1.3240],
         [11.4889,  0.1294,  0.9309],
         [27.7748,  0.4573,  0.8382]],

        [[ 1.3221,  1.1626,  0.3274],
         [ 0.4286,  0.5069,  2.0787],
         [ 0.2613,  4.6830,  4.4413],
         ...,
         [23.1279,  0.1688,  1.2408],
         [ 7.7157,  0.3118,  1.6490],
         [ 1.1666,  0.5917,  1.1191]],

        [[ 3.7988,  1.1473,  2.3043],
         [ 2.7828,  0.1392,  1.5421],
         [37.2105,  0.0875,  1.1927],
         ...,
         [12.7962,  0.0783,  0.8910],
         [37.5889,  0.1301,  1.4585],
         [15.2032,  0.1289, 


Train Diffusion:   2%|▏         | 81/5001 [10:39<9:39:34,  7.07s/it][A
Train Diffusion:   2%|▏         | 82/5001 [10:47<9:50:02,  7.20s/it][A
Train Diffusion:   2%|▏         | 83/5001 [10:54<9:45:57,  7.15s/it][A
Train Diffusion:   2%|▏         | 84/5001 [11:00<9:34:31,  7.01s/it][A
Train Diffusion:   2%|▏         | 85/5001 [11:07<9:37:25,  7.05s/it][A
Train Diffusion:   2%|▏         | 86/5001 [11:15<9:59:03,  7.31s/it][A
Train Diffusion:   2%|▏         | 87/5001 [11:23<10:07:29,  7.42s/it][A
Train Diffusion:   2%|▏         | 88/5001 [11:31<10:32:15,  7.72s/it][A
Train Diffusion:   2%|▏         | 89/5001 [11:39<10:21:42,  7.59s/it][A
Train Diffusion:   2%|▏         | 90/5001 [11:46<10:22:06,  7.60s/it][A


Moving average norm loss at <built-in function iter> iterations is: 267387.1140625. Best norm loss value is: 259231.1875.

C_PATH mean = tensor([[16.3827,  0.3309,  1.3629],
        [16.6735,  0.3426,  1.3284],
        [16.2858,  0.3495,  1.3557]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[ 3.5691,  0.9291,  0.5613],
         [ 1.8622,  0.1553,  1.3298],
         [31.5308,  0.1321,  1.1972],
         ...,
         [27.6397,  0.0512,  1.5323],
         [14.0079,  0.0942,  0.8567],
         [27.1148,  0.3091,  1.4839]],

        [[ 0.5405,  0.9831,  0.5225],
         [21.9218,  0.0885,  1.3887],
         [11.9343,  0.0996,  0.5222],
         ...,
         [24.9919,  0.2070,  1.1432],
         [ 7.1374,  0.0427,  0.2558],
         [ 5.6287,  0.1891,  2.6061]],

        [[ 2.8050,  0.9184,  0.5132],
         [ 0.3240,  0.4056,  0.9496],
         [10.4564,  0.2790,  0.9113],
         ...,
         [15.4017,  0.0887,  0.8294],
         [34.1185,  0.1017,  1.4856],
         [13.3882,  0.3


Train Diffusion:   2%|▏         | 91/5001 [11:54<10:26:45,  7.66s/it][A
Train Diffusion:   2%|▏         | 92/5001 [12:02<10:24:40,  7.64s/it][A
Train Diffusion:   2%|▏         | 93/5001 [12:09<10:20:24,  7.58s/it][A
Train Diffusion:   2%|▏         | 94/5001 [12:16<10:09:15,  7.45s/it][A
Train Diffusion:   2%|▏         | 95/5001 [12:24<10:06:14,  7.41s/it][A
Train Diffusion:   2%|▏         | 96/5001 [12:31<10:11:31,  7.48s/it][A
Train Diffusion:   2%|▏         | 97/5001 [12:39<10:12:18,  7.49s/it][A
Train Diffusion:   2%|▏         | 98/5001 [12:47<10:18:48,  7.57s/it][A
Train Diffusion:   2%|▏         | 99/5001 [12:54<10:24:22,  7.64s/it][A
Train Diffusion:   2%|▏         | 100/5001 [13:02<10:23:07,  7.63s/it][A


Moving average norm loss at <built-in function iter> iterations is: 251114.653125. Best norm loss value is: 243172.625.

C_PATH mean = tensor([[17.5064,  0.3322,  1.4563],
        [17.7169,  0.3182,  1.4620],
        [17.5838,  0.3228,  1.4647]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[ 3.4338,  1.2859,  0.4993],
         [ 1.6151,  0.1586,  1.9207],
         [21.5614,  0.2202,  1.1598],
         ...,
         [38.0518,  0.0456,  1.3198],
         [18.4767,  0.1133,  1.1891],
         [28.4258,  0.2258,  1.3773]],

        [[ 3.0424,  1.3397,  0.3735],
         [ 0.3660,  0.2927,  1.2947],
         [25.2791,  0.0852,  0.9681],
         ...,
         [15.7936,  0.1078,  1.2788],
         [26.0114,  0.1300,  0.9771],
         [10.0472,  0.3743,  2.5524]],

        [[ 0.8128,  1.2793,  0.8046],
         [22.8363,  0.0985,  1.3715],
         [14.2197,  0.1116,  1.4640],
         ...,
         [ 0.4971,  0.0822,  5.3319],
         [34.8053,  0.1025,  1.4172],
         [16.2076,  0.105


Train Diffusion:   2%|▏         | 101/5001 [13:09<10:04:19,  7.40s/it][A
Train Diffusion:   2%|▏         | 102/5001 [13:16<9:48:42,  7.21s/it] [A
Train Diffusion:   2%|▏         | 103/5001 [13:24<10:12:33,  7.50s/it][A
Train Diffusion:   2%|▏         | 104/5001 [13:32<10:27:08,  7.68s/it][A
Train Diffusion:   2%|▏         | 105/5001 [13:40<10:31:40,  7.74s/it][A
Train Diffusion:   2%|▏         | 106/5001 [13:47<10:22:18,  7.63s/it][A
Train Diffusion:   2%|▏         | 107/5001 [13:54<10:04:09,  7.41s/it][A
Train Diffusion:   2%|▏         | 108/5001 [14:02<10:06:17,  7.43s/it][A
Train Diffusion:   2%|▏         | 109/5001 [14:09<9:58:30,  7.34s/it] [A
Train Diffusion:   2%|▏         | 110/5001 [14:16<9:46:34,  7.20s/it][A


Moving average norm loss at <built-in function iter> iterations is: 234845.1609375. Best norm loss value is: 227967.4375.

C_PATH mean = tensor([[18.9468,  0.3302,  1.4133],
        [18.7034,  0.3481,  1.4392],
        [18.3992,  0.3536,  1.4890]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[ 2.7554,  1.8686,  0.7591],
         [ 8.4372,  0.1582,  2.5080],
         [ 0.1189,  0.3951,  1.1781],
         ...,
         [16.4186,  0.0584,  0.9850],
         [42.8680,  0.0835,  1.3920],
         [20.8158,  0.0551,  4.0680]],

        [[ 4.1726,  1.7309,  0.3339],
         [ 6.0313,  0.0697,  2.1192],
         [39.6823,  0.1080,  1.2694],
         ...,
         [12.6260,  0.1737,  0.7142],
         [ 6.9497,  0.1338,  0.8795],
         [31.4484,  0.2308,  1.9285]],

        [[ 1.1701,  1.8285,  0.4894],
         [21.7784,  0.1072,  0.9915],
         [12.1592,  0.0748,  0.6155],
         ...,
         [30.4578,  0.1007,  1.4739],
         [17.5063,  0.0616,  0.8038],
         [ 0.9557,  0.5


Train Diffusion:   2%|▏         | 111/5001 [14:22<9:38:22,  7.10s/it][A
Train Diffusion:   2%|▏         | 112/5001 [14:29<9:29:33,  6.99s/it][A
Train Diffusion:   2%|▏         | 113/5001 [14:36<9:25:05,  6.94s/it][A
Train Diffusion:   2%|▏         | 114/5001 [14:43<9:32:35,  7.03s/it][A
Train Diffusion:   2%|▏         | 115/5001 [14:54<11:01:04,  8.12s/it][A
Train Diffusion:   2%|▏         | 116/5001 [15:05<12:05:19,  8.91s/it][A
Train Diffusion:   2%|▏         | 117/5001 [15:12<11:25:43,  8.42s/it][A
Train Diffusion:   2%|▏         | 118/5001 [15:19<10:52:32,  8.02s/it][A
Train Diffusion:   2%|▏         | 119/5001 [15:26<10:23:54,  7.67s/it][A
Train Diffusion:   2%|▏         | 120/5001 [15:33<10:07:42,  7.47s/it][A


Moving average norm loss at <built-in function iter> iterations is: 219464.8390625. Best norm loss value is: 213524.28125.

C_PATH mean = tensor([[19.6161,  0.3128,  1.3834],
        [20.0569,  0.2931,  1.3885],
        [19.6125,  0.3207,  1.3879]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[1.8574e+00, 2.3585e+00, 5.0153e-01],
         [2.1327e+01, 8.3467e-02, 1.2366e+00],
         [1.1453e+01, 9.6204e-02, 1.4895e+00],
         ...,
         [2.0721e+01, 7.4116e-02, 6.2247e-01],
         [7.3090e+00, 9.2166e-02, 9.6438e-01],
         [3.3362e+01, 9.8923e-02, 2.4513e+00]],

        [[2.1723e+00, 2.3944e+00, 4.7200e-01],
         [1.8590e+01, 1.2423e-01, 4.1132e-01],
         [1.3289e+00, 2.2457e-01, 1.4612e+00],
         ...,
         [1.9377e+01, 7.7096e-02, 8.4095e-01],
         [4.3280e+01, 7.9700e-02, 1.2870e+00],
         [2.1752e+01, 3.5917e-02, 2.8601e+00]],

        [[5.0799e+00, 1.9916e+00, 3.5546e-01],
         [8.7593e+00, 5.3441e-02, 1.8630e+00],
         [3.9151e+01, 9.


Train Diffusion:   2%|▏         | 121/5001 [15:42<10:45:15,  7.93s/it][A
Train Diffusion:   2%|▏         | 122/5001 [15:51<11:24:45,  8.42s/it][A
Train Diffusion:   2%|▏         | 123/5001 [16:00<11:35:56,  8.56s/it][A
Train Diffusion:   2%|▏         | 124/5001 [16:08<11:17:53,  8.34s/it][A
Train Diffusion:   2%|▏         | 125/5001 [16:16<10:58:49,  8.11s/it][A
Train Diffusion:   3%|▎         | 126/5001 [16:24<11:04:30,  8.18s/it][A
Train Diffusion:   3%|▎         | 127/5001 [16:33<11:34:01,  8.54s/it][A
Train Diffusion:   3%|▎         | 128/5001 [16:41<11:20:59,  8.38s/it][A
Train Diffusion:   3%|▎         | 129/5001 [16:52<12:10:50,  9.00s/it][A
Train Diffusion:   3%|▎         | 130/5001 [17:02<12:42:34,  9.39s/it][A


Moving average norm loss at <built-in function iter> iterations is: 204210.6375. Best norm loss value is: 195977.421875.

C_PATH mean = tensor([[20.7286,  0.3493,  1.3660],
        [20.8195,  0.3300,  1.3846],
        [20.8137,  0.3427,  1.4034]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[1.5492e+00, 2.5496e+00, 5.6037e-01],
         [2.2686e+01, 5.9877e-02, 1.3606e+00],
         [1.8785e+01, 6.1962e-02, 1.1301e+00],
         ...,
         [3.2908e+01, 6.1455e-02, 1.5152e+00],
         [1.9841e+01, 1.2434e-01, 8.7482e-01],
         [2.0027e+01, 2.8267e-01, 2.0296e+00]],

        [[4.2137e+00, 2.3719e+00, 5.9999e-01],
         [2.9121e-01, 2.4509e-01, 4.9997e-01],
         [6.6680e+00, 3.1508e+00, 1.8054e-01],
         ...,
         [1.9834e+01, 1.4026e-01, 7.7438e-01],
         [3.9833e+01, 8.5649e-02, 1.3314e+00],
         [2.2130e+01, 1.5271e-01, 8.7647e-01]],

        [[4.5099e+00, 2.2339e+00, 4.7554e-01],
         [2.2675e+00, 1.1533e-01, 1.6186e+00],
         [3.2226e+01, 8.27


Train Diffusion:   3%|▎         | 131/5001 [17:13<13:06:00,  9.68s/it][A
Train Diffusion:   3%|▎         | 132/5001 [17:21<12:46:50,  9.45s/it][A
Train Diffusion:   3%|▎         | 133/5001 [17:29<12:07:03,  8.96s/it][A
Train Diffusion:   3%|▎         | 134/5001 [17:40<12:39:31,  9.36s/it][A
Train Diffusion:   3%|▎         | 135/5001 [17:50<12:57:45,  9.59s/it][A
Train Diffusion:   3%|▎         | 136/5001 [18:00<13:23:08,  9.91s/it][A
Train Diffusion:   3%|▎         | 137/5001 [18:12<14:04:36, 10.42s/it][A
Train Diffusion:   3%|▎         | 138/5001 [18:19<12:48:31,  9.48s/it][A
Train Diffusion:   3%|▎         | 139/5001 [18:26<11:49:16,  8.75s/it][A
Train Diffusion:   3%|▎         | 140/5001 [18:34<11:16:29,  8.35s/it][A


Moving average norm loss at <built-in function iter> iterations is: 187954.7984375. Best norm loss value is: 180312.234375.

C_PATH mean = tensor([[21.8666,  0.3284,  1.3743],
        [21.5908,  0.3569,  1.4022],
        [21.4946,  0.3287,  1.4100]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[ 2.0849,  3.2869,  0.4063],
         [17.1470,  0.2543,  0.1664],
         [ 0.7268,  0.2730,  0.4488],
         ...,
         [19.3701,  0.0439,  0.7961],
         [39.9991,  0.0836,  1.1409],
         [23.5030,  0.0561,  1.0567]],

        [[ 3.4392,  3.3534,  0.6247],
         [21.9799,  0.0831,  1.4270],
         [16.9388,  0.0711,  1.2583],
         ...,
         [30.1253,  0.0899,  1.2219],
         [22.6473,  0.0487,  0.8457],
         [32.8342,  0.1277,  1.2006]],

        [[ 5.6485,  2.4706,  0.8576],
         [10.8459,  0.0504,  1.7268],
         [35.3953,  0.0873,  1.5326],
         ...,
         [ 1.6833,  2.4963,  0.3498],
         [ 2.0637,  0.1459,  0.4579],
         [ 4.3891,  0


Train Diffusion:   3%|▎         | 141/5001 [18:41<10:48:23,  8.00s/it][A
Train Diffusion:   3%|▎         | 142/5001 [18:48<10:36:17,  7.86s/it][A
Train Diffusion:   3%|▎         | 143/5001 [18:56<10:37:38,  7.88s/it][A
Train Diffusion:   3%|▎         | 144/5001 [19:04<10:26:25,  7.74s/it][A
Train Diffusion:   3%|▎         | 145/5001 [19:13<10:55:19,  8.10s/it][A
Train Diffusion:   3%|▎         | 146/5001 [19:21<10:56:44,  8.12s/it][A
Train Diffusion:   3%|▎         | 147/5001 [19:28<10:22:49,  7.70s/it][A
Train Diffusion:   3%|▎         | 148/5001 [19:34<9:58:34,  7.40s/it] [A
Train Diffusion:   3%|▎         | 149/5001 [19:41<9:45:38,  7.24s/it][A
Train Diffusion:   3%|▎         | 150/5001 [19:48<9:31:23,  7.07s/it][A


Moving average norm loss at <built-in function iter> iterations is: 173744.525. Best norm loss value is: 166780.84375.

C_PATH mean = tensor([[22.4722,  0.3416,  1.5575],
        [22.4754,  0.2982,  1.5333],
        [22.6649,  0.2922,  1.5237]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[ 1.9809,  2.9781,  0.7084],
         [23.1429,  0.0683,  1.5265],
         [21.9182,  0.0635,  1.1921],
         ...,
         [22.5910,  0.0862,  0.9417],
         [23.2914,  0.0888,  3.7107],
         [22.7981,  1.9620,  0.1141]],

        [[ 4.9882,  2.7533,  0.4134],
         [ 0.7869,  0.2120,  2.1296],
         [29.9310,  0.0596,  1.2440],
         ...,
         [32.4868,  0.0854,  1.5442],
         [26.1430,  0.0768,  0.8666],
         [15.9310,  0.3847,  0.3240]],

        [[ 5.3079,  2.4915,  0.8014],
         [ 2.3207,  0.1079,  2.1787],
         [29.0478,  0.0587,  1.3182],
         ...,
         [ 0.4260, 13.4127,  0.6833],
         [29.7626,  0.0963,  0.8659],
         [21.9170,  0.1039


Train Diffusion:   3%|▎         | 151/5001 [19:55<9:21:51,  6.95s/it][A
Train Diffusion:   3%|▎         | 152/5001 [20:02<9:23:22,  6.97s/it][A
Train Diffusion:   3%|▎         | 153/5001 [20:08<9:18:11,  6.91s/it][A
Train Diffusion:   3%|▎         | 154/5001 [20:15<9:17:29,  6.90s/it][A
Train Diffusion:   3%|▎         | 155/5001 [20:23<9:33:45,  7.10s/it][A
Train Diffusion:   3%|▎         | 156/5001 [20:30<9:38:28,  7.16s/it][A
Train Diffusion:   3%|▎         | 157/5001 [20:37<9:36:51,  7.15s/it][A
Train Diffusion:   3%|▎         | 158/5001 [20:44<9:41:28,  7.20s/it][A
Train Diffusion:   3%|▎         | 159/5001 [20:52<9:52:32,  7.34s/it][A
Train Diffusion:   3%|▎         | 160/5001 [20:59<9:46:37,  7.27s/it][A


Moving average norm loss at <built-in function iter> iterations is: 161739.5265625. Best norm loss value is: 153549.015625.

C_PATH mean = tensor([[23.6382,  0.3491,  1.5005],
        [23.7814,  0.3092,  1.4886],
        [23.7212,  0.3303,  1.4764]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[6.1868e+00, 2.7908e+00, 1.4655e+01],
         [1.9299e-01, 4.9335e-01, 2.7735e+00],
         [3.3857e-02, 3.3686e+00, 2.9903e+00],
         ...,
         [3.0436e+01, 7.5935e-02, 1.4491e+00],
         [6.7972e+00, 6.9537e+00, 7.4663e-01],
         [3.3405e+01, 1.2342e-01, 1.4462e+00]],

        [[4.2354e+00, 3.4488e+00, 1.6028e-01],
         [6.4256e-01, 6.2965e-01, 3.2329e+00],
         [4.0315e+01, 7.8111e-02, 1.3800e+00],
         ...,
         [2.5267e+01, 5.2948e-02, 1.0238e+00],
         [4.2384e+01, 7.3284e-02, 1.2675e+00],
         [2.5920e+01, 4.0359e-02, 2.4654e+00]],

        [[2.4018e+00, 3.4263e+00, 5.4899e-01],
         [2.4274e+01, 6.5582e-02, 1.0603e+00],
         [2.0991e+01, 5


Train Diffusion:   3%|▎         | 161/5001 [21:06<9:45:36,  7.26s/it][A
Train Diffusion:   3%|▎         | 162/5001 [21:14<9:54:47,  7.38s/it][A
Train Diffusion:   3%|▎         | 163/5001 [21:22<10:08:38,  7.55s/it][A
Train Diffusion:   3%|▎         | 164/5001 [21:32<10:56:20,  8.14s/it][A
Train Diffusion:   3%|▎         | 165/5001 [21:39<10:44:02,  7.99s/it][A
Train Diffusion:   3%|▎         | 166/5001 [21:49<11:22:14,  8.47s/it][A
Train Diffusion:   3%|▎         | 167/5001 [21:56<10:50:19,  8.07s/it][A
Train Diffusion:   3%|▎         | 168/5001 [22:06<11:46:41,  8.77s/it][A
Train Diffusion:   3%|▎         | 169/5001 [22:14<11:22:06,  8.47s/it][A
Train Diffusion:   3%|▎         | 170/5001 [22:22<11:12:22,  8.35s/it][A


Moving average norm loss at <built-in function iter> iterations is: 150229.5375. Best norm loss value is: 143414.125.

C_PATH mean = tensor([[24.2528,  0.2856,  1.5114],
        [24.2599,  0.3132,  1.5267],
        [24.1323,  0.3185,  1.5110]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[5.3922e+00, 4.2821e+00, 1.8558e-01],
         [1.0345e+00, 2.8379e-01, 2.1853e+00],
         [2.8346e+01, 4.8809e-02, 1.0390e+00],
         ...,
         [3.2819e+01, 7.1529e-02, 1.3251e+00],
         [2.9290e+01, 8.0592e-02, 9.7617e-01],
         [2.2757e+01, 1.3208e-01, 2.0531e+00]],

        [[5.9067e+00, 3.5398e+00, 2.1439e+00],
         [4.0670e+00, 7.4082e-02, 2.0259e+00],
         [3.0377e+01, 4.6466e-02, 1.1801e+00],
         ...,
         [1.8436e+01, 1.6574e-01, 1.1566e+00],
         [2.1277e+01, 3.5295e-02, 3.9661e+00],
         [1.9892e+01, 5.2899e-01, 7.8790e-01]],

        [[2.3783e+00, 4.1949e+00, 8.2971e-01],
         [2.3933e+01, 5.2303e-02, 1.2555e+00],
         [2.5434e+01, 8.4653e


Train Diffusion:   3%|▎         | 171/5001 [22:30<10:52:40,  8.11s/it][A
Train Diffusion:   3%|▎         | 172/5001 [22:38<10:48:24,  8.06s/it][A
Train Diffusion:   3%|▎         | 173/5001 [22:46<10:45:01,  8.02s/it][A
Train Diffusion:   3%|▎         | 174/5001 [22:53<10:20:14,  7.71s/it][A
Train Diffusion:   3%|▎         | 175/5001 [23:00<10:06:16,  7.54s/it][A
Train Diffusion:   4%|▎         | 176/5001 [23:07<10:02:15,  7.49s/it][A
Train Diffusion:   4%|▎         | 177/5001 [23:14<9:49:27,  7.33s/it] [A
Train Diffusion:   4%|▎         | 178/5001 [23:21<9:40:49,  7.23s/it][A
Train Diffusion:   4%|▎         | 179/5001 [23:29<9:51:13,  7.36s/it][A
Train Diffusion:   4%|▎         | 180/5001 [23:36<9:57:35,  7.44s/it][A


Moving average norm loss at <built-in function iter> iterations is: 136866.7328125. Best norm loss value is: 131011.5.

C_PATH mean = tensor([[24.8946,  0.2409,  1.5114],
        [24.8346,  0.2959,  1.4998],
        [24.9138,  0.2280,  1.5613]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[6.2076e+00, 3.6752e+00, 2.3532e+00],
         [3.2851e+00, 6.5235e-02, 1.7297e+00],
         [3.1153e+01, 4.8471e-02, 1.3189e+00],
         ...,
         [2.9387e+01, 3.0750e-02, 1.6316e+00],
         [2.7620e+01, 5.6853e-02, 7.7475e-01],
         [7.6791e+00, 3.8478e+00, 6.1173e-01]],

        [[5.8274e+00, 4.3287e+00, 2.2259e-01],
         [1.8382e+00, 2.0502e-01, 2.0339e+00],
         [2.9629e+01, 5.1536e-02, 1.3050e+00],
         ...,
         [2.7440e+01, 6.9656e-02, 1.3599e+00],
         [2.2039e+01, 8.0852e-02, 1.6949e-01],
         [1.7936e+01, 9.2996e+00, 1.0559e-01]],

        [[2.8094e+00, 4.7692e+00, 5.8972e-01],
         [2.3958e+01, 6.1849e-02, 1.4546e+00],
         [2.7045e+01, 7.5158


Train Diffusion:   4%|▎         | 181/5001 [23:44<10:01:15,  7.48s/it][A
Train Diffusion:   4%|▎         | 182/5001 [23:52<10:17:20,  7.69s/it][A
Train Diffusion:   4%|▎         | 183/5001 [23:59<10:07:22,  7.56s/it][A
Train Diffusion:   4%|▎         | 184/5001 [24:07<10:07:28,  7.57s/it][A
Train Diffusion:   4%|▎         | 185/5001 [24:17<11:06:39,  8.31s/it][A
Train Diffusion:   4%|▎         | 186/5001 [24:26<11:18:09,  8.45s/it][A
Train Diffusion:   4%|▎         | 187/5001 [24:36<12:02:01,  9.00s/it][A
Train Diffusion:   4%|▍         | 188/5001 [24:45<12:04:40,  9.03s/it][A
Train Diffusion:   4%|▍         | 189/5001 [24:57<13:01:16,  9.74s/it][A
Train Diffusion:   4%|▍         | 190/5001 [25:04<12:11:44,  9.13s/it][A


Moving average norm loss at <built-in function iter> iterations is: 124785.13359375. Best norm loss value is: 117905.125.

C_PATH mean = tensor([[25.7990,  0.3579,  1.5268],
        [25.8054,  0.3024,  1.4896],
        [25.6704,  0.3272,  1.4963]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[7.2922e+00, 3.5799e+00, 6.5934e-01],
         [1.7552e+01, 3.3634e-02, 1.9934e+00],
         [3.6814e+01, 7.3575e-02, 1.4383e+00],
         ...,
         [2.2499e+01, 8.6520e-02, 1.2223e+00],
         [2.3754e+01, 3.5904e-02, 3.7981e+00],
         [1.1436e+01, 2.9025e+00, 3.0787e-01]],

        [[3.8379e+00, 5.2325e+00, 6.2856e-01],
         [2.3299e+01, 3.8372e-02, 1.4460e+00],
         [2.2445e+01, 4.3059e-02, 1.6080e+00],
         ...,
         [3.3753e+01, 7.7227e-02, 1.4803e+00],
         [3.1191e+01, 6.8464e-02, 9.3350e-01],
         [3.0625e+01, 8.8162e-02, 1.8135e+00]],

        [[4.3498e+00, 5.2504e+00, 3.9024e-01],
         [1.9375e+01, 1.0802e-01, 2.0895e-01],
         [4.1467e+00, 8.8


Train Diffusion:   4%|▍         | 191/5001 [25:13<12:02:21,  9.01s/it][A
Train Diffusion:   4%|▍         | 192/5001 [25:22<11:51:28,  8.88s/it][A
Train Diffusion:   4%|▍         | 193/5001 [25:32<12:18:15,  9.21s/it][A
Train Diffusion:   4%|▍         | 194/5001 [25:41<12:29:10,  9.35s/it][A
Train Diffusion:   4%|▍         | 195/5001 [25:50<12:22:54,  9.27s/it][A
Train Diffusion:   4%|▍         | 196/5001 [25:59<11:57:14,  8.96s/it][A
Train Diffusion:   4%|▍         | 197/5001 [26:08<12:02:27,  9.02s/it][A
Train Diffusion:   4%|▍         | 198/5001 [26:16<11:50:24,  8.87s/it][A
Train Diffusion:   4%|▍         | 199/5001 [26:24<11:12:40,  8.40s/it][A
Train Diffusion:   4%|▍         | 200/5001 [26:31<10:53:21,  8.17s/it][A


Moving average norm loss at <built-in function iter> iterations is: 112088.7703125. Best norm loss value is: 106583.2421875.

C_PATH mean = tensor([[26.6797,  0.3043,  1.5095],
        [26.5939,  0.2786,  1.4949],
        [26.8852,  0.2591,  1.4747]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[6.6861e+00, 5.3466e+00, 1.1255e+01],
         [2.5129e-02, 4.2476e+00, 1.1216e+01],
         [1.1073e+00, 7.5962e-02, 1.4561e+00],
         ...,
         [1.9873e+01, 1.8769e-02, 3.9542e+00],
         [2.4848e+01, 7.4440e-02, 1.1087e+00],
         [3.5608e+01, 5.1105e-02, 1.5253e+00]],

        [[3.3356e+00, 6.4863e+00, 3.3702e-01],
         [2.1211e+01, 6.2661e-02, 4.4638e-01],
         [2.3189e+01, 3.4759e-02, 1.6791e+00],
         ...,
         [2.0809e+01, 3.1083e-02, 1.2902e+00],
         [3.9274e+01, 5.1545e-02, 1.3429e+00],
         [3.0309e+01, 1.5409e-01, 1.3160e+00]],

        [[6.3715e+00, 6.2120e+00, 1.4041e-01],
         [5.2970e+00, 1.7475e-01, 2.8846e+00],
         [3.6743e+01, 


Train Diffusion:   4%|▍         | 201/5001 [26:38<10:29:22,  7.87s/it][A
Train Diffusion:   4%|▍         | 202/5001 [26:48<11:05:07,  8.32s/it][A
Train Diffusion:   4%|▍         | 203/5001 [26:57<11:25:20,  8.57s/it][A
Train Diffusion:   4%|▍         | 204/5001 [27:05<11:21:28,  8.52s/it][A
Train Diffusion:   4%|▍         | 205/5001 [27:13<10:54:42,  8.19s/it][A
Train Diffusion:   4%|▍         | 206/5001 [27:20<10:32:52,  7.92s/it][A
Train Diffusion:   4%|▍         | 207/5001 [27:27<10:03:19,  7.55s/it][A
Train Diffusion:   4%|▍         | 208/5001 [27:34<9:48:21,  7.37s/it] [A
Train Diffusion:   4%|▍         | 209/5001 [27:41<9:46:10,  7.34s/it][A
Train Diffusion:   4%|▍         | 210/5001 [27:50<10:28:30,  7.87s/it][A


Moving average norm loss at <built-in function iter> iterations is: 103729.39375. Best norm loss value is: 99694.8828125.

C_PATH mean = tensor([[27.2610,  0.3264,  1.5779],
        [27.0565,  0.2937,  1.6248],
        [27.2366,  0.2828,  1.6387]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[4.7526e+00, 6.7335e+00, 3.1051e-01],
         [1.8360e+01, 7.4533e-01, 1.5673e-01],
         [1.6807e+00, 8.4833e-02, 1.8765e+00],
         ...,
         [2.9834e+01, 4.0942e-02, 1.2767e+00],
         [3.2141e+01, 6.4329e-02, 8.6727e-01],
         [3.2471e+01, 2.2287e-02, 1.7339e+00]],

        [[5.2320e+00, 6.8020e+00, 6.4348e-01],
         [2.3423e+01, 3.0812e-02, 1.5962e+00],
         [2.9024e+01, 3.6181e-02, 1.8774e+00],
         ...,
         [2.9015e+01, 1.9783e-02, 1.7006e+00],
         [3.1563e+01, 4.6689e-02, 9.7008e-01],
         [2.3364e+01, 1.5329e-01, 2.2750e+00]],

        [[8.1745e+00, 4.5921e+00, 1.0584e+00],
         [2.0211e+01, 2.7530e-02, 2.2543e+00],
         [3.9243e+01, 5.0


Train Diffusion:   4%|▍         | 211/5001 [27:58<10:34:33,  7.95s/it][A
Train Diffusion:   4%|▍         | 212/5001 [28:06<10:44:11,  8.07s/it][A
Train Diffusion:   4%|▍         | 213/5001 [28:13<10:18:31,  7.75s/it][A
Train Diffusion:   4%|▍         | 214/5001 [28:21<10:05:01,  7.58s/it][A
Train Diffusion:   4%|▍         | 215/5001 [28:28<9:51:39,  7.42s/it] [A
Train Diffusion:   4%|▍         | 216/5001 [28:35<9:46:34,  7.36s/it][A
Train Diffusion:   4%|▍         | 217/5001 [28:42<9:46:17,  7.35s/it][A
Train Diffusion:   4%|▍         | 218/5001 [28:50<9:59:58,  7.53s/it][A
Train Diffusion:   4%|▍         | 219/5001 [28:58<10:01:59,  7.55s/it][A
Train Diffusion:   4%|▍         | 220/5001 [29:05<9:57:42,  7.50s/it] [A


Moving average norm loss at <built-in function iter> iterations is: 94430.20234375. Best norm loss value is: 89125.84375.

C_PATH mean = tensor([[27.8122,  0.2745,  1.6155],
        [28.0174,  0.2926,  1.6257],
        [27.9635,  0.2579,  1.6412]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[7.4561e+00, 6.9126e+00, 1.9295e+00],
         [6.8667e+00, 9.5191e-02, 1.2191e+00],
         [1.2214e+01, 4.8806e-02, 1.7925e+00],
         ...,
         [2.4250e+01, 2.6369e-02, 1.1226e+00],
         [3.0206e+01, 4.8575e-02, 1.3508e+00],
         [3.0752e+01, 2.4810e-01, 8.0368e-01]],

        [[4.3057e+00, 6.5822e+00, 5.1984e-01],
         [2.3142e+01, 2.0886e-02, 1.5577e+00],
         [2.5116e+01, 4.2687e-02, 1.7195e+00],
         ...,
         [3.6435e+01, 9.7043e-03, 1.6879e+00],
         [3.0724e+01, 1.8231e-01, 1.0697e+00],
         [3.2877e+01, 1.2168e-01, 6.1729e-01]],

        [[8.0561e+00, 4.9433e+00, 4.9539e-01],
         [1.2930e+01, 7.5111e-02, 2.7681e+00],
         [3.6704e+01, 5.1


Train Diffusion:   4%|▍         | 221/5001 [29:13<9:54:52,  7.47s/it][A
Train Diffusion:   4%|▍         | 222/5001 [29:19<9:41:16,  7.30s/it][A
Train Diffusion:   4%|▍         | 223/5001 [29:27<9:53:42,  7.46s/it][A
Train Diffusion:   4%|▍         | 224/5001 [29:36<10:31:25,  7.93s/it][A
Train Diffusion:   4%|▍         | 225/5001 [29:45<10:41:32,  8.06s/it][A
Train Diffusion:   5%|▍         | 226/5001 [29:52<10:19:14,  7.78s/it][A
Train Diffusion:   5%|▍         | 227/5001 [29:59<10:08:55,  7.65s/it][A
Train Diffusion:   5%|▍         | 228/5001 [30:06<9:51:52,  7.44s/it] [A
Train Diffusion:   5%|▍         | 229/5001 [30:16<10:50:12,  8.18s/it][A
Train Diffusion:   5%|▍         | 230/5001 [30:26<11:43:25,  8.85s/it][A


Moving average norm loss at <built-in function iter> iterations is: 86753.5046875. Best norm loss value is: 83296.546875.

C_PATH mean = tensor([[28.3955,  0.3116,  1.5502],
        [28.6875,  0.2676,  1.5271],
        [28.4469,  0.2977,  1.5644]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[4.6509e+00, 7.2774e+00, 8.1721e-01],
         [2.5653e+01, 3.0516e-02, 1.2754e+00],
         [3.1854e+01, 8.1933e-02, 1.6199e+00],
         ...,
         [3.3828e+01, 2.2680e-02, 1.2018e+00],
         [3.2432e+01, 7.8234e-02, 1.0886e+00],
         [2.9789e+01, 4.8961e-02, 1.0568e+00]],

        [[6.4430e+00, 7.8187e+00, 2.7571e-01],
         [5.1797e-01, 1.8771e+00, 8.7473e-01],
         [2.2612e+01, 6.0528e-02, 9.4083e-01],
         ...,
         [3.2448e+01, 6.9273e-02, 1.2117e+00],
         [2.3869e+01, 6.0099e-02, 1.6122e+00],
         [2.2647e+01, 6.2197e+00, 5.8127e-01]],

        [[8.3918e+00, 4.9171e+00, 1.7430e+00],
         [2.1721e+01, 2.6070e-02, 2.0927e+00],
         [2.9126e+01, 3.6


Train Diffusion:   5%|▍         | 231/5001 [30:35<11:35:03,  8.74s/it][A
Train Diffusion:   5%|▍         | 232/5001 [30:43<11:14:00,  8.48s/it][A
Train Diffusion:   5%|▍         | 233/5001 [30:53<11:44:25,  8.86s/it][A
Train Diffusion:   5%|▍         | 234/5001 [31:00<11:16:30,  8.51s/it][A
Train Diffusion:   5%|▍         | 235/5001 [31:08<10:48:27,  8.16s/it][A
Train Diffusion:   5%|▍         | 236/5001 [31:16<10:59:07,  8.30s/it][A
Train Diffusion:   5%|▍         | 237/5001 [31:25<11:08:13,  8.42s/it][A
Train Diffusion:   5%|▍         | 238/5001 [31:33<10:52:39,  8.22s/it][A
Train Diffusion:   5%|▍         | 239/5001 [31:40<10:39:01,  8.05s/it][A
Train Diffusion:   5%|▍         | 240/5001 [31:49<10:42:06,  8.09s/it][A


Moving average norm loss at <built-in function iter> iterations is: 84882.27734375. Best norm loss value is: 82376.375.

C_PATH mean = tensor([[28.3498,  0.3392,  1.5875],
        [28.1388,  0.3643,  1.6136],
        [28.1286,  0.3621,  1.6928]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[4.9914e+00, 8.2352e+00, 4.2344e-01],
         [2.6139e+01, 1.7688e-02, 1.3354e+00],
         [3.0966e+01, 7.8708e-02, 1.6928e+00],
         ...,
         [3.1967e+01, 3.4637e-01, 4.5370e-01],
         [3.3220e+01, 1.5367e-02, 1.3569e+00],
         [3.0160e+01, 1.9005e-01, 3.1680e+00]],

        [[9.6505e+00, 5.3759e+00, 8.1449e-01],
         [1.9592e+01, 3.6793e-02, 2.5036e+00],
         [2.8184e+01, 2.2298e-02, 1.5881e+00],
         ...,
         [2.8514e+01, 4.3609e-02, 1.0791e+00],
         [2.5621e+01, 5.7303e-02, 1.3673e+00],
         [3.2976e+01, 6.4013e-01, 6.2596e-01]],

        [[8.2728e+00, 9.4435e+00, 2.3254e-01],
         [4.0558e+00, 1.7294e-01, 4.5367e+00],
         [5.9134e-01, 2.837


Train Diffusion:   5%|▍         | 241/5001 [31:57<10:44:12,  8.12s/it][A
Train Diffusion:   5%|▍         | 242/5001 [32:09<12:30:04,  9.46s/it][A
Train Diffusion:   5%|▍         | 243/5001 [32:21<13:21:07, 10.10s/it][A
Train Diffusion:   5%|▍         | 244/5001 [32:29<12:42:52,  9.62s/it][A
Train Diffusion:   5%|▍         | 245/5001 [32:38<12:12:55,  9.25s/it][A
Train Diffusion:   5%|▍         | 246/5001 [32:46<11:52:34,  8.99s/it][A
Train Diffusion:   5%|▍         | 247/5001 [32:57<12:37:10,  9.56s/it][A
Train Diffusion:   5%|▍         | 248/5001 [33:07<12:51:37,  9.74s/it][A
Train Diffusion:   5%|▍         | 249/5001 [33:16<12:36:01,  9.55s/it][A
Train Diffusion:   5%|▍         | 250/5001 [33:25<12:14:57,  9.28s/it][A


Moving average norm loss at <built-in function iter> iterations is: 82558.45390625. Best norm loss value is: 77829.078125.

C_PATH mean = tensor([[29.1290,  0.3658,  1.6419],
        [29.4643,  0.3534,  1.6044],
        [29.4156,  0.3685,  1.5835]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[7.2746e+00, 8.1581e+00, 1.2320e-01],
         [9.9851e-01, 1.5410e+00, 5.4075e+00],
         [3.3931e+01, 3.3148e-02, 1.6191e+00],
         ...,
         [3.7012e+01, 3.1664e-02, 1.4525e+00],
         [3.1068e+01, 6.8096e-02, 8.4038e-01],
         [3.5583e+01, 1.2200e-01, 8.6032e-01]],

        [[9.7777e+00, 5.1883e+00, 2.0375e+01],
         [7.5120e-02, 6.7377e-01, 3.9575e+00],
         [2.4798e+01, 3.8451e-02, 1.2676e+00],
         ...,
         [3.0429e+01, 7.9647e-02, 9.5185e-01],
         [3.5183e+01, 2.4318e-02, 1.5786e+00],
         [3.1238e+01, 1.1995e-01, 2.5511e+00]],

        [[5.3682e+00, 7.6380e+00, 2.6041e-01],
         [2.3475e+01, 8.0973e-02, 1.4205e+00],
         [2.0011e+01, 5.


Train Diffusion:   5%|▌         | 251/5001 [33:35<12:43:29,  9.64s/it][A
Train Diffusion:   5%|▌         | 252/5001 [33:39<10:24:16,  7.89s/it][A
Train Diffusion:   5%|▌         | 253/5001 [33:46<10:00:24,  7.59s/it][A
Train Diffusion:   5%|▌         | 254/5001 [33:50<8:35:39,  6.52s/it] [A
Train Diffusion:   5%|▌         | 255/5001 [33:58<9:10:13,  6.96s/it][A
Train Diffusion:   5%|▌         | 256/5001 [34:02<7:48:33,  5.92s/it][A
Train Diffusion:   5%|▌         | 257/5001 [34:05<6:53:54,  5.23s/it][A
Train Diffusion:   5%|▌         | 258/5001 [34:10<6:50:18,  5.19s/it][A
Train Diffusion:   5%|▌         | 259/5001 [34:15<6:45:49,  5.13s/it][A
Train Diffusion:   5%|▌         | 260/5001 [34:19<6:14:41,  4.74s/it][A
Train Diffusion:   5%|▌         | 261/5001 [34:22<5:35:10,  4.24s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 75150462.4. Best ELBO loss value is: 63306328.0.

C_PATH mean = tensor([[23.5710,  0.4240,  3.4515],
        [23.4626,  0.4492,  3.3912],
        [23.2929,  0.4436,  3.4137]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[5.7876e+00, 1.1587e+01, 1.1162e+00],
         [1.4152e+01, 3.9003e-01, 2.1938e+00],
         [3.0010e+01, 4.4594e-01, 6.9763e+00],
         ...,
         [1.7029e+01, 1.9919e-01, 1.8743e+00],
         [2.7964e+01, 8.1690e-02, 1.8748e+00],
         [2.0239e+00, 4.3769e+00, 2.6942e+00]],

        [[9.7440e+00, 1.2241e+01, 1.9260e-02],
         [6.5599e+00, 1.8733e-01, 4.2469e+00],
         [1.9588e+01, 6.2522e-02, 2.1262e+00],
         ...,
         [2.6220e+01, 1.2911e-01, 3.5797e+00],
         [1.7862e+01, 2.0448e-01, 1.6495e+00],
         [3.1709e+01, 3.7523e-02, 3.5403e+00]],

        [[1.0716e+01, 8.9807e+00, 3.6639e-01],
         [1.8397e+01, 4.0192e-02, 4.7437e+00],
         [1.7490e+01, 5.6474e-


Train Diffusion:   5%|▌         | 262/5001 [34:25<5:06:22,  3.88s/it][A
Train Diffusion:   5%|▌         | 263/5001 [34:29<4:55:55,  3.75s/it][A
Train Diffusion:   5%|▌         | 264/5001 [34:32<4:50:03,  3.67s/it][A
Train Diffusion:   5%|▌         | 265/5001 [34:36<4:47:48,  3.65s/it][A
Train Diffusion:   5%|▌         | 266/5001 [34:39<4:44:52,  3.61s/it][A
Train Diffusion:   5%|▌         | 267/5001 [34:42<4:31:02,  3.44s/it][A
Train Diffusion:   5%|▌         | 268/5001 [34:45<4:23:15,  3.34s/it][A
Train Diffusion:   5%|▌         | 269/5001 [34:49<4:19:29,  3.29s/it][A
Train Diffusion:   5%|▌         | 270/5001 [34:53<4:51:36,  3.70s/it][A
Train Diffusion:   5%|▌         | 271/5001 [34:57<4:59:01,  3.79s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 92101481.6. Best ELBO loss value is: 63306328.0.

C_PATH mean = tensor([[20.2382,  0.7170,  4.5800],
        [20.3997,  0.7164,  4.5386],
        [20.1640,  0.6995,  4.6034]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[7.7452e+00, 1.2545e+01, 1.2677e+00],
         [9.4308e+00, 6.6313e-01, 2.9220e+00],
         [2.9415e+01, 8.1190e-02, 5.5236e+00],
         ...,
         [2.3472e+01, 5.5106e-02, 5.9171e+00],
         [9.7343e+00, 3.3811e-01, 1.7371e+00],
         [2.9346e+01, 5.4151e-02, 5.4686e+00]],

        [[8.2425e+00, 1.2751e+01, 5.0259e-01],
         [8.5810e+00, 2.9116e+00, 1.0156e-01],
         [1.6905e+01, 1.8461e-01, 6.5799e+00],
         ...,
         [7.9984e+00, 5.0720e-01, 2.2464e+00],
         [2.5498e+01, 6.3310e-01, 2.7905e+00],
         [1.2785e+01, 6.4328e-01, 2.5874e+00]],

        [[1.2386e+01, 7.1738e+00, 1.4899e-01],
         [2.9348e+01, 6.3725e-02, 8.2092e+00],
         [1.0335e+01, 3.2180e-


Train Diffusion:   5%|▌         | 272/5001 [35:01<4:51:34,  3.70s/it][A
Train Diffusion:   5%|▌         | 273/5001 [35:04<4:51:00,  3.69s/it][A
Train Diffusion:   5%|▌         | 274/5001 [35:08<4:49:31,  3.68s/it][A
Train Diffusion:   5%|▌         | 275/5001 [35:12<4:44:25,  3.61s/it][A
Train Diffusion:   6%|▌         | 276/5001 [35:15<4:38:05,  3.53s/it][A
Train Diffusion:   6%|▌         | 277/5001 [35:18<4:30:26,  3.43s/it][A
Train Diffusion:   6%|▌         | 278/5001 [35:21<4:15:27,  3.25s/it][A
Train Diffusion:   6%|▌         | 279/5001 [35:24<4:22:24,  3.33s/it][A
Train Diffusion:   6%|▌         | 280/5001 [35:28<4:18:13,  3.28s/it][A
Train Diffusion:   6%|▌         | 281/5001 [35:31<4:16:07,  3.26s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 106153136.8. Best ELBO loss value is: 63306328.0.

C_PATH mean = tensor([[19.4737,  0.8427,  4.9270],
        [19.4536,  0.8601,  4.9579],
        [19.3914,  0.8499,  4.9812]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[9.4960e+00, 1.3645e+01, 2.2592e-01],
         [2.8386e+00, 2.2999e+00, 2.1378e+00],
         [1.4029e+00, 4.7684e-01, 7.3767e-01],
         ...,
         [2.3608e+01, 2.8804e-01, 4.9983e+00],
         [6.2242e+00, 9.5530e-01, 2.1182e+00],
         [1.6770e+01, 3.6224e+00, 5.3746e+00]],

        [[7.2979e+00, 1.2638e+01, 1.5508e+00],
         [8.2886e+00, 1.0096e+00, 3.0914e+00],
         [2.9967e+01, 1.1448e+00, 9.5974e+00],
         ...,
         [1.6125e+01, 3.0463e-01, 1.4867e+00],
         [2.3399e+01, 4.6543e-01, 4.5322e+00],
         [8.5589e+00, 1.5068e+00, 2.4411e+00]],

        [[1.2135e+01, 7.7616e+00, 1.2592e-01],
         [2.7585e+01, 9.8141e-02, 8.0413e+00],
         [8.1506e+00, 4.4543e


Train Diffusion:   6%|▌         | 282/5001 [35:34<4:13:52,  3.23s/it][A
Train Diffusion:   6%|▌         | 283/5001 [35:37<4:13:57,  3.23s/it][A
Train Diffusion:   6%|▌         | 284/5001 [35:40<4:03:18,  3.09s/it][A
Train Diffusion:   6%|▌         | 285/5001 [35:43<4:00:55,  3.07s/it][A
Train Diffusion:   6%|▌         | 286/5001 [35:46<4:06:04,  3.13s/it][A
Train Diffusion:   6%|▌         | 287/5001 [35:50<4:15:07,  3.25s/it][A
Train Diffusion:   6%|▌         | 288/5001 [35:53<4:05:28,  3.13s/it][A
Train Diffusion:   6%|▌         | 289/5001 [35:55<3:57:59,  3.03s/it][A
Train Diffusion:   6%|▌         | 290/5001 [35:58<3:54:18,  2.98s/it][A
Train Diffusion:   6%|▌         | 291/5001 [36:01<3:49:14,  2.92s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 115801721.6. Best ELBO loss value is: 63306328.0.

C_PATH mean = tensor([[19.0424,  0.9252,  5.1132],
        [18.9321,  0.9535,  5.1071],
        [19.0168,  0.9062,  5.1371]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[1.0148e+01, 1.4128e+01, 5.3905e-02],
         [1.1562e+00, 2.2571e+00, 2.8182e+00],
         [9.9935e+00, 3.1926e-01, 1.8370e+00],
         ...,
         [5.6496e+00, 1.1265e+00, 3.6036e+00],
         [1.4179e+01, 4.2850e-04, 1.4024e+00],
         [2.5537e+01, 1.6069e-02, 9.3994e+00]],

        [[7.1785e+00, 1.2666e+01, 2.0131e+00],
         [7.3832e+00, 1.3115e+00, 3.3610e+00],
         [2.9003e+01, 1.8207e+00, 9.8832e+00],
         ...,
         [2.3236e+01, 3.1726e-02, 5.7555e+00],
         [5.9772e+00, 7.8577e-01, 2.3061e+00],
         [2.4268e+01, 4.0630e-02, 1.8447e+00]],

        [[1.1910e+01, 8.6315e+00, 1.9444e-01],
         [2.4384e+01, 8.2872e-02, 7.4825e+00],
         [6.6098e+00, 6.9008e


Train Diffusion:   6%|▌         | 292/5001 [36:04<3:59:01,  3.05s/it][A
Train Diffusion:   6%|▌         | 293/5001 [36:08<4:00:56,  3.07s/it][A
Train Diffusion:   6%|▌         | 294/5001 [36:11<3:58:04,  3.03s/it][A
Train Diffusion:   6%|▌         | 295/5001 [36:13<3:55:35,  3.00s/it][A
Train Diffusion:   6%|▌         | 296/5001 [36:16<3:54:09,  2.99s/it][A
Train Diffusion:   6%|▌         | 297/5001 [36:19<3:51:57,  2.96s/it][A
Train Diffusion:   6%|▌         | 298/5001 [36:22<3:50:19,  2.94s/it][A
Train Diffusion:   6%|▌         | 299/5001 [36:25<3:55:27,  3.00s/it][A
Train Diffusion:   6%|▌         | 300/5001 [36:28<3:56:17,  3.02s/it][A
Train Diffusion:   6%|▌         | 301/5001 [36:31<3:54:44,  3.00s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 116537073.6. Best ELBO loss value is: 63306328.0.

C_PATH mean = tensor([[18.9840,  0.9058,  5.1434],
        [18.8174,  0.9826,  5.1329],
        [18.9860,  0.9473,  5.1385]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[ 7.7829, 12.7937,  1.1590],
         [ 8.1005,  0.6705,  3.4888],
         [30.2208,  0.9365,  8.3738],
         ...,
         [ 7.9470,  0.1963,  6.2992],
         [10.9562,  0.4556,  0.8719],
         [15.7282,  2.1421,  7.2462]],

        [[ 9.0329, 13.4010,  0.6163],
         [ 8.7897,  0.9871,  1.2650],
         [ 8.5882,  0.1210,  5.1321],
         ...,
         [ 5.9520,  0.3728,  2.0699],
         [22.8261,  0.8695,  3.8424],
         [ 7.1768,  1.3411,  2.8852]],

        [[12.4035,  7.4203,  0.1260],
         [28.7906,  0.1178,  8.7194],
         [ 7.9396,  0.4704,  5.6070],
         ...,
         [25.5294,  0.4931,  3.8338],
         [ 8.2300,  0.4894,  2.3584],
         [29.4447,  0.2436,


Train Diffusion:   6%|▌         | 302/5001 [36:35<4:00:27,  3.07s/it][A
Train Diffusion:   6%|▌         | 303/5001 [36:38<4:06:16,  3.15s/it][A
Train Diffusion:   6%|▌         | 304/5001 [36:41<4:08:03,  3.17s/it][A
Train Diffusion:   6%|▌         | 305/5001 [36:44<4:01:57,  3.09s/it][A
Train Diffusion:   6%|▌         | 306/5001 [36:47<3:59:29,  3.06s/it][A
Train Diffusion:   6%|▌         | 307/5001 [36:50<3:59:18,  3.06s/it][A
Train Diffusion:   6%|▌         | 308/5001 [36:53<3:55:02,  3.01s/it][A
Train Diffusion:   6%|▌         | 309/5001 [36:56<3:52:23,  2.97s/it][A
Train Diffusion:   6%|▌         | 310/5001 [36:59<3:57:39,  3.04s/it][A
Train Diffusion:   6%|▌         | 311/5001 [37:02<3:54:58,  3.01s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 120557006.4. Best ELBO loss value is: 63306328.0.

C_PATH mean = tensor([[18.7936,  0.9819,  5.1542],
        [18.7359,  0.9460,  5.1916],
        [18.9625,  0.9578,  5.1665]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[8.0124e+00, 1.2884e+01, 3.1176e-01],
         [6.3595e+00, 2.6022e+00, 1.0657e+00],
         [3.9470e+01, 1.1032e-03, 2.1633e+00],
         ...,
         [6.9539e+00, 3.7213e-01, 2.2249e+00],
         [2.2996e+01, 6.6196e-01, 4.1795e+00],
         [7.3590e+00, 1.7491e+00, 2.7588e+00]],

        [[1.2461e+01, 7.8244e+00, 1.4155e-01],
         [2.8733e+01, 8.9610e-02, 7.0100e+00],
         [3.9697e+00, 1.0864e+00, 2.4718e+01],
         ...,
         [2.8286e+00, 3.6551e+00, 6.3459e+00],
         [4.2538e+00, 2.8032e-03, 1.8747e+00],
         [2.5961e+01, 1.6482e-02, 7.7256e+00]],

        [[8.7907e+00, 1.3261e+01, 2.3117e+00],
         [1.8928e-01, 1.4160e+00, 1.9823e+01],
         [2.3439e-05, 4.0457e


Train Diffusion:   6%|▌         | 312/5001 [37:05<4:02:40,  3.11s/it][A
Train Diffusion:   6%|▋         | 313/5001 [37:09<4:12:22,  3.23s/it][A
Train Diffusion:   6%|▋         | 314/5001 [37:12<4:09:33,  3.19s/it][A
Train Diffusion:   6%|▋         | 315/5001 [37:15<4:07:56,  3.17s/it][A
Train Diffusion:   6%|▋         | 316/5001 [37:18<4:02:20,  3.10s/it][A
Train Diffusion:   6%|▋         | 317/5001 [37:21<3:59:34,  3.07s/it][A
Train Diffusion:   6%|▋         | 318/5001 [37:24<3:59:58,  3.07s/it][A
Train Diffusion:   6%|▋         | 319/5001 [37:28<4:10:48,  3.21s/it][A
Train Diffusion:   6%|▋         | 320/5001 [37:31<4:06:55,  3.17s/it][A
Train Diffusion:   6%|▋         | 321/5001 [37:34<4:02:09,  3.10s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 120236775.2. Best ELBO loss value is: 63306328.0.

C_PATH mean = tensor([[18.9646,  0.9422,  5.1559],
        [18.6686,  0.9453,  5.2258],
        [18.9917,  0.9827,  5.0819]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[1.2513e+01, 7.5812e+00, 1.6801e-01],
         [2.9039e+01, 9.0910e-02, 9.2667e+00],
         [7.8021e+00, 4.7379e-01, 5.6499e+00],
         ...,
         [5.3264e+00, 9.5820e-01, 2.7499e+00],
         [2.0419e+01, 1.5232e-04, 1.3814e+00],
         [2.7041e+01, 1.2475e-02, 7.2450e+00]],

        [[8.3881e+00, 1.2864e+01, 9.7640e-01],
         [3.3984e+00, 3.1315e+00, 1.4995e-01],
         [1.8170e+01, 4.7911e-01, 7.7637e+00],
         ...,
         [1.0592e+01, 1.2381e-02, 1.6175e+00],
         [2.2828e+01, 8.4067e-02, 5.1327e+00],
         [7.5213e+00, 1.5723e+00, 2.6460e+00]],

        [[8.3547e+00, 1.2855e+01, 1.1530e+00],
         [6.1839e+00, 1.4494e+00, 3.7286e+00],
         [2.6878e+01, 3.3359e


Train Diffusion:   6%|▋         | 322/5001 [37:37<4:05:43,  3.15s/it][A
Train Diffusion:   6%|▋         | 323/5001 [37:40<4:07:46,  3.18s/it][A
Train Diffusion:   6%|▋         | 324/5001 [37:43<4:04:38,  3.14s/it][A
Train Diffusion:   6%|▋         | 325/5001 [37:46<3:59:44,  3.08s/it][A
Train Diffusion:   7%|▋         | 326/5001 [37:49<3:58:46,  3.06s/it][A
Train Diffusion:   7%|▋         | 327/5001 [37:52<3:54:29,  3.01s/it][A
Train Diffusion:   7%|▋         | 328/5001 [37:55<3:49:55,  2.95s/it][A
Train Diffusion:   7%|▋         | 329/5001 [37:58<3:50:23,  2.96s/it][A
Train Diffusion:   7%|▋         | 330/5001 [38:01<3:48:48,  2.94s/it][A
Train Diffusion:   7%|▋         | 331/5001 [38:04<3:48:37,  2.94s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 117819204.8. Best ELBO loss value is: 63306328.0.

C_PATH mean = tensor([[18.7186,  0.9859,  5.1864],
        [18.7676,  0.9888,  5.2154],
        [18.7090,  1.0293,  5.2474]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[ 8.8109, 13.1908,  1.0030],
         [ 9.2834,  0.8968,  2.4101],
         [21.1277,  0.5643,  7.1531],
         ...,
         [ 2.5429,  2.2870,  7.8024],
         [ 6.7711,  1.5420,  0.5903],
         [13.4309,  3.0561,  1.2018]],

        [[12.5363,  7.2637,  0.1307],
         [29.0671,  0.1089,  8.9809],
         [ 7.8943,  0.4575,  5.6078],
         ...,
         [25.8181,  0.7423,  3.7772],
         [ 8.0846,  0.4393,  2.0176],
         [29.3688,  0.0838,  5.6028]],

        [[ 8.0973, 12.8540,  0.8680],
         [ 7.7932,  1.2206,  1.8634],
         [25.5016,  0.1371,  5.8609],
         ...,
         [ 6.4399,  0.3690,  2.1251],
         [23.6824,  0.2974,  3.9642],
         [ 7.0276,  3.0174,


Train Diffusion:   7%|▋         | 332/5001 [38:07<3:57:58,  3.06s/it][A
Train Diffusion:   7%|▋         | 333/5001 [38:10<3:53:59,  3.01s/it][A
Train Diffusion:   7%|▋         | 334/5001 [38:13<4:02:05,  3.11s/it][A
Train Diffusion:   7%|▋         | 335/5001 [38:16<3:58:38,  3.07s/it][A
Train Diffusion:   7%|▋         | 336/5001 [38:19<3:57:52,  3.06s/it][A
Train Diffusion:   7%|▋         | 337/5001 [38:23<4:08:44,  3.20s/it][A
Train Diffusion:   7%|▋         | 338/5001 [38:27<4:29:37,  3.47s/it][A
Train Diffusion:   7%|▋         | 339/5001 [38:31<4:36:14,  3.56s/it][A
Train Diffusion:   7%|▋         | 340/5001 [38:36<5:08:37,  3.97s/it][A
Train Diffusion:   7%|▋         | 341/5001 [38:41<5:38:03,  4.35s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 116779711.2. Best ELBO loss value is: 63306328.0.

C_PATH mean = tensor([[18.8334,  0.9499,  5.1671],
        [18.8099,  0.9854,  5.1953],
        [18.9424,  0.9197,  5.1600]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[1.1881e+01, 8.8498e+00, 1.5212e-01],
         [2.4198e+01, 1.2711e-01, 7.3266e+00],
         [8.1128e+00, 4.0700e-01, 4.7030e+00],
         ...,
         [6.9716e+00, 3.4104e-01, 2.2042e+00],
         [2.1872e+01, 9.0427e-01, 4.3618e+00],
         [6.9857e+00, 1.4514e+00, 2.8288e+00]],

        [[7.1884e+00, 1.2790e+01, 1.8752e+00],
         [7.3500e+00, 1.3821e+00, 3.2506e+00],
         [2.8854e+01, 1.8048e+00, 1.0171e+01],
         ...,
         [2.4964e+01, 5.6340e-01, 4.2933e+00],
         [5.9452e+00, 9.3206e-01, 2.4342e+00],
         [2.2984e+01, 1.1667e+00, 4.8029e+00]],

        [[1.0172e+01, 1.4233e+01, 5.5435e-02],
         [1.7958e+00, 7.7933e-01, 3.8311e+00],
         [8.4779e+00, 3.7322e


Train Diffusion:   7%|▋         | 342/5001 [38:46<6:04:46,  4.70s/it][A
Train Diffusion:   7%|▋         | 343/5001 [38:53<6:52:34,  5.31s/it][A
Train Diffusion:   7%|▋         | 344/5001 [38:58<6:44:31,  5.21s/it][A
Train Diffusion:   7%|▋         | 345/5001 [39:02<6:13:08,  4.81s/it][A
Train Diffusion:   7%|▋         | 346/5001 [39:06<6:02:20,  4.67s/it][A
Train Diffusion:   7%|▋         | 347/5001 [39:11<6:00:40,  4.65s/it][A
Train Diffusion:   7%|▋         | 348/5001 [39:16<6:15:18,  4.84s/it][A
Train Diffusion:   7%|▋         | 349/5001 [39:19<5:37:09,  4.35s/it][A
Train Diffusion:   7%|▋         | 350/5001 [39:23<5:20:59,  4.14s/it][A
Train Diffusion:   7%|▋         | 351/5001 [39:26<4:54:04,  3.79s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 120196952.0. Best ELBO loss value is: 63306328.0.

C_PATH mean = tensor([[18.7671,  0.9735,  5.1906],
        [18.7424,  0.9811,  5.2408],
        [18.8078,  0.9427,  5.2059]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[ 7.2006, 12.8020,  2.1764],
         [ 7.1401,  1.3424,  3.4379],
         [29.4382,  1.5030,  9.7835],
         ...,
         [18.7903,  0.2102,  5.0300],
         [ 9.4640,  1.1060,  1.0838],
         [ 9.3549,  2.2104,  1.2765]],

        [[11.9528,  8.5528,  0.1910],
         [25.0551,  0.0769,  7.7617],
         [ 4.9112,  1.7079, 15.4336],
         ...,
         [ 5.7556,  0.3973,  2.0478],
         [23.0892,  0.3499,  4.0036],
         [ 6.7480,  3.3824,  6.8847]],

        [[10.0673, 14.2376,  0.0841],
         [ 0.7590,  2.9019,  2.6060],
         [11.4524,  0.1517,  1.8298],
         ...,
         [22.4134,  0.0521,  5.2346],
         [ 9.2638,  0.5699,  1.9576],
         [28.4962,  0.0973,


Train Diffusion:   7%|▋         | 352/5001 [39:29<4:46:05,  3.69s/it][A
Train Diffusion:   7%|▋         | 353/5001 [39:32<4:27:40,  3.46s/it][A
Train Diffusion:   7%|▋         | 354/5001 [39:35<4:15:54,  3.30s/it][A
Train Diffusion:   7%|▋         | 355/5001 [39:38<4:12:18,  3.26s/it][A
Train Diffusion:   7%|▋         | 356/5001 [39:42<4:07:35,  3.20s/it][A
Train Diffusion:   7%|▋         | 357/5001 [39:44<3:58:32,  3.08s/it][A
Train Diffusion:   7%|▋         | 358/5001 [39:47<3:52:09,  3.00s/it][A
Train Diffusion:   7%|▋         | 359/5001 [39:51<4:05:52,  3.18s/it][A
Train Diffusion:   7%|▋         | 360/5001 [39:54<4:01:29,  3.12s/it][A
Train Diffusion:   7%|▋         | 361/5001 [39:58<4:20:24,  3.37s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 122865349.6. Best ELBO loss value is: 63306328.0.

C_PATH mean = tensor([[18.7963,  0.9714,  5.2274],
        [18.6819,  1.0044,  5.1439],
        [18.6731,  1.0160,  5.2781]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[10.0263, 14.2590,  0.2067],
         [ 1.6134,  0.9550,  4.7088],
         [ 0.6922,  0.1764,  7.7755],
         ...,
         [22.8234,  0.5457,  5.6374],
         [ 5.9009,  0.7744,  2.1164],
         [23.8114,  1.5690,  6.1372]],

        [[12.0073,  8.1956,  0.1137],
         [25.8520,  0.2150,  8.2939],
         [ 8.1631,  0.4406,  5.3756],
         ...,
         [11.0775,  0.3485,  0.9684],
         [21.4761,  0.6946,  4.9987],
         [ 6.8680,  1.4702,  2.8254]],

        [[ 7.2512, 12.8035,  1.5113],
         [ 7.2535,  1.0391,  3.3104],
         [31.3729,  0.8605,  7.7516],
         ...,
         [ 5.6041,  1.4168,  2.7418],
         [12.3312,  0.0426,  1.3276],
         [27.1298,  0.0705,


Train Diffusion:   7%|▋         | 362/5001 [40:02<4:53:43,  3.80s/it][A
Train Diffusion:   7%|▋         | 363/5001 [40:06<4:42:47,  3.66s/it][A
Train Diffusion:   7%|▋         | 364/5001 [40:13<5:53:22,  4.57s/it][A
Train Diffusion:   7%|▋         | 365/5001 [40:17<5:42:13,  4.43s/it][A
Train Diffusion:   7%|▋         | 366/5001 [40:20<5:22:38,  4.18s/it][A
Train Diffusion:   7%|▋         | 367/5001 [40:23<4:59:23,  3.88s/it][A
Train Diffusion:   7%|▋         | 368/5001 [40:26<4:41:44,  3.65s/it][A
Train Diffusion:   7%|▋         | 369/5001 [40:29<4:24:48,  3.43s/it][A
Train Diffusion:   7%|▋         | 370/5001 [40:33<4:17:55,  3.34s/it][A
Train Diffusion:   7%|▋         | 371/5001 [40:35<4:07:38,  3.21s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 120680581.6. Best ELBO loss value is: 63306328.0.

C_PATH mean = tensor([[18.6915,  0.9550,  5.2314],
        [18.9049,  0.9427,  5.2017],
        [18.8336,  0.9612,  5.1381]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[8.9374e+00, 1.3427e+01, 8.7439e-01],
         [1.0011e+01, 5.8453e-01, 1.7785e+00],
         [1.7371e+01, 4.9217e-01, 7.2959e+00],
         ...,
         [7.1895e+00, 3.2827e-01, 2.1624e+00],
         [2.3254e+01, 4.2509e-01, 3.9349e+00],
         [7.2448e+00, 1.6891e+00, 2.6937e+00]],

        [[1.2396e+01, 7.4272e+00, 1.2713e-01],
         [2.8958e+01, 1.2085e-01, 9.0150e+00],
         [7.8997e+00, 4.7065e-01, 5.5871e+00],
         ...,
         [2.5913e+01, 3.8279e-01, 4.0180e+00],
         [5.7932e+00, 1.3885e+00, 3.1121e+00],
         [1.2886e+01, 1.1578e+00, 4.9272e+00]],

        [[7.8483e+00, 1.2897e+01, 9.2267e-01],
         [7.5244e+00, 9.8118e-01, 2.6792e+00],
         [2.8074e+01, 1.4529e


Train Diffusion:   7%|▋         | 372/5001 [40:40<4:33:19,  3.54s/it][A
Train Diffusion:   7%|▋         | 373/5001 [40:43<4:25:41,  3.44s/it][A
Train Diffusion:   7%|▋         | 374/5001 [40:47<4:34:07,  3.55s/it][A
Train Diffusion:   7%|▋         | 375/5001 [40:50<4:37:37,  3.60s/it][A
Train Diffusion:   8%|▊         | 376/5001 [40:54<4:24:22,  3.43s/it][A
Train Diffusion:   8%|▊         | 377/5001 [40:59<5:16:28,  4.11s/it][A
Train Diffusion:   8%|▊         | 378/5001 [41:03<5:01:36,  3.91s/it][A
Train Diffusion:   8%|▊         | 379/5001 [41:06<4:45:25,  3.71s/it][A
Train Diffusion:   8%|▊         | 380/5001 [41:09<4:38:05,  3.61s/it][A
Train Diffusion:   8%|▊         | 381/5001 [41:13<4:42:58,  3.68s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 116990499.2. Best ELBO loss value is: 63306328.0.

C_PATH mean = tensor([[18.7886,  0.9283,  5.1959],
        [18.8313,  0.9696,  5.1919],
        [18.7869,  0.9869,  5.2573]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[1.0318e+01, 1.4189e+01, 3.5446e-02],
         [2.8162e+00, 6.9823e-01, 3.6181e+00],
         [1.4171e-01, 1.2993e+00, 1.1725e+01],
         ...,
         [8.9427e+00, 7.4586e-01, 2.2170e+00],
         [2.8903e+01, 2.3962e-02, 7.9666e-01],
         [6.8509e+00, 3.1843e+00, 6.4963e+00]],

        [[1.1798e+01, 9.1014e+00, 1.6665e-01],
         [2.2998e+01, 2.0265e-01, 7.6615e+00],
         [9.2404e+00, 3.2198e-01, 4.8742e+00],
         ...,
         [3.7570e-01, 4.7152e+00, 7.1753e+00],
         [2.0350e-01, 1.7580e+00, 5.2725e+00],
         [7.5866e+00, 1.8101e+00, 2.9483e+00]],

        [[7.1506e+00, 1.2782e+01, 1.9279e+00],
         [7.1972e+00, 1.4277e+00, 3.3409e+00],
         [3.1523e+01, 1.1283e


Train Diffusion:   8%|▊         | 382/5001 [41:16<4:35:09,  3.57s/it][A
Train Diffusion:   8%|▊         | 383/5001 [41:19<4:22:35,  3.41s/it][A
Train Diffusion:   8%|▊         | 384/5001 [41:22<4:09:38,  3.24s/it][A
Train Diffusion:   8%|▊         | 385/5001 [41:26<4:12:37,  3.28s/it][A
Train Diffusion:   8%|▊         | 386/5001 [41:29<4:01:38,  3.14s/it][A
Train Diffusion:   8%|▊         | 387/5001 [41:31<3:53:04,  3.03s/it][A
Train Diffusion:   8%|▊         | 388/5001 [41:34<3:51:19,  3.01s/it][A
Train Diffusion:   8%|▊         | 389/5001 [41:37<3:51:24,  3.01s/it][A
Train Diffusion:   8%|▊         | 390/5001 [41:40<3:50:49,  3.00s/it][A
Train Diffusion:   8%|▊         | 391/5001 [41:43<3:48:46,  2.98s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 118649083.2. Best ELBO loss value is: 63306328.0.

C_PATH mean = tensor([[18.6850,  0.9904,  5.1880],
        [18.7524,  0.9545,  5.2047],
        [18.9773,  0.9408,  5.1393]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[7.1386e+00, 1.2796e+01, 2.0934e+00],
         [7.1795e+00, 1.4054e+00, 3.3595e+00],
         [2.9749e+01, 1.5674e+00, 8.5642e+00],
         ...,
         [5.4393e+00, 1.1012e+00, 3.7206e+00],
         [1.6066e+01, 9.9234e-03, 3.7936e+00],
         [1.3125e+01, 7.3541e-01, 7.2379e+00]],

        [[1.0291e+01, 1.4273e+01, 3.7597e-02],
         [2.0608e+00, 1.4125e+00, 3.3040e+00],
         [3.1652e+00, 1.1845e+00, 1.2366e+01],
         ...,
         [1.8507e+01, 4.2909e-02, 1.9070e+00],
         [2.3747e+01, 3.7279e-01, 4.5306e+00],
         [7.0674e+00, 1.4714e+00, 2.8418e+00]],

        [[1.1850e+01, 8.9803e+00, 1.8845e-01],
         [2.3440e+01, 1.2386e-01, 7.5569e+00],
         [9.7602e+00, 3.0669e


Train Diffusion:   8%|▊         | 392/5001 [41:46<3:55:35,  3.07s/it][A
Train Diffusion:   8%|▊         | 393/5001 [41:50<3:57:04,  3.09s/it][A
Train Diffusion:   8%|▊         | 394/5001 [41:53<3:56:46,  3.08s/it][A
Train Diffusion:   8%|▊         | 395/5001 [41:56<4:00:25,  3.13s/it][A
Train Diffusion:   8%|▊         | 396/5001 [41:59<4:04:33,  3.19s/it][A
Train Diffusion:   8%|▊         | 397/5001 [42:02<4:00:14,  3.13s/it][A
Train Diffusion:   8%|▊         | 398/5001 [42:06<4:11:07,  3.27s/it][A
Train Diffusion:   8%|▊         | 399/5001 [42:09<3:59:20,  3.12s/it][A
Train Diffusion:   8%|▊         | 400/5001 [42:12<3:57:24,  3.10s/it][A
Train Diffusion:   8%|▊         | 401/5001 [42:15<3:55:01,  3.07s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 120447439.2. Best ELBO loss value is: 63306328.0.

C_PATH mean = tensor([[18.8535,  0.9553,  5.2001],
        [18.6670,  0.9987,  5.1891],
        [18.8570,  0.9690,  5.1843]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[ 7.2775, 12.7444,  1.5404],
         [ 7.3069,  1.0791,  3.3011],
         [30.5273,  1.4036,  9.3058],
         ...,
         [ 0.3519,  0.9876, 10.0598],
         [ 0.2412,  2.6315,  1.8534],
         [ 7.5432,  0.0348,  2.8267]],

        [[ 9.9620, 14.1111,  0.2298],
         [ 1.2254,  0.7992,  4.6851],
         [ 0.7367,  0.4075,  5.0684],
         ...,
         [ 6.9912,  0.3717,  2.1439],
         [23.1552,  0.6093,  4.1282],
         [ 6.4077,  3.6139,  3.8688]],

        [[12.0233,  8.2085,  0.1158],
         [25.9562,  0.1898,  8.0234],
         [ 7.9578,  0.4682,  5.3340],
         ...,
         [24.5647,  0.0356,  4.5201],
         [ 7.1899,  0.4549,  2.0125],
         [26.8670,  0.1157,


Train Diffusion:   8%|▊         | 402/5001 [42:18<3:55:16,  3.07s/it][A
Train Diffusion:   8%|▊         | 403/5001 [42:21<3:53:23,  3.05s/it][A
Train Diffusion:   8%|▊         | 404/5001 [42:24<4:02:04,  3.16s/it][A
Train Diffusion:   8%|▊         | 405/5001 [42:28<4:17:09,  3.36s/it][A
Train Diffusion:   8%|▊         | 406/5001 [42:31<4:14:38,  3.33s/it][A
Train Diffusion:   8%|▊         | 407/5001 [42:34<4:03:46,  3.18s/it][A
Train Diffusion:   8%|▊         | 408/5001 [42:37<3:53:13,  3.05s/it][A
Train Diffusion:   8%|▊         | 409/5001 [42:40<3:57:19,  3.10s/it][A
Train Diffusion:   8%|▊         | 410/5001 [42:43<3:53:00,  3.05s/it][A
Train Diffusion:   8%|▊         | 411/5001 [42:48<4:43:27,  3.71s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 122267208.0. Best ELBO loss value is: 63306328.0.

C_PATH mean = tensor([[18.9353,  0.9809,  5.1137],
        [18.8026,  0.9429,  5.1934],
        [18.8239,  0.9221,  5.2398]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[11.4387, 10.1626,  0.1144],
         [17.5306,  0.2499,  6.8802],
         [10.8919,  0.2160,  4.1444],
         ...,
         [10.2352,  0.4065,  3.5982],
         [ 7.7225,  0.8703,  4.6442],
         [ 8.0643,  0.6823,  2.9313]],

        [[ 7.0119, 12.7059,  2.0754],
         [ 7.3019,  1.4594,  3.2464],
         [29.5536,  1.6251,  9.3866],
         ...,
         [ 8.1452,  0.5062,  2.0660],
         [25.1938,  0.8202,  2.6463],
         [ 7.2282,  1.4970,  3.1457]],

        [[10.7889, 12.9245,  0.0383],
         [ 8.0180,  0.1475,  4.8742],
         [ 6.8474,  0.7710, 12.7607],
         ...,
         [23.1986,  0.5277,  5.1924],
         [ 6.3470,  0.4392,  1.9122],
         [26.4115,  0.1076,


Train Diffusion:   8%|▊         | 412/5001 [42:54<5:39:00,  4.43s/it][A
Train Diffusion:   8%|▊         | 413/5001 [42:59<5:45:08,  4.51s/it][A
Train Diffusion:   8%|▊         | 414/5001 [43:03<5:37:45,  4.42s/it][A
Train Diffusion:   8%|▊         | 415/5001 [43:08<5:40:45,  4.46s/it][A
Train Diffusion:   8%|▊         | 416/5001 [43:12<5:43:35,  4.50s/it][A
Train Diffusion:   8%|▊         | 417/5001 [43:16<5:19:19,  4.18s/it][A
Train Diffusion:   8%|▊         | 418/5001 [43:20<5:13:50,  4.11s/it][A
Train Diffusion:   8%|▊         | 419/5001 [43:23<4:48:04,  3.77s/it][A
Train Diffusion:   8%|▊         | 420/5001 [43:26<4:31:54,  3.56s/it][A
Train Diffusion:   8%|▊         | 421/5001 [43:29<4:19:34,  3.40s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 117826094.4. Best ELBO loss value is: 63306328.0.

C_PATH mean = tensor([[18.7214,  1.0432,  5.0997],
        [18.9240,  0.9402,  5.2199],
        [18.6616,  1.0037,  5.2249]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[8.2996e+00, 1.2869e+01, 9.5902e-01],
         [8.0139e+00, 1.4906e+00, 1.1976e+00],
         [2.1163e+01, 1.6774e-01, 6.1566e+00],
         ...,
         [5.1749e+00, 2.1089e-02, 3.2523e+00],
         [1.7413e+01, 4.3219e-01, 5.2408e+00],
         [6.9223e+00, 1.7188e+00, 2.8564e+00]],

        [[1.2519e+01, 7.2089e+00, 1.3291e-01],
         [2.9152e+01, 1.0512e-01, 9.0479e+00],
         [7.9081e+00, 4.6170e-01, 5.5933e+00],
         ...,
         [5.8272e+00, 5.4186e-01, 2.3220e+00],
         [2.7889e+01, 7.0985e-03, 6.7211e-01],
         [8.6143e+00, 7.2172e-01, 6.8873e+00]],

        [[8.4748e+00, 1.2946e+01, 1.1926e+00],
         [8.7564e+00, 6.7699e-01, 3.1828e+00],
         [2.5460e+01, 6.2622e


Train Diffusion:   8%|▊         | 422/5001 [43:32<4:17:32,  3.37s/it][A
Train Diffusion:   8%|▊         | 423/5001 [43:35<4:14:39,  3.34s/it][A
Train Diffusion:   8%|▊         | 424/5001 [43:39<4:13:33,  3.32s/it][A
Train Diffusion:   8%|▊         | 425/5001 [43:42<4:19:50,  3.41s/it][A
Train Diffusion:   9%|▊         | 426/5001 [43:46<4:17:25,  3.38s/it][A
Train Diffusion:   9%|▊         | 427/5001 [43:48<4:06:38,  3.24s/it][A
Train Diffusion:   9%|▊         | 428/5001 [43:51<4:01:08,  3.16s/it][A
Train Diffusion:   9%|▊         | 429/5001 [43:54<3:56:05,  3.10s/it][A
Train Diffusion:   9%|▊         | 430/5001 [43:57<3:51:55,  3.04s/it][A
Train Diffusion:   9%|▊         | 431/5001 [44:00<3:46:39,  2.98s/it][A


Moving average ELBO loss at <built-in function iter> iterations is: 120851823.2. Best ELBO loss value is: 63306328.0.

C_PATH mean = tensor([[18.6977,  0.9524,  5.2423],
        [18.9532,  0.9766,  5.1096],
        [18.7906,  0.9731,  5.1854]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[9.9499e+00, 1.4120e+01, 2.0588e-01],
         [9.9313e-01, 6.6986e-01, 4.0395e+00],
         [3.0247e-01, 2.2313e-01, 9.5278e+00],
         ...,
         [5.2349e+00, 1.4974e+00, 7.4679e+00],
         [7.3985e+00, 2.7631e-02, 3.0673e+00],
         [2.1896e+01, 1.7627e-01, 2.6549e+00]],

        [[1.2007e+01, 8.3966e+00, 1.2059e-01],
         [2.5725e+01, 1.9757e-01, 7.9675e+00],
         [8.4863e+00, 4.0101e-01, 5.2081e+00],
         ...,
         [9.5199e+00, 1.4258e-01, 8.4622e-01],
         [2.1406e+01, 1.2077e+00, 4.5406e+00],
         [6.8395e+00, 1.5780e+00, 2.6938e+00]],

        [[7.2396e+00, 1.2775e+01, 1.5820e+00],
         [7.2588e+00, 1.1682e+00, 3.2844e+00],
         [3.1719e+01, 7.2278e


Train Diffusion:   9%|▊         | 432/5001 [44:03<3:49:03,  3.01s/it][A
Train Diffusion:   9%|▊         | 433/5001 [44:07<4:00:08,  3.15s/it][A
Train Diffusion:   9%|▊         | 434/5001 [44:11<4:24:33,  3.48s/it][A
Train Diffusion:   9%|▊         | 435/5001 [44:16<4:52:37,  3.85s/it][A
Train Diffusion:   9%|▊         | 436/5001 [44:19<4:42:54,  3.72s/it][A
Train Diffusion:   9%|▊         | 437/5001 [44:23<4:49:57,  3.81s/it][A