In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm
import math

#Torch-related imports
import torch
import torch.distributions as D
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Function

#Model-specific imports
from SBM_SDE import *
from obs_and_flow import *
from training import calc_log_lik

In [2]:
torch.manual_seed(0)
np.random.seed(0)

In [3]:
temp_ref = 283
temp_rise = 5 #High estimate of 5 celsius temperature rise by 2100. 

#System parameters from deterministic CON model
u_M = 0.002
a_SD = 0.33
a_DS = 0.33
a_M = 0.33
a_MSC = 0.5
k_S_ref = 0.000025
k_D_ref = 0.005
k_M_ref = 0.0002
Ea_S = 75
Ea_D = 50
Ea_M = 50

#SCON diffusion matrix parameters
c_SOC = 0.5
c_DOC = 0.001
c_MBC = 0.01
s_SOC = 0.001
s_DOC = 0.001
s_MBC = 0.001

SCON_C_params_dict = {'u_M': u_M, 'a_SD': a_SD, 'a_DS': a_DS, 'a_M': a_M, 'a_MSC': a_MSC, 'k_S_ref': k_S_ref, 'k_D_ref': k_D_ref, 'k_M_ref': k_M_ref, 'Ea_S': Ea_S, 'Ea_D': Ea_D, 'Ea_M': Ea_M, 'c_SOC': c_SOC, 'c_DOC': c_DOC, 'c_MBC': c_MBC}
SCON_SS_params_dict = {'u_M': u_M, 'a_SD': a_SD, 'a_DS': a_DS, 'a_M': a_M, 'a_MSC': a_MSC, 'k_S_ref': k_S_ref, 'k_D_ref': k_D_ref, 'k_M_ref': k_M_ref, 'Ea_S': Ea_S, 'Ea_D': Ea_D, 'Ea_M': Ea_M, 's_SOC': s_SOC, 's_DOC': s_DOC, 's_MBC': s_MBC}

#System parameters from deterministic AWB model
u_Q_ref = 0.2
Q = 0.002
a_MSA = 0.5
K_D = 200
K_U = 1
V_D_ref = 0.4
V_U_ref = 0.02
Ea_V_D = 75
Ea_V_U = 50
r_M = 0.0004
r_E = 0.00001
r_L = 0.0005

#SAWB diffusion matrix parameters
c_SOC = 2
c_DOC = 0.05
c_MBC = 0.1
c_EEC = 0.01
s_SOC = 0.1
s_DOC = 0.1
s_MBC = 0.1
s_EEC = 0.1

SAWB_C_params_dict = {'u_Q_ref': u_Q_ref, 'Q': Q, 'a_MSA': a_MSA, 'K_D': K_D, 'K_U': K_U, 'V_D_ref': V_D_ref, 'V_U_ref': V_U_ref, 'Ea_V_D': Ea_V_D, 'Ea_V_U': Ea_V_U, 'r_M': r_M, 'r_E': r_E, 'r_L': r_L, 'c_SOC': c_SOC, 'c_DOC': c_DOC, 'c_MBC': c_MBC, 'c_EEC': c_EEC}
SAWB_SS_params_dict = {'u_Q_ref': u_Q_ref, 'Q': Q, 'a_MSA': a_MSA, 'K_D': K_D, 'K_U': K_U, 'V_D_ref': V_D_ref, 'V_U_ref': V_U_ref, 'Ea_V_D': Ea_V_D, 'Ea_V_U': Ea_V_U, 'r_M': r_M, 'r_E': r_E, 'r_L': r_L, 's_SOC': s_SOC, 's_DOC': s_DOC, 's_MBC': s_MBC, 's_EEC': s_EEC}

#System parameters from deterministic AWB-ECA model
u_Q_ref = 0.2
Q = 0.002
a_MSA = 0.5
K_DE = 200
K_UE = 1
V_DE_ref = 0.4
V_UE_ref = 0.02
Ea_V_DE = 75
Ea_V_UE = 50
r_M = 0.0004
r_E = 0.00001
r_L = 0.0005

#SAWB-ECA diffusion matrix parameters
c_SOC = 2
c_DOC = 0.05
c_MBC = 0.1
c_EEC = 0.01
s_SOC = 0.1
s_DOC = 0.1
s_MBC = 0.1
s_EEC = 0.1

SAWB_ECA_C_params_dict = {'u_Q_ref': u_Q_ref, 'Q': Q, 'a_MSA': a_MSA, 'K_DE': K_DE, 'K_UE': K_UE, 'V_DE_ref': V_DE_ref, 'V_UE_ref': V_UE_ref, 'Ea_V_DE': Ea_V_DE, 'Ea_V_UE': Ea_V_UE, 'r_M': r_M, 'r_E': r_E, 'r_L': r_L, 'c_SOC': c_SOC, 'c_DOC': c_DOC, 'c_MBC': c_MBC, 'c_EEC': c_EEC}
SAWB_ECA_SS_params_dict = {'u_Q_ref': u_Q_ref, 'Q': Q, 'a_MSA': a_MSA, 'K_DE': K_DE, 'K_UE': K_UE, 'V_DE_ref': V_DE_ref, 'V_UE_ref': V_UE_ref, 'Ea_V_DE': Ea_V_DE, 'Ea_V_UE': Ea_V_UE, 'r_M': r_M, 'r_E': r_E, 'r_L': r_L, 's_SOC': s_SOC, 's_DOC': s_DOC, 's_MBC': s_MBC, 's_EEC': s_EEC}

In [4]:
#Set flow NN parameters.

devi = torch.device("".join(["cuda:",f'{cuda_id}']) if torch.cuda.is_available() else "cpu")
dt_flow = 0.1
t = 1000
n_flow = int(t / dt_flow) + 1
t_span = np.linspace(0, t, n_flow)
t_span_tensor = torch.reshape(torch.Tensor(t_span), [1, n_flow, 1]) #T_span needs to be converted to tensor object. Additionally, facilitates conversion of I_S and I_D to tensor objects.
l_r = 1e-4
niter = 4000
piter = 200
batch_size = 2 #Number of sets of observation outputs to sample per set of parameters.
state_dim_SCON = 3 #Not including CO2 in STATE_DIM, because CO2 is an observation.
obs_error_scale = 0.1 #Proportion of the mean of observation error standard deviation.

x0_SCON = [40, 0.08, 0.8] #Initial condition means for SCON

In [5]:
#Obtain temperature forcing function.
temp_tensor = temp_gen(t_span_tensor, temp_ref, temp_rise)
print(temp_tensor)

#Obtain SOC and DOC pool litter input vectors for use in flow SDE functions.
i_s_tensor = i_s(t_span_tensor) #Exogenous SOC input function
i_d_tensor = i_d(t_span_tensor) #Exogenous DOC input function
print(i_s_tensor)
print(i_d_tensor)

tensor([[[283.0000],
         [283.2625],
         [283.5248],
         ...,
         [281.1925],
         [281.0533],
         [280.9200]]])
tensor([[[0.0010],
         [0.0010],
         [0.0010],
         ...,
         [0.0013],
         [0.0013],
         [0.0013]]])
tensor([[[1.0000e-04],
         [1.0000e-04],
         [1.0001e-04],
         ...,
         [1.3286e-04],
         [1.3286e-04],
         [1.3287e-04]]])


In [6]:
def train(DEVICE, L_R, NITER, PRETRAIN_ITER, BATCH_SIZE, SDEFLOW, ObsModel, csv_to_obs_df, DATA_CSV, OBS_ERROR_SCALE, STATE_DIM, T, DT, N, T_SPAN_TENSOR, I_S_TENSOR, I_D_TENSOR, TEMP_TENSOR, TEMP_REF, C0, DRIFT_DIFFUSION, PARAMS_DICT): 
    #Read-in observation information. 
    obs_times, obs_means, obs_error = csv_to_obs_df(DATA_CSV, STATE_DIM, T, OBS_ERROR_SCALE)
    #Pass observation information to `ObsModel`.
    obs_model = ObsModel(DEVICE, obs_times, DT, obs_means, obs_error)
    net = SDEFlow(DEVICE, obs_model, STATE_DIM, T, DT, N, I_S_TENSOR, I_D_TENSOR, cond_inputs = 3, num_layers = 6).to(DEVICE)
    optimizer = optim.Adam(net.parameters(), lr = L_R)
    if PRETRAIN_ITER >= NITER:
        raise Exception("PRETRAIN_ITER must be < NITER.")
    best_loss_norm = 1e10
    best_loss_ELBO = 1e10
    norm_losses = [best_loss_norm] * 10
    ELBO_losses = [best_loss_ELBO] * 10
    C0_tensor = torch.tensor(C0).to(DEVICE) #Convert initial conditions from list to tensor for X0 prior object.
    #C0 = C0[(None,) * 2].repeat(BATCH_SIZE, 1, 1).to(DEVICE)
    PARAMS_DICT_TENSOR = {k: torch.tensor(v).expand(BATCH_SIZE) for k, v in PARAMS_DICT.items()}
    X0_prior = D.normal.Normal(loc = C0_tensor, scale = OBS_ERROR_SCALE * C0_tensor) #Setting prior noise = observation noise for now.
    with tqdm(total = NITER, desc = f'Train Diffusion', position = -1) as tq:
        for i in range(NITER):
            net.train()
            optimizer.zero_grad()
            C_PATH, log_prob = net(BATCH_SIZE) #For obs_and_flow.py
            #C_PATH = torch.cat([C0, C_PATH], 1) #Learning initial conditions in this version. #Append deterministic CON initial conditions conditional on parameter values to C path.
            if i <= PRETRAIN_ITER:
                l1_norm_element = C_PATH - torch.mean(obs_model.mu, -1)
                l1_norm = torch.sum(torch.abs(l1_norm_element)).mean()
                best_loss_norm = l1_norm if l1_norm < best_loss_norm else best_loss_norm
                norm_losses.append(l1_norm.item())
                #l2_norm_element = C_PATH - torch.mean(obs_model.mu, -1)
                #l2_norm = torch.sqrt(torch.sum(torch.square(l2_norm_element))).mean()
                #best_loss_norm = l2_norm if l2_norm < best_loss_norm else best_loss_norm
                #norm_losses.append(l2_norm.item())
                if i % 10 == 0:
                    ma_norm_loss = sum(norm_losses[-10:]) / len(norm_losses[-10:])
                    print(f"\nMoving average norm loss at {iter} iterations is: {ma_norm_loss}. Best norm loss value is: {best_loss_norm}.")
                    print('\nC_PATH mean =', C_PATH.mean(-2))
                    print('\nC_PATH =', C_PATH)
                l1_norm.backward()
                #l2_norm.backward()
            else:
                log_lik = calc_log_lik(C_PATH, T_SPAN_TENSOR.to(DEVICE), DT, I_S_TENSOR.to(DEVICE), I_D_TENSOR.to(DEVICE), TEMP_TENSOR.to(DEVICE), TEMP_REF, DRIFT_DIFFUSION, PARAMS_DICT)
                neg_ELBO = -X0_prior.log_prob(C_PATH[:, 0, :]).sum(-1).mean() - log_lik.mean() - obs_model(C_PATH, PARAMS_DICT_TENSOR) + log_prob.mean()
                best_loss_ELBO = neg_ELBO if neg_ELBO < best_loss_ELBO else best_loss_ELBO
                ELBO_losses.append(neg_ELBO.item())
                if i % 10 == 0:             
                    ma_elbo_loss = sum(ELBO_losses[-10:]) / len(ELBO_losses[-10:])
                    print(f"\nMoving average ELBO loss at {iter} iterations is: {ma_elbo_loss}. Best ELBO loss value is: {best_loss_ELBO}.")
                    print('\nC_PATH mean =', C_PATH.mean(-2))
                    print('\nC_PATH =', C_PATH)
            torch.nn.utils.clip_grad_norm_(net.parameters(), 3.0)
            optimizer.step()
            if i % 100000 == 0 and i > 0:
                optimizer.param_groups[0]['lr'] *= 0.1
            tq.update()
    return net, ELBO_losses, norm_losses

In [7]:
net, ELBO_losses, norm_losses = train(devi, l_r, niter, piter, batch_size, SDEFlow, ObsModel, csv_to_obs_df, 'y_from_x_t_1000_dt_0-01.csv', obs_error_scale, state_dim_SCON, t, dt_flow, n_flow, t_span_tensor, i_s_tensor, i_d_tensor, temp_tensor, temp_ref, x0_SCON, drift_diffusion_SCON_C, SCON_C_params_dict)


Train Diffusion:   0%|          | 0/4000 [00:00<?, ?it/s][A


Moving average norm loss at <built-in function iter> iterations is: 9000085270.725. Best norm loss value is: 852707.25.

C_PATH mean = tensor([[0.8326, 0.8493, 0.8397],
        [0.8421, 0.8406, 0.8408]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[0.5148, 1.6574, 1.0591],
         [2.0328, 1.9618, 0.3213],
         [0.2518, 1.6845, 0.4843],
         ...,
         [1.4098, 0.7378, 0.6306],
         [1.3640, 0.5198, 0.8221],
         [1.0584, 0.6409, 0.8375]],

        [[0.9495, 0.6527, 1.0440],
         [0.2745, 0.3526, 1.6695],
         [2.6859, 0.5185, 1.8224],
         ...,
         [0.5829, 0.5205, 1.7013],
         [0.5263, 0.7840, 0.5352],
         [0.5147, 1.0243, 0.6948]]], grad_fn=<AddBackward0>)



Train Diffusion:   0%|          | 1/4000 [00:10<11:38:03, 10.47s/it][A
Train Diffusion:   0%|          | 2/4000 [00:20<11:34:40, 10.43s/it][A
Train Diffusion:   0%|          | 3/4000 [00:30<11:04:48,  9.98s/it][A
Train Diffusion:   0%|          | 4/4000 [00:39<10:48:23,  9.74s/it][A
Train Diffusion:   0%|          | 5/4000 [00:49<10:38:41,  9.59s/it][A
Train Diffusion:   0%|          | 6/4000 [00:58<10:34:59,  9.54s/it][A
Train Diffusion:   0%|          | 7/4000 [01:08<10:38:12,  9.59s/it][A
Train Diffusion:   0%|          | 8/4000 [01:17<10:40:58,  9.63s/it][A
Train Diffusion:   0%|          | 9/4000 [01:27<10:40:48,  9.63s/it][A
Train Diffusion:   0%|          | 10/4000 [01:36<10:37:11,  9.58s/it][A


Moving average norm loss at <built-in function iter> iterations is: 850947.49375. Best norm loss value is: 849492.6875.

C_PATH mean = tensor([[0.9669, 0.9312, 0.9662],
        [0.9584, 0.9369, 0.9727]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[1.1558, 0.7429, 2.7547],
         [3.2437, 0.4328, 2.8207],
         [4.8243, 0.6184, 0.5130],
         ...,
         [0.3825, 0.4922, 0.4544],
         [0.5735, 0.5629, 0.6015],
         [0.5474, 0.5416, 0.6651]],

        [[0.5892, 2.7640, 0.4134],
         [0.3454, 2.6997, 0.2862],
         [0.3641, 2.6801, 2.4350],
         ...,
         [0.5826, 0.7924, 2.1676],
         [0.5095, 0.5018, 0.9482],
         [0.7619, 0.8945, 0.7834]]], grad_fn=<AddBackward0>)



Train Diffusion:   0%|          | 11/4000 [01:46<10:41:17,  9.65s/it][A
Train Diffusion:   0%|          | 12/4000 [01:56<10:42:48,  9.67s/it][A
Train Diffusion:   0%|          | 13/4000 [02:07<11:14:41, 10.15s/it][A
Train Diffusion:   0%|          | 14/4000 [02:18<11:19:05, 10.22s/it][A
Train Diffusion:   0%|          | 15/4000 [02:29<11:47:34, 10.65s/it][A
Train Diffusion:   0%|          | 16/4000 [02:40<11:54:56, 10.77s/it][A
Train Diffusion:   0%|          | 17/4000 [02:51<11:49:52, 10.69s/it][A
Train Diffusion:   0%|          | 18/4000 [03:01<11:38:34, 10.53s/it][A
Train Diffusion:   0%|          | 19/4000 [03:11<11:35:07, 10.48s/it][A
Train Diffusion:   0%|          | 20/4000 [03:22<11:33:07, 10.45s/it][A


Moving average norm loss at <built-in function iter> iterations is: 848024.6375. Best norm loss value is: 847196.875.

C_PATH mean = tensor([[1.0583, 1.0247, 1.0889],
        [1.0655, 1.0277, 1.0856]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[1.0069, 2.4929, 0.4728],
         [0.1938, 3.0121, 0.3633],
         [3.3298, 2.2653, 0.5491],
         ...,
         [0.6882, 0.4377, 0.7187],
         [0.9225, 0.6570, 0.6914],
         [1.6657, 0.6239, 0.6383]],

        [[0.5779, 0.7247, 3.0385],
         [2.3415, 0.4924, 3.3312],
         [0.2299, 0.5704, 2.4875],
         ...,
         [2.0922, 0.6508, 2.2560],
         [0.5938, 0.4379, 1.1986],
         [0.6288, 1.5957, 0.7482]]], grad_fn=<AddBackward0>)



Train Diffusion:   1%|          | 21/4000 [03:32<11:27:03, 10.36s/it][A
Train Diffusion:   1%|          | 22/4000 [03:42<11:18:07, 10.23s/it][A
Train Diffusion:   1%|          | 23/4000 [03:52<11:13:15, 10.16s/it][A
Train Diffusion:   1%|          | 24/4000 [04:04<11:56:07, 10.81s/it][A
Train Diffusion:   1%|          | 25/4000 [04:17<12:41:54, 11.50s/it][A
Train Diffusion:   1%|          | 26/4000 [04:28<12:27:30, 11.29s/it][A
Train Diffusion:   1%|          | 27/4000 [04:38<12:10:40, 11.03s/it][A
Train Diffusion:   1%|          | 28/4000 [04:48<11:50:04, 10.73s/it][A
Train Diffusion:   1%|          | 29/4000 [04:59<11:38:38, 10.56s/it][A
Train Diffusion:   1%|          | 30/4000 [05:08<11:22:12, 10.31s/it][A


Moving average norm loss at <built-in function iter> iterations is: 844510.66875. Best norm loss value is: 842439.0.

C_PATH mean = tensor([[1.2217, 1.1459, 1.2886],
        [1.2264, 1.1123, 1.2715]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[1.0296, 2.3733, 0.4972],
         [0.2212, 2.7362, 2.0093],
         [0.5505, 0.5835, 2.5451],
         ...,
         [0.7155, 0.8457, 0.7587],
         [0.7890, 0.8065, 0.7767],
         [0.7373, 0.6667, 0.9452]],

        [[0.5908, 0.7983, 2.7818],
         [1.8918, 0.5605, 0.2104],
         [4.7945, 1.8530, 0.6327],
         ...,
         [4.6339, 3.6603, 6.6033],
         [3.9200, 2.4293, 3.5256],
         [2.6907, 2.2183, 0.4136]]], grad_fn=<AddBackward0>)



Train Diffusion:   1%|          | 31/4000 [05:18<11:11:07, 10.15s/it][A
Train Diffusion:   1%|          | 32/4000 [05:28<11:07:13, 10.09s/it][A
Train Diffusion:   1%|          | 33/4000 [05:38<11:07:43, 10.10s/it][A
Train Diffusion:   1%|          | 34/4000 [05:49<11:13:25, 10.19s/it][A
Train Diffusion:   1%|          | 35/4000 [06:00<11:29:29, 10.43s/it][A
Train Diffusion:   1%|          | 36/4000 [06:10<11:34:28, 10.51s/it][A
Train Diffusion:   1%|          | 37/4000 [06:21<11:31:15, 10.47s/it][A
Train Diffusion:   1%|          | 38/4000 [06:32<11:47:27, 10.71s/it][A
Train Diffusion:   1%|          | 39/4000 [06:43<11:45:38, 10.69s/it][A
Train Diffusion:   1%|          | 40/4000 [06:54<11:52:17, 10.79s/it][A


Moving average norm loss at <built-in function iter> iterations is: 840225.1125. Best norm loss value is: 838048.375.

C_PATH mean = tensor([[1.4454, 1.2554, 1.5205],
        [1.4472, 1.2752, 1.5027]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[1.3544, 1.9289, 0.6150],
         [0.2670, 2.4105, 0.6853],
         [0.6628, 2.0673, 0.7912],
         ...,
         [5.5938, 0.7645, 5.9651],
         [0.4888, 0.2158, 1.5414],
         [0.4531, 0.7228, 0.2353]],

        [[0.5688, 0.9521, 2.1890],
         [0.9670, 0.6791, 3.3617],
         [3.4429, 0.6992, 2.9235],
         ...,
         [0.7099, 0.1622, 1.0063],
         [1.3305, 0.5913, 0.5069],
         [3.4803, 0.7432, 8.4057]]], grad_fn=<AddBackward0>)



Train Diffusion:   1%|          | 41/4000 [07:04<11:43:35, 10.66s/it][A
Train Diffusion:   1%|          | 42/4000 [07:15<11:47:35, 10.73s/it][A
Train Diffusion:   1%|          | 43/4000 [07:26<11:51:26, 10.79s/it][A
Train Diffusion:   1%|          | 44/4000 [07:36<11:36:03, 10.56s/it][A
Train Diffusion:   1%|          | 45/4000 [07:46<11:30:16, 10.47s/it][A
Train Diffusion:   1%|          | 46/4000 [07:57<11:32:15, 10.50s/it][A
Train Diffusion:   1%|          | 47/4000 [08:06<11:16:03, 10.26s/it][A
Train Diffusion:   1%|          | 48/4000 [08:17<11:21:28, 10.35s/it][A
Train Diffusion:   1%|          | 49/4000 [08:27<11:20:45, 10.34s/it][A
Train Diffusion:   1%|▏         | 50/4000 [08:37<11:09:40, 10.17s/it][A


Moving average norm loss at <built-in function iter> iterations is: 835511.9375. Best norm loss value is: 832888.8125.

C_PATH mean = tensor([[1.5819, 1.2546, 1.6424],
        [1.5584, 1.2400, 1.6053]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[ 0.5355,  1.0332,  1.1409],
         [ 0.9125,  0.6027,  2.7861],
         [ 1.7284,  0.4671,  0.4355],
         ...,
         [ 0.0749,  1.2806,  0.7453],
         [ 0.8814,  0.8750,  0.8418],
         [ 0.7743,  1.0140,  0.2082]],

        [[ 1.2657,  1.6836,  1.2008],
         [ 0.2486,  2.4949,  0.7721],
         [ 0.3118,  1.5760,  0.7886],
         ...,
         [ 0.8346,  4.0480,  7.4997],
         [ 3.7627,  2.7676,  3.5806],
         [ 2.7073,  1.7536, 11.0034]]], grad_fn=<AddBackward0>)



Train Diffusion:   1%|▏         | 51/4000 [08:47<11:03:15, 10.08s/it][A
Train Diffusion:   1%|▏         | 52/4000 [08:57<11:01:33, 10.05s/it][A
Train Diffusion:   1%|▏         | 53/4000 [09:07<11:11:02, 10.20s/it][A
Train Diffusion:   1%|▏         | 54/4000 [09:20<11:49:50, 10.79s/it][A
Train Diffusion:   1%|▏         | 55/4000 [09:31<12:07:45, 11.07s/it][A
Train Diffusion:   1%|▏         | 56/4000 [09:47<13:45:39, 12.56s/it][A
Train Diffusion:   1%|▏         | 57/4000 [10:06<15:50:07, 14.46s/it][A
Train Diffusion:   1%|▏         | 58/4000 [10:24<17:02:56, 15.57s/it][A
Train Diffusion:   1%|▏         | 59/4000 [10:38<16:21:37, 14.94s/it][A
Train Diffusion:   2%|▏         | 60/4000 [10:51<15:38:57, 14.30s/it][A


Moving average norm loss at <built-in function iter> iterations is: 830305.26875. Best norm loss value is: 828275.125.

C_PATH mean = tensor([[1.6999, 1.3240, 1.8473],
        [1.7127, 1.3436, 1.8483]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[0.3964, 1.0447, 1.4228],
         [0.9916, 3.0477, 0.1686],
         [6.1063, 0.8443, 3.5116],
         ...,
         [3.8965, 3.0297, 5.1890],
         [0.6572, 0.3324, 1.1678],
         [0.7453, 0.8854, 0.5959]],

        [[1.4125, 2.1626, 0.3533],
         [0.1381, 0.9285, 1.0828],
         [0.6257, 0.4863, 1.1555],
         ...,
         [0.9883, 0.9951, 0.8148],
         [0.9096, 0.6465, 3.3614],
         [2.1486, 0.8795, 9.8348]]], grad_fn=<AddBackward0>)



Train Diffusion:   2%|▏         | 61/4000 [11:02<14:33:55, 13.31s/it][A
Train Diffusion:   2%|▏         | 62/4000 [11:12<13:40:04, 12.49s/it][A
Train Diffusion:   2%|▏         | 63/4000 [11:23<13:04:19, 11.95s/it][A
Train Diffusion:   2%|▏         | 64/4000 [11:34<12:54:36, 11.81s/it][A
Train Diffusion:   2%|▏         | 65/4000 [11:45<12:33:21, 11.49s/it][A
Train Diffusion:   2%|▏         | 66/4000 [11:56<12:28:18, 11.41s/it][A
Train Diffusion:   2%|▏         | 67/4000 [12:07<12:09:06, 11.12s/it][A
Train Diffusion:   2%|▏         | 68/4000 [12:17<11:53:39, 10.89s/it][A
Train Diffusion:   2%|▏         | 69/4000 [12:28<11:44:59, 10.76s/it][A
Train Diffusion:   2%|▏         | 70/4000 [12:39<11:58:21, 10.97s/it][A


Moving average norm loss at <built-in function iter> iterations is: 825702.55625. Best norm loss value is: 822879.3125.

C_PATH mean = tensor([[1.9840, 1.3419, 1.9489],
        [2.0101, 1.3842, 1.9843]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[ 0.3903,  1.2756,  1.2968],
         [ 0.7480,  0.5245,  1.3806],
         [ 1.0244,  0.5159,  2.9563],
         ...,
         [ 7.4294,  0.9523,  8.4382],
         [ 1.1584,  2.6204,  1.3963],
         [ 2.8204,  0.7566, 12.5134]],

        [[ 1.8195,  1.6832,  1.5664],
         [ 5.2437,  1.9965,  2.6884],
         [ 0.2614,  0.6326,  1.4155],
         ...,
         [ 0.8249,  4.8036,  1.1079],
         [ 4.7779,  1.0031,  3.8075],
         [ 1.1588,  1.3377,  1.0204]]], grad_fn=<AddBackward0>)



Train Diffusion:   2%|▏         | 71/4000 [12:53<12:59:15, 11.90s/it][A
Train Diffusion:   2%|▏         | 72/4000 [13:05<13:06:57, 12.02s/it][A
Train Diffusion:   2%|▏         | 73/4000 [13:16<12:44:32, 11.68s/it][A
Train Diffusion:   2%|▏         | 74/4000 [13:27<12:31:48, 11.49s/it][A
Train Diffusion:   2%|▏         | 75/4000 [13:40<12:46:39, 11.72s/it][A
Train Diffusion:   2%|▏         | 76/4000 [13:52<13:05:07, 12.00s/it][A
Train Diffusion:   2%|▏         | 77/4000 [14:08<14:09:06, 12.99s/it][A
Train Diffusion:   2%|▏         | 78/4000 [14:22<14:40:43, 13.47s/it][A
Train Diffusion:   2%|▏         | 79/4000 [14:48<18:36:59, 17.09s/it][A
Train Diffusion:   2%|▏         | 80/4000 [15:10<20:15:34, 18.61s/it][A


Moving average norm loss at <built-in function iter> iterations is: 821424.16875. Best norm loss value is: 817533.0625.

C_PATH mean = tensor([[2.1912, 1.3578, 2.0404],
        [2.2038, 1.3697, 2.0223]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[0.2858, 1.5085, 1.5640],
         [0.7178, 1.1812, 1.0757],
         [0.9494, 1.0643, 0.9234],
         ...,
         [5.2448, 2.6227, 5.0019],
         [0.4477, 0.2059, 1.6010],
         [0.7083, 0.4227, 0.2272]],

        [[2.5050, 1.8213, 3.3321],
         [5.1694, 2.7416, 2.6883],
         [4.2685, 2.3398, 2.8784],
         ...,
         [1.0488, 1.1203, 0.8636],
         [0.9268, 0.5070, 0.2492],
         [0.4246, 1.1756, 4.3674]]], grad_fn=<AddBackward0>)



Train Diffusion:   2%|▏         | 81/4000 [15:26<19:17:40, 17.72s/it][A
Train Diffusion:   2%|▏         | 82/4000 [15:42<18:47:54, 17.27s/it][A
Train Diffusion:   2%|▏         | 83/4000 [16:00<19:10:07, 17.62s/it][A
Train Diffusion:   2%|▏         | 84/4000 [16:15<18:10:22, 16.71s/it][A
Train Diffusion:   2%|▏         | 85/4000 [16:29<17:29:18, 16.08s/it][A
Train Diffusion:   2%|▏         | 86/4000 [16:43<16:41:31, 15.35s/it][A
Train Diffusion:   2%|▏         | 87/4000 [16:55<15:32:21, 14.30s/it][A
Train Diffusion:   2%|▏         | 88/4000 [17:07<14:56:22, 13.75s/it][A
Train Diffusion:   2%|▏         | 89/4000 [17:21<14:50:47, 13.67s/it][A
Train Diffusion:   2%|▏         | 90/4000 [17:34<14:44:17, 13.57s/it][A


Moving average norm loss at <built-in function iter> iterations is: 812268.25625. Best norm loss value is: 808786.625.

C_PATH mean = tensor([[2.5990, 1.4207, 2.2392],
        [2.4998, 1.3851, 2.1709]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[ 2.7914,  1.7589,  3.6431],
         [ 5.8939,  2.1362,  2.1939],
         [ 3.4981,  2.0371,  2.3383],
         ...,
         [ 1.0763,  1.1640,  0.8639],
         [ 0.8843,  1.0519,  0.7994],
         [ 1.0401,  1.1749,  1.7979]],

        [[ 0.3477,  1.9106,  1.9803],
         [ 0.8385,  1.4228,  1.2273],
         [ 1.0428,  1.1811,  1.0097],
         ...,
         [10.8158,  1.9723,  4.9535],
         [ 2.0055,  1.3279,  3.0831],
         [ 1.4510,  0.5793, 14.2711]]], grad_fn=<AddBackward0>)



Train Diffusion:   2%|▏         | 91/4000 [17:47<14:28:38, 13.33s/it][A
Train Diffusion:   2%|▏         | 92/4000 [18:03<15:26:55, 14.23s/it][A
Train Diffusion:   2%|▏         | 93/4000 [18:15<14:42:40, 13.56s/it][A
Train Diffusion:   2%|▏         | 94/4000 [18:28<14:22:14, 13.24s/it][A
Train Diffusion:   2%|▏         | 95/4000 [18:40<14:00:46, 12.92s/it][A
Train Diffusion:   2%|▏         | 96/4000 [18:52<13:46:09, 12.70s/it][A
Train Diffusion:   2%|▏         | 97/4000 [19:04<13:31:46, 12.48s/it][A
Train Diffusion:   2%|▏         | 98/4000 [19:17<13:42:17, 12.64s/it][A
Train Diffusion:   2%|▏         | 99/4000 [19:30<13:40:43, 12.62s/it][A
Train Diffusion:   2%|▎         | 100/4000 [19:42<13:36:27, 12.56s/it][A


Moving average norm loss at <built-in function iter> iterations is: 802664.45625. Best norm loss value is: 798484.1875.

C_PATH mean = tensor([[2.9675, 1.3869, 2.3572],
        [3.0501, 1.4097, 2.3183]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[4.7230e-01, 1.6032e+00, 1.5460e+00],
         [5.3570e-01, 2.3560e-01, 2.0054e+00],
         [1.2001e+00, 8.4347e-01, 7.7409e-01],
         ...,
         [7.1464e-01, 3.0232e-02, 4.3726e+00],
         [2.3104e+00, 9.2978e-01, 9.5822e-01],
         [1.0697e+00, 1.4174e+00, 5.1303e-03]],

        [[2.0135e+00, 1.0094e+00, 1.3403e+00],
         [9.2324e+00, 9.0159e-01, 2.6305e+00],
         [8.2254e+00, 1.6154e+00, 2.9140e+00],
         ...,
         [1.6345e+01, 9.0215e-01, 6.2710e+00],
         [1.3562e+01, 4.6818e-01, 2.9507e+00],
         [6.1848e-01, 3.3514e+00, 3.6969e+00]]], grad_fn=<AddBackward0>)



Train Diffusion:   3%|▎         | 101/4000 [19:55<13:37:53, 12.59s/it][A
Train Diffusion:   3%|▎         | 102/4000 [20:07<13:23:02, 12.36s/it][A
Train Diffusion:   3%|▎         | 103/4000 [20:18<12:57:55, 11.98s/it][A
Train Diffusion:   3%|▎         | 104/4000 [20:31<13:18:42, 12.30s/it][A
Train Diffusion:   3%|▎         | 105/4000 [20:43<13:20:21, 12.33s/it][A
Train Diffusion:   3%|▎         | 106/4000 [20:56<13:39:48, 12.63s/it][A
Train Diffusion:   3%|▎         | 107/4000 [21:09<13:36:36, 12.59s/it][A
Train Diffusion:   3%|▎         | 108/4000 [21:22<13:41:12, 12.66s/it][A
Train Diffusion:   3%|▎         | 109/4000 [21:35<13:42:22, 12.68s/it][A
Train Diffusion:   3%|▎         | 110/4000 [21:48<13:51:10, 12.82s/it][A


Moving average norm loss at <built-in function iter> iterations is: 793372.85. Best norm loss value is: 790906.875.

C_PATH mean = tensor([[3.3600, 1.3736, 2.3189],
        [3.2708, 1.3557, 2.3050]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[4.7307e-01, 2.2023e+00, 1.9467e+00],
         [1.3728e+00, 5.9311e-01, 2.7757e-02],
         [4.5749e-02, 2.3231e+00, 2.1811e+00],
         ...,
         [7.1253e-01, 1.1034e+00, 9.6144e-01],
         [7.6922e-01, 1.1001e+00, 5.6189e-01],
         [8.9694e-02, 4.2528e+00, 7.2339e+00]],

        [[2.4239e+00, 1.0767e+00, 3.4297e+00],
         [1.2723e-02, 2.0779e-01, 2.6617e+00],
         [1.3140e+00, 1.7619e+00, 8.9540e+00],
         ...,
         [6.9458e+00, 2.5520e+00, 5.7543e+00],
         [3.1043e+00, 8.8955e-02, 4.5003e+00],
         [1.1672e+00, 4.7172e+00, 1.0479e-03]]], grad_fn=<AddBackward0>)



Train Diffusion:   3%|▎         | 111/4000 [22:01<13:55:49, 12.90s/it][A
Train Diffusion:   3%|▎         | 112/4000 [22:13<13:52:09, 12.84s/it][A
Train Diffusion:   3%|▎         | 113/4000 [22:26<13:41:45, 12.68s/it][A
Train Diffusion:   3%|▎         | 114/4000 [22:37<13:18:48, 12.33s/it][A
Train Diffusion:   3%|▎         | 115/4000 [22:51<13:40:28, 12.67s/it][A
Train Diffusion:   3%|▎         | 116/4000 [23:04<13:58:14, 12.95s/it][A
Train Diffusion:   3%|▎         | 117/4000 [23:17<13:48:18, 12.80s/it][A
Train Diffusion:   3%|▎         | 118/4000 [23:30<13:52:04, 12.86s/it][A
Train Diffusion:   3%|▎         | 119/4000 [23:45<14:41:05, 13.62s/it][A
Train Diffusion:   3%|▎         | 120/4000 [24:00<15:06:15, 14.01s/it][A


Moving average norm loss at <built-in function iter> iterations is: 784211.73125. Best norm loss value is: 782109.25.

C_PATH mean = tensor([[3.7979, 1.3322, 2.3688],
        [3.7243, 1.3427, 2.3856]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[2.6801, 0.7820, 2.9586],
         [9.5572, 1.6535, 2.2650],
         [4.0257, 1.8725, 2.3476],
         ...,
         [1.3633, 1.5081, 1.3086],
         [0.9845, 0.3541, 0.1031],
         [0.0538, 4.6441, 7.5132]],

        [[0.4105, 1.7281, 1.8137],
         [0.9561, 1.2158, 1.3105],
         [1.0468, 1.0999, 0.9610],
         ...,
         [7.3755, 2.5624, 5.5514],
         [0.2411, 0.1200, 2.9762],
         [2.1984, 2.6331, 0.0107]]], grad_fn=<AddBackward0>)



Train Diffusion:   3%|▎         | 121/4000 [24:16<15:49:27, 14.69s/it][A
Train Diffusion:   3%|▎         | 122/4000 [24:34<16:41:52, 15.50s/it][A
Train Diffusion:   3%|▎         | 123/4000 [24:53<17:53:24, 16.61s/it][A
Train Diffusion:   3%|▎         | 124/4000 [25:08<17:23:57, 16.16s/it][A
Train Diffusion:   3%|▎         | 125/4000 [25:21<16:26:21, 15.27s/it][A
Train Diffusion:   3%|▎         | 126/4000 [25:33<15:22:07, 14.28s/it][A
Train Diffusion:   3%|▎         | 127/4000 [25:46<14:55:48, 13.88s/it][A
Train Diffusion:   3%|▎         | 128/4000 [25:57<14:05:28, 13.10s/it][A
Train Diffusion:   3%|▎         | 129/4000 [26:10<13:49:58, 12.86s/it][A
Train Diffusion:   3%|▎         | 130/4000 [26:24<14:07:57, 13.15s/it][A


Moving average norm loss at <built-in function iter> iterations is: 774588.925. Best norm loss value is: 769566.125.

C_PATH mean = tensor([[4.3114, 1.3101, 2.3997],
        [4.2298, 1.3096, 2.3731]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[2.8811e+00, 1.0636e+00, 4.2895e+00],
         [1.2903e+01, 1.7240e+00, 2.3088e+00],
         [4.2475e+00, 2.1879e+00, 2.3554e+00],
         ...,
         [1.1142e+00, 1.2327e+00, 1.1470e+00],
         [8.4844e-01, 9.5015e-01, 1.0140e+00],
         [8.5971e-01, 1.7231e+00, 8.9058e-03]],

        [[4.3284e-01, 2.3490e+00, 2.1394e+00],
         [1.4921e+00, 1.5212e+00, 1.5029e+00],
         [1.1461e+00, 1.1768e+00, 9.9605e-01],
         ...,
         [7.5474e+00, 2.8055e+00, 6.0806e+00],
         [3.5637e+00, 1.5597e+00, 3.8590e+00],
         [5.1233e-01, 3.2202e+00, 6.7833e+00]]], grad_fn=<AddBackward0>)



Train Diffusion:   3%|▎         | 131/4000 [26:39<14:42:43, 13.69s/it][A
Train Diffusion:   3%|▎         | 132/4000 [26:55<15:36:12, 14.52s/it][A
Train Diffusion:   3%|▎         | 133/4000 [27:15<17:17:46, 16.10s/it][A
Train Diffusion:   3%|▎         | 134/4000 [27:35<18:29:58, 17.23s/it][A
Train Diffusion:   3%|▎         | 135/4000 [27:54<19:11:02, 17.87s/it][A
Train Diffusion:   3%|▎         | 136/4000 [28:12<19:06:10, 17.80s/it][A
Train Diffusion:   3%|▎         | 137/4000 [28:25<17:37:43, 16.43s/it][A
Train Diffusion:   3%|▎         | 138/4000 [28:36<15:54:21, 14.83s/it][A
Train Diffusion:   3%|▎         | 139/4000 [28:51<15:57:49, 14.88s/it][A
Train Diffusion:   4%|▎         | 140/4000 [29:04<15:23:55, 14.36s/it][A


Moving average norm loss at <built-in function iter> iterations is: 764680.3625. Best norm loss value is: 762055.5.

C_PATH mean = tensor([[4.6024, 1.2818, 2.3882],
        [4.6735, 1.2606, 2.4150]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[ 3.0410,  1.0685,  4.3641],
         [13.7231,  1.6796,  2.7738],
         [ 5.1850,  0.2626,  3.3381],
         ...,
         [ 5.7475,  1.3364,  4.9584],
         [25.4983,  0.1140,  3.2144],
         [ 1.4694,  3.8727,  1.9968]],

        [[ 0.4577,  2.3231,  2.2118],
         [ 1.1547,  1.2039,  1.3970],
         [ 1.0046,  1.0183,  0.3658],
         ...,
         [ 1.6687,  0.1459,  3.5266],
         [ 3.7992,  0.3978,  0.4753],
         [ 3.8362,  3.7812, 15.6662]]], grad_fn=<AddBackward0>)



Train Diffusion:   4%|▎         | 141/4000 [29:16<14:31:08, 13.54s/it][A
Train Diffusion:   4%|▎         | 142/4000 [29:27<13:45:29, 12.84s/it][A
Train Diffusion:   4%|▎         | 143/4000 [29:38<13:13:16, 12.34s/it][A
Train Diffusion:   4%|▎         | 144/4000 [29:49<12:50:00, 11.98s/it][A
Train Diffusion:   4%|▎         | 145/4000 [30:01<12:35:33, 11.76s/it][A
Train Diffusion:   4%|▎         | 146/4000 [30:10<11:48:34, 11.03s/it][A
Train Diffusion:   4%|▎         | 147/4000 [30:21<11:49:46, 11.05s/it][A
Train Diffusion:   4%|▎         | 148/4000 [30:31<11:26:17, 10.69s/it][A
Train Diffusion:   4%|▎         | 149/4000 [30:41<11:11:21, 10.46s/it][A
Train Diffusion:   4%|▍         | 150/4000 [30:52<11:17:57, 10.57s/it][A


Moving average norm loss at <built-in function iter> iterations is: 760189.5. Best norm loss value is: 756173.0.

C_PATH mean = tensor([[4.8165, 1.1334, 2.4033],
        [4.8364, 1.1420, 2.3743]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[3.0808e+00, 9.4378e-01, 4.4663e+00],
         [1.4722e+01, 1.2875e+00, 2.1419e+00],
         [1.2878e-01, 4.6957e-01, 2.4952e+00],
         ...,
         [1.0851e+00, 2.8301e-01, 7.0270e+00],
         [2.2919e+00, 1.3924e+00, 2.0390e+01],
         [7.6897e-01, 3.7941e+00, 6.6901e+00]],

        [[4.6858e-01, 2.1217e+00, 2.1066e+00],
         [1.3234e+00, 1.0390e+00, 1.2700e+00],
         [9.1028e-01, 2.5172e-01, 2.2006e-02],
         ...,
         [1.9136e-02, 3.0341e+00, 1.4386e+00],
         [6.6150e-01, 3.2561e+00, 3.4099e+00],
         [5.3173e-01, 2.0016e+00, 1.2318e-01]]], grad_fn=<AddBackward0>)



Train Diffusion:   4%|▍         | 151/4000 [31:03<11:29:16, 10.74s/it][A
Train Diffusion:   4%|▍         | 152/4000 [31:14<11:44:10, 10.98s/it][A
Train Diffusion:   4%|▍         | 153/4000 [31:24<11:16:54, 10.56s/it][A
Train Diffusion:   4%|▍         | 154/4000 [31:33<10:56:14, 10.24s/it][A
Train Diffusion:   4%|▍         | 155/4000 [31:43<10:36:24,  9.93s/it][A
Train Diffusion:   4%|▍         | 156/4000 [31:52<10:31:32,  9.86s/it][A
Train Diffusion:   4%|▍         | 157/4000 [32:05<11:30:33, 10.78s/it][A
Train Diffusion:   4%|▍         | 158/4000 [32:17<11:51:31, 11.11s/it][A
Train Diffusion:   4%|▍         | 159/4000 [32:29<12:02:27, 11.29s/it][A
Train Diffusion:   4%|▍         | 160/4000 [32:39<11:40:52, 10.95s/it][A


Moving average norm loss at <built-in function iter> iterations is: 757032.5125. Best norm loss value is: 755296.875.

C_PATH mean = tensor([[4.9709, 1.1674, 2.3395],
        [4.9484, 1.1708, 2.3074]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[4.2521e-01, 2.1587e+00, 1.0434e+00],
         [2.8612e+00, 9.5048e-04, 7.9426e-01],
         [1.3593e+00, 6.3788e-01, 3.0275e+00],
         ...,
         [1.2024e+00, 2.1440e-02, 3.5724e+00],
         [4.6449e+00, 3.7333e-01, 9.0583e-01],
         [7.9214e-01, 4.1620e+00, 3.3360e+00]],

        [[3.0137e+00, 1.0344e+00, 2.0348e-01],
         [2.1970e+00, 3.6703e+00, 3.5377e-01],
         [6.3184e+00, 1.1131e+00, 1.9386e+00],
         ...,
         [3.3081e+01, 6.9928e-01, 4.4877e+00],
         [2.9350e+01, 2.9450e-01, 2.5925e+00],
         [2.0028e+00, 3.7187e+00, 2.1803e+01]]], grad_fn=<AddBackward0>)



Train Diffusion:   4%|▍         | 161/4000 [32:51<12:05:43, 11.34s/it][A
Train Diffusion:   4%|▍         | 162/4000 [33:01<11:44:39, 11.02s/it][A
Train Diffusion:   4%|▍         | 163/4000 [33:12<11:35:40, 10.88s/it][A
Train Diffusion:   4%|▍         | 164/4000 [33:26<12:35:11, 11.81s/it][A
Train Diffusion:   4%|▍         | 165/4000 [33:37<12:13:39, 11.48s/it][A
Train Diffusion:   4%|▍         | 166/4000 [33:47<11:46:40, 11.06s/it][A
Train Diffusion:   4%|▍         | 167/4000 [34:00<12:24:49, 11.66s/it][A
Train Diffusion:   4%|▍         | 168/4000 [34:13<12:49:17, 12.05s/it][A
Train Diffusion:   4%|▍         | 169/4000 [34:23<12:22:38, 11.63s/it][A
Train Diffusion:   4%|▍         | 170/4000 [34:34<11:53:51, 11.18s/it][A


Moving average norm loss at <built-in function iter> iterations is: 757241.625. Best norm loss value is: 755296.875.

C_PATH mean = tensor([[5.0445, 1.2779, 2.4969],
        [5.0146, 1.2308, 2.3681]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[5.6491e-01, 2.1944e+00, 2.2536e+00],
         [1.4988e+00, 1.1762e+00, 6.4334e-01],
         [7.7467e+00, 1.0646e+00, 2.1383e+00],
         ...,
         [1.1204e+00, 3.2357e-01, 4.2780e+00],
         [3.0057e+01, 3.2219e-01, 3.7989e+00],
         [3.3001e+00, 3.1536e+00, 1.3107e+01]],

        [[3.4722e+00, 1.0329e+00, 4.7650e+00],
         [1.6349e+01, 1.1729e-01, 3.5840e+00],
         [1.3526e+00, 6.7898e-01, 2.9913e+00],
         ...,
         [2.2741e-02, 1.0022e-01, 3.0167e+00],
         [4.8861e+00, 3.1443e-01, 9.1543e-01],
         [7.1655e-01, 3.0996e+00, 5.4853e+00]]], grad_fn=<AddBackward0>)



Train Diffusion:   4%|▍         | 171/4000 [34:44<11:39:30, 10.96s/it][A
Train Diffusion:   4%|▍         | 172/4000 [34:54<11:23:55, 10.72s/it][A
Train Diffusion:   4%|▍         | 173/4000 [35:04<11:04:40, 10.42s/it][A
Train Diffusion:   4%|▍         | 174/4000 [35:14<11:00:27, 10.36s/it][A
Train Diffusion:   4%|▍         | 175/4000 [35:24<10:51:38, 10.22s/it][A
Train Diffusion:   4%|▍         | 176/4000 [35:34<10:51:15, 10.22s/it][A
Train Diffusion:   4%|▍         | 177/4000 [35:44<10:51:17, 10.22s/it][A
Train Diffusion:   4%|▍         | 178/4000 [35:55<10:49:02, 10.19s/it][A
Train Diffusion:   4%|▍         | 179/4000 [36:06<11:04:33, 10.44s/it][A
Train Diffusion:   4%|▍         | 180/4000 [36:16<11:07:48, 10.49s/it][A


Moving average norm loss at <built-in function iter> iterations is: 752960.14375. Best norm loss value is: 747659.25.

C_PATH mean = tensor([[5.2802, 1.2150, 2.4386],
        [5.2837, 1.1584, 2.4479]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[ 0.4551,  2.0722,  2.3282],
         [ 0.6182,  5.7597,  1.3721],
         [ 7.4457,  0.9131,  2.8181],
         ...,
         [36.6243,  0.4126,  4.8111],
         [ 3.2992,  1.4985,  2.8422],
         [ 2.5887,  3.6221,  0.9417]],

        [[ 3.2926,  0.9581,  0.1447],
         [18.6028,  0.8332,  3.9880],
         [ 1.5407,  0.6161,  2.9379],
         ...,
         [ 3.1634,  0.5531,  1.0045],
         [ 0.6199,  0.6877,  0.8326],
         [ 0.6946,  3.6414,  5.5579]]], grad_fn=<AddBackward0>)



Train Diffusion:   5%|▍         | 181/4000 [36:27<11:07:04, 10.48s/it][A
Train Diffusion:   5%|▍         | 182/4000 [36:42<12:47:24, 12.06s/it][A
Train Diffusion:   5%|▍         | 183/4000 [36:55<12:53:36, 12.16s/it][A
Train Diffusion:   5%|▍         | 184/4000 [37:07<12:58:24, 12.24s/it][A
Train Diffusion:   5%|▍         | 185/4000 [37:19<12:51:49, 12.14s/it][A
Train Diffusion:   5%|▍         | 186/4000 [37:29<12:12:53, 11.53s/it][A
Train Diffusion:   5%|▍         | 187/4000 [37:43<12:48:37, 12.09s/it][A
Train Diffusion:   5%|▍         | 188/4000 [37:55<13:02:12, 12.31s/it][A
Train Diffusion:   5%|▍         | 189/4000 [38:07<12:52:13, 12.16s/it][A
Train Diffusion:   5%|▍         | 190/4000 [38:17<12:13:13, 11.55s/it][A


Moving average norm loss at <built-in function iter> iterations is: 748301.09375. Best norm loss value is: 745217.375.

C_PATH mean = tensor([[5.4607, 1.1631, 2.4491],
        [5.3195, 1.1666, 2.4075]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[3.3322e+00, 1.0320e+00, 5.2018e+00],
         [1.8134e+01, 8.3439e-01, 2.3798e+00],
         [4.5098e+00, 2.2376e+00, 2.4308e+00],
         ...,
         [1.9303e-02, 3.5615e+00, 5.2810e+00],
         [8.0377e-02, 3.7253e-02, 3.8017e+00],
         [4.7170e+00, 3.0326e+00, 9.5401e+00]],

        [[3.7545e-01, 2.0550e+00, 2.4696e+00],
         [1.1728e+00, 7.6127e-01, 1.1842e+00],
         [7.5699e-01, 7.1008e-01, 7.2958e-01],
         ...,
         [8.9890e-01, 2.9647e+00, 1.8253e-06],
         [7.6533e-01, 1.4393e+00, 2.4352e+00],
         [1.7103e+01, 3.5705e+00, 1.2478e+00]]], grad_fn=<AddBackward0>)



Train Diffusion:   5%|▍         | 191/4000 [38:28<12:01:04, 11.36s/it][A
Train Diffusion:   5%|▍         | 192/4000 [38:39<11:51:00, 11.20s/it][A
Train Diffusion:   5%|▍         | 193/4000 [38:50<11:45:58, 11.13s/it][A
Train Diffusion:   5%|▍         | 194/4000 [39:00<11:21:41, 10.75s/it][A
Train Diffusion:   5%|▍         | 195/4000 [39:10<11:15:22, 10.65s/it][A
Train Diffusion:   5%|▍         | 196/4000 [39:20<10:56:01, 10.35s/it][A
Train Diffusion:   5%|▍         | 197/4000 [39:30<10:49:17, 10.24s/it][A
Train Diffusion:   5%|▍         | 198/4000 [39:40<10:44:40, 10.17s/it][A
Train Diffusion:   5%|▍         | 199/4000 [39:50<10:36:59, 10.06s/it][A
Train Diffusion:   5%|▌         | 200/4000 [40:01<11:00:17, 10.43s/it][A


Moving average norm loss at <built-in function iter> iterations is: 748296.8375. Best norm loss value is: 745217.375.

C_PATH mean = tensor([[5.4264, 1.2351, 2.3445],
        [5.4046, 1.2356, 2.3693]], grad_fn=<MeanBackward1>)

C_PATH = tensor([[[ 3.4892,  1.1128,  5.4618],
         [19.4885,  1.0328,  2.6518],
         [ 4.8089,  2.5684,  2.8400],
         ...,
         [ 3.2185,  0.4765,  0.8286],
         [ 0.5741,  0.6457,  0.7090],
         [ 0.5499,  2.9043,  1.5110]],

        [[ 0.4450,  2.1229,  2.4690],
         [ 1.2466,  0.8200,  1.1711],
         [ 0.7620,  0.7183,  0.7350],
         ...,
         [39.7257,  0.3676,  5.0869],
         [ 3.3032,  1.4061,  2.9814],
         [ 0.3410,  4.8755,  6.7588]]], grad_fn=<AddBackward0>)



Train Diffusion:   5%|▌         | 201/4000 [40:15<12:40:48, 12.02s/it][A


NameError: name 'temp_tensor' is not defined