In [None]:
!pip install tensorflow==1.15
!pip install torch==1.6
!pip install torch-stft
!pip install unidecode

### Dataset download

In [None]:
!gdown --id 10tCachYdSGWVQdIpX4PazptTbM0HITka

Downloading...
From: https://drive.google.com/uc?id=10tCachYdSGWVQdIpX4PazptTbM0HITka
To: /content/Blizzard_Emotion_Data.zip
114MB [00:01, 97.5MB/s]


In [None]:
!unzip Blizzard_Emotion_Data.zip

In [None]:
!gdown --id 1uBuZZpuDHhqj4pE5rM67GEgBidam56Fn
!unzip text.zip

Downloading...
From: https://drive.google.com/uc?id=1uBuZZpuDHhqj4pE5rM67GEgBidam56Fn
To: /content/text.zip
  0% 0.00/13.8k [00:00<?, ?B/s]100% 13.8k/13.8k [00:00<00:00, 26.0MB/s]
Archive:  text.zip
   creating: text/
  inflating: text/LICENSE            
  inflating: text/__init__.py        
  inflating: text/cleaners.py        
  inflating: text/cmudict.py         
  inflating: text/numbers.py         
  inflating: text/symbols.py         
   creating: text/__pycache__/
  inflating: text/__pycache__/__init__.cpython-37.pyc  
  inflating: text/__pycache__/cleaners.cpython-37.pyc  
  inflating: text/__pycache__/numbers.cpython-37.pyc  
  inflating: text/__pycache__/symbols.cpython-37.pyc  
  inflating: text/__pycache__/cmudict.cpython-37.pyc  


In [None]:
!gdown --id 1gWITMTGSb5JdkWZ1lwkchxWzrtPEn8cj
!unzip -j fg_new.zip

Downloading...
From: https://drive.google.com/uc?id=1gWITMTGSb5JdkWZ1lwkchxWzrtPEn8cj
To: /content/fg_new.zip
  0% 0.00/32.0k [00:00<?, ?B/s]100% 32.0k/32.0k [00:00<00:00, 10.2MB/s]
Archive:  fg_new.zip
  inflating: Blizzard_train_filelist.txt  
  inflating: Blizzard_val_filelist.txt  
  inflating: hparams.py              
  inflating: audio_processing.py     
  inflating: layers.py               
  inflating: yin.py                  


In [None]:
!gdown --id 1VSV4TapY7c-3-goRcZD_jLwNf7toFXm9
!unzip data.zip

Downloading...
From: https://drive.google.com/uc?id=1VSV4TapY7c-3-goRcZD_jLwNf7toFXm9
To: /content/data.zip
  0% 0.00/1.62M [00:00<?, ?B/s]100% 1.62M/1.62M [00:00<00:00, 51.9MB/s]
Archive:  data.zip
   creating: data/
  inflating: data/cmu_dictionary     
  inflating: data/debussy_prelude_lyrics.musicxml  
  inflating: data/example1.wav       
  inflating: data/example2.wav       
  inflating: data/haendel_hallelujah.musicxml  
  inflating: data/mozart_requiem_kyrie_satb.musicxml  
  inflating: data/example3.wav       
  inflating: data/Untitled.ipynb     
  inflating: data/examples_filelist.txt  
   creating: data/.ipynb_checkpoints/
  inflating: data/.ipynb_checkpoints/Untitled-checkpoint.ipynb  


### Import modules

In [None]:
import os
import random
import time
from math import sqrt

import numpy as np
import torch
import torch.nn.functional as F
from scipy.io.wavfile import read
from torch import nn
from torch.nn import init
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

import layers
from hparams import create_hparams
from layers import ConvNorm, LinearNorm
from text import text_to_sequence,cmudict
from yin import compute_yin

### Utils

In [None]:
def get_mask_from_lengths(lengths):
    """Build a boolean validity mask from a batch of sequence lengths.

    Args:
        lengths: 1-D integer tensor holding one length per batch item.

    Returns:
        Bool tensor of shape ``(len(lengths), max(lengths))`` whose entry
        ``[b, t]`` is True when ``t < lengths[b]``.
    """
    max_len = int(torch.max(lengths).item())
    # Create the position index on the same device as `lengths` instead of
    # hard-coding a CUDA tensor, so the function also works on CPU-only hosts.
    ids = torch.arange(0, max_len, dtype=torch.long, device=lengths.device)
    mask = (ids < lengths.unsqueeze(1)).bool()
    return mask


def load_wav_to_torch(full_path):
    """Read a WAV file and return ``(samples, sampling_rate)``.

    The samples are cast to float32 and wrapped in a ``torch.FloatTensor``;
    no amplitude scaling is applied here.
    """
    rate, samples = read(full_path)
    waveform = torch.FloatTensor(samples.astype(np.float32))
    return waveform, rate


def load_filepaths_and_text(filename, split="|"):
    """Parse a delimiter-separated filelist into a list of field lists.

    Args:
        filename: path of a UTF-8 text file with one record per line.
        split: field delimiter (defaults to ``"|"``).

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        fields obtained by stripping the line and splitting it on ``split``.
    """
    filepaths_and_text = []
    with open(filename, encoding='utf-8') as f:
        for line in f:
            record = line.strip()
            # Skip blank lines (e.g. a trailing newline at end of file): they
            # would otherwise yield a bogus [''] record that breaks callers
            # expecting several fields per entry.
            if not record:
                continue
            filepaths_and_text.append(record.split(split))
    return filepaths_and_text


def to_gpu(x):
    """Return a contiguous copy/view of ``x``, moved to CUDA when available.

    The transfer uses ``non_blocking=True``; on hosts without CUDA the tensor
    stays where it is. The result is wrapped in ``torch.autograd.Variable``.
    """
    tensor = x.contiguous()
    if torch.cuda.is_available():
        tensor = tensor.cuda(non_blocking=True)
    wrapped = torch.autograd.Variable(tensor)
    return wrapped


### Dataloader

In [None]:
class TextMelLoader(torch.utils.data.Dataset):
    """Dataset producing ``(text, mel, speaker_id, f0)`` tuples from a filelist.

    Every filelist record is expected to unpack into three fields:
    audio path, transcript and speaker label.
    """

    def __init__(self, audiopaths_and_text, hparams, speaker_ids=None):
        """Load the filelist, set up feature extraction and shuffle the records.

        Args:
            audiopaths_and_text: path of the filelist to load.
            hparams: hyperparameter object supplying audio/text settings.
            speaker_ids: optional mapping from raw speaker label (int) to a
                contiguous index; built from the filelist when omitted.
        """
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)

        # Plain settings copied from hparams.
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.filter_length = hparams.filter_length
        self.hop_length = hparams.hop_length
        self.f0_min = hparams.f0_min
        self.f0_max = hparams.f0_max
        self.harm_thresh = hparams.harm_thresh
        self.p_arpabet = hparams.p_arpabet

        # Mel-spectrogram extractor.
        self.stft = layers.TacotronSTFT(
            hparams.filter_length, hparams.hop_length, hparams.win_length,
            hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
            hparams.mel_fmax)

        # Pronunciation dictionary is optional.
        if hparams.cmudict_path is not None:
            self.cmudict = cmudict.CMUDict(hparams.cmudict_path)
        else:
            self.cmudict = None

        if speaker_ids is None:
            speaker_ids = self.create_speaker_lookup_table(
                self.audiopaths_and_text)
        self.speaker_ids = speaker_ids

        # Fixed seed so the shuffled order is reproducible.
        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)

    def create_speaker_lookup_table(self, audiopaths_and_text):
        """Map each distinct speaker label (third field) to a 0-based index."""
        labels = np.sort(np.unique([row[2] for row in audiopaths_and_text]))
        print('speaker id ', labels)
        lookup = {int(label): index for index, label in enumerate(labels)}
        print("seected speaker id ", lookup)
        return lookup

    def get_f0(self, audio, sampling_rate=22050, frame_length=1024,
               hop_length=256, f0_min=100, f0_max=300, harm_thresh=0.1):
        """Run ``compute_yin`` and return the zero-padded f0 track as float32."""
        track, _harmonic_rates, _argmins, _times = compute_yin(
            audio, sampling_rate, frame_length, hop_length, f0_min, f0_max,
            harm_thresh)
        # Pad both ends by half the number of hops that fit in one frame.
        n_pad = int((frame_length / hop_length) / 2)
        edge = [0.0] * n_pad
        return np.array(edge + track + edge, dtype=np.float32)

    def get_data(self, audiopath_and_text):
        """Turn one filelist record into ``(text, mel, speaker_id, f0)``."""
        audiopath, transcript, speaker = audiopath_and_text
        text = self.get_text(transcript)
        mel, f0 = self.get_mel_and_f0(audiopath)
        return (text, mel, self.get_speaker_id(speaker), f0)

    def get_speaker_id(self, speaker_id):
        """Look up a raw speaker label and wrap its index in an IntTensor."""
        index = self.speaker_ids[int(speaker_id)]
        return torch.IntTensor([index])

    def get_mel_and_f0(self, filepath):
        """Load a WAV file and compute its mel-spectrogram and f0 track.

        Raises:
            ValueError: if the file's sampling rate differs from the STFT's.
        """
        audio, sampling_rate = load_wav_to_torch(filepath)
        if sampling_rate != self.stft.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.stft.sampling_rate))

        audio_norm = (audio / self.max_wav_value).unsqueeze(0)
        melspec = self.stft.mel_spectrogram(audio_norm).squeeze(0)

        # f0 is computed from the un-normalised waveform.
        f0_track = self.get_f0(audio.cpu().numpy(), self.sampling_rate,
                               self.filter_length, self.hop_length,
                               self.f0_min, self.f0_max, self.harm_thresh)
        f0 = torch.from_numpy(f0_track)[None]
        # Never keep more f0 frames than there are mel frames.
        f0 = f0[:, :melspec.size(1)]

        return melspec, f0

    def get_text(self, text):
        """Encode a transcript as an IntTensor of symbol ids."""
        sequence = text_to_sequence(
            text, self.text_cleaners, self.cmudict, self.p_arpabet)
        return torch.IntTensor(sequence)

    def __getitem__(self, index):
        record = self.audiopaths_and_text[index]
        return self.get_data(record)

    def __len__(self):
        return len(self.audiopaths_and_text)


class TextMelCollate():
    """Collate ``(text, mel, speaker_id, f0)`` samples into zero-padded batches."""

    def __init__(self, n_frames_per_step):
        self.n_frames_per_step = n_frames_per_step

    def __call__(self, batch):
        """Pad and stack a list of samples, ordered by decreasing text length.

        Returns:
            Tuple ``(text_padded, input_lengths, mel_padded, gate_padded,
            output_lengths, speaker_ids, f0_padded)``.
        """
        batch_size = len(batch)

        # Order samples by decreasing text length.
        text_lengths = torch.LongTensor([len(sample[0]) for sample in batch])
        input_lengths, order = torch.sort(text_lengths, dim=0, descending=True)
        max_input_len = input_lengths[0]
        ordered = [batch[idx] for idx in order]

        # Right zero-pad the text sequences.
        text_padded = torch.LongTensor(batch_size, max_input_len).zero_()
        for row, sample in enumerate(ordered):
            tokens = sample[0]
            text_padded[row, :tokens.size(0)] = tokens

        # Target length, rounded up to a multiple of n_frames_per_step.
        num_mels = batch[0][1].size(0)
        max_target_len = max(sample[1].size(1) for sample in batch)
        remainder = max_target_len % self.n_frames_per_step
        if remainder != 0:
            max_target_len += self.n_frames_per_step - remainder
            assert max_target_len % self.n_frames_per_step == 0

        # Zero-initialised containers for the acoustic targets.
        mel_padded = torch.FloatTensor(batch_size, num_mels, max_target_len).zero_()
        gate_padded = torch.FloatTensor(batch_size, max_target_len).zero_()
        f0_padded = torch.FloatTensor(batch_size, 1, max_target_len).zero_()
        output_lengths = torch.LongTensor(batch_size)
        speaker_ids = torch.LongTensor(batch_size)

        for row, sample in enumerate(ordered):
            _, mel, speaker, f0 = sample
            n_frames = mel.size(1)
            mel_padded[row, :, :n_frames] = mel
            # Gate target is 1 from the last real frame onwards.
            gate_padded[row, n_frames - 1:] = 1
            output_lengths[row] = n_frames
            speaker_ids[row] = speaker
            f0_padded[row, :, :f0.size(1)] = f0

        return (text_padded, input_lengths, mel_padded, gate_padded,
                output_lengths, speaker_ids, f0_padded)


In [None]:
def prepare_dataloaders(hparams):
    """Create the training loader, validation dataset and collate function.

    Returns:
        Tuple ``(train_loader, valset, collate_fn, train_sampler)``;
        ``train_sampler`` is None unless ``hparams.distributed_run`` is set.
    """
    trainset = TextMelLoader(hparams.training_files, hparams)
    # The validation set reuses the speaker mapping built from the training set.
    valset = TextMelLoader(hparams.validation_files, hparams,
                           speaker_ids=trainset.speaker_ids)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    # With a distributed sampler the loader itself must not shuffle.
    train_sampler = DistributedSampler(trainset) if hparams.distributed_run else None
    shuffle = train_sampler is None

    train_loader = DataLoader(
        trainset,
        num_workers=1,
        shuffle=shuffle,
        sampler=train_sampler,
        batch_size=hparams.batch_size,
        pin_memory=False,
        drop_last=True,
        collate_fn=collate_fn,
    )
    return train_loader, valset, collate_fn, train_sampler

In [None]:
# Instantiate the hyperparameter object with create_hparams (imported from
# hparams.py), called here with no arguments.
hparams = create_hparams()

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



### Model - Tacotron2-GST

### Location layer

In [None]:
# Dropout probability read (as a module-level name) by the Prenet, Postnet and
# Encoder classes defined in the cells below.
drop_rate = 0.5

In [None]:
class LocationLayer(nn.Module):
    """Convolve a 2-channel attention-weight tensor and project it to attention_dim."""

    def __init__(self, attention_n_filters, attention_kernel_size,
                 attention_dim):
        super().__init__()
        conv_padding = int((attention_kernel_size - 1) / 2)
        self.location_conv = ConvNorm(
            2, attention_n_filters,
            kernel_size=attention_kernel_size,
            padding=conv_padding,
            bias=False, stride=1, dilation=1)
        self.location_dense = LinearNorm(
            attention_n_filters, attention_dim,
            bias=False, w_init_gain='tanh')

    def forward(self, attention_weights_cat):
        """Apply the convolution, swap dims 1 and 2, then the dense projection."""
        conv_features = self.location_conv(attention_weights_cat)
        return self.location_dense(conv_features.transpose(1, 2))

### Attention layer

In [None]:
class Attention(nn.Module):
    """Additive attention combining query, memory and location features."""

    def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
                 attention_location_n_filters, attention_location_kernel_size):
        super().__init__()
        self.query_layer = LinearNorm(
            attention_rnn_dim, attention_dim, bias=False, w_init_gain='tanh')
        self.memory_layer = LinearNorm(
            embedding_dim, attention_dim, bias=False, w_init_gain='tanh')
        self.v = LinearNorm(attention_dim, 1, bias=False)
        self.location_layer = LocationLayer(
            attention_location_n_filters, attention_location_kernel_size,
            attention_dim)
        # Masked positions get -inf so softmax assigns them zero weight.
        self.score_mask_value = -float("inf")

    def get_alignment_energies(self, query, processed_memory,
                               attention_weights_cat):
        """Return unnormalised attention energies with the last dim squeezed.

        The query is projected by ``query_layer``, the concatenated attention
        weights by ``location_layer``; both are summed with
        ``processed_memory``, passed through tanh and reduced by ``v``.
        """
        query_term = self.query_layer(query.unsqueeze(1))
        location_term = self.location_layer(attention_weights_cat)
        hidden = torch.tanh(query_term + location_term + processed_memory)
        return self.v(hidden).squeeze(-1)

    def forward(self, attention_hidden_state, memory, processed_memory,
                attention_weights_cat, mask, attention_weights=None):
        """Compute the attention context and the weights used to form it.

        When ``attention_weights`` is supplied, the energy computation is
        skipped and those weights are applied to ``memory`` directly.
        """
        if attention_weights is None:
            energies = self.get_alignment_energies(
                attention_hidden_state, processed_memory, attention_weights_cat)
            if mask is not None:
                energies.data.masked_fill_(mask, self.score_mask_value)
            attention_weights = F.softmax(energies, dim=1)

        context = torch.bmm(attention_weights.unsqueeze(1), memory).squeeze(1)
        return context, attention_weights

### PreNet

In [None]:
class Prenet(nn.Module):
    """Stack of bias-free linear layers, each followed by ReLU and dropout."""

    def __init__(self, in_dim, sizes):
        super().__init__()
        input_dims = [in_dim] + sizes[:-1]
        stack = []
        for fan_in, fan_out in zip(input_dims, sizes):
            stack.append(LinearNorm(fan_in, fan_out, bias=False))
        self.layers = nn.ModuleList(stack)

    def forward(self, x):
        hidden = x
        for layer in self.layers:
            hidden = F.relu(layer(hidden))
            # training=True: dropout is applied regardless of the module's mode.
            hidden = F.dropout(hidden, p=drop_rate, training=True)
        return hidden

### PostNet

In [None]:
class Postnet(nn.Module):
    """Stack of 1-D convolution + batch-norm blocks applied to mel features."""

    def __init__(self, hparams):
        super().__init__()
        mel_dim = hparams.n_mel_channels
        hidden_dim = hparams.postnet_embedding_dim
        kernel = hparams.postnet_kernel_size
        pad = int((kernel - 1) / 2)

        def conv_bn(in_dim, out_dim, gain):
            # One ConvNorm followed by BatchNorm1d over its output channels.
            return nn.Sequential(
                ConvNorm(in_dim, out_dim,
                         kernel_size=kernel, stride=1,
                         padding=pad,
                         dilation=1, w_init_gain=gain),
                nn.BatchNorm1d(out_dim))

        # First block: mel -> hidden; middle blocks: hidden -> hidden;
        # last block: hidden -> mel with a 'linear' init gain.
        blocks = [conv_bn(mel_dim, hidden_dim, 'tanh')]
        for _ in range(1, hparams.postnet_n_convolutions - 1):
            blocks.append(conv_bn(hidden_dim, hidden_dim, 'tanh'))
        blocks.append(conv_bn(hidden_dim, mel_dim, 'linear'))

        self.convolutions = nn.ModuleList(blocks)

    def forward(self, x):
        """Apply tanh + dropout after every block except the last (dropout only)."""
        last_index = len(self.convolutions) - 1
        for index in range(last_index):
            activated = torch.tanh(self.convolutions[index](x))
            x = F.dropout(activated, drop_rate, self.training)
        return F.dropout(self.convolutions[-1](x), drop_rate, self.training)

### Encoder

In [None]:
class Encoder(nn.Module):
    """Convolution stack followed by a single-layer bidirectional LSTM."""

    def __init__(self, hparams):
        super().__init__()
        embed_dim = hparams.encoder_embedding_dim
        kernel = hparams.encoder_kernel_size

        self.convolutions = nn.ModuleList([
            nn.Sequential(
                ConvNorm(embed_dim, embed_dim,
                         kernel_size=kernel, stride=1,
                         padding=int((kernel - 1) / 2),
                         dilation=1, w_init_gain='relu'),
                nn.BatchNorm1d(embed_dim))
            for _ in range(hparams.encoder_n_convolutions)
        ])

        # Each direction gets half the embedding width.
        self.lstm = nn.LSTM(embed_dim, int(embed_dim / 2), 1,
                            batch_first=True, bidirectional=True)

    def _run_convolutions(self, x):
        """Apply every conv block followed by ReLU and dropout."""
        for conv in self.convolutions:
            x = F.dropout(F.relu(conv(x)), drop_rate, self.training)
        return x

    def forward(self, x, input_lengths):
        """Encode a padded batch, packing by ``input_lengths`` for the LSTM."""
        batch_size = x.size()[0]
        if batch_size > 1:
            # Run the convolutions item by item on the un-padded slice,
            # then re-pad the results to a common length.  TODO: Speed up
            per_item = []
            for b_ind in range(batch_size):
                item = x[b_ind:b_ind + 1, :, :input_lengths[b_ind]].clone()
                item = self._run_convolutions(item)
                per_item.append(item[0].transpose(0, 1))
            x = torch.nn.utils.rnn.pad_sequence(per_item, batch_first=True)
        else:
            x = self._run_convolutions(x).transpose(1, 2)

        # Lengths are handed to pack_padded_sequence as a numpy array.
        lengths_np = input_lengths.cpu().numpy()
        packed = nn.utils.rnn.pack_padded_sequence(
            x, lengths_np, batch_first=True)

        self.lstm.flatten_parameters()
        packed_out, _ = self.lstm(packed)

        outputs, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True)
        return outputs

    def inference(self, x):
        """Encode without length information (no packing)."""
        x = self._run_convolutions(x).transpose(1, 2)

        self.lstm.flatten_parameters()
        outputs, _ = self.lstm(x)
        return outputs

### Decoder

In [None]:
class Decoder(nn.Module):
    """Autoregressive mel decoder conditioned on attention context and f0.

    Each step feeds a prenet-processed frame, a convolved f0 feature and the
    previous attention context through an attention LSTM cell, the Attention
    layer and a decoder LSTM cell, then projects to a mel frame and a gate
    (stop) value. Recurrent state is kept on ``self`` between calls to
    ``decode`` and is reset by ``initialize_decoder_states``.
    """

    def __init__(self, hparams):
        """Build the decoder sub-modules from ``hparams``."""
        super(Decoder, self).__init__()
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step
        # Width of the memory attended over: the sum of three hparams widths.
        # NOTE(review): presumably encoder outputs concatenated with style-token
        # and speaker embeddings -- confirm against the caller.
        self.encoder_embedding_dim = hparams.encoder_embedding_dim + hparams.token_embedding_size + hparams.speaker_embedding_dim
        self.attention_rnn_dim = hparams.attention_rnn_dim
        self.decoder_rnn_dim = hparams.decoder_rnn_dim
        self.prenet_dim = hparams.prenet_dim
        self.max_decoder_steps = hparams.max_decoder_steps
        self.gate_threshold = hparams.gate_threshold
        self.p_attention_dropout = hparams.p_attention_dropout
        self.p_decoder_dropout = hparams.p_decoder_dropout
        self.p_teacher_forcing = hparams.p_teacher_forcing

        # 1-channel convolution turning the f0 track into prenet_f0_dim features.
        self.prenet_f0 = ConvNorm(
            1, hparams.prenet_f0_dim,
            kernel_size=hparams.prenet_f0_kernel_size,
            padding=max(0, int(hparams.prenet_f0_kernel_size / 2)),
            bias=False, stride=1, dilation=1)

        self.prenet = Prenet(
            hparams.n_mel_channels * hparams.n_frames_per_step,
            [hparams.prenet_dim, hparams.prenet_dim])

        # Input: prenet output + f0 features + previous attention context.
        self.attention_rnn = nn.LSTMCell(
            hparams.prenet_dim + hparams.prenet_f0_dim + self.encoder_embedding_dim,
            hparams.attention_rnn_dim)

        self.attention_layer = Attention(
            hparams.attention_rnn_dim, self.encoder_embedding_dim,
            hparams.attention_dim, hparams.attention_location_n_filters,
            hparams.attention_location_kernel_size)

        # NOTE(review): the trailing positional 1 lands in LSTMCell's third
        # parameter (bias), where it is merely truthy -- confirm it is intended.
        self.decoder_rnn = nn.LSTMCell(
            hparams.attention_rnn_dim + self.encoder_embedding_dim,
            hparams.decoder_rnn_dim, 1)

        # Both output heads read [decoder hidden state, attention context].
        self.linear_projection = LinearNorm(
            hparams.decoder_rnn_dim + self.encoder_embedding_dim,
            hparams.n_mel_channels * hparams.n_frames_per_step)

        self.gate_layer = LinearNorm(
            hparams.decoder_rnn_dim + self.encoder_embedding_dim, 1,
            bias=True, w_init_gain='sigmoid')

    def get_go_frame(self, memory):
        """Return an all-zero first decoder input of shape (B, n_mel_channels * n_frames_per_step)."""
        B = memory.size(0)
        decoder_input = Variable(memory.data.new(
            B, self.n_mel_channels * self.n_frames_per_step).zero_())
        return decoder_input

    def get_end_f0(self, f0s):
        """Return zeros of shape (B, 1, f0s.size(1)), appended to ``f0s`` along dim 2 by the callers."""
        B = f0s.size(0)
        dummy = Variable(f0s.data.new(B, 1, f0s.size(1)).zero_())
        return dummy

    def initialize_decoder_states(self, memory, mask):
        """Reset all recurrent/attention state to zeros and cache ``memory``.

        Also stores the memory projected by the attention layer's
        ``memory_layer`` and the padding ``mask`` for use in ``decode``.
        """
        B = memory.size(0)
        MAX_TIME = memory.size(1)

        self.attention_hidden = Variable(memory.data.new(
            B, self.attention_rnn_dim).zero_())
        self.attention_cell = Variable(memory.data.new(
            B, self.attention_rnn_dim).zero_())

        self.decoder_hidden = Variable(memory.data.new(
            B, self.decoder_rnn_dim).zero_())
        self.decoder_cell = Variable(memory.data.new(
            B, self.decoder_rnn_dim).zero_())

        self.attention_weights = Variable(memory.data.new(
            B, MAX_TIME).zero_())
        self.attention_weights_cum = Variable(memory.data.new(
            B, MAX_TIME).zero_())
        self.attention_context = Variable(memory.data.new(
            B, self.encoder_embedding_dim).zero_())

        self.memory = memory
        self.processed_memory = self.attention_layer.memory_layer(memory)
        self.mask = mask

    def parse_decoder_inputs(self, decoder_inputs):
        """Reshape target mels into a step-major tensor, grouping n_frames_per_step frames per step."""
        # (B, n_mel_channels, T_out) -> (B, T_out, n_mel_channels)
        decoder_inputs = decoder_inputs.transpose(1, 2)
        decoder_inputs = decoder_inputs.view(
            decoder_inputs.size(0),
            int(decoder_inputs.size(1) / self.n_frames_per_step), -1)
        # (B, T_out, n_mel_channels) -> (T_out, B, n_mel_channels)
        decoder_inputs = decoder_inputs.transpose(0, 1)
        return decoder_inputs

    def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments):
        """Stack per-step output lists into batch-major tensors."""
        # (T_out, B) -> (B, T_out)
        alignments = torch.stack(alignments).transpose(0, 1)
        # (T_out, B) -> (B, T_out)
        gate_outputs = torch.stack(gate_outputs)
        if len(gate_outputs.size()) > 1:
            gate_outputs = gate_outputs.transpose(0, 1)
        else:
            # Stacked gates are 1-D (each step's gate was a scalar): add a
            # leading dimension instead of transposing.
            gate_outputs = gate_outputs[None]
        gate_outputs = gate_outputs.contiguous()
        # (T_out, B, n_mel_channels) -> (B, T_out, n_mel_channels)
        mel_outputs = torch.stack(mel_outputs).transpose(0, 1).contiguous()
        # decouple frames per step
        mel_outputs = mel_outputs.view(
            mel_outputs.size(0), -1, self.n_mel_channels)
        # (B, T_out, n_mel_channels) -> (B, n_mel_channels, T_out)
        mel_outputs = mel_outputs.transpose(1, 2)

        return mel_outputs, gate_outputs, alignments

    def decode(self, decoder_input, attention_weights=None):
        """Run one decoder step and update the state stored on ``self``.

        Args:
            decoder_input: step input (prenet output concatenated with f0
                features by the callers).
            attention_weights: optional precomputed weights forwarded to the
                attention layer, which then skips its own energy computation.

        Returns:
            Tuple ``(decoder_output, gate_prediction, attention_weights)``.
        """
        cell_input = torch.cat((decoder_input, self.attention_context), -1)
        self.attention_hidden, self.attention_cell = self.attention_rnn(
            cell_input, (self.attention_hidden, self.attention_cell))
        self.attention_hidden = F.dropout(
            self.attention_hidden, self.p_attention_dropout, self.training)
        self.attention_cell = F.dropout(
            self.attention_cell, self.p_attention_dropout, self.training)

        # Previous and cumulative weights stacked as two channels.
        attention_weights_cat = torch.cat(
            (self.attention_weights.unsqueeze(1),
             self.attention_weights_cum.unsqueeze(1)), dim=1)
        self.attention_context, self.attention_weights = self.attention_layer(
            self.attention_hidden, self.memory, self.processed_memory,
            attention_weights_cat, self.mask, attention_weights)

        self.attention_weights_cum += self.attention_weights
        decoder_input = torch.cat(
            (self.attention_hidden, self.attention_context), -1)
        self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
            decoder_input, (self.decoder_hidden, self.decoder_cell))
        self.decoder_hidden = F.dropout(
            self.decoder_hidden, self.p_decoder_dropout, self.training)
        self.decoder_cell = F.dropout(
            self.decoder_cell, self.p_decoder_dropout, self.training)

        decoder_hidden_attention_context = torch.cat(
            (self.decoder_hidden, self.attention_context), dim=1)

        decoder_output = self.linear_projection(
            decoder_hidden_attention_context)

        gate_prediction = self.gate_layer(decoder_hidden_attention_context)
        return decoder_output, gate_prediction, self.attention_weights

    def forward(self, memory, decoder_inputs, memory_lengths, f0s):
        """Decode a full target sequence with (probabilistic) teacher forcing.

        Args:
            memory: tensor attended over (batch first).
            decoder_inputs: target mels, (B, n_mel_channels, T_out) per the
                comment in ``parse_decoder_inputs``.
            memory_lengths: per-item valid lengths used to build the mask.
            f0s: f0 track; a zero step is appended before the f0 prenet.

        Returns:
            Tuple ``(mel_outputs, gate_outputs, alignments)``.
        """
        decoder_input = self.get_go_frame(memory).unsqueeze(0)
        decoder_inputs = self.parse_decoder_inputs(decoder_inputs)
        # Prepend the all-zero go frame to the teacher-forcing inputs.
        decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
        decoder_inputs = self.prenet(decoder_inputs)

        # audio features
        f0_dummy = self.get_end_f0(f0s)
        f0s = torch.cat((f0s, f0_dummy), dim=2)
        f0s = F.relu(self.prenet_f0(f0s))
        # Move the last (step) dimension first so f0s[i] selects one step.
        f0s = f0s.permute(2, 0, 1)

        # The attention layer masks positions where the mask is True, so the
        # validity mask is inverted here.
        self.initialize_decoder_states(
            memory, mask=~get_mask_from_lengths(memory_lengths))

        mel_outputs, gate_outputs, alignments = [], [], []
        while len(mel_outputs) < decoder_inputs.size(0) - 1:
            # The first step always uses the ground-truth (go) frame; later
            # steps use it with probability p_teacher_forcing, otherwise the
            # model's own previous output is fed back through the prenet.
            if len(mel_outputs) == 0 or np.random.uniform(0.0, 1.0) <= self.p_teacher_forcing:
                decoder_input = torch.cat((decoder_inputs[len(mel_outputs)],
                                           f0s[len(mel_outputs)]), dim=1)
            else:
                decoder_input = torch.cat((self.prenet(mel_outputs[-1]),
                                           f0s[len(mel_outputs)]), dim=1)
            mel_output, gate_output, attention_weights = self.decode(
                decoder_input)
            mel_outputs += [mel_output.squeeze(1)]
            # squeeze() with no argument drops every size-1 dimension, so a
            # batch of one yields a scalar here (handled in parse_decoder_outputs).
            gate_outputs += [gate_output.squeeze()]
            alignments += [attention_weights]

        mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
            mel_outputs, gate_outputs, alignments)

        return mel_outputs, gate_outputs, alignments

    def inference(self, memory, f0s):
        """Decode free-running until the gate fires or max_decoder_steps is hit.

        NOTE(review): the stop test truth-tests a tensor, which only works when
        the gate output has a single element -- presumably batch size 1.
        """
        decoder_input = self.get_go_frame(memory)

        self.initialize_decoder_states(memory, mask=None)
        f0_dummy = self.get_end_f0(f0s)
        f0s = torch.cat((f0s, f0_dummy), dim=2)
        f0s = F.relu(self.prenet_f0(f0s))
        f0s = f0s.permute(2, 0, 1)

        mel_outputs, gate_outputs, alignments = [], [], []
        while True:
            if len(mel_outputs) < len(f0s):
                f0 = f0s[len(mel_outputs)]
            else:
                # Ran past the provided f0 track: use zeros of matching shape.
                f0 = f0s[-1] * 0

            decoder_input = torch.cat((self.prenet(decoder_input), f0), dim=1)
            mel_output, gate_output, alignment = self.decode(decoder_input)

            mel_outputs += [mel_output.squeeze(1)]
            gate_outputs += [gate_output]
            alignments += [alignment]

            if torch.sigmoid(gate_output.data) > self.gate_threshold:
                break
            elif len(mel_outputs) == self.max_decoder_steps:
                print("Warning! Reached max decoder steps")
                break

            # Feed the predicted frame back in as the next input.
            decoder_input = mel_output

        mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
            mel_outputs, gate_outputs, alignments)

        return mel_outputs, gate_outputs, alignments

    def inference_noattention(self, memory, f0s, attention_map):
        """Decode with externally supplied attention weights.

        Runs exactly ``len(attention_map)`` steps; ``attention_map[i]`` is
        passed to ``decode`` as the attention weights for step ``i``, and the
        gate output is recorded but not used to stop.
        """
        decoder_input = self.get_go_frame(memory)

        self.initialize_decoder_states(memory, mask=None)
        f0_dummy = self.get_end_f0(f0s)
        f0s = torch.cat((f0s, f0_dummy), dim=2)
        f0s = F.relu(self.prenet_f0(f0s))
        f0s = f0s.permute(2, 0, 1)

        mel_outputs, gate_outputs, alignments = [], [], []
        for i in range(len(attention_map)):
            f0 = f0s[i]
            attention = attention_map[i]
            decoder_input = torch.cat((self.prenet(decoder_input), f0), dim=1)
            mel_output, gate_output, alignment = self.decode(decoder_input, attention)

            mel_outputs += [mel_output.squeeze(1)]
            gate_outputs += [gate_output]
            alignments += [alignment]

            decoder_input = mel_output

        mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
            mel_outputs, gate_outputs, alignments)

        return mel_outputs, gate_outputs, alignments

### Style token layer

In [None]:
class ReferenceEncoder(nn.Module):
    """Strided 2-D conv stack followed by a GRU; returns the GRU's final state."""

    def __init__(self, hp):
        super().__init__()
        n_layers = len(hp.ref_enc_filters)
        channels = [1] + hp.ref_enc_filters

        # One 3x3 / stride-2 convolution per entry of ref_enc_filters.
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=c_in,
                      out_channels=c_out,
                      kernel_size=(3, 3),
                      stride=(2, 2),
                      padding=(1, 1))
            for c_in, c_out in zip(channels[:-1], channels[1:])
        ])
        self.bns = nn.ModuleList([
            nn.BatchNorm2d(num_features=n_filters)
            for n_filters in hp.ref_enc_filters
        ])

        # Size of the mel axis after all the strided convolutions.
        reduced_mels = self.calculate_channels(
            hp.n_mel_channels, 3, 2, 1, n_layers)
        self.gru = nn.GRU(input_size=hp.ref_enc_filters[-1] * reduced_mels,
                          hidden_size=hp.ref_enc_gru_size,
                          batch_first=True)
        self.n_mel_channels = hp.n_mel_channels
        self.ref_enc_gru_size = hp.ref_enc_gru_size

    def forward(self, inputs, input_lengths=None):
        """Encode ``inputs`` and return the final GRU state with dim 0 squeezed.

        ``inputs`` is viewed as ``(N, 1, -1, n_mel_channels)``. When
        ``input_lengths`` is given, it is divided by ``2 ** n_convs`` (rounded
        up) and used to pack the sequence before the GRU.
        """
        feats = inputs.view(inputs.size(0), 1, -1, self.n_mel_channels)
        for conv, bn in zip(self.convs, self.bns):
            feats = F.relu(bn(conv(feats)))

        # [N, C, T', M'] -> [N, T', C, M'] -> [N, T', C * M']
        feats = feats.transpose(1, 2)
        batch, steps = feats.size(0), feats.size(1)
        feats = feats.contiguous().view(batch, steps, -1)

        if input_lengths is not None:
            reduced = torch.ceil(input_lengths.float() / 2 ** len(self.convs))
            reduced = reduced.cpu().numpy().astype(int)
            feats = nn.utils.rnn.pack_padded_sequence(
                feats, reduced, batch_first=True, enforce_sorted=False)

        self.gru.flatten_parameters()
        _, final_state = self.gru(feats)
        return final_state.squeeze(0)

    def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
        """Return the length left after ``n_convs`` convolutions over length ``L``."""
        length = L
        for _ in range(n_convs):
            length = (length - kernel_size + 2 * pad) // stride + 1
        return length


class STL(nn.Module):
    """Style token layer: attends over a bank of learned token embeddings."""

    def __init__(self, hp):
        super().__init__()
        per_head_dim = hp.token_embedding_size // hp.num_heads
        # Token bank of shape [token_num, token_embedding_size // num_heads].
        self.embed = nn.Parameter(torch.FloatTensor(hp.token_num, per_head_dim))
        self.attention = MultiHeadAttention(
            query_dim=hp.ref_enc_gru_size,
            key_dim=per_head_dim,
            num_units=hp.token_embedding_size,
            num_heads=hp.num_heads)

        init.normal_(self.embed, mean=0, std=0.5)

    def forward(self, inputs):
        """Use ``inputs`` as a single-step query over the tanh-squashed tokens."""
        batch = inputs.size(0)
        # [N, token_num, token_embedding_size // num_heads]
        tokens = torch.tanh(self.embed).unsqueeze(0).expand(batch, -1, -1)
        return self.attention(inputs.unsqueeze(1), tokens)


class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with bias-free projections.

    Inputs to ``forward``:
        query: ``[N, T_q, query_dim]``
        key:   ``[N, T_k, key_dim]`` (also used to derive the values)
    Output:
        ``[N, T_q, num_units]``
    """

    def __init__(self, query_dim, key_dim, num_units, num_heads):
        super().__init__()
        self.num_units = num_units
        self.num_heads = num_heads
        self.key_dim = key_dim

        # Projection layers, created in query / key / value order.
        self.W_query = nn.Linear(query_dim, num_units, bias=False)
        self.W_key = nn.Linear(key_dim, num_units, bias=False)
        self.W_value = nn.Linear(key_dim, num_units, bias=False)

    def forward(self, query, key):
        head_width = self.num_units // self.num_heads

        def per_head(projected):
            # [N, T, num_units] -> [h, N, T, num_units / h]
            return torch.stack(projected.split(head_width, dim=2), dim=0)

        q_heads = per_head(self.W_query(query))
        k_heads = per_head(self.W_key(key))
        v_heads = per_head(self.W_value(key))

        # softmax(Q K^T / sqrt(key_dim)) over the key axis: [h, N, T_q, T_k]
        weights = torch.matmul(q_heads, k_heads.transpose(-2, -1))
        weights = F.softmax(weights / (self.key_dim ** 0.5), dim=-1)

        # Weighted values per head, then heads concatenated on the last axis.
        attended = torch.matmul(weights, v_heads)  # [h, N, T_q, num_units / h]
        return torch.cat(attended.unbind(dim=0), dim=-1)  # [N, T_q, num_units]


class GST(nn.Module):
    """Global style token module: a reference encoder followed by the
    style token layer.
    """

    def __init__(self, hp):
        super().__init__()
        self.encoder = ReferenceEncoder(hp)
        self.stl = STL(hp)

    def forward(self, inputs, input_lengths=None):
        """Encode ``inputs`` (optionally with ``input_lengths``) and return
        the style token layer's output for that encoding.
        """
        reference = self.encoder(inputs, input_lengths=input_lengths)
        return self.stl(reference)

### Seq2Seq Layer

In [None]:
class Tacotron2(nn.Module):
    """Text-to-mel model conditioned on speaker id, f0 and (optionally) a
    global style token (GST) embedding.

    The text encoding is concatenated along the feature axis with the style
    embedding (when a ``gst`` sub-module exists) and the speaker embedding,
    each repeated over the text length, before being handed to the decoder.
    """

    def __init__(self, hparams):
        super(Tacotron2, self).__init__()
        self.mask_padding = hparams.mask_padding
        self.fp16_run = hparams.fp16_run
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step
        self.embedding = nn.Embedding(
            hparams.n_symbols, hparams.symbols_embedding_dim)
        # Uniform init in [-val, val] with val = sqrt(3) * std and
        # std = sqrt(2 / (n_symbols + symbols_embedding_dim)).
        std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
        val = sqrt(3.0) * std  # uniform bounds for std
        self.embedding.weight.data.uniform_(-val, val)
        self.encoder = Encoder(hparams)
        self.decoder = Decoder(hparams)
        self.postnet = Postnet(hparams)
        # The style module is optional; every method checks for it with
        # hasattr(self, 'gst') rather than assuming it exists.
        if hparams.with_gst:
            self.gst = GST(hparams)
        self.speaker_embedding = nn.Embedding(
            hparams.n_speakers, hparams.speaker_embedding_dim)

    def parse_batch(self, batch):
        """Move a collated batch to the GPU and split it into model inputs
        and loss targets.

        Returns:
            ``(x, y)`` where ``x`` is the 7-tuple consumed by ``forward`` and
            ``y`` is ``(mel_padded, gate_padded)``.
        """
        text_padded, input_lengths, mel_padded, gate_padded, \
            output_lengths, speaker_ids, f0_padded = batch
        text_padded = to_gpu(text_padded).long()
        input_lengths = to_gpu(input_lengths).long()
        max_len = torch.max(input_lengths.data).item()
        mel_padded = to_gpu(mel_padded).float()
        gate_padded = to_gpu(gate_padded).float()
        output_lengths = to_gpu(output_lengths).long()
        speaker_ids = to_gpu(speaker_ids.data).long()
        f0_padded = to_gpu(f0_padded).float()
        return ((text_padded, input_lengths, mel_padded, max_len,
                 output_lengths, speaker_ids, f0_padded),
                (mel_padded, gate_padded))

    def parse_output(self, outputs, output_lengths=None):
        """Optionally overwrite padded positions of the outputs in place.

        When ``self.mask_padding`` is set and ``output_lengths`` is given,
        masked positions of the two mel outputs are filled with 0 and those
        of the gate output with 1e3. ``outputs`` is returned either way.
        """
        if self.mask_padding and output_lengths is not None:
            # presumably get_mask_from_lengths is True on valid frames, so
            # the inverted mask marks padding -- helper is defined elsewhere.
            mask = ~get_mask_from_lengths(output_lengths)
            mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1))
            mask = mask.permute(1, 0, 2)

            outputs[0].data.masked_fill_(mask, 0.0)
            outputs[1].data.masked_fill_(mask, 0.0)
            outputs[2].data.masked_fill_(mask[:, 0, :], 1e3)  # gate energies

        return outputs

    def _token_style_embedding(self, token_index):
        """Style embedding obtained by attending, with an all-zero query,
        over the single style token ``token_index``.
        """
        token_bank = self.gst.stl.embed
        # Allocate the query next to the token bank instead of hard-coding
        # .cuda(), so CPU inference and non-default devices/dtypes work.
        query = torch.zeros(
            1, 1, self.gst.encoder.ref_enc_gru_size,
            device=token_bank.device, dtype=token_bank.dtype)
        key = torch.tanh(token_bank)[token_index].unsqueeze(0).expand(1, -1, -1)
        return self.gst.stl.attention(query, key)

    def _inference_style_embedding(self, style_input):
        """Style embedding for inference, or None when the model has no GST.

        An ``int`` selects a single style token; anything else is passed to
        ``self.gst`` as a reference input.
        """
        if not hasattr(self, 'gst'):
            return None
        if isinstance(style_input, int):
            return self._token_style_embedding(style_input)
        return self.gst(style_input)

    def _condition_encoder_outputs(self, embedded_text, speaker_ids,
                                   embedded_gst=None):
        """Concatenate text, optional style and speaker features on dim 2,
        repeating the style and speaker vectors over the text length.
        """
        text_len = embedded_text.size(1)
        parts = [embedded_text]
        if embedded_gst is not None:
            parts.append(embedded_gst.repeat(1, text_len, 1))
        embedded_speakers = self.speaker_embedding(speaker_ids)[:, None]
        parts.append(embedded_speakers.repeat(1, text_len, 1))
        return torch.cat(parts, dim=2)

    def _finish(self, decoder_result, output_lengths=None):
        """Add the postnet residual and apply ``parse_output``."""
        mel_outputs, gate_outputs, alignments = decoder_result
        mel_outputs_postnet = mel_outputs + self.postnet(mel_outputs)
        return self.parse_output(
            [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
            output_lengths)

    def forward(self, inputs):
        """Teacher-forced pass.

        Args:
            inputs: the 7-tuple produced as the first element of
                ``parse_batch``.

        Returns:
            ``[mel_outputs, mel_outputs_postnet, gate_outputs, alignments]``
            after ``parse_output``.
        """
        text, input_lengths, targets, max_len, \
            output_lengths, speaker_ids, f0s = inputs
        input_lengths, output_lengths = input_lengths.data, output_lengths.data

        embedded_inputs = self.embedding(text).transpose(1, 2)
        embedded_text = self.encoder(embedded_inputs, input_lengths)
        # The style module only exists when hparams.with_gst was set; skip
        # it otherwise, exactly as the inference paths do.
        embedded_gst = None
        if hasattr(self, 'gst'):
            embedded_gst = self.gst(targets, output_lengths)
        encoder_outputs = self._condition_encoder_outputs(
            embedded_text, speaker_ids, embedded_gst)

        decoder_result = self.decoder(
            encoder_outputs, targets, memory_lengths=input_lengths, f0s=f0s)
        return self._finish(decoder_result, output_lengths)

    def inference(self, inputs):
        """Synthesis pass.

        Args:
            inputs: ``(text, style_input, speaker_ids, f0s)``;
                ``style_input`` is either an ``int`` style-token index or a
                reference input for ``self.gst``.
        """
        text, style_input, speaker_ids, f0s = inputs
        embedded_inputs = self.embedding(text).transpose(1, 2)
        embedded_text = self.encoder.inference(embedded_inputs)
        encoder_outputs = self._condition_encoder_outputs(
            embedded_text, speaker_ids,
            self._inference_style_embedding(style_input))

        return self._finish(self.decoder.inference(encoder_outputs, f0s))

    def inference_noattention(self, inputs):
        """Same as ``inference`` but forwards a caller-supplied
        ``attention_map`` to ``self.decoder.inference_noattention``.

        Args:
            inputs: ``(text, style_input, speaker_ids, f0s, attention_map)``.
        """
        text, style_input, speaker_ids, f0s, attention_map = inputs
        embedded_inputs = self.embedding(text).transpose(1, 2)
        embedded_text = self.encoder.inference(embedded_inputs)
        encoder_outputs = self._condition_encoder_outputs(
            embedded_text, speaker_ids,
            self._inference_style_embedding(style_input))

        return self._finish(self.decoder.inference_noattention(
            encoder_outputs, f0s, attention_map))

### Loss and Optimizer

In [None]:
class Tacotron2Loss(nn.Module):
    """Training loss: MSE on both mel predictions plus BCE-with-logits on
    the gate prediction.
    """

    def __init__(self):
        super(Tacotron2Loss, self).__init__()

    def forward(self, model_output, targets):
        """Return ``mel_loss + gate_loss`` as a scalar tensor.

        Args:
            model_output: 4-sequence ``(mel_out, mel_out_postnet, gate_out,
                alignments)``; the last element is ignored.
            targets: sequence whose first two elements are the mel target
                and the gate target.
        """
        mel_target = targets[0]
        gate_target = targets[1]
        # Targets are constants for the loss; make sure no gradient is
        # tracked for them.
        mel_target.requires_grad = False
        gate_target.requires_grad = False
        gate_column = gate_target.view(-1, 1)

        mel_out, mel_out_postnet, gate_out, _ = model_output
        gate_logits = gate_out.view(-1, 1)

        mse = nn.MSELoss()
        before_postnet = mse(mel_out, mel_target)
        after_postnet = nn.MSELoss()(mel_out_postnet, mel_target)
        gate_loss = nn.BCEWithLogitsLoss()(gate_logits, gate_column)
        return (before_postnet + after_postnet) + gate_loss


In [None]:
# !gdown --id 1ZesPPyRRKloltRIuRnGZ2LIUEuMSVjkI

In [None]:
!gdown --id 10CsPaJUB3AglDL2udCKfc8-SYpghkmsP

Downloading...
From: https://drive.google.com/uc?id=10CsPaJUB3AglDL2udCKfc8-SYpghkmsP
To: /content/checkpoint_18000
383MB [00:02, 149MB/s]


In [None]:
def load_checkpoint(checkpoint_path, model, optimizer):
    """Restore model and optimizer state from a training checkpoint.

    The file is loaded onto the CPU and must contain the keys
    ``'state_dict'``, ``'optimizer'``, ``'learning_rate'`` and
    ``'iteration'``.

    Returns:
        ``(model, optimizer, learning_rate, iteration)`` -- the same model
        and optimizer objects, updated in place, plus the stored values.

    Raises:
        AssertionError: if ``checkpoint_path`` is not an existing file.
    """
    assert os.path.isfile(checkpoint_path)
    print("Loading checkpoint '{}'".format(checkpoint_path))

    saved = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(saved['state_dict'])
    optimizer.load_state_dict(saved['optimizer'])
    learning_rate, iteration = saved['learning_rate'], saved['iteration']

    print("Loaded checkpoint '{}' from iteration {}" .format(
        checkpoint_path, iteration))
    return model, optimizer, learning_rate, iteration

In [None]:
def warm_start_model(checkpoint_path, model, ignore_layers):
    """Initialise ``model`` from a checkpoint's ``'state_dict'``.

    Entries whose names appear in ``ignore_layers`` are not taken from the
    checkpoint; for those the model keeps its current values. With an empty
    ``ignore_layers`` the checkpoint state dict is loaded as is.

    Returns:
        The same ``model`` object.

    Raises:
        AssertionError: if ``checkpoint_path`` is not an existing file.
    """
    assert os.path.isfile(checkpoint_path)
    print("Warm starting model from checkpoint '{}'".format(checkpoint_path))

    pretrained = torch.load(checkpoint_path, map_location='cpu')['state_dict']
    if len(ignore_layers) > 0:
        # Start from the model's own state and overwrite everything that is
        # not explicitly ignored.
        merged = model.state_dict()
        for name, tensor in pretrained.items():
            if name not in ignore_layers:
                merged[name] = tensor
        pretrained = merged

    model.load_state_dict(pretrained)
    return model

In [None]:
def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    """Write a training checkpoint to ``filepath`` with ``torch.save``.

    The saved dict has the keys ``'iteration'``, ``'state_dict'``,
    ``'optimizer'`` and ``'learning_rate'`` -- the layout that
    ``load_checkpoint`` reads back.
    """
    print("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    snapshot = {
        'iteration': iteration,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'learning_rate': learning_rate,
    }
    torch.save(snapshot, filepath)


def validate(model, criterion, valset, iteration, batch_size,
             collate_fn, distributed_run, rank):
    """Compute the mean per-batch loss over ``valset``.

    The model is put in eval mode for the pass, run without gradient
    tracking, and switched back to train mode afterwards.

    Args:
        model: object providing ``parse_batch``, ``eval``, ``train`` and
            ``__call__``.
        criterion: callable ``(y_pred, y) -> scalar tensor``.
        valset: dataset handed to ``DataLoader``.
        iteration: value printed next to the loss.
        batch_size: validation batch size.
        collate_fn: collate function for the loader.
        distributed_run: if true, a ``DistributedSampler`` is used.
        rank: the loss is printed only when ``rank == 0``.

    Returns:
        The mean of the batch losses as a float, or ``nan`` when the loader
        yields no batches.
    """
    model.eval()
    with torch.no_grad():
        val_sampler = DistributedSampler(valset) if distributed_run else None
        val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1,
                                shuffle=False, batch_size=batch_size,
                                pin_memory=False, collate_fn=collate_fn)

        total_loss = 0.0
        n_batches = 0
        for batch in val_loader:
            x, y = model.parse_batch(batch)
            y_pred = model(x)
            total_loss += criterion(y_pred, y).item()
            n_batches += 1

    model.train()
    # Count batches explicitly: relying on the loop index breaks (NameError)
    # when the loader is empty and would leave the model in eval mode.
    val_loss = total_loss / n_batches if n_batches else float('nan')
    if rank == 0:
        print("Validation loss {}: {:9f}  ".format(iteration, val_loss))
    return val_loss


# Seed the CPU and CUDA generators before the model is constructed.
torch.manual_seed(hparams.seed)
torch.cuda.manual_seed(hparams.seed)

# Build the network and move it to the GPU.
model = Tacotron2(hparams).cuda()

learning_rate = hparams.learning_rate
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                             weight_decay=hparams.weight_decay)
criterion = Tacotron2Loss()

# Resume weights, optimizer state and the iteration counter from the
# downloaded checkpoint. The learning rate stored in the checkpoint is bound
# to `_learning_rate` and not applied; `learning_rate` keeps the hparams value.
model, optimizer, _learning_rate, iteration = load_checkpoint(
                'checkpoint_18000', model, optimizer)

# Alternative start: load only the weights of a pretrained model, skipping
# the layers listed in hparams.ignore_layers.
# model = warm_start_model(
#                 "mellotron_libritts.pt", model, hparams.ignore_layers)

Loading checkpoint 'checkpoint_18000'
Loaded checkpoint 'checkpoint_18000' from iteration 18000


### Train

In [None]:
# Build the training loader, validation set, collate function and training
# sampler from hparams (prepare_dataloaders is defined elsewhere in the notebook).
train_loader, valset, collate_fn, train_sampler = prepare_dataloaders(hparams)

speaker id  ['1' '2' '3' '4']
seected speaker id  {1: 0, 2: 1, 3: 2, 4: 3}


In [None]:
# Mount Google Drive at /content/drive; the training cell saves its
# checkpoints under ./drive/MyDrive/final_weight.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Number of whole epochs already covered by the restored iteration counter;
# training resumes from this epoch index.
epoch_offset = max(0, iteration // len(train_loader))

In [None]:
epoch_offset

214

In [None]:
model.train()
# Never set to True in this cell; kept so the guards below stay in place.
is_overflow = False
# `iteration` is intentionally NOT reset to 0 here: it carries the value
# restored by load_checkpoint, which is also what `epoch_offset` was derived
# from. Resetting it made checkpoint names restart at checkpoint_0 and made a
# later resume compute the wrong epoch offset.
checkpoint_dir = "./drive/MyDrive/final_weight"
# Create the output folder up front so the first torch.save cannot fail on a
# missing directory after a training step has already been spent.
os.makedirs(checkpoint_dir, exist_ok=True)
# ================ MAIN TRAINING LOOP! ===================
for epoch in range(epoch_offset, hparams.epochs):
    print("Epoch: {}".format(epoch))
    for i, batch in enumerate(train_loader):
        start = time.perf_counter()
        # Re-apply the configured learning rate; this overrides whatever the
        # restored optimizer state carried.
        for param_group in optimizer.param_groups:
            param_group['lr'] = learning_rate

        model.zero_grad()
        x, y = model.parse_batch(batch)
        y_pred = model(x)

        loss = criterion(y_pred, y)
        reduced_loss = loss.item()
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(
            model.parameters(), hparams.grad_clip_thresh)
        optimizer.step()
        if not is_overflow:
            duration = time.perf_counter() - start
            print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                iteration, reduced_loss, grad_norm, duration))

        # Periodically validate and write a checkpoint named after the
        # global iteration.
        if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0):
            validate(model, criterion, valset, iteration,
                     hparams.batch_size, collate_fn,
                     hparams.distributed_run, 0)
            checkpoint_path = os.path.join(
                checkpoint_dir, "checkpoint_{}".format(iteration))
            save_checkpoint(model, optimizer, learning_rate, iteration,
                            checkpoint_path)
        iteration += 1

Epoch: 214
Train loss 0 0.127125 Grad Norm 0.334923 6.18s/it
Validation loss 0:  0.369144  
Saving model and optimizer state at iteration 0 to ./drive/MyDrive/final_weight/checkpoint_0


  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 1 0.201089 Grad Norm 2.574035 2.43s/it
Train loss 2 0.155128 Grad Norm 1.230110 4.71s/it
Train loss 3 0.142625 Grad Norm 0.625719 8.83s/it
Train loss 4 0.159605 Grad Norm 0.827516 2.62s/it
Train loss 5 0.202242 Grad Norm 1.087934 4.58s/it
Train loss 6 0.196844 Grad Norm 0.615841 3.94s/it
Train loss 7 0.133548 Grad Norm 0.566564 6.45s/it
Train loss 8 0.187516 Grad Norm 1.126332 3.49s/it
Train loss 9 0.176915 Grad Norm 0.719749 2.80s/it
Train loss 10 0.244673 Grad Norm 2.086165 3.38s/it
Train loss 11 0.139490 Grad Norm 0.545354 3.08s/it
Train loss 12 0.201932 Grad Norm 1.185878 2.39s/it
Train loss 13 0.123424 Grad Norm 0.748115 3.83s/it
Train loss 14 0.145773 Grad Norm 0.648851 5.20s/it
Train loss 15 0.184051 Grad Norm 0.368734 2.88s/it
Train loss 16 0.165689 Grad Norm 1.260771 3.17s/it
Train loss 17 0.188396 Grad Norm 0.799581 3.04s/it
Train loss 18 0.131469 Grad Norm 0.254591 6.32s/it
Train loss 19 0.190621 Grad Norm 1.798250 5.06s/it
Train loss 20 0.153274 Grad Norm 0.74124

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 100 0.217215 Grad Norm 0.539917 1.89s/it
Train loss 101 0.179058 Grad Norm 1.043363 3.88s/it
Train loss 102 0.214493 Grad Norm 1.495206 3.82s/it
Train loss 103 0.131409 Grad Norm 0.874235 4.11s/it
Train loss 104 0.231571 Grad Norm 1.554740 3.66s/it
Train loss 105 0.209921 Grad Norm 1.915280 2.79s/it
Train loss 106 0.137237 Grad Norm 0.690236 3.26s/it
Train loss 107 0.227911 Grad Norm 0.652108 3.95s/it
Train loss 108 0.216029 Grad Norm 0.687391 3.90s/it
Train loss 109 0.171025 Grad Norm 0.838250 5.00s/it
Train loss 110 0.135477 Grad Norm 0.393660 5.53s/it
Train loss 111 0.206700 Grad Norm 0.832004 2.77s/it
Train loss 112 0.184395 Grad Norm 0.611312 3.58s/it
Train loss 113 0.114285 Grad Norm 0.193607 5.64s/it
Train loss 114 0.139748 Grad Norm 0.346943 5.14s/it
Train loss 115 0.156243 Grad Norm 1.323569 3.24s/it
Train loss 116 0.181609 Grad Norm 0.542254 4.33s/it
Train loss 117 0.171229 Grad Norm 0.422978 5.55s/it
Train loss 118 0.158869 Grad Norm 0.558933 4.01s/it
Train loss 1

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 190 0.176421 Grad Norm 0.949079 5.06s/it
Train loss 191 0.163811 Grad Norm 0.738371 2.89s/it
Train loss 192 0.216344 Grad Norm 3.558003 3.07s/it
Train loss 193 0.140203 Grad Norm 1.968709 5.65s/it
Train loss 194 0.178403 Grad Norm 1.830463 4.49s/it
Train loss 195 0.213456 Grad Norm 0.990735 5.18s/it
Train loss 196 0.202811 Grad Norm 0.766359 2.07s/it
Train loss 197 0.150829 Grad Norm 1.680679 6.48s/it
Train loss 198 0.197489 Grad Norm 1.953303 2.84s/it
Train loss 199 0.140341 Grad Norm 0.434859 2.98s/it
Train loss 200 0.116137 Grad Norm 1.149499 4.92s/it
Train loss 201 0.197387 Grad Norm 0.955780 3.38s/it
Train loss 202 0.186235 Grad Norm 1.027760 1.75s/it
Train loss 203 0.156831 Grad Norm 0.833832 2.88s/it
Train loss 204 0.188449 Grad Norm 1.811026 2.04s/it
Train loss 205 0.263024 Grad Norm 1.678305 1.78s/it
Train loss 206 0.134556 Grad Norm 0.771660 4.91s/it
Train loss 207 0.122156 Grad Norm 0.872051 3.46s/it
Train loss 208 0.234904 Grad Norm 2.231747 2.63s/it
Train loss 2

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 253 0.142652 Grad Norm 1.358508 5.04s/it
Train loss 254 0.153127 Grad Norm 0.503911 4.83s/it
Train loss 255 0.176281 Grad Norm 0.743710 3.85s/it
Train loss 256 0.111609 Grad Norm 0.927478 7.36s/it
Train loss 257 0.159239 Grad Norm 0.720978 2.63s/it
Train loss 258 0.159401 Grad Norm 0.687550 6.36s/it
Train loss 259 0.159861 Grad Norm 0.791145 5.30s/it
Train loss 260 0.206612 Grad Norm 0.884369 4.21s/it
Train loss 261 0.209181 Grad Norm 0.826538 2.89s/it
Train loss 262 0.144754 Grad Norm 0.414356 2.90s/it
Train loss 263 0.121890 Grad Norm 0.656836 7.20s/it
Train loss 264 0.135585 Grad Norm 0.578529 4.53s/it
Train loss 265 0.223251 Grad Norm 0.501441 3.34s/it
Train loss 266 0.153908 Grad Norm 0.424564 5.11s/it
Train loss 267 0.161925 Grad Norm 0.759943 2.59s/it
Train loss 268 0.225398 Grad Norm 1.877528 5.01s/it
Train loss 269 0.162237 Grad Norm 0.524100 2.01s/it
Train loss 270 0.168580 Grad Norm 0.903363 3.58s/it
Train loss 271 0.145711 Grad Norm 1.147769 5.08s/it
Train loss 2

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 336 0.194292 Grad Norm 1.533627 2.41s/it
Train loss 337 0.143005 Grad Norm 1.667179 5.90s/it
Train loss 338 0.257891 Grad Norm 2.315711 5.83s/it
Train loss 339 0.254236 Grad Norm 0.881456 2.95s/it
Train loss 340 0.181984 Grad Norm 1.386161 4.95s/it
Train loss 341 0.130214 Grad Norm 1.766015 3.09s/it
Train loss 342 0.228866 Grad Norm 2.126319 4.94s/it
Train loss 343 0.148250 Grad Norm 0.627023 3.06s/it
Train loss 344 0.224367 Grad Norm 1.418502 3.24s/it
Train loss 345 0.149023 Grad Norm 1.578258 2.60s/it
Train loss 346 0.156938 Grad Norm 0.960577 3.63s/it
Train loss 347 0.200355 Grad Norm 1.057296 2.90s/it
Train loss 348 0.187368 Grad Norm 1.268391 4.11s/it
Train loss 349 0.194918 Grad Norm 1.799370 2.42s/it
Train loss 350 0.216367 Grad Norm 2.220463 4.26s/it
Train loss 351 0.223579 Grad Norm 1.371987 4.22s/it
Train loss 352 0.228198 Grad Norm 1.956692 3.36s/it
Train loss 353 0.127540 Grad Norm 1.332398 7.34s/it
Train loss 354 0.145785 Grad Norm 1.110379 4.12s/it
Train loss 3

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 427 0.162696 Grad Norm 0.414963 6.38s/it
Train loss 428 0.167599 Grad Norm 0.441930 3.94s/it
Train loss 429 0.169993 Grad Norm 0.737506 2.83s/it
Train loss 430 0.180273 Grad Norm 0.859329 2.66s/it
Train loss 431 0.166222 Grad Norm 0.554796 2.51s/it
Train loss 432 0.214786 Grad Norm 1.271982 1.86s/it
Train loss 433 0.183001 Grad Norm 0.479009 2.64s/it
Train loss 434 0.179872 Grad Norm 0.404252 5.18s/it
Train loss 435 0.157782 Grad Norm 0.363142 5.39s/it
Train loss 436 0.196441 Grad Norm 1.567457 4.02s/it
Train loss 437 0.206468 Grad Norm 1.028311 3.08s/it
Train loss 438 0.184080 Grad Norm 1.015333 2.77s/it
Train loss 439 0.156424 Grad Norm 1.076086 3.35s/it
Train loss 440 0.207478 Grad Norm 0.690848 4.58s/it
Train loss 441 0.148933 Grad Norm 0.421180 2.46s/it
Train loss 442 0.147651 Grad Norm 0.886108 3.34s/it
Train loss 443 0.143984 Grad Norm 0.784302 8.18s/it
Train loss 444 0.220090 Grad Norm 1.093969 3.62s/it
Train loss 445 0.202067 Grad Norm 0.795737 5.10s/it
Train loss 4

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 504 0.193565 Grad Norm 0.960825 6.86s/it
Train loss 505 0.237014 Grad Norm 1.360837 1.61s/it
Train loss 506 0.134735 Grad Norm 1.346137 3.58s/it
Train loss 507 0.110151 Grad Norm 0.933047 5.32s/it
Train loss 508 0.168992 Grad Norm 0.854457 5.23s/it
Train loss 509 0.128307 Grad Norm 0.366288 4.02s/it
Train loss 510 0.132451 Grad Norm 0.770775 4.15s/it
Train loss 511 0.138962 Grad Norm 1.288437 6.40s/it
Train loss 512 0.178371 Grad Norm 1.243388 2.91s/it
Train loss 513 0.181197 Grad Norm 0.992137 3.35s/it
Train loss 514 0.123176 Grad Norm 0.490867 4.10s/it
Train loss 515 0.162253 Grad Norm 1.346331 3.47s/it
Train loss 516 0.174579 Grad Norm 0.805437 1.99s/it
Train loss 517 0.113835 Grad Norm 0.400003 7.13s/it
Train loss 518 0.189577 Grad Norm 1.284807 3.36s/it
Train loss 519 0.140776 Grad Norm 0.759688 5.37s/it
Train loss 520 0.152853 Grad Norm 0.425038 4.15s/it
Train loss 521 0.203833 Grad Norm 1.545958 4.48s/it
Train loss 522 0.196959 Grad Norm 1.033019 5.08s/it
Train loss 5

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 588 0.189819 Grad Norm 0.715195 4.56s/it
Train loss 589 0.168028 Grad Norm 0.928876 4.09s/it
Train loss 590 0.123031 Grad Norm 0.459342 5.33s/it
Train loss 591 0.159094 Grad Norm 0.706008 4.88s/it
Train loss 592 0.178411 Grad Norm 0.906517 3.36s/it
Train loss 593 0.200269 Grad Norm 0.726620 4.32s/it
Train loss 594 0.165534 Grad Norm 1.397782 5.54s/it
Train loss 595 0.258870 Grad Norm 2.275489 3.50s/it
Train loss 596 0.229829 Grad Norm 1.180533 2.58s/it
Train loss 597 0.187942 Grad Norm 0.859196 4.44s/it
Train loss 598 0.143673 Grad Norm 2.500430 5.97s/it
Train loss 599 0.186025 Grad Norm 1.404756 4.35s/it
Train loss 600 0.146706 Grad Norm 0.607354 3.46s/it
Train loss 601 0.133059 Grad Norm 0.907630 7.60s/it
Train loss 602 0.186038 Grad Norm 0.359988 2.43s/it
Train loss 603 0.188047 Grad Norm 1.130343 2.48s/it
Train loss 604 0.155726 Grad Norm 1.231740 3.48s/it
Train loss 605 0.224351 Grad Norm 0.525881 3.01s/it
Train loss 606 0.123563 Grad Norm 0.835254 6.80s/it
Train loss 6

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 673 0.232842 Grad Norm 0.614618 3.27s/it
Train loss 674 0.134997 Grad Norm 1.099902 5.44s/it
Train loss 675 0.231461 Grad Norm 2.737809 4.15s/it
Train loss 676 0.117346 Grad Norm 0.397443 2.00s/it
Train loss 677 0.124424 Grad Norm 0.599872 8.14s/it
Train loss 678 0.154819 Grad Norm 1.162912 4.94s/it
Train loss 679 0.131106 Grad Norm 0.689233 3.60s/it
Train loss 680 0.173080 Grad Norm 0.392049 3.72s/it
Train loss 681 0.195258 Grad Norm 0.883026 4.05s/it
Train loss 682 0.140435 Grad Norm 0.653879 6.49s/it
Train loss 683 0.150112 Grad Norm 0.497586 3.47s/it
Train loss 684 0.156345 Grad Norm 1.261323 4.06s/it
Train loss 685 0.192377 Grad Norm 0.645886 5.65s/it
Train loss 686 0.143390 Grad Norm 0.406168 5.90s/it
Train loss 687 0.179964 Grad Norm 1.011578 3.59s/it
Train loss 688 0.207139 Grad Norm 0.609737 3.09s/it
Train loss 689 0.176447 Grad Norm 0.514097 3.86s/it
Train loss 690 0.237178 Grad Norm 0.604145 2.42s/it
Train loss 691 0.215909 Grad Norm 0.363388 3.96s/it
Train loss 6

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 767 0.173072 Grad Norm 1.356818 4.59s/it
Train loss 768 0.152789 Grad Norm 0.600784 3.48s/it
Train loss 769 0.149908 Grad Norm 0.786700 4.05s/it
Train loss 770 0.140094 Grad Norm 1.167544 2.83s/it
Train loss 771 0.206648 Grad Norm 0.660186 1.86s/it
Train loss 772 0.155247 Grad Norm 1.358092 3.33s/it
Train loss 773 0.198012 Grad Norm 0.688504 2.75s/it
Train loss 774 0.166826 Grad Norm 0.525034 3.98s/it
Train loss 775 0.173012 Grad Norm 0.602136 3.13s/it
Train loss 776 0.188138 Grad Norm 0.608885 3.88s/it
Train loss 777 0.161273 Grad Norm 0.501680 4.40s/it
Train loss 778 0.167661 Grad Norm 0.599107 2.19s/it
Train loss 779 0.197783 Grad Norm 0.680436 3.62s/it
Train loss 780 0.160430 Grad Norm 0.521341 2.76s/it
Train loss 781 0.202306 Grad Norm 0.804758 4.17s/it
Train loss 782 0.110839 Grad Norm 0.600471 7.84s/it
Train loss 783 0.167129 Grad Norm 0.957642 5.35s/it
Train loss 784 0.199461 Grad Norm 0.772095 2.01s/it
Train loss 785 0.166850 Grad Norm 0.371373 3.25s/it
Train loss 7

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 840 0.148527 Grad Norm 0.536604 5.51s/it
Train loss 841 0.259644 Grad Norm 0.524758 1.40s/it
Train loss 842 0.193126 Grad Norm 0.670890 3.84s/it
Train loss 843 0.153215 Grad Norm 0.605668 3.33s/it
Train loss 844 0.179528 Grad Norm 0.695598 2.99s/it
Train loss 845 0.232816 Grad Norm 0.628221 4.51s/it
Train loss 846 0.168440 Grad Norm 0.880585 5.44s/it
Train loss 847 0.147959 Grad Norm 0.716434 3.42s/it
Train loss 848 0.155255 Grad Norm 0.746379 1.83s/it
Train loss 849 0.095273 Grad Norm 1.188591 4.58s/it
Train loss 850 0.160680 Grad Norm 0.638388 5.18s/it
Train loss 851 0.126964 Grad Norm 1.200114 5.81s/it
Train loss 852 0.195847 Grad Norm 0.472354 4.08s/it
Train loss 853 0.199923 Grad Norm 0.533246 2.89s/it
Train loss 854 0.116466 Grad Norm 1.004430 5.38s/it
Train loss 855 0.167838 Grad Norm 0.690075 3.33s/it
Train loss 856 0.148642 Grad Norm 0.321341 3.94s/it
Train loss 857 0.159549 Grad Norm 0.770302 3.58s/it
Train loss 858 0.198410 Grad Norm 0.913499 2.48s/it
Train loss 8

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 924 0.125976 Grad Norm 1.486465 5.53s/it
Train loss 925 0.132704 Grad Norm 0.777703 2.71s/it
Train loss 926 0.130282 Grad Norm 1.127708 3.46s/it
Train loss 927 0.147704 Grad Norm 1.742969 5.10s/it
Train loss 928 0.150124 Grad Norm 0.442798 4.72s/it
Train loss 929 0.212044 Grad Norm 1.327913 3.18s/it
Train loss 930 0.183839 Grad Norm 1.330883 4.82s/it
Train loss 931 0.166590 Grad Norm 0.572904 5.05s/it
Train loss 932 0.207748 Grad Norm 1.265405 3.36s/it
Train loss 933 0.200982 Grad Norm 1.054292 3.31s/it
Train loss 934 0.172958 Grad Norm 0.420146 2.66s/it
Train loss 935 0.142488 Grad Norm 1.337613 3.32s/it
Train loss 936 0.249412 Grad Norm 1.362900 2.22s/it
Train loss 937 0.166402 Grad Norm 0.352003 4.03s/it
Train loss 938 0.125892 Grad Norm 0.750913 3.94s/it
Train loss 939 0.179611 Grad Norm 0.968801 2.95s/it
Train loss 940 0.192671 Grad Norm 0.697881 2.28s/it
Train loss 941 0.156124 Grad Norm 0.780019 4.99s/it
Train loss 942 0.257019 Grad Norm 1.691099 2.39s/it
Train loss 9

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 1010 0.157676 Grad Norm 0.409142 3.76s/it
Train loss 1011 0.171301 Grad Norm 1.707180 3.51s/it
Train loss 1012 0.153324 Grad Norm 1.875026 4.95s/it
Train loss 1013 0.188254 Grad Norm 0.942461 3.74s/it
Train loss 1014 0.144177 Grad Norm 0.983907 6.74s/it
Train loss 1015 0.129985 Grad Norm 1.030125 3.04s/it
Train loss 1016 0.259956 Grad Norm 1.639342 3.08s/it
Train loss 1017 0.117884 Grad Norm 0.469844 3.70s/it
Train loss 1018 0.178439 Grad Norm 1.083662 2.99s/it
Train loss 1019 0.203164 Grad Norm 1.582178 5.15s/it
Train loss 1020 0.185975 Grad Norm 0.841757 3.31s/it
Train loss 1021 0.165772 Grad Norm 0.777509 4.56s/it
Train loss 1022 0.137337 Grad Norm 0.656324 7.42s/it
Train loss 1023 0.178041 Grad Norm 0.916088 5.29s/it
Train loss 1024 0.160556 Grad Norm 0.810269 3.17s/it
Train loss 1025 0.210583 Grad Norm 2.146341 3.27s/it
Train loss 1026 0.126727 Grad Norm 0.912393 3.28s/it
Train loss 1027 0.145569 Grad Norm 1.043942 5.77s/it
Train loss 1028 0.138208 Grad Norm 1.216689 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 1092 0.192069 Grad Norm 0.517720 5.64s/it
Train loss 1093 0.142614 Grad Norm 0.385811 7.42s/it
Train loss 1094 0.133220 Grad Norm 0.490405 4.39s/it
Train loss 1095 0.205198 Grad Norm 0.520323 2.98s/it
Train loss 1096 0.189152 Grad Norm 0.822206 4.12s/it
Train loss 1097 0.148920 Grad Norm 0.295720 2.82s/it
Train loss 1098 0.113006 Grad Norm 0.546778 3.98s/it
Train loss 1099 0.152025 Grad Norm 1.529905 3.85s/it
Train loss 1100 0.249813 Grad Norm 0.988800 1.98s/it
Train loss 1101 0.144459 Grad Norm 0.664273 5.06s/it
Train loss 1102 0.155081 Grad Norm 2.056244 7.73s/it
Train loss 1103 0.187680 Grad Norm 1.332637 2.99s/it
Train loss 1104 0.215888 Grad Norm 0.775567 2.47s/it
Train loss 1105 0.155021 Grad Norm 0.787197 2.98s/it
Train loss 1106 0.266357 Grad Norm 2.178571 2.44s/it
Train loss 1107 0.198603 Grad Norm 1.826813 3.12s/it
Train loss 1108 0.189901 Grad Norm 0.696620 3.15s/it
Train loss 1109 0.241854 Grad Norm 0.793548 2.09s/it
Train loss 1110 0.130034 Grad Norm 0.828815 4.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 1180 0.185831 Grad Norm 0.851181 7.36s/it
Train loss 1181 0.170867 Grad Norm 0.416733 3.69s/it
Train loss 1182 0.177964 Grad Norm 1.318727 2.92s/it
Train loss 1183 0.287817 Grad Norm 2.424884 4.18s/it
Train loss 1184 0.149232 Grad Norm 0.533937 5.77s/it
Train loss 1185 0.126178 Grad Norm 0.434851 5.06s/it
Train loss 1186 0.162275 Grad Norm 0.520103 5.38s/it
Train loss 1187 0.117017 Grad Norm 0.310454 5.29s/it
Train loss 1188 0.155825 Grad Norm 0.581189 2.85s/it
Train loss 1189 0.212602 Grad Norm 0.934267 2.45s/it
Train loss 1190 0.154023 Grad Norm 0.331070 3.39s/it
Train loss 1191 0.158022 Grad Norm 0.826980 3.69s/it
Train loss 1192 0.204540 Grad Norm 0.898970 4.91s/it
Train loss 1193 0.164567 Grad Norm 0.456903 3.90s/it
Train loss 1194 0.233046 Grad Norm 2.372457 2.80s/it
Train loss 1195 0.172262 Grad Norm 1.357634 4.43s/it
Train loss 1196 0.252194 Grad Norm 0.634696 1.88s/it
Train loss 1197 0.209553 Grad Norm 1.240795 5.09s/it
Train loss 1198 0.165170 Grad Norm 0.402082 4.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 1268 0.156092 Grad Norm 0.263435 5.35s/it
Train loss 1269 0.205003 Grad Norm 0.658369 2.53s/it
Train loss 1270 0.167837 Grad Norm 1.458698 5.29s/it
Train loss 1271 0.141087 Grad Norm 0.354668 4.60s/it
Train loss 1272 0.189203 Grad Norm 0.811725 2.29s/it
Train loss 1273 0.150582 Grad Norm 0.925615 4.94s/it
Train loss 1274 0.141650 Grad Norm 0.513014 5.11s/it
Train loss 1275 0.156621 Grad Norm 0.753728 3.89s/it
Train loss 1276 0.173012 Grad Norm 0.779647 4.12s/it
Train loss 1277 0.190510 Grad Norm 0.295855 4.14s/it
Train loss 1278 0.158773 Grad Norm 0.763294 4.16s/it
Train loss 1279 0.136628 Grad Norm 0.770407 6.38s/it
Train loss 1280 0.168515 Grad Norm 0.440427 4.89s/it
Train loss 1281 0.224920 Grad Norm 0.845206 4.88s/it
Train loss 1282 0.159561 Grad Norm 0.542463 2.79s/it
Train loss 1283 0.199429 Grad Norm 0.288840 4.50s/it
Train loss 1284 0.201661 Grad Norm 0.627323 5.66s/it
Train loss 1285 0.166892 Grad Norm 0.452085 5.64s/it
Train loss 1286 0.177598 Grad Norm 0.579042 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 1344 0.179732 Grad Norm 0.763130 4.36s/it
Train loss 1345 0.179725 Grad Norm 0.851572 4.11s/it
Train loss 1346 0.176515 Grad Norm 0.634517 2.82s/it
Train loss 1347 0.120721 Grad Norm 0.670507 6.35s/it
Train loss 1348 0.173996 Grad Norm 0.583619 3.27s/it
Train loss 1349 0.215259 Grad Norm 0.894429 4.95s/it
Train loss 1350 0.217228 Grad Norm 0.928110 3.60s/it
Train loss 1351 0.184297 Grad Norm 0.424467 5.57s/it
Train loss 1352 0.158117 Grad Norm 1.390192 4.62s/it
Train loss 1353 0.211856 Grad Norm 1.316524 3.67s/it
Train loss 1354 0.164959 Grad Norm 0.550329 2.84s/it
Train loss 1355 0.228720 Grad Norm 1.834513 3.88s/it
Train loss 1356 0.123069 Grad Norm 0.823507 4.67s/it
Train loss 1357 0.169832 Grad Norm 0.515062 3.36s/it
Train loss 1358 0.158698 Grad Norm 0.998242 3.08s/it
Train loss 1359 0.194373 Grad Norm 1.041951 3.41s/it
Train loss 1360 0.171752 Grad Norm 0.872896 2.52s/it
Train loss 1361 0.171641 Grad Norm 0.525000 5.63s/it
Train loss 1362 0.181483 Grad Norm 1.607388 5.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 1440 0.190631 Grad Norm 0.787622 2.87s/it
Train loss 1441 0.212747 Grad Norm 0.455533 4.02s/it
Train loss 1442 0.155460 Grad Norm 0.717964 2.74s/it
Train loss 1443 0.171953 Grad Norm 0.440424 4.83s/it
Train loss 1444 0.182237 Grad Norm 0.458820 4.83s/it
Train loss 1445 0.200446 Grad Norm 0.684271 2.94s/it
Train loss 1446 0.190024 Grad Norm 0.495399 5.12s/it
Train loss 1447 0.221560 Grad Norm 1.045416 3.93s/it
Train loss 1448 0.125914 Grad Norm 0.439160 3.32s/it
Train loss 1449 0.206838 Grad Norm 1.135116 2.00s/it
Train loss 1450 0.151672 Grad Norm 0.512616 3.51s/it
Train loss 1451 0.259157 Grad Norm 1.468790 1.51s/it
Train loss 1452 0.163249 Grad Norm 0.340785 5.33s/it
Train loss 1453 0.180941 Grad Norm 0.872255 5.13s/it
Train loss 1454 0.138357 Grad Norm 0.531575 5.74s/it
Train loss 1455 0.176676 Grad Norm 1.746640 3.05s/it
Train loss 1456 0.231798 Grad Norm 2.694825 2.79s/it
Train loss 1457 0.164902 Grad Norm 0.831488 3.99s/it
Train loss 1458 0.197385 Grad Norm 1.357872 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 1520 0.175622 Grad Norm 0.997049 1.67s/it
Train loss 1521 0.174773 Grad Norm 1.379383 4.70s/it
Train loss 1522 0.159728 Grad Norm 0.978649 5.24s/it
Train loss 1523 0.161491 Grad Norm 0.357201 2.62s/it
Train loss 1524 0.176242 Grad Norm 1.160386 2.90s/it
Train loss 1525 0.213337 Grad Norm 1.212658 1.44s/it
Train loss 1526 0.133894 Grad Norm 1.066180 4.06s/it
Train loss 1527 0.134807 Grad Norm 1.191852 4.48s/it
Train loss 1528 0.239074 Grad Norm 2.965008 1.98s/it
Train loss 1529 0.189392 Grad Norm 1.425327 1.47s/it
Train loss 1530 0.180610 Grad Norm 0.927345 4.90s/it
Train loss 1531 0.175875 Grad Norm 1.076919 5.10s/it
Train loss 1532 0.117969 Grad Norm 0.441423 3.27s/it
Train loss 1533 0.182287 Grad Norm 1.402344 5.06s/it
Train loss 1534 0.176343 Grad Norm 0.597052 5.09s/it
Train loss 1535 0.118451 Grad Norm 0.274258 5.42s/it
Train loss 1536 0.240733 Grad Norm 0.853186 3.83s/it
Train loss 1537 0.229353 Grad Norm 0.826719 2.02s/it
Train loss 1538 0.177750 Grad Norm 0.674053 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 1598 0.215955 Grad Norm 1.807600 3.23s/it
Train loss 1599 0.128057 Grad Norm 1.633348 4.72s/it
Train loss 1600 0.113610 Grad Norm 0.488614 6.49s/it
Train loss 1601 0.214163 Grad Norm 0.529636 3.19s/it
Train loss 1602 0.155401 Grad Norm 1.014028 4.06s/it
Train loss 1603 0.170799 Grad Norm 1.144730 6.40s/it
Train loss 1604 0.195989 Grad Norm 0.611781 3.39s/it
Train loss 1605 0.154769 Grad Norm 1.454278 5.06s/it
Train loss 1606 0.164834 Grad Norm 1.210611 5.15s/it
Train loss 1607 0.167623 Grad Norm 0.951773 2.50s/it
Train loss 1608 0.159963 Grad Norm 0.895800 3.52s/it
Train loss 1609 0.176403 Grad Norm 0.688934 2.89s/it
Train loss 1610 0.172065 Grad Norm 1.290620 2.30s/it
Train loss 1611 0.163323 Grad Norm 0.310261 3.61s/it
Train loss 1612 0.154208 Grad Norm 0.785177 4.15s/it
Train loss 1613 0.166507 Grad Norm 0.913793 7.19s/it
Train loss 1614 0.221326 Grad Norm 1.101581 2.87s/it
Train loss 1615 0.176027 Grad Norm 0.846327 4.77s/it
Train loss 1616 0.102038 Grad Norm 0.397586 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 1688 0.154303 Grad Norm 0.973089 3.21s/it
Train loss 1689 0.201857 Grad Norm 0.992568 2.90s/it
Train loss 1690 0.154546 Grad Norm 0.704789 8.19s/it
Train loss 1691 0.202904 Grad Norm 1.272430 4.67s/it
Train loss 1692 0.166908 Grad Norm 0.574197 5.26s/it
Train loss 1693 0.156204 Grad Norm 1.002429 4.98s/it
Train loss 1694 0.181682 Grad Norm 0.577016 2.34s/it
Train loss 1695 0.205550 Grad Norm 0.757060 1.99s/it
Train loss 1696 0.241497 Grad Norm 0.858226 4.13s/it
Train loss 1697 0.184579 Grad Norm 0.987387 3.38s/it
Train loss 1698 0.176462 Grad Norm 1.327922 5.53s/it
Train loss 1699 0.134807 Grad Norm 0.642898 5.48s/it
Train loss 1700 0.247131 Grad Norm 1.183644 2.63s/it
Train loss 1701 0.150300 Grad Norm 0.382510 4.04s/it
Train loss 1702 0.176173 Grad Norm 0.787995 4.55s/it
Train loss 1703 0.220469 Grad Norm 0.927565 4.95s/it
Train loss 1704 0.158122 Grad Norm 0.703035 3.98s/it
Train loss 1705 0.196513 Grad Norm 0.995392 5.54s/it
Train loss 1706 0.203444 Grad Norm 0.934949 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 1764 0.194612 Grad Norm 0.794422 2.82s/it
Train loss 1765 0.188602 Grad Norm 0.836036 5.57s/it
Train loss 1766 0.172841 Grad Norm 0.645294 4.04s/it
Train loss 1767 0.254523 Grad Norm 1.255419 2.32s/it
Train loss 1768 0.135192 Grad Norm 1.970908 6.59s/it
Train loss 1769 0.175653 Grad Norm 0.421218 4.65s/it
Train loss 1770 0.158093 Grad Norm 1.167953 4.74s/it
Train loss 1771 0.214374 Grad Norm 1.367589 3.25s/it
Train loss 1772 0.158934 Grad Norm 1.000401 2.86s/it
Train loss 1773 0.176588 Grad Norm 0.636190 4.64s/it
Train loss 1774 0.224039 Grad Norm 0.842729 4.21s/it
Train loss 1775 0.195275 Grad Norm 1.133206 3.21s/it
Train loss 1776 0.121102 Grad Norm 0.740449 4.68s/it
Train loss 1777 0.153791 Grad Norm 0.790013 5.73s/it
Train loss 1778 0.181460 Grad Norm 1.133653 3.23s/it
Train loss 1779 0.225983 Grad Norm 0.832061 5.71s/it
Train loss 1780 0.171538 Grad Norm 0.919524 2.30s/it
Train loss 1781 0.132407 Grad Norm 1.175815 6.50s/it
Train loss 1782 0.232871 Grad Norm 1.280967 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 1854 0.195702 Grad Norm 1.407980 1.77s/it
Train loss 1855 0.207637 Grad Norm 0.635482 2.88s/it
Train loss 1856 0.221038 Grad Norm 0.613333 2.27s/it
Train loss 1857 0.210854 Grad Norm 0.777908 4.31s/it
Train loss 1858 0.222948 Grad Norm 0.585145 1.39s/it
Train loss 1859 0.232587 Grad Norm 0.696101 2.44s/it
Train loss 1860 0.178283 Grad Norm 0.585113 3.24s/it
Train loss 1861 0.269173 Grad Norm 0.821398 3.42s/it
Train loss 1862 0.209011 Grad Norm 1.019207 2.81s/it
Train loss 1863 0.156732 Grad Norm 0.502669 5.18s/it
Train loss 1864 0.145974 Grad Norm 1.217067 6.63s/it
Train loss 1865 0.255035 Grad Norm 1.323070 2.35s/it
Train loss 1866 0.241167 Grad Norm 0.528733 2.56s/it
Train loss 1867 0.174985 Grad Norm 1.942465 3.74s/it
Train loss 1868 0.218876 Grad Norm 1.497488 3.56s/it
Train loss 1869 0.168357 Grad Norm 0.446347 5.25s/it
Train loss 1870 0.168632 Grad Norm 0.918182 5.24s/it
Train loss 1871 0.223099 Grad Norm 1.675392 5.06s/it
Train loss 1872 0.137534 Grad Norm 0.843344 7.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 1932 0.227581 Grad Norm 1.052200 3.73s/it
Train loss 1933 0.109854 Grad Norm 0.551677 8.64s/it
Train loss 1934 0.191973 Grad Norm 0.674423 5.20s/it
Train loss 1935 0.177647 Grad Norm 0.485150 4.11s/it
Train loss 1936 0.192026 Grad Norm 0.688826 1.86s/it
Train loss 1937 0.148385 Grad Norm 0.443588 4.24s/it
Train loss 1938 0.138443 Grad Norm 0.450674 5.29s/it
Train loss 1939 0.118161 Grad Norm 0.381948 4.66s/it
Train loss 1940 0.149709 Grad Norm 0.603389 5.34s/it
Train loss 1941 0.154203 Grad Norm 0.456931 2.52s/it
Train loss 1942 0.165703 Grad Norm 1.079588 4.67s/it
Train loss 1943 0.166720 Grad Norm 0.793177 4.53s/it
Train loss 1944 0.149705 Grad Norm 0.525496 5.83s/it
Train loss 1945 0.269133 Grad Norm 1.397104 2.61s/it
Train loss 1946 0.167448 Grad Norm 0.412761 4.79s/it
Train loss 1947 0.157390 Grad Norm 0.595681 2.89s/it
Train loss 1948 0.183211 Grad Norm 1.156428 3.96s/it
Train loss 1949 0.181939 Grad Norm 1.071396 5.16s/it
Train loss 1950 0.209630 Grad Norm 1.454767 4.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 2016 0.179170 Grad Norm 0.968956 4.85s/it
Train loss 2017 0.180826 Grad Norm 0.557297 4.67s/it
Train loss 2018 0.168882 Grad Norm 0.478223 7.58s/it
Train loss 2019 0.153305 Grad Norm 0.615822 4.00s/it
Train loss 2020 0.215422 Grad Norm 0.427020 4.78s/it
Train loss 2021 0.181447 Grad Norm 0.843025 5.11s/it
Train loss 2022 0.204464 Grad Norm 0.396401 3.42s/it
Train loss 2023 0.216544 Grad Norm 0.933219 3.18s/it
Train loss 2024 0.233295 Grad Norm 0.779399 2.37s/it
Train loss 2025 0.142213 Grad Norm 0.276736 3.05s/it
Train loss 2026 0.219093 Grad Norm 0.784492 3.46s/it
Train loss 2027 0.114582 Grad Norm 0.419929 5.05s/it
Train loss 2028 0.192015 Grad Norm 1.488554 2.77s/it
Train loss 2029 0.208291 Grad Norm 0.557284 2.81s/it
Train loss 2030 0.225098 Grad Norm 2.479026 2.91s/it
Train loss 2031 0.196787 Grad Norm 0.867410 1.80s/it
Train loss 2032 0.144372 Grad Norm 0.775246 5.60s/it
Train loss 2033 0.210599 Grad Norm 0.723101 2.38s/it
Train loss 2034 0.155588 Grad Norm 1.149890 4.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 2108 0.175669 Grad Norm 0.752376 4.63s/it
Train loss 2109 0.156918 Grad Norm 0.833492 3.79s/it
Train loss 2110 0.196143 Grad Norm 0.811753 2.34s/it
Train loss 2111 0.147729 Grad Norm 0.957067 5.63s/it
Train loss 2112 0.191061 Grad Norm 0.996983 1.81s/it
Train loss 2113 0.238420 Grad Norm 0.820344 3.03s/it
Train loss 2114 0.224274 Grad Norm 0.874967 2.17s/it
Train loss 2115 0.202832 Grad Norm 1.375529 5.02s/it
Train loss 2116 0.202305 Grad Norm 0.638411 5.74s/it
Train loss 2117 0.167417 Grad Norm 0.674975 3.29s/it
Train loss 2118 0.169375 Grad Norm 1.040423 3.17s/it
Train loss 2119 0.212212 Grad Norm 0.812714 2.90s/it
Train loss 2120 0.189610 Grad Norm 0.876482 6.62s/it
Train loss 2121 0.210478 Grad Norm 1.124771 2.26s/it
Train loss 2122 0.224302 Grad Norm 1.361912 1.83s/it
Train loss 2123 0.212666 Grad Norm 0.942047 2.40s/it
Train loss 2124 0.200798 Grad Norm 0.887849 4.22s/it
Train loss 2125 0.149661 Grad Norm 0.753218 4.31s/it
Train loss 2126 0.153056 Grad Norm 0.430245 5.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 2184 0.192124 Grad Norm 1.088262 3.28s/it
Train loss 2185 0.159284 Grad Norm 0.771281 5.75s/it
Train loss 2186 0.128680 Grad Norm 0.519303 6.03s/it
Train loss 2187 0.233255 Grad Norm 1.031804 3.06s/it
Train loss 2188 0.222841 Grad Norm 1.016387 2.71s/it
Train loss 2189 0.208482 Grad Norm 1.305443 3.49s/it
Train loss 2190 0.158403 Grad Norm 0.864965 4.22s/it
Train loss 2191 0.229337 Grad Norm 1.260243 3.09s/it
Train loss 2192 0.180503 Grad Norm 1.524679 3.59s/it
Train loss 2193 0.125179 Grad Norm 0.973223 5.57s/it
Train loss 2194 0.146102 Grad Norm 0.375180 5.43s/it
Train loss 2195 0.193363 Grad Norm 1.572520 3.07s/it
Train loss 2196 0.203927 Grad Norm 1.321558 2.25s/it
Train loss 2197 0.227431 Grad Norm 1.083662 1.85s/it
Train loss 2198 0.172130 Grad Norm 0.532324 4.48s/it
Train loss 2199 0.200745 Grad Norm 2.090496 3.80s/it
Train loss 2200 0.211828 Grad Norm 2.632969 3.49s/it
Train loss 2201 0.175840 Grad Norm 1.391421 3.39s/it
Train loss 2202 0.146078 Grad Norm 0.821798 4.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 2274 0.102299 Grad Norm 0.742214 2.80s/it
Train loss 2275 0.178977 Grad Norm 2.341100 3.74s/it
Train loss 2276 0.187964 Grad Norm 0.897095 4.05s/it
Train loss 2277 0.183988 Grad Norm 1.035399 3.56s/it
Train loss 2278 0.189541 Grad Norm 1.881959 1.75s/it
Train loss 2279 0.198338 Grad Norm 1.799858 5.35s/it
Train loss 2280 0.167901 Grad Norm 0.461353 4.50s/it
Train loss 2281 0.123267 Grad Norm 0.638660 7.27s/it
Train loss 2282 0.202366 Grad Norm 1.608027 3.90s/it
Train loss 2283 0.162517 Grad Norm 2.520803 2.38s/it
Train loss 2284 0.244264 Grad Norm 0.643199 1.72s/it
Train loss 2285 0.187911 Grad Norm 2.863175 3.62s/it
Train loss 2286 0.132494 Grad Norm 1.340278 3.59s/it
Train loss 2287 0.176613 Grad Norm 1.157823 2.21s/it
Train loss 2288 0.188783 Grad Norm 1.396099 3.91s/it
Train loss 2289 0.153616 Grad Norm 1.315229 2.66s/it
Train loss 2290 0.170365 Grad Norm 0.742367 2.79s/it
Train loss 2291 0.170699 Grad Norm 1.026482 2.18s/it
Train loss 2292 0.138817 Grad Norm 0.941850 5.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 2355 0.194233 Grad Norm 2.081473 5.80s/it
Train loss 2356 0.152641 Grad Norm 0.449126 5.09s/it
Train loss 2357 0.210157 Grad Norm 0.630279 3.38s/it
Train loss 2358 0.153974 Grad Norm 1.231592 5.25s/it
Train loss 2359 0.220196 Grad Norm 1.501299 1.79s/it
Train loss 2360 0.138797 Grad Norm 0.244650 3.47s/it
Train loss 2361 0.190558 Grad Norm 2.522584 5.80s/it
Train loss 2362 0.171073 Grad Norm 1.470533 3.07s/it
Train loss 2363 0.165458 Grad Norm 0.488121 7.49s/it
Train loss 2364 0.221640 Grad Norm 1.063485 3.53s/it
Train loss 2365 0.221967 Grad Norm 1.186041 1.63s/it
Train loss 2366 0.216888 Grad Norm 0.809458 1.98s/it
Train loss 2367 0.162143 Grad Norm 1.442363 3.41s/it
Train loss 2368 0.152644 Grad Norm 0.978429 3.48s/it
Train loss 2369 0.198398 Grad Norm 0.610262 2.48s/it
Train loss 2370 0.153570 Grad Norm 1.721544 4.25s/it
Train loss 2371 0.187700 Grad Norm 1.542160 3.03s/it
Train loss 2372 0.217420 Grad Norm 0.978866 3.91s/it
Train loss 2373 0.129395 Grad Norm 0.473206 5.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 2436 0.153156 Grad Norm 0.520309 3.11s/it
Train loss 2437 0.134971 Grad Norm 0.554678 3.94s/it
Train loss 2438 0.252406 Grad Norm 0.948847 2.88s/it
Train loss 2439 0.151620 Grad Norm 0.819470 6.59s/it
Train loss 2440 0.233167 Grad Norm 1.150190 4.23s/it
Train loss 2441 0.184449 Grad Norm 0.710003 3.47s/it
Train loss 2442 0.151594 Grad Norm 0.464158 5.07s/it
Train loss 2443 0.201979 Grad Norm 0.883208 2.00s/it
Train loss 2444 0.235703 Grad Norm 1.812919 1.78s/it
Train loss 2445 0.148800 Grad Norm 0.551650 7.37s/it
Train loss 2446 0.177939 Grad Norm 1.188224 2.14s/it
Train loss 2447 0.186041 Grad Norm 0.574837 2.36s/it
Train loss 2448 0.155530 Grad Norm 1.382561 4.03s/it
Train loss 2449 0.146138 Grad Norm 0.876864 3.17s/it
Train loss 2450 0.185695 Grad Norm 0.848048 4.25s/it
Train loss 2451 0.146877 Grad Norm 0.497566 2.74s/it
Train loss 2452 0.185526 Grad Norm 0.681715 5.01s/it
Train loss 2453 0.161741 Grad Norm 0.806563 7.38s/it
Train loss 2454 0.200517 Grad Norm 0.699591 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 2520 0.217050 Grad Norm 1.213181 2.33s/it
Train loss 2521 0.148568 Grad Norm 0.899319 3.96s/it
Train loss 2522 0.187081 Grad Norm 0.705504 3.27s/it
Train loss 2523 0.234684 Grad Norm 1.852582 3.38s/it
Train loss 2524 0.143726 Grad Norm 0.756335 7.81s/it
Train loss 2525 0.196361 Grad Norm 1.058028 2.33s/it
Train loss 2526 0.214556 Grad Norm 0.471598 5.00s/it
Train loss 2527 0.162049 Grad Norm 0.768105 3.42s/it
Train loss 2528 0.225387 Grad Norm 1.729424 2.83s/it
Train loss 2529 0.192506 Grad Norm 0.385414 4.60s/it
Train loss 2530 0.158319 Grad Norm 0.739273 6.50s/it
Train loss 2531 0.183582 Grad Norm 0.972450 5.66s/it
Train loss 2532 0.238753 Grad Norm 1.034342 1.57s/it
Train loss 2533 0.173781 Grad Norm 1.477661 5.61s/it
Train loss 2534 0.194428 Grad Norm 1.566671 3.65s/it
Train loss 2535 0.139451 Grad Norm 0.974610 5.51s/it
Train loss 2536 0.171888 Grad Norm 0.833498 4.76s/it
Train loss 2537 0.197929 Grad Norm 1.386941 4.19s/it
Train loss 2538 0.190861 Grad Norm 1.174437 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 2605 0.219002 Grad Norm 1.447137 3.64s/it
Train loss 2606 0.167514 Grad Norm 0.622643 3.28s/it
Train loss 2607 0.213110 Grad Norm 0.558628 3.66s/it
Train loss 2608 0.177342 Grad Norm 0.600126 3.05s/it
Train loss 2609 0.147738 Grad Norm 0.371544 2.17s/it
Train loss 2610 0.120684 Grad Norm 0.409185 5.27s/it
Train loss 2611 0.143779 Grad Norm 0.517717 4.93s/it
Train loss 2612 0.210245 Grad Norm 1.148625 2.82s/it
Train loss 2613 0.140376 Grad Norm 0.207215 5.33s/it
Train loss 2614 0.171382 Grad Norm 1.180619 2.87s/it
Train loss 2615 0.161887 Grad Norm 0.696516 4.15s/it
Train loss 2616 0.194475 Grad Norm 0.911025 1.98s/it
Train loss 2617 0.145968 Grad Norm 0.841578 6.34s/it
Train loss 2618 0.175018 Grad Norm 0.770546 2.92s/it
Train loss 2619 0.170155 Grad Norm 0.890564 4.40s/it
Train loss 2620 0.137287 Grad Norm 0.669423 5.19s/it
Train loss 2621 0.183692 Grad Norm 0.673168 4.09s/it
Train loss 2622 0.184887 Grad Norm 0.389157 3.32s/it
Train loss 2623 0.113838 Grad Norm 0.354344 5.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 2696 0.202408 Grad Norm 0.673351 3.56s/it
Train loss 2697 0.165102 Grad Norm 0.335076 1.99s/it
Train loss 2698 0.196631 Grad Norm 0.735631 4.02s/it
Train loss 2699 0.160078 Grad Norm 0.460338 3.02s/it
Train loss 2700 0.197014 Grad Norm 0.809319 3.86s/it
Train loss 2701 0.244251 Grad Norm 0.693550 2.24s/it
Train loss 2702 0.121624 Grad Norm 0.368553 5.42s/it
Train loss 2703 0.219651 Grad Norm 1.048648 5.45s/it
Train loss 2704 0.238863 Grad Norm 0.667441 1.89s/it
Train loss 2705 0.190638 Grad Norm 0.476436 3.22s/it
Train loss 2706 0.176098 Grad Norm 0.655713 4.17s/it
Train loss 2707 0.139791 Grad Norm 0.527930 5.79s/it
Train loss 2708 0.200165 Grad Norm 0.655843 2.50s/it
Train loss 2709 0.161490 Grad Norm 0.967780 2.46s/it
Train loss 2710 0.150639 Grad Norm 0.585576 3.30s/it
Train loss 2711 0.201446 Grad Norm 1.053731 4.28s/it
Train loss 2712 0.191539 Grad Norm 0.844514 3.71s/it
Train loss 2713 0.226532 Grad Norm 1.245294 2.99s/it
Train loss 2714 0.178719 Grad Norm 0.857392 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 2772 0.234678 Grad Norm 3.023557 3.84s/it
Train loss 2773 0.179202 Grad Norm 1.253308 4.55s/it
Train loss 2774 0.183358 Grad Norm 0.552771 3.09s/it
Train loss 2775 0.178783 Grad Norm 0.999684 4.08s/it
Train loss 2776 0.180692 Grad Norm 2.000332 2.41s/it
Train loss 2777 0.126421 Grad Norm 1.180229 6.68s/it
Train loss 2778 0.199631 Grad Norm 0.876502 2.44s/it
Train loss 2779 0.160442 Grad Norm 1.435358 4.01s/it
Train loss 2780 0.182576 Grad Norm 0.738309 3.32s/it
Train loss 2781 0.245166 Grad Norm 0.550881 2.58s/it
Train loss 2782 0.138691 Grad Norm 0.378984 5.13s/it
Train loss 2783 0.147258 Grad Norm 0.513688 4.98s/it
Train loss 2784 0.219047 Grad Norm 0.540639 5.53s/it
Train loss 2785 0.179204 Grad Norm 0.398429 5.06s/it
Train loss 2786 0.141054 Grad Norm 0.616858 3.35s/it
Train loss 2787 0.178843 Grad Norm 0.711960 5.26s/it
Train loss 2788 0.198033 Grad Norm 0.802343 4.30s/it
Train loss 2789 0.136261 Grad Norm 0.753550 3.02s/it
Train loss 2790 0.215287 Grad Norm 0.880603 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 2870 0.134624 Grad Norm 0.745961 4.24s/it
Train loss 2871 0.135066 Grad Norm 0.657780 5.72s/it
Train loss 2872 0.160959 Grad Norm 1.467126 3.16s/it
Train loss 2873 0.186617 Grad Norm 0.418120 3.16s/it
Train loss 2874 0.224019 Grad Norm 0.850940 2.08s/it
Train loss 2875 0.204066 Grad Norm 1.237709 2.40s/it
Train loss 2876 0.164107 Grad Norm 0.480307 3.22s/it
Train loss 2877 0.124746 Grad Norm 0.547721 5.16s/it
Train loss 2878 0.128843 Grad Norm 0.567794 4.95s/it
Train loss 2879 0.102261 Grad Norm 0.449193 3.82s/it
Train loss 2880 0.183452 Grad Norm 0.448380 4.62s/it
Train loss 2881 0.230650 Grad Norm 1.610206 3.21s/it
Train loss 2882 0.211672 Grad Norm 0.844259 2.69s/it
Train loss 2883 0.121042 Grad Norm 0.589967 2.95s/it
Train loss 2884 0.146048 Grad Norm 0.946323 6.31s/it
Train loss 2885 0.222331 Grad Norm 0.919119 3.70s/it
Train loss 2886 0.214158 Grad Norm 0.606463 2.23s/it
Train loss 2887 0.200394 Grad Norm 1.427408 3.59s/it
Train loss 2888 0.186322 Grad Norm 0.991101 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 2940 0.177970 Grad Norm 0.865518 3.61s/it
Train loss 2941 0.191640 Grad Norm 0.953983 3.05s/it
Train loss 2942 0.159924 Grad Norm 1.016657 5.62s/it
Train loss 2943 0.184696 Grad Norm 0.657180 5.24s/it
Train loss 2944 0.136815 Grad Norm 0.311737 5.11s/it
Train loss 2945 0.193937 Grad Norm 1.102977 4.85s/it
Train loss 2946 0.177850 Grad Norm 0.600820 3.89s/it
Train loss 2947 0.225195 Grad Norm 0.535810 2.32s/it
Train loss 2948 0.133216 Grad Norm 0.532687 6.30s/it
Train loss 2949 0.163127 Grad Norm 0.370694 2.90s/it
Train loss 2950 0.167411 Grad Norm 1.621308 5.26s/it
Train loss 2951 0.176053 Grad Norm 1.574642 2.90s/it
Train loss 2952 0.110731 Grad Norm 0.487837 4.57s/it
Train loss 2953 0.134777 Grad Norm 0.580622 6.40s/it
Train loss 2954 0.212000 Grad Norm 1.640917 3.41s/it
Train loss 2955 0.197119 Grad Norm 0.991455 2.69s/it
Train loss 2956 0.207279 Grad Norm 1.121144 4.34s/it
Train loss 2957 0.181992 Grad Norm 0.790482 3.29s/it
Train loss 2958 0.147460 Grad Norm 0.513887 5.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 3024 0.191786 Grad Norm 1.405452 2.62s/it
Train loss 3025 0.112609 Grad Norm 0.427666 5.60s/it
Train loss 3026 0.133482 Grad Norm 0.622012 7.37s/it
Train loss 3027 0.138612 Grad Norm 0.608797 4.51s/it
Train loss 3028 0.146575 Grad Norm 0.589213 6.30s/it
Train loss 3029 0.169956 Grad Norm 0.798230 6.47s/it
Train loss 3030 0.215769 Grad Norm 0.685500 2.21s/it
Train loss 3031 0.183792 Grad Norm 0.457786 3.19s/it
Train loss 3032 0.123311 Grad Norm 0.647767 3.12s/it
Train loss 3033 0.113418 Grad Norm 0.444571 3.82s/it
Train loss 3034 0.219893 Grad Norm 1.002114 3.59s/it
Train loss 3035 0.147060 Grad Norm 0.847686 4.85s/it
Train loss 3036 0.126845 Grad Norm 0.866641 4.43s/it
Train loss 3037 0.201771 Grad Norm 1.417952 2.81s/it
Train loss 3038 0.201496 Grad Norm 0.711005 2.42s/it
Train loss 3039 0.189642 Grad Norm 1.065392 2.86s/it
Train loss 3040 0.146831 Grad Norm 0.878236 2.66s/it
Train loss 3041 0.149206 Grad Norm 0.895434 7.63s/it
Train loss 3042 0.165226 Grad Norm 0.620274 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 3121 0.206174 Grad Norm 1.635268 2.37s/it
Train loss 3122 0.129409 Grad Norm 0.541605 3.92s/it
Train loss 3123 0.127923 Grad Norm 0.727887 4.11s/it
Train loss 3124 0.142871 Grad Norm 1.196777 5.41s/it
Train loss 3125 0.172418 Grad Norm 0.786408 4.61s/it
Train loss 3126 0.179267 Grad Norm 0.493717 3.98s/it
Train loss 3127 0.138676 Grad Norm 0.562011 4.13s/it
Train loss 3128 0.173554 Grad Norm 0.809794 4.37s/it
Train loss 3129 0.203389 Grad Norm 0.936922 3.82s/it
Train loss 3130 0.107799 Grad Norm 0.271605 5.27s/it
Train loss 3131 0.196622 Grad Norm 0.845737 2.75s/it
Train loss 3132 0.151376 Grad Norm 0.907883 4.87s/it
Train loss 3133 0.146811 Grad Norm 1.111855 3.35s/it
Train loss 3134 0.129029 Grad Norm 0.325036 5.56s/it
Train loss 3135 0.148618 Grad Norm 0.781844 4.88s/it
Train loss 3136 0.151883 Grad Norm 1.506501 2.39s/it
Train loss 3137 0.186718 Grad Norm 0.684337 2.81s/it
Train loss 3138 0.140549 Grad Norm 0.481049 5.46s/it
Train loss 3139 0.169294 Grad Norm 1.138982 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 3193 0.190060 Grad Norm 0.846898 8.21s/it
Train loss 3194 0.191303 Grad Norm 1.403589 5.51s/it
Train loss 3195 0.243951 Grad Norm 3.623342 4.40s/it
Train loss 3196 0.190861 Grad Norm 1.540105 3.88s/it
Train loss 3197 0.177400 Grad Norm 0.413497 3.02s/it
Train loss 3198 0.148509 Grad Norm 1.084927 3.92s/it
Train loss 3199 0.225701 Grad Norm 2.765315 2.74s/it
Train loss 3200 0.118678 Grad Norm 0.998037 5.74s/it
Train loss 3201 0.206268 Grad Norm 1.063270 2.77s/it
Train loss 3202 0.246237 Grad Norm 2.320564 2.12s/it
Train loss 3203 0.118724 Grad Norm 0.386027 6.25s/it
Train loss 3204 0.220347 Grad Norm 1.009854 3.39s/it
Train loss 3205 0.199297 Grad Norm 1.980557 5.34s/it
Train loss 3206 0.117265 Grad Norm 0.908847 6.41s/it
Train loss 3207 0.142943 Grad Norm 1.016083 3.97s/it
Train loss 3208 0.159861 Grad Norm 1.181606 7.18s/it
Train loss 3209 0.156739 Grad Norm 0.530111 3.37s/it
Train loss 3210 0.165063 Grad Norm 0.938951 2.21s/it
Train loss 3211 0.198347 Grad Norm 1.162449 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 3278 0.142556 Grad Norm 0.471943 4.50s/it
Train loss 3279 0.180231 Grad Norm 1.518592 3.35s/it
Train loss 3280 0.146771 Grad Norm 0.582567 5.36s/it
Train loss 3281 0.157266 Grad Norm 0.612536 4.09s/it
Train loss 3282 0.182573 Grad Norm 0.660612 3.60s/it
Train loss 3283 0.185199 Grad Norm 0.654932 2.23s/it
Train loss 3284 0.228291 Grad Norm 0.823378 3.02s/it
Train loss 3285 0.117342 Grad Norm 0.344823 5.18s/it
Train loss 3286 0.190464 Grad Norm 1.233223 2.22s/it
Train loss 3287 0.142770 Grad Norm 0.752096 3.89s/it
Train loss 3288 0.125129 Grad Norm 0.391914 5.09s/it
Train loss 3289 0.224013 Grad Norm 1.801797 2.41s/it
Train loss 3290 0.184314 Grad Norm 1.613460 2.82s/it
Train loss 3291 0.156903 Grad Norm 0.599418 3.59s/it
Train loss 3292 0.203082 Grad Norm 1.804782 4.54s/it
Train loss 3293 0.208212 Grad Norm 0.874268 2.02s/it
Train loss 3294 0.169146 Grad Norm 0.530569 3.76s/it
Train loss 3295 0.246299 Grad Norm 0.514892 3.21s/it
Train loss 3296 0.155899 Grad Norm 0.908248 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 3360 0.169036 Grad Norm 0.552772 4.38s/it
Train loss 3361 0.229614 Grad Norm 1.610333 2.19s/it
Train loss 3362 0.216077 Grad Norm 1.838605 1.61s/it
Train loss 3363 0.199022 Grad Norm 0.807261 2.63s/it
Train loss 3364 0.177254 Grad Norm 1.138712 3.62s/it
Train loss 3365 0.209351 Grad Norm 1.844537 5.62s/it
Train loss 3366 0.118525 Grad Norm 0.768582 7.40s/it
Train loss 3367 0.131018 Grad Norm 0.565933 3.39s/it
Train loss 3368 0.230829 Grad Norm 1.338274 4.16s/it
Train loss 3369 0.160748 Grad Norm 0.613241 3.34s/it
Train loss 3370 0.228753 Grad Norm 0.618105 5.38s/it
Train loss 3371 0.173865 Grad Norm 1.038463 2.55s/it
Train loss 3372 0.132364 Grad Norm 0.895689 2.26s/it
Train loss 3373 0.226449 Grad Norm 0.516073 5.27s/it
Train loss 3374 0.204218 Grad Norm 0.481054 2.46s/it
Train loss 3375 0.107485 Grad Norm 0.374675 4.33s/it
Train loss 3376 0.134093 Grad Norm 0.336749 7.66s/it
Train loss 3377 0.176457 Grad Norm 0.581497 2.54s/it
Train loss 3378 0.243375 Grad Norm 1.074757 1.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 3449 0.178389 Grad Norm 1.033889 1.97s/it
Train loss 3450 0.170065 Grad Norm 1.525595 3.60s/it
Train loss 3451 0.109418 Grad Norm 0.888767 7.24s/it
Train loss 3452 0.202818 Grad Norm 1.380538 2.95s/it
Train loss 3453 0.180038 Grad Norm 0.740960 2.83s/it
Train loss 3454 0.197541 Grad Norm 1.560318 3.12s/it
Train loss 3455 0.157527 Grad Norm 1.410319 6.55s/it
Train loss 3456 0.142375 Grad Norm 0.886897 2.97s/it
Train loss 3457 0.156897 Grad Norm 0.677399 5.03s/it
Train loss 3458 0.133999 Grad Norm 1.007311 4.93s/it
Train loss 3459 0.220695 Grad Norm 1.538834 2.54s/it
Train loss 3460 0.129721 Grad Norm 0.918867 5.67s/it
Train loss 3461 0.171289 Grad Norm 1.439187 5.22s/it
Train loss 3462 0.127724 Grad Norm 0.909055 8.37s/it
Train loss 3463 0.137543 Grad Norm 1.226702 3.06s/it
Train loss 3464 0.186224 Grad Norm 0.980992 3.35s/it
Train loss 3465 0.118619 Grad Norm 0.533501 5.30s/it
Train loss 3466 0.215351 Grad Norm 0.745037 3.93s/it
Train loss 3467 0.172603 Grad Norm 0.764058 4.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 3533 0.196490 Grad Norm 0.921336 1.92s/it
Train loss 3534 0.105162 Grad Norm 0.522362 4.49s/it
Train loss 3535 0.103971 Grad Norm 0.556574 6.75s/it
Train loss 3536 0.268376 Grad Norm 0.840398 2.07s/it
Train loss 3537 0.166548 Grad Norm 0.955335 7.54s/it
Train loss 3538 0.123659 Grad Norm 0.728866 5.51s/it
Train loss 3539 0.205017 Grad Norm 0.408115 4.25s/it
Train loss 3540 0.112546 Grad Norm 0.471738 5.66s/it
Train loss 3541 0.101059 Grad Norm 0.770061 5.13s/it
Train loss 3542 0.184514 Grad Norm 0.722243 2.02s/it
Train loss 3543 0.183722 Grad Norm 0.553069 3.61s/it
Train loss 3544 0.178643 Grad Norm 1.771626 4.60s/it
Train loss 3545 0.175850 Grad Norm 1.634542 5.91s/it
Train loss 3546 0.113666 Grad Norm 0.274982 3.38s/it
Train loss 3547 0.263409 Grad Norm 1.919926 2.88s/it
Train loss 3548 0.149764 Grad Norm 0.886533 5.33s/it
Train loss 3549 0.136476 Grad Norm 0.835489 4.96s/it
Train loss 3550 0.132544 Grad Norm 0.996691 4.72s/it
Train loss 3551 0.179324 Grad Norm 2.339570 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 3614 0.225278 Grad Norm 1.240540 2.83s/it
Train loss 3615 0.181821 Grad Norm 0.523080 2.45s/it
Train loss 3616 0.219554 Grad Norm 0.577169 1.95s/it
Train loss 3617 0.214995 Grad Norm 1.285897 2.57s/it
Train loss 3618 0.136847 Grad Norm 0.401414 4.87s/it
Train loss 3619 0.214056 Grad Norm 1.210301 3.72s/it
Train loss 3620 0.135716 Grad Norm 0.516095 3.40s/it
Train loss 3621 0.127393 Grad Norm 0.596118 6.46s/it
Train loss 3622 0.156745 Grad Norm 0.494728 2.50s/it
Train loss 3623 0.220176 Grad Norm 2.086511 4.98s/it
Train loss 3624 0.148466 Grad Norm 0.700643 3.98s/it
Train loss 3625 0.240380 Grad Norm 2.330007 2.67s/it
Train loss 3626 0.202100 Grad Norm 1.247854 4.59s/it
Train loss 3627 0.190812 Grad Norm 0.450279 5.32s/it
Train loss 3628 0.118753 Grad Norm 0.475746 7.47s/it
Train loss 3629 0.210997 Grad Norm 0.914967 3.33s/it
Train loss 3630 0.218757 Grad Norm 0.760831 2.35s/it
Train loss 3631 0.139276 Grad Norm 0.686033 5.67s/it
Train loss 3632 0.112005 Grad Norm 1.156089 5.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 3709 0.199343 Grad Norm 0.955152 5.69s/it
Train loss 3710 0.150828 Grad Norm 0.594724 3.45s/it
Train loss 3711 0.139163 Grad Norm 0.266651 5.18s/it
Train loss 3712 0.196914 Grad Norm 0.561063 2.78s/it
Train loss 3713 0.187417 Grad Norm 0.945224 2.63s/it
Train loss 3714 0.183779 Grad Norm 0.634917 2.90s/it
Train loss 3715 0.164245 Grad Norm 0.677850 4.49s/it
Train loss 3716 0.206377 Grad Norm 1.253829 2.40s/it
Train loss 3717 0.147259 Grad Norm 0.296315 4.65s/it
Train loss 3718 0.209662 Grad Norm 1.110975 2.12s/it
Train loss 3719 0.217139 Grad Norm 0.954411 3.66s/it
Train loss 3720 0.190394 Grad Norm 0.382766 3.04s/it
Train loss 3721 0.220206 Grad Norm 2.020215 3.27s/it
Train loss 3722 0.159451 Grad Norm 0.395663 5.39s/it
Train loss 3723 0.218402 Grad Norm 0.826247 2.89s/it
Train loss 3724 0.150274 Grad Norm 0.295587 4.09s/it
Train loss 3725 0.200686 Grad Norm 0.582428 2.32s/it
Train loss 3726 0.183752 Grad Norm 1.377921 4.97s/it
Train loss 3727 0.136988 Grad Norm 1.306674 5.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 3798 0.164987 Grad Norm 1.049951 3.70s/it
Train loss 3799 0.214229 Grad Norm 0.517399 1.39s/it
Train loss 3800 0.164000 Grad Norm 1.046441 6.42s/it
Train loss 3801 0.199659 Grad Norm 1.239757 2.91s/it
Train loss 3802 0.210171 Grad Norm 0.692697 2.19s/it
Train loss 3803 0.169119 Grad Norm 0.529815 3.41s/it
Train loss 3804 0.250559 Grad Norm 4.225179 1.78s/it
Train loss 3805 0.183991 Grad Norm 0.829240 2.89s/it
Train loss 3806 0.139849 Grad Norm 1.112156 5.10s/it
Train loss 3807 0.195824 Grad Norm 1.309325 2.65s/it
Train loss 3808 0.157700 Grad Norm 0.781603 6.58s/it
Train loss 3809 0.206621 Grad Norm 1.083876 4.03s/it
Train loss 3810 0.224852 Grad Norm 0.778008 2.87s/it
Train loss 3811 0.202114 Grad Norm 1.229858 2.34s/it
Train loss 3812 0.217504 Grad Norm 1.777278 3.29s/it
Train loss 3813 0.228507 Grad Norm 1.236129 3.16s/it
Train loss 3814 0.247226 Grad Norm 0.756659 2.41s/it
Train loss 3815 0.143593 Grad Norm 0.807722 5.66s/it
Train loss 3816 0.148746 Grad Norm 0.989044 5.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 3890 0.196368 Grad Norm 0.879506 2.09s/it
Train loss 3891 0.244459 Grad Norm 2.519087 4.01s/it
Train loss 3892 0.204972 Grad Norm 1.577397 3.64s/it
Train loss 3893 0.197224 Grad Norm 1.319827 3.12s/it
Train loss 3894 0.175489 Grad Norm 0.520947 4.67s/it
Train loss 3895 0.198487 Grad Norm 1.359336 3.38s/it
Train loss 3896 0.215718 Grad Norm 1.073665 3.53s/it
Train loss 3897 0.151999 Grad Norm 0.661312 4.64s/it
Train loss 3898 0.232149 Grad Norm 1.497532 2.54s/it
Train loss 3899 0.133375 Grad Norm 0.463355 5.16s/it
Train loss 3900 0.136307 Grad Norm 0.792416 3.97s/it
Train loss 3901 0.136331 Grad Norm 0.943445 2.74s/it
Train loss 3902 0.169545 Grad Norm 0.473218 7.89s/it
Train loss 3903 0.122998 Grad Norm 0.524012 5.86s/it
Train loss 3904 0.191108 Grad Norm 0.829495 4.41s/it
Train loss 3905 0.105819 Grad Norm 0.524153 8.22s/it
Train loss 3906 0.164718 Grad Norm 1.085743 2.23s/it
Train loss 3907 0.255033 Grad Norm 2.328741 2.60s/it
Train loss 3908 0.123312 Grad Norm 0.909183 6.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 3949 0.197367 Grad Norm 0.792451 3.14s/it
Train loss 3950 0.185903 Grad Norm 0.693654 3.51s/it
Train loss 3951 0.193893 Grad Norm 1.834094 2.08s/it
Train loss 3952 0.145302 Grad Norm 0.788189 5.71s/it
Train loss 3953 0.201052 Grad Norm 0.807779 6.69s/it
Train loss 3954 0.161570 Grad Norm 0.505866 4.87s/it
Train loss 3955 0.195671 Grad Norm 1.388499 4.32s/it
Train loss 3956 0.219397 Grad Norm 1.617052 5.07s/it
Train loss 3957 0.166191 Grad Norm 0.552609 5.44s/it
Train loss 3958 0.145925 Grad Norm 1.406206 5.75s/it
Train loss 3959 0.152743 Grad Norm 0.959925 4.15s/it
Train loss 3960 0.221370 Grad Norm 1.231028 3.04s/it
Train loss 3961 0.163590 Grad Norm 0.961773 3.42s/it
Train loss 3962 0.125908 Grad Norm 0.637169 8.24s/it
Train loss 3963 0.198932 Grad Norm 0.702107 5.25s/it
Train loss 3964 0.225859 Grad Norm 2.310808 3.37s/it
Train loss 3965 0.267940 Grad Norm 2.739209 1.69s/it
Train loss 3966 0.122640 Grad Norm 0.405495 5.15s/it
Train loss 3967 0.230700 Grad Norm 0.901902 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 4040 0.133751 Grad Norm 0.651360 3.70s/it
Train loss 4041 0.157631 Grad Norm 0.651197 7.33s/it
Train loss 4042 0.178599 Grad Norm 0.483617 2.99s/it
Train loss 4043 0.187836 Grad Norm 0.767039 6.40s/it
Train loss 4044 0.221496 Grad Norm 0.665126 3.17s/it
Train loss 4045 0.185272 Grad Norm 0.850029 5.60s/it
Train loss 4046 0.195169 Grad Norm 0.975755 3.15s/it
Train loss 4047 0.187952 Grad Norm 0.476152 3.21s/it
Train loss 4048 0.231015 Grad Norm 1.041143 3.87s/it
Train loss 4049 0.235438 Grad Norm 0.893582 2.91s/it
Train loss 4050 0.127806 Grad Norm 0.529229 5.76s/it
Train loss 4051 0.197090 Grad Norm 1.179263 3.20s/it
Train loss 4052 0.123924 Grad Norm 0.698185 2.29s/it
Train loss 4053 0.147461 Grad Norm 0.410127 7.75s/it
Train loss 4054 0.155966 Grad Norm 0.714112 3.01s/it
Train loss 4055 0.156159 Grad Norm 0.661088 4.40s/it
Train loss 4056 0.184101 Grad Norm 1.309681 3.43s/it
Train loss 4057 0.151687 Grad Norm 0.621250 4.48s/it
Train loss 4058 0.181976 Grad Norm 0.842434 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 4116 0.196608 Grad Norm 0.971834 4.80s/it
Train loss 4117 0.211435 Grad Norm 0.929510 3.97s/it
Train loss 4118 0.147367 Grad Norm 0.517109 4.18s/it
Train loss 4119 0.181801 Grad Norm 0.801777 3.53s/it
Train loss 4120 0.161522 Grad Norm 0.619222 3.89s/it
Train loss 4121 0.140495 Grad Norm 0.369798 6.51s/it
Train loss 4122 0.211157 Grad Norm 0.771658 2.35s/it
Train loss 4123 0.134028 Grad Norm 0.392202 3.67s/it
Train loss 4124 0.199577 Grad Norm 0.668405 4.92s/it
Train loss 4125 0.173997 Grad Norm 0.594484 5.32s/it
Train loss 4126 0.199459 Grad Norm 0.472121 3.38s/it
Train loss 4127 0.205321 Grad Norm 0.562769 2.36s/it
Train loss 4128 0.267889 Grad Norm 0.960258 1.30s/it
Train loss 4129 0.141415 Grad Norm 0.899641 3.30s/it
Train loss 4130 0.108821 Grad Norm 0.599338 1.97s/it
Train loss 4131 0.101450 Grad Norm 0.411540 8.23s/it
Train loss 4132 0.167532 Grad Norm 0.704368 4.31s/it
Train loss 4133 0.196323 Grad Norm 0.882517 3.75s/it
Train loss 4134 0.152837 Grad Norm 0.317888 5.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 4203 0.132726 Grad Norm 0.599218 6.65s/it
Train loss 4204 0.201764 Grad Norm 0.966696 2.33s/it
Train loss 4205 0.116261 Grad Norm 0.692846 2.29s/it
Train loss 4206 0.196849 Grad Norm 0.902073 5.26s/it
Train loss 4207 0.185052 Grad Norm 0.501832 2.19s/it
Train loss 4208 0.201199 Grad Norm 0.871985 3.55s/it
Train loss 4209 0.139346 Grad Norm 0.375241 2.81s/it
Train loss 4210 0.231587 Grad Norm 0.840692 3.23s/it
Train loss 4211 0.195666 Grad Norm 0.480045 1.25s/it
Train loss 4212 0.146047 Grad Norm 0.542728 3.83s/it
Train loss 4213 0.165021 Grad Norm 0.435643 4.14s/it
Train loss 4214 0.213157 Grad Norm 1.358385 1.86s/it
Train loss 4215 0.234120 Grad Norm 1.352445 3.11s/it
Train loss 4216 0.139712 Grad Norm 0.855144 5.31s/it
Train loss 4217 0.140229 Grad Norm 0.784672 4.28s/it
Train loss 4218 0.178879 Grad Norm 1.175890 3.94s/it
Train loss 4219 0.112070 Grad Norm 0.764442 7.39s/it
Train loss 4220 0.165471 Grad Norm 1.173868 4.52s/it
Train loss 4221 0.139205 Grad Norm 0.482566 4.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 4284 0.213600 Grad Norm 0.656914 6.17s/it
Train loss 4285 0.174760 Grad Norm 0.875513 2.99s/it
Train loss 4286 0.193438 Grad Norm 1.401930 3.40s/it
Train loss 4287 0.159082 Grad Norm 0.308948 2.40s/it
Train loss 4288 0.120606 Grad Norm 0.587168 5.71s/it
Train loss 4289 0.151336 Grad Norm 1.200171 5.94s/it
Train loss 4290 0.140360 Grad Norm 0.600811 3.82s/it
Train loss 4291 0.151549 Grad Norm 0.919734 3.63s/it
Train loss 4292 0.200715 Grad Norm 1.025350 4.22s/it
Train loss 4293 0.156571 Grad Norm 0.317524 5.78s/it
Train loss 4294 0.170532 Grad Norm 0.824045 3.23s/it
Train loss 4295 0.165513 Grad Norm 0.413019 3.93s/it
Train loss 4296 0.206906 Grad Norm 0.845841 3.09s/it
Train loss 4297 0.145919 Grad Norm 0.227177 2.92s/it
Train loss 4298 0.172112 Grad Norm 0.877634 3.53s/it
Train loss 4299 0.148189 Grad Norm 0.508014 5.36s/it
Train loss 4300 0.170032 Grad Norm 0.432815 3.96s/it
Train loss 4301 0.172486 Grad Norm 0.365085 3.37s/it
Train loss 4302 0.145186 Grad Norm 0.512804 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 4378 0.197046 Grad Norm 1.028080 3.86s/it
Train loss 4379 0.171438 Grad Norm 1.186651 4.10s/it
Train loss 4380 0.183443 Grad Norm 0.989958 4.95s/it
Train loss 4381 0.141679 Grad Norm 0.476811 4.79s/it
Train loss 4382 0.178431 Grad Norm 2.185495 5.88s/it
Train loss 4383 0.238139 Grad Norm 2.828649 3.80s/it
Train loss 4384 0.172828 Grad Norm 0.500338 3.14s/it
Train loss 4385 0.121724 Grad Norm 0.650512 7.29s/it
Train loss 4386 0.122984 Grad Norm 1.124641 6.57s/it
Train loss 4387 0.131654 Grad Norm 0.819405 3.44s/it
Train loss 4388 0.243194 Grad Norm 0.677516 2.92s/it
Train loss 4389 0.174436 Grad Norm 1.407448 5.39s/it
Train loss 4390 0.126050 Grad Norm 0.615176 5.22s/it
Train loss 4391 0.253299 Grad Norm 1.347891 3.01s/it
Train loss 4392 0.184021 Grad Norm 0.571598 3.92s/it
Train loss 4393 0.151032 Grad Norm 0.767114 4.17s/it
Train loss 4394 0.193911 Grad Norm 1.017290 3.95s/it
Train loss 4395 0.197757 Grad Norm 0.997431 4.43s/it
Train loss 4396 0.184046 Grad Norm 0.979657 4.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 4453 0.159335 Grad Norm 0.720416 6.65s/it
Train loss 4454 0.174878 Grad Norm 1.075954 8.39s/it
Train loss 4455 0.204874 Grad Norm 1.196792 3.04s/it
Train loss 4456 0.149769 Grad Norm 0.595349 2.93s/it
Train loss 4457 0.210159 Grad Norm 0.849366 5.23s/it
Train loss 4458 0.170285 Grad Norm 1.289054 2.76s/it
Train loss 4459 0.134673 Grad Norm 0.519376 4.12s/it
Train loss 4460 0.144694 Grad Norm 0.734193 2.29s/it
Train loss 4461 0.156521 Grad Norm 0.881809 5.60s/it
Train loss 4462 0.175427 Grad Norm 0.631130 5.43s/it
Train loss 4463 0.164672 Grad Norm 0.731012 4.08s/it
Train loss 4464 0.154664 Grad Norm 0.875047 3.56s/it
Train loss 4465 0.140979 Grad Norm 0.579526 3.56s/it
Train loss 4466 0.180759 Grad Norm 1.187582 3.17s/it
Train loss 4467 0.168651 Grad Norm 0.765467 7.34s/it
Train loss 4468 0.205241 Grad Norm 0.659227 3.24s/it
Train loss 4469 0.138768 Grad Norm 1.301389 3.33s/it
Train loss 4470 0.145478 Grad Norm 0.996364 2.29s/it
Train loss 4471 0.193329 Grad Norm 0.839798 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 4536 0.161621 Grad Norm 0.533979 3.83s/it
Train loss 4537 0.144022 Grad Norm 0.339074 4.13s/it
Train loss 4538 0.194465 Grad Norm 0.677694 2.22s/it
Train loss 4539 0.146170 Grad Norm 0.638080 3.80s/it
Train loss 4540 0.193028 Grad Norm 0.801201 3.65s/it
Train loss 4541 0.131485 Grad Norm 0.621430 2.82s/it
Train loss 4542 0.203047 Grad Norm 0.291244 5.64s/it
Train loss 4543 0.221033 Grad Norm 0.684609 2.34s/it
Train loss 4544 0.124021 Grad Norm 0.189794 6.37s/it
Train loss 4545 0.148370 Grad Norm 0.422608 4.02s/it
Train loss 4546 0.158407 Grad Norm 0.762750 2.66s/it
Train loss 4547 0.174297 Grad Norm 0.545220 3.18s/it
Train loss 4548 0.140600 Grad Norm 0.393197 2.21s/it
Train loss 4549 0.140416 Grad Norm 0.464829 4.89s/it
Train loss 4550 0.144062 Grad Norm 0.936759 4.98s/it
Train loss 4551 0.183174 Grad Norm 0.467554 4.54s/it
Train loss 4552 0.165309 Grad Norm 0.543152 4.65s/it
Train loss 4553 0.199878 Grad Norm 0.892538 2.82s/it
Train loss 4554 0.205730 Grad Norm 0.932308 4.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 4637 0.228691 Grad Norm 2.333058 3.70s/it
Train loss 4638 0.169200 Grad Norm 0.698211 4.09s/it
Train loss 4639 0.203719 Grad Norm 1.517484 3.66s/it
Train loss 4640 0.183857 Grad Norm 1.743647 3.96s/it
Train loss 4641 0.165286 Grad Norm 0.385076 4.78s/it
Train loss 4642 0.129126 Grad Norm 0.522317 2.79s/it
Train loss 4643 0.157831 Grad Norm 1.388044 4.04s/it
Train loss 4644 0.101898 Grad Norm 0.423366 4.36s/it
Train loss 4645 0.199561 Grad Norm 0.834542 3.02s/it
Train loss 4646 0.203640 Grad Norm 1.009751 2.32s/it
Train loss 4647 0.150082 Grad Norm 0.676600 4.20s/it
Train loss 4648 0.187365 Grad Norm 0.785394 5.66s/it
Train loss 4649 0.209282 Grad Norm 0.746987 5.16s/it
Train loss 4650 0.192957 Grad Norm 1.381586 2.25s/it
Train loss 4651 0.159936 Grad Norm 1.130817 4.81s/it
Train loss 4652 0.173651 Grad Norm 0.551595 8.23s/it
Train loss 4653 0.173903 Grad Norm 1.088823 3.91s/it
Train loss 4654 0.116918 Grad Norm 0.913101 5.19s/it
Train loss 4655 0.230492 Grad Norm 0.410920 1.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 4704 0.186614 Grad Norm 0.576094 4.47s/it
Train loss 4705 0.183525 Grad Norm 0.982853 3.68s/it
Train loss 4706 0.152449 Grad Norm 0.563195 2.87s/it
Train loss 4707 0.163259 Grad Norm 0.452880 4.84s/it
Train loss 4708 0.211330 Grad Norm 1.246182 2.34s/it
Train loss 4709 0.170254 Grad Norm 1.465691 3.72s/it
Train loss 4710 0.217552 Grad Norm 0.478904 3.01s/it
Train loss 4711 0.167903 Grad Norm 1.210646 5.10s/it
Train loss 4712 0.244707 Grad Norm 1.981416 2.57s/it
Train loss 4713 0.218026 Grad Norm 0.712393 2.58s/it
Train loss 4714 0.187545 Grad Norm 1.412009 2.35s/it
Train loss 4715 0.229326 Grad Norm 1.543106 2.29s/it
Train loss 4716 0.260836 Grad Norm 0.910175 1.91s/it
Train loss 4717 0.199434 Grad Norm 0.847173 5.01s/it
Train loss 4718 0.139887 Grad Norm 1.066681 7.40s/it
Train loss 4719 0.189780 Grad Norm 0.812018 1.61s/it
Train loss 4720 0.174034 Grad Norm 0.565484 3.32s/it
Train loss 4721 0.193708 Grad Norm 0.481167 4.20s/it
Train loss 4722 0.252086 Grad Norm 2.916862 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 4788 0.207459 Grad Norm 2.175468 4.44s/it
Train loss 4789 0.167829 Grad Norm 1.493216 6.81s/it
Train loss 4790 0.135888 Grad Norm 0.547574 5.40s/it
Train loss 4791 0.188849 Grad Norm 0.985844 5.64s/it
Train loss 4792 0.205772 Grad Norm 1.881617 2.25s/it
Train loss 4793 0.159718 Grad Norm 0.476466 3.31s/it
Train loss 4794 0.143423 Grad Norm 0.389125 4.47s/it
Train loss 4795 0.175682 Grad Norm 1.858344 5.19s/it
Train loss 4796 0.249584 Grad Norm 0.652463 3.25s/it
Train loss 4797 0.217507 Grad Norm 0.459559 1.80s/it
Train loss 4798 0.171133 Grad Norm 0.480287 4.68s/it
Train loss 4799 0.183432 Grad Norm 0.631085 2.87s/it
Train loss 4800 0.151597 Grad Norm 0.641615 3.18s/it
Train loss 4801 0.151723 Grad Norm 0.407595 5.77s/it
Train loss 4802 0.203597 Grad Norm 1.280188 2.84s/it
Train loss 4803 0.256270 Grad Norm 0.812648 2.92s/it
Train loss 4804 0.242804 Grad Norm 0.600732 2.28s/it
Train loss 4805 0.225830 Grad Norm 1.026234 2.79s/it
Train loss 4806 0.234789 Grad Norm 0.808114 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 4872 0.215532 Grad Norm 0.689974 2.66s/it
Train loss 4873 0.161033 Grad Norm 0.686221 2.73s/it
Train loss 4874 0.236090 Grad Norm 1.360149 2.11s/it
Train loss 4875 0.230301 Grad Norm 0.529475 3.40s/it
Train loss 4876 0.210427 Grad Norm 0.593101 4.10s/it
Train loss 4877 0.199030 Grad Norm 0.396102 3.91s/it
Train loss 4878 0.191844 Grad Norm 0.635310 2.42s/it
Train loss 4879 0.208087 Grad Norm 0.624995 2.35s/it
Train loss 4880 0.223191 Grad Norm 0.957126 3.51s/it
Train loss 4881 0.286829 Grad Norm 1.888948 1.55s/it
Train loss 4882 0.187683 Grad Norm 1.337035 3.10s/it
Train loss 4883 0.189324 Grad Norm 0.548277 3.25s/it
Train loss 4884 0.240064 Grad Norm 0.819909 3.05s/it
Train loss 4885 0.193546 Grad Norm 0.495507 2.99s/it
Train loss 4886 0.225982 Grad Norm 1.145656 1.37s/it
Train loss 4887 0.183391 Grad Norm 1.684824 5.34s/it
Train loss 4888 0.197853 Grad Norm 0.864315 4.09s/it
Train loss 4889 0.229526 Grad Norm 1.258661 2.30s/it
Train loss 4890 0.230103 Grad Norm 0.778524 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 4956 0.172495 Grad Norm 0.450610 5.45s/it
Train loss 4957 0.150119 Grad Norm 0.878833 3.91s/it
Train loss 4958 0.127492 Grad Norm 0.933531 4.58s/it
Train loss 4959 0.240353 Grad Norm 0.654542 3.65s/it
Train loss 4960 0.197357 Grad Norm 0.949203 3.61s/it
Train loss 4961 0.217192 Grad Norm 1.817678 1.97s/it
Train loss 4962 0.154102 Grad Norm 1.092325 7.36s/it
Train loss 4963 0.223227 Grad Norm 2.316116 2.30s/it
Train loss 4964 0.222349 Grad Norm 2.398595 1.48s/it
Train loss 4965 0.175706 Grad Norm 1.468384 3.92s/it
Train loss 4966 0.166419 Grad Norm 0.768488 5.36s/it
Train loss 4967 0.205993 Grad Norm 1.435623 3.61s/it
Train loss 4968 0.233169 Grad Norm 3.036113 2.73s/it
Train loss 4969 0.144481 Grad Norm 1.140329 4.86s/it
Train loss 4970 0.188470 Grad Norm 0.814565 3.19s/it
Train loss 4971 0.218418 Grad Norm 2.202288 2.39s/it
Train loss 4972 0.123693 Grad Norm 1.000181 5.51s/it
Train loss 4973 0.162078 Grad Norm 0.971472 5.87s/it
Train loss 4974 0.160876 Grad Norm 0.315922 5.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 5041 0.236611 Grad Norm 1.558765 2.99s/it
Train loss 5042 0.139915 Grad Norm 0.971218 6.28s/it
Train loss 5043 0.155979 Grad Norm 1.007484 6.57s/it
Train loss 5044 0.137898 Grad Norm 0.453960 4.57s/it
Train loss 5045 0.165431 Grad Norm 1.334167 3.39s/it
Train loss 5046 0.138156 Grad Norm 1.269841 4.89s/it
Train loss 5047 0.102580 Grad Norm 0.715023 4.03s/it
Train loss 5048 0.190463 Grad Norm 0.555795 4.16s/it
Train loss 5049 0.235635 Grad Norm 1.989233 3.36s/it
Train loss 5050 0.206604 Grad Norm 1.507853 2.03s/it
Train loss 5051 0.156106 Grad Norm 0.708076 4.01s/it
Train loss 5052 0.145247 Grad Norm 0.439633 3.63s/it
Train loss 5053 0.185101 Grad Norm 0.783929 1.51s/it
Train loss 5054 0.155734 Grad Norm 1.871504 3.27s/it
Train loss 5055 0.142621 Grad Norm 1.055516 3.79s/it
Train loss 5056 0.220975 Grad Norm 0.732439 3.34s/it
Train loss 5057 0.132307 Grad Norm 0.510145 8.08s/it
Train loss 5058 0.202506 Grad Norm 1.220456 5.00s/it
Train loss 5059 0.224533 Grad Norm 1.390882 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 5129 0.176449 Grad Norm 0.514090 2.98s/it
Train loss 5130 0.139149 Grad Norm 0.464420 5.16s/it
Train loss 5131 0.248887 Grad Norm 1.451854 5.90s/it
Train loss 5132 0.187052 Grad Norm 0.544072 6.49s/it
Train loss 5133 0.201658 Grad Norm 0.604261 3.44s/it
Train loss 5134 0.169661 Grad Norm 0.767235 5.45s/it
Train loss 5135 0.160443 Grad Norm 0.353776 3.92s/it
Train loss 5136 0.168991 Grad Norm 0.529152 3.34s/it
Train loss 5137 0.144565 Grad Norm 0.548599 3.65s/it
Train loss 5138 0.193721 Grad Norm 0.403932 3.10s/it
Train loss 5139 0.220084 Grad Norm 0.413286 2.66s/it
Train loss 5140 0.185432 Grad Norm 0.350253 3.57s/it
Train loss 5141 0.212385 Grad Norm 0.407405 4.10s/it
Train loss 5142 0.158850 Grad Norm 0.289052 2.86s/it
Train loss 5143 0.170735 Grad Norm 0.471987 3.45s/it
Train loss 5144 0.157990 Grad Norm 1.753346 2.86s/it
Train loss 5145 0.197312 Grad Norm 1.449063 2.66s/it
Train loss 5146 0.162580 Grad Norm 0.919081 3.36s/it
Train loss 5147 0.148669 Grad Norm 1.160812 6.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 5213 0.175579 Grad Norm 0.264386 5.58s/it
Train loss 5214 0.192723 Grad Norm 0.788983 3.51s/it
Train loss 5215 0.237642 Grad Norm 0.834002 3.33s/it
Train loss 5216 0.224803 Grad Norm 0.558714 2.43s/it
Train loss 5217 0.181013 Grad Norm 1.670124 7.37s/it
Train loss 5218 0.146480 Grad Norm 0.889358 3.87s/it
Train loss 5219 0.133377 Grad Norm 0.426158 3.83s/it
Train loss 5220 0.118470 Grad Norm 0.937977 5.27s/it
Train loss 5221 0.209256 Grad Norm 1.758264 2.53s/it
Train loss 5222 0.181093 Grad Norm 0.527976 5.29s/it
Train loss 5223 0.186766 Grad Norm 0.523406 2.61s/it
Train loss 5224 0.219046 Grad Norm 0.856477 3.68s/it
Train loss 5225 0.154895 Grad Norm 0.619007 5.19s/it
Train loss 5226 0.151258 Grad Norm 0.907207 5.00s/it
Train loss 5227 0.187380 Grad Norm 1.192041 2.31s/it
Train loss 5228 0.193432 Grad Norm 0.496855 3.16s/it
Train loss 5229 0.210113 Grad Norm 1.255234 6.61s/it
Train loss 5230 0.096792 Grad Norm 0.435409 6.54s/it
Train loss 5231 0.163110 Grad Norm 0.938725 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 5296 0.164875 Grad Norm 0.710499 7.41s/it
Train loss 5297 0.241720 Grad Norm 1.281321 2.51s/it
Train loss 5298 0.237028 Grad Norm 0.628428 3.09s/it
Train loss 5299 0.194308 Grad Norm 0.780929 3.09s/it
Train loss 5300 0.221827 Grad Norm 0.748584 4.55s/it
Train loss 5301 0.126731 Grad Norm 0.331288 8.20s/it
Train loss 5302 0.147767 Grad Norm 0.681946 3.44s/it
Train loss 5303 0.193836 Grad Norm 0.915776 6.36s/it
Train loss 5304 0.213779 Grad Norm 0.954866 3.85s/it
Train loss 5305 0.269040 Grad Norm 0.907781 2.54s/it
Train loss 5306 0.138078 Grad Norm 0.794149 4.98s/it
Train loss 5307 0.139206 Grad Norm 0.603060 2.87s/it
Train loss 5308 0.203354 Grad Norm 1.292550 1.91s/it
Train loss 5309 0.157010 Grad Norm 0.994156 4.98s/it
Train loss 5310 0.158770 Grad Norm 1.597373 2.44s/it
Train loss 5311 0.148655 Grad Norm 0.486594 2.94s/it
Train loss 5312 0.124053 Grad Norm 0.401945 4.98s/it
Train loss 5313 0.154368 Grad Norm 1.567039 4.40s/it
Train loss 5314 0.220675 Grad Norm 2.137952 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 5378 0.139912 Grad Norm 1.275045 2.55s/it
Train loss 5379 0.195718 Grad Norm 1.512437 2.28s/it
Train loss 5380 0.196998 Grad Norm 0.686794 2.08s/it
Train loss 5381 0.166261 Grad Norm 1.235421 3.19s/it
Train loss 5382 0.153247 Grad Norm 0.789885 3.90s/it
Train loss 5383 0.225212 Grad Norm 1.188064 2.02s/it
Train loss 5384 0.204764 Grad Norm 0.622574 4.01s/it
Train loss 5385 0.223816 Grad Norm 0.702712 2.28s/it
Train loss 5386 0.197329 Grad Norm 0.795646 3.81s/it
Train loss 5387 0.200985 Grad Norm 0.461617 2.69s/it
Train loss 5388 0.164222 Grad Norm 0.445937 4.04s/it
Train loss 5389 0.119411 Grad Norm 0.451500 3.78s/it
Train loss 5390 0.131918 Grad Norm 0.622464 5.39s/it
Train loss 5391 0.121869 Grad Norm 0.505587 2.45s/it
Train loss 5392 0.186871 Grad Norm 1.280471 2.97s/it
Train loss 5393 0.157095 Grad Norm 1.344399 2.38s/it
Train loss 5394 0.253646 Grad Norm 0.668462 1.33s/it
Train loss 5395 0.145119 Grad Norm 0.371116 6.37s/it
Train loss 5396 0.126162 Grad Norm 2.279817 4.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 5460 0.149077 Grad Norm 0.528410 4.00s/it
Train loss 5461 0.211099 Grad Norm 0.703945 2.29s/it
Train loss 5462 0.234339 Grad Norm 1.075064 3.97s/it
Train loss 5463 0.183037 Grad Norm 0.248066 3.22s/it
Train loss 5464 0.161489 Grad Norm 1.063432 2.79s/it
Train loss 5465 0.207670 Grad Norm 1.629676 3.78s/it
Train loss 5466 0.159184 Grad Norm 0.895116 5.25s/it
Train loss 5467 0.163923 Grad Norm 1.131816 3.34s/it
Train loss 5468 0.169101 Grad Norm 0.754798 4.06s/it
Train loss 5469 0.255297 Grad Norm 1.241032 3.33s/it
Train loss 5470 0.148859 Grad Norm 0.740290 3.52s/it
Train loss 5471 0.154814 Grad Norm 0.552710 5.70s/it
Train loss 5472 0.186419 Grad Norm 0.496384 4.70s/it
Train loss 5473 0.177328 Grad Norm 0.658155 3.40s/it
Train loss 5474 0.160059 Grad Norm 0.369914 7.37s/it
Train loss 5475 0.208182 Grad Norm 0.370455 2.49s/it
Train loss 5476 0.124178 Grad Norm 0.594290 4.53s/it
Train loss 5477 0.171836 Grad Norm 0.373527 5.25s/it
Train loss 5478 0.161210 Grad Norm 0.459659 4.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 5544 0.174021 Grad Norm 0.464923 3.94s/it
Train loss 5545 0.165842 Grad Norm 1.262570 5.42s/it
Train loss 5546 0.248336 Grad Norm 1.786896 3.24s/it
Train loss 5547 0.173691 Grad Norm 0.404627 6.45s/it
Train loss 5548 0.193140 Grad Norm 0.937460 3.90s/it
Train loss 5549 0.205964 Grad Norm 1.357008 2.78s/it
Train loss 5550 0.198224 Grad Norm 0.750550 3.12s/it
Train loss 5551 0.140632 Grad Norm 0.768018 5.26s/it
Train loss 5552 0.167899 Grad Norm 1.003954 4.95s/it
Train loss 5553 0.165416 Grad Norm 0.298827 5.37s/it
Train loss 5554 0.169225 Grad Norm 0.735779 3.69s/it
Train loss 5555 0.202742 Grad Norm 0.756242 3.69s/it
Train loss 5556 0.138216 Grad Norm 0.244156 3.62s/it
Train loss 5557 0.181712 Grad Norm 0.964259 1.70s/it
Train loss 5558 0.119835 Grad Norm 0.689230 4.12s/it
Train loss 5559 0.102431 Grad Norm 0.587869 5.48s/it
Train loss 5560 0.191399 Grad Norm 0.654001 4.53s/it
Train loss 5561 0.203467 Grad Norm 0.954458 2.64s/it
Train loss 5562 0.158538 Grad Norm 0.616500 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 5637 0.137508 Grad Norm 0.779842 5.35s/it
Train loss 5638 0.205445 Grad Norm 0.596865 3.61s/it
Train loss 5639 0.203901 Grad Norm 0.714019 2.63s/it
Train loss 5640 0.243320 Grad Norm 1.209553 2.02s/it
Train loss 5641 0.174150 Grad Norm 0.608086 5.17s/it
Train loss 5642 0.145102 Grad Norm 0.636674 7.85s/it
Train loss 5643 0.130908 Grad Norm 0.293718 4.44s/it
Train loss 5644 0.225076 Grad Norm 0.943674 1.95s/it
Train loss 5645 0.168953 Grad Norm 0.353247 3.16s/it
Train loss 5646 0.177066 Grad Norm 0.690456 1.86s/it
Train loss 5647 0.119376 Grad Norm 0.558196 7.23s/it
Train loss 5648 0.238794 Grad Norm 0.694851 4.19s/it
Train loss 5649 0.229316 Grad Norm 1.058865 3.03s/it
Train loss 5650 0.168688 Grad Norm 0.274455 5.07s/it
Train loss 5651 0.141520 Grad Norm 0.980675 5.21s/it
Train loss 5652 0.156313 Grad Norm 0.513165 4.89s/it
Train loss 5653 0.144021 Grad Norm 0.465204 5.38s/it
Train loss 5654 0.204425 Grad Norm 1.589272 2.28s/it
Train loss 5655 0.197606 Grad Norm 0.835345 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 5723 0.171834 Grad Norm 1.153218 3.03s/it
Train loss 5724 0.163437 Grad Norm 0.781383 1.98s/it
Train loss 5725 0.115621 Grad Norm 0.859196 8.08s/it
Train loss 5726 0.221937 Grad Norm 1.069523 2.31s/it
Train loss 5727 0.178654 Grad Norm 0.651503 4.93s/it
Train loss 5728 0.160718 Grad Norm 0.902284 3.81s/it
Train loss 5729 0.142107 Grad Norm 0.917819 4.84s/it
Train loss 5730 0.171325 Grad Norm 0.868401 2.99s/it
Train loss 5731 0.130125 Grad Norm 0.623150 7.25s/it
Train loss 5732 0.206998 Grad Norm 1.250746 1.98s/it
Train loss 5733 0.227291 Grad Norm 1.430440 3.02s/it
Train loss 5734 0.160930 Grad Norm 0.741843 2.43s/it
Train loss 5735 0.130457 Grad Norm 0.682312 5.17s/it
Train loss 5736 0.114136 Grad Norm 1.079027 5.56s/it
Train loss 5737 0.219244 Grad Norm 1.107875 4.64s/it
Train loss 5738 0.205623 Grad Norm 0.915134 3.20s/it
Train loss 5739 0.142827 Grad Norm 0.320369 3.99s/it
Train loss 5740 0.248628 Grad Norm 2.774082 5.37s/it
Train loss 5741 0.146585 Grad Norm 1.560684 4.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 5799 0.140005 Grad Norm 1.438344 5.22s/it
Train loss 5800 0.203809 Grad Norm 0.581562 3.08s/it
Train loss 5801 0.185952 Grad Norm 0.867618 4.60s/it
Train loss 5802 0.131423 Grad Norm 0.273660 5.61s/it
Train loss 5803 0.254246 Grad Norm 0.569169 4.31s/it
Train loss 5804 0.210147 Grad Norm 0.941386 2.25s/it
Train loss 5805 0.207785 Grad Norm 0.708436 1.94s/it
Train loss 5806 0.178326 Grad Norm 0.547156 2.46s/it
Train loss 5807 0.200690 Grad Norm 1.182519 1.74s/it
Train loss 5808 0.250502 Grad Norm 0.803595 2.25s/it
Train loss 5809 0.173205 Grad Norm 1.169876 3.51s/it
Train loss 5810 0.208338 Grad Norm 1.023453 3.45s/it
Train loss 5811 0.175356 Grad Norm 0.392318 4.07s/it
Train loss 5812 0.109384 Grad Norm 0.383585 6.32s/it
Train loss 5813 0.178186 Grad Norm 1.066125 4.10s/it
Train loss 5814 0.148893 Grad Norm 0.747865 3.30s/it
Train loss 5815 0.213030 Grad Norm 0.685466 2.79s/it
Train loss 5816 0.214827 Grad Norm 1.111973 2.30s/it
Train loss 5817 0.154394 Grad Norm 0.911827 5.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 5890 0.136528 Grad Norm 0.725363 2.55s/it
Train loss 5891 0.202604 Grad Norm 0.491820 3.24s/it
Train loss 5892 0.246174 Grad Norm 0.503987 3.57s/it
Train loss 5893 0.188444 Grad Norm 0.433894 3.55s/it
Train loss 5894 0.140433 Grad Norm 0.260640 5.42s/it
Train loss 5895 0.124771 Grad Norm 0.322229 4.19s/it
Train loss 5896 0.137966 Grad Norm 0.584664 2.83s/it
Train loss 5897 0.188205 Grad Norm 0.458315 3.93s/it
Train loss 5898 0.150063 Grad Norm 0.712773 5.18s/it
Train loss 5899 0.178637 Grad Norm 0.654639 2.60s/it
Train loss 5900 0.151925 Grad Norm 0.254399 5.50s/it
Train loss 5901 0.157304 Grad Norm 0.725138 2.77s/it
Train loss 5902 0.213506 Grad Norm 0.722900 3.57s/it
Train loss 5903 0.186399 Grad Norm 0.387308 5.61s/it
Train loss 5904 0.151028 Grad Norm 0.560502 4.84s/it
Train loss 5905 0.190729 Grad Norm 2.164836 4.22s/it
Train loss 5906 0.128297 Grad Norm 0.201763 6.32s/it
Train loss 5907 0.158619 Grad Norm 0.342079 2.90s/it
Train loss 5908 0.120070 Grad Norm 0.249361 5.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 5970 0.115955 Grad Norm 0.933977 7.84s/it
Train loss 5971 0.146785 Grad Norm 0.823392 4.26s/it
Train loss 5972 0.152382 Grad Norm 0.555155 3.89s/it
Train loss 5973 0.120875 Grad Norm 0.921958 4.77s/it
Train loss 5974 0.156201 Grad Norm 1.742586 2.69s/it
Train loss 5975 0.167630 Grad Norm 0.442970 2.92s/it
Train loss 5976 0.162601 Grad Norm 0.890462 1.52s/it
Train loss 5977 0.173540 Grad Norm 0.690867 3.62s/it
Train loss 5978 0.143534 Grad Norm 1.384595 2.43s/it
Train loss 5979 0.167831 Grad Norm 0.950062 3.19s/it
Train loss 5980 0.204113 Grad Norm 0.916263 4.96s/it
Train loss 5981 0.203012 Grad Norm 1.065908 3.30s/it
Train loss 5982 0.174156 Grad Norm 1.177973 4.09s/it
Train loss 5983 0.114562 Grad Norm 0.255051 8.08s/it
Train loss 5984 0.172123 Grad Norm 1.169891 3.68s/it
Train loss 5985 0.189820 Grad Norm 1.170011 4.78s/it
Train loss 5986 0.137616 Grad Norm 0.310928 4.93s/it
Train loss 5987 0.141020 Grad Norm 1.303078 5.50s/it
Train loss 5988 0.172043 Grad Norm 0.761599 4.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 6048 0.222074 Grad Norm 0.765838 5.81s/it
Train loss 6049 0.233152 Grad Norm 0.587088 3.27s/it
Train loss 6050 0.132135 Grad Norm 1.476787 4.91s/it
Train loss 6051 0.304403 Grad Norm 3.380346 2.15s/it
Train loss 6052 0.180152 Grad Norm 0.741909 1.62s/it
Train loss 6053 0.118601 Grad Norm 0.576595 6.47s/it
Train loss 6054 0.131732 Grad Norm 0.897255 6.55s/it
Train loss 6055 0.193232 Grad Norm 0.763184 5.68s/it
Train loss 6056 0.164409 Grad Norm 0.937738 4.02s/it
Train loss 6057 0.176544 Grad Norm 1.235857 3.11s/it
Train loss 6058 0.142299 Grad Norm 0.796425 5.02s/it
Train loss 6059 0.137198 Grad Norm 0.476747 5.08s/it
Train loss 6060 0.154163 Grad Norm 1.140492 3.01s/it
Train loss 6061 0.140189 Grad Norm 0.708440 3.06s/it
Train loss 6062 0.166729 Grad Norm 0.777146 6.34s/it
Train loss 6063 0.236568 Grad Norm 0.742621 2.16s/it
Train loss 6064 0.136081 Grad Norm 0.915535 4.41s/it
Train loss 6065 0.239667 Grad Norm 1.955684 2.40s/it
Train loss 6066 0.195985 Grad Norm 0.818247 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 6132 0.191745 Grad Norm 0.397008 5.15s/it
Train loss 6133 0.157719 Grad Norm 0.634150 8.34s/it
Train loss 6134 0.218975 Grad Norm 0.416094 3.91s/it
Train loss 6135 0.103689 Grad Norm 0.355268 5.62s/it
Train loss 6136 0.175397 Grad Norm 0.550471 2.79s/it
Train loss 6137 0.221976 Grad Norm 0.604718 1.00s/it
Train loss 6138 0.187728 Grad Norm 0.804695 4.95s/it
Train loss 6139 0.178027 Grad Norm 1.124137 5.17s/it
Train loss 6140 0.116498 Grad Norm 0.379160 4.93s/it
Train loss 6141 0.163157 Grad Norm 1.049879 5.40s/it
Train loss 6142 0.114832 Grad Norm 0.833486 5.16s/it
Train loss 6143 0.188186 Grad Norm 0.575318 5.10s/it
Train loss 6144 0.154186 Grad Norm 1.304242 5.52s/it
Train loss 6145 0.233668 Grad Norm 2.293500 2.66s/it
Train loss 6146 0.165520 Grad Norm 0.874860 3.13s/it
Train loss 6147 0.193961 Grad Norm 1.250219 3.36s/it
Train loss 6148 0.153794 Grad Norm 1.004438 5.61s/it
Train loss 6149 0.200823 Grad Norm 1.675008 2.51s/it
Train loss 6150 0.157631 Grad Norm 0.665766 4.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 6222 0.272930 Grad Norm 0.854176 3.44s/it
Train loss 6223 0.139025 Grad Norm 0.473228 4.89s/it
Train loss 6224 0.157473 Grad Norm 0.816268 3.24s/it
Train loss 6225 0.169042 Grad Norm 1.222330 2.50s/it
Train loss 6226 0.119655 Grad Norm 0.455452 4.04s/it
Train loss 6227 0.181660 Grad Norm 0.787698 2.79s/it
Train loss 6228 0.180910 Grad Norm 0.725723 3.20s/it
Train loss 6229 0.251817 Grad Norm 0.795373 2.18s/it
Train loss 6230 0.155915 Grad Norm 0.752466 3.65s/it
Train loss 6231 0.126416 Grad Norm 0.765572 3.98s/it
Train loss 6232 0.141561 Grad Norm 0.618571 4.44s/it
Train loss 6233 0.113557 Grad Norm 0.468337 3.24s/it
Train loss 6234 0.219846 Grad Norm 1.001118 1.60s/it
Train loss 6235 0.190342 Grad Norm 1.552165 3.81s/it
Train loss 6236 0.121272 Grad Norm 0.301834 5.51s/it
Train loss 6237 0.144090 Grad Norm 0.775287 2.65s/it
Train loss 6238 0.180091 Grad Norm 1.147072 2.31s/it
Train loss 6239 0.179895 Grad Norm 0.373878 3.06s/it
Train loss 6240 0.251649 Grad Norm 1.687770 1.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 6301 0.218322 Grad Norm 0.676400 4.83s/it
Train loss 6302 0.191341 Grad Norm 0.584847 2.69s/it
Train loss 6303 0.109658 Grad Norm 0.216151 6.36s/it
Train loss 6304 0.117218 Grad Norm 0.324034 8.19s/it
Train loss 6305 0.239620 Grad Norm 0.792093 3.05s/it
Train loss 6306 0.136061 Grad Norm 0.254608 4.53s/it
Train loss 6307 0.253155 Grad Norm 0.782790 3.21s/it
Train loss 6308 0.181091 Grad Norm 0.448156 5.19s/it
Train loss 6309 0.236439 Grad Norm 1.292582 2.40s/it
Train loss 6310 0.159995 Grad Norm 0.712803 4.99s/it
Train loss 6311 0.169443 Grad Norm 1.146667 4.93s/it
Train loss 6312 0.207905 Grad Norm 1.163075 1.17s/it
Train loss 6313 0.165462 Grad Norm 0.327813 2.39s/it
Train loss 6314 0.143222 Grad Norm 0.283461 3.83s/it
Train loss 6315 0.227011 Grad Norm 1.192335 2.01s/it
Train loss 6316 0.121262 Grad Norm 0.473078 6.22s/it
Train loss 6317 0.147448 Grad Norm 0.385102 7.66s/it
Train loss 6318 0.196020 Grad Norm 1.229742 3.93s/it
Train loss 6319 0.167607 Grad Norm 0.344440 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 6386 0.139546 Grad Norm 0.385341 1.87s/it
Train loss 6387 0.171175 Grad Norm 0.870323 4.91s/it
Train loss 6388 0.229492 Grad Norm 0.950466 1.99s/it
Train loss 6389 0.158755 Grad Norm 0.520723 4.66s/it
Train loss 6390 0.169646 Grad Norm 0.720866 4.99s/it
Train loss 6391 0.141614 Grad Norm 0.454839 4.71s/it
Train loss 6392 0.185728 Grad Norm 0.764640 3.82s/it
Train loss 6393 0.203453 Grad Norm 0.812944 2.08s/it
Train loss 6394 0.158789 Grad Norm 0.734521 4.33s/it
Train loss 6395 0.199853 Grad Norm 1.712979 3.88s/it
Train loss 6396 0.199178 Grad Norm 2.054229 3.65s/it
Train loss 6397 0.152915 Grad Norm 1.095419 3.97s/it
Train loss 6398 0.160152 Grad Norm 1.243260 3.97s/it
Train loss 6399 0.219253 Grad Norm 2.308029 2.44s/it
Train loss 6400 0.142870 Grad Norm 1.032571 4.01s/it
Train loss 6401 0.123862 Grad Norm 0.760013 4.84s/it
Train loss 6402 0.199902 Grad Norm 1.922065 4.31s/it
Train loss 6403 0.122938 Grad Norm 0.467190 4.05s/it
Train loss 6404 0.145584 Grad Norm 1.086799 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 6468 0.132791 Grad Norm 0.389569 5.07s/it
Train loss 6469 0.216404 Grad Norm 1.292370 5.11s/it
Train loss 6470 0.173771 Grad Norm 0.606590 4.95s/it
Train loss 6471 0.167896 Grad Norm 0.485151 3.23s/it
Train loss 6472 0.171847 Grad Norm 0.898772 2.96s/it
Train loss 6473 0.189469 Grad Norm 1.356330 5.15s/it
Train loss 6474 0.172941 Grad Norm 1.153056 4.70s/it
Train loss 6475 0.191356 Grad Norm 0.900336 2.69s/it
Train loss 6476 0.228126 Grad Norm 0.834925 2.28s/it
Train loss 6477 0.165959 Grad Norm 0.258362 2.83s/it
Train loss 6478 0.189148 Grad Norm 1.016308 1.76s/it
Train loss 6479 0.180712 Grad Norm 1.341882 2.86s/it
Train loss 6480 0.157103 Grad Norm 0.711904 2.40s/it
Train loss 6481 0.160203 Grad Norm 0.536605 4.10s/it
Train loss 6482 0.213812 Grad Norm 2.051959 3.42s/it
Train loss 6483 0.209290 Grad Norm 0.696494 1.82s/it
Train loss 6484 0.223922 Grad Norm 0.864425 4.52s/it
Train loss 6485 0.173096 Grad Norm 0.864231 5.63s/it
Train loss 6486 0.189355 Grad Norm 0.760892 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 6552 0.119715 Grad Norm 0.498798 3.83s/it
Train loss 6553 0.131210 Grad Norm 0.760309 6.47s/it
Train loss 6554 0.245097 Grad Norm 0.524401 2.73s/it
Train loss 6555 0.110705 Grad Norm 0.674519 5.14s/it
Train loss 6556 0.189061 Grad Norm 0.951455 5.31s/it
Train loss 6557 0.160279 Grad Norm 0.715631 2.40s/it
Train loss 6558 0.212352 Grad Norm 1.367287 2.12s/it
Train loss 6559 0.150014 Grad Norm 0.794256 5.08s/it
Train loss 6560 0.162514 Grad Norm 0.995324 3.03s/it
Train loss 6561 0.168436 Grad Norm 0.825062 3.11s/it
Train loss 6562 0.115437 Grad Norm 0.643553 3.87s/it
Train loss 6563 0.126829 Grad Norm 0.324846 3.28s/it
Train loss 6564 0.195414 Grad Norm 1.939994 2.43s/it
Train loss 6565 0.136949 Grad Norm 0.614727 3.92s/it
Train loss 6566 0.171441 Grad Norm 1.613377 3.39s/it
Train loss 6567 0.139304 Grad Norm 0.661151 2.29s/it
Train loss 6568 0.206517 Grad Norm 0.754597 5.32s/it
Train loss 6569 0.139512 Grad Norm 0.392878 4.14s/it
Train loss 6570 0.200231 Grad Norm 0.830102 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 6638 0.167657 Grad Norm 0.501849 4.16s/it
Train loss 6639 0.156696 Grad Norm 0.657069 2.77s/it
Train loss 6640 0.160130 Grad Norm 0.457415 6.37s/it
Train loss 6641 0.174966 Grad Norm 0.423320 3.35s/it
Train loss 6642 0.242702 Grad Norm 0.783441 1.45s/it
Train loss 6643 0.154690 Grad Norm 0.368667 7.53s/it
Train loss 6644 0.129546 Grad Norm 0.691553 3.96s/it
Train loss 6645 0.215702 Grad Norm 0.612486 2.28s/it
Train loss 6646 0.178147 Grad Norm 0.533743 1.55s/it
Train loss 6647 0.171844 Grad Norm 1.796231 2.43s/it
Train loss 6648 0.203502 Grad Norm 1.682175 3.79s/it
Train loss 6649 0.165381 Grad Norm 1.379533 4.12s/it
Train loss 6650 0.155037 Grad Norm 0.796341 2.89s/it
Train loss 6651 0.232152 Grad Norm 3.286103 3.52s/it
Train loss 6652 0.169874 Grad Norm 1.324361 2.91s/it
Train loss 6653 0.140653 Grad Norm 0.708151 5.27s/it
Train loss 6654 0.213988 Grad Norm 1.033343 2.85s/it
Train loss 6655 0.186273 Grad Norm 0.896130 2.97s/it
Train loss 6656 0.235133 Grad Norm 1.348776 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 6720 0.178593 Grad Norm 1.110438 3.32s/it
Train loss 6721 0.146959 Grad Norm 1.962161 4.01s/it
Train loss 6722 0.167981 Grad Norm 2.034489 5.48s/it
Train loss 6723 0.163136 Grad Norm 0.629579 5.31s/it
Train loss 6724 0.193029 Grad Norm 0.796844 2.14s/it
Train loss 6725 0.189278 Grad Norm 1.270867 2.75s/it
Train loss 6726 0.165047 Grad Norm 1.304918 4.75s/it
Train loss 6727 0.155995 Grad Norm 0.615896 5.27s/it
Train loss 6728 0.205564 Grad Norm 2.032326 3.61s/it
Train loss 6729 0.155202 Grad Norm 1.508480 3.19s/it
Train loss 6730 0.200909 Grad Norm 0.723353 2.73s/it
Train loss 6731 0.216850 Grad Norm 0.416009 3.70s/it
Train loss 6732 0.137708 Grad Norm 0.641992 5.02s/it
Train loss 6733 0.168560 Grad Norm 0.601314 2.30s/it
Train loss 6734 0.152128 Grad Norm 0.534465 4.21s/it
Train loss 6735 0.123114 Grad Norm 0.633543 6.23s/it
Train loss 6736 0.135641 Grad Norm 0.531227 4.39s/it
Train loss 6737 0.212993 Grad Norm 0.573702 2.31s/it
Train loss 6738 0.153570 Grad Norm 0.568899 7.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 6810 0.251409 Grad Norm 1.528261 2.34s/it
Train loss 6811 0.132424 Grad Norm 0.722888 4.34s/it
Train loss 6812 0.177389 Grad Norm 1.562538 3.77s/it
Train loss 6813 0.207924 Grad Norm 0.854877 3.18s/it
Train loss 6814 0.148855 Grad Norm 0.497529 5.63s/it
Train loss 6815 0.200073 Grad Norm 1.569701 3.08s/it
Train loss 6816 0.261609 Grad Norm 1.403038 3.17s/it
Train loss 6817 0.145079 Grad Norm 0.378346 5.48s/it
Train loss 6818 0.105252 Grad Norm 0.392350 5.44s/it
Train loss 6819 0.202152 Grad Norm 1.000653 3.46s/it
Train loss 6820 0.202114 Grad Norm 0.821739 2.90s/it
Train loss 6821 0.133288 Grad Norm 0.369536 2.25s/it
Train loss 6822 0.164247 Grad Norm 1.591659 3.61s/it
Train loss 6823 0.126281 Grad Norm 0.933748 6.33s/it
Train loss 6824 0.128841 Grad Norm 0.479110 4.74s/it
Train loss 6825 0.162036 Grad Norm 0.813008 1.22s/it
Train loss 6826 0.144708 Grad Norm 1.976056 5.26s/it
Train loss 6827 0.160699 Grad Norm 1.440193 5.29s/it
Train loss 6828 0.120100 Grad Norm 0.308738 8.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 6898 0.158026 Grad Norm 0.425274 3.23s/it
Train loss 6899 0.123049 Grad Norm 0.286371 3.71s/it
Train loss 6900 0.149333 Grad Norm 0.317983 2.76s/it
Train loss 6901 0.192229 Grad Norm 1.333243 4.08s/it
Train loss 6902 0.127968 Grad Norm 0.482597 5.38s/it
Train loss 6903 0.180134 Grad Norm 1.130030 5.63s/it
Train loss 6904 0.167786 Grad Norm 0.265536 4.15s/it
Train loss 6905 0.191423 Grad Norm 0.855505 3.60s/it
Train loss 6906 0.164572 Grad Norm 0.707378 5.72s/it
Train loss 6907 0.230342 Grad Norm 0.650455 2.86s/it
Train loss 6908 0.163536 Grad Norm 0.549477 4.39s/it
Train loss 6909 0.216881 Grad Norm 0.411399 3.08s/it
Train loss 6910 0.188175 Grad Norm 0.462000 2.64s/it
Train loss 6911 0.205386 Grad Norm 0.719581 2.87s/it
Train loss 6912 0.191002 Grad Norm 0.339015 4.45s/it
Train loss 6913 0.167523 Grad Norm 1.411129 3.54s/it
Train loss 6914 0.209802 Grad Norm 0.534875 2.76s/it
Train loss 6915 0.130596 Grad Norm 0.343289 2.27s/it
Train loss 6916 0.250512 Grad Norm 2.249106 5.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 6972 0.132110 Grad Norm 1.354238 5.64s/it
Train loss 6973 0.183545 Grad Norm 2.699783 6.46s/it
Train loss 6974 0.103526 Grad Norm 0.836881 5.59s/it
Train loss 6975 0.174409 Grad Norm 0.404785 4.58s/it
Train loss 6976 0.174151 Grad Norm 2.174354 4.12s/it
Train loss 6977 0.147635 Grad Norm 1.197181 4.17s/it
Train loss 6978 0.153912 Grad Norm 1.234161 5.25s/it
Train loss 6979 0.152146 Grad Norm 1.340219 2.83s/it
Train loss 6980 0.124441 Grad Norm 1.473729 5.17s/it
Train loss 6981 0.166576 Grad Norm 1.828180 3.17s/it
Train loss 6982 0.168205 Grad Norm 0.448464 1.94s/it
Train loss 6983 0.162541 Grad Norm 0.759476 5.12s/it
Train loss 6984 0.144231 Grad Norm 1.202345 4.98s/it
Train loss 6985 0.181305 Grad Norm 0.465979 2.29s/it
Train loss 6986 0.143683 Grad Norm 0.513060 4.35s/it
Train loss 6987 0.104907 Grad Norm 1.010610 4.78s/it
Train loss 6988 0.243489 Grad Norm 1.288434 2.25s/it
Train loss 6989 0.183249 Grad Norm 0.644222 2.80s/it
Train loss 6990 0.205689 Grad Norm 1.112945 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 7056 0.174694 Grad Norm 1.443810 3.59s/it
Train loss 7057 0.208614 Grad Norm 1.716560 5.05s/it
Train loss 7058 0.149485 Grad Norm 0.633757 5.66s/it
Train loss 7059 0.205537 Grad Norm 0.968732 2.05s/it
Train loss 7060 0.134746 Grad Norm 0.566795 5.09s/it
Train loss 7061 0.167702 Grad Norm 0.592058 4.12s/it
Train loss 7062 0.149273 Grad Norm 0.658599 5.19s/it
Train loss 7063 0.157358 Grad Norm 1.030731 3.02s/it
Train loss 7064 0.147428 Grad Norm 1.436538 1.87s/it
Train loss 7065 0.163565 Grad Norm 0.432298 4.72s/it
Train loss 7066 0.138337 Grad Norm 0.603336 4.12s/it
Train loss 7067 0.205148 Grad Norm 1.316120 3.56s/it
Train loss 7068 0.127358 Grad Norm 0.351716 5.66s/it
Train loss 7069 0.128720 Grad Norm 0.387993 3.45s/it
Train loss 7070 0.179954 Grad Norm 1.147756 2.69s/it
Train loss 7071 0.173884 Grad Norm 0.751860 2.24s/it
Train loss 7072 0.157173 Grad Norm 0.673614 2.80s/it
Train loss 7073 0.205079 Grad Norm 0.875676 4.01s/it
Train loss 7074 0.204355 Grad Norm 0.582786 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 7147 0.174471 Grad Norm 1.009927 4.84s/it
Train loss 7148 0.160322 Grad Norm 0.940795 4.88s/it
Train loss 7149 0.227701 Grad Norm 1.389141 5.29s/it
Train loss 7150 0.208561 Grad Norm 0.538473 3.08s/it
Train loss 7151 0.157800 Grad Norm 0.807538 2.66s/it
Train loss 7152 0.132113 Grad Norm 0.657120 5.15s/it
Train loss 7153 0.156528 Grad Norm 0.530282 3.94s/it
Train loss 7154 0.163264 Grad Norm 0.338218 5.20s/it
Train loss 7155 0.156470 Grad Norm 0.398987 3.28s/it
Train loss 7156 0.160905 Grad Norm 0.392891 1.98s/it
Train loss 7157 0.254623 Grad Norm 1.069201 3.39s/it
Train loss 7158 0.113174 Grad Norm 0.343343 7.20s/it
Train loss 7159 0.124792 Grad Norm 0.216419 5.33s/it
Train loss 7160 0.160213 Grad Norm 0.947237 2.55s/it
Train loss 7161 0.184557 Grad Norm 0.541073 2.59s/it
Train loss 7162 0.145969 Grad Norm 0.336662 3.12s/it
Train loss 7163 0.162981 Grad Norm 0.352617 1.94s/it
Train loss 7164 0.152242 Grad Norm 0.682883 4.05s/it
Train loss 7165 0.205485 Grad Norm 0.527254 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 7232 0.122414 Grad Norm 1.102384 2.28s/it
Train loss 7233 0.198131 Grad Norm 1.171483 2.71s/it
Train loss 7234 0.208977 Grad Norm 0.831389 3.86s/it
Train loss 7235 0.214642 Grad Norm 1.137784 3.40s/it
Train loss 7236 0.208409 Grad Norm 1.260322 3.92s/it
Train loss 7237 0.217605 Grad Norm 0.555297 5.35s/it
Train loss 7238 0.139394 Grad Norm 1.355407 2.51s/it
Train loss 7239 0.128311 Grad Norm 1.028773 5.26s/it
Train loss 7240 0.112693 Grad Norm 0.937381 4.90s/it
Train loss 7241 0.181724 Grad Norm 1.948113 5.22s/it
Train loss 7242 0.159691 Grad Norm 1.698528 3.09s/it
Train loss 7243 0.186634 Grad Norm 1.450193 6.42s/it
Train loss 7244 0.130744 Grad Norm 0.366848 6.56s/it
Train loss 7245 0.190698 Grad Norm 1.722595 4.10s/it
Train loss 7246 0.143355 Grad Norm 1.096826 7.31s/it
Train loss 7247 0.189311 Grad Norm 1.165439 1.95s/it
Train loss 7248 0.118272 Grad Norm 0.572527 5.32s/it
Train loss 7249 0.252486 Grad Norm 1.261528 3.23s/it
Train loss 7250 0.178473 Grad Norm 1.532645 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 7308 0.150131 Grad Norm 0.532658 3.99s/it
Train loss 7309 0.116263 Grad Norm 0.637673 5.73s/it
Train loss 7310 0.188537 Grad Norm 0.490189 3.92s/it
Train loss 7311 0.164817 Grad Norm 0.825523 3.14s/it
Train loss 7312 0.155809 Grad Norm 0.811245 4.55s/it
Train loss 7313 0.173571 Grad Norm 0.785386 3.37s/it
Train loss 7314 0.148273 Grad Norm 0.501495 3.65s/it
Train loss 7315 0.152539 Grad Norm 0.421503 3.34s/it
Train loss 7316 0.127117 Grad Norm 0.685870 8.25s/it
Train loss 7317 0.182036 Grad Norm 0.658935 4.57s/it
Train loss 7318 0.154202 Grad Norm 0.513147 4.16s/it
Train loss 7319 0.138574 Grad Norm 0.622479 2.02s/it
Train loss 7320 0.142753 Grad Norm 0.438558 2.06s/it
Train loss 7321 0.201826 Grad Norm 0.361544 2.29s/it
Train loss 7322 0.213968 Grad Norm 0.821726 3.35s/it
Train loss 7323 0.230877 Grad Norm 0.465504 3.38s/it
Train loss 7324 0.243662 Grad Norm 0.915339 5.05s/it
Train loss 7325 0.176584 Grad Norm 0.502143 3.99s/it
Train loss 7326 0.098175 Grad Norm 0.423321 7.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 7392 0.218831 Grad Norm 1.536945 4.52s/it
Train loss 7393 0.223327 Grad Norm 0.852171 6.59s/it
Train loss 7394 0.281387 Grad Norm 1.474542 5.40s/it
Train loss 7395 0.191720 Grad Norm 1.846257 4.05s/it
Train loss 7396 0.169638 Grad Norm 1.613144 2.98s/it
Train loss 7397 0.252593 Grad Norm 1.481417 4.66s/it
Train loss 7398 0.189657 Grad Norm 0.546784 5.22s/it
Train loss 7399 0.187845 Grad Norm 1.056006 3.53s/it
Train loss 7400 0.150833 Grad Norm 0.985050 5.65s/it
Train loss 7401 0.224657 Grad Norm 0.845351 4.06s/it
Train loss 7402 0.256919 Grad Norm 1.561828 1.88s/it
Train loss 7403 0.228414 Grad Norm 1.402256 3.35s/it
Train loss 7404 0.148246 Grad Norm 0.929738 3.62s/it
Train loss 7405 0.145279 Grad Norm 1.321848 5.33s/it
Train loss 7406 0.212939 Grad Norm 1.862636 4.40s/it
Train loss 7407 0.217476 Grad Norm 1.630299 4.62s/it
Train loss 7408 0.194023 Grad Norm 1.207837 5.59s/it
Train loss 7409 0.289569 Grad Norm 2.230974 2.62s/it
Train loss 7410 0.145420 Grad Norm 1.417493 4.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 7486 0.133917 Grad Norm 0.997231 4.10s/it
Train loss 7487 0.219803 Grad Norm 1.351079 1.95s/it
Train loss 7488 0.210506 Grad Norm 0.927475 2.56s/it
Train loss 7489 0.124463 Grad Norm 0.369560 8.20s/it
Train loss 7490 0.169717 Grad Norm 0.554905 3.30s/it
Train loss 7491 0.169726 Grad Norm 0.314142 2.68s/it
Train loss 7492 0.226274 Grad Norm 0.621626 3.18s/it
Train loss 7493 0.178251 Grad Norm 0.710070 6.35s/it
Train loss 7494 0.129500 Grad Norm 0.396126 6.51s/it
Train loss 7495 0.185209 Grad Norm 0.660605 5.62s/it
Train loss 7496 0.204128 Grad Norm 0.412184 3.65s/it
Train loss 7497 0.203626 Grad Norm 0.953439 3.23s/it
Train loss 7498 0.166821 Grad Norm 0.502821 2.45s/it
Train loss 7499 0.207149 Grad Norm 1.008276 3.10s/it
Train loss 7500 0.142096 Grad Norm 0.512646 4.20s/it
Train loss 7501 0.128061 Grad Norm 0.527003 3.32s/it
Train loss 7502 0.198210 Grad Norm 1.473683 3.13s/it
Train loss 7503 0.242240 Grad Norm 0.912552 2.90s/it
Train loss 7504 0.178350 Grad Norm 1.063710 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 7560 0.156245 Grad Norm 0.346288 6.03s/it
Train loss 7561 0.193873 Grad Norm 0.595324 2.14s/it
Train loss 7562 0.202289 Grad Norm 0.799708 2.99s/it
Train loss 7563 0.148034 Grad Norm 0.524367 3.95s/it
Train loss 7564 0.237193 Grad Norm 1.218232 2.31s/it
Train loss 7565 0.224734 Grad Norm 0.314449 4.51s/it
Train loss 7566 0.137704 Grad Norm 2.358185 7.22s/it
Train loss 7567 0.202518 Grad Norm 1.626487 3.17s/it
Train loss 7568 0.169462 Grad Norm 0.801478 4.87s/it
Train loss 7569 0.173789 Grad Norm 0.454083 5.19s/it
Train loss 7570 0.216457 Grad Norm 2.291287 3.04s/it
Train loss 7571 0.235558 Grad Norm 1.664047 3.57s/it
Train loss 7572 0.187214 Grad Norm 1.132066 4.74s/it
Train loss 7573 0.188988 Grad Norm 0.783854 2.47s/it
Train loss 7574 0.154988 Grad Norm 1.462597 5.05s/it
Train loss 7575 0.177261 Grad Norm 1.865414 1.64s/it
Train loss 7576 0.196439 Grad Norm 1.427194 2.85s/it
Train loss 7577 0.188216 Grad Norm 1.227525 3.90s/it
Train loss 7578 0.210108 Grad Norm 1.488254 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 7644 0.301983 Grad Norm 3.341439 2.12s/it
Train loss 7645 0.139171 Grad Norm 1.090724 5.01s/it
Train loss 7646 0.173177 Grad Norm 0.604093 3.24s/it
Train loss 7647 0.256997 Grad Norm 1.453622 2.33s/it
Train loss 7648 0.254377 Grad Norm 2.985255 3.11s/it
Train loss 7649 0.252477 Grad Norm 1.422134 3.08s/it
Train loss 7650 0.202000 Grad Norm 0.581655 3.24s/it
Train loss 7651 0.151971 Grad Norm 0.650015 4.94s/it
Train loss 7652 0.273120 Grad Norm 2.254011 3.30s/it
Train loss 7653 0.126148 Grad Norm 0.932839 3.19s/it
Train loss 7654 0.277711 Grad Norm 0.967497 1.38s/it
Train loss 7655 0.180675 Grad Norm 1.667475 2.02s/it
Train loss 7656 0.218546 Grad Norm 1.584052 4.71s/it
Train loss 7657 0.163155 Grad Norm 0.694754 4.07s/it
Train loss 7658 0.204266 Grad Norm 1.243723 4.62s/it
Train loss 7659 0.264844 Grad Norm 2.298913 6.51s/it
Train loss 7660 0.218071 Grad Norm 0.735686 3.61s/it
Train loss 7661 0.201333 Grad Norm 0.823495 3.29s/it
Train loss 7662 0.176519 Grad Norm 0.683131 5.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 7728 0.142358 Grad Norm 0.596975 5.71s/it
Train loss 7729 0.196787 Grad Norm 2.217150 3.18s/it
Train loss 7730 0.167395 Grad Norm 2.270783 6.23s/it
Train loss 7731 0.170444 Grad Norm 0.446500 2.90s/it
Train loss 7732 0.198362 Grad Norm 0.616999 2.21s/it
Train loss 7733 0.189047 Grad Norm 1.691887 2.00s/it
Train loss 7734 0.192130 Grad Norm 1.131911 2.91s/it
Train loss 7735 0.283621 Grad Norm 0.443061 1.83s/it
Train loss 7736 0.133197 Grad Norm 0.934119 4.35s/it
Train loss 7737 0.154581 Grad Norm 0.996404 6.48s/it
Train loss 7738 0.150149 Grad Norm 0.437575 7.67s/it
Train loss 7739 0.190808 Grad Norm 0.793243 2.46s/it
Train loss 7740 0.182085 Grad Norm 2.194070 3.12s/it
Train loss 7741 0.250206 Grad Norm 1.261437 4.27s/it
Train loss 7742 0.184551 Grad Norm 0.770904 3.98s/it
Train loss 7743 0.161547 Grad Norm 0.641678 3.07s/it
Train loss 7744 0.176655 Grad Norm 0.714741 5.53s/it
Train loss 7745 0.140426 Grad Norm 0.335986 3.61s/it
Train loss 7746 0.179116 Grad Norm 0.487917 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 7812 0.172172 Grad Norm 0.638538 2.51s/it
Train loss 7813 0.116130 Grad Norm 0.407611 3.58s/it
Train loss 7814 0.204241 Grad Norm 1.221931 3.36s/it
Train loss 7815 0.135601 Grad Norm 0.812789 5.34s/it
Train loss 7816 0.181315 Grad Norm 0.756056 5.61s/it
Train loss 7817 0.185153 Grad Norm 1.405423 4.79s/it
Train loss 7818 0.148039 Grad Norm 0.931315 3.21s/it
Train loss 7819 0.152260 Grad Norm 0.832795 2.60s/it
Train loss 7820 0.179711 Grad Norm 1.076763 2.87s/it
Train loss 7821 0.185478 Grad Norm 0.841602 2.78s/it
Train loss 7822 0.176462 Grad Norm 0.528562 1.99s/it
Train loss 7823 0.166631 Grad Norm 1.131785 4.88s/it
Train loss 7824 0.134038 Grad Norm 0.904426 6.47s/it
Train loss 7825 0.191642 Grad Norm 0.867864 1.75s/it
Train loss 7826 0.208614 Grad Norm 1.647884 5.13s/it
Train loss 7827 0.171944 Grad Norm 0.593253 7.61s/it
Train loss 7828 0.219197 Grad Norm 0.971923 2.91s/it
Train loss 7829 0.274933 Grad Norm 1.720197 2.53s/it
Train loss 7830 0.131602 Grad Norm 0.803071 4.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 7896 0.182196 Grad Norm 0.734887 3.26s/it
Train loss 7897 0.199919 Grad Norm 0.730246 3.03s/it
Train loss 7898 0.189416 Grad Norm 0.889887 3.61s/it
Train loss 7899 0.118650 Grad Norm 0.411648 4.97s/it
Train loss 7900 0.214142 Grad Norm 0.517335 3.49s/it
Train loss 7901 0.189969 Grad Norm 0.828005 4.41s/it
Train loss 7902 0.130191 Grad Norm 0.240359 5.20s/it
Train loss 7903 0.192737 Grad Norm 0.503905 3.60s/it
Train loss 7904 0.191584 Grad Norm 0.975416 1.77s/it
Train loss 7905 0.200436 Grad Norm 0.427680 2.53s/it
Train loss 7906 0.156715 Grad Norm 0.674297 7.27s/it
Train loss 7907 0.130476 Grad Norm 0.622628 4.85s/it
Train loss 7908 0.162340 Grad Norm 0.380403 2.56s/it
Train loss 7909 0.193409 Grad Norm 0.450639 2.27s/it
Train loss 7910 0.178611 Grad Norm 0.390745 3.78s/it
Train loss 7911 0.145757 Grad Norm 0.666595 3.13s/it
Train loss 7912 0.145633 Grad Norm 0.237566 5.04s/it
Train loss 7913 0.270520 Grad Norm 1.330282 3.83s/it
Train loss 7914 0.109672 Grad Norm 0.555208 5.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 7980 0.166896 Grad Norm 0.362498 3.85s/it
Train loss 7981 0.209921 Grad Norm 0.682028 2.22s/it
Train loss 7982 0.180192 Grad Norm 1.885218 5.69s/it
Train loss 7983 0.156150 Grad Norm 0.835248 2.83s/it
Train loss 7984 0.162750 Grad Norm 1.492111 3.24s/it
Train loss 7985 0.170189 Grad Norm 0.973338 4.47s/it
Train loss 7986 0.157797 Grad Norm 0.305838 4.34s/it
Train loss 7987 0.199593 Grad Norm 2.416248 4.60s/it
Train loss 7988 0.173852 Grad Norm 2.270340 4.09s/it
Train loss 7989 0.202104 Grad Norm 1.618521 2.61s/it
Train loss 7990 0.130595 Grad Norm 0.522281 4.75s/it
Train loss 7991 0.128108 Grad Norm 0.824505 7.35s/it
Train loss 7992 0.154567 Grad Norm 0.833011 2.35s/it
Train loss 7993 0.163407 Grad Norm 0.690095 3.99s/it
Train loss 7994 0.146501 Grad Norm 0.450178 8.16s/it
Train loss 7995 0.170583 Grad Norm 1.425342 3.43s/it
Train loss 7996 0.164741 Grad Norm 1.772395 4.98s/it
Train loss 7997 0.139597 Grad Norm 0.564519 3.98s/it
Train loss 7998 0.248285 Grad Norm 0.649443 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 8065 0.213547 Grad Norm 0.581587 2.51s/it
Train loss 8066 0.211350 Grad Norm 0.855618 3.62s/it
Train loss 8067 0.148183 Grad Norm 0.613833 5.29s/it
Train loss 8068 0.180952 Grad Norm 0.565937 5.22s/it
Train loss 8069 0.162045 Grad Norm 0.684688 3.44s/it
Train loss 8070 0.205547 Grad Norm 1.672642 3.66s/it
Train loss 8071 0.161426 Grad Norm 0.737974 4.61s/it
Train loss 8072 0.122693 Grad Norm 0.555768 5.31s/it
Train loss 8073 0.190376 Grad Norm 1.563000 3.77s/it
Train loss 8074 0.143524 Grad Norm 0.731093 4.42s/it
Train loss 8075 0.108162 Grad Norm 0.284498 7.31s/it
Train loss 8076 0.199888 Grad Norm 0.905665 3.32s/it
Train loss 8077 0.132463 Grad Norm 1.136413 4.07s/it
Train loss 8078 0.147022 Grad Norm 0.991951 1.39s/it
Train loss 8079 0.171959 Grad Norm 0.491812 3.59s/it
Train loss 8080 0.141434 Grad Norm 1.860110 5.55s/it
Train loss 8081 0.127813 Grad Norm 1.158683 5.01s/it
Train loss 8082 0.146495 Grad Norm 0.379455 3.83s/it
Train loss 8083 0.109594 Grad Norm 0.236060 7.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 8148 0.158617 Grad Norm 2.306293 5.97s/it
Train loss 8149 0.233362 Grad Norm 1.784636 3.46s/it
Train loss 8150 0.125587 Grad Norm 0.406236 4.57s/it
Train loss 8151 0.174814 Grad Norm 1.186799 2.01s/it
Train loss 8152 0.259423 Grad Norm 2.648730 3.91s/it
Train loss 8153 0.215217 Grad Norm 0.901759 2.49s/it
Train loss 8154 0.171996 Grad Norm 1.478871 3.34s/it
Train loss 8155 0.225493 Grad Norm 3.154876 1.72s/it
Train loss 8156 0.099199 Grad Norm 0.948543 4.99s/it
Train loss 8157 0.123622 Grad Norm 0.243998 3.74s/it
Train loss 8158 0.201798 Grad Norm 0.954613 3.22s/it
Train loss 8159 0.165874 Grad Norm 1.129467 3.88s/it
Train loss 8160 0.211762 Grad Norm 0.710010 3.01s/it
Train loss 8161 0.120558 Grad Norm 0.331857 4.05s/it
Train loss 8162 0.121144 Grad Norm 0.579484 5.09s/it
Train loss 8163 0.174896 Grad Norm 0.396474 7.46s/it
Train loss 8164 0.227521 Grad Norm 0.466805 2.62s/it
Train loss 8165 0.155819 Grad Norm 0.812708 4.18s/it
Train loss 8166 0.210553 Grad Norm 0.716021 1.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 8241 0.129667 Grad Norm 0.848252 3.60s/it
Train loss 8242 0.116153 Grad Norm 1.357066 5.18s/it
Train loss 8243 0.185357 Grad Norm 1.258104 3.05s/it
Train loss 8244 0.145130 Grad Norm 0.472385 6.44s/it
Train loss 8245 0.175372 Grad Norm 1.696351 3.62s/it
Train loss 8246 0.189112 Grad Norm 0.617645 2.28s/it
Train loss 8247 0.166507 Grad Norm 0.871596 2.41s/it
Train loss 8248 0.165445 Grad Norm 0.527919 4.37s/it
Train loss 8249 0.098377 Grad Norm 0.665615 5.49s/it
Train loss 8250 0.132064 Grad Norm 1.020129 5.02s/it
Train loss 8251 0.177412 Grad Norm 0.523098 2.81s/it
Train loss 8252 0.134618 Grad Norm 0.350595 8.22s/it
Train loss 8253 0.155046 Grad Norm 0.688179 3.28s/it
Train loss 8254 0.213187 Grad Norm 1.817649 3.43s/it
Train loss 8255 0.204636 Grad Norm 0.483420 2.65s/it
Train loss 8256 0.154072 Grad Norm 0.913952 4.86s/it
Train loss 8257 0.111600 Grad Norm 0.499193 5.68s/it
Train loss 8258 0.187666 Grad Norm 0.430807 5.39s/it
Train loss 8259 0.203384 Grad Norm 1.643232 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 8325 0.188125 Grad Norm 0.490328 2.84s/it
Train loss 8326 0.181384 Grad Norm 0.476712 1.37s/it
Train loss 8327 0.188737 Grad Norm 0.462647 3.20s/it
Train loss 8328 0.175702 Grad Norm 0.567315 5.09s/it
Train loss 8329 0.236316 Grad Norm 0.906038 3.04s/it
Train loss 8330 0.176627 Grad Norm 1.360649 3.28s/it
Train loss 8331 0.182236 Grad Norm 1.276561 2.96s/it
Train loss 8332 0.170353 Grad Norm 0.439317 5.64s/it
Train loss 8333 0.106054 Grad Norm 0.252028 3.32s/it
Train loss 8334 0.181894 Grad Norm 1.036932 4.16s/it
Train loss 8335 0.148727 Grad Norm 0.788083 3.63s/it
Train loss 8336 0.122487 Grad Norm 0.387555 4.21s/it
Train loss 8337 0.118749 Grad Norm 0.655817 5.56s/it
Train loss 8338 0.143195 Grad Norm 0.689958 4.70s/it
Train loss 8339 0.162526 Grad Norm 0.463679 4.92s/it
Train loss 8340 0.131688 Grad Norm 0.326774 7.51s/it
Train loss 8341 0.157826 Grad Norm 0.490805 3.92s/it
Train loss 8342 0.115004 Grad Norm 0.585731 3.28s/it
Train loss 8343 0.208893 Grad Norm 0.458903 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 8400 0.131458 Grad Norm 0.475729 7.01s/it
Train loss 8401 0.177250 Grad Norm 0.916282 4.86s/it
Train loss 8402 0.105631 Grad Norm 0.559476 8.22s/it
Train loss 8403 0.173422 Grad Norm 1.429556 4.01s/it
Train loss 8404 0.199542 Grad Norm 1.002833 2.33s/it
Train loss 8405 0.154999 Grad Norm 1.321549 3.54s/it
Train loss 8406 0.236333 Grad Norm 2.949277 3.57s/it
Train loss 8407 0.151426 Grad Norm 1.143110 7.55s/it
Train loss 8408 0.202359 Grad Norm 0.575094 3.62s/it
Train loss 8409 0.235338 Grad Norm 1.470350 2.94s/it
Train loss 8410 0.143577 Grad Norm 1.042238 5.07s/it
Train loss 8411 0.197248 Grad Norm 0.873478 3.44s/it
Train loss 8412 0.162773 Grad Norm 1.319873 4.94s/it
Train loss 8413 0.189355 Grad Norm 1.134973 2.37s/it
Train loss 8414 0.164542 Grad Norm 0.513388 5.06s/it
Train loss 8415 0.114697 Grad Norm 0.429915 5.52s/it
Train loss 8416 0.121360 Grad Norm 0.725166 4.40s/it
Train loss 8417 0.169841 Grad Norm 0.962752 1.72s/it
Train loss 8418 0.205421 Grad Norm 0.514812 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 8497 0.199282 Grad Norm 0.817824 1.23s/it
Train loss 8498 0.176716 Grad Norm 0.964510 2.49s/it
Train loss 8499 0.127846 Grad Norm 0.728257 6.32s/it
Train loss 8500 0.148639 Grad Norm 0.545336 7.36s/it
Train loss 8501 0.187168 Grad Norm 1.683658 5.78s/it
Train loss 8502 0.124440 Grad Norm 0.816424 4.85s/it
Train loss 8503 0.194902 Grad Norm 1.161382 3.98s/it
Train loss 8504 0.134238 Grad Norm 0.906071 4.98s/it
Train loss 8505 0.176073 Grad Norm 1.719844 2.81s/it
Train loss 8506 0.207256 Grad Norm 2.242262 2.63s/it
Train loss 8507 0.154868 Grad Norm 0.597018 4.17s/it
Train loss 8508 0.122055 Grad Norm 0.552022 3.94s/it
Train loss 8509 0.187662 Grad Norm 1.799437 3.44s/it
Train loss 8510 0.165357 Grad Norm 1.219147 1.92s/it
Train loss 8511 0.184475 Grad Norm 0.460913 2.51s/it
Train loss 8512 0.178993 Grad Norm 2.117807 5.09s/it
Train loss 8513 0.177449 Grad Norm 1.551758 5.55s/it
Train loss 8514 0.134361 Grad Norm 0.640388 3.84s/it
Train loss 8515 0.105811 Grad Norm 0.270542 4.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 8574 0.192730 Grad Norm 0.585285 3.37s/it
Train loss 8575 0.161828 Grad Norm 0.583318 3.89s/it
Train loss 8576 0.202183 Grad Norm 0.584690 3.58s/it
Train loss 8577 0.179918 Grad Norm 0.918270 2.06s/it
Train loss 8578 0.183325 Grad Norm 1.007541 4.73s/it
Train loss 8579 0.175028 Grad Norm 0.501595 3.48s/it
Train loss 8580 0.173886 Grad Norm 1.220046 2.23s/it
Train loss 8581 0.152571 Grad Norm 0.468609 7.70s/it
Train loss 8582 0.131559 Grad Norm 0.465304 5.35s/it
Train loss 8583 0.130932 Grad Norm 0.325003 4.15s/it
Train loss 8584 0.189809 Grad Norm 1.045011 5.01s/it
Train loss 8585 0.167455 Grad Norm 0.432118 5.09s/it
Train loss 8586 0.153865 Grad Norm 0.237404 5.38s/it
Train loss 8587 0.210760 Grad Norm 1.052027 2.58s/it
Train loss 8588 0.160434 Grad Norm 0.459220 5.38s/it
Train loss 8589 0.199353 Grad Norm 0.742224 2.34s/it
Train loss 8590 0.156144 Grad Norm 0.629420 7.27s/it
Train loss 8591 0.245690 Grad Norm 1.071275 2.11s/it
Train loss 8592 0.188953 Grad Norm 1.230725 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 8653 0.177105 Grad Norm 0.659796 4.37s/it
Train loss 8654 0.189521 Grad Norm 0.936904 4.20s/it
Train loss 8655 0.235949 Grad Norm 0.783355 2.35s/it
Train loss 8656 0.167379 Grad Norm 1.633887 4.92s/it
Train loss 8657 0.236478 Grad Norm 2.090477 3.37s/it
Train loss 8658 0.124216 Grad Norm 0.592873 6.36s/it
Train loss 8659 0.115196 Grad Norm 0.720228 8.25s/it
Train loss 8660 0.182104 Grad Norm 1.124629 3.09s/it
Train loss 8661 0.253785 Grad Norm 1.509738 3.04s/it
Train loss 8662 0.124110 Grad Norm 0.279602 5.13s/it
Train loss 8663 0.223734 Grad Norm 1.872444 2.32s/it
Train loss 8664 0.156597 Grad Norm 1.553585 7.27s/it
Train loss 8665 0.138177 Grad Norm 1.038461 3.36s/it
Train loss 8666 0.186660 Grad Norm 0.472502 1.65s/it
Train loss 8667 0.205088 Grad Norm 2.660896 2.75s/it
Train loss 8668 0.147658 Grad Norm 1.341074 3.82s/it
Train loss 8669 0.193720 Grad Norm 1.647060 2.32s/it
Train loss 8670 0.165767 Grad Norm 0.446820 4.50s/it
Train loss 8671 0.134649 Grad Norm 1.137081 5.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 8736 0.167728 Grad Norm 0.762496 4.37s/it
Train loss 8737 0.138281 Grad Norm 0.502912 3.18s/it
Train loss 8738 0.166095 Grad Norm 0.578973 3.15s/it
Train loss 8739 0.186359 Grad Norm 0.955591 3.65s/it
Train loss 8740 0.215732 Grad Norm 0.334838 2.48s/it
Train loss 8741 0.145616 Grad Norm 0.446007 2.35s/it
Train loss 8742 0.158398 Grad Norm 1.564188 5.80s/it
Train loss 8743 0.190894 Grad Norm 0.976580 4.64s/it
Train loss 8744 0.129981 Grad Norm 0.941665 6.38s/it
Train loss 8745 0.216608 Grad Norm 1.375243 3.69s/it
Train loss 8746 0.123370 Grad Norm 0.401694 3.10s/it
Train loss 8747 0.107697 Grad Norm 0.429940 6.34s/it
Train loss 8748 0.123769 Grad Norm 0.654152 8.27s/it
Train loss 8749 0.151803 Grad Norm 0.964111 4.05s/it
Train loss 8750 0.128133 Grad Norm 0.714973 4.99s/it
Train loss 8751 0.207912 Grad Norm 0.510532 2.38s/it
Train loss 8752 0.244817 Grad Norm 1.003145 2.40s/it
Train loss 8753 0.149762 Grad Norm 1.282719 3.22s/it
Train loss 8754 0.117394 Grad Norm 0.319734 5.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 8827 0.126956 Grad Norm 0.667056 5.65s/it
Train loss 8828 0.144150 Grad Norm 0.886222 3.68s/it
Train loss 8829 0.167452 Grad Norm 0.458447 4.45s/it
Train loss 8830 0.196020 Grad Norm 0.445164 3.37s/it
Train loss 8831 0.195176 Grad Norm 1.370264 2.58s/it
Train loss 8832 0.141631 Grad Norm 0.352655 3.22s/it
Train loss 8833 0.210372 Grad Norm 1.858954 5.28s/it
Train loss 8834 0.181232 Grad Norm 0.686660 3.41s/it
Train loss 8835 0.209902 Grad Norm 1.079965 4.57s/it
Train loss 8836 0.155120 Grad Norm 0.697721 5.38s/it
Train loss 8837 0.213504 Grad Norm 0.626052 3.02s/it
Train loss 8838 0.181705 Grad Norm 1.436844 2.63s/it
Train loss 8839 0.113275 Grad Norm 1.040309 8.16s/it
Train loss 8840 0.140203 Grad Norm 0.542754 4.27s/it
Train loss 8841 0.154323 Grad Norm 1.560747 5.61s/it
Train loss 8842 0.192017 Grad Norm 1.672497 2.79s/it
Train loss 8843 0.190229 Grad Norm 0.838991 3.09s/it
Train loss 8844 0.173583 Grad Norm 0.646238 5.37s/it
Train loss 8845 0.217929 Grad Norm 3.272795 4.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 8904 0.171751 Grad Norm 0.357287 4.85s/it
Train loss 8905 0.153444 Grad Norm 0.653263 5.07s/it
Train loss 8906 0.146598 Grad Norm 0.434763 2.93s/it
Train loss 8907 0.174161 Grad Norm 0.780541 5.36s/it
Train loss 8908 0.235680 Grad Norm 0.755490 4.03s/it
Train loss 8909 0.156270 Grad Norm 0.537954 4.69s/it
Train loss 8910 0.179975 Grad Norm 0.839133 7.74s/it
Train loss 8911 0.149501 Grad Norm 0.279084 3.21s/it
Train loss 8912 0.158521 Grad Norm 0.911203 2.29s/it
Train loss 8913 0.107068 Grad Norm 0.577142 4.03s/it
Train loss 8914 0.222401 Grad Norm 0.533108 2.29s/it
Train loss 8915 0.142313 Grad Norm 0.503311 5.29s/it
Train loss 8916 0.115584 Grad Norm 0.828525 4.89s/it
Train loss 8917 0.171171 Grad Norm 1.123531 3.61s/it
Train loss 8918 0.131287 Grad Norm 0.681039 3.13s/it
Train loss 8919 0.163073 Grad Norm 0.551142 2.83s/it
Train loss 8920 0.201062 Grad Norm 1.798754 2.76s/it
Train loss 8921 0.215133 Grad Norm 1.156790 1.50s/it
Train loss 8922 0.230679 Grad Norm 0.985805 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 8988 0.188047 Grad Norm 0.920318 2.79s/it
Train loss 8989 0.155719 Grad Norm 0.928583 3.98s/it
Train loss 8990 0.158729 Grad Norm 0.810206 3.86s/it
Train loss 8991 0.151637 Grad Norm 0.724941 3.64s/it
Train loss 8992 0.180206 Grad Norm 0.714571 5.43s/it
Train loss 8993 0.172883 Grad Norm 0.446293 1.81s/it
Train loss 8994 0.169086 Grad Norm 0.418611 5.22s/it
Train loss 8995 0.149816 Grad Norm 0.341554 7.53s/it
Train loss 8996 0.214397 Grad Norm 0.793572 2.85s/it
Train loss 8997 0.135076 Grad Norm 0.410091 4.79s/it
Train loss 8998 0.203900 Grad Norm 0.917527 5.65s/it
Train loss 8999 0.133039 Grad Norm 0.320921 3.45s/it
Train loss 9000 0.157899 Grad Norm 0.471294 4.97s/it
Validation loss 9000:  0.412939  
Saving model and optimizer state at iteration 9000 to ./drive/MyDrive/final_weight/checkpoint_9000
Train loss 9001 0.182551 Grad Norm 0.725946 3.10s/it
Train loss 9002 0.173137 Grad Norm 0.512516 5.07s/it
Train loss 9003 0.187158 Grad Norm 0.864156 4.50s/it
Train loss 9004 0.1

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 9079 0.179628 Grad Norm 0.968191 2.66s/it
Train loss 9080 0.134518 Grad Norm 0.827551 4.05s/it
Train loss 9081 0.182609 Grad Norm 0.603957 3.08s/it
Train loss 9082 0.152741 Grad Norm 0.340695 5.24s/it
Train loss 9083 0.225409 Grad Norm 1.643013 5.78s/it
Train loss 9084 0.194203 Grad Norm 0.402775 3.10s/it
Train loss 9085 0.158986 Grad Norm 0.785722 4.14s/it
Train loss 9086 0.240020 Grad Norm 1.359546 3.84s/it
Train loss 9087 0.191336 Grad Norm 0.565331 2.61s/it
Train loss 9088 0.138053 Grad Norm 1.024314 3.22s/it
Train loss 9089 0.156280 Grad Norm 0.726787 3.14s/it
Train loss 9090 0.171397 Grad Norm 0.327764 4.20s/it
Train loss 9091 0.149773 Grad Norm 0.871136 4.21s/it
Train loss 9092 0.212396 Grad Norm 0.397555 2.75s/it
Train loss 9093 0.187308 Grad Norm 0.502851 3.69s/it
Train loss 9094 0.211787 Grad Norm 0.845940 1.97s/it
Train loss 9095 0.181934 Grad Norm 0.405835 2.70s/it
Train loss 9096 0.142468 Grad Norm 0.499622 4.82s/it
Train loss 9097 0.220246 Grad Norm 1.622992 5.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 9156 0.182702 Grad Norm 0.461494 6.26s/it
Train loss 9157 0.164274 Grad Norm 0.467657 3.92s/it
Train loss 9158 0.160270 Grad Norm 0.758694 3.62s/it
Train loss 9159 0.171618 Grad Norm 0.323950 5.43s/it
Train loss 9160 0.085344 Grad Norm 0.509656 4.41s/it
Train loss 9161 0.098145 Grad Norm 0.359446 7.41s/it
Train loss 9162 0.166292 Grad Norm 0.506352 3.84s/it
Train loss 9163 0.192268 Grad Norm 0.553594 2.99s/it
Train loss 9164 0.160292 Grad Norm 0.264757 5.61s/it
Train loss 9165 0.191921 Grad Norm 0.943813 4.96s/it
Train loss 9166 0.210718 Grad Norm 0.578672 3.05s/it
Train loss 9167 0.176969 Grad Norm 1.618089 3.24s/it
Train loss 9168 0.247051 Grad Norm 1.581271 3.30s/it
Train loss 9169 0.193229 Grad Norm 0.802064 2.96s/it
Train loss 9170 0.166447 Grad Norm 1.050124 4.07s/it
Train loss 9171 0.172342 Grad Norm 0.755830 2.91s/it
Train loss 9172 0.181978 Grad Norm 0.654911 2.24s/it
Train loss 9173 0.184671 Grad Norm 1.241072 4.18s/it
Train loss 9174 0.190921 Grad Norm 1.134456 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 9244 0.219768 Grad Norm 1.717739 2.31s/it
Train loss 9245 0.196218 Grad Norm 1.163360 3.94s/it
Train loss 9246 0.184340 Grad Norm 2.058464 4.11s/it
Train loss 9247 0.164008 Grad Norm 0.972360 5.41s/it
Train loss 9248 0.170807 Grad Norm 0.726537 5.15s/it
Train loss 9249 0.142551 Grad Norm 0.755825 3.27s/it
Train loss 9250 0.156214 Grad Norm 1.771423 4.16s/it
Train loss 9251 0.163965 Grad Norm 0.507287 5.16s/it
Train loss 9252 0.248687 Grad Norm 1.042332 2.58s/it
Train loss 9253 0.136376 Grad Norm 0.441267 2.77s/it
Train loss 9254 0.165642 Grad Norm 0.458998 5.27s/it
Train loss 9255 0.186943 Grad Norm 0.854097 1.79s/it
Train loss 9256 0.207566 Grad Norm 1.045003 2.63s/it
Train loss 9257 0.138765 Grad Norm 1.177252 3.25s/it
Train loss 9258 0.140028 Grad Norm 0.507522 3.26s/it
Train loss 9259 0.124586 Grad Norm 0.884041 8.12s/it
Train loss 9260 0.167265 Grad Norm 1.290263 5.32s/it
Train loss 9261 0.170115 Grad Norm 0.492801 4.20s/it
Train loss 9262 0.133909 Grad Norm 0.360644 5.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 9324 0.171720 Grad Norm 1.201331 3.60s/it
Train loss 9325 0.146825 Grad Norm 0.400543 6.01s/it
Train loss 9326 0.150647 Grad Norm 0.610916 6.65s/it
Train loss 9327 0.138614 Grad Norm 1.824176 5.19s/it
Train loss 9328 0.177919 Grad Norm 2.966386 3.85s/it
Train loss 9329 0.165444 Grad Norm 0.948555 5.23s/it
Train loss 9330 0.159879 Grad Norm 0.673052 2.09s/it
Train loss 9331 0.145064 Grad Norm 1.502293 3.93s/it
Train loss 9332 0.176252 Grad Norm 0.957177 7.22s/it
Train loss 9333 0.152493 Grad Norm 0.389660 1.67s/it
Train loss 9334 0.187339 Grad Norm 0.927764 2.07s/it
Train loss 9335 0.216371 Grad Norm 0.998916 3.15s/it
Train loss 9336 0.131138 Grad Norm 0.933272 3.22s/it
Train loss 9337 0.188279 Grad Norm 0.808635 4.17s/it
Train loss 9338 0.178814 Grad Norm 0.441882 3.50s/it
Train loss 9339 0.209294 Grad Norm 1.815082 5.74s/it
Train loss 9340 0.180633 Grad Norm 0.854717 3.45s/it
Train loss 9341 0.197749 Grad Norm 0.792602 3.03s/it
Train loss 9342 0.167875 Grad Norm 1.366377 1.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 9415 0.169820 Grad Norm 0.495366 4.81s/it
Train loss 9416 0.184635 Grad Norm 1.352101 7.41s/it
Train loss 9417 0.221818 Grad Norm 1.988006 3.74s/it
Train loss 9418 0.158622 Grad Norm 0.324141 2.83s/it
Train loss 9419 0.158116 Grad Norm 0.627342 4.19s/it
Train loss 9420 0.147740 Grad Norm 1.104361 2.10s/it
Train loss 9421 0.164908 Grad Norm 0.485098 6.52s/it
Train loss 9422 0.131290 Grad Norm 0.567368 5.08s/it
Train loss 9423 0.130987 Grad Norm 0.822820 3.86s/it
Train loss 9424 0.185360 Grad Norm 0.604102 3.29s/it
Train loss 9425 0.201933 Grad Norm 0.559454 5.47s/it
Train loss 9426 0.154318 Grad Norm 0.997467 4.23s/it
Train loss 9427 0.154521 Grad Norm 0.682697 5.07s/it
Train loss 9428 0.137351 Grad Norm 0.838678 4.45s/it
Train loss 9429 0.219135 Grad Norm 1.152280 3.49s/it
Train loss 9430 0.135348 Grad Norm 0.367239 4.15s/it
Train loss 9431 0.160420 Grad Norm 1.084272 2.35s/it
Train loss 9432 0.114094 Grad Norm 0.694941 5.26s/it
Train loss 9433 0.111206 Grad Norm 0.614497 7.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 9495 0.163226 Grad Norm 1.386865 3.97s/it
Train loss 9496 0.147450 Grad Norm 0.438396 3.31s/it
Train loss 9497 0.192480 Grad Norm 2.005959 4.17s/it
Train loss 9498 0.149632 Grad Norm 1.680176 2.15s/it
Train loss 9499 0.177622 Grad Norm 0.499789 3.50s/it
Train loss 9500 0.194428 Grad Norm 0.736145 2.31s/it
Train loss 9501 0.183045 Grad Norm 0.804525 5.07s/it
Train loss 9502 0.177570 Grad Norm 0.788593 2.32s/it
Train loss 9503 0.161347 Grad Norm 0.948829 5.01s/it
Train loss 9504 0.223270 Grad Norm 1.087941 3.18s/it
Train loss 9505 0.164717 Grad Norm 1.063150 6.51s/it
Train loss 9506 0.223522 Grad Norm 0.795081 4.59s/it
Train loss 9507 0.215512 Grad Norm 1.114231 2.44s/it
Train loss 9508 0.123154 Grad Norm 0.514713 5.33s/it
Train loss 9509 0.122954 Grad Norm 1.547049 5.21s/it
Train loss 9510 0.181015 Grad Norm 0.600052 3.38s/it
Train loss 9511 0.181920 Grad Norm 1.105187 5.06s/it
Train loss 9512 0.106376 Grad Norm 0.640286 4.17s/it
Train loss 9513 0.206354 Grad Norm 1.518577 1.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 9576 0.116073 Grad Norm 1.078156 6.93s/it
Train loss 9577 0.143825 Grad Norm 1.611952 5.09s/it
Train loss 9578 0.176352 Grad Norm 0.982062 2.28s/it
Train loss 9579 0.149477 Grad Norm 0.713425 3.05s/it
Train loss 9580 0.223957 Grad Norm 2.198574 3.14s/it
Train loss 9581 0.187077 Grad Norm 1.832944 3.86s/it
Train loss 9582 0.172379 Grad Norm 0.788470 2.79s/it
Train loss 9583 0.192368 Grad Norm 1.157668 2.51s/it
Train loss 9584 0.172785 Grad Norm 2.006229 3.03s/it
Train loss 9585 0.152887 Grad Norm 1.068720 3.82s/it
Train loss 9586 0.154058 Grad Norm 0.665015 3.19s/it
Train loss 9587 0.159269 Grad Norm 1.144791 3.99s/it
Train loss 9588 0.204071 Grad Norm 1.854582 3.25s/it
Train loss 9589 0.157436 Grad Norm 0.685073 3.68s/it
Train loss 9590 0.177810 Grad Norm 1.133491 5.69s/it
Train loss 9591 0.150415 Grad Norm 0.816047 4.62s/it
Train loss 9592 0.224344 Grad Norm 0.738254 3.38s/it
Train loss 9593 0.129854 Grad Norm 0.228909 4.04s/it
Train loss 9594 0.146706 Grad Norm 0.884928 5.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 9665 0.191792 Grad Norm 0.557474 3.21s/it
Train loss 9666 0.151375 Grad Norm 0.589478 4.83s/it
Train loss 9667 0.162934 Grad Norm 0.802504 5.43s/it
Train loss 9668 0.175118 Grad Norm 0.320071 4.52s/it
Train loss 9669 0.156833 Grad Norm 0.441728 4.60s/it
Train loss 9670 0.133633 Grad Norm 0.437541 3.88s/it
Train loss 9671 0.212604 Grad Norm 0.548004 5.18s/it
Train loss 9672 0.125367 Grad Norm 0.528856 2.49s/it
Train loss 9673 0.212193 Grad Norm 0.747865 2.56s/it
Train loss 9674 0.172856 Grad Norm 0.736827 3.70s/it
Train loss 9675 0.207086 Grad Norm 0.719797 3.31s/it
Train loss 9676 0.167699 Grad Norm 0.634543 4.23s/it
Train loss 9677 0.190159 Grad Norm 0.990150 2.91s/it
Train loss 9678 0.140996 Grad Norm 0.431700 6.39s/it
Train loss 9679 0.206789 Grad Norm 0.842838 4.20s/it
Train loss 9680 0.128736 Grad Norm 0.868426 5.33s/it
Train loss 9681 0.246724 Grad Norm 0.969514 3.66s/it
Train loss 9682 0.188791 Grad Norm 0.882447 2.87s/it
Train loss 9683 0.192895 Grad Norm 1.656863 1.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 9744 0.148243 Grad Norm 0.893929 5.32s/it
Train loss 9745 0.128825 Grad Norm 0.317120 4.27s/it
Train loss 9746 0.140588 Grad Norm 0.760176 7.88s/it
Train loss 9747 0.211069 Grad Norm 1.350660 2.37s/it
Train loss 9748 0.152550 Grad Norm 0.669484 4.36s/it
Train loss 9749 0.188879 Grad Norm 0.797832 2.10s/it
Train loss 9750 0.191281 Grad Norm 1.485263 2.51s/it
Train loss 9751 0.253227 Grad Norm 1.091564 2.76s/it
Train loss 9752 0.182147 Grad Norm 0.994789 2.71s/it
Train loss 9753 0.183339 Grad Norm 0.851945 3.32s/it
Train loss 9754 0.245808 Grad Norm 1.656025 1.03s/it
Train loss 9755 0.170785 Grad Norm 1.182138 3.10s/it
Train loss 9756 0.199157 Grad Norm 0.629308 2.45s/it
Train loss 9757 0.171666 Grad Norm 0.933073 3.50s/it
Train loss 9758 0.235601 Grad Norm 1.005612 2.88s/it
Train loss 9759 0.152991 Grad Norm 0.832524 2.68s/it
Train loss 9760 0.175842 Grad Norm 1.140976 3.58s/it
Train loss 9761 0.176305 Grad Norm 1.168140 4.99s/it
Train loss 9762 0.221674 Grad Norm 0.910400 3.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 9829 0.208711 Grad Norm 1.064824 2.89s/it
Train loss 9830 0.249622 Grad Norm 0.746719 3.16s/it
Train loss 9831 0.141157 Grad Norm 0.523271 6.58s/it
Train loss 9832 0.116273 Grad Norm 0.752040 3.31s/it
Train loss 9833 0.165152 Grad Norm 0.823227 3.29s/it
Train loss 9834 0.199898 Grad Norm 0.384253 2.39s/it
Train loss 9835 0.118818 Grad Norm 0.536781 4.88s/it
Train loss 9836 0.183253 Grad Norm 0.825872 2.98s/it
Train loss 9837 0.188062 Grad Norm 0.437555 4.10s/it
Train loss 9838 0.129997 Grad Norm 0.631152 7.79s/it
Train loss 9839 0.119089 Grad Norm 0.906187 3.56s/it
Train loss 9840 0.136111 Grad Norm 0.285505 5.19s/it
Train loss 9841 0.099192 Grad Norm 0.268324 5.75s/it
Train loss 9842 0.198289 Grad Norm 1.353021 3.03s/it
Train loss 9843 0.206459 Grad Norm 0.710570 2.90s/it
Train loss 9844 0.148652 Grad Norm 0.916850 5.00s/it
Train loss 9845 0.207854 Grad Norm 0.672789 5.23s/it
Train loss 9846 0.181859 Grad Norm 1.011961 3.74s/it
Train loss 9847 0.184752 Grad Norm 0.596990 4.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 9912 0.162293 Grad Norm 0.722599 5.19s/it
Train loss 9913 0.132973 Grad Norm 1.026793 5.11s/it
Train loss 9914 0.178858 Grad Norm 0.423584 2.08s/it
Train loss 9915 0.138554 Grad Norm 0.434540 8.25s/it
Train loss 9916 0.196338 Grad Norm 0.799252 5.53s/it
Train loss 9917 0.139396 Grad Norm 0.419652 3.39s/it
Train loss 9918 0.218014 Grad Norm 0.691546 2.44s/it
Train loss 9919 0.229313 Grad Norm 1.606592 2.83s/it
Train loss 9920 0.158452 Grad Norm 1.118270 3.92s/it
Train loss 9921 0.125570 Grad Norm 0.263895 7.73s/it
Train loss 9922 0.158800 Grad Norm 1.670004 5.76s/it
Train loss 9923 0.224032 Grad Norm 1.266551 3.73s/it
Train loss 9924 0.138379 Grad Norm 0.377880 3.97s/it
Train loss 9925 0.153533 Grad Norm 0.507859 4.62s/it
Train loss 9926 0.205366 Grad Norm 1.210369 1.38s/it
Train loss 9927 0.131620 Grad Norm 0.354184 4.83s/it
Train loss 9928 0.114213 Grad Norm 0.282209 7.46s/it
Train loss 9929 0.230387 Grad Norm 1.439767 2.74s/it
Train loss 9930 0.205368 Grad Norm 0.677628 2.

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 9996 0.138630 Grad Norm 1.154917 7.08s/it
Train loss 9997 0.185479 Grad Norm 0.638282 4.71s/it
Train loss 9998 0.269575 Grad Norm 1.928436 5.52s/it
Train loss 9999 0.224997 Grad Norm 2.549165 4.04s/it
Train loss 10000 0.411784 Grad Norm 5.632980 1.49s/it
Validation loss 10000:  0.430656  
Saving model and optimizer state at iteration 10000 to ./drive/MyDrive/final_weight/checkpoint_10000
Train loss 10001 0.248587 Grad Norm 0.997032 3.65s/it
Train loss 10002 0.212282 Grad Norm 0.960800 5.90s/it
Train loss 10003 0.313500 Grad Norm 2.182800 2.09s/it
Train loss 10004 0.376729 Grad Norm 10.264567 4.28s/it
Train loss 10005 0.385693 Grad Norm 3.255537 3.42s/it
Train loss 10006 0.271525 Grad Norm 2.184643 3.47s/it
Train loss 10007 0.295648 Grad Norm 1.848783 3.18s/it
Train loss 10008 0.254887 Grad Norm 1.339087 6.02s/it
Train loss 10009 0.260153 Grad Norm 1.979893 2.64s/it
Train loss 10010 0.284304 Grad Norm 2.361384 5.19s/it
Train loss 10011 0.271741 Grad Norm 1.404205 2.39s/it
Tra

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 10080 0.184321 Grad Norm 0.889210 5.11s/it
Train loss 10081 0.211853 Grad Norm 0.959068 4.88s/it
Train loss 10082 0.182028 Grad Norm 1.451921 5.24s/it
Train loss 10083 0.245033 Grad Norm 0.695919 3.44s/it
Train loss 10084 0.159036 Grad Norm 0.580742 4.02s/it
Train loss 10085 0.206421 Grad Norm 0.735745 5.73s/it
Train loss 10086 0.180629 Grad Norm 1.249482 2.64s/it
Train loss 10087 0.213208 Grad Norm 1.647627 3.08s/it
Train loss 10088 0.267332 Grad Norm 1.869178 3.37s/it
Train loss 10089 0.157851 Grad Norm 0.685167 8.37s/it
Train loss 10090 0.195870 Grad Norm 1.158952 4.84s/it
Train loss 10091 0.164242 Grad Norm 0.798115 3.26s/it
Train loss 10092 0.137940 Grad Norm 1.419810 5.01s/it
Train loss 10093 0.143215 Grad Norm 1.654079 5.30s/it
Train loss 10094 0.198291 Grad Norm 1.129478 3.38s/it
Train loss 10095 0.206674 Grad Norm 2.107889 2.18s/it
Train loss 10096 0.160076 Grad Norm 0.788975 4.54s/it
Train loss 10097 0.204762 Grad Norm 1.820757 3.34s/it
Train loss 10098 0.248445 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 10176 0.127272 Grad Norm 1.115465 5.68s/it
Train loss 10177 0.168780 Grad Norm 0.613375 4.30s/it
Train loss 10178 0.262303 Grad Norm 1.027116 5.50s/it
Train loss 10179 0.286080 Grad Norm 2.350958 2.38s/it
Train loss 10180 0.202292 Grad Norm 1.910729 2.11s/it
Train loss 10181 0.240896 Grad Norm 0.872882 3.31s/it
Train loss 10182 0.287717 Grad Norm 2.260365 1.99s/it
Train loss 10183 0.227387 Grad Norm 2.352971 4.14s/it
Train loss 10184 0.146408 Grad Norm 0.844060 2.77s/it
Train loss 10185 0.165828 Grad Norm 0.571810 2.63s/it
Train loss 10186 0.191702 Grad Norm 2.147674 7.35s/it
Train loss 10187 0.232579 Grad Norm 1.738147 3.00s/it
Train loss 10188 0.192923 Grad Norm 0.685280 4.60s/it
Train loss 10189 0.149576 Grad Norm 0.984888 5.51s/it
Train loss 10190 0.184255 Grad Norm 0.863885 4.51s/it
Train loss 10191 0.204041 Grad Norm 1.326873 3.28s/it
Train loss 10192 0.145734 Grad Norm 0.655229 5.13s/it
Train loss 10193 0.189107 Grad Norm 0.741538 2.20s/it
Train loss 10194 0.215748 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 10251 0.139166 Grad Norm 1.079815 5.57s/it
Train loss 10252 0.213278 Grad Norm 2.009034 2.30s/it
Train loss 10253 0.203505 Grad Norm 1.714999 2.77s/it
Train loss 10254 0.233781 Grad Norm 2.566347 3.61s/it
Train loss 10255 0.155211 Grad Norm 0.722253 6.64s/it
Train loss 10256 0.116926 Grad Norm 0.998556 5.26s/it
Train loss 10257 0.269950 Grad Norm 1.262721 5.56s/it
Train loss 10258 0.195630 Grad Norm 1.139506 2.56s/it
Train loss 10259 0.173003 Grad Norm 2.624842 2.02s/it
Train loss 10260 0.212959 Grad Norm 1.279155 3.19s/it
Train loss 10261 0.155288 Grad Norm 0.482569 7.85s/it
Train loss 10262 0.229876 Grad Norm 2.028904 3.38s/it
Train loss 10263 0.122044 Grad Norm 1.323169 5.54s/it
Train loss 10264 0.262354 Grad Norm 1.056535 2.93s/it
Train loss 10265 0.154348 Grad Norm 1.624916 4.92s/it
Train loss 10266 0.231009 Grad Norm 1.516065 4.16s/it
Train loss 10267 0.158379 Grad Norm 0.917003 4.23s/it
Train loss 10268 0.137975 Grad Norm 1.008867 6.52s/it
Train loss 10269 0.201565 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 10337 0.149535 Grad Norm 0.369971 5.87s/it
Train loss 10338 0.168575 Grad Norm 0.465488 3.59s/it
Train loss 10339 0.123216 Grad Norm 0.220010 3.44s/it
Train loss 10340 0.237534 Grad Norm 0.455525 2.33s/it
Train loss 10341 0.165943 Grad Norm 0.354922 3.30s/it
Train loss 10342 0.146590 Grad Norm 0.633830 3.92s/it
Train loss 10343 0.197223 Grad Norm 0.980452 2.55s/it
Train loss 10344 0.214219 Grad Norm 0.840207 1.98s/it
Train loss 10345 0.193359 Grad Norm 1.490930 4.94s/it
Train loss 10346 0.235788 Grad Norm 1.650174 4.13s/it
Train loss 10347 0.201240 Grad Norm 0.992634 7.45s/it
Train loss 10348 0.115744 Grad Norm 0.585557 7.46s/it
Train loss 10349 0.184416 Grad Norm 1.038019 4.17s/it
Train loss 10350 0.164133 Grad Norm 0.496408 3.91s/it
Train loss 10351 0.097073 Grad Norm 0.966835 5.23s/it
Train loss 10352 0.126279 Grad Norm 0.647529 4.79s/it
Train loss 10353 0.217085 Grad Norm 0.883642 2.19s/it
Train loss 10354 0.203444 Grad Norm 1.527615 4.22s/it
Train loss 10355 0.167216 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 10417 0.154157 Grad Norm 0.988833 5.18s/it
Train loss 10418 0.166371 Grad Norm 1.643799 4.29s/it
Train loss 10419 0.193917 Grad Norm 2.438309 6.48s/it
Train loss 10420 0.186223 Grad Norm 1.030691 2.06s/it
Train loss 10421 0.195943 Grad Norm 1.072318 6.36s/it
Train loss 10422 0.117098 Grad Norm 1.068189 4.14s/it
Train loss 10423 0.142738 Grad Norm 1.258564 3.68s/it
Train loss 10424 0.182338 Grad Norm 0.473884 3.43s/it
Train loss 10425 0.200115 Grad Norm 1.683230 3.42s/it
Train loss 10426 0.171943 Grad Norm 1.246109 3.34s/it
Train loss 10427 0.120375 Grad Norm 0.519865 3.67s/it
Train loss 10428 0.127394 Grad Norm 0.785165 3.95s/it
Train loss 10429 0.160704 Grad Norm 0.836137 4.58s/it
Train loss 10430 0.205537 Grad Norm 0.351513 3.22s/it
Train loss 10431 0.156396 Grad Norm 0.759285 5.60s/it
Train loss 10432 0.211358 Grad Norm 1.319779 2.50s/it
Train loss 10433 0.152262 Grad Norm 0.668644 4.32s/it
Train loss 10434 0.175749 Grad Norm 0.872111 2.31s/it
Train loss 10435 0.179330 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 10511 0.208537 Grad Norm 0.929048 2.88s/it
Train loss 10512 0.164913 Grad Norm 1.089685 2.43s/it
Train loss 10513 0.210343 Grad Norm 1.133027 2.70s/it
Train loss 10514 0.155883 Grad Norm 2.031919 3.30s/it
Train loss 10515 0.166159 Grad Norm 2.172769 2.48s/it
Train loss 10516 0.142548 Grad Norm 1.167082 5.46s/it
Train loss 10517 0.162137 Grad Norm 0.529985 5.39s/it
Train loss 10518 0.138808 Grad Norm 0.943481 4.44s/it
Train loss 10519 0.190458 Grad Norm 1.140628 4.24s/it
Train loss 10520 0.166835 Grad Norm 0.470896 3.67s/it
Train loss 10521 0.140382 Grad Norm 0.665015 4.38s/it
Train loss 10522 0.181368 Grad Norm 1.352343 3.11s/it
Train loss 10523 0.137318 Grad Norm 0.657943 2.84s/it
Train loss 10524 0.157676 Grad Norm 0.688239 4.84s/it
Train loss 10525 0.226079 Grad Norm 1.332287 2.99s/it
Train loss 10526 0.191346 Grad Norm 0.441525 1.89s/it
Train loss 10527 0.233673 Grad Norm 0.594794 3.21s/it
Train loss 10528 0.153871 Grad Norm 0.398488 6.46s/it
Train loss 10529 0.187171 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 10584 0.195156 Grad Norm 1.405775 5.71s/it
Train loss 10585 0.116504 Grad Norm 0.829174 5.76s/it
Train loss 10586 0.213028 Grad Norm 1.519840 5.64s/it
Train loss 10587 0.125538 Grad Norm 0.868686 7.23s/it
Train loss 10588 0.165899 Grad Norm 0.859818 2.83s/it
Train loss 10589 0.148189 Grad Norm 0.261325 2.69s/it
Train loss 10590 0.180043 Grad Norm 0.983592 4.22s/it
Train loss 10591 0.158659 Grad Norm 0.410320 1.93s/it
Train loss 10592 0.176661 Grad Norm 0.925515 5.71s/it
Train loss 10593 0.178951 Grad Norm 0.646793 2.56s/it
Train loss 10594 0.164871 Grad Norm 0.330148 4.10s/it
Train loss 10595 0.185006 Grad Norm 0.623995 4.40s/it
Train loss 10596 0.113785 Grad Norm 0.210538 4.84s/it
Train loss 10597 0.132512 Grad Norm 0.358859 3.78s/it
Train loss 10598 0.121037 Grad Norm 0.261713 6.41s/it
Train loss 10599 0.187628 Grad Norm 0.747105 3.64s/it
Train loss 10600 0.190040 Grad Norm 0.596542 2.55s/it
Train loss 10601 0.117187 Grad Norm 0.283824 5.13s/it
Train loss 10602 0.183742 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 10670 0.151557 Grad Norm 0.656552 3.95s/it
Train loss 10671 0.223904 Grad Norm 0.540718 3.51s/it
Train loss 10672 0.175653 Grad Norm 0.434716 5.04s/it
Train loss 10673 0.111170 Grad Norm 0.489809 6.47s/it
Train loss 10674 0.177344 Grad Norm 0.708345 7.43s/it
Train loss 10675 0.196612 Grad Norm 0.371222 3.94s/it
Train loss 10676 0.113775 Grad Norm 0.425111 4.99s/it
Train loss 10677 0.216103 Grad Norm 1.175194 4.82s/it
Train loss 10678 0.161954 Grad Norm 0.439852 5.83s/it
Train loss 10679 0.172939 Grad Norm 1.190220 2.43s/it
Train loss 10680 0.142616 Grad Norm 0.785844 4.36s/it
Train loss 10681 0.126502 Grad Norm 0.282699 6.50s/it
Train loss 10682 0.140213 Grad Norm 1.109931 7.65s/it
Train loss 10683 0.177807 Grad Norm 0.777711 4.75s/it
Train loss 10684 0.218463 Grad Norm 0.607390 3.40s/it
Train loss 10685 0.132419 Grad Norm 0.294501 4.17s/it
Train loss 10686 0.151784 Grad Norm 1.576634 4.89s/it
Train loss 10687 0.150343 Grad Norm 0.730571 4.35s/it
Train loss 10688 0.226136 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 10764 0.227542 Grad Norm 0.719602 2.80s/it
Train loss 10765 0.207518 Grad Norm 0.441781 1.53s/it
Train loss 10766 0.190121 Grad Norm 0.760639 3.97s/it
Train loss 10767 0.205526 Grad Norm 1.354068 5.11s/it
Train loss 10768 0.130247 Grad Norm 0.371793 8.21s/it
Train loss 10769 0.183821 Grad Norm 0.666296 3.08s/it
Train loss 10770 0.119480 Grad Norm 0.289897 7.26s/it
Train loss 10771 0.210920 Grad Norm 0.477934 4.15s/it
Train loss 10772 0.140594 Grad Norm 0.872899 3.62s/it
Train loss 10773 0.235024 Grad Norm 1.161120 2.24s/it
Train loss 10774 0.201664 Grad Norm 1.280999 2.24s/it
Train loss 10775 0.180148 Grad Norm 0.425961 4.20s/it
Train loss 10776 0.189613 Grad Norm 0.586388 2.41s/it
Train loss 10777 0.168124 Grad Norm 0.560755 2.06s/it
Train loss 10778 0.251483 Grad Norm 2.228018 2.76s/it
Train loss 10779 0.217792 Grad Norm 1.370334 1.71s/it
Train loss 10780 0.177595 Grad Norm 0.396433 5.61s/it
Train loss 10781 0.142999 Grad Norm 0.556918 2.37s/it
Train loss 10782 0.156280 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 10842 0.209401 Grad Norm 0.838106 3.81s/it
Train loss 10843 0.185972 Grad Norm 0.577914 4.88s/it
Train loss 10844 0.209561 Grad Norm 0.309041 2.76s/it
Train loss 10845 0.190420 Grad Norm 0.540868 4.13s/it
Train loss 10846 0.125054 Grad Norm 0.430747 2.39s/it
Train loss 10847 0.149767 Grad Norm 0.460281 3.49s/it
Train loss 10848 0.241911 Grad Norm 0.447571 1.08s/it
Train loss 10849 0.162248 Grad Norm 0.862649 3.92s/it
Train loss 10850 0.154660 Grad Norm 0.612445 1.82s/it
Train loss 10851 0.204790 Grad Norm 1.444027 5.14s/it
Train loss 10852 0.122839 Grad Norm 0.776042 4.90s/it
Train loss 10853 0.213455 Grad Norm 0.606509 1.34s/it
Train loss 10854 0.181686 Grad Norm 1.201416 3.41s/it
Train loss 10855 0.132864 Grad Norm 0.484975 4.53s/it
Train loss 10856 0.191282 Grad Norm 0.726479 5.60s/it
Train loss 10857 0.139415 Grad Norm 1.013717 5.03s/it
Train loss 10858 0.162969 Grad Norm 1.491223 3.44s/it
Train loss 10859 0.119439 Grad Norm 0.492262 5.76s/it
Train loss 10860 0.178779 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 10920 0.175577 Grad Norm 1.077542 3.16s/it
Train loss 10921 0.167222 Grad Norm 0.840963 5.34s/it
Train loss 10922 0.121922 Grad Norm 0.323533 4.27s/it
Train loss 10923 0.181122 Grad Norm 0.594867 3.30s/it
Train loss 10924 0.210101 Grad Norm 2.102733 1.47s/it
Train loss 10925 0.179264 Grad Norm 0.750514 3.05s/it
Train loss 10926 0.129820 Grad Norm 0.473560 7.33s/it
Train loss 10927 0.106527 Grad Norm 1.059552 5.66s/it
Train loss 10928 0.177405 Grad Norm 0.696323 4.56s/it
Train loss 10929 0.197802 Grad Norm 0.494025 4.73s/it
Train loss 10930 0.176163 Grad Norm 2.602244 2.42s/it
Train loss 10931 0.118267 Grad Norm 0.427333 7.99s/it
Train loss 10932 0.184360 Grad Norm 0.819382 3.42s/it
Train loss 10933 0.147909 Grad Norm 0.604934 2.67s/it
Train loss 10934 0.181083 Grad Norm 0.424042 4.17s/it
Train loss 10935 0.230727 Grad Norm 0.890238 3.03s/it
Train loss 10936 0.120889 Grad Norm 0.372335 4.47s/it
Train loss 10937 0.170007 Grad Norm 1.173693 3.46s/it
Train loss 10938 0.158083 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 11004 0.162074 Grad Norm 1.798086 6.97s/it
Train loss 11005 0.105631 Grad Norm 1.022251 5.77s/it
Train loss 11006 0.160839 Grad Norm 0.240543 3.97s/it
Train loss 11007 0.156501 Grad Norm 0.773931 4.59s/it
Train loss 11008 0.166925 Grad Norm 0.750732 5.24s/it
Train loss 11009 0.171948 Grad Norm 0.559904 4.92s/it
Train loss 11010 0.113649 Grad Norm 0.573347 3.96s/it
Train loss 11011 0.194918 Grad Norm 1.375796 2.39s/it
Train loss 11012 0.151493 Grad Norm 0.627300 3.82s/it
Train loss 11013 0.168661 Grad Norm 1.078750 2.37s/it
Train loss 11014 0.175974 Grad Norm 1.888118 2.72s/it
Train loss 11015 0.183760 Grad Norm 2.413595 3.22s/it
Train loss 11016 0.182914 Grad Norm 2.311216 2.71s/it
Train loss 11017 0.172576 Grad Norm 0.868685 3.85s/it
Train loss 11018 0.165738 Grad Norm 1.109318 3.02s/it
Train loss 11019 0.223622 Grad Norm 3.428127 2.30s/it
Train loss 11020 0.134844 Grad Norm 1.971567 3.61s/it
Train loss 11021 0.159140 Grad Norm 0.995496 3.41s/it
Train loss 11022 0.143762 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 11088 0.169571 Grad Norm 0.658326 3.52s/it
Train loss 11089 0.136711 Grad Norm 1.153101 4.94s/it
Train loss 11090 0.141410 Grad Norm 0.431487 4.51s/it
Train loss 11091 0.180575 Grad Norm 0.512583 5.71s/it
Train loss 11092 0.170761 Grad Norm 0.605174 3.94s/it
Train loss 11093 0.124483 Grad Norm 0.328761 5.28s/it
Train loss 11094 0.198018 Grad Norm 0.697889 2.88s/it
Train loss 11095 0.197442 Grad Norm 0.514438 4.10s/it
Train loss 11096 0.191402 Grad Norm 0.493591 3.56s/it
Train loss 11097 0.163734 Grad Norm 0.233614 2.79s/it
Train loss 11098 0.166356 Grad Norm 0.425789 4.20s/it
Train loss 11099 0.118026 Grad Norm 0.334110 2.68s/it
Train loss 11100 0.111023 Grad Norm 0.216200 5.57s/it
Train loss 11101 0.144364 Grad Norm 0.957740 2.08s/it
Train loss 11102 0.123199 Grad Norm 0.381448 6.35s/it
Train loss 11103 0.155442 Grad Norm 0.365851 3.23s/it
Train loss 11104 0.207770 Grad Norm 1.331405 3.44s/it
Train loss 11105 0.188304 Grad Norm 1.327665 3.96s/it
Train loss 11106 0.123774 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 11173 0.150422 Grad Norm 0.809845 3.38s/it
Train loss 11174 0.157372 Grad Norm 0.487831 3.12s/it
Train loss 11175 0.212223 Grad Norm 0.540929 2.89s/it
Train loss 11176 0.211062 Grad Norm 0.990360 5.44s/it
Train loss 11177 0.148038 Grad Norm 0.549488 5.36s/it
Train loss 11178 0.202299 Grad Norm 0.528860 2.87s/it
Train loss 11179 0.195642 Grad Norm 0.970204 4.46s/it
Train loss 11180 0.185561 Grad Norm 0.629665 2.87s/it
Train loss 11181 0.127806 Grad Norm 0.264053 8.32s/it
Train loss 11182 0.191059 Grad Norm 0.460774 2.39s/it
Train loss 11183 0.142767 Grad Norm 0.353067 5.19s/it
Train loss 11184 0.163049 Grad Norm 0.345511 3.94s/it
Train loss 11185 0.201138 Grad Norm 0.930634 5.24s/it
Train loss 11186 0.178986 Grad Norm 0.403934 3.43s/it
Train loss 11187 0.190477 Grad Norm 0.394068 2.42s/it
Train loss 11188 0.226206 Grad Norm 0.676121 2.67s/it
Train loss 11189 0.132990 Grad Norm 0.856148 6.43s/it
Train loss 11190 0.201385 Grad Norm 1.221956 5.62s/it
Train loss 11191 0.137384 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 11256 0.161811 Grad Norm 1.831264 2.07s/it
Train loss 11257 0.236048 Grad Norm 1.115411 4.06s/it
Train loss 11258 0.119897 Grad Norm 0.857813 3.08s/it
Train loss 11259 0.154911 Grad Norm 2.679777 4.36s/it
Train loss 11260 0.175716 Grad Norm 1.344788 2.33s/it
Train loss 11261 0.258469 Grad Norm 1.745912 3.29s/it
Train loss 11262 0.168873 Grad Norm 1.481492 3.93s/it
Train loss 11263 0.198377 Grad Norm 3.191672 5.10s/it
Train loss 11264 0.183634 Grad Norm 1.798129 4.05s/it
Train loss 11265 0.148611 Grad Norm 0.291235 2.83s/it
Train loss 11266 0.173793 Grad Norm 1.855808 4.18s/it
Train loss 11267 0.229391 Grad Norm 2.515577 2.88s/it
Train loss 11268 0.170490 Grad Norm 0.590723 4.99s/it
Train loss 11269 0.119222 Grad Norm 0.531501 5.73s/it
Train loss 11270 0.174983 Grad Norm 1.791517 3.66s/it
Train loss 11271 0.152675 Grad Norm 0.786934 6.31s/it
Train loss 11272 0.145723 Grad Norm 0.744165 3.26s/it
Train loss 11273 0.165501 Grad Norm 0.717404 3.20s/it
Train loss 11274 0.174098 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 11344 0.203166 Grad Norm 0.804924 4.05s/it
Train loss 11345 0.095425 Grad Norm 0.295802 5.00s/it
Train loss 11346 0.181181 Grad Norm 1.039108 3.27s/it
Train loss 11347 0.182561 Grad Norm 1.139894 1.53s/it
Train loss 11348 0.230250 Grad Norm 1.275952 3.21s/it
Train loss 11349 0.181337 Grad Norm 1.812904 4.88s/it
Train loss 11350 0.169461 Grad Norm 2.115371 3.02s/it
Train loss 11351 0.224193 Grad Norm 1.945381 2.14s/it
Train loss 11352 0.132952 Grad Norm 0.763752 5.57s/it
Train loss 11353 0.158818 Grad Norm 2.512214 3.37s/it
Train loss 11354 0.247324 Grad Norm 2.076085 1.96s/it
Train loss 11355 0.175050 Grad Norm 0.923378 5.55s/it
Train loss 11356 0.233194 Grad Norm 1.214386 1.52s/it
Train loss 11357 0.240838 Grad Norm 1.174702 2.32s/it
Train loss 11358 0.181332 Grad Norm 0.646996 2.42s/it
Train loss 11359 0.172392 Grad Norm 0.661639 4.99s/it
Train loss 11360 0.124775 Grad Norm 1.537887 3.94s/it
Train loss 11361 0.143628 Grad Norm 1.193533 7.31s/it
Train loss 11362 0.173994 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 11425 0.175593 Grad Norm 1.054480 3.60s/it
Train loss 11426 0.171934 Grad Norm 0.459022 5.67s/it
Train loss 11427 0.196387 Grad Norm 0.625271 3.31s/it
Train loss 11428 0.191966 Grad Norm 0.376172 2.59s/it
Train loss 11429 0.177749 Grad Norm 0.730154 3.04s/it
Train loss 11430 0.144416 Grad Norm 0.625151 2.01s/it
Train loss 11431 0.183114 Grad Norm 0.867060 5.08s/it
Train loss 11432 0.227425 Grad Norm 1.271349 3.22s/it
Train loss 11433 0.254292 Grad Norm 0.941014 1.66s/it
Train loss 11434 0.142919 Grad Norm 0.520677 3.62s/it
Train loss 11435 0.248166 Grad Norm 1.399958 2.41s/it
Train loss 11436 0.115945 Grad Norm 0.572030 4.98s/it
Train loss 11437 0.132384 Grad Norm 0.422338 5.07s/it
Train loss 11438 0.180468 Grad Norm 0.307705 2.40s/it
Train loss 11439 0.125496 Grad Norm 0.438804 4.11s/it
Train loss 11440 0.213209 Grad Norm 0.376111 2.79s/it
Train loss 11441 0.143802 Grad Norm 0.361038 3.29s/it
Train loss 11442 0.170547 Grad Norm 0.310514 5.59s/it
Train loss 11443 0.129799 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 11528 0.178016 Grad Norm 0.533658 3.71s/it
Train loss 11529 0.201568 Grad Norm 1.322453 5.10s/it
Train loss 11530 0.128333 Grad Norm 0.566827 6.40s/it
Train loss 11531 0.168684 Grad Norm 0.737266 2.91s/it
Train loss 11532 0.169434 Grad Norm 0.867758 3.87s/it
Train loss 11533 0.138519 Grad Norm 1.127287 4.04s/it
Train loss 11534 0.140358 Grad Norm 0.584726 3.88s/it
Train loss 11535 0.148844 Grad Norm 1.541075 5.53s/it
Train loss 11536 0.217363 Grad Norm 3.535237 3.89s/it
Train loss 11537 0.170933 Grad Norm 1.376951 2.06s/it
Train loss 11538 0.165612 Grad Norm 0.549758 2.83s/it
Train loss 11539 0.134925 Grad Norm 0.946949 4.05s/it
Train loss 11540 0.183617 Grad Norm 1.818093 2.55s/it
Train loss 11541 0.253109 Grad Norm 2.338785 2.97s/it
Train loss 11542 0.177662 Grad Norm 0.510873 4.55s/it
Train loss 11543 0.150540 Grad Norm 1.041492 4.88s/it
Train loss 11544 0.131764 Grad Norm 0.446092 4.90s/it
Train loss 11545 0.199401 Grad Norm 0.600961 3.46s/it
Train loss 11546 0.202821 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 11594 0.145475 Grad Norm 0.445458 5.27s/it
Train loss 11595 0.176338 Grad Norm 0.700825 3.63s/it
Train loss 11596 0.130795 Grad Norm 0.390274 3.94s/it
Train loss 11597 0.142252 Grad Norm 0.809492 5.36s/it
Train loss 11598 0.153501 Grad Norm 0.776977 2.99s/it
Train loss 11599 0.190602 Grad Norm 0.449147 1.42s/it
Train loss 11600 0.139137 Grad Norm 0.762628 7.62s/it
Train loss 11601 0.138764 Grad Norm 1.006692 4.25s/it
Train loss 11602 0.132012 Grad Norm 0.785418 3.09s/it
Train loss 11603 0.246381 Grad Norm 1.347968 2.24s/it
Train loss 11604 0.161930 Grad Norm 1.894801 2.21s/it
Train loss 11605 0.154005 Grad Norm 1.607478 3.38s/it
Train loss 11606 0.129788 Grad Norm 0.332324 4.57s/it
Train loss 11607 0.185377 Grad Norm 2.085957 2.73s/it
Train loss 11608 0.103586 Grad Norm 1.303697 4.59s/it
Train loss 11609 0.159791 Grad Norm 1.200214 2.69s/it
Train loss 11610 0.124627 Grad Norm 0.605749 4.14s/it
Train loss 11611 0.149080 Grad Norm 1.829945 4.82s/it
Train loss 11612 0.206127 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 11676 0.174131 Grad Norm 0.441660 4.88s/it
Train loss 11677 0.103234 Grad Norm 1.182801 6.58s/it
Train loss 11678 0.154892 Grad Norm 1.835948 5.46s/it
Train loss 11679 0.150969 Grad Norm 2.029411 5.06s/it
Train loss 11680 0.099038 Grad Norm 0.365805 8.27s/it
Train loss 11681 0.151011 Grad Norm 1.464879 5.41s/it
Train loss 11682 0.120060 Grad Norm 1.216708 3.30s/it
Train loss 11683 0.136982 Grad Norm 0.724130 1.84s/it
Train loss 11684 0.128532 Grad Norm 0.355801 3.07s/it
Train loss 11685 0.175830 Grad Norm 1.146771 4.73s/it
Train loss 11686 0.238639 Grad Norm 1.604140 2.87s/it
Train loss 11687 0.162559 Grad Norm 0.796306 2.77s/it
Train loss 11688 0.130026 Grad Norm 1.231779 5.31s/it
Train loss 11689 0.133764 Grad Norm 1.414191 4.76s/it
Train loss 11690 0.179777 Grad Norm 1.166257 2.92s/it
Train loss 11691 0.213463 Grad Norm 1.074547 2.65s/it
Train loss 11692 0.175521 Grad Norm 0.864174 3.74s/it
Train loss 11693 0.145899 Grad Norm 0.989483 4.02s/it
Train loss 11694 0.203931 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 11767 0.155740 Grad Norm 0.517170 3.73s/it
Train loss 11768 0.119706 Grad Norm 0.240616 4.78s/it
Train loss 11769 0.133812 Grad Norm 1.129160 6.31s/it
Train loss 11770 0.117841 Grad Norm 0.972336 5.10s/it
Train loss 11771 0.168817 Grad Norm 0.423449 3.31s/it
Train loss 11772 0.143501 Grad Norm 1.332640 4.89s/it
Train loss 11773 0.115099 Grad Norm 0.867169 4.06s/it
Train loss 11774 0.231214 Grad Norm 0.965082 3.39s/it
Train loss 11775 0.168140 Grad Norm 0.580147 5.16s/it
Train loss 11776 0.143198 Grad Norm 0.758247 5.62s/it
Train loss 11777 0.183886 Grad Norm 1.252792 2.18s/it
Train loss 11778 0.206337 Grad Norm 0.543873 2.90s/it
Train loss 11779 0.165057 Grad Norm 1.303597 5.40s/it
Train loss 11780 0.194604 Grad Norm 0.791837 2.85s/it
Train loss 11781 0.170532 Grad Norm 0.344857 4.80s/it
Train loss 11782 0.211541 Grad Norm 1.157895 3.32s/it
Train loss 11783 0.192182 Grad Norm 0.508374 2.83s/it
Train loss 11784 0.176931 Grad Norm 0.862777 2.03s/it
Train loss 11785 0.205315 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 11844 0.141988 Grad Norm 2.063853 6.93s/it
Train loss 11845 0.156627 Grad Norm 1.212637 6.40s/it
Train loss 11846 0.155930 Grad Norm 0.897793 4.91s/it
Train loss 11847 0.151174 Grad Norm 0.727866 2.84s/it
Train loss 11848 0.195535 Grad Norm 1.388037 2.87s/it
Train loss 11849 0.208988 Grad Norm 1.694139 1.72s/it
Train loss 11850 0.282898 Grad Norm 1.862600 3.21s/it
Train loss 11851 0.224186 Grad Norm 1.682791 2.25s/it
Train loss 11852 0.153522 Grad Norm 1.708040 3.21s/it
Train loss 11853 0.101760 Grad Norm 0.635341 7.65s/it
Train loss 11854 0.186799 Grad Norm 0.904941 2.43s/it
Train loss 11855 0.173629 Grad Norm 0.952917 4.16s/it
Train loss 11856 0.127429 Grad Norm 0.684660 3.66s/it
Train loss 11857 0.117363 Grad Norm 0.395056 5.36s/it
Train loss 11858 0.169632 Grad Norm 0.560497 2.04s/it
Train loss 11859 0.166788 Grad Norm 0.609854 4.19s/it
Train loss 11860 0.174469 Grad Norm 0.788395 4.97s/it
Train loss 11861 0.204521 Grad Norm 0.699908 2.97s/it
Train loss 11862 0.144288 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 11928 0.153674 Grad Norm 1.896441 5.86s/it
Train loss 11929 0.214557 Grad Norm 0.738147 3.46s/it
Train loss 11930 0.185436 Grad Norm 0.723546 4.58s/it
Train loss 11931 0.235989 Grad Norm 1.921200 2.01s/it
Train loss 11932 0.140376 Grad Norm 0.964806 3.85s/it
Train loss 11933 0.144901 Grad Norm 0.768915 2.55s/it
Train loss 11934 0.145788 Grad Norm 0.988747 2.00s/it
Train loss 11935 0.141372 Grad Norm 1.802997 5.10s/it
Train loss 11936 0.132140 Grad Norm 1.485369 2.65s/it
Train loss 11937 0.173542 Grad Norm 0.848547 2.15s/it
Train loss 11938 0.113200 Grad Norm 0.897158 4.08s/it
Train loss 11939 0.238169 Grad Norm 1.713137 1.66s/it
Train loss 11940 0.147534 Grad Norm 0.771274 3.90s/it
Train loss 11941 0.204025 Grad Norm 0.552225 4.16s/it
Train loss 11942 0.195176 Grad Norm 1.997575 5.85s/it
Train loss 11943 0.205701 Grad Norm 1.207966 4.37s/it
Train loss 11944 0.132190 Grad Norm 0.356302 5.57s/it
Train loss 11945 0.167061 Grad Norm 1.039688 2.24s/it
Train loss 11946 0.203370 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 12012 0.176542 Grad Norm 1.010821 2.89s/it
Train loss 12013 0.213330 Grad Norm 0.594713 3.61s/it
Train loss 12014 0.174418 Grad Norm 1.142344 3.39s/it
Train loss 12015 0.109295 Grad Norm 0.425079 6.49s/it
Train loss 12016 0.210684 Grad Norm 1.769793 5.56s/it
Train loss 12017 0.115478 Grad Norm 0.626215 2.99s/it
Train loss 12018 0.161026 Grad Norm 0.960606 4.90s/it
Train loss 12019 0.274808 Grad Norm 1.274427 3.58s/it
Train loss 12020 0.137085 Grad Norm 0.823921 2.82s/it
Train loss 12021 0.180320 Grad Norm 0.972907 4.03s/it
Train loss 12022 0.134320 Grad Norm 0.441663 3.22s/it
Train loss 12023 0.175780 Grad Norm 0.993051 3.19s/it
Train loss 12024 0.181731 Grad Norm 1.217602 2.36s/it
Train loss 12025 0.114633 Grad Norm 0.622546 3.93s/it
Train loss 12026 0.146163 Grad Norm 1.139345 3.39s/it
Train loss 12027 0.152539 Grad Norm 0.873539 7.30s/it
Train loss 12028 0.152910 Grad Norm 0.750510 3.36s/it
Train loss 12029 0.134927 Grad Norm 0.661687 6.58s/it
Train loss 12030 0.189179 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 12097 0.158124 Grad Norm 0.743977 3.54s/it
Train loss 12098 0.174056 Grad Norm 1.741949 5.02s/it
Train loss 12099 0.167083 Grad Norm 1.169737 6.39s/it
Train loss 12100 0.180547 Grad Norm 0.618197 3.08s/it
Train loss 12101 0.156507 Grad Norm 1.363368 6.36s/it
Train loss 12102 0.242770 Grad Norm 1.175006 2.31s/it
Train loss 12103 0.189621 Grad Norm 1.260284 2.28s/it
Train loss 12104 0.122114 Grad Norm 0.484615 4.15s/it
Train loss 12105 0.169263 Grad Norm 0.791790 3.62s/it
Train loss 12106 0.087439 Grad Norm 0.286783 5.80s/it
Train loss 12107 0.193193 Grad Norm 0.972458 2.32s/it
Train loss 12108 0.208045 Grad Norm 1.898384 2.57s/it
Train loss 12109 0.197505 Grad Norm 1.065676 3.19s/it
Train loss 12110 0.199284 Grad Norm 1.116981 2.78s/it
Train loss 12111 0.152878 Grad Norm 1.688383 3.13s/it
Train loss 12112 0.205024 Grad Norm 0.925432 4.53s/it
Train loss 12113 0.203779 Grad Norm 0.641673 2.26s/it
Train loss 12114 0.180746 Grad Norm 1.153256 5.10s/it
Train loss 12115 0.119524 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 12183 0.150114 Grad Norm 0.641728 2.01s/it
Train loss 12184 0.232053 Grad Norm 4.760710 2.36s/it
Train loss 12185 0.144325 Grad Norm 0.898458 4.42s/it
Train loss 12186 0.138499 Grad Norm 1.068684 8.30s/it
Train loss 12187 0.112085 Grad Norm 0.609059 6.40s/it
Train loss 12188 0.170562 Grad Norm 1.641957 4.06s/it
Train loss 12189 0.119402 Grad Norm 1.051453 5.34s/it
Train loss 12190 0.156012 Grad Norm 0.644200 3.69s/it
Train loss 12191 0.180752 Grad Norm 0.519127 4.67s/it
Train loss 12192 0.201559 Grad Norm 1.042007 2.90s/it
Train loss 12193 0.216648 Grad Norm 1.381407 1.92s/it
Train loss 12194 0.189802 Grad Norm 1.057480 2.33s/it
Train loss 12195 0.122863 Grad Norm 1.325779 5.87s/it
Train loss 12196 0.244114 Grad Norm 0.950141 2.33s/it
Train loss 12197 0.155075 Grad Norm 0.310709 2.69s/it
Train loss 12198 0.200856 Grad Norm 0.553472 4.27s/it
Train loss 12199 0.164808 Grad Norm 0.439324 2.91s/it
Train loss 12200 0.142366 Grad Norm 0.342791 4.49s/it
Train loss 12201 0.132274 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 12279 0.198374 Grad Norm 1.859539 4.07s/it
Train loss 12280 0.169305 Grad Norm 2.381732 3.21s/it
Train loss 12281 0.191829 Grad Norm 1.353255 2.07s/it
Train loss 12282 0.194785 Grad Norm 1.138586 5.09s/it
Train loss 12283 0.130493 Grad Norm 1.749263 3.90s/it
Train loss 12284 0.159954 Grad Norm 1.187360 3.38s/it
Train loss 12285 0.221021 Grad Norm 0.873184 2.07s/it
Train loss 12286 0.159802 Grad Norm 1.708620 2.29s/it
Train loss 12287 0.161603 Grad Norm 2.197033 4.57s/it
Train loss 12288 0.152313 Grad Norm 1.663968 3.01s/it
Train loss 12289 0.170403 Grad Norm 0.694334 2.06s/it
Train loss 12290 0.211514 Grad Norm 1.339445 3.46s/it
Train loss 12291 0.147618 Grad Norm 0.965663 3.96s/it
Train loss 12292 0.166120 Grad Norm 0.948963 4.52s/it
Train loss 12293 0.219034 Grad Norm 0.506714 1.77s/it
Train loss 12294 0.189189 Grad Norm 1.213688 3.32s/it
Train loss 12295 0.149085 Grad Norm 0.947909 4.88s/it
Train loss 12296 0.145212 Grad Norm 0.319527 5.40s/it
Train loss 12297 0.180703 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 12369 0.153816 Grad Norm 0.796167 4.18s/it
Train loss 12370 0.158160 Grad Norm 0.879843 7.25s/it
Train loss 12371 0.175362 Grad Norm 1.296052 3.65s/it
Train loss 12372 0.181538 Grad Norm 0.796075 2.43s/it
Train loss 12373 0.188817 Grad Norm 1.189542 5.07s/it
Train loss 12374 0.145875 Grad Norm 1.600049 3.63s/it
Train loss 12375 0.160353 Grad Norm 0.870256 5.39s/it
Train loss 12376 0.192233 Grad Norm 1.426133 3.43s/it
Train loss 12377 0.199006 Grad Norm 2.191840 1.56s/it
Train loss 12378 0.175317 Grad Norm 0.972531 3.36s/it
Train loss 12379 0.135208 Grad Norm 0.628125 2.59s/it
Train loss 12380 0.253789 Grad Norm 2.768952 4.62s/it
Train loss 12381 0.142736 Grad Norm 1.121623 4.12s/it
Train loss 12382 0.172050 Grad Norm 0.302162 2.86s/it
Train loss 12383 0.148177 Grad Norm 0.961746 5.38s/it
Train loss 12384 0.156944 Grad Norm 1.192341 2.12s/it
Train loss 12385 0.180550 Grad Norm 0.928365 4.12s/it
Train loss 12386 0.147062 Grad Norm 1.091732 5.00s/it
Train loss 12387 0.133205 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 12432 0.153860 Grad Norm 0.782557 4.07s/it
Train loss 12433 0.180797 Grad Norm 1.203789 7.47s/it
Train loss 12434 0.165681 Grad Norm 0.688407 2.80s/it
Train loss 12435 0.239956 Grad Norm 0.849727 3.37s/it
Train loss 12436 0.233036 Grad Norm 0.627521 4.03s/it
Train loss 12437 0.201641 Grad Norm 0.857193 3.72s/it
Train loss 12438 0.179137 Grad Norm 0.769248 2.34s/it
Train loss 12439 0.169733 Grad Norm 0.542627 2.68s/it
Train loss 12440 0.174454 Grad Norm 0.516718 3.21s/it
Train loss 12441 0.211467 Grad Norm 0.459186 4.10s/it
Train loss 12442 0.241176 Grad Norm 0.946932 2.14s/it
Train loss 12443 0.186386 Grad Norm 0.806977 1.93s/it
Train loss 12444 0.153521 Grad Norm 0.937197 2.25s/it
Train loss 12445 0.102925 Grad Norm 0.528452 7.90s/it
Train loss 12446 0.128450 Grad Norm 0.565131 5.53s/it
Train loss 12447 0.183341 Grad Norm 0.740399 4.23s/it
Train loss 12448 0.197473 Grad Norm 0.412649 2.41s/it
Train loss 12449 0.122846 Grad Norm 0.428176 5.65s/it
Train loss 12450 0.230798 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 12516 0.213893 Grad Norm 2.024386 5.85s/it
Train loss 12517 0.147241 Grad Norm 0.332464 5.63s/it
Train loss 12518 0.161958 Grad Norm 1.387955 4.66s/it
Train loss 12519 0.149614 Grad Norm 1.693884 5.81s/it
Train loss 12520 0.179212 Grad Norm 1.802432 2.50s/it
Train loss 12521 0.105664 Grad Norm 0.367265 5.69s/it
Train loss 12522 0.220855 Grad Norm 2.480155 3.69s/it
Train loss 12523 0.107444 Grad Norm 1.462763 6.54s/it
Train loss 12524 0.174521 Grad Norm 1.540949 3.48s/it
Train loss 12525 0.174306 Grad Norm 0.585877 2.53s/it
Train loss 12526 0.140173 Grad Norm 0.721014 4.23s/it
Train loss 12527 0.155233 Grad Norm 0.878825 4.96s/it
Train loss 12528 0.225734 Grad Norm 1.129055 3.31s/it
Train loss 12529 0.159244 Grad Norm 0.481195 4.09s/it
Train loss 12530 0.113853 Grad Norm 0.752504 8.25s/it
Train loss 12531 0.215893 Grad Norm 1.011842 2.99s/it
Train loss 12532 0.214905 Grad Norm 0.686658 2.00s/it
Train loss 12533 0.240725 Grad Norm 0.599821 2.75s/it
Train loss 12534 0.194485 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 12604 0.128240 Grad Norm 0.776819 5.51s/it
Train loss 12605 0.194710 Grad Norm 0.474093 2.04s/it
Train loss 12606 0.158205 Grad Norm 0.655239 5.39s/it
Train loss 12607 0.191093 Grad Norm 0.579886 3.64s/it
Train loss 12608 0.192357 Grad Norm 0.460766 3.21s/it
Train loss 12609 0.202581 Grad Norm 0.568276 3.50s/it
Train loss 12610 0.205349 Grad Norm 0.495574 2.07s/it
Train loss 12611 0.198296 Grad Norm 0.760048 2.69s/it
Train loss 12612 0.153799 Grad Norm 0.444306 6.38s/it
Train loss 12613 0.101164 Grad Norm 0.490674 5.39s/it
Train loss 12614 0.137758 Grad Norm 0.449450 5.22s/it
Train loss 12615 0.143951 Grad Norm 0.442016 3.66s/it
Train loss 12616 0.131093 Grad Norm 3.562028 4.41s/it
Train loss 12617 0.167432 Grad Norm 0.648421 4.84s/it
Train loss 12618 0.170961 Grad Norm 0.642208 3.73s/it
Train loss 12619 0.237286 Grad Norm 0.613193 2.57s/it
Train loss 12620 0.128468 Grad Norm 0.427192 4.80s/it
Train loss 12621 0.169083 Grad Norm 0.520401 3.44s/it
Train loss 12622 0.181087 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 12688 0.143940 Grad Norm 0.628440 8.21s/it
Train loss 12689 0.233169 Grad Norm 2.049978 4.47s/it
Train loss 12690 0.308964 Grad Norm 1.303089 3.13s/it
Train loss 12691 0.193658 Grad Norm 2.772919 4.99s/it
Train loss 12692 0.172181 Grad Norm 1.004805 3.46s/it
Train loss 12693 0.218062 Grad Norm 0.725454 3.34s/it
Train loss 12694 0.248731 Grad Norm 1.273235 3.40s/it
Train loss 12695 0.206100 Grad Norm 24.564650 3.97s/it
Train loss 12696 0.368726 Grad Norm 2.550461 2.52s/it
Train loss 12697 0.255029 Grad Norm 2.844433 4.02s/it
Train loss 12698 0.158686 Grad Norm 0.567349 4.27s/it
Train loss 12699 0.194283 Grad Norm 1.203030 3.88s/it
Train loss 12700 0.353170 Grad Norm 5.053144 4.21s/it
Train loss 12701 0.166269 Grad Norm 0.894363 5.62s/it
Train loss 12702 0.124427 Grad Norm 0.801607 5.68s/it
Train loss 12703 0.328823 Grad Norm 2.537791 3.11s/it
Train loss 12704 0.363066 Grad Norm 1.989526 2.55s/it
Train loss 12705 0.202919 Grad Norm 1.400746 3.19s/it
Train loss 12706 0.138618 G

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 12772 0.199912 Grad Norm 0.846429 2.27s/it
Train loss 12773 0.225005 Grad Norm 1.818236 5.25s/it
Train loss 12774 0.144356 Grad Norm 1.142701 6.43s/it
Train loss 12775 0.237862 Grad Norm 1.498930 3.48s/it
Train loss 12776 0.206588 Grad Norm 1.685987 2.40s/it
Train loss 12777 0.251989 Grad Norm 1.470793 3.61s/it
Train loss 12778 0.120299 Grad Norm 0.363709 4.16s/it
Train loss 12779 0.173473 Grad Norm 0.495829 3.45s/it
Train loss 12780 0.207283 Grad Norm 1.418777 2.36s/it
Train loss 12781 0.268692 Grad Norm 1.474789 2.33s/it
Train loss 12782 0.241085 Grad Norm 0.873715 3.21s/it
Train loss 12783 0.142809 Grad Norm 0.428673 5.02s/it
Train loss 12784 0.157159 Grad Norm 2.474247 5.93s/it
Train loss 12785 0.221588 Grad Norm 1.682286 3.01s/it
Train loss 12786 0.205911 Grad Norm 1.280649 5.63s/it
Train loss 12787 0.144430 Grad Norm 0.751837 7.31s/it
Train loss 12788 0.240051 Grad Norm 2.302508 2.87s/it
Train loss 12789 0.159562 Grad Norm 1.273947 4.07s/it
Train loss 12790 0.226900 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 12863 0.243650 Grad Norm 0.987627 2.66s/it
Train loss 12864 0.153767 Grad Norm 1.142849 4.82s/it
Train loss 12865 0.198225 Grad Norm 2.579483 3.55s/it
Train loss 12866 0.172748 Grad Norm 1.549593 2.27s/it
Train loss 12867 0.196339 Grad Norm 1.341144 5.07s/it
Train loss 12868 0.273559 Grad Norm 2.362142 3.43s/it
Train loss 12869 0.225912 Grad Norm 2.151092 2.82s/it
Train loss 12870 0.247855 Grad Norm 3.093641 2.67s/it
Train loss 12871 0.197374 Grad Norm 0.724276 5.69s/it
Train loss 12872 0.231913 Grad Norm 1.903860 3.38s/it
Train loss 12873 0.114798 Grad Norm 1.935797 5.02s/it
Train loss 12874 0.199984 Grad Norm 2.053948 4.76s/it
Train loss 12875 0.179677 Grad Norm 0.576395 5.15s/it
Train loss 12876 0.238096 Grad Norm 1.534287 5.59s/it
Train loss 12877 0.108516 Grad Norm 0.914883 6.51s/it
Train loss 12878 0.211656 Grad Norm 0.727116 1.53s/it
Train loss 12879 0.105508 Grad Norm 0.254925 4.04s/it
Train loss 12880 0.188914 Grad Norm 0.600820 3.60s/it
Train loss 12881 0.251471 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 12936 0.220224 Grad Norm 1.228964 3.08s/it
Train loss 12937 0.253062 Grad Norm 0.589057 3.35s/it
Train loss 12938 0.160621 Grad Norm 1.909388 4.13s/it
Train loss 12939 0.177313 Grad Norm 1.730972 5.36s/it
Train loss 12940 0.212321 Grad Norm 0.494399 3.51s/it
Train loss 12941 0.191362 Grad Norm 0.797801 2.56s/it
Train loss 12942 0.131286 Grad Norm 1.624930 6.36s/it
Train loss 12943 0.186447 Grad Norm 1.718769 3.27s/it
Train loss 12944 0.272046 Grad Norm 0.816947 3.37s/it
Train loss 12945 0.188557 Grad Norm 1.036526 4.44s/it
Train loss 12946 0.190572 Grad Norm 1.767802 5.69s/it
Train loss 12947 0.200089 Grad Norm 1.886549 3.99s/it
Train loss 12948 0.190936 Grad Norm 1.041406 5.22s/it
Train loss 12949 0.168957 Grad Norm 0.531714 6.52s/it
Train loss 12950 0.140300 Grad Norm 0.590723 5.42s/it
Train loss 12951 0.118807 Grad Norm 0.331832 5.71s/it
Train loss 12952 0.126300 Grad Norm 0.386732 4.49s/it
Train loss 12953 0.148013 Grad Norm 0.907785 2.45s/it
Train loss 12954 0.111962 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 13026 0.216834 Grad Norm 1.786885 3.10s/it
Train loss 13027 0.225665 Grad Norm 0.990739 2.63s/it
Train loss 13028 0.191920 Grad Norm 1.187173 6.42s/it
Train loss 13029 0.126927 Grad Norm 1.037318 3.86s/it
Train loss 13030 0.120082 Grad Norm 1.593845 5.18s/it
Train loss 13031 0.191727 Grad Norm 0.659739 4.46s/it
Train loss 13032 0.151404 Grad Norm 0.726650 6.48s/it
Train loss 13033 0.192794 Grad Norm 0.954440 3.26s/it
Train loss 13034 0.182714 Grad Norm 0.936016 3.24s/it
Train loss 13035 0.224573 Grad Norm 0.785774 2.32s/it
Train loss 13036 0.132979 Grad Norm 1.592122 5.28s/it
Train loss 13037 0.123226 Grad Norm 1.424367 4.75s/it
Train loss 13038 0.238039 Grad Norm 2.235059 2.50s/it
Train loss 13039 0.162723 Grad Norm 0.661983 2.43s/it
Train loss 13040 0.239361 Grad Norm 1.801665 1.95s/it
Train loss 13041 0.275318 Grad Norm 4.024354 1.74s/it
Train loss 13042 0.255334 Grad Norm 2.271764 1.79s/it
Train loss 13043 0.154906 Grad Norm 1.217225 3.90s/it
Train loss 13044 0.171974 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 13106 0.199268 Grad Norm 0.777690 2.54s/it
Train loss 13107 0.133324 Grad Norm 0.248482 3.99s/it
Train loss 13108 0.153967 Grad Norm 0.881995 5.15s/it
Train loss 13109 0.155322 Grad Norm 0.785781 4.01s/it
Train loss 13110 0.198844 Grad Norm 0.921324 3.47s/it
Train loss 13111 0.182173 Grad Norm 0.456691 2.39s/it
Train loss 13112 0.220954 Grad Norm 0.511884 3.89s/it
Train loss 13113 0.135197 Grad Norm 0.482585 4.26s/it
Train loss 13114 0.170432 Grad Norm 0.499574 2.98s/it
Train loss 13115 0.198954 Grad Norm 0.499091 2.75s/it
Train loss 13116 0.227158 Grad Norm 0.947099 1.77s/it
Train loss 13117 0.140472 Grad Norm 0.255187 5.16s/it
Train loss 13118 0.249915 Grad Norm 1.303032 4.30s/it
Train loss 13119 0.131279 Grad Norm 0.461211 4.33s/it
Train loss 13120 0.140376 Grad Norm 0.364452 4.11s/it
Train loss 13121 0.147527 Grad Norm 0.815945 2.04s/it
Train loss 13122 0.204070 Grad Norm 0.554442 2.00s/it
Train loss 13123 0.202610 Grad Norm 0.721136 1.23s/it
Train loss 13124 0.152838 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 13200 0.242372 Grad Norm 3.186639 1.39s/it
Train loss 13201 0.243734 Grad Norm 2.852025 5.19s/it
Train loss 13202 0.132324 Grad Norm 0.310921 5.62s/it
Train loss 13203 0.190903 Grad Norm 1.319547 5.10s/it
Train loss 13204 0.243904 Grad Norm 2.257139 3.40s/it
Train loss 13205 0.259502 Grad Norm 0.915716 3.63s/it
Train loss 13206 0.201584 Grad Norm 1.404664 3.56s/it
Train loss 13207 0.142741 Grad Norm 1.646908 3.21s/it
Train loss 13208 0.254852 Grad Norm 1.437623 2.81s/it
Train loss 13209 0.181639 Grad Norm 0.520890 3.06s/it
Train loss 13210 0.175918 Grad Norm 1.745467 5.11s/it
Train loss 13211 0.154130 Grad Norm 1.678392 2.98s/it
Train loss 13212 0.173880 Grad Norm 0.864130 7.28s/it
Train loss 13213 0.153644 Grad Norm 0.687727 3.95s/it
Train loss 13214 0.168252 Grad Norm 1.440920 4.16s/it
Train loss 13215 0.164321 Grad Norm 2.426697 3.70s/it
Train loss 13216 0.215512 Grad Norm 0.565626 5.00s/it
Train loss 13217 0.180204 Grad Norm 1.394897 4.65s/it
Train loss 13218 0.150865 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 13279 0.166294 Grad Norm 0.837558 3.33s/it
Train loss 13280 0.149294 Grad Norm 0.526238 4.22s/it
Train loss 13281 0.160707 Grad Norm 0.930737 3.09s/it
Train loss 13282 0.136092 Grad Norm 0.736658 3.73s/it
Train loss 13283 0.245284 Grad Norm 1.373523 2.03s/it
Train loss 13284 0.168668 Grad Norm 0.584211 4.06s/it
Train loss 13285 0.150394 Grad Norm 1.061553 7.44s/it
Train loss 13286 0.244520 Grad Norm 1.654162 3.96s/it
Train loss 13287 0.111453 Grad Norm 0.490265 7.24s/it
Train loss 13288 0.161054 Grad Norm 0.770146 2.29s/it
Train loss 13289 0.179420 Grad Norm 0.781903 2.81s/it
Train loss 13290 0.197756 Grad Norm 0.511254 5.32s/it
Train loss 13291 0.170037 Grad Norm 0.302456 6.42s/it
Train loss 13292 0.173061 Grad Norm 0.434708 3.98s/it
Train loss 13293 0.208025 Grad Norm 1.554182 2.62s/it
Train loss 13294 0.187497 Grad Norm 0.516288 3.68s/it
Train loss 13295 0.188136 Grad Norm 0.591293 4.48s/it
Train loss 13296 0.226910 Grad Norm 1.218557 2.00s/it
Train loss 13297 0.122168 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 13372 0.116132 Grad Norm 0.500626 4.05s/it
Train loss 13373 0.230664 Grad Norm 0.893262 1.51s/it
Train loss 13374 0.207213 Grad Norm 1.137953 4.10s/it
Train loss 13375 0.135810 Grad Norm 0.425576 7.77s/it
Train loss 13376 0.170877 Grad Norm 1.465662 4.10s/it
Train loss 13377 0.185015 Grad Norm 1.246479 4.94s/it
Train loss 13378 0.168268 Grad Norm 0.579127 5.73s/it
Train loss 13379 0.167809 Grad Norm 0.369848 5.06s/it
Train loss 13380 0.135689 Grad Norm 0.329256 3.89s/it
Train loss 13381 0.189718 Grad Norm 0.817200 2.43s/it
Train loss 13382 0.176508 Grad Norm 0.967729 6.33s/it
Train loss 13383 0.228830 Grad Norm 1.416400 2.53s/it
Train loss 13384 0.180739 Grad Norm 1.061296 3.20s/it
Train loss 13385 0.185737 Grad Norm 0.636749 4.70s/it
Train loss 13386 0.161885 Grad Norm 0.570329 3.94s/it
Train loss 13387 0.153121 Grad Norm 0.666332 2.23s/it
Train loss 13388 0.185104 Grad Norm 0.683465 7.35s/it
Train loss 13389 0.207494 Grad Norm 0.739678 2.92s/it
Train loss 13390 0.169820 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 13443 0.167583 Grad Norm 0.491754 2.51s/it
Train loss 13444 0.208341 Grad Norm 0.362446 3.64s/it
Train loss 13445 0.131882 Grad Norm 0.372876 5.62s/it
Train loss 13446 0.177687 Grad Norm 1.345533 4.82s/it
Train loss 13447 0.177422 Grad Norm 0.403195 4.27s/it
Train loss 13448 0.186724 Grad Norm 0.807868 4.19s/it
Train loss 13449 0.191187 Grad Norm 0.679744 2.98s/it
Train loss 13450 0.144455 Grad Norm 0.388379 3.65s/it
Train loss 13451 0.232274 Grad Norm 0.639768 2.13s/it
Train loss 13452 0.130188 Grad Norm 0.512228 5.63s/it
Train loss 13453 0.143363 Grad Norm 0.479265 2.93s/it
Train loss 13454 0.173727 Grad Norm 0.935269 2.11s/it
Train loss 13455 0.164058 Grad Norm 0.355510 3.32s/it
Train loss 13456 0.169864 Grad Norm 1.853990 2.45s/it
Train loss 13457 0.153536 Grad Norm 1.212398 6.43s/it
Train loss 13458 0.162354 Grad Norm 0.824362 4.69s/it
Train loss 13459 0.191744 Grad Norm 0.929999 2.41s/it
Train loss 13460 0.142478 Grad Norm 0.389015 4.23s/it
Train loss 13461 0.165860 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 13525 0.212080 Grad Norm 0.613257 2.17s/it
Train loss 13526 0.198617 Grad Norm 0.890861 2.82s/it
Train loss 13527 0.196554 Grad Norm 0.646462 3.34s/it
Train loss 13528 0.165779 Grad Norm 0.733703 5.43s/it
Train loss 13529 0.166156 Grad Norm 1.426949 6.52s/it
Train loss 13530 0.157643 Grad Norm 0.701813 5.26s/it
Train loss 13531 0.237971 Grad Norm 2.340177 2.80s/it
Train loss 13532 0.117303 Grad Norm 1.222302 4.89s/it
Train loss 13533 0.163193 Grad Norm 0.872115 3.29s/it
Train loss 13534 0.173965 Grad Norm 2.491811 3.61s/it
Train loss 13535 0.138212 Grad Norm 1.435232 5.02s/it
Train loss 13536 0.144243 Grad Norm 1.261301 5.29s/it
Train loss 13537 0.174616 Grad Norm 0.977905 5.56s/it
Train loss 13538 0.148273 Grad Norm 1.246803 3.63s/it
Train loss 13539 0.216233 Grad Norm 1.057626 2.27s/it
Train loss 13540 0.204175 Grad Norm 1.170783 2.11s/it
Train loss 13541 0.163962 Grad Norm 2.074460 2.48s/it
Train loss 13542 0.189465 Grad Norm 1.501583 1.85s/it
Train loss 13543 0.159449 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 13611 0.155289 Grad Norm 1.057775 5.72s/it
Train loss 13612 0.199041 Grad Norm 1.439968 2.33s/it
Train loss 13613 0.140178 Grad Norm 0.956477 5.85s/it
Train loss 13614 0.183699 Grad Norm 1.601279 4.17s/it
Train loss 13615 0.189057 Grad Norm 1.924405 4.05s/it
Train loss 13616 0.145958 Grad Norm 1.210659 8.23s/it
Train loss 13617 0.200996 Grad Norm 1.517072 3.29s/it
Train loss 13618 0.165905 Grad Norm 1.258584 3.21s/it
Train loss 13619 0.183224 Grad Norm 1.864771 5.13s/it
Train loss 13620 0.128091 Grad Norm 1.134613 2.72s/it
Train loss 13621 0.166891 Grad Norm 0.437821 3.94s/it
Train loss 13622 0.120841 Grad Norm 0.818892 3.58s/it
Train loss 13623 0.184886 Grad Norm 1.467274 5.61s/it
Train loss 13624 0.161566 Grad Norm 0.616622 4.86s/it
Train loss 13625 0.211054 Grad Norm 1.271634 2.34s/it
Train loss 13626 0.093384 Grad Norm 0.507951 7.67s/it
Train loss 13627 0.166515 Grad Norm 0.805906 5.13s/it
Train loss 13628 0.190661 Grad Norm 0.959949 3.77s/it
Train loss 13629 0.168619 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 13706 0.190338 Grad Norm 0.770619 5.73s/it
Train loss 13707 0.170764 Grad Norm 0.301993 2.33s/it
Train loss 13708 0.132137 Grad Norm 0.471894 7.27s/it
Train loss 13709 0.176983 Grad Norm 0.541144 2.86s/it
Train loss 13710 0.185243 Grad Norm 1.107111 1.74s/it
Train loss 13711 0.197299 Grad Norm 0.476949 3.62s/it
Train loss 13712 0.178055 Grad Norm 0.356068 5.63s/it
Train loss 13713 0.108676 Grad Norm 0.570845 5.35s/it
Train loss 13714 0.154707 Grad Norm 0.408452 3.45s/it
Train loss 13715 0.183960 Grad Norm 0.733230 3.82s/it
Train loss 13716 0.204490 Grad Norm 0.486328 4.14s/it
Train loss 13717 0.182338 Grad Norm 0.435364 3.95s/it
Train loss 13718 0.131814 Grad Norm 0.859116 3.28s/it
Train loss 13719 0.173539 Grad Norm 0.854604 3.42s/it
Train loss 13720 0.231244 Grad Norm 0.608550 2.73s/it
Train loss 13721 0.215159 Grad Norm 0.547555 1.84s/it
Train loss 13722 0.119309 Grad Norm 0.571340 4.55s/it
Train loss 13723 0.158621 Grad Norm 0.672503 4.53s/it
Train loss 13724 0.184531 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 13777 0.124616 Grad Norm 0.236059 4.62s/it
Train loss 13778 0.166216 Grad Norm 1.546026 3.13s/it
Train loss 13779 0.159565 Grad Norm 0.695123 3.30s/it
Train loss 13780 0.150638 Grad Norm 1.138578 3.29s/it
Train loss 13781 0.145596 Grad Norm 1.127779 4.98s/it
Train loss 13782 0.173038 Grad Norm 0.718100 6.46s/it
Train loss 13783 0.182413 Grad Norm 0.720037 3.08s/it
Train loss 13784 0.102981 Grad Norm 0.529350 4.00s/it
Train loss 13785 0.149785 Grad Norm 0.533706 2.67s/it
Train loss 13786 0.117040 Grad Norm 0.776286 4.40s/it
Train loss 13787 0.240487 Grad Norm 0.328510 3.39s/it
Train loss 13788 0.203318 Grad Norm 1.452568 3.24s/it
Train loss 13789 0.147536 Grad Norm 1.135842 2.85s/it
Train loss 13790 0.128600 Grad Norm 0.444255 8.14s/it
Train loss 13791 0.134100 Grad Norm 0.325599 2.78s/it
Train loss 13792 0.101867 Grad Norm 0.736140 7.24s/it
Train loss 13793 0.092798 Grad Norm 0.442208 5.19s/it
Train loss 13794 0.145366 Grad Norm 2.254377 3.31s/it
Train loss 13795 0.161212 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 13883 0.124771 Grad Norm 0.338853 5.34s/it
Train loss 13884 0.191539 Grad Norm 0.613461 4.94s/it
Train loss 13885 0.135287 Grad Norm 0.365367 4.82s/it
Train loss 13886 0.206784 Grad Norm 0.758213 5.42s/it
Train loss 13887 0.195422 Grad Norm 0.509718 3.59s/it
Train loss 13888 0.118505 Grad Norm 0.618548 2.25s/it
Train loss 13889 0.162951 Grad Norm 0.912479 2.84s/it
Train loss 13890 0.102810 Grad Norm 0.608159 7.29s/it
Train loss 13891 0.164601 Grad Norm 0.540750 2.71s/it
Train loss 13892 0.122549 Grad Norm 0.961952 2.21s/it
Train loss 13893 0.147533 Grad Norm 0.927714 3.93s/it
Train loss 13894 0.237767 Grad Norm 0.578281 2.30s/it
Train loss 13895 0.099930 Grad Norm 0.646948 6.41s/it
Train loss 13896 0.203472 Grad Norm 1.825409 2.02s/it
Train loss 13897 0.185639 Grad Norm 2.237092 3.20s/it
Train loss 13898 0.222635 Grad Norm 0.994171 3.80s/it
Train loss 13899 0.173838 Grad Norm 0.925912 5.15s/it
Train loss 13900 0.106292 Grad Norm 0.566109 5.37s/it
Train loss 13901 0.131135 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 13944 0.174190 Grad Norm 0.705371 4.29s/it
Train loss 13945 0.145263 Grad Norm 0.734128 2.81s/it
Train loss 13946 0.151785 Grad Norm 0.525151 8.20s/it
Train loss 13947 0.250994 Grad Norm 1.164854 2.08s/it
Train loss 13948 0.132729 Grad Norm 0.633668 4.32s/it
Train loss 13949 0.196074 Grad Norm 0.959968 5.50s/it
Train loss 13950 0.214432 Grad Norm 1.501120 2.72s/it
Train loss 13951 0.208279 Grad Norm 1.210320 1.63s/it
Train loss 13952 0.213716 Grad Norm 1.612690 2.22s/it
Train loss 13953 0.171541 Grad Norm 0.481164 5.27s/it
Train loss 13954 0.183880 Grad Norm 0.554255 7.36s/it
Train loss 13955 0.122707 Grad Norm 0.415631 3.87s/it
Train loss 13956 0.178209 Grad Norm 1.012006 3.97s/it
Train loss 13957 0.266077 Grad Norm 0.845585 3.51s/it
Train loss 13958 0.135476 Grad Norm 0.611162 3.27s/it
Train loss 13959 0.225020 Grad Norm 1.790844 3.59s/it
Train loss 13960 0.149022 Grad Norm 0.674122 3.07s/it
Train loss 13961 0.112466 Grad Norm 0.380128 5.78s/it
Train loss 13962 0.129019 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 14031 0.115070 Grad Norm 0.388095 3.92s/it
Train loss 14032 0.118331 Grad Norm 0.610352 5.64s/it
Train loss 14033 0.169925 Grad Norm 0.881525 4.39s/it
Train loss 14034 0.173787 Grad Norm 0.623670 2.05s/it
Train loss 14035 0.170874 Grad Norm 1.044225 4.03s/it
Train loss 14036 0.137905 Grad Norm 1.148001 4.87s/it
Train loss 14037 0.110231 Grad Norm 0.409443 5.59s/it
Train loss 14038 0.173723 Grad Norm 0.947614 1.77s/it
Train loss 14039 0.215591 Grad Norm 2.724552 5.37s/it
Train loss 14040 0.102411 Grad Norm 0.483087 3.53s/it
Train loss 14041 0.097359 Grad Norm 0.361770 5.14s/it
Train loss 14042 0.139586 Grad Norm 1.284254 2.26s/it
Train loss 14043 0.162593 Grad Norm 1.441666 2.16s/it
Train loss 14044 0.166059 Grad Norm 0.444685 2.79s/it
Train loss 14045 0.141302 Grad Norm 1.028894 6.40s/it
Train loss 14046 0.157463 Grad Norm 0.892822 2.90s/it
Train loss 14047 0.140415 Grad Norm 0.703376 4.15s/it
Train loss 14048 0.158860 Grad Norm 0.541443 4.07s/it
Train loss 14049 0.130026 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 14112 0.142570 Grad Norm 1.098078 2.53s/it
Train loss 14113 0.191282 Grad Norm 0.398232 3.07s/it
Train loss 14114 0.191171 Grad Norm 1.253399 5.43s/it
Train loss 14115 0.183927 Grad Norm 1.170349 2.16s/it
Train loss 14116 0.243468 Grad Norm 1.049826 3.21s/it
Train loss 14117 0.112475 Grad Norm 0.374934 7.75s/it
Train loss 14118 0.162299 Grad Norm 0.545275 5.40s/it
Train loss 14119 0.152368 Grad Norm 0.487298 3.48s/it
Train loss 14120 0.117617 Grad Norm 0.509834 5.29s/it
Train loss 14121 0.140270 Grad Norm 0.326401 4.10s/it
Train loss 14122 0.265749 Grad Norm 0.925240 2.05s/it
Train loss 14123 0.184483 Grad Norm 0.425140 5.09s/it
Train loss 14124 0.151983 Grad Norm 0.802061 3.55s/it
Train loss 14125 0.132651 Grad Norm 0.675744 5.40s/it
Train loss 14126 0.162266 Grad Norm 0.277148 3.88s/it
Train loss 14127 0.106337 Grad Norm 0.324137 5.39s/it
Train loss 14128 0.213964 Grad Norm 0.851380 2.37s/it
Train loss 14129 0.140547 Grad Norm 0.238930 4.02s/it
Train loss 14130 0.229827 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 14196 0.117233 Grad Norm 0.555976 4.41s/it
Train loss 14197 0.164806 Grad Norm 1.408267 2.40s/it
Train loss 14198 0.126905 Grad Norm 0.357215 4.86s/it
Train loss 14199 0.145588 Grad Norm 0.374306 3.97s/it
Train loss 14200 0.143500 Grad Norm 0.536152 4.04s/it
Train loss 14201 0.161284 Grad Norm 0.481286 2.76s/it
Train loss 14202 0.131581 Grad Norm 0.341491 6.28s/it
Train loss 14203 0.192287 Grad Norm 0.619551 2.03s/it
Train loss 14204 0.171386 Grad Norm 0.490134 2.04s/it
Train loss 14205 0.099503 Grad Norm 0.253535 5.28s/it
Train loss 14206 0.175570 Grad Norm 1.275046 5.00s/it
Train loss 14207 0.149252 Grad Norm 0.352691 5.63s/it
Train loss 14208 0.189877 Grad Norm 1.143731 3.28s/it
Train loss 14209 0.169443 Grad Norm 0.927836 4.90s/it
Train loss 14210 0.123857 Grad Norm 0.512865 5.44s/it
Train loss 14211 0.154258 Grad Norm 1.580953 3.63s/it
Train loss 14212 0.167900 Grad Norm 0.814797 2.10s/it
Train loss 14213 0.159765 Grad Norm 0.577962 3.79s/it
Train loss 14214 0.160421 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 14280 0.182732 Grad Norm 0.840614 3.05s/it
Train loss 14281 0.187775 Grad Norm 0.552077 1.94s/it
Train loss 14282 0.167058 Grad Norm 0.504774 3.91s/it
Train loss 14283 0.091650 Grad Norm 0.603452 6.55s/it
Train loss 14284 0.155908 Grad Norm 0.780506 4.01s/it
Train loss 14285 0.189644 Grad Norm 1.039376 2.94s/it
Train loss 14286 0.126410 Grad Norm 0.350335 5.44s/it
Train loss 14287 0.117850 Grad Norm 0.473905 3.37s/it
Train loss 14288 0.190554 Grad Norm 0.702985 4.57s/it
Train loss 14289 0.200761 Grad Norm 0.589375 4.95s/it
Train loss 14290 0.170968 Grad Norm 1.540831 5.89s/it
Train loss 14291 0.195137 Grad Norm 1.186260 3.45s/it
Train loss 14292 0.150639 Grad Norm 0.627934 4.48s/it
Train loss 14293 0.134851 Grad Norm 0.819819 4.83s/it
Train loss 14294 0.137254 Grad Norm 0.480723 5.54s/it
Train loss 14295 0.183092 Grad Norm 1.106151 3.32s/it
Train loss 14296 0.129060 Grad Norm 0.797228 7.30s/it
Train loss 14297 0.189607 Grad Norm 0.387201 2.51s/it
Train loss 14298 0.195501 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 14376 0.162696 Grad Norm 0.801326 3.97s/it
Train loss 14377 0.192823 Grad Norm 0.937322 5.45s/it
Train loss 14378 0.191085 Grad Norm 2.036150 5.25s/it
Train loss 14379 0.150516 Grad Norm 1.269886 2.30s/it
Train loss 14380 0.116149 Grad Norm 1.154425 4.92s/it
Train loss 14381 0.157534 Grad Norm 0.721447 3.15s/it
Train loss 14382 0.161858 Grad Norm 1.947153 3.22s/it
Train loss 14383 0.209368 Grad Norm 2.285433 2.40s/it
Train loss 14384 0.148325 Grad Norm 1.460424 7.72s/it
Train loss 14385 0.133122 Grad Norm 0.669772 4.35s/it
Train loss 14386 0.121286 Grad Norm 0.782601 5.37s/it
Train loss 14387 0.162131 Grad Norm 1.407804 4.45s/it
Train loss 14388 0.191540 Grad Norm 1.451496 2.48s/it
Train loss 14389 0.199065 Grad Norm 0.940861 1.37s/it
Train loss 14390 0.187768 Grad Norm 1.093863 4.05s/it
Train loss 14391 0.176345 Grad Norm 2.689946 5.12s/it
Train loss 14392 0.177785 Grad Norm 1.368146 5.57s/it
Train loss 14393 0.185048 Grad Norm 0.707748 4.93s/it
Train loss 14394 0.150547 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 14456 0.204306 Grad Norm 1.246689 1.67s/it
Train loss 14457 0.226249 Grad Norm 1.890377 2.61s/it
Train loss 14458 0.126048 Grad Norm 1.353242 6.44s/it
Train loss 14459 0.106379 Grad Norm 0.256067 5.26s/it
Train loss 14460 0.181481 Grad Norm 0.813593 2.64s/it
Train loss 14461 0.198043 Grad Norm 0.698609 3.16s/it
Train loss 14462 0.157442 Grad Norm 0.629668 3.19s/it
Train loss 14463 0.124304 Grad Norm 0.712835 4.32s/it
Train loss 14464 0.250038 Grad Norm 0.697177 3.44s/it
Train loss 14465 0.173003 Grad Norm 0.705373 2.04s/it
Train loss 14466 0.199550 Grad Norm 1.113087 3.57s/it
Train loss 14467 0.196840 Grad Norm 1.153956 3.31s/it
Train loss 14468 0.147373 Grad Norm 0.493992 4.97s/it
Train loss 14469 0.176575 Grad Norm 0.503617 3.28s/it
Train loss 14470 0.163138 Grad Norm 0.357332 2.81s/it
Train loss 14471 0.159049 Grad Norm 0.621935 3.96s/it
Train loss 14472 0.177840 Grad Norm 1.267370 4.92s/it
Train loss 14473 0.170196 Grad Norm 0.532396 3.85s/it
Train loss 14474 0.127814 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 14537 0.123617 Grad Norm 1.050084 4.99s/it
Train loss 14538 0.139137 Grad Norm 0.378532 4.53s/it
Train loss 14539 0.166327 Grad Norm 0.433657 3.04s/it
Train loss 14540 0.188918 Grad Norm 0.411740 3.56s/it
Train loss 14541 0.204356 Grad Norm 2.109237 2.85s/it
Train loss 14542 0.200162 Grad Norm 1.360249 4.00s/it
Train loss 14543 0.166555 Grad Norm 1.027793 3.62s/it
Train loss 14544 0.179997 Grad Norm 1.072469 3.06s/it
Train loss 14545 0.101567 Grad Norm 0.370856 7.86s/it
Train loss 14546 0.146742 Grad Norm 0.273309 4.43s/it
Train loss 14547 0.170233 Grad Norm 1.427976 2.56s/it
Train loss 14548 0.154017 Grad Norm 0.335107 5.00s/it
Train loss 14549 0.173819 Grad Norm 0.397482 2.40s/it
Train loss 14550 0.128562 Grad Norm 0.382771 4.11s/it
Train loss 14551 0.232274 Grad Norm 1.102637 3.20s/it
Train loss 14552 0.132883 Grad Norm 0.217398 4.37s/it
Train loss 14553 0.153579 Grad Norm 0.588003 2.89s/it
Train loss 14554 0.160611 Grad Norm 0.516119 3.24s/it
Train loss 14555 0.175530 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 14616 0.189813 Grad Norm 1.164550 3.16s/it
Train loss 14617 0.199373 Grad Norm 0.471396 5.17s/it
Train loss 14618 0.190393 Grad Norm 1.554316 3.19s/it
Train loss 14619 0.184038 Grad Norm 1.034061 1.52s/it
Train loss 14620 0.133427 Grad Norm 0.553329 4.14s/it
Train loss 14621 0.179772 Grad Norm 0.600747 2.80s/it
Train loss 14622 0.142647 Grad Norm 1.131654 5.19s/it
Train loss 14623 0.159325 Grad Norm 0.307764 1.99s/it
Train loss 14624 0.217364 Grad Norm 1.012581 4.83s/it
Train loss 14625 0.191347 Grad Norm 1.132647 4.88s/it
Train loss 14626 0.179491 Grad Norm 1.109045 2.33s/it
Train loss 14627 0.162323 Grad Norm 0.807175 5.31s/it
Train loss 14628 0.109506 Grad Norm 0.787886 3.97s/it
Train loss 14629 0.161661 Grad Norm 1.515666 2.94s/it
Train loss 14630 0.210527 Grad Norm 0.851627 2.05s/it
Train loss 14631 0.146216 Grad Norm 0.462984 4.95s/it
Train loss 14632 0.144896 Grad Norm 0.722414 4.60s/it
Train loss 14633 0.119216 Grad Norm 1.065598 4.53s/it
Train loss 14634 0.220149 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 14708 0.154354 Grad Norm 0.682223 5.38s/it
Train loss 14709 0.179092 Grad Norm 0.338521 4.14s/it
Train loss 14710 0.133856 Grad Norm 0.718503 5.33s/it
Train loss 14711 0.164035 Grad Norm 0.450181 4.13s/it
Train loss 14712 0.230370 Grad Norm 0.870950 3.23s/it
Train loss 14713 0.101291 Grad Norm 0.346210 5.48s/it
Train loss 14714 0.142612 Grad Norm 0.457221 3.26s/it
Train loss 14715 0.180643 Grad Norm 0.615623 2.53s/it
Train loss 14716 0.128347 Grad Norm 0.507808 6.51s/it
Train loss 14717 0.140045 Grad Norm 0.275474 5.22s/it
Train loss 14718 0.149692 Grad Norm 0.500471 3.28s/it
Train loss 14719 0.137010 Grad Norm 0.502559 7.22s/it
Train loss 14720 0.193885 Grad Norm 0.800284 2.08s/it
Train loss 14721 0.151448 Grad Norm 0.521353 5.73s/it
Train loss 14722 0.189792 Grad Norm 0.425894 3.40s/it
Train loss 14723 0.228450 Grad Norm 0.606199 2.60s/it
Train loss 14724 0.140130 Grad Norm 0.677745 2.81s/it
Train loss 14725 0.193310 Grad Norm 0.761813 4.13s/it
Train loss 14726 0.252942 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 14784 0.162699 Grad Norm 0.622499 4.53s/it
Train loss 14785 0.123938 Grad Norm 1.135614 3.24s/it
Train loss 14786 0.142592 Grad Norm 0.636977 6.51s/it
Train loss 14787 0.149433 Grad Norm 0.671299 2.62s/it
Train loss 14788 0.174151 Grad Norm 0.368951 5.52s/it
Train loss 14789 0.186622 Grad Norm 1.423786 2.77s/it
Train loss 14790 0.164197 Grad Norm 0.611437 3.56s/it
Train loss 14791 0.226927 Grad Norm 0.562719 2.82s/it
Train loss 14792 0.154690 Grad Norm 0.644538 6.36s/it
Train loss 14793 0.181985 Grad Norm 1.090855 3.28s/it
Train loss 14794 0.173050 Grad Norm 0.785570 4.08s/it
Train loss 14795 0.147959 Grad Norm 0.581178 4.45s/it
Train loss 14796 0.151106 Grad Norm 1.410387 2.57s/it
Train loss 14797 0.131183 Grad Norm 0.499695 4.34s/it
Train loss 14798 0.222675 Grad Norm 0.613144 2.22s/it
Train loss 14799 0.144242 Grad Norm 0.726856 2.84s/it
Train loss 14800 0.189428 Grad Norm 1.311149 2.73s/it
Train loss 14801 0.137442 Grad Norm 0.237606 3.37s/it
Train loss 14802 0.165900 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 14884 0.122970 Grad Norm 0.618432 4.90s/it
Train loss 14885 0.194982 Grad Norm 0.916855 2.89s/it
Train loss 14886 0.176519 Grad Norm 0.968599 3.60s/it
Train loss 14887 0.185460 Grad Norm 0.725219 1.54s/it
Train loss 14888 0.126776 Grad Norm 0.834781 8.03s/it
Train loss 14889 0.187109 Grad Norm 0.635778 2.32s/it
Train loss 14890 0.177494 Grad Norm 0.636605 6.28s/it
Train loss 14891 0.145509 Grad Norm 0.593088 2.93s/it
Train loss 14892 0.220913 Grad Norm 0.833879 2.35s/it
Train loss 14893 0.189075 Grad Norm 0.877369 5.41s/it
Train loss 14894 0.173310 Grad Norm 0.442897 3.34s/it
Train loss 14895 0.113902 Grad Norm 0.633303 3.83s/it
Train loss 14896 0.100330 Grad Norm 0.770099 3.95s/it
Train loss 14897 0.180304 Grad Norm 0.622650 2.87s/it
Train loss 14898 0.147529 Grad Norm 0.463443 4.42s/it
Train loss 14899 0.213492 Grad Norm 0.542328 2.40s/it
Train loss 14900 0.207168 Grad Norm 0.618252 2.58s/it
Train loss 14901 0.244067 Grad Norm 0.859335 1.88s/it
Train loss 14902 0.195324 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 14964 0.115394 Grad Norm 0.257984 4.81s/it
Train loss 14965 0.203498 Grad Norm 0.719269 3.48s/it
Train loss 14966 0.114184 Grad Norm 0.858244 8.22s/it
Train loss 14967 0.128541 Grad Norm 0.298086 3.95s/it
Train loss 14968 0.158649 Grad Norm 0.827642 3.27s/it
Train loss 14969 0.135699 Grad Norm 0.905318 5.13s/it
Train loss 14970 0.210192 Grad Norm 1.712757 2.53s/it
Train loss 14971 0.118633 Grad Norm 1.055006 4.86s/it
Train loss 14972 0.084192 Grad Norm 0.929799 5.09s/it
Train loss 14973 0.173850 Grad Norm 1.461653 2.74s/it
Train loss 14974 0.194543 Grad Norm 1.252299 5.62s/it
Train loss 14975 0.182909 Grad Norm 0.939365 5.44s/it
Train loss 14976 0.224655 Grad Norm 1.501394 5.71s/it
Train loss 14977 0.149126 Grad Norm 0.810426 1.79s/it
Train loss 14978 0.190370 Grad Norm 0.777026 1.93s/it
Train loss 14979 0.174813 Grad Norm 0.951971 6.41s/it
Train loss 14980 0.181798 Grad Norm 1.149502 4.45s/it
Train loss 14981 0.244778 Grad Norm 0.662279 2.56s/it
Train loss 14982 0.127391 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 15036 0.184459 Grad Norm 1.880396 4.94s/it
Train loss 15037 0.196596 Grad Norm 2.354154 3.40s/it
Train loss 15038 0.155354 Grad Norm 0.352269 7.80s/it
Train loss 15039 0.155880 Grad Norm 1.571437 5.62s/it
Train loss 15040 0.197463 Grad Norm 2.229816 2.31s/it
Train loss 15041 0.103948 Grad Norm 0.312337 6.38s/it
Train loss 15042 0.175185 Grad Norm 0.410919 7.23s/it
Train loss 15043 0.143915 Grad Norm 1.029781 4.67s/it
Train loss 15044 0.178683 Grad Norm 1.498731 4.23s/it
Train loss 15045 0.199152 Grad Norm 0.798494 2.60s/it
Train loss 15046 0.162885 Grad Norm 0.732870 5.83s/it
Train loss 15047 0.148355 Grad Norm 0.230650 5.72s/it
Train loss 15048 0.123639 Grad Norm 0.603412 8.23s/it
Train loss 15049 0.212765 Grad Norm 0.604082 2.90s/it
Train loss 15050 0.225534 Grad Norm 0.831661 3.49s/it
Train loss 15051 0.144460 Grad Norm 0.213273 4.03s/it
Train loss 15052 0.114412 Grad Norm 0.840350 5.39s/it
Train loss 15053 0.140776 Grad Norm 1.075500 4.91s/it
Train loss 15054 0.151829 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 15122 0.141269 Grad Norm 1.300087 3.98s/it
Train loss 15123 0.133845 Grad Norm 1.008309 6.39s/it
Train loss 15124 0.145430 Grad Norm 0.317567 5.43s/it
Train loss 15125 0.168514 Grad Norm 1.197056 3.27s/it
Train loss 15126 0.159896 Grad Norm 1.126388 2.97s/it
Train loss 15127 0.164227 Grad Norm 0.495727 5.17s/it
Train loss 15128 0.153853 Grad Norm 0.995337 4.13s/it
Train loss 15129 0.169778 Grad Norm 1.418406 4.12s/it
Train loss 15130 0.152276 Grad Norm 1.531128 4.93s/it
Train loss 15131 0.161001 Grad Norm 0.945195 3.23s/it
Train loss 15132 0.192467 Grad Norm 1.250338 2.97s/it
Train loss 15133 0.236327 Grad Norm 0.625847 2.77s/it
Train loss 15134 0.161857 Grad Norm 0.566622 4.65s/it
Train loss 15135 0.134170 Grad Norm 0.491709 3.47s/it
Train loss 15136 0.158893 Grad Norm 0.667006 5.35s/it
Train loss 15137 0.156202 Grad Norm 0.642672 3.39s/it
Train loss 15138 0.206217 Grad Norm 0.456419 2.25s/it
Train loss 15139 0.112484 Grad Norm 0.295928 6.42s/it
Train loss 15140 0.251836 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 15207 0.170361 Grad Norm 1.406694 2.87s/it
Train loss 15208 0.193155 Grad Norm 0.740656 4.03s/it
Train loss 15209 0.142851 Grad Norm 0.739229 3.63s/it
Train loss 15210 0.162144 Grad Norm 0.424764 4.78s/it
Train loss 15211 0.174892 Grad Norm 1.601304 4.29s/it
Train loss 15212 0.197053 Grad Norm 1.174451 2.23s/it
Train loss 15213 0.118109 Grad Norm 0.551151 4.74s/it
Train loss 15214 0.211057 Grad Norm 1.642436 2.49s/it
Train loss 15215 0.123364 Grad Norm 0.631940 3.76s/it
Train loss 15216 0.139528 Grad Norm 0.248117 4.21s/it
Train loss 15217 0.161244 Grad Norm 1.160203 4.10s/it
Train loss 15218 0.166487 Grad Norm 0.729906 2.92s/it
Train loss 15219 0.144316 Grad Norm 0.370868 5.87s/it
Train loss 15220 0.142407 Grad Norm 0.736360 3.96s/it
Train loss 15221 0.095331 Grad Norm 0.657327 3.41s/it
Train loss 15222 0.110768 Grad Norm 0.584406 6.40s/it
Train loss 15223 0.260886 Grad Norm 1.695666 3.94s/it
Train loss 15224 0.114597 Grad Norm 1.419107 4.06s/it
Train loss 15225 0.220150 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 15295 0.217434 Grad Norm 1.699928 2.46s/it
Train loss 15296 0.189659 Grad Norm 0.655229 2.62s/it
Train loss 15297 0.122703 Grad Norm 0.503580 8.10s/it
Train loss 15298 0.144718 Grad Norm 1.310791 5.70s/it
Train loss 15299 0.134198 Grad Norm 0.841971 6.48s/it
Train loss 15300 0.207268 Grad Norm 0.836835 3.97s/it
Train loss 15301 0.117539 Grad Norm 0.306827 5.20s/it
Train loss 15302 0.169082 Grad Norm 0.997635 5.42s/it
Train loss 15303 0.182671 Grad Norm 0.307188 3.45s/it
Train loss 15304 0.171220 Grad Norm 0.931872 5.42s/it
Train loss 15305 0.156881 Grad Norm 1.215163 3.34s/it
Train loss 15306 0.180156 Grad Norm 0.413667 2.02s/it
Train loss 15307 0.181806 Grad Norm 1.905760 5.25s/it
Train loss 15308 0.109466 Grad Norm 0.773194 7.31s/it
Train loss 15309 0.182971 Grad Norm 0.469312 2.86s/it
Train loss 15310 0.140848 Grad Norm 0.443289 3.82s/it
Train loss 15311 0.180173 Grad Norm 0.418813 2.07s/it
Train loss 15312 0.256277 Grad Norm 1.697190 2.83s/it
Train loss 15313 0.222493 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 15385 0.218804 Grad Norm 0.700346 3.08s/it
Train loss 15386 0.185067 Grad Norm 1.207952 1.75s/it
Train loss 15387 0.143038 Grad Norm 0.862424 3.86s/it
Train loss 15388 0.228798 Grad Norm 0.963328 3.24s/it
Train loss 15389 0.155023 Grad Norm 0.501988 4.54s/it
Train loss 15390 0.124103 Grad Norm 1.757785 4.71s/it
Train loss 15391 0.210487 Grad Norm 2.064455 3.07s/it
Train loss 15392 0.211773 Grad Norm 0.941260 3.20s/it
Train loss 15393 0.161504 Grad Norm 0.615716 3.82s/it
Train loss 15394 0.107870 Grad Norm 0.458725 2.95s/it
Train loss 15395 0.172235 Grad Norm 0.851460 4.06s/it
Train loss 15396 0.241100 Grad Norm 0.982870 2.81s/it
Train loss 15397 0.172094 Grad Norm 1.370191 3.36s/it
Train loss 15398 0.181263 Grad Norm 0.978071 2.32s/it
Train loss 15399 0.169206 Grad Norm 0.836576 3.55s/it
Train loss 15400 0.227988 Grad Norm 1.526717 2.81s/it
Train loss 15401 0.201688 Grad Norm 0.813123 3.56s/it
Train loss 15402 0.122018 Grad Norm 1.194371 3.02s/it
Train loss 15403 0.151285 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 15458 0.169746 Grad Norm 0.827802 4.84s/it
Train loss 15459 0.159574 Grad Norm 0.879880 3.15s/it
Train loss 15460 0.149872 Grad Norm 0.619723 3.91s/it
Train loss 15461 0.152857 Grad Norm 0.455835 4.88s/it
Train loss 15462 0.158895 Grad Norm 0.528638 4.41s/it
Train loss 15463 0.156644 Grad Norm 0.511717 4.06s/it
Train loss 15464 0.184453 Grad Norm 1.224430 3.30s/it
Train loss 15465 0.200711 Grad Norm 0.481325 2.85s/it
Train loss 15466 0.147855 Grad Norm 0.251897 3.95s/it
Train loss 15467 0.177074 Grad Norm 1.199582 4.13s/it
Train loss 15468 0.150205 Grad Norm 0.509008 3.87s/it
Train loss 15469 0.211244 Grad Norm 0.925989 2.31s/it
Train loss 15470 0.133003 Grad Norm 0.357579 6.43s/it
Train loss 15471 0.249305 Grad Norm 1.601893 5.28s/it
Train loss 15472 0.129100 Grad Norm 0.905324 5.12s/it
Train loss 15473 0.181535 Grad Norm 1.120005 4.13s/it
Train loss 15474 0.220605 Grad Norm 0.732556 3.32s/it
Train loss 15475 0.157942 Grad Norm 0.601295 3.59s/it
Train loss 15476 0.225516 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 15540 0.154593 Grad Norm 0.564200 3.84s/it
Train loss 15541 0.155047 Grad Norm 0.868544 3.17s/it
Train loss 15542 0.125875 Grad Norm 0.698072 5.07s/it
Train loss 15543 0.175787 Grad Norm 0.500496 3.88s/it
Train loss 15544 0.150829 Grad Norm 0.481495 2.22s/it
Train loss 15545 0.244330 Grad Norm 0.757751 6.28s/it
Train loss 15546 0.156922 Grad Norm 1.099975 5.14s/it
Train loss 15547 0.158630 Grad Norm 0.888663 5.09s/it
Train loss 15548 0.112204 Grad Norm 0.475471 4.81s/it
Train loss 15549 0.160248 Grad Norm 0.454971 5.38s/it
Train loss 15550 0.180532 Grad Norm 0.530936 3.38s/it
Train loss 15551 0.152016 Grad Norm 0.342080 2.38s/it
Train loss 15552 0.162937 Grad Norm 0.905894 4.13s/it
Train loss 15553 0.169126 Grad Norm 0.946825 2.84s/it
Train loss 15554 0.165456 Grad Norm 0.727263 3.63s/it
Train loss 15555 0.135193 Grad Norm 0.538073 4.67s/it
Train loss 15556 0.207013 Grad Norm 1.677266 2.23s/it
Train loss 15557 0.224436 Grad Norm 0.708704 2.24s/it
Train loss 15558 0.133580 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 15624 0.152861 Grad Norm 0.331223 4.80s/it
Train loss 15625 0.241753 Grad Norm 1.084917 1.83s/it
Train loss 15626 0.192776 Grad Norm 0.806815 2.81s/it
Train loss 15627 0.153828 Grad Norm 1.006044 4.81s/it
Train loss 15628 0.209865 Grad Norm 0.795010 3.57s/it
Train loss 15629 0.246438 Grad Norm 2.746593 3.35s/it
Train loss 15630 0.112405 Grad Norm 0.619583 5.43s/it
Train loss 15631 0.219833 Grad Norm 1.669216 1.84s/it
Train loss 15632 0.186838 Grad Norm 0.831487 5.41s/it
Train loss 15633 0.175250 Grad Norm 0.960984 2.82s/it
Train loss 15634 0.180950 Grad Norm 0.528898 3.57s/it
Train loss 15635 0.195924 Grad Norm 1.557669 2.93s/it
Train loss 15636 0.186895 Grad Norm 0.761930 3.82s/it
Train loss 15637 0.237350 Grad Norm 1.003340 1.99s/it
Train loss 15638 0.155541 Grad Norm 1.171814 3.70s/it
Train loss 15639 0.121328 Grad Norm 1.496781 4.67s/it
Train loss 15640 0.175194 Grad Norm 1.112432 4.11s/it
Train loss 15641 0.222237 Grad Norm 1.396129 5.26s/it
Train loss 15642 0.121895 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 15711 0.157103 Grad Norm 0.467578 5.02s/it
Train loss 15712 0.245794 Grad Norm 1.811858 2.08s/it
Train loss 15713 0.130407 Grad Norm 1.966342 6.49s/it
Train loss 15714 0.188935 Grad Norm 1.890625 3.59s/it
Train loss 15715 0.175389 Grad Norm 0.914185 2.89s/it
Train loss 15716 0.222503 Grad Norm 2.073713 2.86s/it
Train loss 15717 0.129744 Grad Norm 2.137435 4.04s/it
Train loss 15718 0.209213 Grad Norm 2.623890 3.96s/it
Train loss 15719 0.189580 Grad Norm 1.102486 5.02s/it
Train loss 15720 0.167243 Grad Norm 1.335373 3.90s/it
Train loss 15721 0.123883 Grad Norm 1.450310 4.85s/it
Train loss 15722 0.172436 Grad Norm 1.406110 3.78s/it
Train loss 15723 0.206897 Grad Norm 0.830715 3.26s/it
Train loss 15724 0.175509 Grad Norm 1.774925 5.01s/it
Train loss 15725 0.125477 Grad Norm 1.397397 5.34s/it
Train loss 15726 0.198569 Grad Norm 1.491457 4.03s/it
Train loss 15727 0.166967 Grad Norm 0.941758 2.34s/it
Train loss 15728 0.253686 Grad Norm 1.049482 5.59s/it
Train loss 15729 0.159633 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 15796 0.157749 Grad Norm 0.698030 5.08s/it
Train loss 15797 0.162682 Grad Norm 1.057970 5.10s/it
Train loss 15798 0.205601 Grad Norm 0.717064 3.57s/it
Train loss 15799 0.168454 Grad Norm 0.443520 4.99s/it
Train loss 15800 0.165544 Grad Norm 1.229878 4.00s/it
Train loss 15801 0.125944 Grad Norm 0.367642 5.65s/it
Train loss 15802 0.191055 Grad Norm 0.391576 5.34s/it
Train loss 15803 0.127704 Grad Norm 0.295344 3.95s/it
Train loss 15804 0.204174 Grad Norm 1.063143 2.37s/it
Train loss 15805 0.196443 Grad Norm 0.594149 3.11s/it
Train loss 15806 0.211189 Grad Norm 1.247280 1.82s/it
Train loss 15807 0.176122 Grad Norm 0.716421 4.04s/it
Train loss 15808 0.214121 Grad Norm 1.159446 3.26s/it
Train loss 15809 0.251549 Grad Norm 0.855919 2.01s/it
Train loss 15810 0.122067 Grad Norm 0.438452 3.00s/it
Train loss 15811 0.147972 Grad Norm 0.438593 5.66s/it
Train loss 15812 0.184000 Grad Norm 0.945691 5.18s/it
Train loss 15813 0.169309 Grad Norm 0.787414 3.09s/it
Train loss 15814 0.149364 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 15876 0.085959 Grad Norm 0.360328 4.50s/it
Train loss 15877 0.134385 Grad Norm 1.364992 5.43s/it
Train loss 15878 0.148724 Grad Norm 1.583581 7.45s/it
Train loss 15879 0.205252 Grad Norm 0.500595 4.10s/it
Train loss 15880 0.210861 Grad Norm 1.999117 3.24s/it
Train loss 15881 0.176059 Grad Norm 1.992527 3.57s/it
Train loss 15882 0.118505 Grad Norm 0.784116 4.01s/it
Train loss 15883 0.170666 Grad Norm 0.410684 3.60s/it
Train loss 15884 0.127145 Grad Norm 1.072062 4.34s/it
Train loss 15885 0.211467 Grad Norm 1.845751 1.65s/it
Train loss 15886 0.184210 Grad Norm 0.791331 2.74s/it
Train loss 15887 0.203381 Grad Norm 1.295897 3.23s/it
Train loss 15888 0.204832 Grad Norm 1.705624 3.92s/it
Train loss 15889 0.162806 Grad Norm 0.572171 3.19s/it
Train loss 15890 0.212293 Grad Norm 0.767629 1.85s/it
Train loss 15891 0.154393 Grad Norm 1.293981 4.67s/it
Train loss 15892 0.229878 Grad Norm 1.789094 3.87s/it
Train loss 15893 0.210253 Grad Norm 1.124519 3.48s/it
Train loss 15894 0.157048 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 15960 0.146677 Grad Norm 0.684691 5.42s/it
Train loss 15961 0.190916 Grad Norm 0.700179 3.67s/it
Train loss 15962 0.152555 Grad Norm 1.406210 5.76s/it
Train loss 15963 0.170867 Grad Norm 0.349816 5.96s/it
Train loss 15964 0.203443 Grad Norm 0.518106 5.31s/it
Train loss 15965 0.129633 Grad Norm 0.464929 3.33s/it
Train loss 15966 0.125492 Grad Norm 0.313108 4.84s/it
Train loss 15967 0.169413 Grad Norm 0.272117 4.42s/it
Train loss 15968 0.152736 Grad Norm 0.526785 4.65s/it
Train loss 15969 0.159175 Grad Norm 0.310278 4.37s/it
Train loss 15970 0.172491 Grad Norm 0.657541 5.68s/it
Train loss 15971 0.141551 Grad Norm 0.589000 5.11s/it
Train loss 15972 0.157440 Grad Norm 1.248606 2.13s/it
Train loss 15973 0.199585 Grad Norm 1.050781 5.28s/it
Train loss 15974 0.164053 Grad Norm 0.563131 3.99s/it
Train loss 15975 0.152843 Grad Norm 0.674488 4.53s/it
Train loss 15976 0.204000 Grad Norm 1.114463 1.70s/it
Train loss 15977 0.208799 Grad Norm 0.781838 4.55s/it
Train loss 15978 0.111690 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 16049 0.157495 Grad Norm 1.463394 2.46s/it
Train loss 16050 0.238261 Grad Norm 1.910000 5.23s/it
Train loss 16051 0.170485 Grad Norm 0.615820 3.15s/it
Train loss 16052 0.197801 Grad Norm 0.642887 3.47s/it
Train loss 16053 0.150817 Grad Norm 1.077039 5.59s/it
Train loss 16054 0.125088 Grad Norm 0.525812 5.39s/it
Train loss 16055 0.120750 Grad Norm 0.430503 5.16s/it
Train loss 16056 0.224407 Grad Norm 1.214457 2.62s/it
Train loss 16057 0.186566 Grad Norm 0.839812 3.32s/it
Train loss 16058 0.121366 Grad Norm 0.680548 8.20s/it
Train loss 16059 0.224260 Grad Norm 1.538381 2.91s/it
Train loss 16060 0.211987 Grad Norm 1.135013 2.92s/it
Train loss 16061 0.188323 Grad Norm 0.750642 2.16s/it
Train loss 16062 0.151435 Grad Norm 0.460944 3.38s/it
Train loss 16063 0.146284 Grad Norm 0.800468 5.38s/it
Train loss 16064 0.212765 Grad Norm 0.495347 2.18s/it
Train loss 16065 0.128131 Grad Norm 0.951954 5.79s/it
Train loss 16066 0.196270 Grad Norm 0.893580 4.78s/it
Train loss 16067 0.191997 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 16135 0.159088 Grad Norm 0.753783 4.86s/it
Train loss 16136 0.116162 Grad Norm 0.744337 2.36s/it
Train loss 16137 0.179671 Grad Norm 1.638827 5.42s/it
Train loss 16138 0.198483 Grad Norm 1.696941 2.02s/it
Train loss 16139 0.114270 Grad Norm 0.370274 5.66s/it
Train loss 16140 0.145542 Grad Norm 0.649207 4.12s/it
Train loss 16141 0.160591 Grad Norm 0.784765 4.10s/it
Train loss 16142 0.184359 Grad Norm 1.221422 3.53s/it
Train loss 16143 0.159609 Grad Norm 0.522907 4.66s/it
Train loss 16144 0.186531 Grad Norm 1.123621 3.29s/it
Train loss 16145 0.103112 Grad Norm 0.270380 4.20s/it
Train loss 16146 0.219189 Grad Norm 0.876895 2.68s/it
Train loss 16147 0.136718 Grad Norm 0.354944 5.90s/it
Train loss 16148 0.193715 Grad Norm 0.809697 3.32s/it
Train loss 16149 0.152325 Grad Norm 0.483443 3.92s/it
Train loss 16150 0.249458 Grad Norm 0.738337 1.65s/it
Train loss 16151 0.117733 Grad Norm 0.936207 4.97s/it
Train loss 16152 0.163690 Grad Norm 1.034215 3.23s/it
Train loss 16153 0.152330 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 16213 0.122497 Grad Norm 0.734982 5.12s/it
Train loss 16214 0.165916 Grad Norm 0.755360 2.20s/it
Train loss 16215 0.154513 Grad Norm 0.698299 6.39s/it
Train loss 16216 0.162908 Grad Norm 0.780963 3.16s/it
Train loss 16217 0.186087 Grad Norm 0.631323 2.81s/it
Train loss 16218 0.192465 Grad Norm 0.892205 5.04s/it
Train loss 16219 0.118153 Grad Norm 0.535553 3.36s/it
Train loss 16220 0.172122 Grad Norm 0.919452 2.57s/it
Train loss 16221 0.110579 Grad Norm 0.445677 4.39s/it
Train loss 16222 0.183350 Grad Norm 1.028839 4.97s/it
Train loss 16223 0.171830 Grad Norm 1.084154 3.86s/it
Train loss 16224 0.254465 Grad Norm 0.811520 3.03s/it
Train loss 16225 0.202101 Grad Norm 1.214112 2.87s/it
Train loss 16226 0.238588 Grad Norm 1.391518 2.52s/it
Train loss 16227 0.162659 Grad Norm 1.175325 5.55s/it
Train loss 16228 0.184100 Grad Norm 1.512547 8.26s/it
Train loss 16229 0.149181 Grad Norm 2.116518 3.75s/it
Train loss 16230 0.197320 Grad Norm 1.080026 2.70s/it
Train loss 16231 0.169006 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 16302 0.207476 Grad Norm 0.551423 3.36s/it
Train loss 16303 0.144623 Grad Norm 0.878874 2.71s/it
Train loss 16304 0.148246 Grad Norm 0.630711 4.07s/it
Train loss 16305 0.212332 Grad Norm 0.581674 3.91s/it
Train loss 16306 0.171645 Grad Norm 0.669112 4.87s/it
Train loss 16307 0.203204 Grad Norm 0.383498 3.36s/it
Train loss 16308 0.194341 Grad Norm 0.645228 1.92s/it
Train loss 16309 0.189957 Grad Norm 0.968746 3.12s/it
Train loss 16310 0.177580 Grad Norm 0.532369 4.09s/it
Train loss 16311 0.176181 Grad Norm 0.812199 2.95s/it
Train loss 16312 0.175069 Grad Norm 0.679911 2.34s/it
Train loss 16313 0.144519 Grad Norm 0.621750 2.33s/it
Train loss 16314 0.196848 Grad Norm 0.461058 5.31s/it
Train loss 16315 0.166946 Grad Norm 0.901257 5.07s/it
Train loss 16316 0.167307 Grad Norm 0.724802 6.62s/it
Train loss 16317 0.174380 Grad Norm 0.755844 2.82s/it
Train loss 16318 0.182559 Grad Norm 0.590977 4.48s/it
Train loss 16319 0.116911 Grad Norm 0.556606 3.11s/it
Train loss 16320 0.088942 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 16381 0.229352 Grad Norm 1.287995 2.87s/it
Train loss 16382 0.136597 Grad Norm 0.338524 4.14s/it
Train loss 16383 0.176085 Grad Norm 0.863231 3.89s/it
Train loss 16384 0.150221 Grad Norm 0.730830 4.83s/it
Train loss 16385 0.162769 Grad Norm 0.281440 5.25s/it
Train loss 16386 0.199336 Grad Norm 0.923340 2.75s/it
Train loss 16387 0.248047 Grad Norm 1.006550 1.87s/it
Train loss 16388 0.115207 Grad Norm 0.508943 3.57s/it
Train loss 16389 0.177878 Grad Norm 1.227820 3.07s/it
Train loss 16390 0.244985 Grad Norm 1.590343 3.32s/it
Train loss 16391 0.156440 Grad Norm 1.207564 3.31s/it
Train loss 16392 0.204162 Grad Norm 1.532775 2.20s/it
Train loss 16393 0.193294 Grad Norm 1.720876 4.94s/it
Train loss 16394 0.171363 Grad Norm 0.693382 2.96s/it
Train loss 16395 0.144760 Grad Norm 2.152141 4.16s/it
Train loss 16396 0.181740 Grad Norm 2.466370 4.04s/it
Train loss 16397 0.127008 Grad Norm 0.525110 3.79s/it
Train loss 16398 0.149414 Grad Norm 0.836578 3.56s/it
Train loss 16399 0.116366 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 16468 0.159680 Grad Norm 0.737550 1.83s/it
Train loss 16469 0.188758 Grad Norm 0.710264 4.03s/it
Train loss 16470 0.161396 Grad Norm 1.280234 3.60s/it
Train loss 16471 0.181637 Grad Norm 1.117846 3.70s/it
Train loss 16472 0.139662 Grad Norm 0.555689 4.46s/it
Train loss 16473 0.171448 Grad Norm 0.772240 5.01s/it
Train loss 16474 0.139795 Grad Norm 0.228142 5.18s/it
Train loss 16475 0.189982 Grad Norm 1.253857 4.04s/it
Train loss 16476 0.132247 Grad Norm 0.227961 2.85s/it
Train loss 16477 0.187680 Grad Norm 1.253593 4.05s/it
Train loss 16478 0.172402 Grad Norm 1.327751 3.43s/it
Train loss 16479 0.138106 Grad Norm 0.518933 5.22s/it
Train loss 16480 0.094818 Grad Norm 0.426238 5.60s/it
Train loss 16481 0.122186 Grad Norm 0.908200 5.68s/it
Train loss 16482 0.186854 Grad Norm 1.216750 2.66s/it
Train loss 16483 0.169564 Grad Norm 0.711885 4.95s/it
Train loss 16484 0.153665 Grad Norm 1.007522 6.39s/it
Train loss 16485 0.191609 Grad Norm 1.172509 1.73s/it
Train loss 16486 0.162066 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 16563 0.098460 Grad Norm 0.524038 8.36s/it
Train loss 16564 0.214591 Grad Norm 2.744200 3.24s/it
Train loss 16565 0.225764 Grad Norm 1.299353 3.62s/it
Train loss 16566 0.187611 Grad Norm 0.449686 5.65s/it
Train loss 16567 0.178644 Grad Norm 1.131192 2.92s/it
Train loss 16568 0.161976 Grad Norm 1.415969 3.98s/it
Train loss 16569 0.165455 Grad Norm 1.433714 4.67s/it
Train loss 16570 0.150715 Grad Norm 0.593928 3.47s/it
Train loss 16571 0.201411 Grad Norm 2.429273 1.78s/it
Train loss 16572 0.094658 Grad Norm 0.733468 7.23s/it
Train loss 16573 0.171074 Grad Norm 0.600768 2.85s/it
Train loss 16574 0.179747 Grad Norm 0.476306 5.00s/it
Train loss 16575 0.246035 Grad Norm 1.762211 2.30s/it
Train loss 16576 0.135129 Grad Norm 1.098680 2.35s/it
Train loss 16577 0.150916 Grad Norm 0.746656 3.99s/it
Train loss 16578 0.192218 Grad Norm 1.548535 4.24s/it
Train loss 16579 0.111526 Grad Norm 0.519086 6.48s/it
Train loss 16580 0.123961 Grad Norm 0.399363 3.88s/it
Train loss 16581 0.175591 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 16643 0.204962 Grad Norm 1.043459 3.41s/it
Train loss 16644 0.169871 Grad Norm 1.643031 4.47s/it
Train loss 16645 0.152592 Grad Norm 1.404953 4.05s/it
Train loss 16646 0.152437 Grad Norm 0.736703 3.92s/it
Train loss 16647 0.145266 Grad Norm 0.643680 5.14s/it
Train loss 16648 0.163131 Grad Norm 2.922087 4.15s/it
Train loss 16649 0.172477 Grad Norm 1.101967 3.39s/it
Train loss 16650 0.150505 Grad Norm 0.687393 3.93s/it
Train loss 16651 0.138296 Grad Norm 0.991724 8.30s/it
Train loss 16652 0.112034 Grad Norm 1.113966 3.71s/it
Train loss 16653 0.234686 Grad Norm 1.824067 1.72s/it
Train loss 16654 0.153963 Grad Norm 0.841087 2.10s/it
Train loss 16655 0.166546 Grad Norm 0.416868 4.03s/it
Train loss 16656 0.180253 Grad Norm 1.327399 3.66s/it
Train loss 16657 0.196858 Grad Norm 0.995739 4.05s/it
Train loss 16658 0.124749 Grad Norm 0.576996 5.13s/it
Train loss 16659 0.188170 Grad Norm 1.421811 2.57s/it
Train loss 16660 0.145403 Grad Norm 0.690726 2.96s/it
Train loss 16661 0.197372 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 16716 0.155799 Grad Norm 0.804187 5.85s/it
Train loss 16717 0.165029 Grad Norm 1.091564 3.30s/it
Train loss 16718 0.171914 Grad Norm 0.360240 3.71s/it
Train loss 16719 0.149808 Grad Norm 0.531793 5.12s/it
Train loss 16720 0.138622 Grad Norm 1.130193 4.44s/it
Train loss 16721 0.140092 Grad Norm 0.434036 3.61s/it
Train loss 16722 0.134915 Grad Norm 1.421538 5.88s/it
Train loss 16723 0.165524 Grad Norm 1.484739 6.65s/it
Train loss 16724 0.139001 Grad Norm 0.484372 5.69s/it
Train loss 16725 0.212262 Grad Norm 1.242925 2.90s/it
Train loss 16726 0.141325 Grad Norm 0.538322 2.33s/it
Train loss 16727 0.134652 Grad Norm 0.841213 6.48s/it
Train loss 16728 0.179674 Grad Norm 0.764143 5.08s/it
Train loss 16729 0.229901 Grad Norm 0.621114 2.37s/it
Train loss 16730 0.195167 Grad Norm 1.390593 2.44s/it
Train loss 16731 0.175232 Grad Norm 1.046017 3.16s/it
Train loss 16732 0.142365 Grad Norm 0.512660 2.15s/it
Train loss 16733 0.161155 Grad Norm 0.756127 7.72s/it
Train loss 16734 0.165814 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 16801 0.150326 Grad Norm 0.618724 1.58s/it
Train loss 16802 0.170487 Grad Norm 0.431644 3.98s/it
Train loss 16803 0.221582 Grad Norm 0.840094 3.29s/it
Train loss 16804 0.147896 Grad Norm 0.370824 2.77s/it
Train loss 16805 0.185513 Grad Norm 0.297668 5.04s/it
Train loss 16806 0.123300 Grad Norm 0.513173 4.68s/it
Train loss 16807 0.241031 Grad Norm 0.534652 2.46s/it
Train loss 16808 0.150378 Grad Norm 0.380241 4.25s/it
Train loss 16809 0.221123 Grad Norm 0.452675 2.53s/it
Train loss 16810 0.180787 Grad Norm 1.063857 2.29s/it
Train loss 16811 0.176199 Grad Norm 1.558102 3.49s/it
Train loss 16812 0.223833 Grad Norm 0.838966 2.08s/it
Train loss 16813 0.143115 Grad Norm 0.949667 5.40s/it
Train loss 16814 0.122025 Grad Norm 0.360371 5.67s/it
Train loss 16815 0.179077 Grad Norm 1.454896 5.05s/it
Train loss 16816 0.199485 Grad Norm 0.322301 2.43s/it
Train loss 16817 0.219782 Grad Norm 1.425625 4.96s/it
Train loss 16818 0.188342 Grad Norm 0.484893 2.78s/it
Train loss 16819 0.100176 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 16894 0.138035 Grad Norm 0.381834 2.54s/it
Train loss 16895 0.153709 Grad Norm 0.924708 4.10s/it
Train loss 16896 0.097697 Grad Norm 0.350375 4.20s/it
Train loss 16897 0.256309 Grad Norm 0.568587 3.13s/it
Train loss 16898 0.161904 Grad Norm 1.132413 4.87s/it
Train loss 16899 0.161452 Grad Norm 0.522936 3.00s/it
Train loss 16900 0.143851 Grad Norm 0.617194 4.69s/it
Train loss 16901 0.219213 Grad Norm 1.038363 2.72s/it
Train loss 16902 0.195905 Grad Norm 0.456347 1.99s/it
Train loss 16903 0.223833 Grad Norm 1.116197 3.88s/it
Train loss 16904 0.122709 Grad Norm 0.378166 4.01s/it
Train loss 16905 0.131905 Grad Norm 0.601558 4.66s/it
Train loss 16906 0.142639 Grad Norm 0.641772 5.67s/it
Train loss 16907 0.216104 Grad Norm 1.640595 3.68s/it
Train loss 16908 0.164556 Grad Norm 0.336650 2.29s/it
Train loss 16909 0.105384 Grad Norm 0.490725 8.08s/it
Train loss 16910 0.189919 Grad Norm 1.563312 2.33s/it
Train loss 16911 0.228894 Grad Norm 0.730450 2.92s/it
Train loss 16912 0.129372 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 16978 0.120629 Grad Norm 0.310199 5.30s/it
Train loss 16979 0.152155 Grad Norm 0.396805 3.09s/it
Train loss 16980 0.165302 Grad Norm 0.726533 2.70s/it
Train loss 16981 0.182195 Grad Norm 0.668700 3.60s/it
Train loss 16982 0.127588 Grad Norm 0.276192 4.13s/it
Train loss 16983 0.141990 Grad Norm 0.238884 3.36s/it
Train loss 16984 0.114513 Grad Norm 1.012689 3.22s/it
Train loss 16985 0.186549 Grad Norm 1.188641 2.88s/it
Train loss 16986 0.122935 Grad Norm 0.439266 3.21s/it
Train loss 16987 0.151068 Grad Norm 0.985309 5.45s/it
Train loss 16988 0.189228 Grad Norm 1.103202 5.35s/it
Train loss 16989 0.200044 Grad Norm 0.390560 3.35s/it
Train loss 16990 0.128876 Grad Norm 0.643576 5.79s/it
Train loss 16991 0.165674 Grad Norm 1.563115 7.73s/it
Train loss 16992 0.133776 Grad Norm 0.877060 5.09s/it
Train loss 16993 0.194932 Grad Norm 1.668641 4.97s/it
Train loss 16994 0.194311 Grad Norm 1.397708 7.70s/it
Train loss 16995 0.263811 Grad Norm 1.713670 2.18s/it
Train loss 16996 0.185171 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 17060 0.185643 Grad Norm 0.616283 1.60s/it
Train loss 17061 0.127498 Grad Norm 0.717709 3.23s/it
Train loss 17062 0.139270 Grad Norm 0.776284 7.28s/it
Train loss 17063 0.118663 Grad Norm 0.694647 5.39s/it
Train loss 17064 0.138217 Grad Norm 0.675710 5.02s/it
Train loss 17065 0.181921 Grad Norm 1.092657 4.17s/it
Train loss 17066 0.176146 Grad Norm 0.425963 4.60s/it
Train loss 17067 0.156904 Grad Norm 0.846503 3.08s/it
Train loss 17068 0.129495 Grad Norm 0.967837 3.29s/it
Train loss 17069 0.176107 Grad Norm 0.659883 3.46s/it
Train loss 17070 0.199250 Grad Norm 1.223384 2.46s/it
Train loss 17071 0.139255 Grad Norm 0.620596 5.38s/it
Train loss 17072 0.180889 Grad Norm 0.615601 3.04s/it
Train loss 17073 0.164213 Grad Norm 0.507561 6.31s/it
Train loss 17074 0.131499 Grad Norm 1.379045 5.76s/it
Train loss 17075 0.165971 Grad Norm 0.810958 2.93s/it
Train loss 17076 0.153907 Grad Norm 0.352432 3.56s/it
Train loss 17077 0.132439 Grad Norm 0.373020 3.85s/it
Train loss 17078 0.231781 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 17136 0.171798 Grad Norm 0.689405 3.81s/it
Train loss 17137 0.179489 Grad Norm 0.720283 5.23s/it
Train loss 17138 0.141604 Grad Norm 0.598637 3.74s/it
Train loss 17139 0.179466 Grad Norm 0.836994 5.76s/it
Train loss 17140 0.156129 Grad Norm 0.504330 4.84s/it
Train loss 17141 0.223979 Grad Norm 2.021807 3.02s/it
Train loss 17142 0.211950 Grad Norm 1.699129 3.45s/it
Train loss 17143 0.191065 Grad Norm 0.827155 3.64s/it
Train loss 17144 0.143888 Grad Norm 1.499800 3.24s/it
Train loss 17145 0.144059 Grad Norm 1.682573 4.01s/it
Train loss 17146 0.122590 Grad Norm 1.106442 5.24s/it
Train loss 17147 0.101008 Grad Norm 0.337385 6.55s/it
Train loss 17148 0.172161 Grad Norm 1.403837 5.54s/it
Train loss 17149 0.125472 Grad Norm 1.184358 6.58s/it
Train loss 17150 0.170727 Grad Norm 1.214609 3.96s/it
Train loss 17151 0.200856 Grad Norm 0.774425 2.48s/it
Train loss 17152 0.173658 Grad Norm 1.059878 2.88s/it
Train loss 17153 0.155330 Grad Norm 0.307656 3.50s/it
Train loss 17154 0.215800 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 17226 0.195489 Grad Norm 0.713769 3.72s/it
Train loss 17227 0.191328 Grad Norm 1.668579 1.95s/it
Train loss 17228 0.162909 Grad Norm 1.162901 4.98s/it
Train loss 17229 0.131344 Grad Norm 0.861151 5.96s/it
Train loss 17230 0.159604 Grad Norm 0.404254 3.12s/it
Train loss 17231 0.168999 Grad Norm 1.597035 6.63s/it
Train loss 17232 0.199016 Grad Norm 1.330997 3.70s/it
Train loss 17233 0.207426 Grad Norm 0.632090 2.29s/it
Train loss 17234 0.144082 Grad Norm 0.404939 5.19s/it
Train loss 17235 0.148252 Grad Norm 1.578536 4.71s/it
Train loss 17236 0.132036 Grad Norm 0.265968 5.47s/it
Train loss 17237 0.137268 Grad Norm 0.234550 5.02s/it
Train loss 17238 0.159020 Grad Norm 0.205107 7.55s/it
Train loss 17239 0.107897 Grad Norm 0.622899 4.68s/it
Train loss 17240 0.196209 Grad Norm 1.249681 3.26s/it
Train loss 17241 0.137783 Grad Norm 0.435735 3.98s/it
Train loss 17242 0.127643 Grad Norm 0.529918 4.52s/it
Train loss 17243 0.156842 Grad Norm 1.145543 4.28s/it
Train loss 17244 0.196999 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 17321 0.215921 Grad Norm 0.901145 2.48s/it
Train loss 17322 0.188181 Grad Norm 0.384767 4.26s/it
Train loss 17323 0.203567 Grad Norm 0.945851 5.56s/it
Train loss 17324 0.147152 Grad Norm 0.666762 5.33s/it
Train loss 17325 0.168194 Grad Norm 0.379022 2.42s/it
Train loss 17326 0.175500 Grad Norm 0.641785 3.58s/it
Train loss 17327 0.123697 Grad Norm 0.375645 5.76s/it
Train loss 17328 0.159682 Grad Norm 0.693869 4.03s/it
Train loss 17329 0.145022 Grad Norm 0.257001 5.33s/it
Train loss 17330 0.110807 Grad Norm 0.483425 4.12s/it
Train loss 17331 0.166052 Grad Norm 0.381440 4.68s/it
Train loss 17332 0.197688 Grad Norm 0.469739 2.30s/it
Train loss 17333 0.146923 Grad Norm 0.273215 2.02s/it
Train loss 17334 0.198248 Grad Norm 0.547471 3.09s/it
Train loss 17335 0.157246 Grad Norm 0.483367 3.45s/it
Train loss 17336 0.230208 Grad Norm 0.676760 3.96s/it
Train loss 17337 0.121651 Grad Norm 0.293686 7.90s/it
Train loss 17338 0.152686 Grad Norm 0.302883 4.26s/it
Train loss 17339 0.168985 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 17388 0.139097 Grad Norm 0.709423 4.69s/it
Train loss 17389 0.182409 Grad Norm 1.201865 4.26s/it
Train loss 17390 0.158150 Grad Norm 0.595061 3.71s/it
Train loss 17391 0.193099 Grad Norm 1.531122 3.38s/it
Train loss 17392 0.202432 Grad Norm 1.643298 2.51s/it
Train loss 17393 0.210589 Grad Norm 0.631166 2.51s/it
Train loss 17394 0.188524 Grad Norm 2.350867 3.89s/it
Train loss 17395 0.193486 Grad Norm 2.560118 2.30s/it
Train loss 17396 0.119621 Grad Norm 0.480020 5.89s/it
Train loss 17397 0.186119 Grad Norm 1.333317 3.20s/it
Train loss 17398 0.250758 Grad Norm 2.204032 5.67s/it
Train loss 17399 0.185671 Grad Norm 0.989329 4.26s/it
Train loss 17400 0.112978 Grad Norm 0.542274 4.28s/it
Train loss 17401 0.120631 Grad Norm 0.708906 8.50s/it
Train loss 17402 0.109559 Grad Norm 0.903956 3.38s/it
Train loss 17403 0.105228 Grad Norm 0.557778 5.58s/it
Train loss 17404 0.226946 Grad Norm 0.605016 2.96s/it
Train loss 17405 0.153516 Grad Norm 0.774612 4.35s/it
Train loss 17406 0.214799 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 17478 0.149382 Grad Norm 0.635919 3.29s/it
Train loss 17479 0.249763 Grad Norm 1.870907 3.24s/it
Train loss 17480 0.189005 Grad Norm 1.224900 3.52s/it
Train loss 17481 0.147849 Grad Norm 0.917548 8.10s/it
Train loss 17482 0.107216 Grad Norm 1.110052 5.61s/it
Train loss 17483 0.233867 Grad Norm 1.525746 5.49s/it
Train loss 17484 0.234115 Grad Norm 1.054816 2.81s/it
Train loss 17485 0.167974 Grad Norm 1.392488 5.75s/it
Train loss 17486 0.139270 Grad Norm 0.596930 4.37s/it
Train loss 17487 0.201358 Grad Norm 0.751603 2.58s/it
Train loss 17488 0.148022 Grad Norm 0.435014 3.14s/it
Train loss 17489 0.148124 Grad Norm 0.639737 7.47s/it
Train loss 17490 0.114982 Grad Norm 0.470175 5.11s/it
Train loss 17491 0.158538 Grad Norm 1.246780 5.09s/it
Train loss 17492 0.183911 Grad Norm 0.522374 3.66s/it
Train loss 17493 0.157972 Grad Norm 0.694583 5.55s/it
Train loss 17494 0.201843 Grad Norm 0.648340 4.36s/it
Train loss 17495 0.183586 Grad Norm 0.448427 2.93s/it
Train loss 17496 0.229985 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 17556 0.197381 Grad Norm 0.693564 6.23s/it
Train loss 17557 0.149048 Grad Norm 0.563728 3.52s/it
Train loss 17558 0.196486 Grad Norm 0.851998 3.37s/it
Train loss 17559 0.220858 Grad Norm 0.820267 4.22s/it
Train loss 17560 0.128594 Grad Norm 0.486343 5.36s/it
Train loss 17561 0.209444 Grad Norm 1.379322 2.55s/it
Train loss 17562 0.147294 Grad Norm 0.744720 6.61s/it
Train loss 17563 0.187968 Grad Norm 0.673630 3.66s/it
Train loss 17564 0.160896 Grad Norm 0.610757 4.21s/it
Train loss 17565 0.180303 Grad Norm 1.108712 5.50s/it
Train loss 17566 0.199590 Grad Norm 0.475825 4.05s/it
Train loss 17567 0.248419 Grad Norm 1.322793 2.99s/it
Train loss 17568 0.144848 Grad Norm 0.806592 5.92s/it
Train loss 17569 0.131994 Grad Norm 0.522748 5.61s/it
Train loss 17570 0.180244 Grad Norm 1.215884 2.48s/it
Train loss 17571 0.127355 Grad Norm 1.256840 5.79s/it
Train loss 17572 0.164718 Grad Norm 0.749569 4.48s/it
Train loss 17573 0.175887 Grad Norm 1.327984 3.36s/it
Train loss 17574 0.172733 Gr

  cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) #scipy method


Train loss 17653 0.098275 Grad Norm 0.295277 8.47s/it
Train loss 17654 0.163488 Grad Norm 1.006215 5.02s/it
Train loss 17655 0.123726 Grad Norm 0.494050 4.02s/it
Train loss 17656 0.170734 Grad Norm 0.769328 2.38s/it
Train loss 17657 0.193407 Grad Norm 0.636801 2.48s/it
Train loss 17658 0.159717 Grad Norm 0.376752 3.46s/it
Train loss 17659 0.140138 Grad Norm 0.564588 1.96s/it
Train loss 17660 0.135066 Grad Norm 0.735563 5.26s/it
Train loss 17661 0.185358 Grad Norm 1.246633 4.23s/it
Train loss 17662 0.202923 Grad Norm 1.506220 3.82s/it
Train loss 17663 0.129510 Grad Norm 1.239004 3.02s/it
Train loss 17664 0.154738 Grad Norm 0.613430 5.32s/it
Train loss 17665 0.140665 Grad Norm 0.246258 4.04s/it
Train loss 17666 0.131541 Grad Norm 1.445718 5.15s/it
Train loss 17667 0.143255 Grad Norm 2.005965 5.80s/it
Train loss 17668 0.105230 Grad Norm 0.358764 5.91s/it
Train loss 17669 0.120055 Grad Norm 0.260861 3.54s/it
Train loss 17670 0.167674 Grad Norm 0.644183 2.71s/it
Train loss 17671 0.192499 Gr