<a href="https://colab.research.google.com/github/walnashgit/S30-Capstone/blob/main/FineTunePhi2_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# `kagglehub` is also used by the next cell to download the datasets and the
# vision-projector checkpoint, so this cell has to run first.
import kagglehub
# Authenticate with Kaggle before the data sources are requested.
# NOTE(review): presumably this prompts for credentials interactively -
# confirm before running the notebook unattended.
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Download the instruction data, the precomputed image embeddings and the
# pre-trained vision projector.
# NOTE(review): the three returned paths are not referenced again in the
# visible notebook; later cells read from hard-coded '/kaggle/input/...'
# locations. Confirm both agree when running outside Kaggle (e.g. Colab).
walnash_llava_instruct_150k_path = kagglehub.dataset_download('walnash/llava-instruct-150k')
walnash_coco2014embeddingsm2_path = kagglehub.dataset_download('walnash/coco2014embeddingsm2')
walnash_visionprojectorphi2_m2_pytorch_default_1_path = kagglehub.model_download('walnash/visionprojectorphi2-m2/PyTorch/default/1')

print('Data source import complete.')


In [None]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops wandb
!pip install -U bitsandbytes



In [None]:
# Global run configuration read (and partly mutated) by the cells below.
cfg = {
    # Checkpoint of the pre-trained vision projection layer.
    'vision_projector_file': '/kaggle/input/visionprojectorphi2-m2/pytorch/default/1/projectionModel_ckpt_0_m2.pth',
    # Where optimizer/progress checkpoints and the LoRA adapter are written.
    'checkpoint_dir': '/kaggle/working/checkpoint',
    'finetuned_dir': '/kaggle/working/fine_tuned',
    # Mode switches consulted when the model is built.
    'resume': False,
    'inference': False,
    'un_tuned': False,
    'raw_test': False,
    'batch_size': 16,
    # Width of the image embedding (512 was used by an earlier variant).
    'clip_dim': 768,
    # Hidden width of Phi-2.
    'phi_dim': 2560,
    # Image placeholder tokens.
    'image_token': '<|image|>',
    'image_token2': "<image>",
}

In [None]:
import os

WORKING_DIR = '/kaggle/working'

# The configured directories are absolute, so os.path.join returns them
# unchanged; WORKING_DIR only matters if they are ever made relative.
CHECKPOINT_DIR, FINE_TUNED_DIR = (
    os.path.join(WORKING_DIR, cfg[key]) for key in ('checkpoint_dir', 'finetuned_dir')
)

# Create directories (no-op when they already exist)
for target_dir in (CHECKPOINT_DIR, FINE_TUNED_DIR):
    os.makedirs(target_dir, exist_ok=True)

print(f"Checkpoint directory created at: {CHECKPOINT_DIR}")
print(f"Fine-tuned model directory created at: {FINE_TUNED_DIR}")

Checkpoint directory created at: /kaggle/working/checkpoint
Fine-tuned model directory created at: /kaggle/working/fine_tuned


In [None]:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

class ProjectionLayer(nn.Module):
    """Single linear map from the image-embedding width to the Phi-2 hidden width."""

    def __init__(self, clip_dim, phi_dim):
        """Create the projection.

        Args:
            clip_dim: size of the last dimension of the incoming embeddings.
            phi_dim: size of the last dimension of the projected output.
        """
        super().__init__()
        # The attribute name `linear` fixes the state-dict keys
        # ('linear.weight', 'linear.bias'), so it must not be renamed.
        self.linear = nn.Linear(in_features=clip_dim, out_features=phi_dim)

    def forward(self, x):
        """Project `x` along its last dimension and return the result."""
        projected = self.linear(x)
        return projected


# Label value that the loss skips (passed as `ignore_index` to CrossEntropyLoss).
IGNORE_INDEX = -100

# LoRA hyper-parameters.
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

# 4-bit NF4 quantisation with float16 compute, used when loading the base model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16
)

# Adapter configuration: LoRA on the listed projection and MLP modules.
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2"],
)

class PhiWithVision(torch.nn.Module):
    """Phi-2 wrapper that splices projected image embeddings into the token embeddings.

    The global ``cfg`` selects how the language model is loaded:

    * ``un_tuned`` or ``raw_test``: plain ``microsoft/phi-2`` in float32, eval mode.
    * ``inference``: base model plus the LoRA adapter stored in
      ``cfg['finetuned_dir']``, merged into the base weights, eval mode.
    * otherwise (training): quantised base model (half precision on ``mps``)
      wrapped with the LoRA adapters described by ``peft_config``.

    The vision projector is always loaded from ``cfg['vision_projector_file']``
    and frozen.
    """

    def __init__(self, device='cuda', tokenizer=None, tokenizer_length=None):
        """Build the language model and the frozen vision projector.

        Args:
            device: device string; ``'cuda'`` and ``'mps'`` get special handling.
            tokenizer: stored on the instance; not used while building the model.
            tokenizer_length: if given, the token embedding matrix is resized to
                this many entries (needed after adding the image token).
        """
        super().__init__()
        clip_dim = cfg['clip_dim']
        phi_dim = cfg['phi_dim']
        self.device = device
        self.tokenizer = tokenizer
        self.loss_fct = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX, label_smoothing=0.1)

        if cfg['un_tuned'] or cfg['raw_test']:
            # Untouched base model, used for baseline / raw projector tests.
            self.phi_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2",
                                                                  low_cpu_mem_usage=True,
                                                                  return_dict=True,
                                                                  trust_remote_code=True,
                                                                  torch_dtype=torch.float32)
            self.phi_model.eval()
        elif cfg['inference']:
            # Imported here because the notebook's import cell does not bring
            # PeftModel into scope.
            from peft import PeftModel

            base_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2",
                                                              low_cpu_mem_usage=True,
                                                              return_dict=True,
                                                              trust_remote_code=True,
                                                              torch_dtype=torch.float32)

            if tokenizer_length is not None:
                base_model.resize_token_embeddings(tokenizer_length)
            adapted_model = PeftModel.from_pretrained(base_model, cfg['finetuned_dir'])
            # Keep the merged model that merge_and_unload() returns instead of
            # discarding it.
            self.phi_model = adapted_model.merge_and_unload()
            self.phi_model.eval()
        else:
            if device == 'mps':
                self.phi_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2",
                                                                      trust_remote_code=True)
                self.phi_model = self.phi_model.half()
            else:
                self.phi_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2",
                                                                      trust_remote_code=True,
                                                                      quantization_config=bnb_config,
                                                                      low_cpu_mem_usage=True)
            if tokenizer_length is not None:
                self.phi_model.resize_token_embeddings(tokenizer_length)
            self.phi_model.config.use_cache = False
            self.phi_model = prepare_model_for_kbit_training(self.phi_model)
            self.phi_model = get_peft_model(self.phi_model, peft_config)
            self.phi_model.train()

        # prepare_input_embed() needs the embedding table in every mode, not
        # only while training.
        self.phi_embeddings = self.phi_model.get_input_embeddings()

        if cfg['vision_projector_file'] == './checkpoint/vp_ckpt_0.pth':
            # NOTE(review): ProjectionLayer2 is not defined in the visible part
            # of this notebook; this branch raises NameError unless it is
            # defined elsewhere.
            self.projectionModel = ProjectionLayer2(clip_dim, phi_dim)
        else:
            self.projectionModel = ProjectionLayer(clip_dim, phi_dim)

        # On CUDA the checkpoint is loaded with its stored device mapping and the
        # projector is moved to the GPU; everywhere else it is mapped to the CPU.
        on_cuda = device == 'cuda'
        trained_proj_model = torch.load(cfg['vision_projector_file'],
                                        map_location=None if on_cuda else 'cpu')
        self.projectionModel.load_state_dict(trained_proj_model['model_state_dict'])
        if on_cuda:
            self.projectionModel.to(device)

        # freeze vision projection
        self.projectionModel.eval()
        for param in self.projectionModel.parameters():
            param.requires_grad = False

    def forward(self, input_ids=None, attention_mask=None, image_embedding=None, image_token_positions=None, labels=None, **kwargs):
        """Run the model on text tokens plus image embeddings.

        Args:
            input_ids: token ids, one row per sample, each containing the image
                placeholder token. Unused when ``cfg['raw_test']`` is set.
            attention_mask: forwarded unchanged to the language model.
            image_embedding: image features fed to the vision projector.
            image_token_positions: per-sample position of the image placeholder.
            labels: token-level targets aligned with ``input_ids``; may be None.
            **kwargs: accepted for call compatibility and ignored.

        Returns:
            * ``cfg['raw_test']``: the language-model outputs for the projected
              image embeddings alone (computed without gradients).
            * ``labels`` given: the scalar next-token cross-entropy loss.
            * ``labels`` is None: the language-model outputs for the combined
              sequence.
        """
        if cfg['raw_test']:
            inputs_embeds = self.projectionModel(image_embedding)
            if cfg['clip_dim'] == 512:
                # NOTE(review): presumably the 512-wide embeddings have no
                # sequence dimension, hence the unsqueeze - confirm.
                inputs_embeds = inputs_embeds.unsqueeze(1)

            with torch.no_grad():
                outputs = self.phi_model(inputs_embeds=inputs_embeds, attention_mask=attention_mask)
            return outputs

        projected_clip = self.projectionModel(image_embedding).requires_grad_(requires_grad=False)

        combined_input, new_labels = self.prepare_input_embed(input_ids, projected_clip, image_token_positions, labels)
        phi_outputs = self.phi_model(inputs_embeds=combined_input, attention_mask=attention_mask)

        if torch.isinf(phi_outputs.logits).any():
            print("WARNING: inf values in logits!")
        if torch.isnan(phi_outputs.logits).any():
            print("WARNING: nan values in logits!")

        if new_labels is None:
            # Nothing to score against; hand back the raw model outputs.
            return phi_outputs

        # Shift by one so that position t predicts token t + 1.
        logits = phi_outputs.logits
        X = logits[:, :-1, :].contiguous()
        Y = new_labels[:, 1:].contiguous().to(self.device, dtype=torch.long)
        X = X.view(-1, X.size(-1))
        Y = Y.view(-1)

        loss = self.loss_fct(X, Y)
        return loss

    def prepare_input_embed(self, input_ids, projected_clip, image_token_pos, labels):
        """Replace each sample's image placeholder with its projected image embeddings.

        Args:
            input_ids: token ids, one row per sample.
            projected_clip: projected image embeddings; ``projected_clip[i]`` is
                concatenated along dim 0 with the text embeddings of sample ``i``.
            image_token_pos: per-sample tensor holding the placeholder index; the
                first entry is used.
            labels: targets aligned with ``input_ids``, or None.

        Returns:
            ``(embeddings, labels)``: stacked input embeddings, and the labels
            with ``IGNORE_INDEX`` at every inserted image position (None when no
            labels were given).

        Raises:
            ValueError: if a sample has no image-token position.
        """
        new_input_embeds = []
        new_labels = []
        for batch_idx, cur_input_ids in enumerate(input_ids):
            positions = image_token_pos[batch_idx].reshape(-1)
            if positions.numel() == 0:
                raise ValueError(f"No image token position for sample {batch_idx}")
            pos = int(positions[0])

            image_embeds = projected_clip[batch_idx]
            phi_text_embedding = self.phi_embeddings(cur_input_ids)
            # text before the placeholder + image embeddings + text after it
            pieces = [phi_text_embedding[:pos], image_embeds, phi_text_embedding[pos + 1:]]
            pieces = [x.to(self.device) for x in pieces]
            new_input_embeds.append(torch.cat(pieces, dim=0))

            if labels is not None:
                cur_labels = labels[batch_idx]
                image_labels = torch.full((image_embeds.shape[0],), IGNORE_INDEX,
                                          device=labels.device, dtype=labels.dtype)
                new_labels.append(torch.cat([cur_labels[:pos], image_labels, cur_labels[pos + 1:]], dim=0))

        new_input_embeds = torch.stack(new_input_embeds, dim=0)
        # torch.stack([]) raises, so without labels there is nothing to stack.
        new_labels = torch.stack(new_labels, dim=0) if labels is not None else None
        return new_input_embeds, new_labels

In [None]:
import json
import random

import torch
from torch.utils.data import Dataset, Sampler
import h5py



class MultiModalLlavaDataset(Dataset):
    """Single-turn question/answer samples paired with precomputed image embeddings.

    Every human/gpt exchange of an instruction record becomes one sample with the
    text ``'<image>\\nHuman### <question>\\nAI### <answer><|endoftext|>'``. Only
    records whose image id has an entry in the embedding file are used.
    """

    def __init__(self, embedding_file, instruct_file, tokenizer, max_length=128):
        """Open the embedding store and build the list of QnA samples.

        Args:
            embedding_file: path to the HDF5 file keyed by image id.
            instruct_file: path to the instruction JSON file.
            tokenizer: tokenizer that already knows the image placeholder token.
            max_length: sequence length after the image embeddings are inserted.
        """
        # The file handle stays open for the lifetime of the dataset.
        self.embedding_file = h5py.File(embedding_file, 'r')
        self.instruct_file = instruct_file
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.image_ids = set(self.embedding_file.keys())
        self.conversation_data = self.get_conversation_data()  # List of text inputs

        # Special token for image placeholder
        self.image_token = cfg['image_token2']

    def __len__(self):
        return len(self.conversation_data)

    def __getitem__(self, idx):
        """Return the tensors for one QnA sample.

        Returns:
            dict with ``image_embedding``, ``input_ids`` (int32, padded so that
            the sequence is ``max_length`` long once the placeholder is replaced
            by the image embeddings), ``image_token_position`` and ``labels``
            (``IGNORE_INDEX`` on the question prefix and on padding).

        Raises:
            Exception: if the sample text does not contain exactly one
                ``'AI### '`` separator.
        """
        conv_item = self.conversation_data[idx]
        image_id = str(conv_item.get('image_id'))

        image_embedding = torch.tensor(self.embedding_file[image_id][()])
        conv = conv_item.get('qna')
        prompt = self.image_token + '\n' + conv

        parts = prompt.split('AI### ')
        if len(parts) != 2:
            print(prompt)
            raise Exception("Not proper QnA text: " + conv)

        # The placeholder occupies one token but expands to
        # image_embedding.size(0) positions, so the text gets what is left.
        text_budget = self.max_length - (image_embedding.size(0) - 1)

        # Tokenize without padding so the number of real tokens is known; the
        # padding is appended by hand below.
        # NOTE(review): this yields the same ids as padding='max_length' provided
        # the tokenizer pads on the right - confirm its padding_side.
        token_ids = self.tokenizer(prompt, truncation=True, max_length=self.max_length)['input_ids']
        token_ids = list(token_ids)[:text_budget]
        real_len = len(token_ids)
        pad_len = max(text_budget - real_len, 0)

        input_ids = torch.tensor(token_ids + [self.tokenizer.pad_token_id] * pad_len, dtype=torch.int32)

        labels = input_ids.clone()
        # Number of TOKENS in the question prefix. len() of the tokenizer output
        # itself would count its keys, not tokens.
        que_len = len(self.tokenizer(parts[0] + 'AI### ')['input_ids'])
        labels[0: que_len] = IGNORE_INDEX
        # Padding shares its id with EOS, so mask it by position to keep the
        # real end-of-text token as a target.
        labels[real_len:] = IGNORE_INDEX

        image_token_id = self.tokenizer.convert_tokens_to_ids(self.image_token)
        image_token_position = (input_ids == image_token_id).nonzero(as_tuple=True)[0]

        return {
            'image_embedding': image_embedding,
            'input_ids': input_ids,
            'image_token_position': image_token_position,
            'labels': labels
        }

    def get_conversation_data(self):
        """Load the instruction JSON and flatten it into QnA samples."""
        with open(self.instruct_file, 'r') as f:
            instruct_file_json = json.load(f)
            return self.split_conversation(instruct_file_json)

    def split_conversation(self, instruct_file_json):
        """Turn each human/gpt exchange into one ``{'image_id', 'qna'}`` dict.

        Turns at even positions are questions and odd positions answers. A pair
        is kept only when the question comes from ``'human'``, the answer comes
        from ``'gpt'`` and the answer is non-empty, so every emitted text
        contains both the question and the answer part.
        """
        instruct_data = []
        seps = ['\n', '<|endoftext|>']
        for conv_dict in instruct_file_json:
            image_id = self.remove_leading_zeros(str(conv_dict.get('id')))
            conv = conv_dict.get('conversations')
            if image_id not in self.image_ids:
                continue

            question = ''
            for i, qa in enumerate(conv):
                role = qa['from']
                msg = qa['value'].replace('<image>', '')
                if i % 2 == 0:
                    # A new exchange starts; forget any unanswered question.
                    question = 'Human### ' + msg + seps[0] if role == 'human' else ''
                elif role == 'gpt' and question and msg:
                    instruct_data.append(dict(
                        image_id=image_id,
                        qna=question + 'AI### ' + msg + seps[1]
                    ))

        return instruct_data

    def remove_leading_zeros(self, number_string):
        """Strip leading zeros; an all-zero string becomes ``'0'``."""
        return number_string.lstrip('0') or '0'

    def prepare_conversation(self, conversations):
        """Join all turns of a conversation into one string.

        Not called anywhere in the visible notebook.
        """
        conversation = ""
        for turn in conversations:
            if turn['from'] == 'human':
                conversation += f"Human###: {turn['value']}\n"
            else:
                conversation += f"AI###: {turn['value']}{self.tokenizer.eos_token}"
        return conversation


class BucketBatchSampler(Sampler):
    def __init__(self, data_source, batch_size):
        super().__init__(data_source)
        self.data_source = data_source
        self.batch_size = batch_size
        self.buckets = self._create_buckets()

    def _create_buckets(self):
        # Sort data by sequence length and create buckets
        # This is a simplified example; adjust based on your actual data structure
        sorted_data = sorted(range(len(self.data_source)),
                             key=lambda idx: len(self.data_source[idx]['input_ids']))
        return [sorted_data[i:i + self.batch_size] for i in range(0, len(sorted_data), self.batch_size)]

    def __iter__(self):
        random.shuffle(self.buckets)
        for bucket in self.buckets:
            yield bucket

    def __len__(self):
        return len(self.buckets)

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from tqdm import tqdm
import itertools
from torch.nn.utils.rnn import pad_sequence
import gc

# Phi-2 tokenizer shared by the dataset, the collate function and save_model.
phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
# Padding reuses the end-of-text token, so padded positions cannot be told
# apart from a real EOS by token id alone.
phi_tokenizer.pad_token = phi_tokenizer.eos_token
# Register the image placeholder as a special token; the model's embedding
# table is resized to len(phi_tokenizer) when the model is built.
phi_tokenizer.add_special_tokens({'additional_special_tokens': [cfg['image_token2']]})

def fine_tune(model, dataloader, num_epochs, device):
    """Train the model with mixed precision and gradient accumulation.

    Gradients are accumulated over ``accumulation_steps`` batches, then unscaled,
    clipped to ``max_grad_norm`` and applied. Whenever a batch loss is lower than
    every earlier batch loss, the run state is written via ``save_model``.

    Args:
        model: module whose forward ``(input_ids, attention_mask, image_embedding,
            image_token_positions, labels)`` returns a scalar loss tensor.
        dataloader: yields dicts with ``input_ids``, ``image_embedding``,
            ``image_token_position`` and ``labels`` tensors.
        num_epochs: number of passes over ``dataloader``.
        device: device string such as ``'cuda'``; also used as the autocast
            device type.
    """
    model.train()
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(trainable_params, lr=5e-5)

    max_grad_norm = 1.0
    accumulation_steps = 2
    use_amp = True
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    step_count = 0          # batches seen so far, across epochs
    prev_loss = 1000        # lowest batch loss seen so far
    pending_batches = 0     # batches accumulated since the last optimizer step
    optimizer.zero_grad()

    def apply_accumulated_gradients():
        # Gradients must be unscaled before clipping; otherwise the threshold is
        # compared against loss-scaled values.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(trainable_params, max_grad_norm)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

    for epoch in range(num_epochs):
        total_loss = 0
        batch_iterator = tqdm(dataloader, desc=f"Processing Epoch {epoch:02d}")

        for batch in batch_iterator:
            input_ids = batch['input_ids'].to(device)
            image_embedding = batch['image_embedding'].to(device)
            image_token_positions = batch['image_token_position'].to(device)
            labels = batch['labels'].to(device)

            with torch.autocast(device_type=device, dtype=torch.float16, enabled=use_amp):
                loss = model(input_ids, None, image_embedding, image_token_positions, labels)

            if not torch.isfinite(loss):
                # Really skip: back-propagating or summing a nan/inf loss would
                # poison the gradients and the running total.
                print("WARNING: Loss is inf/nan! Skipping batch")
                step_count += 1
                continue

            try:
                # Divide so the accumulated gradient is the mean over the
                # accumulated batches rather than their sum.
                scaler.scale(loss / accumulation_steps).backward()
                pending_batches += 1
                if pending_batches == accumulation_steps:
                    apply_accumulated_gradients()
                    pending_batches = 0
            except RuntimeError as e:
                print(f"Error in loss computation: {str(e)}")
                print(f"input_ids shape: {tuple(input_ids.shape)}, labels shape: {tuple(labels.shape)}")
                print(f"input_ids dtype: {input_ids.dtype}, labels dtype: {labels.dtype}")
                # Drop whatever was accumulated; it may be incomplete.
                optimizer.zero_grad()
                pending_batches = 0
                step_count += 1
                continue

            loss_value = loss.item()
            total_loss += loss_value

            if loss_value < prev_loss:
                print(f"\n Epoch {epoch + 1}, step: {step_count}, Loss: {loss_value}, total loss: {total_loss}")
                save_model(epoch, model, total_loss, optimizer, step_count)
                prev_loss = loss_value
            elif step_count > 0 and step_count % 10 == 0:
                print(f"\n Epoch {epoch + 1}, step: {step_count}, loss: {loss_value}, total loss: {total_loss}")
                if device == 'cuda':
                    print('clearing cache')
                    gc.collect()

            # Clear cache periodically
            if step_count % 10 == 0 and device == 'cuda':
                torch.cuda.empty_cache()

            step_count += 1

        avg_loss = total_loss / max(len(dataloader), 1)
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss}")

    # Apply gradients left over from an incomplete accumulation window.
    if pending_batches > 0:
        apply_accumulated_gradients()


def save_model(epoch, model, loss, optimizer, step_count):
    """Persist the training progress, the LoRA adapter and the tokenizer.

    The optimizer state and counters go to one ``.pth`` file per epoch inside
    ``cfg['checkpoint_dir']`` (overwritten on every call within an epoch); the
    adapter and tokenizer are written to ``cfg['finetuned_dir']``.
    """
    print('saving model')

    checkpoint_path = '%s/phiModel_ckpt_%s.pth' % (cfg['checkpoint_dir'], epoch)
    training_state = {
        'epoch': epoch,
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
        'step_count': step_count,
    }
    torch.save(training_state, checkpoint_path)

    output_dir = cfg['finetuned_dir']
    model.phi_model.save_pretrained(output_dir)
    phi_tokenizer.save_pretrained(output_dir)


def run_fine_tune():
    """Build the dataset, dataloader and model, then fine-tune for one epoch.

    Forces ``cfg`` into training mode, picks CUDA, then MPS, then CPU, and seeds
    both torch and Python's ``random`` (the latter drives the bucket shuffle of
    the batch sampler) before the model is created.
    """
    import random

    # Make sure the model is built through the training branch.
    cfg['un_tuned'] = False
    cfg['inference'] = False
    cfg['raw_test'] = False

    device = 'cpu'
    if torch.cuda.is_available():
        print('using CUDA')
        device = 'cuda'
        torch.cuda.empty_cache()
    elif torch.backends.mps.is_available():
        device = 'mps'
        torch.mps.empty_cache()

    embedding_file = '/kaggle/input/coco2014embeddingsm2/coco2014_clip_embeddings_m2.h5'
    instruction_file = '/kaggle/input/llava-instruct-150k/llava_instruct_150k.json'

    # NOTE(review): cfg['batch_size'] (16) is not used here - confirm 12 is intended.
    batch_size = 12
    dataset = MultiModalLlavaDataset(embedding_file,
                                     instruction_file, phi_tokenizer)
    sampler = BucketBatchSampler(dataset, batch_size)
    dataloader = DataLoader(dataset, batch_sampler=sampler, collate_fn=collate_fn)

    torch.manual_seed(25)
    random.seed(25)

    model = PhiWithVision(device=device, tokenizer_length=len(phi_tokenizer))
    model.to(device)

    fine_tune(model, dataloader, 1, device)


def collate_fn(batch):
    """Combine dataset items into one batch dict.

    ``input_ids`` are right-padded to the longest sequence in the batch with the
    tokenizer's pad id; image embeddings, labels and image-token positions are
    stacked and therefore must already have matching shapes across the batch.

    Args:
        batch: list of dicts with ``image_embedding``, ``input_ids``,
            ``image_token_position`` and ``labels``.

    Returns:
        dict with the same four keys holding batched tensors.
    """
    # as_tensor reuses tensors that are already tensors; torch.tensor() on a
    # tensor copy-constructs it and emits a UserWarning.
    input_ids = [torch.as_tensor(item['input_ids']) for item in batch]
    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=phi_tokenizer.pad_token_id)

    image_embeddings = torch.stack([item['image_embedding'] for item in batch])
    labels = torch.stack([item['labels'] for item in batch])
    image_token_positions = torch.stack([item['image_token_position'] for item in batch])

    return {
        'image_embedding': image_embeddings,
        'input_ids': input_ids_padded,
        'image_token_position': image_token_positions,
        'labels': labels
    }

In [None]:
# Start the fine-tuning run (one epoch); progress, loss and checkpoint
# messages are printed below.
run_fine_tune()

using CUDA


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  trained_proj_model = torch.load(cfg['vision_projector_file'])
  scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
  input_ids = [torch.tensor(item['input_ids']) for item in batch]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]



 Epoch 1, step: 0, Loss: 6.850039005279541, total loss: 6.850039005279541
saving model


Processing Epoch 00:   0%|          | 1/18148 [00:06<34:11:20,  6.78s/it]


 Epoch 1, step: 1, Loss: 6.600619316101074, total loss: 13.450658321380615
saving model


Processing Epoch 00:   0%|          | 8/18148 [00:35<16:41:43,  3.31s/it]


 Epoch 1, step: 8, Loss: 6.5821027755737305, total loss: 66.05057764053345
saving model


Processing Epoch 00:   0%|          | 11/18148 [00:51<21:16:25,  4.22s/it]


 Epoch 1, step: 10, loss: 8.135393142700195, total loss: 83.15692567825317
clearing cache


Processing Epoch 00:   0%|          | 21/18148 [01:21<15:40:49,  3.11s/it]


 Epoch 1, step: 20, loss: 7.429264068603516, total loss: 161.38754796981812
clearing cache

 Epoch 1, step: 21, Loss: 6.460536956787109, total loss: 167.84808492660522
saving model


Processing Epoch 00:   0%|          | 25/18148 [01:41<19:17:27,  3.83s/it]


 Epoch 1, step: 25, Loss: 6.286007881164551, total loss: 194.79063987731934
saving model


Processing Epoch 00:   0%|          | 28/18148 [01:59<23:30:15,  4.67s/it]


 Epoch 1, step: 28, Loss: 6.115997314453125, total loss: 214.58948040008545
saving model


Processing Epoch 00:   0%|          | 31/18148 [02:18<25:26:53,  5.06s/it]


 Epoch 1, step: 30, loss: 6.947427749633789, total loss: 229.09789276123047
clearing cache


Processing Epoch 00:   0%|          | 32/18148 [02:21<22:25:46,  4.46s/it]


 Epoch 1, step: 32, Loss: 5.725915431976318, total loss: 242.28151750564575
saving model


Processing Epoch 00:   0%|          | 37/18148 [02:45<19:35:30,  3.89s/it]


 Epoch 1, step: 37, Loss: 4.883241653442383, total loss: 273.73510551452637
saving model


Processing Epoch 00:   0%|          | 41/18148 [03:07<21:40:48,  4.31s/it]


 Epoch 1, step: 40, loss: 5.978472709655762, total loss: 292.7849922180176
clearing cache


Processing Epoch 00:   0%|          | 47/18148 [03:25<16:21:15,  3.25s/it]


 Epoch 1, step: 47, Loss: 4.685049057006836, total loss: 333.1409683227539
saving model


Processing Epoch 00:   0%|          | 49/18148 [03:41<26:15:23,  5.22s/it]


 Epoch 1, step: 49, Loss: 4.54277229309082, total loss: 342.8160033226013
saving model


Processing Epoch 00:   0%|          | 51/18148 [03:57<30:56:33,  6.16s/it]


 Epoch 1, step: 50, loss: 5.085237503051758, total loss: 347.9012408256531
clearing cache


Processing Epoch 00:   0%|          | 55/18148 [04:09<19:17:29,  3.84s/it]


 Epoch 1, step: 55, Loss: 4.4929118156433105, total loss: 373.11967277526855
saving model


Processing Epoch 00:   0%|          | 57/18148 [04:24<26:02:19,  5.18s/it]


 Epoch 1, step: 57, Loss: 4.408607482910156, total loss: 382.71258783340454
saving model


Processing Epoch 00:   0%|          | 61/18148 [04:45<23:05:52,  4.60s/it]


 Epoch 1, step: 60, loss: 5.237782001495361, total loss: 397.40483379364014
clearing cache


Processing Epoch 00:   0%|          | 62/18148 [04:48<20:51:25,  4.15s/it]


 Epoch 1, step: 62, Loss: 4.026614189147949, total loss: 405.90236616134644
saving model


Processing Epoch 00:   0%|          | 66/18148 [05:09<21:21:03,  4.25s/it]


 Epoch 1, step: 66, Loss: 3.8999595642089844, total loss: 423.46519947052
saving model


Processing Epoch 00:   0%|          | 71/18148 [05:34<20:20:22,  4.05s/it]


 Epoch 1, step: 70, loss: 4.290029048919678, total loss: 442.51620626449585
clearing cache

 Epoch 1, step: 71, Loss: 3.8720600605010986, total loss: 446.38826632499695
saving model


Processing Epoch 00:   0%|          | 73/18148 [05:49<27:08:28,  5.41s/it]


 Epoch 1, step: 73, Loss: 3.7758092880249023, total loss: 455.1845133304596
saving model


Processing Epoch 00:   0%|          | 76/18148 [06:07<25:54:03,  5.16s/it]


 Epoch 1, step: 76, Loss: 3.7297120094299316, total loss: 468.6392900943756
saving model


Processing Epoch 00:   0%|          | 81/18148 [06:31<20:51:34,  4.16s/it]


 Epoch 1, step: 80, loss: 5.005265235900879, total loss: 488.1097958087921
clearing cache


Processing Epoch 00:   0%|          | 82/18148 [06:34<19:23:24,  3.86s/it]


 Epoch 1, step: 82, Loss: 3.6222176551818848, total loss: 496.72903990745544
saving model


Processing Epoch 00:   0%|          | 85/18148 [06:53<23:52:00,  4.76s/it]


 Epoch 1, step: 85, Loss: 3.4658756256103516, total loss: 509.43134331703186
saving model


Processing Epoch 00:   0%|          | 87/18148 [07:08<29:18:16,  5.84s/it]


 Epoch 1, step: 87, Loss: 3.1400036811828613, total loss: 516.3065645694733
saving model


Processing Epoch 00:   1%|          | 91/18148 [07:29<23:20:22,  4.65s/it]


 Epoch 1, step: 90, loss: 4.453973293304443, total loss: 527.1376550197601
clearing cache


Processing Epoch 00:   1%|          | 92/18148 [07:32<21:05:58,  4.21s/it]


 Epoch 1, step: 92, Loss: 3.1374599933624268, total loss: 533.6214730739594
saving model


Processing Epoch 00:   1%|          | 96/18148 [07:53<21:14:23,  4.24s/it]


 Epoch 1, step: 96, Loss: 3.096371650695801, total loss: 547.2974262237549
saving model


Processing Epoch 00:   1%|          | 101/18148 [08:17<20:00:56,  3.99s/it]


 Epoch 1, step: 100, loss: 3.5716891288757324, total loss: 561.5641138553619
clearing cache


Processing Epoch 00:   1%|          | 104/18148 [08:26<17:00:54,  3.39s/it]


 Epoch 1, step: 104, Loss: 3.068920612335205, total loss: 574.466001033783
saving model


Processing Epoch 00:   1%|          | 109/18148 [08:49<18:28:09,  3.69s/it]


 Epoch 1, step: 109, Loss: 3.048552989959717, total loss: 590.9507303237915
saving model


Processing Epoch 00:   1%|          | 111/18148 [09:05<26:33:07,  5.30s/it]


 Epoch 1, step: 110, loss: 3.093045234680176, total loss: 594.0437755584717
clearing cache


Processing Epoch 00:   1%|          | 115/18148 [09:17<18:03:54,  3.61s/it]


 Epoch 1, step: 115, Loss: 2.9548375606536865, total loss: 610.6806344985962
saving model


Processing Epoch 00:   1%|          | 119/18148 [09:38<20:48:47,  4.16s/it]


 Epoch 1, step: 119, Loss: 2.7924680709838867, total loss: 624.8444056510925
saving model


Processing Epoch 00:   1%|          | 121/18148 [09:53<27:22:18,  5.47s/it]


 Epoch 1, step: 120, loss: 2.7991762161254883, total loss: 627.643581867218
clearing cache


Processing Epoch 00:   1%|          | 126/18148 [10:09<17:40:37,  3.53s/it]


 Epoch 1, step: 126, Loss: 2.6399929523468018, total loss: 647.6398124694824
saving model


Processing Epoch 00:   1%|          | 131/18148 [10:33<19:09:51,  3.83s/it]


 Epoch 1, step: 130, loss: 2.793473243713379, total loss: 659.2374978065491
clearing cache

 Epoch 1, step: 131, Loss: 2.617830514907837, total loss: 661.8553283214569
saving model


Processing Epoch 00:   1%|          | 141/18148 [11:13<16:55:18,  3.38s/it]


 Epoch 1, step: 140, loss: 3.049771308898926, total loss: 689.4591565132141
clearing cache

 Epoch 1, step: 141, Loss: 2.5594072341918945, total loss: 692.018563747406
saving model


Processing Epoch 00:   1%|          | 150/18148 [11:50<16:20:22,  3.27s/it]


 Epoch 1, step: 150, Loss: 2.5282301902770996, total loss: 719.6436891555786
saving model


Processing Epoch 00:   1%|          | 154/18148 [12:11<19:53:52,  3.98s/it]


 Epoch 1, step: 154, Loss: 2.4078831672668457, total loss: 730.4662415981293
saving model


Processing Epoch 00:   1%|          | 161/18148 [12:42<18:00:37,  3.60s/it]


 Epoch 1, step: 160, loss: 2.6982269287109375, total loss: 747.7805867195129
clearing cache


Processing Epoch 00:   1%|          | 169/18148 [13:07<15:37:20,  3.13s/it]


 Epoch 1, step: 169, Loss: 2.2523207664489746, total loss: 773.5346245765686
saving model


Processing Epoch 00:   1%|          | 171/18148 [13:22<25:06:01,  5.03s/it]


 Epoch 1, step: 170, loss: 2.873903751373291, total loss: 776.4085283279419
clearing cache


Processing Epoch 00:   1%|          | 181/18148 [13:53<16:36:32,  3.33s/it]


 Epoch 1, step: 180, loss: 2.628145933151245, total loss: 804.4999949932098
clearing cache


Processing Epoch 00:   1%|          | 191/18148 [14:25<15:57:54,  3.20s/it]


 Epoch 1, step: 190, loss: 2.4692130088806152, total loss: 832.2587320804596
clearing cache


Processing Epoch 00:   1%|          | 201/18148 [14:55<15:44:13,  3.16s/it]


 Epoch 1, step: 200, loss: 2.4242215156555176, total loss: 857.219028711319
clearing cache


Processing Epoch 00:   1%|          | 211/18148 [15:27<15:57:55,  3.20s/it]


 Epoch 1, step: 210, loss: 3.221339464187622, total loss: 884.4601819515228
clearing cache


Processing Epoch 00:   1%|          | 220/18148 [15:54<15:19:34,  3.08s/it]


 Epoch 1, step: 220, Loss: 2.2354776859283447, total loss: 909.5898678302765
saving model


Processing Epoch 00:   1%|          | 225/18148 [16:18<18:18:29,  3.68s/it]


 Epoch 1, step: 225, Loss: 2.234790802001953, total loss: 923.4563131332397
saving model


Processing Epoch 00:   1%|▏         | 231/18148 [16:45<18:24:28,  3.70s/it]


 Epoch 1, step: 230, loss: 2.4144275188446045, total loss: 936.1716275215149
clearing cache


Processing Epoch 00:   1%|▏         | 241/18148 [17:17<15:58:02,  3.21s/it]


 Epoch 1, step: 240, loss: 2.4412670135498047, total loss: 961.3574652671814
clearing cache


Processing Epoch 00:   1%|▏         | 251/18148 [17:48<15:36:48,  3.14s/it]


 Epoch 1, step: 250, loss: 2.6122708320617676, total loss: 988.0473122596741
clearing cache


Processing Epoch 00:   1%|▏         | 252/18148 [17:51<15:33:32,  3.13s/it]


 Epoch 1, step: 252, Loss: 2.2069289684295654, total loss: 993.3836166858673
saving model


Processing Epoch 00:   1%|▏         | 256/18148 [18:12<19:52:00,  4.00s/it]


 Epoch 1, step: 256, Loss: 2.042423725128174, total loss: 1004.807003736496
saving model


Processing Epoch 00:   1%|▏         | 261/18148 [18:37<19:58:33,  4.02s/it]


 Epoch 1, step: 260, loss: 2.3108983039855957, total loss: 1014.2578237056732
clearing cache


Processing Epoch 00:   1%|▏         | 271/18148 [19:08<16:00:13,  3.22s/it]


 Epoch 1, step: 270, loss: 2.5162110328674316, total loss: 1041.0802178382874
clearing cache


Processing Epoch 00:   2%|▏         | 281/18148 [19:39<15:38:55,  3.15s/it]


 Epoch 1, step: 280, loss: 2.385115146636963, total loss: 1065.5866770744324
clearing cache


Processing Epoch 00:   2%|▏         | 291/18148 [20:10<15:50:54,  3.20s/it]


 Epoch 1, step: 290, loss: 2.4688377380371094, total loss: 1091.779799938202
clearing cache


Processing Epoch 00:   2%|▏         | 301/18148 [20:42<15:45:15,  3.18s/it]


 Epoch 1, step: 300, loss: 2.5525684356689453, total loss: 1118.335963010788
clearing cache


Processing Epoch 00:   2%|▏         | 311/18148 [21:13<15:43:03,  3.17s/it]


 Epoch 1, step: 310, loss: 2.3074228763580322, total loss: 1144.8466556072235
clearing cache


Processing Epoch 00:   2%|▏         | 317/18148 [21:31<15:12:44,  3.07s/it]


 Epoch 1, step: 317, Loss: 2.040566921234131, total loss: 1161.1508741378784
saving model


Processing Epoch 00:   2%|▏         | 321/18148 [21:52<20:02:59,  4.05s/it]


 Epoch 1, step: 320, loss: 2.1447577476501465, total loss: 1167.9702596664429
clearing cache


Processing Epoch 00:   2%|▏         | 331/18148 [22:24<16:16:53,  3.29s/it]


 Epoch 1, step: 330, loss: 3.0853495597839355, total loss: 1195.0530452728271
clearing cache


Processing Epoch 00:   2%|▏         | 341/18148 [22:55<15:37:16,  3.16s/it]


 Epoch 1, step: 340, loss: 2.3891396522521973, total loss: 1218.317259311676
clearing cache

 Epoch 1, step: 341, Loss: 2.0096585750579834, total loss: 1220.326917886734
saving model


Processing Epoch 00:   2%|▏         | 342/18148 [23:07<28:19:08,  5.73s/it]


 Epoch 1, step: 342, Loss: 1.996107816696167, total loss: 1222.3230257034302
saving model


Processing Epoch 00:   2%|▏         | 351/18148 [23:44<17:16:49,  3.50s/it]


 Epoch 1, step: 350, loss: 2.3336610794067383, total loss: 1242.9921116828918
clearing cache


Processing Epoch 00:   2%|▏         | 361/18148 [24:16<15:53:43,  3.22s/it]


 Epoch 1, step: 360, loss: 2.3260064125061035, total loss: 1267.9998407363892
clearing cache


Processing Epoch 00:   2%|▏         | 371/18148 [24:47<15:33:48,  3.15s/it]


 Epoch 1, step: 370, loss: 2.153712511062622, total loss: 1295.1949710845947
clearing cache


Processing Epoch 00:   2%|▏         | 381/18148 [25:18<15:45:42,  3.19s/it]


 Epoch 1, step: 380, loss: 2.250596046447754, total loss: 1321.0055086612701
clearing cache


Processing Epoch 00:   2%|▏         | 391/18148 [25:49<15:44:01,  3.19s/it]


 Epoch 1, step: 390, loss: 2.1959726810455322, total loss: 1347.5102779865265
clearing cache


Processing Epoch 00:   2%|▏         | 401/18148 [26:20<15:37:15,  3.17s/it]


 Epoch 1, step: 400, loss: 2.421706199645996, total loss: 1372.8670992851257
clearing cache


Processing Epoch 00:   2%|▏         | 411/18148 [26:51<15:37:41,  3.17s/it]


 Epoch 1, step: 410, loss: 2.1165499687194824, total loss: 1397.346444606781
clearing cache


Processing Epoch 00:   2%|▏         | 421/18148 [27:23<15:48:45,  3.21s/it]


 Epoch 1, step: 420, loss: 2.9880428314208984, total loss: 1426.6509590148926
clearing cache


Processing Epoch 00:   2%|▏         | 431/18148 [27:54<15:40:54,  3.19s/it]


 Epoch 1, step: 430, loss: 2.3792929649353027, total loss: 1450.8332114219666
clearing cache


Processing Epoch 00:   2%|▏         | 441/18148 [28:25<15:37:57,  3.18s/it]


 Epoch 1, step: 440, loss: 2.274524688720703, total loss: 1474.8847596645355
clearing cache


Processing Epoch 00:   2%|▏         | 451/18148 [28:56<15:46:35,  3.21s/it]


 Epoch 1, step: 450, loss: 2.904996633529663, total loss: 1500.1033599376678
clearing cache


Processing Epoch 00:   3%|▎         | 461/18148 [29:27<15:38:40,  3.18s/it]


 Epoch 1, step: 460, loss: 2.1622939109802246, total loss: 1523.0223441123962
clearing cache


Processing Epoch 00:   3%|▎         | 471/18148 [29:58<15:37:31,  3.18s/it]


 Epoch 1, step: 470, loss: 2.1216607093811035, total loss: 1547.974915266037
clearing cache


Processing Epoch 00:   3%|▎         | 481/18148 [30:29<15:38:24,  3.19s/it]


 Epoch 1, step: 480, loss: 2.449435234069824, total loss: 1572.3900225162506
clearing cache


Processing Epoch 00:   3%|▎         | 491/18148 [31:00<15:36:23,  3.18s/it]


 Epoch 1, step: 490, loss: 2.2202882766723633, total loss: 1595.7506666183472
clearing cache


Processing Epoch 00:   3%|▎         | 498/18148 [31:22<15:05:07,  3.08s/it]


 Epoch 1, step: 498, Loss: 1.967282772064209, total loss: 1613.9814162254333
saving model


Processing Epoch 00:   3%|▎         | 501/18148 [31:40<21:45:32,  4.44s/it]


 Epoch 1, step: 500, loss: 2.0246500968933105, total loss: 1618.9953365325928
clearing cache


Processing Epoch 00:   3%|▎         | 511/18148 [32:12<16:07:56,  3.29s/it]


 Epoch 1, step: 510, loss: 3.351444721221924, total loss: 1646.5902874469757
clearing cache


Processing Epoch 00:   3%|▎         | 521/18148 [32:43<15:31:40,  3.17s/it]


 Epoch 1, step: 520, loss: 3.0249905586242676, total loss: 1670.993479013443
clearing cache


Processing Epoch 00:   3%|▎         | 531/18148 [33:13<15:27:43,  3.16s/it]


 Epoch 1, step: 530, loss: 2.0885303020477295, total loss: 1694.4842958450317
clearing cache


Processing Epoch 00:   3%|▎         | 541/18148 [33:45<15:37:08,  3.19s/it]


 Epoch 1, step: 540, loss: 2.2731220722198486, total loss: 1720.5437836647034
clearing cache


Processing Epoch 00:   3%|▎         | 551/18148 [34:16<15:28:23,  3.17s/it]


 Epoch 1, step: 550, loss: 2.2759926319122314, total loss: 1745.5111784934998
clearing cache


Processing Epoch 00:   3%|▎         | 561/18148 [34:47<15:33:50,  3.19s/it]


 Epoch 1, step: 560, loss: 2.228478193283081, total loss: 1769.9812226295471
clearing cache


Processing Epoch 00:   3%|▎         | 563/18148 [34:53<15:19:31,  3.14s/it]


 Epoch 1, step: 563, Loss: 1.9544291496276855, total loss: 1777.3477709293365
saving model


Processing Epoch 00:   3%|▎         | 571/18148 [35:27<16:41:05,  3.42s/it]


 Epoch 1, step: 570, loss: 1.980024814605713, total loss: 1792.8807284832
clearing cache


Processing Epoch 00:   3%|▎         | 581/18148 [35:58<15:36:31,  3.20s/it]


 Epoch 1, step: 580, loss: 2.2482306957244873, total loss: 1818.6044809818268
clearing cache


Processing Epoch 00:   3%|▎         | 591/18148 [36:29<15:21:00,  3.15s/it]


 Epoch 1, step: 590, loss: 2.3050644397735596, total loss: 1843.300953388214
clearing cache


Processing Epoch 00:   3%|▎         | 601/18148 [37:00<15:30:10,  3.18s/it]


 Epoch 1, step: 600, loss: 2.303612470626831, total loss: 1867.260499238968
clearing cache


Processing Epoch 00:   3%|▎         | 611/18148 [37:31<15:32:23,  3.19s/it]


 Epoch 1, step: 610, loss: 3.0095880031585693, total loss: 1894.2795538902283
clearing cache


Processing Epoch 00:   3%|▎         | 613/18148 [37:37<15:13:34,  3.13s/it]


 Epoch 1, step: 613, Loss: 1.8344309329986572, total loss: 1901.2809090614319
saving model


Processing Epoch 00:   3%|▎         | 621/18148 [38:11<16:33:46,  3.40s/it]


 Epoch 1, step: 620, loss: 2.3261075019836426, total loss: 1917.8678576946259
clearing cache


Processing Epoch 00:   3%|▎         | 631/18148 [38:42<15:36:22,  3.21s/it]


 Epoch 1, step: 630, loss: 2.905473232269287, total loss: 1941.1625044345856
clearing cache


Processing Epoch 00:   4%|▎         | 641/18148 [39:13<15:19:10,  3.15s/it]


 Epoch 1, step: 640, loss: 2.156903028488159, total loss: 1964.6492612361908
clearing cache


Processing Epoch 00:   4%|▎         | 651/18148 [39:44<15:28:54,  3.19s/it]


 Epoch 1, step: 650, loss: 2.339179039001465, total loss: 1988.5186760425568
clearing cache


Processing Epoch 00:   4%|▎         | 661/18148 [40:15<15:25:58,  3.18s/it]


 Epoch 1, step: 660, loss: 2.0232625007629395, total loss: 2011.5852708816528
clearing cache


Processing Epoch 00:   4%|▎         | 671/18148 [40:46<15:26:48,  3.18s/it]


 Epoch 1, step: 670, loss: 2.224278211593628, total loss: 2036.8679077625275
clearing cache


Processing Epoch 00:   4%|▍         | 681/18148 [41:18<15:29:52,  3.19s/it]


 Epoch 1, step: 680, loss: 2.8875553607940674, total loss: 2062.5720162391663
clearing cache


Processing Epoch 00:   4%|▍         | 691/18148 [41:49<15:33:42,  3.21s/it]


 Epoch 1, step: 690, loss: 3.3262224197387695, total loss: 2089.284444093704
clearing cache


Processing Epoch 00:   4%|▍         | 701/18148 [42:20<15:24:24,  3.18s/it]


 Epoch 1, step: 700, loss: 2.0748467445373535, total loss: 2117.496412038803
clearing cache


Processing Epoch 00:   4%|▍         | 711/18148 [42:51<15:27:56,  3.19s/it]


 Epoch 1, step: 710, loss: 2.959259510040283, total loss: 2142.097090244293
clearing cache


Processing Epoch 00:   4%|▍         | 721/18148 [43:22<15:24:28,  3.18s/it]


 Epoch 1, step: 720, loss: 1.9379225969314575, total loss: 2165.9610739946365
clearing cache


Processing Epoch 00:   4%|▍         | 731/18148 [43:53<15:28:25,  3.20s/it]


 Epoch 1, step: 730, loss: 3.2535433769226074, total loss: 2191.8494757413864
clearing cache


Processing Epoch 00:   4%|▍         | 741/18148 [44:24<15:25:53,  3.19s/it]


 Epoch 1, step: 740, loss: 2.9818167686462402, total loss: 2217.2737637758255
clearing cache


Processing Epoch 00:   4%|▍         | 751/18148 [44:56<15:19:26,  3.17s/it]


 Epoch 1, step: 750, loss: 2.200916290283203, total loss: 2241.155100464821
clearing cache


Processing Epoch 00:   4%|▍         | 761/18148 [45:27<15:29:28,  3.21s/it]


 Epoch 1, step: 760, loss: 3.247098922729492, total loss: 2268.851455807686
clearing cache


Processing Epoch 00:   4%|▍         | 771/18148 [45:58<15:19:17,  3.17s/it]


 Epoch 1, step: 770, loss: 2.1211938858032227, total loss: 2293.1136511564255
clearing cache


Processing Epoch 00:   4%|▍         | 781/18148 [46:29<15:19:22,  3.18s/it]


 Epoch 1, step: 780, loss: 2.3158655166625977, total loss: 2317.2544173002243
clearing cache


Processing Epoch 00:   4%|▍         | 791/18148 [47:00<15:21:23,  3.19s/it]


 Epoch 1, step: 790, loss: 2.0472872257232666, total loss: 2342.160280585289
clearing cache


Processing Epoch 00:   4%|▍         | 801/18148 [47:31<15:19:06,  3.18s/it]


 Epoch 1, step: 800, loss: 2.135244846343994, total loss: 2367.6148577928543
clearing cache


Processing Epoch 00:   4%|▍         | 811/18148 [48:02<15:21:03,  3.19s/it]


 Epoch 1, step: 810, loss: 2.0579633712768555, total loss: 2391.288976073265
clearing cache


Processing Epoch 00:   5%|▍         | 821/18148 [48:33<15:20:52,  3.19s/it]


 Epoch 1, step: 820, loss: 2.2800369262695312, total loss: 2417.392006754875
clearing cache


Processing Epoch 00:   5%|▍         | 831/18148 [49:04<15:14:03,  3.17s/it]


 Epoch 1, step: 830, loss: 2.259941816329956, total loss: 2443.5715814828873
clearing cache


Processing Epoch 00:   5%|▍         | 841/18148 [49:35<15:16:55,  3.18s/it]


 Epoch 1, step: 840, loss: 2.270778179168701, total loss: 2467.790225625038
clearing cache


Processing Epoch 00:   5%|▍         | 851/18148 [50:07<15:15:08,  3.17s/it]


 Epoch 1, step: 850, loss: 2.426717519760132, total loss: 2490.807570576668
clearing cache


Processing Epoch 00:   5%|▍         | 861/18148 [50:38<15:15:36,  3.18s/it]


 Epoch 1, step: 860, loss: 2.146019697189331, total loss: 2514.6900309324265
clearing cache


Processing Epoch 00:   5%|▍         | 871/18148 [51:09<15:14:57,  3.18s/it]


 Epoch 1, step: 870, loss: 2.201046943664551, total loss: 2538.890837788582
clearing cache


Processing Epoch 00:   5%|▍         | 881/18148 [51:40<15:18:36,  3.19s/it]


 Epoch 1, step: 880, loss: 2.815089702606201, total loss: 2562.751648545265
clearing cache


Processing Epoch 00:   5%|▍         | 891/18148 [52:11<15:16:10,  3.19s/it]


 Epoch 1, step: 890, loss: 2.56602144241333, total loss: 2587.1516379117966
clearing cache


Processing Epoch 00:   5%|▍         | 901/18148 [52:42<15:16:17,  3.19s/it]


 Epoch 1, step: 900, loss: 2.0769002437591553, total loss: 2610.1594537496567
clearing cache


Processing Epoch 00:   5%|▌         | 911/18148 [53:13<15:14:47,  3.18s/it]


 Epoch 1, step: 910, loss: 2.244428873062134, total loss: 2633.389646410942
clearing cache


Processing Epoch 00:   5%|▌         | 921/18148 [53:44<15:12:33,  3.18s/it]


 Epoch 1, step: 920, loss: 2.158707618713379, total loss: 2656.6539758443832
clearing cache


Processing Epoch 00:   5%|▌         | 931/18148 [54:15<15:07:31,  3.16s/it]


 Epoch 1, step: 930, loss: 2.2032670974731445, total loss: 2678.4130750894547
clearing cache


Processing Epoch 00:   5%|▌         | 941/18148 [54:46<15:07:53,  3.17s/it]


 Epoch 1, step: 940, loss: 2.495980978012085, total loss: 2702.571048140526
clearing cache


Processing Epoch 00:   5%|▌         | 951/18148 [55:17<15:13:34,  3.19s/it]


 Epoch 1, step: 950, loss: 3.2811880111694336, total loss: 2728.6564232110977
clearing cache


Processing Epoch 00:   5%|▌         | 961/18148 [55:48<15:10:04,  3.18s/it]


 Epoch 1, step: 960, loss: 2.175062656402588, total loss: 2751.833273291588
clearing cache


Processing Epoch 00:   5%|▌         | 971/18148 [56:19<15:11:32,  3.18s/it]


 Epoch 1, step: 970, loss: 2.095886707305908, total loss: 2775.4769228696823
clearing cache


Processing Epoch 00:   5%|▌         | 981/18148 [56:50<15:14:13,  3.20s/it]


 Epoch 1, step: 980, loss: 2.2002623081207275, total loss: 2800.2239891290665
clearing cache


Processing Epoch 00:   5%|▌         | 991/18148 [57:21<15:09:27,  3.18s/it]


 Epoch 1, step: 990, loss: 2.2100017070770264, total loss: 2825.560002684593
clearing cache


Processing Epoch 00:   6%|▌         | 1001/18148 [57:52<15:12:49,  3.19s/it]


 Epoch 1, step: 1000, loss: 2.898031711578369, total loss: 2850.918910384178
clearing cache


Processing Epoch 00:   6%|▌         | 1011/18148 [58:24<15:11:01,  3.19s/it]


 Epoch 1, step: 1010, loss: 2.9594855308532715, total loss: 2875.802101612091
clearing cache


Processing Epoch 00:   6%|▌         | 1021/18148 [58:55<15:08:26,  3.18s/it]


 Epoch 1, step: 1020, loss: 2.1364967823028564, total loss: 2898.106516599655
clearing cache


Processing Epoch 00:   6%|▌         | 1031/18148 [59:26<15:09:21,  3.19s/it]


 Epoch 1, step: 1030, loss: 2.233792781829834, total loss: 2924.2225091457367
clearing cache


Processing Epoch 00:   6%|▌         | 1041/18148 [59:57<15:14:37,  3.21s/it]


 Epoch 1, step: 1040, loss: 2.800271987915039, total loss: 2949.771227836609
clearing cache


Processing Epoch 00:   6%|▌         | 1051/18148 [1:00:28<15:07:59,  3.19s/it]


 Epoch 1, step: 1050, loss: 2.3777084350585938, total loss: 2974.5277910232544
clearing cache


Processing Epoch 00:   6%|▌         | 1061/18148 [1:00:59<15:03:08,  3.17s/it]


 Epoch 1, step: 1060, loss: 2.159801721572876, total loss: 2997.1419427394867
clearing cache


Processing Epoch 00:   6%|▌         | 1071/18148 [1:01:30<15:07:45,  3.19s/it]


 Epoch 1, step: 1070, loss: 2.38476300239563, total loss: 3021.207785844803
clearing cache


Processing Epoch 00:   6%|▌         | 1081/18148 [1:02:01<15:06:09,  3.19s/it]


 Epoch 1, step: 1080, loss: 2.26289701461792, total loss: 3043.704032897949
clearing cache


Processing Epoch 00:   6%|▌         | 1091/18148 [1:02:32<15:01:26,  3.17s/it]


 Epoch 1, step: 1090, loss: 2.162109136581421, total loss: 3068.426950931549
clearing cache


Processing Epoch 00:   6%|▌         | 1101/18148 [1:03:03<15:03:55,  3.18s/it]


 Epoch 1, step: 1100, loss: 2.206913709640503, total loss: 3093.0203351974487
clearing cache


Processing Epoch 00:   6%|▌         | 1111/18148 [1:03:35<15:09:03,  3.20s/it]


 Epoch 1, step: 1110, loss: 2.961639165878296, total loss: 3118.684896469116
clearing cache


Processing Epoch 00:   6%|▌         | 1121/18148 [1:04:06<15:07:33,  3.20s/it]


 Epoch 1, step: 1120, loss: 2.9369959831237793, total loss: 3143.7985594272614
clearing cache


Processing Epoch 00:   6%|▌         | 1131/18148 [1:04:37<15:03:39,  3.19s/it]


 Epoch 1, step: 1130, loss: 2.2192564010620117, total loss: 3169.8976385593414
clearing cache


Processing Epoch 00:   6%|▋         | 1141/18148 [1:05:08<14:58:31,  3.17s/it]


 Epoch 1, step: 1140, loss: 2.260789155960083, total loss: 3191.9980051517487
clearing cache


Processing Epoch 00:   6%|▋         | 1151/18148 [1:05:39<15:00:38,  3.18s/it]


 Epoch 1, step: 1150, loss: 2.155158281326294, total loss: 3217.453131914139
clearing cache


Processing Epoch 00:   6%|▋         | 1161/18148 [1:06:10<15:00:47,  3.18s/it]


 Epoch 1, step: 1160, loss: 2.193052053451538, total loss: 3241.233726501465
clearing cache


Processing Epoch 00:   6%|▋         | 1171/18148 [1:06:41<14:56:03,  3.17s/it]


 Epoch 1, step: 1170, loss: 2.0138683319091797, total loss: 3264.638378381729
clearing cache


Processing Epoch 00:   7%|▋         | 1181/18148 [1:07:12<14:56:58,  3.17s/it]


 Epoch 1, step: 1180, loss: 2.3207826614379883, total loss: 3287.323746442795
clearing cache


Processing Epoch 00:   7%|▋         | 1191/18148 [1:07:43<14:59:44,  3.18s/it]


 Epoch 1, step: 1190, loss: 2.2416677474975586, total loss: 3311.4734346866608
clearing cache


Processing Epoch 00:   7%|▋         | 1201/18148 [1:08:14<14:58:04,  3.18s/it]


 Epoch 1, step: 1200, loss: 1.933261752128601, total loss: 3336.750243782997
clearing cache


Processing Epoch 00:   7%|▋         | 1211/18148 [1:08:45<14:59:25,  3.19s/it]


 Epoch 1, step: 1210, loss: 2.9562911987304688, total loss: 3361.0000644922256
clearing cache


Processing Epoch 00:   7%|▋         | 1221/18148 [1:09:16<14:59:36,  3.19s/it]


 Epoch 1, step: 1220, loss: 2.9131956100463867, total loss: 3385.519650578499
clearing cache


Processing Epoch 00:   7%|▋         | 1231/18148 [1:09:47<14:54:52,  3.17s/it]


 Epoch 1, step: 1230, loss: 2.1672916412353516, total loss: 3409.510558485985
clearing cache


Processing Epoch 00:   7%|▋         | 1241/18148 [1:10:19<14:58:54,  3.19s/it]


 Epoch 1, step: 1240, loss: 2.3801817893981934, total loss: 3434.622718691826
clearing cache


Processing Epoch 00:   7%|▋         | 1251/18148 [1:10:50<14:55:25,  3.18s/it]


 Epoch 1, step: 1250, loss: 2.2318387031555176, total loss: 3457.801392674446
clearing cache


Processing Epoch 00:   7%|▋         | 1261/18148 [1:11:21<14:54:37,  3.18s/it]


 Epoch 1, step: 1260, loss: 2.04567813873291, total loss: 3480.9052037000656
clearing cache


Processing Epoch 00:   7%|▋         | 1271/18148 [1:11:52<14:55:17,  3.18s/it]


 Epoch 1, step: 1270, loss: 2.1024441719055176, total loss: 3504.727166056633
clearing cache


Processing Epoch 00:   7%|▋         | 1281/18148 [1:12:23<14:59:44,  3.20s/it]


 Epoch 1, step: 1280, loss: 2.8789544105529785, total loss: 3529.7794865369797
clearing cache


Processing Epoch 00:   7%|▋         | 1291/18148 [1:12:54<14:58:53,  3.20s/it]


 Epoch 1, step: 1290, loss: 2.2001073360443115, total loss: 3554.899499297142
clearing cache


Processing Epoch 00:   7%|▋         | 1301/18148 [1:13:25<14:57:26,  3.20s/it]


 Epoch 1, step: 1300, loss: 2.8976919651031494, total loss: 3578.9826353788376
clearing cache


Processing Epoch 00:   7%|▋         | 1311/18148 [1:13:57<14:54:49,  3.19s/it]


 Epoch 1, step: 1310, loss: 2.2223148345947266, total loss: 3601.9438792467117
clearing cache


Processing Epoch 00:   7%|▋         | 1321/18148 [1:14:28<14:54:36,  3.19s/it]


 Epoch 1, step: 1320, loss: 2.0635833740234375, total loss: 3628.233971953392
clearing cache


Processing Epoch 00:   7%|▋         | 1331/18148 [1:14:59<14:53:44,  3.19s/it]


 Epoch 1, step: 1330, loss: 2.241278886795044, total loss: 3652.702175974846
clearing cache


Processing Epoch 00:   7%|▋         | 1341/18148 [1:15:30<14:46:56,  3.17s/it]


 Epoch 1, step: 1340, loss: 2.0227389335632324, total loss: 3676.169049143791
clearing cache


Processing Epoch 00:   7%|▋         | 1351/18148 [1:16:01<14:46:58,  3.17s/it]


 Epoch 1, step: 1350, loss: 2.1534268856048584, total loss: 3699.567771792412
clearing cache


Processing Epoch 00:   7%|▋         | 1361/18148 [1:16:32<14:50:42,  3.18s/it]


 Epoch 1, step: 1360, loss: 2.224114418029785, total loss: 3724.5011311769485
clearing cache


Processing Epoch 00:   8%|▊         | 1371/18148 [1:17:03<14:49:37,  3.18s/it]


 Epoch 1, step: 1370, loss: 1.979235291481018, total loss: 3749.6884121894836
clearing cache


Processing Epoch 00:   8%|▊         | 1381/18148 [1:17:34<14:47:33,  3.18s/it]


 Epoch 1, step: 1380, loss: 2.1418421268463135, total loss: 3772.171168088913
clearing cache


Processing Epoch 00:   8%|▊         | 1391/18148 [1:18:05<14:51:24,  3.19s/it]


 Epoch 1, step: 1390, loss: 2.1303136348724365, total loss: 3797.9963960647583
clearing cache


Processing Epoch 00:   8%|▊         | 1401/18148 [1:18:37<14:49:07,  3.19s/it]


 Epoch 1, step: 1400, loss: 2.2845070362091064, total loss: 3824.2642216682434
clearing cache


Processing Epoch 00:   8%|▊         | 1411/18148 [1:19:08<14:51:05,  3.19s/it]


 Epoch 1, step: 1410, loss: 2.926476001739502, total loss: 3848.1368687152863
clearing cache


Processing Epoch 00:   8%|▊         | 1421/18148 [1:19:39<14:52:46,  3.20s/it]


 Epoch 1, step: 1420, loss: 2.20025372505188, total loss: 3873.002354860306
clearing cache


Processing Epoch 00:   8%|▊         | 1431/18148 [1:20:10<14:46:09,  3.18s/it]


 Epoch 1, step: 1430, loss: 2.0182318687438965, total loss: 3896.1639437675476
clearing cache


Processing Epoch 00:   8%|▊         | 1441/18148 [1:20:41<14:46:29,  3.18s/it]


 Epoch 1, step: 1440, loss: 2.82460355758667, total loss: 3919.521354198456
clearing cache


Processing Epoch 00:   8%|▊         | 1451/18148 [1:21:12<14:44:32,  3.18s/it]


 Epoch 1, step: 1450, loss: 2.1926109790802, total loss: 3943.602590560913
clearing cache


Processing Epoch 00:   8%|▊         | 1461/18148 [1:21:43<14:44:16,  3.18s/it]


 Epoch 1, step: 1460, loss: 2.2454075813293457, total loss: 3968.5090458393097
clearing cache


Processing Epoch 00:   8%|▊         | 1471/18148 [1:22:14<14:42:08,  3.17s/it]


 Epoch 1, step: 1470, loss: 1.9546267986297607, total loss: 3991.2185990810394
clearing cache


Processing Epoch 00:   8%|▊         | 1481/18148 [1:22:45<14:41:16,  3.17s/it]


 Epoch 1, step: 1480, loss: 2.18955659866333, total loss: 4014.3281679153442
clearing cache


Processing Epoch 00:   8%|▊         | 1491/18148 [1:23:16<14:44:57,  3.19s/it]


 Epoch 1, step: 1490, loss: 2.468745708465576, total loss: 4038.0914756059647
clearing cache


Processing Epoch 00:   8%|▊         | 1501/18148 [1:23:47<14:44:07,  3.19s/it]


 Epoch 1, step: 1500, loss: 2.3196356296539307, total loss: 4062.840122103691
clearing cache


Processing Epoch 00:   8%|▊         | 1511/18148 [1:24:18<14:40:52,  3.18s/it]


 Epoch 1, step: 1510, loss: 2.869065761566162, total loss: 4087.1887818574905
clearing cache


Processing Epoch 00:   8%|▊         | 1521/18148 [1:24:50<14:48:09,  3.21s/it]


 Epoch 1, step: 1520, loss: 2.970651149749756, total loss: 4113.309210419655
clearing cache


Processing Epoch 00:   8%|▊         | 1531/18148 [1:25:21<14:38:53,  3.17s/it]


 Epoch 1, step: 1530, loss: 2.115555763244629, total loss: 4136.396614432335
clearing cache


Processing Epoch 00:   8%|▊         | 1541/18148 [1:25:52<14:40:04,  3.18s/it]


 Epoch 1, step: 1540, loss: 2.271221876144409, total loss: 4159.366131186485
clearing cache


Processing Epoch 00:   9%|▊         | 1551/18148 [1:26:23<14:41:18,  3.19s/it]


 Epoch 1, step: 1550, loss: 3.1543073654174805, total loss: 4184.341162323952
clearing cache


Processing Epoch 00:   9%|▊         | 1561/18148 [1:26:54<14:37:30,  3.17s/it]


 Epoch 1, step: 1560, loss: 2.033963441848755, total loss: 4207.9695254564285
clearing cache


Processing Epoch 00:   9%|▊         | 1571/18148 [1:27:25<14:34:19,  3.16s/it]


 Epoch 1, step: 1570, loss: 2.357609272003174, total loss: 4231.182483792305
clearing cache


Processing Epoch 00:   9%|▊         | 1581/18148 [1:27:56<14:44:16,  3.20s/it]


 Epoch 1, step: 1580, loss: 2.84488844871521, total loss: 4255.593194365501
clearing cache


Processing Epoch 00:   9%|▉         | 1591/18148 [1:28:27<14:37:48,  3.18s/it]


 Epoch 1, step: 1590, loss: 2.1709251403808594, total loss: 4278.831440329552
clearing cache


Processing Epoch 00:   9%|▉         | 1601/18148 [1:28:58<14:35:51,  3.18s/it]


 Epoch 1, step: 1600, loss: 2.3579018115997314, total loss: 4301.677288651466
clearing cache


Processing Epoch 00:   9%|▉         | 1611/18148 [1:29:29<14:38:59,  3.19s/it]


 Epoch 1, step: 1610, loss: 2.974252223968506, total loss: 4325.4751452207565
clearing cache


Processing Epoch 00:   9%|▉         | 1621/18148 [1:30:00<14:36:17,  3.18s/it]


 Epoch 1, step: 1620, loss: 2.024263620376587, total loss: 4346.895794034004
clearing cache


Processing Epoch 00:   9%|▉         | 1631/18148 [1:30:31<14:31:17,  3.17s/it]


 Epoch 1, step: 1630, loss: 2.1744096279144287, total loss: 4371.142883181572
clearing cache


Processing Epoch 00:   9%|▉         | 1641/18148 [1:31:02<14:35:06,  3.18s/it]


 Epoch 1, step: 1640, loss: 2.1324450969696045, total loss: 4394.7998785972595
clearing cache


Processing Epoch 00:   9%|▉         | 1651/18148 [1:31:33<14:31:27,  3.17s/it]


 Epoch 1, step: 1650, loss: 2.8277230262756348, total loss: 4417.67311835289
clearing cache


Processing Epoch 00:   9%|▉         | 1661/18148 [1:32:04<14:34:46,  3.18s/it]


 Epoch 1, step: 1660, loss: 2.144550323486328, total loss: 4443.081123590469
clearing cache


Processing Epoch 00:   9%|▉         | 1671/18148 [1:32:35<14:35:45,  3.19s/it]


 Epoch 1, step: 1670, loss: 2.1731200218200684, total loss: 4467.7394008636475
clearing cache


Processing Epoch 00:   9%|▉         | 1681/18148 [1:33:06<14:29:40,  3.17s/it]


 Epoch 1, step: 1680, loss: 2.170727252960205, total loss: 4490.937757968903
clearing cache


Processing Epoch 00:   9%|▉         | 1691/18148 [1:33:37<14:26:05,  3.16s/it]


 Epoch 1, step: 1690, loss: 2.164163827896118, total loss: 4513.607130765915
clearing cache


Processing Epoch 00:   9%|▉         | 1701/18148 [1:34:08<14:29:44,  3.17s/it]


 Epoch 1, step: 1700, loss: 2.568674087524414, total loss: 4538.342000722885
clearing cache


Processing Epoch 00:   9%|▉         | 1711/18148 [1:34:39<14:31:20,  3.18s/it]


 Epoch 1, step: 1710, loss: 2.307845115661621, total loss: 4563.428167581558
clearing cache


Processing Epoch 00:   9%|▉         | 1721/18148 [1:35:10<14:25:56,  3.16s/it]


 Epoch 1, step: 1720, loss: 2.0895864963531494, total loss: 4586.628712415695
clearing cache


Processing Epoch 00:  10%|▉         | 1731/18148 [1:35:41<14:26:58,  3.17s/it]


 Epoch 1, step: 1730, loss: 2.137885332107544, total loss: 4609.104560375214
clearing cache


Processing Epoch 00:  10%|▉         | 1741/18148 [1:36:12<14:30:12,  3.18s/it]


 Epoch 1, step: 1740, loss: 2.4251747131347656, total loss: 4633.551164627075
clearing cache


Processing Epoch 00:  10%|▉         | 1751/18148 [1:36:43<14:32:23,  3.19s/it]


 Epoch 1, step: 1750, loss: 2.2544493675231934, total loss: 4657.741641998291
clearing cache


Processing Epoch 00:  10%|▉         | 1760/18148 [1:37:11<14:05:50,  3.10s/it]


 Epoch 1, step: 1760, Loss: 1.7570056915283203, total loss: 4681.0586359500885
saving model


Processing Epoch 00:  10%|▉         | 1771/18148 [1:37:54<15:00:46,  3.30s/it]


 Epoch 1, step: 1770, loss: 2.319011926651001, total loss: 4703.933990955353
clearing cache


Processing Epoch 00:  10%|▉         | 1781/18148 [1:38:25<14:27:15,  3.18s/it]


 Epoch 1, step: 1780, loss: 3.1870815753936768, total loss: 4728.812840461731
clearing cache


Processing Epoch 00:  10%|▉         | 1791/18148 [1:38:56<14:19:15,  3.15s/it]


 Epoch 1, step: 1790, loss: 2.1843395233154297, total loss: 4750.782202243805
clearing cache


Processing Epoch 00:  10%|▉         | 1801/18148 [1:39:27<14:25:12,  3.18s/it]


 Epoch 1, step: 1800, loss: 1.985359787940979, total loss: 4774.613359093666
clearing cache


Processing Epoch 00:  10%|▉         | 1811/18148 [1:39:58<14:24:33,  3.18s/it]


 Epoch 1, step: 1810, loss: 2.261516571044922, total loss: 4799.458011746407
clearing cache


Processing Epoch 00:  10%|█         | 1821/18148 [1:40:29<14:17:29,  3.15s/it]


 Epoch 1, step: 1820, loss: 2.091879367828369, total loss: 4820.566033244133
clearing cache


Processing Epoch 00:  10%|█         | 1831/18148 [1:41:00<14:20:18,  3.16s/it]


 Epoch 1, step: 1830, loss: 2.0418002605438232, total loss: 4843.129134058952
clearing cache


Processing Epoch 00:  10%|█         | 1841/18148 [1:41:31<14:22:39,  3.17s/it]


 Epoch 1, step: 1840, loss: 2.172896146774292, total loss: 4869.580063223839
clearing cache


Processing Epoch 00:  10%|█         | 1851/18148 [1:42:02<14:23:18,  3.18s/it]


 Epoch 1, step: 1850, loss: 2.9245872497558594, total loss: 4893.007845759392
clearing cache


Processing Epoch 00:  10%|█         | 1861/18148 [1:42:33<14:26:22,  3.19s/it]


 Epoch 1, step: 1860, loss: 2.5413103103637695, total loss: 4919.086686015129
clearing cache


Processing Epoch 00:  10%|█         | 1871/18148 [1:43:04<14:24:01,  3.18s/it]


 Epoch 1, step: 1870, loss: 2.2299792766571045, total loss: 4945.159052491188
clearing cache


Processing Epoch 00:  10%|█         | 1881/18148 [1:43:35<14:17:06,  3.16s/it]


 Epoch 1, step: 1880, loss: 2.2082161903381348, total loss: 4967.06012070179
clearing cache


Processing Epoch 00:  10%|█         | 1891/18148 [1:44:06<14:21:31,  3.18s/it]


 Epoch 1, step: 1890, loss: 2.076453685760498, total loss: 4988.82629430294
clearing cache


Processing Epoch 00:  10%|█         | 1901/18148 [1:44:37<14:19:10,  3.17s/it]


 Epoch 1, step: 1900, loss: 2.2729032039642334, total loss: 5011.147926926613
clearing cache


Processing Epoch 00:  11%|█         | 1911/18148 [1:45:08<14:21:12,  3.18s/it]


 Epoch 1, step: 1910, loss: 2.820417881011963, total loss: 5035.628177762032
clearing cache


Processing Epoch 00:  11%|█         | 1921/18148 [1:45:39<14:17:13,  3.17s/it]


 Epoch 1, step: 1920, loss: 1.982398509979248, total loss: 5057.6932364702225
clearing cache


Processing Epoch 00:  11%|█         | 1931/18148 [1:46:10<14:23:20,  3.19s/it]


 Epoch 1, step: 1930, loss: 2.8889219760894775, total loss: 5084.761888384819
clearing cache


Processing Epoch 00:  11%|█         | 1941/18148 [1:46:41<14:17:06,  3.17s/it]


 Epoch 1, step: 1940, loss: 2.1578054428100586, total loss: 5107.841094851494
clearing cache


Processing Epoch 00:  11%|█         | 1951/18148 [1:47:12<14:20:03,  3.19s/it]


 Epoch 1, step: 1950, loss: 2.895670175552368, total loss: 5131.800979733467
clearing cache


Processing Epoch 00:  11%|█         | 1961/18148 [1:47:43<14:15:22,  3.17s/it]


 Epoch 1, step: 1960, loss: 2.248262882232666, total loss: 5154.408083319664
clearing cache


Processing Epoch 00:  11%|█         | 1971/18148 [1:48:15<14:15:32,  3.17s/it]


 Epoch 1, step: 1970, loss: 2.2363388538360596, total loss: 5176.493359804153
clearing cache


Processing Epoch 00:  11%|█         | 1981/18148 [1:48:45<14:14:33,  3.17s/it]


 Epoch 1, step: 1980, loss: 2.896366596221924, total loss: 5198.536709785461
clearing cache


Processing Epoch 00:  11%|█         | 1991/18148 [1:49:16<14:10:04,  3.16s/it]


 Epoch 1, step: 1990, loss: 2.09220027923584, total loss: 5222.225427865982
clearing cache


Processing Epoch 00:  11%|█         | 2001/18148 [1:49:47<14:16:35,  3.18s/it]


 Epoch 1, step: 2000, loss: 2.9357705116271973, total loss: 5248.447202682495
clearing cache


Processing Epoch 00:  11%|█         | 2011/18148 [1:50:18<14:14:18,  3.18s/it]


 Epoch 1, step: 2010, loss: 2.9740915298461914, total loss: 5270.52509021759
clearing cache


Processing Epoch 00:  11%|█         | 2021/18148 [1:50:49<14:19:13,  3.20s/it]


 Epoch 1, step: 2020, loss: 3.1487483978271484, total loss: 5294.213371515274
clearing cache


Processing Epoch 00:  11%|█         | 2031/18148 [1:51:20<14:15:47,  3.19s/it]


 Epoch 1, step: 2030, loss: 2.8097801208496094, total loss: 5317.5145399570465
clearing cache


Processing Epoch 00:  11%|█         | 2041/18148 [1:51:51<14:12:37,  3.18s/it]


 Epoch 1, step: 2040, loss: 2.0997698307037354, total loss: 5340.744645357132
clearing cache


Processing Epoch 00:  11%|█▏        | 2051/18148 [1:52:23<14:14:49,  3.19s/it]


 Epoch 1, step: 2050, loss: 2.1800835132598877, total loss: 5366.887772679329
clearing cache


Processing Epoch 00:  11%|█▏        | 2061/18148 [1:52:54<14:15:31,  3.19s/it]


 Epoch 1, step: 2060, loss: 2.8811569213867188, total loss: 5392.956899762154
clearing cache


Processing Epoch 00:  11%|█▏        | 2071/18148 [1:53:25<14:13:00,  3.18s/it]


 Epoch 1, step: 2070, loss: 2.3179304599761963, total loss: 5415.693346738815
clearing cache


Processing Epoch 00:  11%|█▏        | 2081/18148 [1:53:56<14:15:21,  3.19s/it]


 Epoch 1, step: 2080, loss: 2.2476863861083984, total loss: 5440.184092998505
clearing cache


Processing Epoch 00:  12%|█▏        | 2091/18148 [1:54:27<14:12:44,  3.19s/it]


 Epoch 1, step: 2090, loss: 2.2053980827331543, total loss: 5463.593091964722
clearing cache


Processing Epoch 00:  12%|█▏        | 2101/18148 [1:54:58<14:09:38,  3.18s/it]


 Epoch 1, step: 2100, loss: 2.2551980018615723, total loss: 5486.075640439987
clearing cache


Processing Epoch 00:  12%|█▏        | 2111/18148 [1:55:29<14:04:19,  3.16s/it]


 Epoch 1, step: 2110, loss: 2.0795111656188965, total loss: 5507.940527200699
clearing cache


Processing Epoch 00:  12%|█▏        | 2121/18148 [1:56:00<14:12:58,  3.19s/it]


 Epoch 1, step: 2120, loss: 2.912769079208374, total loss: 5533.197870135307
clearing cache


Processing Epoch 00:  12%|█▏        | 2131/18148 [1:56:32<14:12:09,  3.19s/it]


 Epoch 1, step: 2130, loss: 2.2263290882110596, total loss: 5557.732667326927
clearing cache


Processing Epoch 00:  12%|█▏        | 2141/18148 [1:57:03<14:10:53,  3.19s/it]


 Epoch 1, step: 2140, loss: 2.9177117347717285, total loss: 5580.74043238163
clearing cache


Processing Epoch 00:  12%|█▏        | 2151/18148 [1:57:34<14:09:08,  3.18s/it]


 Epoch 1, step: 2150, loss: 2.9962401390075684, total loss: 5603.749187350273
clearing cache


Processing Epoch 00:  12%|█▏        | 2161/18148 [1:58:05<14:12:59,  3.20s/it]


 Epoch 1, step: 2160, loss: 2.791686534881592, total loss: 5630.387109160423
clearing cache


Processing Epoch 00:  12%|█▏        | 2171/18148 [1:58:36<14:11:24,  3.20s/it]


 Epoch 1, step: 2170, loss: 2.960688352584839, total loss: 5654.194229245186
clearing cache


Processing Epoch 00:  12%|█▏        | 2181/18148 [1:59:07<14:06:17,  3.18s/it]


 Epoch 1, step: 2180, loss: 2.35196852684021, total loss: 5678.682317137718
clearing cache


Processing Epoch 00:  12%|█▏        | 2191/18148 [1:59:38<14:08:25,  3.19s/it]


 Epoch 1, step: 2190, loss: 2.2071847915649414, total loss: 5703.472185730934
clearing cache


Processing Epoch 00:  12%|█▏        | 2201/18148 [2:00:10<14:05:36,  3.18s/it]


 Epoch 1, step: 2200, loss: 2.312748670578003, total loss: 5727.800323843956
clearing cache


Processing Epoch 00:  12%|█▏        | 2211/18148 [2:00:41<14:02:10,  3.17s/it]


 Epoch 1, step: 2210, loss: 2.14361572265625, total loss: 5749.801707744598
clearing cache


Processing Epoch 00:  12%|█▏        | 2221/18148 [2:01:12<14:07:31,  3.19s/it]


 Epoch 1, step: 2220, loss: 2.067368268966675, total loss: 5773.055542230606
clearing cache


Processing Epoch 00:  12%|█▏        | 2231/18148 [2:01:43<14:07:49,  3.20s/it]


 Epoch 1, step: 2230, loss: 3.1902506351470947, total loss: 5796.225158691406
clearing cache


Processing Epoch 00:  12%|█▏        | 2241/18148 [2:02:14<14:09:29,  3.20s/it]


 Epoch 1, step: 2240, loss: 2.098066568374634, total loss: 5822.495128631592
clearing cache


Processing Epoch 00:  12%|█▏        | 2251/18148 [2:02:45<14:01:31,  3.18s/it]


 Epoch 1, step: 2250, loss: 2.1627986431121826, total loss: 5847.15871655941
clearing cache


Processing Epoch 00:  12%|█▏        | 2261/18148 [2:03:16<14:01:32,  3.18s/it]


 Epoch 1, step: 2260, loss: 2.2197463512420654, total loss: 5869.857320427895
clearing cache


Processing Epoch 00:  13%|█▎        | 2271/18148 [2:03:47<13:58:45,  3.17s/it]


 Epoch 1, step: 2270, loss: 2.1403539180755615, total loss: 5892.158474326134
clearing cache


Processing Epoch 00:  13%|█▎        | 2281/18148 [2:04:18<13:59:40,  3.18s/it]


 Epoch 1, step: 2280, loss: 2.266120433807373, total loss: 5915.178060173988
clearing cache


Processing Epoch 00:  13%|█▎        | 2291/18148 [2:04:49<13:59:26,  3.18s/it]


 Epoch 1, step: 2290, loss: 2.129340887069702, total loss: 5938.4294410943985
clearing cache


Processing Epoch 00:  13%|█▎        | 2301/18148 [2:05:20<13:55:08,  3.16s/it]


 Epoch 1, step: 2300, loss: 2.3250575065612793, total loss: 5961.1414514780045
clearing cache


Processing Epoch 00:  13%|█▎        | 2311/18148 [2:05:51<13:54:14,  3.16s/it]


 Epoch 1, step: 2310, loss: 2.110668897628784, total loss: 5982.118626475334
clearing cache


Processing Epoch 00:  13%|█▎        | 2321/18148 [2:06:22<14:02:15,  3.19s/it]


 Epoch 1, step: 2320, loss: 2.916659116744995, total loss: 6006.44054210186
clearing cache


Processing Epoch 00:  13%|█▎        | 2331/18148 [2:06:53<14:01:06,  3.19s/it]


 Epoch 1, step: 2330, loss: 2.0448482036590576, total loss: 6031.611545443535
clearing cache


Processing Epoch 00:  13%|█▎        | 2341/18148 [2:07:24<13:59:39,  3.19s/it]


 Epoch 1, step: 2340, loss: 2.2615063190460205, total loss: 6055.33099758625
clearing cache


Processing Epoch 00:  13%|█▎        | 2351/18148 [2:07:55<13:54:03,  3.17s/it]


 Epoch 1, step: 2350, loss: 2.1795310974121094, total loss: 6078.605082035065
clearing cache


Processing Epoch 00:  13%|█▎        | 2361/18148 [2:08:27<13:58:41,  3.19s/it]


 Epoch 1, step: 2360, loss: 2.9306139945983887, total loss: 6105.533776760101
clearing cache


Processing Epoch 00:  13%|█▎        | 2371/18148 [2:08:58<13:54:26,  3.17s/it]


 Epoch 1, step: 2370, loss: 2.160299777984619, total loss: 6128.119258403778
clearing cache


Processing Epoch 00:  13%|█▎        | 2381/18148 [2:09:29<13:57:17,  3.19s/it]


 Epoch 1, step: 2380, loss: 2.2114319801330566, total loss: 6153.56521821022
clearing cache


Processing Epoch 00:  13%|█▎        | 2391/18148 [2:10:00<13:53:53,  3.18s/it]


 Epoch 1, step: 2390, loss: 2.1165637969970703, total loss: 6177.157425165176
clearing cache


Processing Epoch 00:  13%|█▎        | 2401/18148 [2:10:31<13:53:02,  3.17s/it]


 Epoch 1, step: 2400, loss: 2.3732175827026367, total loss: 6201.306214094162
clearing cache


Processing Epoch 00:  13%|█▎        | 2411/18148 [2:11:02<13:55:03,  3.18s/it]


 Epoch 1, step: 2410, loss: 2.167903423309326, total loss: 6226.032674908638
clearing cache


Processing Epoch 00:  13%|█▎        | 2421/18148 [2:11:33<13:57:52,  3.20s/it]


 Epoch 1, step: 2420, loss: 2.951997756958008, total loss: 6250.484733939171
clearing cache


Processing Epoch 00:  13%|█▎        | 2431/18148 [2:12:04<13:53:45,  3.18s/it]


 Epoch 1, step: 2430, loss: 2.9230761528015137, total loss: 6275.337062120438
clearing cache


Processing Epoch 00:  13%|█▎        | 2441/18148 [2:12:35<13:52:41,  3.18s/it]


 Epoch 1, step: 2440, loss: 2.9500913619995117, total loss: 6299.643428325653
clearing cache


Processing Epoch 00:  14%|█▎        | 2451/18148 [2:13:06<14:01:47,  3.22s/it]


 Epoch 1, step: 2450, loss: 2.8925976753234863, total loss: 6325.722688436508
clearing cache


Processing Epoch 00:  14%|█▎        | 2461/18148 [2:13:37<13:50:39,  3.18s/it]


 Epoch 1, step: 2460, loss: 3.0205068588256836, total loss: 6349.935806751251
clearing cache


Processing Epoch 00:  14%|█▎        | 2471/18148 [2:14:08<13:48:12,  3.17s/it]


 Epoch 1, step: 2470, loss: 2.408806324005127, total loss: 6373.081442832947
clearing cache


Processing Epoch 00:  14%|█▎        | 2481/18148 [2:14:39<13:47:47,  3.17s/it]


 Epoch 1, step: 2480, loss: 2.001964569091797, total loss: 6395.303014755249
clearing cache


Processing Epoch 00:  14%|█▎        | 2491/18148 [2:15:10<13:51:42,  3.19s/it]


 Epoch 1, step: 2490, loss: 2.109576463699341, total loss: 6419.884034991264
clearing cache


Processing Epoch 00:  14%|█▍        | 2501/18148 [2:15:41<13:45:17,  3.16s/it]


 Epoch 1, step: 2500, loss: 2.216787338256836, total loss: 6443.032567858696
clearing cache


Processing Epoch 00:  14%|█▍        | 2511/18148 [2:16:12<13:51:35,  3.19s/it]


 Epoch 1, step: 2510, loss: 3.0805306434631348, total loss: 6468.3953384161
clearing cache


Processing Epoch 00:  14%|█▍        | 2521/18148 [2:16:44<13:47:04,  3.18s/it]


 Epoch 1, step: 2520, loss: 2.109159231185913, total loss: 6493.637535214424
clearing cache


Processing Epoch 00:  14%|█▍        | 2531/18148 [2:17:15<13:48:36,  3.18s/it]


 Epoch 1, step: 2530, loss: 2.942082643508911, total loss: 6518.656714081764
clearing cache


Processing Epoch 00:  14%|█▍        | 2541/18148 [2:17:46<13:47:06,  3.18s/it]


 Epoch 1, step: 2540, loss: 2.04984974861145, total loss: 6541.516660809517
clearing cache


Processing Epoch 00:  14%|█▍        | 2551/18148 [2:18:17<13:42:39,  3.16s/it]


 Epoch 1, step: 2550, loss: 2.1232497692108154, total loss: 6565.051186680794
clearing cache


Processing Epoch 00:  14%|█▍        | 2561/18148 [2:18:48<13:44:13,  3.17s/it]


 Epoch 1, step: 2560, loss: 2.1650617122650146, total loss: 6588.768006682396
clearing cache


Processing Epoch 00:  14%|█▍        | 2571/18148 [2:19:19<13:46:15,  3.18s/it]


 Epoch 1, step: 2570, loss: 2.1423521041870117, total loss: 6612.063502907753
clearing cache


Processing Epoch 00:  14%|█▍        | 2581/18148 [2:19:50<13:48:24,  3.19s/it]


 Epoch 1, step: 2580, loss: 3.0216126441955566, total loss: 6636.751382231712
clearing cache


Processing Epoch 00:  14%|█▍        | 2591/18148 [2:20:21<13:44:37,  3.18s/it]


 Epoch 1, step: 2590, loss: 1.860917329788208, total loss: 6659.582211852074
clearing cache


Processing Epoch 00:  14%|█▍        | 2601/18148 [2:20:52<13:42:59,  3.18s/it]


 Epoch 1, step: 2600, loss: 2.167065143585205, total loss: 6684.5403772592545
clearing cache


Processing Epoch 00:  14%|█▍        | 2611/18148 [2:21:23<13:49:25,  3.20s/it]


 Epoch 1, step: 2610, loss: 2.284198760986328, total loss: 6711.791999697685
clearing cache


Processing Epoch 00:  14%|█▍        | 2621/18148 [2:21:54<13:44:10,  3.18s/it]


 Epoch 1, step: 2620, loss: 2.1767823696136475, total loss: 6735.64607489109
clearing cache


Processing Epoch 00:  14%|█▍        | 2631/18148 [2:22:25<13:45:55,  3.19s/it]


 Epoch 1, step: 2630, loss: 2.8876891136169434, total loss: 6758.484373211861
clearing cache


Processing Epoch 00:  15%|█▍        | 2641/18148 [2:22:57<13:42:46,  3.18s/it]


 Epoch 1, step: 2640, loss: 2.237318992614746, total loss: 6783.496617913246
clearing cache


Processing Epoch 00:  15%|█▍        | 2651/18148 [2:23:28<13:41:11,  3.18s/it]


 Epoch 1, step: 2650, loss: 2.101752281188965, total loss: 6809.962649703026
clearing cache


Processing Epoch 00:  15%|█▍        | 2661/18148 [2:23:59<13:47:08,  3.20s/it]


 Epoch 1, step: 2660, loss: 2.9125380516052246, total loss: 6835.145353198051
clearing cache


Processing Epoch 00:  15%|█▍        | 2671/18148 [2:24:30<13:39:52,  3.18s/it]


 Epoch 1, step: 2670, loss: 2.113569736480713, total loss: 6858.892718911171
clearing cache


Processing Epoch 00:  15%|█▍        | 2681/18148 [2:25:01<13:44:44,  3.20s/it]


 Epoch 1, step: 2680, loss: 3.044158458709717, total loss: 6886.243747115135
clearing cache


Processing Epoch 00:  15%|█▍        | 2691/18148 [2:25:32<13:42:20,  3.19s/it]


 Epoch 1, step: 2690, loss: 2.8541526794433594, total loss: 6910.156602263451
clearing cache


Processing Epoch 00:  15%|█▍        | 2701/18148 [2:26:04<13:40:11,  3.19s/it]


 Epoch 1, step: 2700, loss: 2.1347713470458984, total loss: 6936.398186087608
clearing cache


Processing Epoch 00:  15%|█▍        | 2711/18148 [2:26:35<13:39:26,  3.18s/it]


 Epoch 1, step: 2710, loss: 2.9445512294769287, total loss: 6960.32359623909
clearing cache


Processing Epoch 00:  15%|█▍        | 2721/18148 [2:27:06<13:34:14,  3.17s/it]


 Epoch 1, step: 2720, loss: 2.096468448638916, total loss: 6981.498631238937
clearing cache


Processing Epoch 00:  15%|█▌        | 2731/18148 [2:27:37<13:37:31,  3.18s/it]


 Epoch 1, step: 2730, loss: 2.0499556064605713, total loss: 7003.764714598656
clearing cache


Processing Epoch 00:  15%|█▌        | 2741/18148 [2:28:08<13:36:31,  3.18s/it]


 Epoch 1, step: 2740, loss: 2.1665003299713135, total loss: 7027.592522263527
clearing cache


Processing Epoch 00:  15%|█▌        | 2751/18148 [2:28:39<13:36:28,  3.18s/it]


 Epoch 1, step: 2750, loss: 1.921891689300537, total loss: 7052.285072207451
clearing cache


Processing Epoch 00:  15%|█▌        | 2761/18148 [2:29:10<13:34:20,  3.18s/it]


 Epoch 1, step: 2760, loss: 1.9770042896270752, total loss: 7073.582801222801
clearing cache


Processing Epoch 00:  15%|█▌        | 2771/18148 [2:29:41<13:32:41,  3.17s/it]


 Epoch 1, step: 2770, loss: 2.2290797233581543, total loss: 7095.401710867882
clearing cache


Processing Epoch 00:  15%|█▌        | 2781/18148 [2:30:12<13:34:33,  3.18s/it]


 Epoch 1, step: 2780, loss: 2.2434420585632324, total loss: 7118.739817500114
clearing cache


Processing Epoch 00:  15%|█▌        | 2791/18148 [2:30:43<13:27:56,  3.16s/it]


 Epoch 1, step: 2790, loss: 2.1257355213165283, total loss: 7141.787471175194
clearing cache


Processing Epoch 00:  15%|█▌        | 2801/18148 [2:31:14<13:29:02,  3.16s/it]


 Epoch 1, step: 2800, loss: 2.190603256225586, total loss: 7167.604924559593
clearing cache


Processing Epoch 00:  15%|█▌        | 2811/18148 [2:31:45<13:31:59,  3.18s/it]


 Epoch 1, step: 2810, loss: 2.2844438552856445, total loss: 7190.769567131996
clearing cache


Processing Epoch 00:  16%|█▌        | 2821/18148 [2:32:16<13:32:22,  3.18s/it]


 Epoch 1, step: 2820, loss: 2.9135794639587402, total loss: 7214.601039290428
clearing cache


Processing Epoch 00:  16%|█▌        | 2831/18148 [2:32:47<13:30:17,  3.17s/it]


 Epoch 1, step: 2830, loss: 2.052767276763916, total loss: 7237.982896447182
clearing cache


Processing Epoch 00:  16%|█▌        | 2841/18148 [2:33:18<13:28:22,  3.17s/it]


 Epoch 1, step: 2840, loss: 2.1538503170013428, total loss: 7261.0852526426315
clearing cache


Processing Epoch 00:  16%|█▌        | 2851/18148 [2:33:49<13:29:15,  3.17s/it]


 Epoch 1, step: 2850, loss: 2.275739908218384, total loss: 7284.622987627983
clearing cache


Processing Epoch 00:  16%|█▌        | 2861/18148 [2:34:20<13:32:11,  3.19s/it]


 Epoch 1, step: 2860, loss: 2.7022194862365723, total loss: 7308.1375967264175
clearing cache


Processing Epoch 00:  16%|█▌        | 2871/18148 [2:34:51<13:26:15,  3.17s/it]


 Epoch 1, step: 2870, loss: 2.0719175338745117, total loss: 7330.691055655479
clearing cache


Processing Epoch 00:  16%|█▌        | 2881/18148 [2:35:22<13:24:11,  3.16s/it]


 Epoch 1, step: 2880, loss: 2.0685338973999023, total loss: 7353.226287007332
clearing cache


Processing Epoch 00:  16%|█▌        | 2891/18148 [2:35:53<13:29:15,  3.18s/it]


 Epoch 1, step: 2890, loss: 2.117615222930908, total loss: 7377.638089060783
clearing cache


Processing Epoch 00:  16%|█▌        | 2901/18148 [2:36:24<13:24:46,  3.17s/it]


 Epoch 1, step: 2900, loss: 2.1173923015594482, total loss: 7401.445810437202
clearing cache


Processing Epoch 00:  16%|█▌        | 2911/18148 [2:36:55<13:31:18,  3.19s/it]


 Epoch 1, step: 2910, loss: 3.027463912963867, total loss: 7424.995919704437
clearing cache


Processing Epoch 00:  16%|█▌        | 2921/18148 [2:37:26<13:27:02,  3.18s/it]


 Epoch 1, step: 2920, loss: 2.0543947219848633, total loss: 7447.8984134197235
clearing cache


Processing Epoch 00:  16%|█▌        | 2931/18148 [2:37:57<13:21:06,  3.16s/it]


 Epoch 1, step: 2930, loss: 1.7932581901550293, total loss: 7469.908723473549
clearing cache


Processing Epoch 00:  16%|█▌        | 2941/18148 [2:38:28<13:26:47,  3.18s/it]


 Epoch 1, step: 2940, loss: 2.142340660095215, total loss: 7493.319051623344
clearing cache


Processing Epoch 00:  16%|█▋        | 2951/18148 [2:38:59<13:30:18,  3.20s/it]


 Epoch 1, step: 2950, loss: 2.84090518951416, total loss: 7515.937533020973
clearing cache


Processing Epoch 00:  16%|█▋        | 2961/18148 [2:39:30<13:31:39,  3.21s/it]


 Epoch 1, step: 2960, loss: 3.033261299133301, total loss: 7540.7434676885605
clearing cache


Processing Epoch 00:  16%|█▋        | 2971/18148 [2:40:01<13:23:19,  3.18s/it]


 Epoch 1, step: 2970, loss: 2.0395569801330566, total loss: 7562.514470219612
clearing cache


Processing Epoch 00:  16%|█▋        | 2981/18148 [2:40:32<13:21:50,  3.17s/it]


 Epoch 1, step: 2980, loss: 2.2158749103546143, total loss: 7583.718574523926
clearing cache


Processing Epoch 00:  16%|█▋        | 2991/18148 [2:41:03<13:25:25,  3.19s/it]


 Epoch 1, step: 2990, loss: 2.0587034225463867, total loss: 7609.255531549454
clearing cache


Processing Epoch 00:  17%|█▋        | 3001/18148 [2:41:34<13:17:00,  3.16s/it]


 Epoch 1, step: 3000, loss: 2.1285150051116943, total loss: 7632.36959528923
clearing cache


Processing Epoch 00:  17%|█▋        | 3011/18148 [2:42:05<13:22:11,  3.18s/it]


 Epoch 1, step: 3010, loss: 2.126399278640747, total loss: 7656.144671916962
clearing cache


Processing Epoch 00:  17%|█▋        | 3021/18148 [2:42:36<13:15:57,  3.16s/it]


 Epoch 1, step: 3020, loss: 2.055088996887207, total loss: 7677.7528285980225
clearing cache


Processing Epoch 00:  17%|█▋        | 3031/18148 [2:43:07<13:23:30,  3.19s/it]


 Epoch 1, step: 3030, loss: 2.974630832672119, total loss: 7703.95476770401
clearing cache


Processing Epoch 00:  17%|█▋        | 3041/18148 [2:43:38<13:19:13,  3.17s/it]


 Epoch 1, step: 3040, loss: 2.1748251914978027, total loss: 7729.887585878372
clearing cache


Processing Epoch 00:  17%|█▋        | 3051/18148 [2:44:09<13:20:43,  3.18s/it]


 Epoch 1, step: 3050, loss: 2.1351025104522705, total loss: 7754.141906499863
clearing cache


Processing Epoch 00:  17%|█▋        | 3061/18148 [2:44:40<13:18:54,  3.18s/it]


 Epoch 1, step: 3060, loss: 2.0812816619873047, total loss: 7775.979096889496
clearing cache


Processing Epoch 00:  17%|█▋        | 3071/18148 [2:45:11<13:14:11,  3.16s/it]


 Epoch 1, step: 3070, loss: 2.051290512084961, total loss: 7799.3621389865875
clearing cache


Processing Epoch 00:  17%|█▋        | 3081/18148 [2:45:42<13:15:27,  3.17s/it]


 Epoch 1, step: 3080, loss: 2.0656521320343018, total loss: 7821.190121889114
clearing cache


Processing Epoch 00:  17%|█▋        | 3091/18148 [2:46:13<13:19:27,  3.19s/it]


 Epoch 1, step: 3090, loss: 2.0896859169006348, total loss: 7845.055996179581
clearing cache


Processing Epoch 00:  17%|█▋        | 3101/18148 [2:46:44<13:15:58,  3.17s/it]


 Epoch 1, step: 3100, loss: 2.0710086822509766, total loss: 7866.941558599472
clearing cache


Processing Epoch 00:  17%|█▋        | 3111/18148 [2:47:15<13:14:15,  3.17s/it]


 Epoch 1, step: 3110, loss: 2.0641136169433594, total loss: 7888.576006889343
clearing cache


Processing Epoch 00:  17%|█▋        | 3121/18148 [2:47:47<13:21:02,  3.20s/it]


 Epoch 1, step: 3120, loss: 2.240715742111206, total loss: 7913.9886820316315
clearing cache


Processing Epoch 00:  17%|█▋        | 3131/18148 [2:48:18<13:17:15,  3.19s/it]


 Epoch 1, step: 3130, loss: 2.951364040374756, total loss: 7939.1646275520325
clearing cache


Processing Epoch 00:  17%|█▋        | 3141/18148 [2:48:49<13:17:57,  3.19s/it]


 Epoch 1, step: 3140, loss: 3.122270107269287, total loss: 7962.539563179016
clearing cache


Processing Epoch 00:  17%|█▋        | 3151/18148 [2:49:20<13:15:46,  3.18s/it]


 Epoch 1, step: 3150, loss: 2.0171492099761963, total loss: 7987.375585079193
clearing cache


Processing Epoch 00:  17%|█▋        | 3161/18148 [2:49:51<13:11:53,  3.17s/it]


 Epoch 1, step: 3160, loss: 1.8781702518463135, total loss: 8009.898321866989
clearing cache


Processing Epoch 00:  17%|█▋        | 3171/18148 [2:50:22<13:09:07,  3.16s/it]


 Epoch 1, step: 3170, loss: 1.995812177658081, total loss: 8031.7142634391785
clearing cache


Processing Epoch 00:  18%|█▊        | 3181/18148 [2:50:53<13:18:55,  3.20s/it]


 Epoch 1, step: 3180, loss: 2.943415880203247, total loss: 8057.12886095047
clearing cache


Processing Epoch 00:  18%|█▊        | 3191/18148 [2:51:24<13:12:16,  3.18s/it]


 Epoch 1, step: 3190, loss: 2.0397064685821533, total loss: 8080.3713991642
clearing cache


Processing Epoch 00:  18%|█▊        | 3201/18148 [2:51:55<13:11:39,  3.18s/it]


 Epoch 1, step: 3200, loss: 2.281642436981201, total loss: 8102.908427357674
clearing cache


Processing Epoch 00:  18%|█▊        | 3211/18148 [2:52:26<13:13:50,  3.19s/it]


 Epoch 1, step: 3210, loss: 2.1203994750976562, total loss: 8125.964933753014
clearing cache


Processing Epoch 00:  18%|█▊        | 3221/18148 [2:52:57<13:08:22,  3.17s/it]


 Epoch 1, step: 3220, loss: 2.0943994522094727, total loss: 8149.917361617088
clearing cache


Processing Epoch 00:  18%|█▊        | 3231/18148 [2:53:28<13:08:56,  3.17s/it]


 Epoch 1, step: 3230, loss: 1.9688835144042969, total loss: 8172.426679730415
clearing cache


Processing Epoch 00:  18%|█▊        | 3241/18148 [2:53:59<13:07:57,  3.17s/it]


 Epoch 1, step: 3240, loss: 2.2174324989318848, total loss: 8195.744662642479
clearing cache


Processing Epoch 00:  18%|█▊        | 3251/18148 [2:54:30<13:07:21,  3.17s/it]


 Epoch 1, step: 3250, loss: 2.2518796920776367, total loss: 8218.772626399994
clearing cache


Processing Epoch 00:  18%|█▊        | 3261/18148 [2:55:01<13:06:21,  3.17s/it]


 Epoch 1, step: 3260, loss: 2.1920201778411865, total loss: 8242.958030939102
clearing cache


Processing Epoch 00:  18%|█▊        | 3271/18148 [2:55:32<13:06:37,  3.17s/it]


 Epoch 1, step: 3270, loss: 2.1570446491241455, total loss: 8267.062078237534
clearing cache


Processing Epoch 00:  18%|█▊        | 3281/18148 [2:56:03<13:05:38,  3.17s/it]


 Epoch 1, step: 3280, loss: 2.0170750617980957, total loss: 8290.557230472565
clearing cache


Processing Epoch 00:  18%|█▊        | 3291/18148 [2:56:34<13:04:47,  3.17s/it]


 Epoch 1, step: 3290, loss: 2.104787826538086, total loss: 8313.511913776398
clearing cache


Processing Epoch 00:  18%|█▊        | 3301/18148 [2:57:05<13:12:25,  3.20s/it]


 Epoch 1, step: 3300, loss: 2.774934768676758, total loss: 8338.928112983704
clearing cache


Processing Epoch 00:  18%|█▊        | 3310/18148 [2:57:33<12:38:01,  3.07s/it]


 Epoch 1, step: 3310, loss: 2.7849478721618652, total loss: 8361.486842632294
clearing cache


Processing Epoch 00:  18%|█▊        | 3321/18148 [2:58:07<13:03:04,  3.17s/it]


 Epoch 1, step: 3320, loss: 2.1362829208374023, total loss: 8384.26700770855
clearing cache


Processing Epoch 00:  18%|█▊        | 3331/18148 [2:58:38<13:07:59,  3.19s/it]


 Epoch 1, step: 3330, loss: 3.1532254219055176, total loss: 8410.187395215034
clearing cache


Processing Epoch 00:  18%|█▊        | 3341/18148 [2:59:10<13:08:32,  3.20s/it]


 Epoch 1, step: 3340, loss: 2.76161527633667, total loss: 8435.690353989601
clearing cache


Processing Epoch 00:  18%|█▊        | 3351/18148 [2:59:40<13:02:03,  3.17s/it]


 Epoch 1, step: 3350, loss: 2.901216506958008, total loss: 8457.898679852486
clearing cache


Processing Epoch 00:  19%|█▊        | 3361/18148 [3:00:11<13:03:58,  3.18s/it]


 Epoch 1, step: 3360, loss: 2.1094913482666016, total loss: 8481.104650974274
clearing cache


Processing Epoch 00:  19%|█▊        | 3371/18148 [3:00:43<13:06:27,  3.19s/it]


 Epoch 1, step: 3370, loss: 2.8731937408447266, total loss: 8505.693081736565
clearing cache


Processing Epoch 00:  19%|█▊        | 3381/18148 [3:01:14<12:58:00,  3.16s/it]


 Epoch 1, step: 3380, loss: 2.1996047496795654, total loss: 8528.095483899117
clearing cache


Processing Epoch 00:  19%|█▊        | 3391/18148 [3:01:45<13:01:58,  3.18s/it]


 Epoch 1, step: 3390, loss: 1.94549560546875, total loss: 8550.724484324455
clearing cache


Processing Epoch 00:  19%|█▊        | 3401/18148 [3:02:16<13:01:25,  3.18s/it]


 Epoch 1, step: 3400, loss: 2.11086368560791, total loss: 8575.327597260475
clearing cache


Processing Epoch 00:  19%|█▉        | 3411/18148 [3:02:47<13:00:32,  3.18s/it]


 Epoch 1, step: 3410, loss: 2.240609884262085, total loss: 8598.64554989338
clearing cache


Processing Epoch 00:  19%|█▉        | 3421/18148 [3:03:18<12:58:06,  3.17s/it]


 Epoch 1, step: 3420, loss: 2.2208211421966553, total loss: 8622.15404343605
clearing cache


Processing Epoch 00:  19%|█▉        | 3431/18148 [3:03:49<13:00:20,  3.18s/it]


 Epoch 1, step: 3430, loss: 2.4643726348876953, total loss: 8645.743205785751
clearing cache


Processing Epoch 00:  19%|█▉        | 3441/18148 [3:04:20<12:57:37,  3.17s/it]


 Epoch 1, step: 3440, loss: 2.363095283508301, total loss: 8668.08148086071
clearing cache


Processing Epoch 00:  19%|█▉        | 3451/18148 [3:04:51<12:59:58,  3.18s/it]


 Epoch 1, step: 3450, loss: 2.0665855407714844, total loss: 8693.41257584095
clearing cache


Processing Epoch 00:  19%|█▉        | 3461/18148 [3:05:22<12:56:21,  3.17s/it]


 Epoch 1, step: 3460, loss: 1.9619978666305542, total loss: 8716.190562486649
clearing cache


Processing Epoch 00:  19%|█▉        | 3471/18148 [3:05:53<13:00:27,  3.19s/it]


 Epoch 1, step: 3470, loss: 2.2819128036499023, total loss: 8739.869084835052
clearing cache


Processing Epoch 00:  19%|█▉        | 3481/18148 [3:06:24<12:56:24,  3.18s/it]


 Epoch 1, step: 3480, loss: 2.101097345352173, total loss: 8764.645018815994
clearing cache


Processing Epoch 00:  19%|█▉        | 3491/18148 [3:06:55<12:54:51,  3.17s/it]


 Epoch 1, step: 3490, loss: 2.160222291946411, total loss: 8788.714076519012
clearing cache


Processing Epoch 00:  19%|█▉        | 3501/18148 [3:07:26<12:57:20,  3.18s/it]


 Epoch 1, step: 3500, loss: 2.2428998947143555, total loss: 8815.389965057373
clearing cache


Processing Epoch 00:  19%|█▉        | 3511/18148 [3:07:57<12:57:05,  3.19s/it]


 Epoch 1, step: 3510, loss: 2.2253928184509277, total loss: 8837.73317861557
clearing cache


Processing Epoch 00:  19%|█▉        | 3521/18148 [3:08:28<12:55:10,  3.18s/it]


 Epoch 1, step: 3520, loss: 2.853766441345215, total loss: 8859.831163525581
clearing cache


Processing Epoch 00:  19%|█▉        | 3531/18148 [3:09:00<12:58:23,  3.20s/it]


 Epoch 1, step: 3530, loss: 2.8691787719726562, total loss: 8884.935792088509
clearing cache


Processing Epoch 00:  20%|█▉        | 3541/18148 [3:09:30<12:52:53,  3.17s/it]


 Epoch 1, step: 3540, loss: 2.8544249534606934, total loss: 8906.905388712883
clearing cache


Processing Epoch 00:  20%|█▉        | 3551/18148 [3:10:01<12:53:25,  3.18s/it]


 Epoch 1, step: 3550, loss: 2.1439242362976074, total loss: 8932.039820551872
clearing cache


Processing Epoch 00:  20%|█▉        | 3561/18148 [3:10:32<12:49:41,  3.17s/it]


 Epoch 1, step: 3560, loss: 2.0675113201141357, total loss: 8955.70186316967
clearing cache


Processing Epoch 00:  20%|█▉        | 3571/18148 [3:11:03<12:50:18,  3.17s/it]


 Epoch 1, step: 3570, loss: 2.1497859954833984, total loss: 8980.097673773766
clearing cache


Processing Epoch 00:  20%|█▉        | 3581/18148 [3:11:34<12:49:15,  3.17s/it]


 Epoch 1, step: 3580, loss: 2.2352077960968018, total loss: 9003.904665350914
clearing cache


Processing Epoch 00:  20%|█▉        | 3591/18148 [3:12:05<12:49:08,  3.17s/it]


 Epoch 1, step: 3590, loss: 2.0493059158325195, total loss: 9026.70129597187
clearing cache


Processing Epoch 00:  20%|█▉        | 3601/18148 [3:12:36<12:47:01,  3.16s/it]


 Epoch 1, step: 3600, loss: 2.2180962562561035, total loss: 9049.110745310783
clearing cache


Processing Epoch 00:  20%|█▉        | 3611/18148 [3:13:08<12:49:18,  3.18s/it]


 Epoch 1, step: 3610, loss: 2.012216806411743, total loss: 9073.809333443642
clearing cache


Processing Epoch 00:  20%|█▉        | 3621/18148 [3:13:39<12:51:14,  3.19s/it]


 Epoch 1, step: 3620, loss: 2.9162516593933105, total loss: 9098.677550673485
clearing cache


Processing Epoch 00:  20%|██        | 3631/18148 [3:14:10<12:46:27,  3.17s/it]


 Epoch 1, step: 3630, loss: 2.302748680114746, total loss: 9121.87943804264
clearing cache


Processing Epoch 00:  20%|██        | 3641/18148 [3:14:40<12:42:18,  3.15s/it]


 Epoch 1, step: 3640, loss: 2.3962604999542236, total loss: 9143.802313685417
clearing cache


Processing Epoch 00:  20%|██        | 3651/18148 [3:15:11<12:46:47,  3.17s/it]


 Epoch 1, step: 3650, loss: 1.9096689224243164, total loss: 9167.182851672173
clearing cache


Processing Epoch 00:  20%|██        | 3661/18148 [3:15:42<12:42:33,  3.16s/it]


 Epoch 1, step: 3660, loss: 2.1863255500793457, total loss: 9190.196242451668
clearing cache


Processing Epoch 00:  20%|██        | 3671/18148 [3:16:13<12:48:47,  3.19s/it]


 Epoch 1, step: 3670, loss: 2.913367748260498, total loss: 9215.01161468029
clearing cache


Processing Epoch 00:  20%|██        | 3681/18148 [3:16:44<12:47:38,  3.18s/it]


 Epoch 1, step: 3680, loss: 2.852764844894409, total loss: 9239.206351876259
clearing cache


Processing Epoch 00:  20%|██        | 3691/18148 [3:17:15<12:46:19,  3.18s/it]


 Epoch 1, step: 3690, loss: 2.1513752937316895, total loss: 9263.180301427841
clearing cache


Processing Epoch 00:  20%|██        | 3701/18148 [3:17:46<12:43:32,  3.17s/it]


 Epoch 1, step: 3700, loss: 2.2151989936828613, total loss: 9285.854192972183
clearing cache


Processing Epoch 00:  20%|██        | 3711/18148 [3:18:17<12:38:04,  3.15s/it]


 Epoch 1, step: 3710, loss: 2.225620746612549, total loss: 9308.84981226921
clearing cache


Processing Epoch 00:  21%|██        | 3721/18148 [3:18:48<12:38:16,  3.15s/it]


 Epoch 1, step: 3720, loss: 2.190244197845459, total loss: 9330.991251468658
clearing cache


Processing Epoch 00:  21%|██        | 3731/18148 [3:19:19<12:36:03,  3.15s/it]


 Epoch 1, step: 3730, loss: 2.0909810066223145, total loss: 9353.990238428116
clearing cache


Processing Epoch 00:  21%|██        | 3741/18148 [3:19:50<12:40:07,  3.17s/it]


 Epoch 1, step: 3740, loss: 2.2606797218322754, total loss: 9378.961527466774
clearing cache


Processing Epoch 00:  21%|██        | 3751/18148 [3:20:21<12:39:17,  3.16s/it]


 Epoch 1, step: 3750, loss: 2.0917863845825195, total loss: 9403.378086924553
clearing cache


Processing Epoch 00:  21%|██        | 3761/18148 [3:20:52<12:45:09,  3.19s/it]


 Epoch 1, step: 3760, loss: 2.805154800415039, total loss: 9428.95298063755
clearing cache


Processing Epoch 00:  21%|██        | 3771/18148 [3:21:23<12:41:13,  3.18s/it]


 Epoch 1, step: 3770, loss: 2.3297481536865234, total loss: 9452.922866225243
clearing cache


Processing Epoch 00:  21%|██        | 3781/18148 [3:21:54<12:40:07,  3.17s/it]


 Epoch 1, step: 3780, loss: 2.253081798553467, total loss: 9475.830273747444
clearing cache


Processing Epoch 00:  21%|██        | 3791/18148 [3:22:25<12:37:57,  3.17s/it]


 Epoch 1, step: 3790, loss: 1.9228363037109375, total loss: 9499.718635439873
clearing cache


Processing Epoch 00:  21%|██        | 3801/18148 [3:22:56<12:40:13,  3.18s/it]


 Epoch 1, step: 3800, loss: 2.10725474357605, total loss: 9523.357907652855
clearing cache


Processing Epoch 00:  21%|██        | 3811/18148 [3:23:27<12:40:49,  3.18s/it]


 Epoch 1, step: 3810, loss: 2.1231682300567627, total loss: 9547.10920202732
clearing cache


Processing Epoch 00:  21%|██        | 3821/18148 [3:23:58<12:38:14,  3.18s/it]


 Epoch 1, step: 3820, loss: 2.7761025428771973, total loss: 9568.800212025642
clearing cache


Processing Epoch 00:  21%|██        | 3831/18148 [3:24:29<12:40:08,  3.19s/it]


 Epoch 1, step: 3830, loss: 2.8644936084747314, total loss: 9591.960683941841
clearing cache


Processing Epoch 00:  21%|██        | 3841/18148 [3:25:00<12:38:35,  3.18s/it]


 Epoch 1, step: 3840, loss: 2.8035197257995605, total loss: 9616.17849266529
clearing cache


Processing Epoch 00:  21%|██        | 3851/18148 [3:25:31<12:36:16,  3.17s/it]


 Epoch 1, step: 3850, loss: 1.9618258476257324, total loss: 9638.637517809868
clearing cache


Processing Epoch 00:  21%|██▏       | 3861/18148 [3:26:02<12:38:31,  3.19s/it]


 Epoch 1, step: 3860, loss: 2.8224868774414062, total loss: 9664.442019104958
clearing cache


Processing Epoch 00:  21%|██▏       | 3871/18148 [3:26:34<12:36:37,  3.18s/it]


 Epoch 1, step: 3870, loss: 2.197617292404175, total loss: 9688.052371621132
clearing cache


Processing Epoch 00:  21%|██▏       | 3881/18148 [3:27:05<12:37:31,  3.19s/it]


 Epoch 1, step: 3880, loss: 3.0675742626190186, total loss: 9712.910994887352
clearing cache


Processing Epoch 00:  21%|██▏       | 3891/18148 [3:27:36<12:34:00,  3.17s/it]


 Epoch 1, step: 3890, loss: 2.109757423400879, total loss: 9736.658254265785
clearing cache


Processing Epoch 00:  21%|██▏       | 3901/18148 [3:28:07<12:32:24,  3.17s/it]


 Epoch 1, step: 3900, loss: 2.1373748779296875, total loss: 9759.61735355854
clearing cache


Processing Epoch 00:  22%|██▏       | 3911/18148 [3:28:38<12:34:04,  3.18s/it]


 Epoch 1, step: 3910, loss: 2.763524055480957, total loss: 9783.283857226372
clearing cache


Processing Epoch 00:  22%|██▏       | 3921/18148 [3:29:09<12:34:29,  3.18s/it]


 Epoch 1, step: 3920, loss: 3.020155668258667, total loss: 9807.282406449318
clearing cache


Processing Epoch 00:  22%|██▏       | 3931/18148 [3:29:40<12:36:10,  3.19s/it]


 Epoch 1, step: 3930, loss: 2.02901554107666, total loss: 9833.742437005043
clearing cache


Processing Epoch 00:  22%|██▏       | 3941/18148 [3:30:11<12:29:44,  3.17s/it]


 Epoch 1, step: 3940, loss: 2.146026849746704, total loss: 9856.62083017826
clearing cache


Processing Epoch 00:  22%|██▏       | 3951/18148 [3:30:42<12:31:35,  3.18s/it]


 Epoch 1, step: 3950, loss: 1.9444185495376587, total loss: 9879.491755723953
clearing cache


Processing Epoch 00:  22%|██▏       | 3961/18148 [3:31:13<12:32:07,  3.18s/it]


 Epoch 1, step: 3960, loss: 2.1951494216918945, total loss: 9901.525117635727
clearing cache


Processing Epoch 00:  22%|██▏       | 3971/18148 [3:31:44<12:28:45,  3.17s/it]


 Epoch 1, step: 3970, loss: 2.2482941150665283, total loss: 9923.810616731644
clearing cache


Processing Epoch 00:  22%|██▏       | 3981/18148 [3:32:15<12:28:18,  3.17s/it]


 Epoch 1, step: 3980, loss: 2.359093189239502, total loss: 9945.393232345581
clearing cache


Processing Epoch 00:  22%|██▏       | 3991/18148 [3:32:46<12:29:25,  3.18s/it]


 Epoch 1, step: 3990, loss: 3.0289061069488525, total loss: 9968.75583076477
clearing cache


Processing Epoch 00:  22%|██▏       | 4001/18148 [3:33:17<12:31:13,  3.19s/it]


 Epoch 1, step: 4000, loss: 2.1466031074523926, total loss: 9991.394696831703
clearing cache


Processing Epoch 00:  22%|██▏       | 4011/18148 [3:33:48<12:25:18,  3.16s/it]


 Epoch 1, step: 4010, loss: 2.052166223526001, total loss: 10014.832731366158
clearing cache


Processing Epoch 00:  22%|██▏       | 4021/18148 [3:34:19<12:27:02,  3.17s/it]


 Epoch 1, step: 4020, loss: 2.068647861480713, total loss: 10037.218051671982
clearing cache


Processing Epoch 00:  22%|██▏       | 4031/18148 [3:34:50<12:28:52,  3.18s/it]


 Epoch 1, step: 4030, loss: 2.1162328720092773, total loss: 10061.294251441956
clearing cache


Processing Epoch 00:  22%|██▏       | 4041/18148 [3:35:21<12:22:30,  3.16s/it]


 Epoch 1, step: 4040, loss: 2.135795831680298, total loss: 10082.954442024231
clearing cache


Processing Epoch 00:  22%|██▏       | 4051/18148 [3:35:52<12:23:47,  3.17s/it]


 Epoch 1, step: 4050, loss: 2.2190260887145996, total loss: 10106.89521741867
clearing cache


Processing Epoch 00:  22%|██▏       | 4061/18148 [3:36:22<12:25:30,  3.18s/it]


 Epoch 1, step: 4060, loss: 2.0493040084838867, total loss: 10130.483738422394
clearing cache


Processing Epoch 00:  22%|██▏       | 4071/18148 [3:36:54<12:26:34,  3.18s/it]


 Epoch 1, step: 4070, loss: 2.8543357849121094, total loss: 10154.633427619934
clearing cache


Processing Epoch 00:  22%|██▏       | 4081/18148 [3:37:25<12:30:21,  3.20s/it]


 Epoch 1, step: 4080, loss: 2.8116254806518555, total loss: 10180.385516166687
clearing cache


Processing Epoch 00:  23%|██▎       | 4091/18148 [3:37:56<12:22:56,  3.17s/it]


 Epoch 1, step: 4090, loss: 2.386065721511841, total loss: 10203.077710986137
clearing cache


Processing Epoch 00:  23%|██▎       | 4101/18148 [3:38:27<12:18:56,  3.16s/it]


 Epoch 1, step: 4100, loss: 2.0936481952667236, total loss: 10223.889874100685
clearing cache


Processing Epoch 00:  23%|██▎       | 4111/18148 [3:38:58<12:22:44,  3.17s/it]


 Epoch 1, step: 4110, loss: 2.180419921875, total loss: 10246.659559249878
clearing cache


Processing Epoch 00:  23%|██▎       | 4121/18148 [3:39:29<12:24:20,  3.18s/it]


 Epoch 1, step: 4120, loss: 2.119549512863159, total loss: 10271.64907002449
clearing cache


Processing Epoch 00:  23%|██▎       | 4131/18148 [3:40:00<12:25:33,  3.19s/it]


 Epoch 1, step: 4130, loss: 2.8642992973327637, total loss: 10296.186013102531
clearing cache


Processing Epoch 00:  23%|██▎       | 4141/18148 [3:40:31<12:22:21,  3.18s/it]


 Epoch 1, step: 4140, loss: 2.9291114807128906, total loss: 10319.651325583458
clearing cache


Processing Epoch 00:  23%|██▎       | 4151/18148 [3:41:02<12:20:24,  3.17s/it]


 Epoch 1, step: 4150, loss: 2.1722023487091064, total loss: 10341.06446647644
clearing cache


Processing Epoch 00:  23%|██▎       | 4161/18148 [3:41:33<12:20:28,  3.18s/it]


 Epoch 1, step: 4160, loss: 2.096176862716675, total loss: 10365.732845067978
clearing cache


Processing Epoch 00:  23%|██▎       | 4171/18148 [3:42:04<12:19:50,  3.18s/it]


 Epoch 1, step: 4170, loss: 2.223513603210449, total loss: 10390.225532770157
clearing cache


Processing Epoch 00:  23%|██▎       | 4181/18148 [3:42:35<12:19:03,  3.17s/it]


 Epoch 1, step: 4180, loss: 2.047347068786621, total loss: 10414.164972782135
clearing cache


Processing Epoch 00:  23%|██▎       | 4191/18148 [3:43:06<12:20:28,  3.18s/it]


 Epoch 1, step: 4190, loss: 2.171067237854004, total loss: 10437.397118091583
clearing cache


Processing Epoch 00:  23%|██▎       | 4201/18148 [3:43:37<12:16:13,  3.17s/it]


 Epoch 1, step: 4200, loss: 2.105938196182251, total loss: 10460.244687318802
clearing cache


Processing Epoch 00:  23%|██▎       | 4211/18148 [3:44:08<12:14:59,  3.16s/it]


 Epoch 1, step: 4210, loss: 2.0970211029052734, total loss: 10484.201248049736
clearing cache


Processing Epoch 00:  23%|██▎       | 4221/18148 [3:44:39<12:16:51,  3.17s/it]


 Epoch 1, step: 4220, loss: 2.7258143424987793, total loss: 10507.419001579285
clearing cache


Processing Epoch 00:  23%|██▎       | 4231/18148 [3:45:10<12:11:27,  3.15s/it]


 Epoch 1, step: 4230, loss: 2.200084924697876, total loss: 10528.220250487328
clearing cache


Processing Epoch 00:  23%|██▎       | 4241/18148 [3:45:41<12:18:47,  3.19s/it]


 Epoch 1, step: 4240, loss: 2.7800068855285645, total loss: 10551.253536581993
clearing cache


Processing Epoch 00:  23%|██▎       | 4251/18148 [3:46:12<12:16:49,  3.18s/it]


 Epoch 1, step: 4250, loss: 2.080474376678467, total loss: 10573.003531575203
clearing cache


Processing Epoch 00:  23%|██▎       | 4261/18148 [3:46:43<12:12:34,  3.17s/it]


 Epoch 1, step: 4260, loss: 2.1136136054992676, total loss: 10596.426686406136
clearing cache


Processing Epoch 00:  24%|██▎       | 4271/18148 [3:47:14<12:15:46,  3.18s/it]


 Epoch 1, step: 4270, loss: 2.754847764968872, total loss: 10618.884611010551
clearing cache


Processing Epoch 00:  24%|██▎       | 4281/18148 [3:47:45<12:12:26,  3.17s/it]


 Epoch 1, step: 4280, loss: 2.0560500621795654, total loss: 10640.197113633156
clearing cache


Processing Epoch 00:  24%|██▎       | 4291/18148 [3:48:16<12:13:25,  3.18s/it]


 Epoch 1, step: 4290, loss: 1.8809292316436768, total loss: 10662.72623205185
clearing cache


Processing Epoch 00:  24%|██▎       | 4301/18148 [3:48:47<12:11:57,  3.17s/it]


 Epoch 1, step: 4300, loss: 2.032327651977539, total loss: 10687.459300041199
clearing cache


Processing Epoch 00:  24%|██▍       | 4311/18148 [3:49:16<10:32:29,  2.74s/it]


 Epoch 1, step: 4310, loss: 2.0030245780944824, total loss: 10710.355196714401
clearing cache


Processing Epoch 00:  24%|██▍       | 4321/18148 [3:49:47<12:08:55,  3.16s/it]


 Epoch 1, step: 4320, loss: 2.222043514251709, total loss: 10731.89874637127
clearing cache


Processing Epoch 00:  24%|██▍       | 4331/18148 [3:50:18<12:14:22,  3.19s/it]


 Epoch 1, step: 4330, loss: 2.8627939224243164, total loss: 10757.853375315666
clearing cache


Processing Epoch 00:  24%|██▍       | 4341/18148 [3:50:49<12:08:17,  3.16s/it]


 Epoch 1, step: 4340, loss: 2.1875431537628174, total loss: 10779.547222614288
clearing cache


Processing Epoch 00:  24%|██▍       | 4351/18148 [3:51:20<12:08:06,  3.17s/it]


 Epoch 1, step: 4350, loss: 2.0661749839782715, total loss: 10804.096056461334
clearing cache


Processing Epoch 00:  24%|██▍       | 4361/18148 [3:51:51<12:09:33,  3.17s/it]


 Epoch 1, step: 4360, loss: 2.8316965103149414, total loss: 10826.608366847038
clearing cache


Processing Epoch 00:  24%|██▍       | 4371/18148 [3:52:22<12:10:17,  3.18s/it]


 Epoch 1, step: 4370, loss: 2.1055455207824707, total loss: 10849.257607340813
clearing cache


Processing Epoch 00:  24%|██▍       | 4381/18148 [3:52:53<12:11:06,  3.19s/it]


 Epoch 1, step: 4380, loss: 2.128804922103882, total loss: 10872.072656273842
clearing cache


Processing Epoch 00:  24%|██▍       | 4391/18148 [3:53:24<12:08:37,  3.18s/it]


 Epoch 1, step: 4390, loss: 2.3371481895446777, total loss: 10895.520926117897
clearing cache


Processing Epoch 00:  24%|██▍       | 4401/18148 [3:53:55<12:06:40,  3.17s/it]


 Epoch 1, step: 4400, loss: 2.3252813816070557, total loss: 10918.3073939085
clearing cache


Processing Epoch 00:  24%|██▍       | 4411/18148 [3:54:26<12:06:26,  3.17s/it]


 Epoch 1, step: 4410, loss: 2.246908664703369, total loss: 10942.286483883858
clearing cache


Processing Epoch 00:  24%|██▍       | 4421/18148 [3:54:57<12:04:34,  3.17s/it]


 Epoch 1, step: 4420, loss: 2.1127877235412598, total loss: 10964.930805563927
clearing cache


Processing Epoch 00:  24%|██▍       | 4431/18148 [3:55:28<12:06:00,  3.18s/it]


 Epoch 1, step: 4430, loss: 2.125096082687378, total loss: 10987.725710749626
clearing cache


Processing Epoch 00:  24%|██▍       | 4441/18148 [3:55:59<12:03:19,  3.17s/it]


 Epoch 1, step: 4440, loss: 2.0726218223571777, total loss: 11009.6452678442
clearing cache


Processing Epoch 00:  25%|██▍       | 4451/18148 [3:56:30<12:03:42,  3.17s/it]


 Epoch 1, step: 4450, loss: 2.0753543376922607, total loss: 11033.869823098183
clearing cache


Processing Epoch 00:  25%|██▍       | 4461/18148 [3:57:01<12:07:41,  3.19s/it]


 Epoch 1, step: 4460, loss: 2.9791736602783203, total loss: 11058.10122191906
clearing cache


Processing Epoch 00:  25%|██▍       | 4471/18148 [3:57:32<12:06:08,  3.19s/it]


 Epoch 1, step: 4470, loss: 2.0969319343566895, total loss: 11083.48353588581
clearing cache


Processing Epoch 00:  25%|██▍       | 4481/18148 [3:58:03<12:02:23,  3.17s/it]


 Epoch 1, step: 4480, loss: 2.012540817260742, total loss: 11107.150795817375
clearing cache


Processing Epoch 00:  25%|██▍       | 4491/18148 [3:58:34<12:03:15,  3.18s/it]


 Epoch 1, step: 4490, loss: 1.9542267322540283, total loss: 11130.481109023094
clearing cache


Processing Epoch 00:  25%|██▍       | 4501/18148 [3:59:05<12:05:48,  3.19s/it]


 Epoch 1, step: 4500, loss: 3.012810230255127, total loss: 11154.783375620842
clearing cache


Processing Epoch 00:  25%|██▍       | 4511/18148 [3:59:36<12:01:02,  3.17s/it]


 Epoch 1, step: 4510, loss: 2.087125778198242, total loss: 11179.975804924965
clearing cache


Processing Epoch 00:  25%|██▍       | 4521/18148 [4:00:07<12:00:53,  3.17s/it]


 Epoch 1, step: 4520, loss: 2.089664936065674, total loss: 11203.111221194267
clearing cache


Processing Epoch 00:  25%|██▍       | 4531/18148 [4:00:38<11:58:49,  3.17s/it]


 Epoch 1, step: 4530, loss: 2.1219637393951416, total loss: 11227.122444033623
clearing cache


Processing Epoch 00:  25%|██▌       | 4541/18148 [4:01:09<12:00:31,  3.18s/it]


 Epoch 1, step: 4540, loss: 2.885430097579956, total loss: 11250.52175295353
clearing cache


Processing Epoch 00:  25%|██▌       | 4551/18148 [4:01:40<11:56:55,  3.16s/it]


 Epoch 1, step: 4550, loss: 2.832071542739868, total loss: 11274.746550917625
clearing cache


Processing Epoch 00:  25%|██▌       | 4561/18148 [4:02:11<11:56:02,  3.16s/it]


 Epoch 1, step: 4560, loss: 2.1965885162353516, total loss: 11297.403828024864
clearing cache


Processing Epoch 00:  25%|██▌       | 4571/18148 [4:02:42<11:57:28,  3.17s/it]


 Epoch 1, step: 4570, loss: 2.100797653198242, total loss: 11320.123269319534
clearing cache


Processing Epoch 00:  25%|██▌       | 4581/18148 [4:03:13<12:03:55,  3.20s/it]


 Epoch 1, step: 4580, loss: 2.2549071311950684, total loss: 11343.65001797676
clearing cache


Processing Epoch 00:  25%|██▌       | 4591/18148 [4:03:44<11:56:38,  3.17s/it]


 Epoch 1, step: 4590, loss: 2.764467239379883, total loss: 11368.70935678482
clearing cache


Processing Epoch 00:  25%|██▌       | 4601/18148 [4:04:15<11:54:48,  3.17s/it]


 Epoch 1, step: 4600, loss: 2.302454948425293, total loss: 11393.851108074188
clearing cache


Processing Epoch 00:  25%|██▌       | 4611/18148 [4:04:46<11:54:46,  3.17s/it]


 Epoch 1, step: 4610, loss: 1.9434458017349243, total loss: 11415.665462374687
clearing cache


Processing Epoch 00:  25%|██▌       | 4621/18148 [4:05:17<11:51:56,  3.16s/it]


 Epoch 1, step: 4620, loss: 2.0695557594299316, total loss: 11437.271601676941
clearing cache


Processing Epoch 00:  26%|██▌       | 4631/18148 [4:05:48<11:54:26,  3.17s/it]


 Epoch 1, step: 4630, loss: 2.8405866622924805, total loss: 11458.96215903759
clearing cache


Processing Epoch 00:  26%|██▌       | 4641/18148 [4:06:19<11:55:55,  3.18s/it]


 Epoch 1, step: 4640, loss: 2.1460185050964355, total loss: 11480.690917015076
clearing cache


Processing Epoch 00:  26%|██▌       | 4651/18148 [4:06:50<11:55:01,  3.18s/it]


 Epoch 1, step: 4650, loss: 2.101700782775879, total loss: 11503.098038077354
clearing cache


Processing Epoch 00:  26%|██▌       | 4661/18148 [4:07:21<11:53:03,  3.17s/it]


 Epoch 1, step: 4660, loss: 2.2211496829986572, total loss: 11524.937128067017
clearing cache


Processing Epoch 00:  26%|██▌       | 4671/18148 [4:07:52<11:53:18,  3.18s/it]


 Epoch 1, step: 4670, loss: 2.0445590019226074, total loss: 11547.264237642288
clearing cache


Processing Epoch 00:  26%|██▌       | 4681/18148 [4:08:23<11:52:20,  3.17s/it]


 Epoch 1, step: 4680, loss: 2.2883102893829346, total loss: 11570.904179692268
clearing cache


Processing Epoch 00:  26%|██▌       | 4691/18148 [4:08:54<11:52:24,  3.18s/it]


 Epoch 1, step: 4690, loss: 2.4725775718688965, total loss: 11594.590212106705
clearing cache


Processing Epoch 00:  26%|██▌       | 4701/18148 [4:09:25<11:56:43,  3.20s/it]


 Epoch 1, step: 4700, loss: 2.807058811187744, total loss: 11618.520893931389
clearing cache


Processing Epoch 00:  26%|██▌       | 4711/18148 [4:09:56<11:54:39,  3.19s/it]


 Epoch 1, step: 4710, loss: 2.832533359527588, total loss: 11641.51396548748
clearing cache


Processing Epoch 00:  26%|██▌       | 4721/18148 [4:10:27<11:50:40,  3.18s/it]


 Epoch 1, step: 4720, loss: 2.0987701416015625, total loss: 11665.198078870773
clearing cache


Processing Epoch 00:  26%|██▌       | 4731/18148 [4:10:58<11:53:39,  3.19s/it]


 Epoch 1, step: 4730, loss: 2.91426420211792, total loss: 11689.271317481995
clearing cache


Processing Epoch 00:  26%|██▌       | 4741/18148 [4:11:29<11:46:27,  3.16s/it]


 Epoch 1, step: 4740, loss: 1.9265069961547852, total loss: 11712.421792507172
clearing cache


Processing Epoch 00:  26%|██▌       | 4751/18148 [4:12:00<11:45:50,  3.16s/it]


 Epoch 1, step: 4750, loss: 2.177617073059082, total loss: 11735.19348192215
clearing cache


Processing Epoch 00:  26%|██▌       | 4761/18148 [4:12:31<11:45:11,  3.16s/it]


 Epoch 1, step: 4760, loss: 2.1401238441467285, total loss: 11759.099378585815
clearing cache


Processing Epoch 00:  26%|██▋       | 4771/18148 [4:13:02<11:48:29,  3.18s/it]


 Epoch 1, step: 4770, loss: 1.9198859930038452, total loss: 11783.10049211979
clearing cache


Processing Epoch 00:  26%|██▋       | 4781/18148 [4:13:33<11:48:04,  3.18s/it]


 Epoch 1, step: 4780, loss: 2.20387601852417, total loss: 11806.563270688057
clearing cache


Processing Epoch 00:  26%|██▋       | 4791/18148 [4:14:04<11:45:52,  3.17s/it]


 Epoch 1, step: 4790, loss: 2.207491874694824, total loss: 11830.820971369743
clearing cache


Processing Epoch 00:  26%|██▋       | 4801/18148 [4:14:35<11:44:47,  3.17s/it]


 Epoch 1, step: 4800, loss: 2.1949667930603027, total loss: 11853.431935191154
clearing cache


Processing Epoch 00:  27%|██▋       | 4811/18148 [4:15:06<11:44:39,  3.17s/it]


 Epoch 1, step: 4810, loss: 2.900029182434082, total loss: 11876.284649252892
clearing cache


Processing Epoch 00:  27%|██▋       | 4821/18148 [4:15:37<11:50:01,  3.20s/it]


 Epoch 1, step: 4820, loss: 2.909731388092041, total loss: 11900.174028277397
clearing cache


Processing Epoch 00:  27%|██▋       | 4831/18148 [4:16:08<11:44:00,  3.17s/it]


 Epoch 1, step: 4830, loss: 2.803645372390747, total loss: 11924.243766903877
clearing cache


Processing Epoch 00:  27%|██▋       | 4841/18148 [4:16:39<11:43:58,  3.17s/it]


 Epoch 1, step: 4840, loss: 1.942713975906372, total loss: 11946.327182769775
clearing cache


Processing Epoch 00:  27%|██▋       | 4851/18148 [4:17:10<11:43:41,  3.18s/it]


 Epoch 1, step: 4850, loss: 2.0689449310302734, total loss: 11968.72828245163
clearing cache


Processing Epoch 00:  27%|██▋       | 4861/18148 [4:17:41<11:43:00,  3.17s/it]


 Epoch 1, step: 4860, loss: 1.9511690139770508, total loss: 11991.648346662521
clearing cache


Processing Epoch 00:  27%|██▋       | 4871/18148 [4:18:12<11:44:28,  3.18s/it]


 Epoch 1, step: 4870, loss: 2.6967544555664062, total loss: 12013.298844099045
clearing cache


Processing Epoch 00:  27%|██▋       | 4881/18148 [4:18:43<11:41:04,  3.17s/it]


 Epoch 1, step: 4880, loss: 2.1838202476501465, total loss: 12035.681450605392
clearing cache


Processing Epoch 00:  27%|██▋       | 4891/18148 [4:19:14<11:45:05,  3.19s/it]


 Epoch 1, step: 4890, loss: 2.845612049102783, total loss: 12060.796166300774
clearing cache


Processing Epoch 00:  27%|██▋       | 4901/18148 [4:19:45<11:43:07,  3.18s/it]


 Epoch 1, step: 4900, loss: 2.1214418411254883, total loss: 12083.696676492691
clearing cache


Processing Epoch 00:  27%|██▋       | 4911/18148 [4:20:16<11:42:17,  3.18s/it]


 Epoch 1, step: 4910, loss: 2.169783115386963, total loss: 12105.826645612717
clearing cache


Processing Epoch 00:  27%|██▋       | 4921/18148 [4:20:47<11:39:14,  3.17s/it]


 Epoch 1, step: 4920, loss: 2.004507064819336, total loss: 12128.47452545166
clearing cache


Processing Epoch 00:  27%|██▋       | 4931/18148 [4:21:18<11:39:51,  3.18s/it]


 Epoch 1, step: 4930, loss: 2.0362844467163086, total loss: 12151.31904578209
clearing cache


Processing Epoch 00:  27%|██▋       | 4941/18148 [4:21:49<11:41:31,  3.19s/it]


 Epoch 1, step: 4940, loss: 2.2374985218048096, total loss: 12175.108674049377
clearing cache


Processing Epoch 00:  27%|██▋       | 4951/18148 [4:22:20<11:37:01,  3.17s/it]


 Epoch 1, step: 4950, loss: 2.234053611755371, total loss: 12196.762247800827
clearing cache


Processing Epoch 00:  27%|██▋       | 4961/18148 [4:22:51<11:38:20,  3.18s/it]


 Epoch 1, step: 4960, loss: 2.0984907150268555, total loss: 12220.455785036087
clearing cache


Processing Epoch 00:  27%|██▋       | 4971/18148 [4:23:22<11:35:50,  3.17s/it]


 Epoch 1, step: 4970, loss: 1.9430288076400757, total loss: 12242.316976428032
clearing cache


Processing Epoch 00:  27%|██▋       | 4981/18148 [4:23:53<11:37:27,  3.18s/it]


 Epoch 1, step: 4980, loss: 2.3000667095184326, total loss: 12264.05539906025
clearing cache


Processing Epoch 00:  28%|██▊       | 4991/18148 [4:24:24<11:40:04,  3.19s/it]


 Epoch 1, step: 4990, loss: 2.9167401790618896, total loss: 12288.494047641754
clearing cache


Processing Epoch 00:  28%|██▊       | 5001/18148 [4:24:55<11:33:43,  3.17s/it]


 Epoch 1, step: 5000, loss: 1.9498820304870605, total loss: 12310.85716843605
clearing cache


Processing Epoch 00:  28%|██▊       | 5010/18148 [4:25:23<11:12:24,  3.07s/it]


 Epoch 1, step: 5010, loss: 2.1756327152252197, total loss: 12333.237962722778
clearing cache


Processing Epoch 00:  28%|██▊       | 5021/18148 [4:25:57<11:38:23,  3.19s/it]


 Epoch 1, step: 5020, loss: 2.900850772857666, total loss: 12358.333420753479
clearing cache


Processing Epoch 00:  28%|██▊       | 5031/18148 [4:26:28<11:33:43,  3.17s/it]


 Epoch 1, step: 5030, loss: 2.0276100635528564, total loss: 12381.697632193565
clearing cache


Processing Epoch 00:  28%|██▊       | 5041/18148 [4:26:59<11:34:32,  3.18s/it]


 Epoch 1, step: 5040, loss: 2.8760645389556885, total loss: 12405.957450389862
clearing cache


Processing Epoch 00:  28%|██▊       | 5051/18148 [4:27:31<11:34:35,  3.18s/it]


 Epoch 1, step: 5050, loss: 2.9129953384399414, total loss: 12431.457525253296
clearing cache


Processing Epoch 00:  28%|██▊       | 5061/18148 [4:28:02<11:33:06,  3.18s/it]


 Epoch 1, step: 5060, loss: 2.2645351886749268, total loss: 12453.972776889801
clearing cache


Processing Epoch 00:  28%|██▊       | 5071/18148 [4:28:33<11:35:03,  3.19s/it]


 Epoch 1, step: 5070, loss: 2.7738003730773926, total loss: 12477.44138622284
clearing cache


Processing Epoch 00:  28%|██▊       | 5081/18148 [4:29:04<11:31:33,  3.18s/it]


 Epoch 1, step: 5080, loss: 2.224273681640625, total loss: 12501.738310098648
clearing cache


Processing Epoch 00:  28%|██▊       | 5091/18148 [4:29:35<11:32:51,  3.18s/it]


 Epoch 1, step: 5090, loss: 2.347285270690918, total loss: 12523.789837241173
clearing cache


Processing Epoch 00:  28%|██▊       | 5101/18148 [4:30:06<11:33:18,  3.19s/it]


 Epoch 1, step: 5100, loss: 2.7489655017852783, total loss: 12547.10172533989
clearing cache


Processing Epoch 00:  28%|██▊       | 5111/18148 [4:30:37<11:33:03,  3.19s/it]


 Epoch 1, step: 5110, loss: 2.8713083267211914, total loss: 12570.225091457367
clearing cache


Processing Epoch 00:  28%|██▊       | 5121/18148 [4:31:08<11:30:18,  3.18s/it]


 Epoch 1, step: 5120, loss: 2.768444061279297, total loss: 12593.218196630478
clearing cache


Processing Epoch 00:  28%|██▊       | 5131/18148 [4:31:39<11:27:09,  3.17s/it]


 Epoch 1, step: 5130, loss: 2.8318755626678467, total loss: 12617.614600658417
clearing cache


Processing Epoch 00:  28%|██▊       | 5141/18148 [4:32:10<11:26:06,  3.16s/it]


 Epoch 1, step: 5140, loss: 2.0752055644989014, total loss: 12640.533559799194
clearing cache


Processing Epoch 00:  28%|██▊       | 5151/18148 [4:32:41<11:29:52,  3.18s/it]


 Epoch 1, step: 5150, loss: 2.7175137996673584, total loss: 12664.762805700302
clearing cache


Processing Epoch 00:  28%|██▊       | 5161/18148 [4:33:12<11:26:32,  3.17s/it]


 Epoch 1, step: 5160, loss: 2.255955457687378, total loss: 12687.623035311699
clearing cache


Processing Epoch 00:  28%|██▊       | 5171/18148 [4:33:43<11:23:52,  3.16s/it]


 Epoch 1, step: 5170, loss: 2.124704122543335, total loss: 12710.548691034317
clearing cache


Processing Epoch 00:  29%|██▊       | 5181/18148 [4:34:14<11:22:34,  3.16s/it]


 Epoch 1, step: 5180, loss: 2.0935885906219482, total loss: 12733.031172037125
clearing cache


Processing Epoch 00:  29%|██▊       | 5191/18148 [4:34:45<11:23:35,  3.17s/it]


 Epoch 1, step: 5190, loss: 2.3117737770080566, total loss: 12757.188987374306
clearing cache


Processing Epoch 00:  29%|██▊       | 5201/18148 [4:35:16<11:24:21,  3.17s/it]


 Epoch 1, step: 5200, loss: 2.2682342529296875, total loss: 12780.3439296484
clearing cache


Processing Epoch 00:  29%|██▊       | 5211/18148 [4:35:47<11:22:10,  3.16s/it]


 Epoch 1, step: 5210, loss: 2.2506206035614014, total loss: 12804.211996436119
clearing cache


Processing Epoch 00:  29%|██▉       | 5221/18148 [4:36:18<11:22:28,  3.17s/it]


 Epoch 1, step: 5220, loss: 2.177896499633789, total loss: 12827.398247599602
clearing cache


Processing Epoch 00:  29%|██▉       | 5231/18148 [4:36:49<11:22:30,  3.17s/it]


 Epoch 1, step: 5230, loss: 2.0788869857788086, total loss: 12849.21936750412
clearing cache


Processing Epoch 00:  29%|██▉       | 5241/18148 [4:37:20<11:20:26,  3.16s/it]


 Epoch 1, step: 5240, loss: 2.0370640754699707, total loss: 12873.728464603424
clearing cache


Processing Epoch 00:  29%|██▉       | 5251/18148 [4:37:51<11:21:44,  3.17s/it]


 Epoch 1, step: 5250, loss: 2.0764381885528564, total loss: 12896.865309238434
clearing cache


Processing Epoch 00:  29%|██▉       | 5261/18148 [4:38:22<11:22:59,  3.18s/it]


 Epoch 1, step: 5260, loss: 1.977909803390503, total loss: 12921.20085310936
clearing cache


Processing Epoch 00:  29%|██▉       | 5271/18148 [4:38:53<11:22:59,  3.18s/it]


 Epoch 1, step: 5270, loss: 2.8602490425109863, total loss: 12945.446418046951
clearing cache


Processing Epoch 00:  29%|██▉       | 5281/18148 [4:39:24<11:24:52,  3.19s/it]


 Epoch 1, step: 5280, loss: 2.897972345352173, total loss: 12969.28874528408
clearing cache


Processing Epoch 00:  29%|██▉       | 5291/18148 [4:39:55<11:18:16,  3.17s/it]


 Epoch 1, step: 5290, loss: 2.079969882965088, total loss: 12992.312967658043
clearing cache


Processing Epoch 00:  29%|██▉       | 5301/18148 [4:40:26<11:21:24,  3.18s/it]


 Epoch 1, step: 5300, loss: 2.790210485458374, total loss: 13015.922937750816
clearing cache


Processing Epoch 00:  29%|██▉       | 5311/18148 [4:40:57<11:21:07,  3.18s/it]


 Epoch 1, step: 5310, loss: 2.881502628326416, total loss: 13038.658357143402
clearing cache


Processing Epoch 00:  29%|██▉       | 5321/18148 [4:41:28<11:17:20,  3.17s/it]


 Epoch 1, step: 5320, loss: 2.9677939414978027, total loss: 13061.34363090992
clearing cache


Processing Epoch 00:  29%|██▉       | 5331/18148 [4:41:59<11:18:46,  3.18s/it]


 Epoch 1, step: 5330, loss: 2.784994602203369, total loss: 13083.872443556786
clearing cache


Processing Epoch 00:  29%|██▉       | 5341/18148 [4:42:30<11:17:12,  3.17s/it]


 Epoch 1, step: 5340, loss: 2.1413793563842773, total loss: 13104.806046962738
clearing cache


Processing Epoch 00:  29%|██▉       | 5351/18148 [4:43:01<11:17:07,  3.17s/it]


 Epoch 1, step: 5350, loss: 2.9499130249023438, total loss: 13127.480189561844
clearing cache


Processing Epoch 00:  30%|██▉       | 5361/18148 [4:43:31<11:13:36,  3.16s/it]


 Epoch 1, step: 5360, loss: 2.072887897491455, total loss: 13150.306212544441
clearing cache


Processing Epoch 00:  30%|██▉       | 5371/18148 [4:44:03<11:18:45,  3.19s/it]


 Epoch 1, step: 5370, loss: 2.0900657176971436, total loss: 13174.760088801384
clearing cache


Processing Epoch 00:  30%|██▉       | 5381/18148 [4:44:34<11:15:53,  3.18s/it]


 Epoch 1, step: 5380, loss: 2.2592110633850098, total loss: 13198.179348111153
clearing cache


Processing Epoch 00:  30%|██▉       | 5391/18148 [4:45:05<11:18:03,  3.19s/it]


 Epoch 1, step: 5390, loss: 2.8468945026397705, total loss: 13222.082764029503
clearing cache


Processing Epoch 00:  30%|██▉       | 5401/18148 [4:45:36<11:15:24,  3.18s/it]


 Epoch 1, step: 5400, loss: 1.9962472915649414, total loss: 13245.784373164177
clearing cache


Processing Epoch 00:  30%|██▉       | 5411/18148 [4:46:07<11:12:15,  3.17s/it]


 Epoch 1, step: 5410, loss: 2.1580183506011963, total loss: 13268.434291481972
clearing cache


Processing Epoch 00:  30%|██▉       | 5421/18148 [4:46:38<11:14:05,  3.18s/it]


 Epoch 1, step: 5420, loss: 2.6883230209350586, total loss: 13291.999098181725
clearing cache


Processing Epoch 00:  30%|██▉       | 5431/18148 [4:47:09<11:08:54,  3.16s/it]


 Epoch 1, step: 5430, loss: 2.157181978225708, total loss: 13313.372434973717
clearing cache


Processing Epoch 00:  30%|██▉       | 5441/18148 [4:47:40<11:14:09,  3.18s/it]


 Epoch 1, step: 5440, loss: 2.815277099609375, total loss: 13337.009397387505
clearing cache


Processing Epoch 00:  30%|███       | 5451/18148 [4:48:11<11:13:25,  3.18s/it]


 Epoch 1, step: 5450, loss: 2.1997897624969482, total loss: 13361.065343260765
clearing cache


Processing Epoch 00:  30%|███       | 5461/18148 [4:48:42<11:15:11,  3.19s/it]


 Epoch 1, step: 5460, loss: 2.7693710327148438, total loss: 13387.171592116356
clearing cache


Processing Epoch 00:  30%|███       | 5471/18148 [4:49:13<11:09:15,  3.17s/it]


 Epoch 1, step: 5470, loss: 2.2599666118621826, total loss: 13410.208887219429
clearing cache


Processing Epoch 00:  30%|███       | 5481/18148 [4:49:44<11:06:58,  3.16s/it]


 Epoch 1, step: 5480, loss: 1.978033423423767, total loss: 13431.53658747673
clearing cache


Processing Epoch 00:  30%|███       | 5491/18148 [4:50:15<11:13:09,  3.19s/it]


 Epoch 1, step: 5490, loss: 2.4051103591918945, total loss: 13455.027871847153
clearing cache


Processing Epoch 00:  30%|███       | 5501/18148 [4:50:46<11:09:46,  3.18s/it]


 Epoch 1, step: 5500, loss: 2.346529006958008, total loss: 13477.816943883896
clearing cache


Processing Epoch 00:  30%|███       | 5511/18148 [4:51:17<11:10:26,  3.18s/it]


 Epoch 1, step: 5510, loss: 2.042235851287842, total loss: 13502.366186857224
clearing cache


Processing Epoch 00:  30%|███       | 5521/18148 [4:51:48<11:07:41,  3.17s/it]


 Epoch 1, step: 5520, loss: 1.9932680130004883, total loss: 13527.410695552826
clearing cache


Processing Epoch 00:  30%|███       | 5531/18148 [4:52:19<11:08:01,  3.18s/it]


 Epoch 1, step: 5530, loss: 2.15043044090271, total loss: 13550.59964132309
clearing cache


Processing Epoch 00:  31%|███       | 5541/18148 [4:52:50<11:08:43,  3.18s/it]


 Epoch 1, step: 5540, loss: 3.0044991970062256, total loss: 13574.080692529678
clearing cache


Processing Epoch 00:  31%|███       | 5551/18148 [4:53:21<11:04:13,  3.16s/it]


 Epoch 1, step: 5550, loss: 2.0164713859558105, total loss: 13595.654349088669
clearing cache


Processing Epoch 00:  31%|███       | 5561/18148 [4:53:52<11:07:10,  3.18s/it]


 Epoch 1, step: 5560, loss: 1.9455971717834473, total loss: 13617.261201500893
clearing cache


Processing Epoch 00:  31%|███       | 5571/18148 [4:54:23<11:07:26,  3.18s/it]


 Epoch 1, step: 5570, loss: 2.847506523132324, total loss: 13640.394426584244
clearing cache


Processing Epoch 00:  31%|███       | 5581/18148 [4:54:54<11:05:11,  3.18s/it]


 Epoch 1, step: 5580, loss: 1.9740915298461914, total loss: 13665.469101667404
clearing cache


Processing Epoch 00:  31%|███       | 5591/18148 [4:55:25<11:03:48,  3.17s/it]


 Epoch 1, step: 5590, loss: 2.0729787349700928, total loss: 13689.051223754883
clearing cache


Processing Epoch 00:  31%|███       | 5601/18148 [4:55:56<11:06:11,  3.19s/it]


 Epoch 1, step: 5600, loss: 2.0697574615478516, total loss: 13713.157142043114
clearing cache


Processing Epoch 00:  31%|███       | 5611/18148 [4:56:27<11:04:34,  3.18s/it]


 Epoch 1, step: 5610, loss: 2.1897430419921875, total loss: 13735.650166630745
clearing cache


Processing Epoch 00:  31%|███       | 5621/18148 [4:56:58<11:01:41,  3.17s/it]


 Epoch 1, step: 5620, loss: 2.306708574295044, total loss: 13759.284343838692
clearing cache


Processing Epoch 00:  31%|███       | 5631/18148 [4:57:29<10:57:04,  3.15s/it]


 Epoch 1, step: 5630, loss: 2.0692005157470703, total loss: 13781.309012174606
clearing cache


Processing Epoch 00:  31%|███       | 5641/18148 [4:58:00<11:01:55,  3.18s/it]


 Epoch 1, step: 5640, loss: 2.941119432449341, total loss: 13807.07468366623
clearing cache


Processing Epoch 00:  31%|███       | 5651/18148 [4:58:31<11:01:17,  3.17s/it]


 Epoch 1, step: 5650, loss: 2.936046838760376, total loss: 13829.825929641724
clearing cache


Processing Epoch 00:  31%|███       | 5661/18148 [4:59:02<11:03:31,  3.19s/it]


 Epoch 1, step: 5660, loss: 2.7577009201049805, total loss: 13853.794583320618
clearing cache


Processing Epoch 00:  31%|███       | 5671/18148 [4:59:33<10:59:29,  3.17s/it]


 Epoch 1, step: 5670, loss: 2.024087429046631, total loss: 13876.579540252686
clearing cache


Processing Epoch 00:  31%|███▏      | 5681/18148 [5:00:04<11:00:18,  3.18s/it]


 Epoch 1, step: 5680, loss: 2.157144546508789, total loss: 13900.525353431702
clearing cache


Processing Epoch 00:  31%|███▏      | 5691/18148 [5:00:35<11:01:46,  3.19s/it]


 Epoch 1, step: 5690, loss: 2.8068275451660156, total loss: 13923.109025239944
clearing cache


Processing Epoch 00:  31%|███▏      | 5701/18148 [5:01:07<10:58:55,  3.18s/it]


 Epoch 1, step: 5700, loss: 2.842599391937256, total loss: 13947.0651948452
clearing cache


Processing Epoch 00:  31%|███▏      | 5711/18148 [5:01:37<10:56:44,  3.17s/it]


 Epoch 1, step: 5710, loss: 2.090686798095703, total loss: 13969.458271026611
clearing cache


Processing Epoch 00:  32%|███▏      | 5721/18148 [5:02:09<10:56:58,  3.17s/it]


 Epoch 1, step: 5720, loss: 1.9681817293167114, total loss: 13992.57331764698
clearing cache


Processing Epoch 00:  32%|███▏      | 5731/18148 [5:02:40<11:00:19,  3.19s/it]


 Epoch 1, step: 5730, loss: 2.8581345081329346, total loss: 14016.38949906826
clearing cache


Processing Epoch 00:  32%|███▏      | 5741/18148 [5:03:11<10:58:54,  3.19s/it]


 Epoch 1, step: 5740, loss: 2.875025510787964, total loss: 14038.946006417274
clearing cache


Processing Epoch 00:  32%|███▏      | 5751/18148 [5:03:42<10:59:25,  3.19s/it]


 Epoch 1, step: 5750, loss: 2.0974538326263428, total loss: 14064.76867210865
clearing cache


Processing Epoch 00:  32%|███▏      | 5761/18148 [5:04:13<10:55:10,  3.17s/it]


 Epoch 1, step: 5760, loss: 2.0939953327178955, total loss: 14086.396271348
clearing cache


Processing Epoch 00:  32%|███▏      | 5771/18148 [5:04:44<10:55:33,  3.18s/it]


 Epoch 1, step: 5770, loss: 2.8703818321228027, total loss: 14109.50639808178
clearing cache


Processing Epoch 00:  32%|███▏      | 5781/18148 [5:05:15<10:56:44,  3.19s/it]


 Epoch 1, step: 5780, loss: 2.9862608909606934, total loss: 14133.973374843597
clearing cache


Processing Epoch 00:  32%|███▏      | 5791/18148 [5:05:46<10:52:07,  3.17s/it]


 Epoch 1, step: 5790, loss: 2.043525457382202, total loss: 14156.256520271301
clearing cache


Processing Epoch 00:  32%|███▏      | 5801/18148 [5:06:17<10:54:41,  3.18s/it]


 Epoch 1, step: 5800, loss: 2.959498882293701, total loss: 14179.318575501442
clearing cache


Processing Epoch 00:  32%|███▏      | 5811/18148 [5:06:48<10:53:14,  3.18s/it]


 Epoch 1, step: 5810, loss: 2.733642101287842, total loss: 14202.397790551186
clearing cache


Processing Epoch 00:  32%|███▏      | 5821/18148 [5:07:19<10:52:04,  3.17s/it]


 Epoch 1, step: 5820, loss: 2.7012596130371094, total loss: 14226.101576924324
clearing cache


Processing Epoch 00:  32%|███▏      | 5831/18148 [5:07:50<10:50:04,  3.17s/it]


 Epoch 1, step: 5830, loss: 2.2559092044830322, total loss: 14250.332121253014
clearing cache


Processing Epoch 00:  32%|███▏      | 5841/18148 [5:08:21<10:48:38,  3.16s/it]


 Epoch 1, step: 5840, loss: 2.19203782081604, total loss: 14274.306181550026
clearing cache


Processing Epoch 00:  32%|███▏      | 5851/18148 [5:08:52<10:49:33,  3.17s/it]


 Epoch 1, step: 5850, loss: 2.158201217651367, total loss: 14297.778730273247
clearing cache


Processing Epoch 00:  32%|███▏      | 5861/18148 [5:09:23<10:49:06,  3.17s/it]


 Epoch 1, step: 5860, loss: 2.078763961791992, total loss: 14322.053155064583
clearing cache


Processing Epoch 00:  32%|███▏      | 5871/18148 [5:09:54<10:52:27,  3.19s/it]


 Epoch 1, step: 5870, loss: 2.7150192260742188, total loss: 14346.396060109138
clearing cache


Processing Epoch 00:  32%|███▏      | 5881/18148 [5:10:25<10:47:45,  3.17s/it]


 Epoch 1, step: 5880, loss: 2.2773423194885254, total loss: 14368.811327457428
clearing cache


Processing Epoch 00:  32%|███▏      | 5891/18148 [5:10:56<10:54:47,  3.21s/it]


 Epoch 1, step: 5890, loss: 2.839933395385742, total loss: 14392.340191602707
clearing cache


Processing Epoch 00:  33%|███▎      | 5901/18148 [5:11:27<10:47:36,  3.17s/it]


 Epoch 1, step: 5900, loss: 2.099214792251587, total loss: 14415.100484609604
clearing cache


Processing Epoch 00:  33%|███▎      | 5911/18148 [5:11:58<10:47:12,  3.17s/it]


 Epoch 1, step: 5910, loss: 2.1094555854797363, total loss: 14437.07485628128
clearing cache


Processing Epoch 00:  33%|███▎      | 5921/18148 [5:12:29<10:46:11,  3.17s/it]


 Epoch 1, step: 5920, loss: 2.028019905090332, total loss: 14459.040304660797
clearing cache


Processing Epoch 00:  33%|███▎      | 5931/18148 [5:13:00<10:45:52,  3.17s/it]


 Epoch 1, step: 5930, loss: 2.1633901596069336, total loss: 14482.454283714294
clearing cache


Processing Epoch 00:  33%|███▎      | 5941/18148 [5:13:31<10:45:20,  3.17s/it]


 Epoch 1, step: 5940, loss: 2.2520241737365723, total loss: 14505.763487696648
clearing cache


Processing Epoch 00:  33%|███▎      | 5951/18148 [5:14:02<10:43:43,  3.17s/it]


 Epoch 1, step: 5950, loss: 1.9151756763458252, total loss: 14526.561854720116
clearing cache


Processing Epoch 00:  33%|███▎      | 5961/18148 [5:14:33<10:46:35,  3.18s/it]


 Epoch 1, step: 5960, loss: 2.804588556289673, total loss: 14550.53212094307
clearing cache


Processing Epoch 00:  33%|███▎      | 5971/18148 [5:15:04<10:41:36,  3.16s/it]


 Epoch 1, step: 5970, loss: 2.1548900604248047, total loss: 14572.002482652664
clearing cache


Processing Epoch 00:  33%|███▎      | 5981/18148 [5:15:35<10:43:11,  3.17s/it]


 Epoch 1, step: 5980, loss: 2.0604941844940186, total loss: 14593.463734269142
clearing cache


Processing Epoch 00:  33%|███▎      | 5991/18148 [5:16:06<10:44:35,  3.18s/it]


 Epoch 1, step: 5990, loss: 2.2218947410583496, total loss: 14618.62512934208
clearing cache


Processing Epoch 00:  33%|███▎      | 6001/18148 [5:16:37<10:44:08,  3.18s/it]


 Epoch 1, step: 6000, loss: 2.2196121215820312, total loss: 14643.86006617546
clearing cache


Processing Epoch 00:  33%|███▎      | 6011/18148 [5:17:08<10:45:07,  3.19s/it]


 Epoch 1, step: 6010, loss: 2.824920415878296, total loss: 14666.296892166138
clearing cache


Processing Epoch 00:  33%|███▎      | 6021/18148 [5:17:39<10:42:32,  3.18s/it]


 Epoch 1, step: 6020, loss: 2.0383858680725098, total loss: 14689.792885899544
clearing cache


Processing Epoch 00:  33%|███▎      | 6031/18148 [5:18:10<10:43:40,  3.19s/it]


 Epoch 1, step: 6030, loss: 2.737236499786377, total loss: 14712.303148627281
clearing cache


Processing Epoch 00:  33%|███▎      | 6041/18148 [5:18:41<10:41:23,  3.18s/it]


 Epoch 1, step: 6040, loss: 1.9300302267074585, total loss: 14735.13525402546
clearing cache


Processing Epoch 00:  33%|███▎      | 6051/18148 [5:19:12<10:43:38,  3.19s/it]


 Epoch 1, step: 6050, loss: 2.090728759765625, total loss: 14760.514488577843
clearing cache


Processing Epoch 00:  33%|███▎      | 6061/18148 [5:19:43<10:41:22,  3.18s/it]


 Epoch 1, step: 6060, loss: 2.2045345306396484, total loss: 14784.280255794525
clearing cache


Processing Epoch 00:  33%|███▎      | 6071/18148 [5:20:14<10:41:25,  3.19s/it]


 Epoch 1, step: 6070, loss: 2.1318674087524414, total loss: 14808.793391942978
clearing cache


Processing Epoch 00:  34%|███▎      | 6081/18148 [5:20:45<10:39:49,  3.18s/it]


 Epoch 1, step: 6080, loss: 3.014094591140747, total loss: 14832.126266479492
clearing cache


Processing Epoch 00:  34%|███▎      | 6091/18148 [5:21:16<10:38:28,  3.18s/it]


 Epoch 1, step: 6090, loss: 2.771338939666748, total loss: 14856.652052640915
clearing cache


Processing Epoch 00:  34%|███▎      | 6101/18148 [5:21:47<10:33:18,  3.15s/it]


 Epoch 1, step: 6100, loss: 2.0988664627075195, total loss: 14879.881069540977
clearing cache


Processing Epoch 00:  34%|███▎      | 6111/18148 [5:22:18<10:33:33,  3.16s/it]


 Epoch 1, step: 6110, loss: 2.204831123352051, total loss: 14903.390948176384
clearing cache


Processing Epoch 00:  34%|███▎      | 6121/18148 [5:22:49<10:34:22,  3.16s/it]


 Epoch 1, step: 6120, loss: 2.186899185180664, total loss: 14925.207331299782
clearing cache


Processing Epoch 00:  34%|███▍      | 6131/18148 [5:23:20<10:33:37,  3.16s/it]


 Epoch 1, step: 6130, loss: 2.2414357662200928, total loss: 14949.252329945564
clearing cache


Processing Epoch 00:  34%|███▍      | 6141/18148 [5:23:51<10:33:19,  3.16s/it]


 Epoch 1, step: 6140, loss: 1.9743536710739136, total loss: 14972.402455091476
clearing cache


Processing Epoch 00:  34%|███▍      | 6151/18148 [5:24:22<10:34:14,  3.17s/it]


 Epoch 1, step: 6150, loss: 3.0118675231933594, total loss: 14995.348546028137
clearing cache


Processing Epoch 00:  34%|███▍      | 6161/18148 [5:24:53<10:33:01,  3.17s/it]


 Epoch 1, step: 6160, loss: 2.7854180335998535, total loss: 15018.94014120102
clearing cache


Processing Epoch 00:  34%|███▍      | 6171/18148 [5:25:24<10:34:07,  3.18s/it]


 Epoch 1, step: 6170, loss: 2.775163412094116, total loss: 15041.183295488358
clearing cache


Processing Epoch 00:  34%|███▍      | 6181/18148 [5:25:55<10:32:12,  3.17s/it]


 Epoch 1, step: 6180, loss: 2.004702568054199, total loss: 15063.85737323761
clearing cache


Processing Epoch 00:  34%|███▍      | 6191/18148 [5:26:26<10:32:54,  3.18s/it]


 Epoch 1, step: 6190, loss: 2.0506138801574707, total loss: 15085.754533052444
clearing cache


Processing Epoch 00:  34%|███▍      | 6201/18148 [5:26:57<10:32:31,  3.18s/it]


 Epoch 1, step: 6200, loss: 2.079958915710449, total loss: 15108.777647256851
clearing cache


Processing Epoch 00:  34%|███▍      | 6211/18148 [5:27:28<10:30:29,  3.17s/it]


 Epoch 1, step: 6210, loss: 2.0711069107055664, total loss: 15131.119437456131
clearing cache


Processing Epoch 00:  34%|███▍      | 6221/18148 [5:27:59<10:31:33,  3.18s/it]


 Epoch 1, step: 6220, loss: 2.170969009399414, total loss: 15155.485157966614
clearing cache


Processing Epoch 00:  34%|███▍      | 6231/18148 [5:28:30<10:39:16,  3.22s/it]


 Epoch 1, step: 6230, loss: 2.847071647644043, total loss: 15181.35611796379
clearing cache


Processing Epoch 00:  34%|███▍      | 6241/18148 [5:29:01<10:27:56,  3.16s/it]


 Epoch 1, step: 6240, loss: 2.349792718887329, total loss: 15204.297132015228
clearing cache


Processing Epoch 00:  34%|███▍      | 6251/18148 [5:29:32<10:33:26,  3.19s/it]


 Epoch 1, step: 6250, loss: 2.6520235538482666, total loss: 15230.437469005585
clearing cache


Processing Epoch 00:  34%|███▍      | 6261/18148 [5:30:03<10:28:12,  3.17s/it]


 Epoch 1, step: 6260, loss: 2.194748640060425, total loss: 15254.046829462051
clearing cache


Processing Epoch 00:  35%|███▍      | 6271/18148 [5:30:34<10:27:27,  3.17s/it]


 Epoch 1, step: 6270, loss: 2.1533119678497314, total loss: 15277.601467370987
clearing cache


Processing Epoch 00:  35%|███▍      | 6281/18148 [5:31:05<10:30:01,  3.19s/it]


 Epoch 1, step: 6280, loss: 2.753391981124878, total loss: 15302.586124897003
clearing cache


Processing Epoch 00:  35%|███▍      | 6291/18148 [5:31:36<10:29:16,  3.18s/it]


 Epoch 1, step: 6290, loss: 2.777304172515869, total loss: 15327.20349764824
clearing cache


Processing Epoch 00:  35%|███▍      | 6301/18148 [5:32:07<10:26:50,  3.17s/it]


 Epoch 1, step: 6300, loss: 2.0629029273986816, total loss: 15351.404877781868
clearing cache


Processing Epoch 00:  35%|███▍      | 6311/18148 [5:32:38<10:24:58,  3.17s/it]


 Epoch 1, step: 6310, loss: 2.2031936645507812, total loss: 15374.352355599403
clearing cache


Processing Epoch 00:  35%|███▍      | 6321/18148 [5:33:09<10:24:48,  3.17s/it]


 Epoch 1, step: 6320, loss: 2.059009075164795, total loss: 15396.618513822556
clearing cache


Processing Epoch 00:  35%|███▍      | 6331/18148 [5:33:40<10:23:46,  3.17s/it]


 Epoch 1, step: 6330, loss: 1.9409842491149902, total loss: 15418.831720590591
clearing cache


Processing Epoch 00:  35%|███▍      | 6341/18148 [5:34:11<10:25:03,  3.18s/it]


 Epoch 1, step: 6340, loss: 2.0297558307647705, total loss: 15442.137314081192
clearing cache


Processing Epoch 00:  35%|███▍      | 6351/18148 [5:34:42<10:21:09,  3.16s/it]


 Epoch 1, step: 6350, loss: 1.9262481927871704, total loss: 15463.474232077599
clearing cache


Processing Epoch 00:  35%|███▌      | 6361/18148 [5:35:13<10:23:46,  3.18s/it]


 Epoch 1, step: 6360, loss: 2.175393581390381, total loss: 15487.330867648125
clearing cache


Processing Epoch 00:  35%|███▌      | 6371/18148 [5:35:44<10:22:23,  3.17s/it]


 Epoch 1, step: 6370, loss: 1.9627007246017456, total loss: 15509.788367509842
clearing cache


Processing Epoch 00:  35%|███▌      | 6381/18148 [5:36:15<10:21:06,  3.17s/it]


 Epoch 1, step: 6380, loss: 2.878175735473633, total loss: 15531.944087147713
clearing cache


Processing Epoch 00:  35%|███▌      | 6391/18148 [5:36:46<10:23:28,  3.18s/it]


 Epoch 1, step: 6390, loss: 2.9343881607055664, total loss: 15556.323257088661
clearing cache


Processing Epoch 00:  35%|███▌      | 6401/18148 [5:37:17<10:22:23,  3.18s/it]


 Epoch 1, step: 6400, loss: 2.836261510848999, total loss: 15578.919598698616
clearing cache


Processing Epoch 00:  35%|███▌      | 6411/18148 [5:37:48<10:21:33,  3.18s/it]


 Epoch 1, step: 6410, loss: 2.9697728157043457, total loss: 15602.120328068733
clearing cache


Processing Epoch 00:  35%|███▌      | 6421/18148 [5:38:19<10:18:57,  3.17s/it]


 Epoch 1, step: 6420, loss: 2.229351043701172, total loss: 15625.112460970879
clearing cache


Processing Epoch 00:  35%|███▌      | 6431/18148 [5:38:50<10:17:51,  3.16s/it]


 Epoch 1, step: 6430, loss: 1.9557924270629883, total loss: 15648.78485071659
clearing cache


Processing Epoch 00:  35%|███▌      | 6441/18148 [5:39:21<10:16:41,  3.16s/it]


 Epoch 1, step: 6440, loss: 2.0576648712158203, total loss: 15671.654883027077
clearing cache


Processing Epoch 00:  36%|███▌      | 6451/18148 [5:39:52<10:18:09,  3.17s/it]


 Epoch 1, step: 6450, loss: 2.99320650100708, total loss: 15695.638761401176
clearing cache


Processing Epoch 00:  36%|███▌      | 6461/18148 [5:40:23<10:18:13,  3.17s/it]


 Epoch 1, step: 6460, loss: 2.6882128715515137, total loss: 15717.904510855675
clearing cache


Processing Epoch 00:  36%|███▌      | 6471/18148 [5:40:54<10:16:28,  3.17s/it]


 Epoch 1, step: 6470, loss: 2.1124613285064697, total loss: 15741.103515625
clearing cache


Processing Epoch 00:  36%|███▌      | 6481/18148 [5:41:25<10:16:41,  3.17s/it]


 Epoch 1, step: 6480, loss: 2.217168092727661, total loss: 15763.786899089813
clearing cache


Processing Epoch 00:  36%|███▌      | 6491/18148 [5:41:56<10:16:02,  3.17s/it]


 Epoch 1, step: 6490, loss: 2.340369701385498, total loss: 15785.567202687263
clearing cache


Processing Epoch 00:  36%|███▌      | 6501/18148 [5:42:27<10:16:15,  3.17s/it]


 Epoch 1, step: 6500, loss: 2.8329992294311523, total loss: 15808.203855752945
clearing cache


Processing Epoch 00:  36%|███▌      | 6511/18148 [5:42:58<10:12:31,  3.16s/it]


 Epoch 1, step: 6510, loss: 1.9572805166244507, total loss: 15829.484112262726
clearing cache


Processing Epoch 00:  36%|███▌      | 6521/18148 [5:43:29<10:11:05,  3.15s/it]


 Epoch 1, step: 6520, loss: 2.0491340160369873, total loss: 15850.57978105545
clearing cache


Processing Epoch 00:  36%|███▌      | 6531/18148 [5:44:00<10:12:52,  3.17s/it]


 Epoch 1, step: 6530, loss: 1.9367389678955078, total loss: 15873.374600410461
clearing cache


Processing Epoch 00:  36%|███▌      | 6541/18148 [5:44:31<10:13:46,  3.17s/it]


 Epoch 1, step: 6540, loss: 1.9914573431015015, total loss: 15897.455561757088
clearing cache


Processing Epoch 00:  36%|███▌      | 6551/18148 [5:45:02<10:16:09,  3.19s/it]


 Epoch 1, step: 6550, loss: 2.87062406539917, total loss: 15923.999470829964
clearing cache


Processing Epoch 00:  36%|███▌      | 6561/18148 [5:45:33<10:07:59,  3.15s/it]


 Epoch 1, step: 6560, loss: 2.0103893280029297, total loss: 15944.82856798172
clearing cache


Processing Epoch 00:  36%|███▌      | 6571/18148 [5:46:04<10:11:04,  3.17s/it]


 Epoch 1, step: 6570, loss: 1.9229942560195923, total loss: 15968.480088233948
clearing cache


Processing Epoch 00:  36%|███▋      | 6581/18148 [5:46:35<10:09:29,  3.16s/it]


 Epoch 1, step: 6580, loss: 1.8851001262664795, total loss: 15989.657330989838
clearing cache


Processing Epoch 00:  36%|███▋      | 6591/18148 [5:47:06<10:11:46,  3.18s/it]


 Epoch 1, step: 6590, loss: 2.105738878250122, total loss: 16013.190910100937
clearing cache


Processing Epoch 00:  36%|███▋      | 6601/18148 [5:47:37<10:09:24,  3.17s/it]


 Epoch 1, step: 6600, loss: 2.988030433654785, total loss: 16035.557374835014
clearing cache


Processing Epoch 00:  36%|███▋      | 6611/18148 [5:48:08<10:10:50,  3.18s/it]


 Epoch 1, step: 6610, loss: 1.903799057006836, total loss: 16058.653337359428
clearing cache


Processing Epoch 00:  36%|███▋      | 6621/18148 [5:48:39<10:08:55,  3.17s/it]


 Epoch 1, step: 6620, loss: 2.109814167022705, total loss: 16079.74332678318
clearing cache


Processing Epoch 00:  37%|███▋      | 6631/18148 [5:49:10<10:05:26,  3.15s/it]


 Epoch 1, step: 6630, loss: 2.0846002101898193, total loss: 16101.605664134026
clearing cache


Processing Epoch 00:  37%|███▋      | 6641/18148 [5:49:41<10:08:33,  3.17s/it]


 Epoch 1, step: 6640, loss: 2.165207862854004, total loss: 16125.147535681725
clearing cache


Processing Epoch 00:  37%|███▋      | 6651/18148 [5:50:12<10:06:20,  3.16s/it]


 Epoch 1, step: 6650, loss: 2.164842128753662, total loss: 16147.856440663338
clearing cache


Processing Epoch 00:  37%|███▋      | 6661/18148 [5:50:43<10:08:55,  3.18s/it]


 Epoch 1, step: 6660, loss: 2.1428682804107666, total loss: 16171.359073638916
clearing cache


Processing Epoch 00:  37%|███▋      | 6671/18148 [5:51:14<10:06:29,  3.17s/it]


 Epoch 1, step: 6670, loss: 2.246077537536621, total loss: 16193.962562799454
clearing cache


Processing Epoch 00:  37%|███▋      | 6681/18148 [5:51:45<10:06:38,  3.17s/it]


 Epoch 1, step: 6680, loss: 2.6974291801452637, total loss: 16217.396581888199
clearing cache


Processing Epoch 00:  37%|███▋      | 6691/18148 [5:52:16<10:09:10,  3.19s/it]


 Epoch 1, step: 6690, loss: 2.8633651733398438, total loss: 16242.577061653137
clearing cache


Processing Epoch 00:  37%|███▋      | 6701/18148 [5:52:47<10:04:04,  3.17s/it]


 Epoch 1, step: 6700, loss: 2.1511099338531494, total loss: 16266.299805164337
clearing cache


Processing Epoch 00:  37%|███▋      | 6711/18148 [5:53:18<10:03:17,  3.16s/it]


 Epoch 1, step: 6710, loss: 1.955224633216858, total loss: 16288.509468078613
clearing cache


Processing Epoch 00:  37%|███▋      | 6721/18148 [5:53:49<10:01:51,  3.16s/it]


 Epoch 1, step: 6720, loss: 2.075207233428955, total loss: 16310.62548160553
clearing cache


Processing Epoch 00:  37%|███▋      | 6731/18148 [5:54:20<10:06:18,  3.19s/it]


 Epoch 1, step: 6730, loss: 2.7779715061187744, total loss: 16333.559757947922
clearing cache


Processing Epoch 00:  37%|███▋      | 6741/18148 [5:54:50<10:02:58,  3.17s/it]


 Epoch 1, step: 6740, loss: 2.0893189907073975, total loss: 16356.161031246185
clearing cache


Processing Epoch 00:  37%|███▋      | 6751/18148 [5:55:21<10:05:25,  3.19s/it]


 Epoch 1, step: 6750, loss: 2.153903007507324, total loss: 16380.162745833397
clearing cache


Processing Epoch 00:  37%|███▋      | 6761/18148 [5:55:53<10:03:10,  3.18s/it]


 Epoch 1, step: 6760, loss: 2.794076442718506, total loss: 16403.16065311432
clearing cache


Processing Epoch 00:  37%|███▋      | 6771/18148 [5:56:23<10:00:43,  3.17s/it]


 Epoch 1, step: 6770, loss: 2.1394383907318115, total loss: 16424.706244945526
clearing cache


Processing Epoch 00:  37%|███▋      | 6781/18148 [5:56:54<10:01:11,  3.17s/it]


 Epoch 1, step: 6780, loss: 2.062152862548828, total loss: 16447.15152323246
clearing cache


Processing Epoch 00:  37%|███▋      | 6791/18148 [5:57:25<9:58:06,  3.16s/it] 


 Epoch 1, step: 6790, loss: 2.190863609313965, total loss: 16468.78596162796
clearing cache


Processing Epoch 00:  37%|███▋      | 6801/18148 [5:57:56<10:00:51,  3.18s/it]


 Epoch 1, step: 6800, loss: 2.294093608856201, total loss: 16493.83502817154
clearing cache


Processing Epoch 00:  38%|███▊      | 6811/18148 [5:58:27<9:58:25,  3.17s/it] 


 Epoch 1, step: 6810, loss: 2.785257577896118, total loss: 16516.67458486557
clearing cache


Processing Epoch 00:  38%|███▊      | 6821/18148 [5:58:58<10:01:25,  3.19s/it]


 Epoch 1, step: 6820, loss: 2.806962490081787, total loss: 16541.3196849823
clearing cache


Processing Epoch 00:  38%|███▊      | 6831/18148 [5:59:29<9:59:13,  3.18s/it] 


 Epoch 1, step: 6830, loss: 2.316007137298584, total loss: 16564.849969029427
clearing cache


Processing Epoch 00:  38%|███▊      | 6841/18148 [6:00:00<9:58:48,  3.18s/it]


 Epoch 1, step: 6840, loss: 2.4114856719970703, total loss: 16588.541645646095
clearing cache


Processing Epoch 00:  38%|███▊      | 6851/18148 [6:00:31<10:00:32,  3.19s/it]


 Epoch 1, step: 6850, loss: 2.7932705879211426, total loss: 16612.562287926674
clearing cache


Processing Epoch 00:  38%|███▊      | 6861/18148 [6:01:03<9:59:13,  3.19s/it] 


 Epoch 1, step: 6860, loss: 2.2662465572357178, total loss: 16636.889466762543
clearing cache


Processing Epoch 00:  38%|███▊      | 6871/18148 [6:01:34<9:57:00,  3.18s/it]


 Epoch 1, step: 6870, loss: 2.0668692588806152, total loss: 16660.737827301025
clearing cache


Processing Epoch 00:  38%|███▊      | 6881/18148 [6:02:05<9:56:57,  3.18s/it]


 Epoch 1, step: 6880, loss: 2.0114705562591553, total loss: 16684.40212392807
clearing cache


Processing Epoch 00:  38%|███▊      | 6891/18148 [6:02:36<9:51:33,  3.15s/it]


 Epoch 1, step: 6890, loss: 2.1249828338623047, total loss: 16705.60073721409
clearing cache


Processing Epoch 00:  38%|███▊      | 6901/18148 [6:03:06<9:52:02,  3.16s/it]


 Epoch 1, step: 6900, loss: 2.086198329925537, total loss: 16727.251076817513
clearing cache


Processing Epoch 00:  38%|███▊      | 6911/18148 [6:03:37<9:50:46,  3.15s/it]


 Epoch 1, step: 6910, loss: 2.0886728763580322, total loss: 16751.470606446266
clearing cache


Processing Epoch 00:  38%|███▊      | 6921/18148 [6:04:08<9:53:29,  3.17s/it]


 Epoch 1, step: 6920, loss: 2.898193597793579, total loss: 16775.394887924194
clearing cache


Processing Epoch 00:  38%|███▊      | 6931/18148 [6:04:39<9:50:58,  3.16s/it]


 Epoch 1, step: 6930, loss: 2.056170701980591, total loss: 16798.517268538475
clearing cache


Processing Epoch 00:  38%|███▊      | 6941/18148 [6:05:10<9:52:38,  3.17s/it]


 Epoch 1, step: 6940, loss: 2.092508554458618, total loss: 16824.546461701393
clearing cache


Processing Epoch 00:  38%|███▊      | 6951/18148 [6:05:41<9:50:32,  3.16s/it]


 Epoch 1, step: 6950, loss: 2.260093927383423, total loss: 16846.20781981945
clearing cache


Processing Epoch 00:  38%|███▊      | 6961/18148 [6:06:12<9:48:03,  3.15s/it]


 Epoch 1, step: 6960, loss: 1.9633032083511353, total loss: 16868.26325714588
clearing cache


Processing Epoch 00:  38%|███▊      | 6971/18148 [6:06:43<9:48:22,  3.16s/it]


 Epoch 1, step: 6970, loss: 1.963714838027954, total loss: 16890.60699546337
clearing cache


Processing Epoch 00:  38%|███▊      | 6981/18148 [6:07:14<9:50:53,  3.17s/it]


 Epoch 1, step: 6980, loss: 2.638826370239258, total loss: 16914.170403003693
clearing cache


Processing Epoch 00:  39%|███▊      | 6991/18148 [6:07:45<9:49:24,  3.17s/it]


 Epoch 1, step: 6990, loss: 1.9594197273254395, total loss: 16936.12172150612
clearing cache


Processing Epoch 00:  39%|███▊      | 7001/18148 [6:08:16<9:50:16,  3.18s/it]


 Epoch 1, step: 7000, loss: 2.023155689239502, total loss: 16959.314415097237
clearing cache


Processing Epoch 00:  39%|███▊      | 7011/18148 [6:08:47<9:47:05,  3.16s/it]


 Epoch 1, step: 7010, loss: 2.0975773334503174, total loss: 16979.905668735504
clearing cache


Processing Epoch 00:  39%|███▊      | 7021/18148 [6:09:18<9:48:08,  3.17s/it]


 Epoch 1, step: 7020, loss: 2.232273578643799, total loss: 17002.05414068699
clearing cache


Processing Epoch 00:  39%|███▊      | 7031/18148 [6:09:49<9:49:57,  3.18s/it]


 Epoch 1, step: 7030, loss: 2.0450961589813232, total loss: 17025.8419829607
clearing cache


Processing Epoch 00:  39%|███▉      | 7041/18148 [6:10:20<9:44:51,  3.16s/it]


 Epoch 1, step: 7040, loss: 1.9628690481185913, total loss: 17047.554248571396
clearing cache


Processing Epoch 00:  39%|███▉      | 7051/18148 [6:10:51<9:47:12,  3.17s/it]


 Epoch 1, step: 7050, loss: 1.9139727354049683, total loss: 17069.99937236309
clearing cache


Processing Epoch 00:  39%|███▉      | 7061/18148 [6:11:22<9:46:25,  3.17s/it]


 Epoch 1, step: 7060, loss: 2.26580548286438, total loss: 17092.693129181862
clearing cache


Processing Epoch 00:  39%|███▉      | 7071/18148 [6:11:53<9:44:30,  3.17s/it]


 Epoch 1, step: 7070, loss: 2.0392377376556396, total loss: 17116.73225915432
clearing cache


Processing Epoch 00:  39%|███▉      | 7081/18148 [6:12:24<9:44:28,  3.17s/it]


 Epoch 1, step: 7080, loss: 2.9219143390655518, total loss: 17139.63705289364
clearing cache


Processing Epoch 00:  39%|███▉      | 7091/18148 [6:12:55<9:46:37,  3.18s/it]


 Epoch 1, step: 7090, loss: 2.8303380012512207, total loss: 17164.79628932476
clearing cache


Processing Epoch 00:  39%|███▉      | 7101/18148 [6:13:25<9:44:29,  3.17s/it]


 Epoch 1, step: 7100, loss: 2.778520345687866, total loss: 17187.986629486084
clearing cache


Processing Epoch 00:  39%|███▉      | 7111/18148 [6:13:56<9:40:58,  3.16s/it]


 Epoch 1, step: 7110, loss: 2.110072374343872, total loss: 17210.312023878098
clearing cache


Processing Epoch 00:  39%|███▉      | 7121/18148 [6:14:27<9:41:24,  3.16s/it]


 Epoch 1, step: 7120, loss: 2.055957078933716, total loss: 17233.3146276474
clearing cache


Processing Epoch 00:  39%|███▉      | 7131/18148 [6:14:58<9:42:41,  3.17s/it]


 Epoch 1, step: 7130, loss: 2.0849764347076416, total loss: 17256.403576374054
clearing cache


Processing Epoch 00:  39%|███▉      | 7141/18148 [6:15:29<9:40:37,  3.17s/it]


 Epoch 1, step: 7140, loss: 2.0383617877960205, total loss: 17278.987326860428
clearing cache


Processing Epoch 00:  39%|███▉      | 7151/18148 [6:16:00<9:39:16,  3.16s/it]


 Epoch 1, step: 7150, loss: 1.8987702131271362, total loss: 17303.105293869972
clearing cache


Processing Epoch 00:  39%|███▉      | 7161/18148 [6:16:31<9:42:31,  3.18s/it]


 Epoch 1, step: 7160, loss: 2.7899255752563477, total loss: 17326.581805586815
clearing cache


Processing Epoch 00:  40%|███▉      | 7171/18148 [6:17:02<9:41:25,  3.18s/it]


 Epoch 1, step: 7170, loss: 2.0657873153686523, total loss: 17349.73674595356
clearing cache


Processing Epoch 00:  40%|███▉      | 7181/18148 [6:17:33<9:36:21,  3.15s/it]


 Epoch 1, step: 7180, loss: 2.1208295822143555, total loss: 17372.081333756447
clearing cache


Processing Epoch 00:  40%|███▉      | 7191/18148 [6:18:04<9:40:56,  3.18s/it]


 Epoch 1, step: 7190, loss: 2.8231096267700195, total loss: 17396.561072945595
clearing cache


Processing Epoch 00:  40%|███▉      | 7201/18148 [6:18:35<9:36:27,  3.16s/it]


 Epoch 1, step: 7200, loss: 2.053340435028076, total loss: 17418.070724010468
clearing cache


Processing Epoch 00:  40%|███▉      | 7211/18148 [6:19:06<9:38:33,  3.17s/it]


 Epoch 1, step: 7210, loss: 2.144760847091675, total loss: 17443.358576774597
clearing cache


Processing Epoch 00:  40%|███▉      | 7221/18148 [6:19:37<9:38:11,  3.17s/it]


 Epoch 1, step: 7220, loss: 2.8191003799438477, total loss: 17465.531009435654
clearing cache


Processing Epoch 00:  40%|███▉      | 7231/18148 [6:20:08<9:36:52,  3.17s/it]


 Epoch 1, step: 7230, loss: 2.0330567359924316, total loss: 17488.376883506775
clearing cache


Processing Epoch 00:  40%|███▉      | 7241/18148 [6:20:39<9:38:08,  3.18s/it]


 Epoch 1, step: 7240, loss: 2.828890800476074, total loss: 17510.704168319702
clearing cache


Processing Epoch 00:  40%|███▉      | 7251/18148 [6:21:10<9:38:43,  3.19s/it]


 Epoch 1, step: 7250, loss: 2.764099359512329, total loss: 17535.393480539322
clearing cache


Processing Epoch 00:  40%|████      | 7261/18148 [6:21:41<9:34:15,  3.16s/it]


 Epoch 1, step: 7260, loss: 1.9022648334503174, total loss: 17557.274278640747
clearing cache


Processing Epoch 00:  40%|████      | 7271/18148 [6:22:12<9:36:39,  3.18s/it]


 Epoch 1, step: 7270, loss: 2.8411855697631836, total loss: 17581.027607679367
clearing cache


Processing Epoch 00:  40%|████      | 7281/18148 [6:22:43<9:33:50,  3.17s/it]


 Epoch 1, step: 7280, loss: 2.168523073196411, total loss: 17604.484658956528
clearing cache


Processing Epoch 00:  40%|████      | 7291/18148 [6:23:14<9:34:17,  3.17s/it]


 Epoch 1, step: 7290, loss: 2.2031731605529785, total loss: 17628.365899801254
clearing cache


Processing Epoch 00:  40%|████      | 7301/18148 [6:23:45<9:38:01,  3.20s/it]


 Epoch 1, step: 7300, loss: 2.685384511947632, total loss: 17654.224527835846
clearing cache


Processing Epoch 00:  40%|████      | 7311/18148 [6:24:16<9:33:35,  3.18s/it]


 Epoch 1, step: 7310, loss: 2.186012029647827, total loss: 17676.474269866943
clearing cache


Processing Epoch 00:  40%|████      | 7321/18148 [6:24:47<9:36:38,  3.20s/it]


 Epoch 1, step: 7320, loss: 2.8548498153686523, total loss: 17701.792306423187
clearing cache


Processing Epoch 00:  40%|████      | 7331/18148 [6:25:18<9:31:09,  3.17s/it]


 Epoch 1, step: 7330, loss: 2.080655097961426, total loss: 17724.869713306427
clearing cache


Processing Epoch 00:  40%|████      | 7341/18148 [6:25:49<9:33:50,  3.19s/it]


 Epoch 1, step: 7340, loss: 2.7581024169921875, total loss: 17749.007774591446
clearing cache


Processing Epoch 00:  41%|████      | 7351/18148 [6:26:20<9:31:15,  3.17s/it]


 Epoch 1, step: 7350, loss: 2.204394578933716, total loss: 17773.10990035534
clearing cache


Processing Epoch 00:  41%|████      | 7361/18148 [6:26:51<9:28:44,  3.16s/it]


 Epoch 1, step: 7360, loss: 1.9516838788986206, total loss: 17794.32439315319
clearing cache


Processing Epoch 00:  41%|████      | 7371/18148 [6:27:22<9:28:22,  3.16s/it]


 Epoch 1, step: 7370, loss: 2.0923752784729004, total loss: 17817.371424794197
clearing cache


Processing Epoch 00:  41%|████      | 7381/18148 [6:27:53<9:30:49,  3.18s/it]


 Epoch 1, step: 7380, loss: 2.8569626808166504, total loss: 17840.885206103325
clearing cache


Processing Epoch 00:  41%|████      | 7391/18148 [6:28:23<9:26:19,  3.16s/it]


 Epoch 1, step: 7390, loss: 2.050830841064453, total loss: 17863.017769932747
clearing cache


Processing Epoch 00:  41%|████      | 7401/18148 [6:28:55<9:28:44,  3.18s/it]


 Epoch 1, step: 7400, loss: 2.1676392555236816, total loss: 17887.009566903114
clearing cache


Processing Epoch 00:  41%|████      | 7411/18148 [6:29:25<9:27:16,  3.17s/it]


 Epoch 1, step: 7410, loss: 2.8652279376983643, total loss: 17910.014323949814
clearing cache


Processing Epoch 00:  41%|████      | 7421/18148 [6:29:57<9:30:08,  3.19s/it]


 Epoch 1, step: 7420, loss: 2.7713623046875, total loss: 17934.339816331863
clearing cache


Processing Epoch 00:  41%|████      | 7431/18148 [6:30:28<9:26:01,  3.17s/it]


 Epoch 1, step: 7430, loss: 2.023712635040283, total loss: 17957.03784251213
clearing cache


Processing Epoch 00:  41%|████      | 7441/18148 [6:30:58<9:25:02,  3.17s/it]


 Epoch 1, step: 7440, loss: 2.016180992126465, total loss: 17978.605031132698
clearing cache


Processing Epoch 00:  41%|████      | 7451/18148 [6:31:29<9:25:04,  3.17s/it]


 Epoch 1, step: 7450, loss: 1.9336628913879395, total loss: 18000.508476376534
clearing cache


Processing Epoch 00:  41%|████      | 7461/18148 [6:32:00<9:26:22,  3.18s/it]


 Epoch 1, step: 7460, loss: 2.8323636054992676, total loss: 18022.902392745018
clearing cache


Processing Epoch 00:  41%|████      | 7471/18148 [6:32:31<9:21:31,  3.16s/it]


 Epoch 1, step: 7470, loss: 2.1503071784973145, total loss: 18043.4876101017
clearing cache


Processing Epoch 00:  41%|████      | 7481/18148 [6:33:02<9:24:29,  3.18s/it]


 Epoch 1, step: 7480, loss: 2.1152901649475098, total loss: 18064.98809528351
clearing cache


Processing Epoch 00:  41%|████▏     | 7491/18148 [6:33:33<9:23:33,  3.17s/it]


 Epoch 1, step: 7490, loss: 2.144402027130127, total loss: 18088.229583263397
clearing cache


Processing Epoch 00:  41%|████▏     | 7501/18148 [6:34:04<9:23:50,  3.18s/it]


 Epoch 1, step: 7500, loss: 2.8678689002990723, total loss: 18111.49061882496
clearing cache


Processing Epoch 00:  41%|████▏     | 7511/18148 [6:34:35<9:21:59,  3.17s/it]


 Epoch 1, step: 7510, loss: 2.0810036659240723, total loss: 18132.716832518578
clearing cache


Processing Epoch 00:  41%|████▏     | 7521/18148 [6:35:06<9:20:42,  3.17s/it]


 Epoch 1, step: 7520, loss: 1.9487005472183228, total loss: 18155.09464907646
clearing cache


Processing Epoch 00:  41%|████▏     | 7531/18148 [6:35:37<9:23:14,  3.18s/it]


 Epoch 1, step: 7530, loss: 2.115133285522461, total loss: 18179.42407333851
clearing cache


Processing Epoch 00:  42%|████▏     | 7541/18148 [6:36:08<9:21:32,  3.18s/it]


 Epoch 1, step: 7540, loss: 1.8977864980697632, total loss: 18201.442289352417
clearing cache


Processing Epoch 00:  42%|████▏     | 7551/18148 [6:36:39<9:19:22,  3.17s/it]


 Epoch 1, step: 7550, loss: 2.1466100215911865, total loss: 18224.47901749611
clearing cache


Processing Epoch 00:  42%|████▏     | 7561/18148 [6:37:10<9:19:28,  3.17s/it]


 Epoch 1, step: 7560, loss: 2.2142410278320312, total loss: 18247.407601714134
clearing cache


Processing Epoch 00:  42%|████▏     | 7571/18148 [6:37:41<9:20:43,  3.18s/it]


 Epoch 1, step: 7570, loss: 2.1710987091064453, total loss: 18272.186490893364
clearing cache


Processing Epoch 00:  42%|████▏     | 7581/18148 [6:38:12<9:21:23,  3.19s/it]


 Epoch 1, step: 7580, loss: 2.9371910095214844, total loss: 18296.241498708725
clearing cache


Processing Epoch 00:  42%|████▏     | 7591/18148 [6:38:43<9:19:21,  3.18s/it]


 Epoch 1, step: 7590, loss: 2.7806601524353027, total loss: 18317.61244869232
clearing cache


Processing Epoch 00:  42%|████▏     | 7601/18148 [6:39:14<9:20:18,  3.19s/it]


 Epoch 1, step: 7600, loss: 2.095161199569702, total loss: 18338.874945759773
clearing cache


Processing Epoch 00:  42%|████▏     | 7611/18148 [6:39:45<9:15:15,  3.16s/it]


 Epoch 1, step: 7610, loss: 2.280639410018921, total loss: 18360.798841118813
clearing cache


Processing Epoch 00:  42%|████▏     | 7621/18148 [6:40:16<9:15:26,  3.17s/it]


 Epoch 1, step: 7620, loss: 2.787109851837158, total loss: 18382.97847175598
clearing cache


Processing Epoch 00:  42%|████▏     | 7631/18148 [6:40:47<9:16:12,  3.17s/it]


 Epoch 1, step: 7630, loss: 2.7879867553710938, total loss: 18405.34156215191
clearing cache


Processing Epoch 00:  42%|████▏     | 7641/18148 [6:41:18<9:15:31,  3.17s/it]


 Epoch 1, step: 7640, loss: 2.031301975250244, total loss: 18427.810067534447
clearing cache


Processing Epoch 00:  42%|████▏     | 7651/18148 [6:41:49<9:14:53,  3.17s/it]


 Epoch 1, step: 7650, loss: 1.9274400472640991, total loss: 18452.240857362747
clearing cache


Processing Epoch 00:  42%|████▏     | 7661/18148 [6:42:20<9:13:09,  3.16s/it]


 Epoch 1, step: 7660, loss: 2.14074969291687, total loss: 18474.406245470047
clearing cache


Processing Epoch 00:  42%|████▏     | 7671/18148 [6:42:51<9:12:45,  3.17s/it]


 Epoch 1, step: 7670, loss: 2.3271617889404297, total loss: 18496.767320632935
clearing cache


Processing Epoch 00:  42%|████▏     | 7681/18148 [6:43:22<9:12:10,  3.17s/it]


 Epoch 1, step: 7680, loss: 2.1334762573242188, total loss: 18520.82771205902
clearing cache


Processing Epoch 00:  42%|████▏     | 7691/18148 [6:43:53<9:10:30,  3.16s/it]


 Epoch 1, step: 7690, loss: 1.9929945468902588, total loss: 18541.391986727715
clearing cache


Processing Epoch 00:  42%|████▏     | 7701/18148 [6:44:24<9:13:04,  3.18s/it]


 Epoch 1, step: 7700, loss: 2.0412914752960205, total loss: 18565.93093407154
clearing cache


Processing Epoch 00:  42%|████▏     | 7711/18148 [6:44:55<9:07:55,  3.15s/it]


 Epoch 1, step: 7710, loss: 2.135786771774292, total loss: 18587.535198569298
clearing cache


Processing Epoch 00:  43%|████▎     | 7721/18148 [6:45:25<9:10:09,  3.17s/it]


 Epoch 1, step: 7720, loss: 2.161900520324707, total loss: 18610.066197037697
clearing cache


Processing Epoch 00:  43%|████▎     | 7731/18148 [6:45:56<9:11:42,  3.18s/it]


 Epoch 1, step: 7730, loss: 2.1532299518585205, total loss: 18634.29780638218
clearing cache


Processing Epoch 00:  43%|████▎     | 7741/18148 [6:46:28<9:12:32,  3.19s/it]


 Epoch 1, step: 7740, loss: 3.003824234008789, total loss: 18658.993159770966
clearing cache


Processing Epoch 00:  43%|████▎     | 7751/18148 [6:46:59<9:11:00,  3.18s/it]


 Epoch 1, step: 7750, loss: 2.1289138793945312, total loss: 18683.082273721695
clearing cache


Processing Epoch 00:  43%|████▎     | 7761/18148 [6:47:29<9:09:21,  3.17s/it]


 Epoch 1, step: 7760, loss: 2.950775146484375, total loss: 18705.864656567574
clearing cache


Processing Epoch 00:  43%|████▎     | 7771/18148 [6:48:00<9:09:26,  3.18s/it]


 Epoch 1, step: 7770, loss: 2.8498387336730957, total loss: 18730.14874112606
clearing cache


Processing Epoch 00:  43%|████▎     | 7781/18148 [6:48:31<9:07:50,  3.17s/it]


 Epoch 1, step: 7780, loss: 2.874213933944702, total loss: 18753.962838292122
clearing cache


Processing Epoch 00:  43%|████▎     | 7791/18148 [6:49:02<9:05:11,  3.16s/it]


 Epoch 1, step: 7790, loss: 2.2733707427978516, total loss: 18778.97908771038
clearing cache


Processing Epoch 00:  43%|████▎     | 7801/18148 [6:49:33<9:05:05,  3.16s/it]


 Epoch 1, step: 7800, loss: 2.3434829711914062, total loss: 18801.879949331284
clearing cache


Processing Epoch 00:  43%|████▎     | 7811/18148 [6:50:04<9:04:50,  3.16s/it]


 Epoch 1, step: 7810, loss: 2.2125229835510254, total loss: 18824.671080350876
clearing cache


Processing Epoch 00:  43%|████▎     | 7821/18148 [6:50:35<9:05:44,  3.17s/it]


 Epoch 1, step: 7820, loss: 2.326615810394287, total loss: 18848.01922559738
clearing cache


Processing Epoch 00:  43%|████▎     | 7831/18148 [6:51:06<9:04:19,  3.17s/it]


 Epoch 1, step: 7830, loss: 2.0668721199035645, total loss: 18870.54735672474
clearing cache


Processing Epoch 00:  43%|████▎     | 7841/18148 [6:51:37<9:05:42,  3.18s/it]


 Epoch 1, step: 7840, loss: 2.7289977073669434, total loss: 18893.644890666008
clearing cache


Processing Epoch 00:  43%|████▎     | 7851/18148 [6:52:08<9:04:33,  3.17s/it]


 Epoch 1, step: 7850, loss: 2.9163365364074707, total loss: 18918.233523130417
clearing cache


Processing Epoch 00:  43%|████▎     | 7861/18148 [6:52:39<9:02:02,  3.16s/it]


 Epoch 1, step: 7860, loss: 2.155000925064087, total loss: 18940.289844989777
clearing cache


Processing Epoch 00:  43%|████▎     | 7871/18148 [6:53:10<9:00:29,  3.16s/it]


 Epoch 1, step: 7870, loss: 1.965994119644165, total loss: 18962.74021434784
clearing cache


Processing Epoch 00:  43%|████▎     | 7881/18148 [6:53:41<9:02:28,  3.17s/it]


 Epoch 1, step: 7880, loss: 2.9231629371643066, total loss: 18985.63424360752
clearing cache


Processing Epoch 00:  43%|████▎     | 7891/18148 [6:54:12<8:59:41,  3.16s/it]


 Epoch 1, step: 7890, loss: 2.0287790298461914, total loss: 19010.065748095512
clearing cache


Processing Epoch 00:  44%|████▎     | 7901/18148 [6:54:43<9:02:26,  3.18s/it]


 Epoch 1, step: 7900, loss: 2.2329320907592773, total loss: 19034.157824873924
clearing cache


Processing Epoch 00:  44%|████▎     | 7911/18148 [6:55:14<8:57:14,  3.15s/it]


 Epoch 1, step: 7910, loss: 2.1259021759033203, total loss: 19056.59755373001
clearing cache


Processing Epoch 00:  44%|████▎     | 7921/18148 [6:55:44<8:57:08,  3.15s/it]


 Epoch 1, step: 7920, loss: 2.0568737983703613, total loss: 19077.506155490875
clearing cache


Processing Epoch 00:  44%|████▎     | 7931/18148 [6:56:15<9:01:10,  3.18s/it]


 Epoch 1, step: 7930, loss: 2.7042622566223145, total loss: 19100.81347322464
clearing cache


Processing Epoch 00:  44%|████▍     | 7941/18148 [6:56:46<8:59:32,  3.17s/it]


 Epoch 1, step: 7940, loss: 2.0226001739501953, total loss: 19124.613888025284
clearing cache


Processing Epoch 00:  44%|████▍     | 7951/18148 [6:57:17<8:58:22,  3.17s/it]


 Epoch 1, step: 7950, loss: 2.2295079231262207, total loss: 19147.404426574707
clearing cache


Processing Epoch 00:  44%|████▍     | 7961/18148 [6:57:48<8:56:44,  3.16s/it]


 Epoch 1, step: 7960, loss: 2.1519358158111572, total loss: 19169.581875681877
clearing cache


Processing Epoch 00:  44%|████▍     | 7971/18148 [6:58:19<8:55:44,  3.16s/it]


 Epoch 1, step: 7970, loss: 2.073082447052002, total loss: 19193.653770327568
clearing cache


Processing Epoch 00:  44%|████▍     | 7981/18148 [6:58:50<8:58:46,  3.18s/it]


 Epoch 1, step: 7980, loss: 2.9102861881256104, total loss: 19217.948440670967
clearing cache


Processing Epoch 00:  44%|████▍     | 7991/18148 [6:59:21<8:57:59,  3.18s/it]


 Epoch 1, step: 7990, loss: 2.0071160793304443, total loss: 19241.02911913395
clearing cache


Processing Epoch 00:  44%|████▍     | 8001/18148 [6:59:52<8:53:46,  3.16s/it]


 Epoch 1, step: 8000, loss: 2.0136613845825195, total loss: 19262.82346892357
clearing cache


Processing Epoch 00:  44%|████▍     | 8011/18148 [7:00:23<8:51:23,  3.15s/it]


 Epoch 1, step: 8010, loss: 2.0247275829315186, total loss: 19284.82672548294
clearing cache


Processing Epoch 00:  44%|████▍     | 8021/18148 [7:00:54<8:50:57,  3.15s/it]


 Epoch 1, step: 8020, loss: 2.396273136138916, total loss: 19306.938695549965
clearing cache


Processing Epoch 00:  44%|████▍     | 8031/18148 [7:01:24<8:52:16,  3.16s/it]


 Epoch 1, step: 8030, loss: 2.2626230716705322, total loss: 19330.673866152763
clearing cache


Processing Epoch 00:  44%|████▍     | 8041/18148 [7:01:55<8:50:59,  3.15s/it]


 Epoch 1, step: 8040, loss: 2.17919659614563, total loss: 19352.893242836
clearing cache


Processing Epoch 00:  44%|████▍     | 8051/18148 [7:02:26<8:50:54,  3.15s/it]


 Epoch 1, step: 8050, loss: 2.1177196502685547, total loss: 19376.31490755081
clearing cache


Processing Epoch 00:  44%|████▍     | 8061/18148 [7:02:57<8:52:28,  3.17s/it]


 Epoch 1, step: 8060, loss: 1.985763669013977, total loss: 19399.641942858696
clearing cache


Processing Epoch 00:  44%|████▍     | 8071/18148 [7:03:28<8:52:31,  3.17s/it]


 Epoch 1, step: 8070, loss: 2.7716317176818848, total loss: 19423.34281051159
clearing cache


Processing Epoch 00:  45%|████▍     | 8081/18148 [7:03:59<8:53:40,  3.18s/it]


 Epoch 1, step: 8080, loss: 2.803280830383301, total loss: 19446.761076807976
clearing cache


Processing Epoch 00:  45%|████▍     | 8091/18148 [7:04:30<8:51:10,  3.17s/it]


 Epoch 1, step: 8090, loss: 2.7624683380126953, total loss: 19470.75918853283
clearing cache


Processing Epoch 00:  45%|████▍     | 8101/18148 [7:05:01<8:50:32,  3.17s/it]


 Epoch 1, step: 8100, loss: 2.4519238471984863, total loss: 19495.471536040306
clearing cache


Processing Epoch 00:  45%|████▍     | 8111/18148 [7:05:32<8:50:37,  3.17s/it]


 Epoch 1, step: 8110, loss: 2.047891855239868, total loss: 19520.410045027733
clearing cache


Processing Epoch 00:  45%|████▍     | 8121/18148 [7:06:03<8:51:38,  3.18s/it]


 Epoch 1, step: 8120, loss: 2.1237375736236572, total loss: 19544.414104819298
clearing cache


Processing Epoch 00:  45%|████▍     | 8131/18148 [7:06:34<8:46:30,  3.15s/it]


 Epoch 1, step: 8130, loss: 2.057107925415039, total loss: 19565.635943889618
clearing cache


Processing Epoch 00:  45%|████▍     | 8141/18148 [7:07:05<8:50:56,  3.18s/it]


 Epoch 1, step: 8140, loss: 2.6872940063476562, total loss: 19589.99689936638
clearing cache


Processing Epoch 00:  45%|████▍     | 8151/18148 [7:07:36<8:49:34,  3.18s/it]


 Epoch 1, step: 8150, loss: 2.7033309936523438, total loss: 19615.071599006653
clearing cache


Processing Epoch 00:  45%|████▍     | 8161/18148 [7:08:07<8:45:34,  3.16s/it]


 Epoch 1, step: 8160, loss: 2.030560255050659, total loss: 19637.723933935165
clearing cache


Processing Epoch 00:  45%|████▌     | 8171/18148 [7:08:38<8:50:05,  3.19s/it]


 Epoch 1, step: 8170, loss: 2.9474735260009766, total loss: 19664.212441921234
clearing cache


Processing Epoch 00:  45%|████▌     | 8181/18148 [7:09:09<8:45:24,  3.16s/it]


 Epoch 1, step: 8180, loss: 1.9894404411315918, total loss: 19688.16426193714
clearing cache


Processing Epoch 00:  45%|████▌     | 8191/18148 [7:09:40<8:42:52,  3.15s/it]


 Epoch 1, step: 8190, loss: 1.9685012102127075, total loss: 19710.47562634945
clearing cache


Processing Epoch 00:  45%|████▌     | 8201/18148 [7:10:11<8:45:22,  3.17s/it]


 Epoch 1, step: 8200, loss: 2.761352300643921, total loss: 19734.515845894814
clearing cache


Processing Epoch 00:  45%|████▌     | 8211/18148 [7:10:42<8:43:40,  3.16s/it]


 Epoch 1, step: 8210, loss: 2.173038959503174, total loss: 19757.049250483513
clearing cache


Processing Epoch 00:  45%|████▌     | 8221/18148 [7:11:13<8:43:14,  3.16s/it]


 Epoch 1, step: 8220, loss: 2.178682804107666, total loss: 19779.100984215736
clearing cache


Processing Epoch 00:  45%|████▌     | 8231/18148 [7:11:44<8:43:16,  3.17s/it]


 Epoch 1, step: 8230, loss: 2.2115118503570557, total loss: 19801.4222741127
clearing cache


Processing Epoch 00:  45%|████▌     | 8241/18148 [7:12:15<8:42:17,  3.16s/it]


 Epoch 1, step: 8240, loss: 2.115797281265259, total loss: 19825.002027750015
clearing cache


Processing Epoch 00:  45%|████▌     | 8251/18148 [7:12:45<8:41:00,  3.16s/it]


 Epoch 1, step: 8250, loss: 1.968686819076538, total loss: 19846.08487701416
clearing cache


Processing Epoch 00:  46%|████▌     | 8261/18148 [7:13:16<8:43:08,  3.17s/it]


 Epoch 1, step: 8260, loss: 2.13930082321167, total loss: 19869.40991294384
clearing cache


Processing Epoch 00:  46%|████▌     | 8271/18148 [7:13:47<8:40:30,  3.16s/it]


 Epoch 1, step: 8270, loss: 2.084216356277466, total loss: 19891.744941353798
clearing cache


Processing Epoch 00:  46%|████▌     | 8281/18148 [7:14:18<8:39:08,  3.16s/it]


 Epoch 1, step: 8280, loss: 1.9808425903320312, total loss: 19914.368240475655
clearing cache


Processing Epoch 00:  46%|████▌     | 8291/18148 [7:14:49<8:39:55,  3.16s/it]


 Epoch 1, step: 8290, loss: 2.1736397743225098, total loss: 19937.28346478939
clearing cache


Processing Epoch 00:  46%|████▌     | 8301/18148 [7:15:20<8:42:07,  3.18s/it]


 Epoch 1, step: 8300, loss: 2.82694673538208, total loss: 19960.562950015068
clearing cache


Processing Epoch 00:  46%|████▌     | 8311/18148 [7:15:51<8:40:00,  3.17s/it]


 Epoch 1, step: 8310, loss: 2.2883429527282715, total loss: 19983.779168725014
clearing cache


Processing Epoch 00:  46%|████▌     | 8321/18148 [7:16:22<8:40:56,  3.18s/it]


 Epoch 1, step: 8320, loss: 2.124763011932373, total loss: 20008.200425744057
clearing cache


Processing Epoch 00:  46%|████▌     | 8331/18148 [7:16:53<8:39:43,  3.18s/it]


 Epoch 1, step: 8330, loss: 2.7908246517181396, total loss: 20030.968881964684
clearing cache


Processing Epoch 00:  46%|████▌     | 8341/18148 [7:17:24<8:36:44,  3.16s/it]


 Epoch 1, step: 8340, loss: 1.9416568279266357, total loss: 20053.184646129608
clearing cache


Processing Epoch 00:  46%|████▌     | 8351/18148 [7:17:55<8:37:06,  3.17s/it]


 Epoch 1, step: 8350, loss: 2.3155534267425537, total loss: 20078.01657986641
clearing cache


Processing Epoch 00:  46%|████▌     | 8361/18148 [7:18:26<8:38:47,  3.18s/it]


 Epoch 1, step: 8360, loss: 2.817166805267334, total loss: 20102.25718522072
clearing cache


Processing Epoch 00:  46%|████▌     | 8371/18148 [7:18:57<8:36:27,  3.17s/it]


 Epoch 1, step: 8370, loss: 2.037548303604126, total loss: 20125.802991628647
clearing cache


Processing Epoch 00:  46%|████▌     | 8381/18148 [7:19:28<8:32:17,  3.15s/it]


 Epoch 1, step: 8380, loss: 2.0209248065948486, total loss: 20146.3597779274
clearing cache


Processing Epoch 00:  46%|████▌     | 8391/18148 [7:19:59<8:34:09,  3.16s/it]


 Epoch 1, step: 8390, loss: 2.2463197708129883, total loss: 20168.28524696827
clearing cache


Processing Epoch 00:  46%|████▋     | 8401/18148 [7:20:30<8:31:32,  3.15s/it]


 Epoch 1, step: 8400, loss: 1.9475622177124023, total loss: 20189.68031179905
clearing cache


Processing Epoch 00:  46%|████▋     | 8411/18148 [7:21:00<8:33:22,  3.16s/it]


 Epoch 1, step: 8410, loss: 2.025961399078369, total loss: 20212.568050265312
clearing cache


Processing Epoch 00:  46%|████▋     | 8421/18148 [7:21:32<8:34:38,  3.17s/it]


 Epoch 1, step: 8420, loss: 1.9115965366363525, total loss: 20236.219044089317
clearing cache


Processing Epoch 00:  46%|████▋     | 8431/18148 [7:22:03<8:34:05,  3.17s/it]


 Epoch 1, step: 8430, loss: 1.945417881011963, total loss: 20258.832102537155
clearing cache


Processing Epoch 00:  47%|████▋     | 8441/18148 [7:22:34<8:32:14,  3.17s/it]


 Epoch 1, step: 8440, loss: 2.262941360473633, total loss: 20282.792234897614
clearing cache


Processing Epoch 00:  47%|████▋     | 8451/18148 [7:23:04<8:33:48,  3.18s/it]


 Epoch 1, step: 8450, loss: 2.806455612182617, total loss: 20306.600160241127
clearing cache


Processing Epoch 00:  47%|████▋     | 8461/18148 [7:23:35<8:29:27,  3.16s/it]


 Epoch 1, step: 8460, loss: 2.194948673248291, total loss: 20330.023056268692
clearing cache


Processing Epoch 00:  47%|████▋     | 8471/18148 [7:24:06<8:28:17,  3.15s/it]


 Epoch 1, step: 8470, loss: 2.0248446464538574, total loss: 20352.16879105568
clearing cache


Processing Epoch 00:  47%|████▋     | 8481/18148 [7:24:37<8:28:38,  3.16s/it]


 Epoch 1, step: 8480, loss: 2.1467747688293457, total loss: 20374.7018481493
clearing cache


Processing Epoch 00:  47%|████▋     | 8491/18148 [7:25:08<8:29:21,  3.16s/it]


 Epoch 1, step: 8490, loss: 2.2276721000671387, total loss: 20397.157441735268
clearing cache


Processing Epoch 00:  47%|████▋     | 8501/18148 [7:25:39<8:28:56,  3.17s/it]


 Epoch 1, step: 8500, loss: 2.759706497192383, total loss: 20419.82328104973
clearing cache


Processing Epoch 00:  47%|████▋     | 8511/18148 [7:26:10<8:30:11,  3.18s/it]


 Epoch 1, step: 8510, loss: 2.782653331756592, total loss: 20442.248104929924
clearing cache


Processing Epoch 00:  47%|████▋     | 8521/18148 [7:26:41<8:32:02,  3.19s/it]


 Epoch 1, step: 8520, loss: 2.8816566467285156, total loss: 20466.940590262413
clearing cache


Processing Epoch 00:  47%|████▋     | 8531/18148 [7:27:12<8:26:59,  3.16s/it]


 Epoch 1, step: 8530, loss: 2.034062623977661, total loss: 20490.50247347355
clearing cache


Processing Epoch 00:  47%|████▋     | 8541/18148 [7:27:43<8:26:46,  3.17s/it]


 Epoch 1, step: 8540, loss: 2.175201654434204, total loss: 20511.78822338581
clearing cache


Processing Epoch 00:  47%|████▋     | 8551/18148 [7:28:13<8:24:43,  3.16s/it]


 Epoch 1, step: 8550, loss: 1.970109462738037, total loss: 20534.227538704872
clearing cache


Processing Epoch 00:  47%|████▋     | 8561/18148 [7:28:45<8:29:16,  3.19s/it]


 Epoch 1, step: 8560, loss: 2.7908518314361572, total loss: 20559.344556331635
clearing cache


Processing Epoch 00:  47%|████▋     | 8571/18148 [7:29:16<8:26:15,  3.17s/it]


 Epoch 1, step: 8570, loss: 1.9383997917175293, total loss: 20582.498047590256
clearing cache


Processing Epoch 00:  47%|████▋     | 8581/18148 [7:29:46<8:25:22,  3.17s/it]


 Epoch 1, step: 8580, loss: 2.147397041320801, total loss: 20604.322937846184
clearing cache


Processing Epoch 00:  47%|████▋     | 8591/18148 [7:30:17<8:25:17,  3.17s/it]


 Epoch 1, step: 8590, loss: 2.0401413440704346, total loss: 20627.469371914864
clearing cache


Processing Epoch 00:  47%|████▋     | 8601/18148 [7:30:48<8:24:41,  3.17s/it]


 Epoch 1, step: 8600, loss: 2.2512617111206055, total loss: 20651.634582281113
clearing cache


Processing Epoch 00:  47%|████▋     | 8611/18148 [7:31:19<8:25:37,  3.18s/it]


 Epoch 1, step: 8610, loss: 2.054892063140869, total loss: 20674.192289114
clearing cache


Processing Epoch 00:  48%|████▊     | 8621/18148 [7:31:50<8:22:25,  3.16s/it]


 Epoch 1, step: 8620, loss: 2.16609787940979, total loss: 20694.78110229969
clearing cache


Processing Epoch 00:  48%|████▊     | 8631/18148 [7:32:21<8:22:07,  3.17s/it]


 Epoch 1, step: 8630, loss: 2.1779747009277344, total loss: 20719.170233249664
clearing cache


Processing Epoch 00:  48%|████▊     | 8641/18148 [7:32:52<8:24:29,  3.18s/it]


 Epoch 1, step: 8640, loss: 2.852278709411621, total loss: 20743.28934788704
clearing cache


Processing Epoch 00:  48%|████▊     | 8651/18148 [7:33:23<8:22:59,  3.18s/it]


 Epoch 1, step: 8650, loss: 2.776125431060791, total loss: 20767.941595435143
clearing cache


Processing Epoch 00:  48%|████▊     | 8661/18148 [7:33:54<8:21:47,  3.17s/it]


 Epoch 1, step: 8660, loss: 2.1857335567474365, total loss: 20790.802763819695
clearing cache


Processing Epoch 00:  48%|████▊     | 8666/18148 [7:34:10<8:16:55,  3.14s/it]


KeyboardInterrupt: 

In [None]:
import shutil
from IPython.display import FileLink, display

# Packaging locations: the directory that gets archived and the archive
# itself both sit directly under WORKING_DIR (the archive is written next
# to, not inside, the directory it packs).
WORKING_DIR = '/kaggle/working'
ARCHIVE_NAME = 'fine_tuned_model1.tar.gz'
FINE_TUNED_DIR = os.path.join(WORKING_DIR, 'fine_tuned')
ARCHIVE_PATH = os.path.join(WORKING_DIR, ARCHIVE_NAME)

def create_downloadable_archive():
    """Pack the fine-tuned model directory into a .tar.gz and show a download link.

    Reads the module-level ``FINE_TUNED_DIR`` (directory to pack) and
    ``ARCHIVE_PATH`` (where the ``.tar.gz`` is written).

    Prints an error and creates nothing when ``FINE_TUNED_DIR`` is missing or
    empty. Otherwise writes the archive, prints the path reported by
    ``shutil.make_archive`` and displays an IPython ``FileLink`` to it.

    Returns:
        None
    """
    if not os.path.exists(FINE_TUNED_DIR):
        print(f"Error: {FINE_TUNED_DIR} does not exist.")
        return

    # The notebook's setup cell creates this directory before training starts,
    # so its mere existence does not mean a model was ever saved into it.
    # Refuse to produce (and advertise) an empty archive.
    if os.path.isdir(FINE_TUNED_DIR) and not os.listdir(FINE_TUNED_DIR):
        print(f"Error: {FINE_TUNED_DIR} is empty; nothing to archive.")
        return

    # make_archive appends the format's extension itself, so hand it
    # ARCHIVE_PATH without the '.tar.gz' suffix instead of repeating the
    # archive name as a second literal.
    suffix = '.tar.gz'
    if ARCHIVE_PATH.endswith(suffix):
        base_name = ARCHIVE_PATH[:-len(suffix)]
    else:
        base_name = ARCHIVE_PATH

    # Use the path make_archive reports so the message and the link always
    # point at the file that was actually written.
    archive_path = shutil.make_archive(base_name, 'gztar', FINE_TUNED_DIR)
    print(f"Archive created: {archive_path}")

    # FileLink builds the link target directly from the path it is given, so
    # pass a path relative to the current working directory rather than an
    # absolute filesystem path.
    # NOTE(review): assumes the notebook is served from the current working
    # directory -- confirm on the target platform.
    display(FileLink(
        os.path.relpath(archive_path),
        result_html_prefix="Click here to download the fine-tuned model: ",
    ))



In [None]:
# Package the saved model for download. Run this only after training has
# finished and the model has been written to FINE_TUNED_DIR.
create_downloadable_archive()

# Optional: uncomment the two lines below to list the archive's contents
# for verification.
# print("\nContents of the archive:")
# !tar -tvf {ARCHIVE_PATH}

Archive created: /kaggle/working/fine_tuned_model1.tar.gz
