In [1]:
!pip install -U portalocker>=2.0.0

You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


## Imports

In [1]:
import torch
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch import nn
import time
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

import torch
import transformer_engine.pytorch as te
from transformer_engine.common import recipe
import numpy as np
# from transformer_engine.pytorch import Float8Tensor, E4M3, tensor_to_scale
from transformer_engine.pytorch.fp8 import get_global_fp8_buffer
from matplotlib import pyplot as plt
# Seed the global torch and NumPy RNGs so repeated runs start from the same random state.
torch.manual_seed(0)
np.random.seed(0)

someone called API registrations


## Datasets/dataloaders

In [2]:
# Tokenizer used both to build the vocabulary and to encode each sample in `text_pipeline`.
tokenizer = get_tokenizer("basic_english")
# Training split only; it is consumed once by `yield_tokens` to build the vocabulary.
train_iter = AG_NEWS(split="train")

def yield_tokens(data_iter):
    """Lazily yield the tokenized text of every sample in ``data_iter``.

    Each entry of ``data_iter`` is unpacked as a 2-item pair; the first item is
    ignored and the second is passed to the module-level ``tokenizer``.
    """
    yield from (tokenizer(sample_text) for _label, sample_text in data_iter)

# Build the vocabulary from the tokenized training split; tokens that are not in the
# vocabulary resolve to the index of the "<unk>" special.
vocab = build_vocab_from_iterator(
    yield_tokens(train_iter),
    specials=["<unk>"],
)
vocab.set_default_index(vocab["<unk>"])


def text_pipeline(x):
    """Tokenize the raw text ``x`` and map the tokens to vocabulary indices."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Convert a raw label to a zero-based integer class id."""
    return int(x) - 1


# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def collate_batch(batch):
    """Merge a list of ``(label, text)`` samples into the flat tensors the models consume.

    Returns a ``(labels, tokens, offsets)`` tuple, each moved to ``device``:
      labels  -- int64 tensor holding one ``label_pipeline`` result per sample
      tokens  -- the ``text_pipeline`` ids of all samples concatenated into one int64 tensor
      offsets -- start index of each sample inside ``tokens``
    """
    labels, token_chunks, chunk_sizes = [], [], []
    for raw_label, raw_text in batch:
        labels.append(label_pipeline(raw_label))
        ids = torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        token_chunks.append(ids)
        chunk_sizes.append(ids.size(0))

    label_tensor = torch.tensor(labels, dtype=torch.int64)
    # Exclusive prefix sum of the sample lengths: a leading 0, last length dropped.
    starts = torch.tensor(([0] + chunk_sizes)[:-1]).cumsum(dim=0)
    tokens = torch.cat(token_chunks)
    return label_tensor.to(device), tokens.to(device), starts.to(device)

def dataloader_repeater(dataloader):
    """Iterate over ``dataloader`` endlessly, restarting it whenever it is exhausted.

    Yields ``(c, batch)`` pairs, where ``c`` is a 1-based counter that keeps increasing
    across restarts (it is NOT reset at the start of each pass).

    Raises:
        ValueError: if a full pass over ``dataloader`` produces no batch at all. Without
            this check the ``while True`` loop would spin forever without yielding.
    """
    c = 0
    while True:
        produced_any = False
        for batch in dataloader:
            produced_any = True
            c += 1
            yield c, batch
        if not produced_any:
            raise ValueError("dataloader yielded no batches; cannot repeat it")

# Hyperparameters
# Data-derived class count, kept for reference but intentionally disabled:
# num_class = len(set([label for (label, text) in train_iter]))
num_class = 16 # deliberately fixed at 16 (instead of the data-derived count) to stay compatible with fp8 intrinsics
vocab_size = len(vocab)
emsize = 16  # embedding dimension passed to both models
EPOCHS = 1  # epoch
LR = 1  # learning rate
BATCH_SIZE = 64  # batch size for training
criterion = torch.nn.CrossEntropyLoss()
            
# Create datasets/dataloaders
# Fresh iterators are requested here; the earlier `train_iter` was consumed while building the vocab.
train_iter, test_iter = AG_NEWS()
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
# Hold out 5% of the training data for validation.
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(
    train_dataset, [num_train, len(train_dataset) - num_train]
)

# shuffle=False: batches are not reshuffled between passes.
train_dataloader = DataLoader(
    split_train_, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)
valid_dataloader = DataLoader(
    split_valid_, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)

# Wrap the training dataloader so it restarts automatically instead of stopping after one pass.
# NOTE: this rebinds `train_dataloader` to a generator of (counter, batch) pairs.
train_dataloader = dataloader_repeater(train_dataloader)

## Define hyperparams, models

In [3]:
class TextClassificationModel(nn.Module):
    """FP32 baseline classifier: an ``nn.EmbeddingBag`` followed by a bias-free ``nn.Linear``."""

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        # Keep the attribute order (embedding, then fc): the training loop pairs this
        # model's parameters positionally with those of the TE model.
        self.embedding = nn.EmbeddingBag(
            num_embeddings=vocab_size, embedding_dim=embed_dim, sparse=False
        )
        self.fc = nn.Linear(in_features=embed_dim, out_features=num_class, bias=False)

    def forward(self, text, offsets):
        """Pool the embeddings of each bag in ``text`` (bags delimited by ``offsets``) and classify."""
        pooled = self.embedding(text, offsets)
        logits = self.fc(pooled)
        return logits
    
class TETextClassificationModel(nn.Module):
    """Transformer Engine variant: same ``nn.EmbeddingBag``, with a ``te.Linear`` classifier.

    The linear layer is created with ``primary_weights_in_fp8=True`` and float32
    ``params_dtype``; the layer names mirror ``TextClassificationModel``.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(
            num_embeddings=vocab_size, embedding_dim=embed_dim, sparse=False
        )
        self.fc = te.Linear(
            embed_dim,
            num_class,
            bias=False,
            params_dtype=torch.float32,
            primary_weights_in_fp8=True,
        )

    def forward(self, text, offsets, is_first_microbatch=None):
        """Pool the embeddings of each bag and classify; ``is_first_microbatch`` is forwarded to ``te.Linear``."""
        pooled = self.embedding(text, offsets)
        return self.fc(pooled, is_first_microbatch=is_first_microbatch)

## FP32 Model

In [4]:
# FP32 baseline model; the training loop also reads its weights when computing the FP8 model's update.
fp_model = TextClassificationModel(vocab_size, emsize, num_class).to(device)
# This optimizer steps only `fp_model`; the FP8 model is updated by hand in the training loop.
optimizer = torch.optim.SGD(fp_model.parameters(), lr=LR)

## FP8 Model

In [5]:
# Delayed-scaling FP8 recipe (E4M3 format); the same recipe object is reused for the training forward pass.
fp8_recipe = recipe.DelayedScaling(margin=0, interval=1, fp8_format=recipe.Format.E4M3, reduce_amax=False)
# NOTE(review): the model is built inside fp8_autocast, presumably so the FP8 primary weights
# are initialised under this recipe — confirm against the Transformer Engine version in use.
with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
    te_model = TETextClassificationModel(vocab_size, emsize, num_class).to(device)

assigning weights in fp8
init fp8 meta tensors
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.detach.default
 func inplace: False
 mutable args: [False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.detach.default
 func inplace: False
 mutable args: [False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>]


## Replace FP8 model parameters with those of FP32 model (using `cast_to_fp8` internally)

In [6]:
# Copy the FP32 model's parameters into the TE model. strict=False is required because the
# FP32 state dict has no `fc._extra_state` entry (reported as a missing key in the result).
te_model.load_state_dict(fp_model.state_dict(), strict=False)

/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]


_IncompatibleKeys(missing_keys=['fc._extra_state'], unexpected_keys=[])

## Check that the FP8 model weights match the FP32 model weights

In [7]:
# Display the TE model's classifier weight (a Float8Tensor) for comparison with the FP32 weight.
te_model.state_dict()['fc.weight']

/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.detach.default
 func inplace: False
 mutable args: [False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.detach.default
 func inplace: False
 mutable args: [False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>]


Float8Tensor(flavor=0, scale=1.0, as_float32=tensor([[ 0.0234, -0.2031,  0.1875, -0.1562, -0.1719, -0.1250, -0.0078,  0.0117,
          0.2500, -0.2188, -0.1719,  0.1250,  0.0000,  0.0156,  0.0430, -0.0703],
        [ 0.1875, -0.1406,  0.0703,  0.1719, -0.2500, -0.1250,  0.2031, -0.2031,
         -0.0098, -0.1875, -0.0938,  0.0312,  0.0547,  0.2031, -0.1094,  0.0781],
        [-0.1719,  0.0254, -0.2031,  0.2344, -0.0430,  0.2031, -0.0430, -0.1250,
         -0.1875, -0.1719, -0.2031, -0.0176,  0.0625, -0.0273, -0.0254, -0.0547],
        [-0.1094, -0.1562,  0.1562,  0.1875,  0.2031, -0.2031, -0.2031,  0.1875,
          0.2344, -0.2344,  0.1719, -0.0781, -0.0547, -0.1250,  0.1250, -0.0508],
        [-0.0508,  0.1719, -0.0859, -0.1406, -0.0625,  0.1562,  0.1875,  0.1875,
         -0.1406,  0.0508,  0.1172, -0.1875, -0.0430,  0.2031,  0.1250,  0.1875],
        [ 0.0391,  0.2031, -0.1719,  0.1719, -0.1562,  0.0938,  0.2188,  0.1016,
          0.0859, -0.1875, -0.2500, -0.1250, -0.0859, -0.01

In [8]:
# Display the FP32 model's classifier weight; the FP8 copy should agree up to FP8 rounding.
fp_model.state_dict()['fc.weight']

tensor([[ 0.0234, -0.1999,  0.1872, -0.1597, -0.1755, -0.1288, -0.0085,  0.0118,
          0.2460, -0.2213, -0.1779,  0.1241,  0.0003,  0.0150,  0.0441, -0.0731],
        [ 0.1944, -0.1465,  0.0734,  0.1735, -0.2467, -0.1258,  0.2024, -0.2086,
         -0.0090, -0.1814, -0.0943,  0.0321,  0.0543,  0.1989, -0.1057,  0.0745],
        [-0.1687,  0.0255, -0.2074,  0.2293, -0.0430,  0.2026, -0.0426, -0.1269,
         -0.1805, -0.1756, -0.2082, -0.0169,  0.0656, -0.0278, -0.0249, -0.0528],
        [-0.1058, -0.1548,  0.1524,  0.1945,  0.2004, -0.1959, -0.2102,  0.1927,
          0.2396, -0.2326,  0.1673, -0.0798, -0.0557, -0.1319,  0.1229, -0.0510],
        [-0.0489,  0.1732, -0.0861, -0.1459, -0.0664,  0.1624,  0.1897,  0.1895,
         -0.1461,  0.0489,  0.1150, -0.1871, -0.0423,  0.1975,  0.1211,  0.1821],
        [ 0.0383,  0.2104, -0.1748,  0.1684, -0.1524,  0.0941,  0.2164,  0.1042,
          0.0894, -0.1916, -0.2436, -0.1283, -0.0893, -0.0147, -0.1092,  0.0966],
        [ 0.1184,  0.2

## Train FP32 and FP8 models together

This loosely mimics how a distributed optimizer maintains master weights. The FP32 model (`fp_model`) holds the FP32 master weights. `manual_opt_apply` combines those FP32 weights (from `fp_model`) with the FP32 gradients (from `te_model`) to compute the updated weights, which are then cast to FP8 and stored in the FP8 model.

In [10]:
def manual_opt_apply(weight, grad, lr=1.0):
    """
    Manually do one plain SGD step and return the updated weights.

    Args:
        weight: current (higher-precision) weights.
        grad: gradient to apply; must be broadcast-compatible with ``weight``.
        lr: learning rate. Defaults to 1.0, the value that was previously hard-coded;
            pass the optimizer's learning rate to keep both updates in sync.

    Returns:
        ``weight - lr * grad`` as a new value; neither input is modified in place.
    """
    return weight - lr * grad

# Running counters; they are reset after every log line, so with log_interval = 1
# each reported accuracy covers exactly one batch.
total_acc_fp, total_acc_te, total_count = 0, 0, 0
log_interval = 1
nbatches = 1000
# Number of batches in one full pass over the training split (ceil division). It is only
# used as the denominator in the progress line and replaces a hard-coded 1782, which is
# what this expression gives for a 114,000-sample split at batch size 64 — TODO confirm
# against your copy of the dataset.
batches_per_epoch = (len(split_train_) + BATCH_SIZE - 1) // BATCH_SIZE

for epoch in range(1, EPOCHS + 1):
    for _ in range(nbatches):
        # `idx` is the repeater's global 1-based batch counter.
        idx, (label, text, offsets) = next(train_dataloader)
        print(f"----------------------------batch: {idx}------------------------")

        optimizer.zero_grad()
        te_model.zero_grad()

        # Same batch through both models; only the TE model runs under FP8 autocast.
        predicted_label_fp = fp_model(text, offsets)
        with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
            predicted_label_te = te_model(text, offsets, is_first_microbatch=None)

        loss_fp = criterion(predicted_label_fp, label)
        loss_te = criterion(predicted_label_te, label)

        loss_fp.backward()
        loss_te.backward()

        # Update the TE model by hand: new weight = FP32 weight (from fp_model) minus the
        # TE model's gradient, copied into the TE parameter. This must run BEFORE
        # optimizer.step() so that the pre-step FP32 weights are used.
        with torch.no_grad():
            for p1, p2 in zip(te_model.parameters(), fp_model.parameters()):
                new_val = manual_opt_apply(p2, p1.grad)
                p1.copy_(new_val)

        # Regular SGD step for the FP32 model, using its own gradients.
        optimizer.step()

        total_acc_fp += (predicted_label_fp.argmax(1) == label).sum().item()
        total_acc_te += (predicted_label_te.argmax(1) == label).sum().item()

        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy_fp {:8.3f}, accuracy_te {:8.3f}".format(
                    epoch,
                    idx,
                    batches_per_epoch,
                    total_acc_fp / total_count,
                    total_acc_te / total_count,
                )
            )
            total_acc_fp, total_acc_te, total_count = 0, 0, 0

----------------------------batch: 1------------------------
doing amax and scale update
fp8.py:  tensor([0.0000, 0.2467, 0.0000], device='cuda:0')
fp8.py:  tensor([0., 0.], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |     1/ 1782 batches | accuracy_fp    0.047, accuracy_te    0.047
----------------------------batch: 2------------------------
doing amax and scale update
fp8.py:  tensor([0.7287, 0.2740, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0146, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transfor

/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |    20/ 1782 batches | accuracy_fp    0.297, accuracy_te    0.266
----------------------------batch: 21------------------------
doing amax and scale update
fp8.py:  tensor([0.9255, 0.8753, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0150, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/

fp8.py:  tensor([0.0151, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |    43/ 1782 batches | accuracy_fp    0.281, accuracy_te    0.281
----------------------------batch: 44------------------------
doing amax and scale update
fp8.py:  tensor([0.9255, 1.4716, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0151, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, Fa

----------------------------batch: 60------------------------
doing amax and scale update
fp8.py:  tensor([1.0631, 1.6988, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0152, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |    60/ 1782 batches | accuracy_fp    0.391, accuracy_te    0.391
----------------------------batch: 61------------------------
doing amax and scale update
fp8.py:  tensor([1.0631, 1.7400, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0152, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngin

fp8.py:  tensor([0.0152, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |    82/ 1782 batches | accuracy_fp    0.266, accuracy_te    0.281
----------------------------batch: 83------------------------
doing amax and scale update
fp8.py:  tensor([1.1092, 2.0282, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0154, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, Fa

/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |    99/ 1782 batches | accuracy_fp    0.281, accuracy_te    0.250
----------------------------batch: 100------------------------
doing amax and scale update
fp8.py:  tensor([1.1167, 2.1443, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0154, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine

fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   122/ 1782 batches | accuracy_fp    0.344, accuracy_te    0.328
----------------------------batch: 123------------------------
doing amax and scale update
fp8.py:  tensor([1.1720, 2.2282, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, F

fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   139/ 1782 batches | accuracy_fp    0.344, accuracy_te    0.328
----------------------------batch: 140------------------------
doing amax and scale update
fp8.py:  tensor([1.1720, 2.3559, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, F

fp8.py:  tensor([1.3238, 2.5127, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   162/ 1782 batches | accuracy_fp    0.375, accuracy_te    0.375
----------------------------batch: 163------------------------
doing amax and scale update
fp8.py:  tensor([1.3238, 2.5127, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten

fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   179/ 1782 batches | accuracy_fp    0.344, accuracy_te    0.312
----------------------------batch: 180------------------------
doing amax and scale update
fp8.py:  tensor([1.3238, 2.5549, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, F

----------------------------batch: 202------------------------
doing amax and scale update
fp8.py:  tensor([1.3238, 2.6317, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   202/ 1782 batches | accuracy_fp    0.469, accuracy_te    0.453
----------------------------batch: 203------------------------
doing amax and scale update
fp8.py:  tensor([1.3238, 2.6317, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEng

----------------------------batch: 219------------------------
doing amax and scale update
fp8.py:  tensor([1.3238, 2.6557, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   219/ 1782 batches | accuracy_fp    0.391, accuracy_te    0.375
----------------------------batch: 220------------------------
doing amax and scale update
fp8.py:  tensor([1.3238, 2.6557, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEng

----------------------------batch: 241------------------------
doing amax and scale update
fp8.py:  tensor([1.3238, 2.6960, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   241/ 1782 batches | accuracy_fp    0.422, accuracy_te    0.422
----------------------------batch: 242------------------------
doing amax and scale update
fp8.py:  tensor([1.3238, 2.6960, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEng

fp8.py:  tensor([1.3238, 2.7215, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   258/ 1782 batches | accuracy_fp    0.422, accuracy_te    0.406
----------------------------batch: 259------------------------
doing amax and scale update
fp8.py:  tensor([1.3238, 2.7215, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten

/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   280/ 1782 batches | accuracy_fp    0.422, accuracy_te    0.422
----------------------------batch: 281------------------------
doing amax and scale update
fp8.py:  tensor([1.3238, 2.7537, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine

----------------------------batch: 298------------------------
doing amax and scale update
fp8.py:  tensor([1.3238, 2.8354, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   298/ 1782 batches | accuracy_fp    0.484, accuracy_te    0.453
----------------------------batch: 299------------------------
doing amax and scale update
fp8.py:  tensor([1.3238, 2.8354, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEng

----------------------------batch: 320------------------------
doing amax and scale update
fp8.py:  tensor([1.3881, 2.9256, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   320/ 1782 batches | accuracy_fp    0.281, accuracy_te    0.312
----------------------------batch: 321------------------------
doing amax and scale update
fp8.py:  tensor([1.3881, 2.9256, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEng

/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   337/ 1782 batches | accuracy_fp    0.344, accuracy_te    0.297
----------------------------batch: 338------------------------
doing amax and scale update
fp8.py:  tensor([1.3881, 2.9333, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine

fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   359/ 1782 batches | accuracy_fp    0.391, accuracy_te    0.375
----------------------------batch: 360------------------------
doing amax and scale update
fp8.py:  tensor([1.3881, 2.9333, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, F

fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   377/ 1782 batches | accuracy_fp    0.469, accuracy_te    0.453
----------------------------batch: 378------------------------
doing amax and scale update
fp8.py:  tensor([1.3881, 2.9692, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, F

/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   399/ 1782 batches | accuracy_fp    0.453, accuracy_te    0.391
----------------------------batch: 400------------------------
doing amax and scale update
fp8.py:  tensor([1.3881, 3.0482, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine

/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   416/ 1782 batches | accuracy_fp    0.562, accuracy_te    0.578
----------------------------batch: 417------------------------
doing amax and scale update
fp8.py:  tensor([1.3881, 3.1068, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine

fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   438/ 1782 batches | accuracy_fp    0.375, accuracy_te    0.375
----------------------------batch: 439------------------------
doing amax and scale update
fp8.py:  tensor([1.5825, 3.1232, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, F

fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   456/ 1782 batches | accuracy_fp    0.281, accuracy_te    0.297
----------------------------batch: 457------------------------
doing amax and scale update
fp8.py:  tensor([1.5825, 3.1232, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, F

doing amax and scale update
fp8.py:  tensor([1.5825, 3.1547, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   478/ 1782 batches | accuracy_fp    0.562, accuracy_te    0.562
----------------------------batch: 479------------------------
doing amax and scale update
fp8.py:  tensor([1.5825, 3.1547, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tenso

fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   495/ 1782 batches | accuracy_fp    0.500, accuracy_te    0.453
----------------------------batch: 496------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.2331, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, F

fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   517/ 1782 batches | accuracy_fp    0.484, accuracy_te    0.516
----------------------------batch: 518------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.2331, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, F

fp8.py:  tensor([1.5972, 3.2331, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   535/ 1782 batches | accuracy_fp    0.438, accuracy_te    0.469
----------------------------batch: 536------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.2563, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten

fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   557/ 1782 batches | accuracy_fp    0.547, accuracy_te    0.547
----------------------------batch: 558------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.2563, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, F

----------------------------batch: 575------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.2880, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   575/ 1782 batches | accuracy_fp    0.359, accuracy_te    0.375
----------------------------batch: 576------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.2880, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEng

fp8.py:  tensor([1.5972, 3.3315, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   597/ 1782 batches | accuracy_fp    0.547, accuracy_te    0.516
----------------------------batch: 598------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.3315, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten

fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   614/ 1782 batches | accuracy_fp    0.469, accuracy_te    0.438
----------------------------batch: 615------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.3315, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, F

/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   636/ 1782 batches | accuracy_fp    0.500, accuracy_te    0.516
----------------------------batch: 637------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.3360, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine

----------------------------batch: 654------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.3360, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   654/ 1782 batches | accuracy_fp    0.500, accuracy_te    0.453
----------------------------batch: 655------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.3360, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEng

doing amax and scale update
fp8.py:  tensor([1.5972, 3.3417, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   676/ 1782 batches | accuracy_fp    0.469, accuracy_te    0.469
----------------------------batch: 677------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.3417, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tenso

fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   693/ 1782 batches | accuracy_fp    0.469, accuracy_te    0.422
----------------------------batch: 694------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.3953, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, F

fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   715/ 1782 batches | accuracy_fp    0.516, accuracy_te    0.531
----------------------------batch: 716------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.4432, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, F

fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   732/ 1782 batches | accuracy_fp    0.719, accuracy_te    0.719
----------------------------batch: 733------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.4595, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, F

fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   754/ 1782 batches | accuracy_fp    0.578, accuracy_te    0.562
----------------------------batch: 755------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.5109, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, F

doing amax and scale update
fp8.py:  tensor([1.5972, 3.5638, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   772/ 1782 batches | accuracy_fp    0.516, accuracy_te    0.516
----------------------------batch: 773------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.5773, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tenso

fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   794/ 1782 batches | accuracy_fp    0.531, accuracy_te    0.531
----------------------------batch: 795------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.6108, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, F

----------------------------batch: 812------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.6624, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   812/ 1782 batches | accuracy_fp    0.469, accuracy_te    0.453
----------------------------batch: 813------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.6624, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEng

fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   834/ 1782 batches | accuracy_fp    0.625, accuracy_te    0.625
----------------------------batch: 835------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.7253, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, F

fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   852/ 1782 batches | accuracy_fp    0.594, accuracy_te    0.578
----------------------------batch: 853------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.7446, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, F

fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   874/ 1782 batches | accuracy_fp    0.453, accuracy_te    0.438
----------------------------batch: 875------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.8089, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, F

doing amax and scale update
fp8.py:  tensor([1.5972, 3.8212, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   892/ 1782 batches | accuracy_fp    0.453, accuracy_te    0.484
----------------------------batch: 893------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.8212, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tenso

----------------------------batch: 914------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.8773, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   914/ 1782 batches | accuracy_fp    0.516, accuracy_te    0.531
----------------------------batch: 915------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.8773, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEng

fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   931/ 1782 batches | accuracy_fp    0.469, accuracy_te    0.469
----------------------------batch: 932------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.9010, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, F

fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   953/ 1782 batches | accuracy_fp    0.578, accuracy_te    0.594
----------------------------batch: 954------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.9118, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0155, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, F

| epoch   1 |   970/ 1782 batches | accuracy_fp    0.531, accuracy_te    0.531
----------------------------batch: 971------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.9723, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0156, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   971/ 1782 batches | accuracy_fp    0.562, accuracy_te    0.562
----------------------------batch: 972------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.9723, 0.0000], device='cuda:0')
fp8.py:

----------------------------batch: 993------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.9723, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0156, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:114 [Floa8Tensor Torch Dispatch] func: aten.copy_.default
 func inplace: True
 mutable args: [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:115 [False, False, False]
/perfhome/repos/2023/TransformerEngine/transformer_engine/pytorch/float8_tensor.py:116 [<class 'transformer_engine.pytorch.float8_tensor.Float8Tensor'>, <class 'torch.Tensor'>]
| epoch   1 |   993/ 1782 batches | accuracy_fp    0.641, accuracy_te    0.625
----------------------------batch: 994------------------------
doing amax and scale update
fp8.py:  tensor([1.5972, 3.9723, 0.0000], device='cuda:0')
fp8.py:  tensor([0.0156, 0.0000], device='cuda:0')
/perfhome/repos/2023/TransformerEng