VERSION-1

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
from torch.optim.lr_scheduler import OneCycleLR
from tqdm import tqdm
import time

# Seed PyTorch's global RNG so that weight initialisation, DataLoader shuffling
# and the torch-based augmentations are repeatable across "Restart & Run All".
# This is best-effort: some GPU kernels may still be non-deterministic.
SEED = 42
torch.manual_seed(SEED)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


print(f"PyTorch Version: {torch.__version__}")
# Later cells wrap the model in torch.compile only when this flag is True.
use_compile = hasattr(torch, 'compile')

Using device: cuda
PyTorch Version: 2.8.0+cu126


In [9]:
# Single source of truth for every tunable value used by the cells below.
config = {
    # --- input / task ---
    "image_size": 32,
    "in_channels": 3,
    "num_classes": 10,

    # --- model ---
    "patch_size": 4,
    "embed_dim": 512,
    "depth": 6,
    "heads": 8,
    "mlp_dim": 1024,
    "dropout": 0.1,

    # --- optimisation ---
    "learning_rate": 1e-3,
    "weight_decay": 5e-5,
    "batch_size": 256,
    "epochs": 50,
}

# Fail fast on inconsistent geometry: the floor division below would otherwise
# silently ignore the leftover border pixels, and multi-head attention needs
# the embedding width to split evenly across heads.
assert config["image_size"] % config["patch_size"] == 0, \
    "image_size must be divisible by patch_size"
assert config["embed_dim"] % config["heads"] == 0, \
    "embed_dim must be divisible by heads"

# Number of non-overlapping patches per image (patches per side, squared).
config["num_patches"] = (config["image_size"] // config["patch_size"]) ** 2

In [10]:

# Per-channel mean / std handed to Normalize in both the train and test pipelines.
_norm_mean = (0.4914, 0.4822, 0.4465)
_norm_std = (0.2023, 0.1994, 0.2010)

# Training pipeline: crop / flip / RandAugment run before ToTensor,
# normalisation and random erasing run on the resulting tensor.
_train_steps = [
    transforms.RandomCrop(config["image_size"], padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.RandAugment(num_ops=2, magnitude=9),
    transforms.ToTensor(),
    transforms.Normalize(_norm_mean, _norm_std),
    transforms.RandomErasing(p=0.25, scale=(0.02, 0.33), ratio=(0.3, 3.3), value=0, inplace=False),
]
transform_train = transforms.Compose(_train_steps)

# Evaluation pipeline: no augmentation, only tensor conversion + normalisation.
_test_steps = [
    transforms.ToTensor(),
    transforms.Normalize(_norm_mean, _norm_std),
]
transform_test = transforms.Compose(_test_steps)

# Loader options shared by both splits; only `shuffle` differs.
_loader_kwargs = dict(batch_size=config["batch_size"], num_workers=4, pin_memory=True)

print("Downloading and setting up datasets...")
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform_train,
)
trainloader = DataLoader(trainset, shuffle=True, **_loader_kwargs)

testset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform_test,
)
testloader = DataLoader(testset, shuffle=False, **_loader_kwargs)
print("Data setup complete.")

Downloading and setting up datasets...




Data setup complete.


In [11]:
class PatchEmbedding(nn.Module):
    """Convert an image batch into a token sequence.

    Each non-overlapping patch is linearly embedded by a strided convolution,
    a learnable class token is prepended, and a learnable positional
    embedding is added to every token.

    Args:
        config: Mapping that provides "patch_size", "embed_dim",
            "in_channels" and "num_patches".
    """

    def __init__(self, config):
        super().__init__()
        patch, width = config["patch_size"], config["embed_dim"]
        self.patch_size = patch
        self.embed_dim = width
        # kernel == stride == patch size, so every patch is projected exactly once.
        self.projection = nn.Conv2d(config["in_channels"], width, kernel_size=patch, stride=patch)
        self.cls_token = nn.Parameter(torch.randn(1, 1, width))
        # One position per patch plus one for the class token.
        seq_len = config["num_patches"] + 1
        self.positional_embedding = nn.Parameter(torch.randn(1, seq_len, width))

    def forward(self, x):
        """Return the embedded sequence with the class token at index 0 of dim 1."""
        feature_map = self.projection(x)
        # Collapse the spatial grid and move the channel axis last: (B, tokens, embed_dim).
        tokens = feature_map.flatten(2).transpose(1, 2)
        batch = tokens.shape[0]
        cls = self.cls_token.expand(batch, -1, -1)
        sequence = torch.cat((cls, tokens), dim=1)
        return sequence + self.positional_embedding

class TransformerEncoder(nn.Module):
    """Pre-norm Transformer encoder block.

    Applies LayerNorm -> multi-head self-attention and LayerNorm -> MLP, each
    wrapped in a residual connection. Inputs and outputs are batch-first:
    (batch, tokens, embed_dim).

    Args:
        embed_dim: Width of every token embedding.
        heads: Number of attention heads.
        mlp_dim: Hidden width of the feed-forward MLP.
        dropout: Dropout probability used inside attention and after each
            MLP stage.
    """

    def __init__(self, embed_dim, heads, mlp_dim, dropout):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attention = nn.MultiheadAttention(embed_dim, heads, dropout=dropout, batch_first=True)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_dim, embed_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        # Normalise once and reuse the same tensor for query, key and value:
        # this avoids two redundant LayerNorm passes and lets MultiheadAttention
        # recognise the call as self-attention.
        normed = self.norm1(x)
        # The attention weights are never used, so don't ask for them.
        attn_out, _ = self.attention(normed, normed, normed, need_weights=False)
        x = x + attn_out
        x = x + self.mlp(self.norm2(x))
        return x

class VisionTransformer(nn.Module):
    """Vision Transformer classifier assembled from the blocks defined above.

    Pipeline: patch embedding -> ``config["depth"]`` encoder blocks ->
    LayerNorm on the class token -> linear classification head.

    Args:
        config: Mapping with "embed_dim", "heads", "mlp_dim", "dropout",
            "depth" and "num_classes", plus whatever PatchEmbedding reads.
    """

    def __init__(self, config):
        super().__init__()
        width = config["embed_dim"]
        self.patch_embedding = PatchEmbedding(config)

        blocks = []
        for _ in range(config["depth"]):
            blocks.append(
                TransformerEncoder(width, config["heads"], config["mlp_dim"], config["dropout"])
            )
        self.encoder_layers = nn.ModuleList(blocks)

        self.norm = nn.LayerNorm(width)
        self.classifier = nn.Linear(width, config["num_classes"])

    def forward(self, x):
        """Return class logits of shape (batch, num_classes)."""
        tokens = self.patch_embedding(x)
        for block in self.encoder_layers:
            tokens = block(tokens)
        # Only the token at sequence index 0 (the class token) feeds the head.
        cls_repr = tokens[:, 0]
        logits = self.classifier(self.norm(cls_repr))
        return logits

In [12]:
# Build the model, loss, optimizer, LR schedule and AMP gradient scaler.
# NOTE(review): the training cell further down re-creates all of these objects,
# so this cell is redundant when the notebook is run top to bottom.
print("Initializing model...")
model = VisionTransformer(config).to(device)

if use_compile:
    print("Using torch.compile for model optimization.")
    model = torch.compile(model)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=config["learning_rate"], weight_decay=config["weight_decay"])
# OneCycleLR is sized in optimizer steps (batches per epoch * epochs), so it must be stepped once per batch.
scheduler = OneCycleLR(optimizer, max_lr=config["learning_rate"], steps_per_epoch=len(trainloader), epochs=config["epochs"])
# torch.cuda.amp.GradScaler() is deprecated and emits a FutureWarning;
# torch.amp.GradScaler takes the device type explicitly.
scaler = torch.amp.GradScaler(device.type)

print("Model and training components initialized.")

Initializing model...
Using torch.compile for model optimization.
Model and training components initialized.


  scaler = GradScaler()


In [15]:
def train(epoch):
    """Run one mixed-precision training epoch over ``trainloader``.

    Uses the notebook-level ``model``, ``criterion``, ``optimizer``,
    ``scheduler``, ``scaler``, ``trainloader``, ``device`` and ``config``.
    Running loss, accuracy and learning rate are shown on the progress bar.

    Args:
        epoch: Zero-based epoch index, used only for the progress-bar label.
    """
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    progress_bar = tqdm(trainloader, desc=f"Epoch {epoch+1}/{config['epochs']} [Train]")

    for batch_idx, (inputs, targets) in enumerate(progress_bar):
        # The loader pins host memory; non_blocking=True lets the host-to-device
        # copy run asynchronously instead of stalling the host on every batch.
        inputs = inputs.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)
        optimizer.zero_grad()

        # Forward pass and loss are computed under float16 autocast.
        with torch.amp.autocast(device_type=device.type, dtype=torch.float16):
            outputs = model(inputs)
            loss = criterion(outputs, targets)

        # Backward on the scaled loss, then let the scaler drive the optimizer step.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # The scheduler is configured per batch, so it advances every iteration.
        scheduler.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        progress_bar.set_postfix(
            loss=f'{train_loss/(batch_idx+1):.3f}',
            acc=f'{100.*correct/total:.2f}%',
            lr=f'{scheduler.get_last_lr()[0]:.1e}'
        )

def test(epoch):
    """Evaluate ``model`` on ``testloader`` and track the best accuracy so far.

    Uses the notebook-level ``model``, ``criterion``, ``testloader``,
    ``device`` and ``config``; reads and updates the global ``best_acc``,
    which must be defined before the first call.

    Args:
        epoch: Zero-based epoch index, used only for the progress-bar label.

    Returns:
        Accuracy over the whole loader, in percent.
    """
    global best_acc
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    progress_bar = tqdm(testloader, desc=f"Epoch {epoch+1}/{config['epochs']} [Test]")
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(progress_bar):
            # Pinned-memory batches can be copied asynchronously.
            inputs = inputs.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

            progress_bar.set_postfix(
                loss=f'{test_loss/(batch_idx+1):.3f}',
                acc=f'{100.*correct/total:.2f}%'
            )

    acc = 100. * correct / total
    if acc > best_acc:
        # No checkpoint is written here, so the message must not claim one is saved.
        print(f'New best accuracy: {acc:.2f}%.')
        best_acc = acc

    return acc

In [17]:


# Build a fresh model and all training-side objects, then run the full schedule.
print("Initializing model for training...")
model = VisionTransformer(config).to(device)

if use_compile:
    print("Using torch.compile for model optimization.")
    model = torch.compile(model)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=config["learning_rate"], weight_decay=config["weight_decay"])
# Sized in optimizer steps; train() advances it once per batch.
scheduler = OneCycleLR(optimizer, max_lr=config["learning_rate"], steps_per_epoch=len(trainloader), epochs=config["epochs"])

# torch.cuda.amp.GradScaler() is deprecated and emits a FutureWarning;
# torch.amp.GradScaler takes the device type explicitly.
scaler = torch.amp.GradScaler(device.type)
print("Model and training components initialized.")


# test() reads and updates this global, so it has to exist before the loop starts.
best_acc = 0.0
start_time = time.time()
print("\nStarting training...")

for epoch in range(config['epochs']):
    train(epoch)
    current_acc = test(epoch)

end_time = time.time()
total_time = end_time - start_time

print("\n--- Training Finished ---")
print(f"Total Training Time: {total_time/60:.2f} minutes")
print(f"Best Test Accuracy: {best_acc:.2f}%")

  scaler = GradScaler()


Initializing model for training...
Using torch.compile for model optimization.
Model and training components initialized.

Starting training...


Epoch 1/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.36it/s, acc=22.56%, loss=2.090, lr=5.0e-05]
Epoch 1/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.18it/s, acc=32.38%, loss=1.839]


New best accuracy: 32.38%. Saving model...


Epoch 2/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.36it/s, acc=32.39%, loss=1.868, lr=8.2e-05]
Epoch 2/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.17it/s, acc=46.22%, loss=1.522]


New best accuracy: 46.22%. Saving model...


Epoch 3/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.40it/s, acc=37.65%, loss=1.721, lr=1.3e-04]
Epoch 3/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.16it/s, acc=50.70%, loss=1.393]


New best accuracy: 50.70%. Saving model...


Epoch 4/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.42it/s, acc=41.03%, loss=1.623, lr=2.0e-04]
Epoch 4/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.39it/s, acc=52.27%, loss=1.337]


New best accuracy: 52.27%. Saving model...


Epoch 5/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.38it/s, acc=43.87%, loss=1.551, lr=2.8e-04]
Epoch 5/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.43it/s, acc=54.66%, loss=1.250]


New best accuracy: 54.66%. Saving model...


Epoch 6/50 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.29it/s, acc=45.64%, loss=1.503, lr=3.7e-04]
Epoch 6/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.37it/s, acc=55.00%, loss=1.229]


New best accuracy: 55.00%. Saving model...


Epoch 7/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.36it/s, acc=47.10%, loss=1.463, lr=4.7e-04]
Epoch 7/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.31it/s, acc=53.86%, loss=1.257]
Epoch 8/50 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.35it/s, acc=47.88%, loss=1.446, lr=5.7e-04]
Epoch 8/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.41it/s, acc=55.73%, loss=1.199]


New best accuracy: 55.73%. Saving model...


Epoch 9/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.40it/s, acc=48.59%, loss=1.427, lr=6.7e-04]
Epoch 9/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.38it/s, acc=58.98%, loss=1.155]


New best accuracy: 58.98%. Saving model...


Epoch 10/50 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.34it/s, acc=49.13%, loss=1.418, lr=7.6e-04]
Epoch 10/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.50it/s, acc=57.29%, loss=1.194]
Epoch 11/50 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.35it/s, acc=48.78%, loss=1.422, lr=8.4e-04]
Epoch 11/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.41it/s, acc=57.72%, loss=1.162]
Epoch 12/50 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.33it/s, acc=48.51%, loss=1.431, lr=9.1e-04]
Epoch 12/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.37it/s, acc=59.71%, loss=1.126]


New best accuracy: 59.71%. Saving model...


Epoch 13/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.37it/s, acc=49.11%, loss=1.413, lr=9.6e-04]
Epoch 13/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.42it/s, acc=57.97%, loss=1.182]
Epoch 14/50 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.31it/s, acc=49.56%, loss=1.392, lr=9.9e-04]
Epoch 14/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.33it/s, acc=55.89%, loss=1.197]
Epoch 15/50 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.31it/s, acc=51.20%, loss=1.359, lr=1.0e-03]
Epoch 15/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.48it/s, acc=58.92%, loss=1.137]
Epoch 16/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.36it/s, acc=52.20%, loss=1.327, lr=1.0e-03]
Epoch 16/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.36it/s, acc=61.45%, loss=1.093]


New best accuracy: 61.45%. Saving model...


Epoch 17/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.38it/s, acc=54.01%, loss=1.279, lr=9.9e-04]
Epoch 17/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.39it/s, acc=63.14%, loss=1.026]


New best accuracy: 63.14%. Saving model...


Epoch 18/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.39it/s, acc=55.14%, loss=1.252, lr=9.8e-04]
Epoch 18/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.51it/s, acc=64.64%, loss=0.984]


New best accuracy: 64.64%. Saving model...


Epoch 19/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.39it/s, acc=56.30%, loss=1.217, lr=9.7e-04]
Epoch 19/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.46it/s, acc=66.15%, loss=0.951]


New best accuracy: 66.15%. Saving model...


Epoch 20/50 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.30it/s, acc=58.06%, loss=1.171, lr=9.5e-04]
Epoch 20/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.42it/s, acc=66.15%, loss=0.961]
Epoch 21/50 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.28it/s, acc=59.58%, loss=1.131, lr=9.3e-04]
Epoch 21/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.41it/s, acc=68.95%, loss=0.868]


New best accuracy: 68.95%. Saving model...


Epoch 22/50 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.34it/s, acc=60.35%, loss=1.108, lr=9.0e-04]
Epoch 22/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.47it/s, acc=68.97%, loss=0.860]


New best accuracy: 68.97%. Saving model...


Epoch 23/50 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.30it/s, acc=61.38%, loss=1.080, lr=8.8e-04]
Epoch 23/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.42it/s, acc=69.77%, loss=0.846]


New best accuracy: 69.77%. Saving model...


Epoch 24/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.38it/s, acc=62.37%, loss=1.051, lr=8.5e-04]
Epoch 24/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.47it/s, acc=70.01%, loss=0.834]


New best accuracy: 70.01%. Saving model...


Epoch 25/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.45it/s, acc=63.71%, loss=1.024, lr=8.1e-04]
Epoch 25/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.25it/s, acc=70.63%, loss=0.829]


New best accuracy: 70.63%. Saving model...


Epoch 26/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.41it/s, acc=65.08%, loss=0.985, lr=7.8e-04]
Epoch 26/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.18it/s, acc=72.31%, loss=0.786]


New best accuracy: 72.31%. Saving model...


Epoch 27/50 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.50it/s, acc=65.68%, loss=0.964, lr=7.4e-04]
Epoch 27/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.50it/s, acc=73.90%, loss=0.736]


New best accuracy: 73.90%. Saving model...


Epoch 28/50 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.54it/s, acc=66.94%, loss=0.932, lr=7.0e-04]
Epoch 28/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.33it/s, acc=74.06%, loss=0.725]


New best accuracy: 74.06%. Saving model...


Epoch 29/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.40it/s, acc=68.06%, loss=0.907, lr=6.5e-04]
Epoch 29/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.19it/s, acc=75.43%, loss=0.693]


New best accuracy: 75.43%. Saving model...


Epoch 30/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.44it/s, acc=68.59%, loss=0.890, lr=6.1e-04]
Epoch 30/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.39it/s, acc=76.40%, loss=0.667]


New best accuracy: 76.40%. Saving model...


Epoch 31/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.39it/s, acc=69.72%, loss=0.859, lr=5.7e-04]
Epoch 31/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.41it/s, acc=76.94%, loss=0.638]


New best accuracy: 76.94%. Saving model...


Epoch 32/50 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.34it/s, acc=70.56%, loss=0.832, lr=5.2e-04]
Epoch 32/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.47it/s, acc=76.95%, loss=0.649]


New best accuracy: 76.95%. Saving model...


Epoch 33/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.37it/s, acc=71.12%, loss=0.813, lr=4.8e-04]
Epoch 33/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.48it/s, acc=76.95%, loss=0.642]
Epoch 34/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.36it/s, acc=72.21%, loss=0.789, lr=4.3e-04]
Epoch 34/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.37it/s, acc=79.71%, loss=0.588]


New best accuracy: 79.71%. Saving model...


Epoch 35/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.44it/s, acc=72.99%, loss=0.767, lr=3.9e-04]
Epoch 35/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.18it/s, acc=79.24%, loss=0.589]
Epoch 36/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.45it/s, acc=73.61%, loss=0.750, lr=3.5e-04]
Epoch 36/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.13it/s, acc=79.79%, loss=0.574]


New best accuracy: 79.79%. Saving model...


Epoch 37/50 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.28it/s, acc=74.22%, loss=0.726, lr=3.0e-04]
Epoch 37/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.01it/s, acc=80.10%, loss=0.560]


New best accuracy: 80.10%. Saving model...


Epoch 38/50 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.33it/s, acc=75.34%, loss=0.697, lr=2.6e-04]
Epoch 38/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.15it/s, acc=81.46%, loss=0.533]


New best accuracy: 81.46%. Saving model...


Epoch 39/50 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.27it/s, acc=75.57%, loss=0.689, lr=2.2e-04]
Epoch 39/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.15it/s, acc=81.51%, loss=0.526]


New best accuracy: 81.51%. Saving model...


Epoch 40/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.40it/s, acc=76.39%, loss=0.663, lr=1.9e-04]
Epoch 40/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.36it/s, acc=81.61%, loss=0.522]


New best accuracy: 81.61%. Saving model...


Epoch 41/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.37it/s, acc=77.12%, loss=0.648, lr=1.5e-04]
Epoch 41/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.37it/s, acc=82.13%, loss=0.505]


New best accuracy: 82.13%. Saving model...


Epoch 42/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.38it/s, acc=77.56%, loss=0.631, lr=1.2e-04]
Epoch 42/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.45it/s, acc=82.52%, loss=0.503]


New best accuracy: 82.52%. Saving model...


Epoch 43/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.44it/s, acc=78.15%, loss=0.617, lr=9.5e-05]
Epoch 43/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.38it/s, acc=82.52%, loss=0.494]
Epoch 44/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.41it/s, acc=78.57%, loss=0.607, lr=7.1e-05]
Epoch 44/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.40it/s, acc=82.80%, loss=0.481]


New best accuracy: 82.80%. Saving model...


Epoch 45/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.37it/s, acc=78.89%, loss=0.597, lr=4.9e-05]
Epoch 45/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.40it/s, acc=82.99%, loss=0.483]


New best accuracy: 82.99%. Saving model...


Epoch 46/50 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.26it/s, acc=79.11%, loss=0.588, lr=3.2e-05]
Epoch 46/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.42it/s, acc=83.12%, loss=0.481]


New best accuracy: 83.12%. Saving model...


Epoch 47/50 [Train]: 100%|██████████| 196/196 [00:46<00:00,  4.26it/s, acc=79.41%, loss=0.582, lr=1.8e-05]
Epoch 47/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.41it/s, acc=83.31%, loss=0.475]


New best accuracy: 83.31%. Saving model...


Epoch 48/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.36it/s, acc=79.75%, loss=0.573, lr=8.0e-06]
Epoch 48/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.41it/s, acc=83.39%, loss=0.475]


New best accuracy: 83.39%. Saving model...


Epoch 49/50 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.36it/s, acc=79.79%, loss=0.572, lr=2.0e-06]
Epoch 49/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.45it/s, acc=83.45%, loss=0.474]


New best accuracy: 83.45%. Saving model...


Epoch 50/50 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.34it/s, acc=79.68%, loss=0.571, lr=4.1e-09]
Epoch 50/50 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.43it/s, acc=83.47%, loss=0.474]

New best accuracy: 83.47%. Saving model...

--- Training Finished ---
Total Training Time: 42.66 minutes
Best Test Accuracy: 83.47%





VERSION-2

In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torchvision.ops import StochasticDepth
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
from torch.optim.lr_scheduler import OneCycleLR
from tqdm import tqdm
import time


# Seed PyTorch's global RNG so that weight initialisation, DataLoader shuffling
# and the torch-based augmentations are repeatable across "Restart & Run All".
# This is best-effort: some GPU kernels may still be non-deterministic.
SEED = 42
torch.manual_seed(SEED)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Later cells wrap the model in torch.compile only when this flag is True.
use_compile = hasattr(torch, 'compile')
print(f"PyTorch Version: {torch.__version__}, using torch.compile: {use_compile}")

Using device: cuda
PyTorch Version: 2.8.0+cu126, using torch.compile: True


In [19]:
# Single source of truth for every tunable value used by the cells below.
config = {
    # --- input / task ---
    "image_size": 32,
    "in_channels": 3,
    "num_classes": 10,

    # --- model ---
    "patch_size": 4,
    "embed_dim": 512,
    "depth": 6,
    "heads": 8,
    "mlp_dim": 1024,
    "dropout": 0.1,
    "stochastic_depth_prob": 0.1,

    # --- optimisation ---
    "learning_rate": 1e-3,
    "weight_decay": 5e-5,
    "batch_size": 256,
    "epochs": 100,
    "label_smoothing": 0.1,
}

# Fail fast on inconsistent geometry: the floor division below would otherwise
# silently ignore the leftover border pixels, and multi-head attention needs
# the embedding width to split evenly across heads.
assert config["image_size"] % config["patch_size"] == 0, \
    "image_size must be divisible by patch_size"
assert config["embed_dim"] % config["heads"] == 0, \
    "embed_dim must be divisible by heads"

# Number of non-overlapping patches per image (patches per side, squared).
config["num_patches"] = (config["image_size"] // config["patch_size"]) ** 2

In [20]:

# Per-channel mean / std handed to Normalize in both the train and test pipelines.
_norm_mean = (0.4914, 0.4822, 0.4465)
_norm_std = (0.2023, 0.1994, 0.2010)

# Training pipeline: crop / flip / RandAugment run before ToTensor,
# normalisation and random erasing run on the resulting tensor.
_train_steps = [
    transforms.RandomCrop(config["image_size"], padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.RandAugment(num_ops=2, magnitude=9),
    transforms.ToTensor(),
    transforms.Normalize(_norm_mean, _norm_std),
    transforms.RandomErasing(p=0.25),
]
transform_train = transforms.Compose(_train_steps)

# Evaluation pipeline: no augmentation, only tensor conversion + normalisation.
_test_steps = [
    transforms.ToTensor(),
    transforms.Normalize(_norm_mean, _norm_std),
]
transform_test = transforms.Compose(_test_steps)

# Loader options shared by both splits; only `shuffle` differs.
_loader_kwargs = dict(batch_size=config["batch_size"], num_workers=4, pin_memory=True)

print("Downloading and setting up datasets...")
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform_train,
)
trainloader = DataLoader(trainset, shuffle=True, **_loader_kwargs)

testset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform_test,
)
testloader = DataLoader(testset, shuffle=False, **_loader_kwargs)
print("Data setup complete.")

Downloading and setting up datasets...
Data setup complete.


In [21]:
class PatchEmbedding(nn.Module):
    """Convert an image batch into a token sequence.

    Each non-overlapping patch is linearly embedded by a strided convolution,
    a learnable class token is prepended, and a learnable positional
    embedding is added to every token.

    Args:
        config: Mapping that provides "patch_size", "embed_dim",
            "in_channels" and "num_patches".
    """

    def __init__(self, config):
        super().__init__()
        patch, width = config["patch_size"], config["embed_dim"]
        self.patch_size = patch
        self.embed_dim = width
        # kernel == stride == patch size, so every patch is projected exactly once.
        self.projection = nn.Conv2d(config["in_channels"], width, kernel_size=patch, stride=patch)
        self.cls_token = nn.Parameter(torch.randn(1, 1, width))
        # One position per patch plus one for the class token.
        seq_len = config["num_patches"] + 1
        self.positional_embedding = nn.Parameter(torch.randn(1, seq_len, width))

    def forward(self, x):
        """Return the embedded sequence with the class token at index 0 of dim 1."""
        feature_map = self.projection(x)
        # Collapse the spatial grid and move the channel axis last: (B, tokens, embed_dim).
        tokens = feature_map.flatten(2).transpose(1, 2)
        batch = tokens.shape[0]
        cls = self.cls_token.expand(batch, -1, -1)
        sequence = torch.cat((cls, tokens), dim=1)
        return sequence + self.positional_embedding

class TransformerEncoder(nn.Module):
    """Pre-norm Transformer encoder block with stochastic depth on both branches.

    Applies LayerNorm -> multi-head self-attention and LayerNorm -> MLP. Each
    branch output passes through ``StochasticDepth`` before being added back
    to the residual stream. Inputs and outputs are batch-first:
    (batch, tokens, embed_dim).

    Args:
        embed_dim: Width of every token embedding.
        heads: Number of attention heads.
        mlp_dim: Hidden width of the feed-forward MLP.
        dropout: Dropout probability used inside attention and after each
            MLP stage.
        stochastic_depth_prob: Probability passed to ``StochasticDepth``.
    """

    def __init__(self, embed_dim, heads, mlp_dim, dropout, stochastic_depth_prob):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attention = nn.MultiheadAttention(embed_dim, heads, dropout=dropout, batch_first=True)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_dim, embed_dim),
            nn.Dropout(dropout)
        )
        # The same module instance is applied to both residual branches in forward().
        self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        # Normalise once and reuse the same tensor for query, key and value:
        # this avoids two redundant LayerNorm passes and lets MultiheadAttention
        # recognise the call as self-attention.
        normed = self.norm1(x)
        # The attention weights are never used, so don't ask for them.
        attn_output = self.attention(normed, normed, normed, need_weights=False)[0]
        x = x + self.stochastic_depth(attn_output)
        mlp_output = self.mlp(self.norm2(x))
        x = x + self.stochastic_depth(mlp_output)
        return x

class VisionTransformer(nn.Module):
    """Vision Transformer classifier assembled from the blocks defined above.

    Pipeline: patch embedding -> ``config["depth"]`` encoder blocks ->
    LayerNorm on the class token -> linear classification head.

    Args:
        config: Mapping with "embed_dim", "heads", "mlp_dim", "dropout",
            "stochastic_depth_prob", "depth" and "num_classes", plus whatever
            PatchEmbedding reads.
    """

    def __init__(self, config):
        super().__init__()
        width = config["embed_dim"]
        self.patch_embedding = PatchEmbedding(config)

        # Every block is built with the same hyperparameters.
        blocks = []
        for _ in range(config["depth"]):
            blocks.append(
                TransformerEncoder(
                    width,
                    config["heads"],
                    config["mlp_dim"],
                    config["dropout"],
                    config["stochastic_depth_prob"],
                )
            )
        self.encoder_layers = nn.ModuleList(blocks)

        self.norm = nn.LayerNorm(width)
        self.classifier = nn.Linear(width, config["num_classes"])

    def forward(self, x):
        """Return class logits of shape (batch, num_classes)."""
        tokens = self.patch_embedding(x)
        for block in self.encoder_layers:
            tokens = block(tokens)
        # Only the token at sequence index 0 (the class token) feeds the head.
        cls_repr = tokens[:, 0]
        logits = self.classifier(self.norm(cls_repr))
        return logits

In [22]:


def train(epoch):
    """Run one mixed-precision training epoch over ``trainloader``.

    Uses the notebook-level ``model``, ``criterion``, ``optimizer``,
    ``scheduler``, ``scaler``, ``trainloader``, ``device`` and ``config``.
    Running loss, accuracy and learning rate are shown on the progress bar.

    Args:
        epoch: Zero-based epoch index, used only for the progress-bar label.
    """
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    progress_bar = tqdm(trainloader, desc=f"Epoch {epoch+1}/{config['epochs']} [Train]")

    for batch_idx, (inputs, targets) in enumerate(progress_bar):
        # The loader pins host memory; non_blocking=True lets the host-to-device
        # copy run asynchronously instead of stalling the host on every batch.
        inputs = inputs.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)
        optimizer.zero_grad()

        # Forward pass and loss are computed under float16 autocast.
        with torch.amp.autocast(device_type=device.type, dtype=torch.float16):
            outputs = model(inputs)
            loss = criterion(outputs, targets)

        # Backward on the scaled loss, then let the scaler drive the optimizer step.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # The scheduler is configured per batch, so it advances every iteration.
        scheduler.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        progress_bar.set_postfix(
            loss=f'{train_loss/(batch_idx+1):.3f}',
            acc=f'{100.*correct/total:.2f}%',
            lr=f'{scheduler.get_last_lr()[0]:.1e}'
        )

def test(epoch):
    """Evaluate ``model`` on ``testloader`` and track the best accuracy so far.

    Uses the notebook globals ``model``, ``testloader``, ``criterion``,
    ``device`` and ``config``, and updates the global ``best_acc`` when this
    epoch's accuracy exceeds it. Evaluation runs under ``torch.no_grad()``
    and without autocast.

    Args:
        epoch: Zero-based epoch index; only used for the progress-bar label.

    Returns:
        Accuracy in percent over all evaluated samples, or ``0.0`` when the
        loader yields no samples.
    """
    global best_acc
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    progress_bar = tqdm(testloader, desc=f"Epoch {epoch+1}/{config['epochs']} [Test]")
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(progress_bar):
            # non_blocking lets the copy overlap with compute when the source
            # tensor is in pinned memory; it is ignored otherwise.
            inputs = inputs.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

            progress_bar.set_postfix(
                loss=f'{test_loss/(batch_idx+1):.3f}',
                acc=f'{100.*correct/total:.2f}%'
            )

    # Guard the empty-loader case instead of dividing by zero.
    acc = 100. * correct / total if total else 0.0
    if acc > best_acc:
        print(f'New best accuracy: {acc:.2f}%.')
        best_acc = acc

    return acc

In [23]:
# Training cell: builds the model and optimisation components as globals
# (train() and test() read them by name), then runs the full epoch loop.
print("Initializing model for training...")
model = VisionTransformer(config).to(device)

# use_compile is set in the imports cell: True when this PyTorch build
# exposes torch.compile.
if use_compile:
    print("Using torch.compile for model optimization.")
    model = torch.compile(model)

# Cross-entropy with the label-smoothing factor taken from the config.
criterion = nn.CrossEntropyLoss(label_smoothing=config["label_smoothing"])

# AdamW over all model parameters. The one-cycle schedule is sized as
# epochs * batches-per-epoch, and train() steps it inside its batch loop.
optimizer = optim.AdamW(model.parameters(), lr=config["learning_rate"], weight_decay=config["weight_decay"])
scheduler = OneCycleLR(optimizer, max_lr=config["learning_rate"], steps_per_epoch=len(trainloader), epochs=config["epochs"])
# Gradient scaler used together with the float16 autocast region in train().
scaler = torch.amp.GradScaler(device.type)

print("Model and training components initialized.")


# best_acc is updated by test() through its `global best_acc` declaration.
best_acc = 0.0
start_time = time.time()
print("\nStarting training...")

for epoch in range(config['epochs']):
    train(epoch)
    # Accuracy of the latest epoch; not used further in this cell.
    current_acc = test(epoch)

# The measured wall-clock time covers both the training and evaluation passes.
end_time = time.time()
total_time = end_time - start_time

print("\n--- Training Finished ---")
print(f"Total Training Time: {total_time/60:.2f} minutes")
print(f"Best Test Accuracy: {best_acc:.2f}%")

Initializing model for training...
Using torch.compile for model optimization.
Model and training components initialized.

Starting training...


Epoch 1/100 [Train]: 100%|██████████| 196/196 [01:51<00:00,  1.75it/s, acc=22.74%, loss=2.133, lr=4.3e-05]
Epoch 1/100 [Test]: 100%|██████████| 40/40 [00:12<00:00,  3.30it/s, acc=33.34%, loss=1.897]


New best accuracy: 33.34%.


Epoch 2/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.46it/s, acc=31.20%, loss=1.977, lr=5.0e-05]
Epoch 2/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.44it/s, acc=42.55%, loss=1.756]


New best accuracy: 42.55%.


Epoch 3/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.44it/s, acc=35.97%, loss=1.887, lr=6.4e-05]
Epoch 3/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.34it/s, acc=48.47%, loss=1.630]


New best accuracy: 48.47%.


Epoch 4/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.47it/s, acc=38.81%, loss=1.826, lr=8.2e-05]
Epoch 4/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.41it/s, acc=51.22%, loss=1.584]


New best accuracy: 51.22%.


Epoch 5/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.41it/s, acc=41.24%, loss=1.777, lr=1.0e-04]
Epoch 5/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.31it/s, acc=53.35%, loss=1.525]


New best accuracy: 53.35%.


Epoch 6/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.42it/s, acc=42.92%, loss=1.741, lr=1.3e-04]
Epoch 6/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.34it/s, acc=54.19%, loss=1.503]


New best accuracy: 54.19%.


Epoch 7/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.40it/s, acc=44.60%, loss=1.703, lr=1.6e-04]
Epoch 7/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.44it/s, acc=55.31%, loss=1.464]


New best accuracy: 55.31%.


Epoch 8/100 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.34it/s, acc=46.02%, loss=1.672, lr=2.0e-04]
Epoch 8/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.44it/s, acc=56.60%, loss=1.457]


New best accuracy: 56.60%.


Epoch 9/100 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.34it/s, acc=47.17%, loss=1.651, lr=2.4e-04]
Epoch 9/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.40it/s, acc=57.20%, loss=1.437]


New best accuracy: 57.20%.


Epoch 10/100 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.33it/s, acc=48.35%, loss=1.625, lr=2.8e-04]
Epoch 10/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.36it/s, acc=57.94%, loss=1.433]


New best accuracy: 57.94%.


Epoch 11/100 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.29it/s, acc=49.31%, loss=1.603, lr=3.2e-04]
Epoch 11/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.43it/s, acc=58.47%, loss=1.387]


New best accuracy: 58.47%.


Epoch 12/100 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.29it/s, acc=49.82%, loss=1.594, lr=3.7e-04]
Epoch 12/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.35it/s, acc=60.31%, loss=1.364]


New best accuracy: 60.31%.


Epoch 13/100 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.28it/s, acc=50.22%, loss=1.580, lr=4.2e-04]
Epoch 13/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.40it/s, acc=60.25%, loss=1.371]
Epoch 14/100 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.31it/s, acc=50.94%, loss=1.567, lr=4.7e-04]
Epoch 14/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.39it/s, acc=58.45%, loss=1.383]
Epoch 15/100 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.29it/s, acc=51.30%, loss=1.559, lr=5.2e-04]
Epoch 15/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.40it/s, acc=59.32%, loss=1.376]
Epoch 16/100 [Train]: 100%|██████████| 196/196 [00:46<00:00,  4.21it/s, acc=51.59%, loss=1.556, lr=5.7e-04]
Epoch 16/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.46it/s, acc=63.31%, loss=1.302]


New best accuracy: 63.31%.


Epoch 17/100 [Train]: 100%|██████████| 196/196 [00:46<00:00,  4.19it/s, acc=52.05%, loss=1.543, lr=6.2e-04]
Epoch 17/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.41it/s, acc=62.25%, loss=1.329]
Epoch 18/100 [Train]: 100%|██████████| 196/196 [00:46<00:00,  4.26it/s, acc=52.15%, loss=1.541, lr=6.7e-04]
Epoch 18/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.41it/s, acc=62.79%, loss=1.321]
Epoch 19/100 [Train]: 100%|██████████| 196/196 [00:46<00:00,  4.26it/s, acc=52.50%, loss=1.538, lr=7.2e-04]
Epoch 19/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.41it/s, acc=62.25%, loss=1.313]
Epoch 20/100 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.31it/s, acc=53.03%, loss=1.528, lr=7.6e-04]
Epoch 20/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.35it/s, acc=62.15%, loss=1.329]
Epoch 21/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.36it/s, acc=52.46%, loss=1.533, lr=8.0e-04]
Epoch 21/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.49it/s, acc=62.50%, loss=1.

New best accuracy: 66.25%.


Epoch 24/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.52it/s, acc=54.97%, loss=1.488, lr=9.1e-04]
Epoch 24/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.25it/s, acc=66.36%, loss=1.246]


New best accuracy: 66.36%.


Epoch 25/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.48it/s, acc=55.76%, loss=1.466, lr=9.4e-04]
Epoch 25/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.42it/s, acc=66.68%, loss=1.241]


New best accuracy: 66.68%.


Epoch 26/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.46it/s, acc=56.57%, loss=1.450, lr=9.6e-04]
Epoch 26/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.35it/s, acc=67.14%, loss=1.221]


New best accuracy: 67.14%.


Epoch 27/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.43it/s, acc=58.17%, loss=1.427, lr=9.8e-04]
Epoch 27/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.38it/s, acc=69.05%, loss=1.175]


New best accuracy: 69.05%.


Epoch 28/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.42it/s, acc=58.74%, loss=1.409, lr=9.9e-04]
Epoch 28/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.48it/s, acc=70.33%, loss=1.153]


New best accuracy: 70.33%.


Epoch 29/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.40it/s, acc=59.68%, loss=1.385, lr=1.0e-03]
Epoch 29/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.48it/s, acc=70.64%, loss=1.148]


New best accuracy: 70.64%.


Epoch 30/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.38it/s, acc=60.51%, loss=1.367, lr=1.0e-03]
Epoch 30/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.41it/s, acc=70.68%, loss=1.141]


New best accuracy: 70.68%.


Epoch 31/100 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.35it/s, acc=61.80%, loss=1.345, lr=1.0e-03]
Epoch 31/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.42it/s, acc=71.04%, loss=1.127]


New best accuracy: 71.04%.


Epoch 32/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.38it/s, acc=62.82%, loss=1.327, lr=1.0e-03]
Epoch 32/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.43it/s, acc=73.00%, loss=1.112]


New best accuracy: 73.00%.


Epoch 33/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.44it/s, acc=63.14%, loss=1.311, lr=1.0e-03]
Epoch 33/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.37it/s, acc=72.00%, loss=1.113]
Epoch 34/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.49it/s, acc=63.99%, loss=1.299, lr=9.9e-04]
Epoch 34/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.12it/s, acc=73.80%, loss=1.080]


New best accuracy: 73.80%.


Epoch 35/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.45it/s, acc=64.71%, loss=1.285, lr=9.9e-04]
Epoch 35/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.18it/s, acc=75.76%, loss=1.046]


New best accuracy: 75.76%.


Epoch 36/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.48it/s, acc=65.45%, loss=1.265, lr=9.8e-04]
Epoch 36/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.45it/s, acc=75.11%, loss=1.059]
Epoch 37/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.47it/s, acc=65.86%, loss=1.257, lr=9.8e-04]
Epoch 37/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.42it/s, acc=75.84%, loss=1.023]


New best accuracy: 75.84%.


Epoch 38/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.46it/s, acc=66.28%, loss=1.245, lr=9.7e-04]
Epoch 38/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.35it/s, acc=75.94%, loss=1.033]


New best accuracy: 75.94%.


Epoch 39/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.44it/s, acc=67.01%, loss=1.232, lr=9.6e-04]
Epoch 39/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.41it/s, acc=75.28%, loss=1.040]
Epoch 40/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.43it/s, acc=67.73%, loss=1.217, lr=9.5e-04]
Epoch 40/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.40it/s, acc=76.75%, loss=1.026]


New best accuracy: 76.75%.


Epoch 41/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.40it/s, acc=68.36%, loss=1.207, lr=9.4e-04]
Epoch 41/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.46it/s, acc=77.20%, loss=1.011]


New best accuracy: 77.20%.


Epoch 42/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.37it/s, acc=68.68%, loss=1.195, lr=9.3e-04]
Epoch 42/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.43it/s, acc=78.91%, loss=0.973]


New best accuracy: 78.91%.


Epoch 43/100 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.30it/s, acc=69.03%, loss=1.191, lr=9.2e-04]
Epoch 43/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.48it/s, acc=77.10%, loss=1.003]
Epoch 44/100 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.31it/s, acc=69.80%, loss=1.173, lr=9.0e-04]
Epoch 44/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.42it/s, acc=78.96%, loss=0.969]


New best accuracy: 78.96%.


Epoch 45/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.38it/s, acc=70.26%, loss=1.163, lr=8.9e-04]
Epoch 45/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.41it/s, acc=80.30%, loss=0.949]


New best accuracy: 80.30%.


Epoch 46/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.38it/s, acc=70.65%, loss=1.155, lr=8.8e-04]
Epoch 46/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.50it/s, acc=79.81%, loss=0.941]
Epoch 47/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.40it/s, acc=71.05%, loss=1.146, lr=8.6e-04]
Epoch 47/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.37it/s, acc=79.61%, loss=0.953]
Epoch 48/100 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.34it/s, acc=71.33%, loss=1.137, lr=8.5e-04]
Epoch 48/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.33it/s, acc=80.22%, loss=0.944]
Epoch 49/100 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.34it/s, acc=71.73%, loss=1.126, lr=8.3e-04]
Epoch 49/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.13it/s, acc=80.11%, loss=0.943]
Epoch 50/100 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.34it/s, acc=72.38%, loss=1.117, lr=8.1e-04]
Epoch 50/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.12it/s, acc=81.54%, loss=0.

New best accuracy: 81.54%.


Epoch 51/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.38it/s, acc=72.60%, loss=1.109, lr=7.9e-04]
Epoch 51/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.04it/s, acc=81.29%, loss=0.914]
Epoch 52/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.36it/s, acc=73.27%, loss=1.099, lr=7.8e-04]
Epoch 52/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.06it/s, acc=80.80%, loss=0.925]
Epoch 53/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.43it/s, acc=73.77%, loss=1.088, lr=7.6e-04]
Epoch 53/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.36it/s, acc=81.14%, loss=0.924]
Epoch 54/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.45it/s, acc=73.94%, loss=1.085, lr=7.4e-04]
Epoch 54/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.34it/s, acc=81.66%, loss=0.915]


New best accuracy: 81.66%.


Epoch 55/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.40it/s, acc=74.44%, loss=1.072, lr=7.2e-04]
Epoch 55/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.40it/s, acc=82.51%, loss=0.889]


New best accuracy: 82.51%.


Epoch 56/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.44it/s, acc=74.62%, loss=1.066, lr=7.0e-04]
Epoch 56/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.45it/s, acc=83.00%, loss=0.881]


New best accuracy: 83.00%.


Epoch 57/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.42it/s, acc=75.17%, loss=1.057, lr=6.8e-04]
Epoch 57/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.41it/s, acc=82.21%, loss=0.891]
Epoch 58/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.37it/s, acc=75.73%, loss=1.042, lr=6.5e-04]
Epoch 58/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.38it/s, acc=83.07%, loss=0.878]


New best accuracy: 83.07%.


Epoch 59/100 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.35it/s, acc=76.27%, loss=1.035, lr=6.3e-04]
Epoch 59/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.39it/s, acc=84.19%, loss=0.862]


New best accuracy: 84.19%.


Epoch 60/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.38it/s, acc=76.18%, loss=1.031, lr=6.1e-04]
Epoch 60/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.37it/s, acc=82.99%, loss=0.886]
Epoch 61/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.40it/s, acc=76.98%, loss=1.017, lr=5.9e-04]
Epoch 61/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.41it/s, acc=84.30%, loss=0.857]


New best accuracy: 84.30%.


Epoch 62/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.40it/s, acc=77.12%, loss=1.012, lr=5.7e-04]
Epoch 62/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.39it/s, acc=84.40%, loss=0.857]


New best accuracy: 84.40%.


Epoch 63/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.48it/s, acc=77.59%, loss=1.004, lr=5.4e-04]
Epoch 63/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.20it/s, acc=84.01%, loss=0.853]
Epoch 64/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.47it/s, acc=77.88%, loss=0.998, lr=5.2e-04]
Epoch 64/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.14it/s, acc=84.67%, loss=0.843]


New best accuracy: 84.67%.


Epoch 65/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.45it/s, acc=78.36%, loss=0.986, lr=5.0e-04]
Epoch 65/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.31it/s, acc=84.71%, loss=0.845]


New best accuracy: 84.71%.


Epoch 66/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.50it/s, acc=78.56%, loss=0.979, lr=4.8e-04]
Epoch 66/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.38it/s, acc=85.05%, loss=0.836]


New best accuracy: 85.05%.


Epoch 67/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.49it/s, acc=79.12%, loss=0.971, lr=4.6e-04]
Epoch 67/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.40it/s, acc=85.19%, loss=0.829]


New best accuracy: 85.19%.


Epoch 68/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.47it/s, acc=79.45%, loss=0.964, lr=4.3e-04]
Epoch 68/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.34it/s, acc=85.22%, loss=0.832]


New best accuracy: 85.22%.


Epoch 69/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.43it/s, acc=79.51%, loss=0.958, lr=4.1e-04]
Epoch 69/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.41it/s, acc=85.94%, loss=0.822]


New best accuracy: 85.94%.


Epoch 70/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.44it/s, acc=79.94%, loss=0.949, lr=3.9e-04]
Epoch 70/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.42it/s, acc=85.97%, loss=0.821]


New best accuracy: 85.97%.


Epoch 71/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.42it/s, acc=80.39%, loss=0.942, lr=3.7e-04]
Epoch 71/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.45it/s, acc=86.00%, loss=0.818]


New best accuracy: 86.00%.


Epoch 72/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.48it/s, acc=80.75%, loss=0.932, lr=3.5e-04]
Epoch 72/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.24it/s, acc=85.90%, loss=0.821]
Epoch 73/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.54it/s, acc=81.04%, loss=0.928, lr=3.2e-04]
Epoch 73/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.38it/s, acc=86.46%, loss=0.812]


New best accuracy: 86.46%.


Epoch 74/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.53it/s, acc=81.29%, loss=0.923, lr=3.0e-04]
Epoch 74/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.38it/s, acc=86.60%, loss=0.807]


New best accuracy: 86.60%.


Epoch 75/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.49it/s, acc=81.40%, loss=0.915, lr=2.8e-04]
Epoch 75/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.36it/s, acc=87.12%, loss=0.800]


New best accuracy: 87.12%.


Epoch 76/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.46it/s, acc=82.13%, loss=0.906, lr=2.6e-04]
Epoch 76/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.36it/s, acc=86.97%, loss=0.796]
Epoch 77/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.42it/s, acc=82.02%, loss=0.905, lr=2.4e-04]
Epoch 77/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.41it/s, acc=87.08%, loss=0.794]
Epoch 78/100 [Train]: 100%|██████████| 196/196 [00:47<00:00,  4.09it/s, acc=82.58%, loss=0.896, lr=2.2e-04]
Epoch 78/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.31it/s, acc=87.39%, loss=0.791]


New best accuracy: 87.39%.


Epoch 79/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.39it/s, acc=83.09%, loss=0.889, lr=2.1e-04]
Epoch 79/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.32it/s, acc=87.43%, loss=0.789]


New best accuracy: 87.43%.


Epoch 80/100 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.30it/s, acc=83.11%, loss=0.884, lr=1.9e-04]
Epoch 80/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.35it/s, acc=87.59%, loss=0.781]


New best accuracy: 87.59%.


Epoch 81/100 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.30it/s, acc=83.43%, loss=0.877, lr=1.7e-04]
Epoch 81/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.40it/s, acc=87.56%, loss=0.786]
Epoch 82/100 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.27it/s, acc=83.62%, loss=0.872, lr=1.5e-04]
Epoch 82/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.36it/s, acc=87.46%, loss=0.786]
Epoch 83/100 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.32it/s, acc=83.82%, loss=0.866, lr=1.4e-04]
Epoch 83/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.42it/s, acc=87.61%, loss=0.783]


New best accuracy: 87.61%.


Epoch 84/100 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.32it/s, acc=83.95%, loss=0.863, lr=1.2e-04]
Epoch 84/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.40it/s, acc=87.94%, loss=0.778]


New best accuracy: 87.94%.


Epoch 85/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.39it/s, acc=84.23%, loss=0.862, lr=1.1e-04]
Epoch 85/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.41it/s, acc=87.77%, loss=0.781]
Epoch 86/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.46it/s, acc=84.47%, loss=0.855, lr=9.5e-05]
Epoch 86/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.25it/s, acc=87.78%, loss=0.778]
Epoch 87/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.54it/s, acc=84.54%, loss=0.849, lr=8.3e-05]
Epoch 87/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.32it/s, acc=87.82%, loss=0.777]
Epoch 88/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.51it/s, acc=84.78%, loss=0.848, lr=7.1e-05]
Epoch 88/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.40it/s, acc=88.16%, loss=0.773]


New best accuracy: 88.16%.


Epoch 89/100 [Train]: 100%|██████████| 196/196 [00:43<00:00,  4.54it/s, acc=84.95%, loss=0.844, lr=6.0e-05]
Epoch 89/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.30it/s, acc=88.37%, loss=0.770]


New best accuracy: 88.37%.


Epoch 90/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.39it/s, acc=85.11%, loss=0.840, lr=4.9e-05]
Epoch 90/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.40it/s, acc=88.25%, loss=0.770]
Epoch 91/100 [Train]: 100%|██████████| 196/196 [00:45<00:00,  4.28it/s, acc=85.20%, loss=0.838, lr=4.0e-05]
Epoch 91/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.38it/s, acc=88.25%, loss=0.770]
Epoch 92/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.37it/s, acc=85.22%, loss=0.838, lr=3.2e-05]
Epoch 92/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.42it/s, acc=88.26%, loss=0.769]
Epoch 93/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.41it/s, acc=85.28%, loss=0.835, lr=2.4e-05]
Epoch 93/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.37it/s, acc=88.22%, loss=0.767]
Epoch 94/100 [Train]: 100%|██████████| 196/196 [00:44<00:00,  4.41it/s, acc=85.50%, loss=0.830, lr=1.8e-05]
Epoch 94/100 [Test]: 100%|██████████| 40/40 [00:06<00:00,  6.55it/s, acc=88.26%, loss=0.


--- Training Finished ---
Total Training Time: 86.02 minutes
Best Test Accuracy: 88.37%



