In [1]:
import math
from typing import Tuple, Dict, Any, Optional, List
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torchvision import datasets, transforms
from torch.cuda.amp import autocast, GradScaler
import random
import time
from datetime import datetime
import copy

# ============================================================================
# GPU DEVICE SETUP FOR T4
# ============================================================================

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if device.type == 'cuda':
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

def print_gpu_utilization():
    if torch.cuda.is_available():
        print(f"GPU Memory Used: {torch.cuda.memory_allocated(0) / 1024**3:.1f} GB")
        print(f"GPU Memory Cached: {torch.cuda.memory_reserved(0) / 1024**3:.1f} GB")

# ============================================================================
# SOTA CONFIGURATION CONSTANTS - OPTIMIZED FOR T4 GPU
# ============================================================================

DATASET_NAMES = ['FashionMNIST']
HIDDEN_DIM: int = 2048 #512
LEARNING_RATE: float = 1e-3
weight_decay_ = 1e-4
DROPOUT_RATE: float = 0.4
EPOCHS_PER_DATASET: int = 100
BATCH_SIZE: int = 1024 #128  # Optimized for T4 GPU memory
ENSEMBLE_SIZE: int = 3

# ============================================================================
# ENHANCED DATA AUGMENTATION
# ============================================================================

class MultiAugmentDataset(Dataset):
    """Enhanced dataset with multiple augmentations"""
    def __init__(self, base_dataset, num_augments=2):
        self.base = base_dataset
        self.num_augments = num_augments

    def __len__(self):
        return len(self.base) * self.num_augments

    def __getitem__(self, idx):
        base_idx = idx // self.num_augments
        return self.base[base_idx]

class MixUp(object):
    """MixUp data augmentation"""
    def __init__(self, alpha=0.2):
        self.alpha = alpha

    def __call__(self, x, y):
        if self.alpha > 0:
            lam = np.random.beta(self.alpha, self.alpha)
        else:
            lam = 1

        batch_size = x.size(0)
        index = torch.randperm(batch_size).to(device)

        mixed_x = lam * x + (1 - lam) * x[index, :]
        y_a, y_b = y, y[index]
        return mixed_x, y_a, y_b, lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)

def get_sota_data_loaders(dataset_name: str, batch_size: int) -> Tuple[DataLoader, DataLoader, int, int]:
    """SOTA data loading with advanced augmentation"""

    if dataset_name == 'FashionMNIST':
        transform_train = transforms.Compose([
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandomRotation(15),
            transforms.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.9, 1.1)),
            transforms.RandomResizedCrop(28, scale=(0.8, 1.0)),
            transforms.ToTensor(),
            transforms.Normalize((0.5,), (0.5,)),
            transforms.RandomErasing(p=0.5, scale=(0.02, 0.33), ratio=(0.3, 3.3)),
        ])

        transform_test = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.5,), (0.5,))
        ])

        train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform_train)
        test_dataset = datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform_test)
        n_classes = 10
        n_channels = 1
        img_size = 28

    enhanced_train = MultiAugmentDataset(train_dataset, 2)

    train_loader = DataLoader(
        enhanced_train,
        batch_size=batch_size,
        shuffle=True,
        num_workers=2,
        pin_memory=True,
        drop_last=True,
        prefetch_factor=2,
        persistent_workers=True
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size * 2,
        shuffle=False,
        num_workers=2,
        pin_memory=True,
        drop_last=False,
        prefetch_factor=2,
        persistent_workers=True
    )

    input_dim = (n_channels, img_size, img_size)
    return train_loader, test_loader, input_dim, n_classes

# ============================================================================
# SOTA ARCHITECTURE COMPONENTS
# ============================================================================

class SEBlock(nn.Module):
    """Squeeze-and-Excitation Block"""
    def __init__(self, channel, reduction=16):
        super(SEBlock, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction, bias=False),
            nn.SiLU(inplace=True),
            nn.Linear(channel // reduction, channel, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        b, c, _, _ = x.size()
        y = self.avg_pool(x).view(b, c)
        y = self.fc(y).view(b, c, 1, 1)
        return x * y.expand_as(x)

class ResidualBlock(nn.Module):
    """Advanced Residual Block with SE attention"""
    def __init__(self, in_channels, out_channels, stride=1, use_se=True):
        super(ResidualBlock, self).__init__()

        self.conv1 = nn.Conv2d(in_channels, out_channels, 3, stride, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, 3, 1, 1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, 1, stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )

        self.se = SEBlock(out_channels) if use_se else None
        self.dropout = nn.Dropout2d(0.1)

    def forward(self, x):
        residual = x

        out = F.silu(self.bn1(self.conv1(x)))
        out = self.dropout(out)
        out = self.bn2(self.conv2(out))

        if self.se:
            out = self.se(out)

        out += self.shortcut(residual)
        out = F.silu(out)

        return out

class MultiScaleBlock(nn.Module):
    """Multi-scale feature extraction"""
    def __init__(self, in_channels, out_channels):
        super(MultiScaleBlock, self).__init__()

        self.branch1 = nn.Conv2d(in_channels, out_channels//4, 1, bias=False)
        self.branch2 = nn.Conv2d(in_channels, out_channels//4, 3, padding=1, bias=False)
        self.branch3 = nn.Conv2d(in_channels, out_channels//4, 5, padding=2, bias=False)
        self.branch4 = nn.Sequential(
            nn.MaxPool2d(3, stride=1, padding=1),
            nn.Conv2d(in_channels, out_channels//4, 1, bias=False)
        )

        self.bn = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        branch1 = self.branch1(x)
        branch2 = self.branch2(x)
        branch3 = self.branch3(x)
        branch4 = self.branch4(x)

        out = torch.cat([branch1, branch2, branch3, branch4], 1)
        return F.silu(self.bn(out))

# ============================================================================
# SOTA NEURAL NETWORK ARCHITECTURE
# ============================================================================

class SOTAFashionNet(nn.Module):
    """State-of-the-Art Architecture for FashionMNIST"""

    def __init__(self, input_dim: tuple, hidden_dim: int, output_dim: int, dropout_rate: float = 0.4):
        super(SOTAFashionNet, self).__init__()

        channels, height, width = input_dim

        self.initial_conv = nn.Sequential(
            nn.Conv2d(channels, 32, 3, padding=1, bias=False),
            nn.BatchNorm2d(32),
            nn.SiLU(inplace=True)
        )

        self.multiscale1 = MultiScaleBlock(32, 64)

        self.res_block1 = ResidualBlock(64, 64)
        self.res_block2 = ResidualBlock(64, 128, stride=2)
        self.res_block3 = ResidualBlock(128, 128)
        self.res_block4 = ResidualBlock(128, 256, stride=2)
        self.res_block5 = ResidualBlock(256, 256)
        self.res_block6 = ResidualBlock(256, 512, stride=2)

        self.global_avg_pool = nn.AdaptiveAvgPool2d(1)

        self.classifier = nn.Sequential(
            nn.Linear(512, hidden_dim),
            nn.SiLU(inplace=True),
            nn.Dropout(dropout_rate),

            nn.Linear(hidden_dim, hidden_dim//2),
            nn.SiLU(inplace=True),
            nn.Dropout(dropout_rate//2),

            nn.Linear(hidden_dim//2, hidden_dim//4),
            nn.SiLU(inplace=True),
            nn.Dropout(dropout_rate//4),

            nn.Linear(hidden_dim//4, output_dim)
        )

        self._initialize_weights()

    def _initialize_weights(self):
        """Advanced weight initialization"""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x = self.initial_conv(x)
        x = self.multiscale1(x)

        x = self.res_block1(x)
        x = self.res_block2(x)
        x = self.res_block3(x)
        x = self.res_block4(x)
        x = self.res_block5(x)
        x = self.res_block6(x)

        x = self.global_avg_pool(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)

        return x

# ============================================================================
# SOTA TRAINING WITH T4 GPU OPTIMIZATION
# ============================================================================

def train_sota_model(model: nn.Module, train_loader: DataLoader, criterion: nn.Module,
                    test_loader: DataLoader, optimizer: optim.Optimizer, scheduler,
                    num_epochs: int = 100, verbose: bool = True) -> float:

    model = model.to(device)
    scaler = GradScaler()  # Mixed precision training
    mixup = MixUp(alpha=0.2)
    best_accuracy = 0.0
    patience_counter = 0
    max_patience = 20

    for epoch in range(1, num_epochs + 1):
        model.train()
        total_loss = 0.0

        for batch_x, batch_y in train_loader:
            # Move data to GPU
            batch_x = batch_x.to(device, non_blocking=True)
            batch_y = batch_y.to(device, non_blocking=True)

            optimizer.zero_grad()

            # Mixed precision training
            with autocast():
                if epoch > 20 and np.random.random() < 0.3:
                    mixed_x, y_a, y_b, lam = mixup(batch_x, batch_y)
                    outputs = model(mixed_x)
                    loss = mixup_criterion(criterion, outputs, y_a, y_b, lam)
                else:
                    outputs = model(batch_x)
                    loss = criterion(outputs, batch_y)

            # Scaled backward pass
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()

            # Periodic GPU cache cleanup
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        scheduler.step()
        avg_loss = total_loss / len(train_loader)

        if epoch % 1 == 0:
            test_loss, test_acc = evaluate_model(model, test_loader, criterion)

            if verbose:
                current_time = datetime.now().strftime("%B %d, %Y at %I:%M:%S %p")
                print(f"Epoch {epoch}/{num_epochs}: Loss = {avg_loss:.4f} | "
                      f"Test Acc = {test_acc:.2f}% | LR = {scheduler.get_last_lr()[0]:.6f} | Time: {current_time}")
                print_gpu_utilization()

            if test_acc > best_accuracy:
                best_accuracy = test_acc
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= max_patience//2:
                print(f"Early stopping at epoch {epoch}")
                break

    return best_accuracy

@torch.no_grad()
def evaluate_model(model: nn.Module, test_loader: DataLoader, criterion: nn.Module) -> Tuple[float, float]:
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0

    for batch_x, batch_y in test_loader:
        # Move data to GPU
        batch_x = batch_x.to(device, non_blocking=True)
        batch_y = batch_y.to(device, non_blocking=True)

        with autocast():
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)

        total_loss += loss.item()
        preds = outputs.argmax(dim=1)
        correct += (preds == batch_y).sum().item()
        total += batch_y.size(0)

    avg_loss = total_loss / len(test_loader)
    accuracy = 100.0 * correct / total
    return avg_loss, accuracy

# ============================================================================
# ENSEMBLE TRAINING FOR T4 GPU
# ============================================================================

def train_ensemble_models(input_dim, hidden_dim, n_classes, train_loader, test_loader,
                         ensemble_size=3) -> Tuple[List[nn.Module], List[float]]:
    """Train multiple models for ensembling - T4 GPU optimized"""

    models = []
    best_accuracies = []

    for i in range(ensemble_size):
        print(f"\n{'='*60}")
        print(f"Training Ensemble Model {i+1}/{ensemble_size}")
        print(f"{'='*60}")

        model_hidden = hidden_dim + (i * 64)
        model = SOTAFashionNet(input_dim, model_hidden, n_classes, dropout_rate=DROPOUT_RATE + i*0.05)
        model = model.to(device)

        optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE * (0.9 + i*0.1),
                               weight_decay=weight_decay_, betas=(0.9, 0.999), eps=1e-8)

        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS_PER_DATASET, eta_min=1e-6)
        criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

        best_acc = train_sota_model(model, train_loader, criterion, test_loader, optimizer, scheduler,
                                   num_epochs=EPOCHS_PER_DATASET)

        models.append(model)
        best_accuracies.append(best_acc)

        print(f"Model {i+1} Best Accuracy: {best_acc:.2f}%")

        # Clear GPU memory between models
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return models, best_accuracies

@torch.no_grad()
def ensemble_predict(models: List[nn.Module], test_loader: DataLoader) -> float:
    """Make ensemble predictions"""
    for model in models:
        model.eval()

    correct = 0
    total = 0

    for batch_x, batch_y in test_loader:
        batch_x = batch_x.to(device, non_blocking=True)
        batch_y = batch_y.to(device, non_blocking=True)

        ensemble_outputs = []
        for model in models:
            with autocast():
                outputs = model(batch_x)
                ensemble_outputs.append(F.softmax(outputs, dim=1))

        avg_outputs = torch.stack(ensemble_outputs).mean(0)
        preds = avg_outputs.argmax(dim=1)

        correct += (preds == batch_y).sum().item()
        total += batch_y.size(0)

    accuracy = 100.0 * correct / total
    return accuracy

# ============================================================================
# MAIN SOTA EXPERIMENT PIPELINE
# ============================================================================

def run_sota_experiments():
    """Run SOTA experiments - T4 GPU optimized"""

    for dataset_name in DATASET_NAMES:
        print(f"\n{'='*80}")
        print(f"RUNNING SOTA EXPERIMENTS ON {dataset_name}")
        print(f"TARGET: 95%+ ACCURACY (T4 GPU OPTIMIZED)")
        print(f"{'='*80}")

        train_loader, test_loader, input_dim, n_classes = get_sota_data_loaders(dataset_name, BATCH_SIZE)
        print(f"Dataset: {dataset_name} | Input: {input_dim} | Classes: {n_classes}")
        print(f"Train: {len(train_loader.dataset)} | Test: {len(test_loader.dataset)}")
        print_gpu_utilization()

        models, individual_accuracies = train_ensemble_models(
            input_dim, HIDDEN_DIM, n_classes, train_loader, test_loader, ENSEMBLE_SIZE
        )

        print(f"\n{'='*60}")
        print("INDIVIDUAL MODEL RESULTS")
        print(f"{'='*60}")
        for i, acc in enumerate(individual_accuracies):
            print(f"Model {i+1}: {acc:.2f}%")

        print(f"\n{'='*60}")
        print("ENSEMBLE RESULTS")
        print(f"{'='*60}")

        ensemble_acc = ensemble_predict(models, test_loader)
        print(f"Ensemble Accuracy: {ensemble_acc:.2f}%")

        final_accuracy = max(max(individual_accuracies), ensemble_acc)
        print(f"\n{'='*60}")
        print("FINAL SOTA RESULTS")
        print(f"{'='*60}")
        print(f"Best Individual Model: {max(individual_accuracies):.2f}%")
        print(f"Ensemble Accuracy: {ensemble_acc:.2f}%")
        print(f"FINAL BEST ACCURACY: {final_accuracy:.2f}%")

        if final_accuracy >= 97.0:
            print(f"🎉 EXCELLENT! {final_accuracy:.2f}% ≥ 97%")
        elif final_accuracy >= 95.0:
            print(f"✅ VERY GOOD! {final_accuracy:.2f}% ≥ 95%")
        elif final_accuracy >= 92.0:
            print(f"🟡 GOOD! {final_accuracy:.2f}% ≥ 92%")
        else:
            print(f"⚠️ NEEDS IMPROVEMENT: {final_accuracy:.2f}%")

def main():
    SEED_ = 42
    print("="*80)
    print("T4 GPU-OPTIMIZED SOTA FASHIONMNIST IMPLEMENTATION")
    print("TARGET: 95%+ ACCURACY WITH ENSEMBLE")
    print("="*80)
    print(f"Device: {device}")
    print(f"Ensemble Size: {ENSEMBLE_SIZE}")
    print(f"Epochs per Model: {EPOCHS_PER_DATASET}")
    print(f"Hidden Dim: {HIDDEN_DIM}")
    print(f"Batch Size: {BATCH_SIZE}")
    print(f"Mixed Precision: Enabled")

    torch.manual_seed(SEED_)
    np.random.seed(SEED_)
    random.seed(SEED_)

    # Enable optimizations for T4 GPU
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = False

    run_sota_experiments()
    print("\n🎉 SOTA EXPERIMENTS COMPLETED!")



Using device: cuda
GPU Name: Tesla T4
GPU Memory: 14.7 GB


In [None]:
main()

T4 GPU-OPTIMIZED SOTA FASHIONMNIST IMPLEMENTATION
TARGET: 95%+ ACCURACY WITH ENSEMBLE
Device: cuda
Ensemble Size: 3
Epochs per Model: 100
Hidden Dim: 2048
Batch Size: 1024
Mixed Precision: Enabled

RUNNING SOTA EXPERIMENTS ON FashionMNIST
TARGET: 95%+ ACCURACY (T4 GPU OPTIMIZED)
Dataset: FashionMNIST | Input: (1, 28, 28) | Classes: 10
Train: 120000 | Test: 10000
GPU Memory Used: 0.0 GB
GPU Memory Cached: 0.0 GB

Training Ensemble Model 1/3


  scaler = GradScaler()  # Mixed precision training
  with autocast():
  with autocast():


Epoch 1/100: Loss = 1.1718 | Test Acc = 79.94% | LR = 0.000900 | Time: September 01, 2025 at 09:14:50 AM
GPU Memory Used: 0.2 GB
GPU Memory Cached: 0.5 GB
Epoch 2/100: Loss = 0.9198 | Test Acc = 86.24% | LR = 0.000899 | Time: September 01, 2025 at 09:16:10 AM
GPU Memory Used: 0.2 GB
GPU Memory Cached: 1.7 GB
Epoch 3/100: Loss = 0.8660 | Test Acc = 87.31% | LR = 0.000898 | Time: September 01, 2025 at 09:17:30 AM
GPU Memory Used: 0.2 GB
GPU Memory Cached: 1.8 GB
Epoch 4/100: Loss = 0.8295 | Test Acc = 88.74% | LR = 0.000896 | Time: September 01, 2025 at 09:18:49 AM
GPU Memory Used: 0.2 GB
GPU Memory Cached: 1.8 GB
Epoch 5/100: Loss = 0.8135 | Test Acc = 88.01% | LR = 0.000894 | Time: September 01, 2025 at 09:20:08 AM
GPU Memory Used: 0.2 GB
GPU Memory Cached: 1.8 GB
Epoch 6/100: Loss = 0.7935 | Test Acc = 89.75% | LR = 0.000892 | Time: September 01, 2025 at 09:21:28 AM
GPU Memory Used: 0.2 GB
GPU Memory Cached: 1.8 GB
Epoch 7/100: Loss = 0.7807 | Test Acc = 90.44% | LR = 0.000889 | Time:

  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


Epoch 73/100: Loss = 0.6376 | Test Acc = 95.15% | LR = 0.000170 | Time: September 01, 2025 at 12:04:18 PM
GPU Memory Used: 0.3 GB
GPU Memory Cached: 2.1 GB


  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


Epoch 74/100: Loss = 0.6471 | Test Acc = 95.16% | LR = 0.000159 | Time: September 01, 2025 at 12:05:39 PM
GPU Memory Used: 0.3 GB
GPU Memory Cached: 2.1 GB
Epoch 75/100: Loss = 0.6666 | Test Acc = 95.25% | LR = 0.000147 | Time: September 01, 2025 at 12:07:01 PM
GPU Memory Used: 0.3 GB
GPU Memory Cached: 2.0 GB
Epoch 76/100: Loss = 0.6605 | Test Acc = 95.05% | LR = 0.000136 | Time: September 01, 2025 at 12:08:23 PM
GPU Memory Used: 0.3 GB
GPU Memory Cached: 2.0 GB
Epoch 77/100: Loss = 0.6816 | Test Acc = 95.12% | LR = 0.000126 | Time: September 01, 2025 at 12:09:44 PM
GPU Memory Used: 0.3 GB
GPU Memory Cached: 2.0 GB
Epoch 78/100: Loss = 0.6651 | Test Acc = 95.20% | LR = 0.000116 | Time: September 01, 2025 at 12:11:04 PM
GPU Memory Used: 0.3 GB
GPU Memory Cached: 2.1 GB
Epoch 79/100: Loss = 0.6799 | Test Acc = 94.94% | LR = 0.000106 | Time: September 01, 2025 at 12:12:24 PM
GPU Memory Used: 0.3 GB
GPU Memory Cached: 2.1 GB
Epoch 80/100: Loss = 0.6657 | Test Acc = 95.16% | LR = 0.000096 

  scaler = GradScaler()  # Mixed precision training


Epoch 1/100: Loss = 1.1837 | Test Acc = 79.78% | LR = 0.001100 | Time: September 01, 2025 at 12:40:57 PM
GPU Memory Used: 0.3 GB
GPU Memory Cached: 2.3 GB
Epoch 2/100: Loss = 0.9256 | Test Acc = 86.15% | LR = 0.001099 | Time: September 01, 2025 at 12:42:13 PM
GPU Memory Used: 0.3 GB
GPU Memory Cached: 2.3 GB
Epoch 3/100: Loss = 0.8689 | Test Acc = 87.52% | LR = 0.001098 | Time: September 01, 2025 at 12:43:29 PM
GPU Memory Used: 0.3 GB
GPU Memory Cached: 2.3 GB
Epoch 4/100: Loss = 0.8316 | Test Acc = 87.92% | LR = 0.001096 | Time: September 01, 2025 at 12:44:46 PM
GPU Memory Used: 0.3 GB
GPU Memory Cached: 2.3 GB
Epoch 5/100: Loss = 0.8119 | Test Acc = 89.27% | LR = 0.001093 | Time: September 01, 2025 at 12:46:01 PM
GPU Memory Used: 0.3 GB
GPU Memory Cached: 2.3 GB
Epoch 6/100: Loss = 0.7983 | Test Acc = 90.24% | LR = 0.001090 | Time: September 01, 2025 at 12:47:19 PM
GPU Memory Used: 0.3 GB
GPU Memory Cached: 2.3 GB
Epoch 7/100: Loss = 0.7806 | Test Acc = 90.34% | LR = 0.001087 | Time: