In [1]:
# Debug aid (disabled). Uncomment to make CUDA kernel launches synchronous so that
# a device-side assert is reported at the call that actually triggered it.
# The variable must be set before CUDA is initialised, hence its place in the first cell.
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

Reference for the `CUDA_LAUNCH_BLOCKING` switch above (debugging CUDA device-side asserts): https://lernapparat.de/debug-device-assert/

In [2]:
import torch
import torch.nn as nn
from torchvision import transforms
from torch.utils.data import DataLoader
from torch import optim

In [3]:
import sys

# Put the directory three levels up on the module search path, right after
# sys.path[0], so the project-local imports in the next cell can be resolved.
# NOTE(review): the path is relative to the kernel's working directory — confirm
# the notebook is always launched from its own folder.
_project_root = "../../../"
sys.path.insert(1, _project_root)

In [4]:
from train import *
from data_preprocessing import *
from data_augmentation import *
import torch.optim.lr_scheduler as lr_scheduler
from Models.yolov8cls_path import Model

In [5]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
# Show the size presets the model class offers, keyed by variant name; the
# `variant=` argument used when building the model below selects one of them.
# NOTE(review): 'd' / 'w' / 'mc' presumably mean depth multiple, width multiple and
# max channels — confirm in Models.yolov8cls_path.
Model.variants

{'n': {'d': 0.34, 'w': 0.25, 'mc': 1024},
 's': {'d': 0.34, 'w': 0.5, 'mc': 1024},
 'm': {'d': 0.67, 'w': 0.75, 'mc': 768},
 'l': {'d': 1.0, 'w': 1.0, 'mc': 512},
 'xl': {'d': 1.0, 'w': 1.25, 'mc': 512}}

In [7]:
# Build the 's' variant of the classifier for the 10-class dataset used below.
# The architecture switches are collected in one dict so the configuration can
# be read (and logged) in one place before the model is instantiated.
# NOTE(review): the exact effect of residual_connection / CSP / add_hidden /
# classifyV8 / bottleneck is defined in Models.yolov8cls_path — not visible here.
model_kwargs = dict(
    num_classes=10,
    residual_connection=True,
    CSP=True,
    add_hidden=True,
    classifyV8=True,
    bottleneck=0.5,
    variant='s',
    device=device,
    dtype=torch.float32,
)
model = Model(**model_kwargs)

In [8]:
# Display the module tree to sanity-check the layer layout and channel sizes
# before training.
model

Model(
  (conv1): Conv(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (conv2): Conv(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (c2f1): C2f(
    (conv1): Conv(
      (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): SiLU()
    )
    (conv2): Conv(
      (conv): Conv2d(96, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): SiLU()
    )
    (n_blocks): ModuleList(
      (0): Bottleneck(
        (conv1): Conv(
          (conv): Conv2d(32, 16, kernel_s

In [9]:
# `os` is imported explicitly here: the notebook never imports it (the only
# `import os` is commented out in the first cell), so this cell previously
# relied on the name leaking in through a wildcard import.
import os

# Root directory of the Imagenette dataset, relative to the notebook's working directory.
data_path = '../../../../datasets/imagenette2/'
# JSON file inside the dataset folder that get_norms() is pointed at for the
# normalisation statistics ('means' / 'stds').
norms_path = os.path.join(data_path, 'norms.json')

In [10]:
# Fetch the normalisation statistics in a single call; this replaces the earlier
# pair of separate get_means(...) / get_stds(...) calls.
# NOTE(review): train_loader=None presumably makes get_norms read the cached values
# from norms_path instead of recomputing them from data — confirm in data_preprocessing.
norms = get_norms(path=norms_path, train_loader=None)

In [11]:
# Pull the two statistics out of the norms mapping for use in transforms.Normalize.
means = norms['means']
stds = norms['stds']


Reference — profiling your PyTorch module with the built-in profiler:
https://pytorch.org/tutorials/beginner/profiler.html

Notes on overfitting, quoted from the PyTorch forum thread "How to prevent overfitting": https://discuss.pytorch.org/t/how-to-prevent-overfitting/1902
> Right now, with my augmented dataset, at epoch 8, I am getting a testset Top1 accuracy of 45% but a trainset Top1 accuracy of 69%.

You should strongly consider data augmentation in some meaningful way. If you’re attempting to do classification then think about what augmentations might add useful information and help distinguish classes in your dataset. In one of my cases, introducing background variation increased recognition rate by over 50%. Basically, with small datasets there is too much overfitting so you want the network to learn real-world distinctions vs. irrelevant artifacts like backgrounds / shadows etc.

In [12]:
# Training pipeline: random resized crop to 224x224, the project's Augmentation
# step, tensor conversion, then normalisation with the dataset statistics.
train_steps = [
    transforms.RandomResizedCrop((224, 224)),
    Augmentation(),
    transforms.ToTensor(),
    transforms.Normalize(mean=means, std=stds),
]
transformations = transforms.Compose(train_steps)

# Validation pipeline: deterministic resize to 224x224, no augmentation, same
# tensor conversion and normalisation as for training.
val_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=means, std=stds),
]
transformations_val = transforms.Compose(val_steps)

In [13]:
# Options shared by both splits; only the split flag and the transform differ.
dataset_kwargs = dict(path=data_path, half=False, show=False)

train_dataset = ImageNetSubset(train=True, transform=transformations, **dataset_kwargs)
val_dataset = ImageNetSubset(train=False, transform=transformations_val, **dataset_kwargs)

In [14]:
# Number of epochs passed to each train(...) call.
epochs = 10

# Both loaders use the same batch size and worker count; only the training
# loader shuffles its samples.
loader_kwargs = dict(batch_size=64, num_workers=4)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

In [15]:
# SGD with momentum and L2 weight decay over all model parameters.
optimizer = optim.SGD(
    model.parameters(),
    lr=0.01,
    momentum=0.9,
    weight_decay=0.0001,
)

In [16]:
# Negative log-likelihood loss. NLLLoss expects log-probabilities as input.
# NOTE(review): this assumes the model's classification head ends in a log-softmax;
# if it returns raw logits, nn.CrossEntropyLoss would be the right choice — confirm.
loss_fn = nn.NLLLoss()

In [17]:
# Exponential decay: every scheduler step multiplies the learning rate by 0.9.
# The training log shows one step per epoch (0.0100 -> 0.0090 after epoch 1).
scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

In [18]:
# First training run: `epochs` epochs from scratch.
# NOTE(review): outputs_path is where train() keeps its state (a later resume
# loads state.pt from this folder); what else is written there is not visible here.
history, gradient_stats = train(
    epochs,
    train_loader,
    val_loader,
    model,
    optimizer,
    loss_fn,
    scheduler,
    outputs_path='../../log/YOLOv8cls-version-6/training/',
)

2025-04-06 16:58:56.602764 Epoch 1 
2025-04-06 16:59:16.970678 Batch 15 
2025-04-06 16:59:19.695843 Batch 30 
2025-04-06 16:59:22.497966 Batch 45 
2025-04-06 16:59:25.964383 Batch 60 
2025-04-06 16:59:28.925614 Batch 75 
2025-04-06 16:59:31.789236 Batch 90 
2025-04-06 16:59:34.538711 Batch 105 
2025-04-06 16:59:37.783342 Batch 120 
2025-04-06 16:59:40.667622 Batch 135 
[Train] Accuracy: 26.1907%, Loss per batch: 2.0574
2025-04-06 16:59:59.849662 Batch 15 
2025-04-06 17:00:01.800225 Batch 30 
2025-04-06 17:00:04.260619 Batch 45 
2025-04-06 17:00:06.320098 Batch 60 
[Val] Accuracy: 31.8217%, loss per batch: 1.9032
Epoch 1: SGD lr 0.0100 -> 0.0090
2025-04-06 17:00:07.855354 Epoch 2 
2025-04-06 17:00:24.899691 Batch 15 
2025-04-06 17:00:27.418362 Batch 30 
2025-04-06 17:00:30.168551 Batch 45 
2025-04-06 17:00:33.009977 Batch 60 
2025-04-06 17:00:35.619893 Batch 75 
2025-04-06 17:00:38.154100 Batch 90 
2025-04-06 17:00:41.056731 Batch 105 
2025-04-06 17:00:43.784622 Batch 120 
2025-04-06 17

In [19]:
# Continue training for another `epochs` epochs from the state saved under
# outputs_path (the log picks up at epoch 11 with the decayed learning rate).
# NOTE(review): this rebinds history and gradient_stats; whether the returned
# history also contains the earlier epochs depends on train() — confirm.
history, gradient_stats = train(
    epochs,
    train_loader,
    val_loader,
    model,
    optimizer,
    loss_fn,
    scheduler,
    outputs_path='../../log/YOLOv8cls-version-6/training/',
    resume=True,
)

  state = torch.load(os.path.join(outputs_path, f"state.pt"))


2025-04-06 17:09:38.002728 Epoch 11 
2025-04-06 17:09:53.528253 Batch 15 
2025-04-06 17:09:56.216705 Batch 30 
2025-04-06 17:09:58.748998 Batch 45 
2025-04-06 17:10:01.482702 Batch 60 
2025-04-06 17:10:04.460796 Batch 75 
2025-04-06 17:10:07.175423 Batch 90 
2025-04-06 17:10:09.881386 Batch 105 
2025-04-06 17:10:12.104509 Batch 120 
2025-04-06 17:10:14.927322 Batch 135 
[Train] Accuracy: 62.9422%, Loss per batch: 1.0984
2025-04-06 17:10:32.777246 Batch 15 
2025-04-06 17:10:34.488695 Batch 30 
2025-04-06 17:10:36.675732 Batch 45 
2025-04-06 17:10:38.328622 Batch 60 
[Val] Accuracy: 68.6369%, loss per batch: 0.9784
Epoch 11: SGD lr 0.0035 -> 0.0031
2025-04-06 17:10:39.317540 Epoch 12 
2025-04-06 17:10:54.676498 Batch 15 
2025-04-06 17:10:57.140910 Batch 30 
2025-04-06 17:11:00.123207 Batch 45 
2025-04-06 17:11:02.659192 Batch 60 
2025-04-06 17:11:05.356490 Batch 75 
2025-04-06 17:11:07.950648 Batch 90 
2025-04-06 17:11:10.440509 Batch 105 
2025-04-06 17:11:12.700677 Batch 120 
2025-04-06