In [1]:
# Debug aid, disabled by default.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally used to make CUDA calls
# synchronous so that a device-side assert is reported at the failing op.
# It presumably has to be set before CUDA is initialised — confirm before
# re-enabling.
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

Background on debugging CUDA device-side asserts: https://lernapparat.de/debug-device-assert/

In [2]:
import torch
import torch.nn as nn
from torchvision import transforms
from torch.utils.data import DataLoader
from torch import optim

In [3]:
import sys

# Directory three levels above the working directory; presumably where the
# project modules imported in the next cell live — verify.
project_root = "../../../"

# Insert at index 1 so that sys.path[0] keeps its priority while the
# project directory is still searched before the remaining entries.
sys.path.insert(1, project_root)

In [4]:
from train import *
from data_preprocessing import *
from data_augmentation import *
import torch.optim.lr_scheduler as lr_scheduler
from Models.yolov8cls_path import Model

In [5]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
# Show the size variants defined on Model (one dict of 'd', 'w', 'mc' per key).
# Presumably these are the keys accepted by Model's `variant` argument — verify.
Model.variants

{'n': {'d': 0.34, 'w': 0.25, 'mc': 1024},
 's': {'d': 0.34, 'w': 0.5, 'mc': 1024},
 'm': {'d': 0.67, 'w': 0.75, 'mc': 768},
 'l': {'d': 1.0, 'w': 1.0, 'mc': 512},
 'xl': {'d': 1.0, 'w': 1.25, 'mc': 512}}

In [15]:
# Build the classifier used for the rest of the notebook.
# NOTE(review): the meaning of the architecture switches below is defined in
# Models/yolov8cls_path.py, not here — check that module before changing them.
model = Model(
    num_classes=10,            # presumably the 10 Imagenette classes
    residual_connection=True,
    CSP=True,
    add_hidden=True,
    classifyV8=True,
    bottleneck=0.75,
    variant='s',               # the 's' entry of Model.variants
    device=device,
    dtype=torch.float32,
)

In [16]:
# Display the module tree to sanity-check the layers that were built.
model

Model(
  (conv1): Conv(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (conv2): Conv(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (c2f1): C2f(
    (conv1): Conv(
      (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): SiLU()
    )
    (conv2): Conv(
      (conv): Conv2d(96, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): SiLU()
    )
    (n_blocks): ModuleList(
      (0): Bottleneck(
        (conv1): Conv(
          (conv): Conv2d(32, 24, kernel_s

In [17]:
# `os` is not imported anywhere else in this notebook; it was only reachable
# through one of the `from ... import *` lines. Import it explicitly so this
# cell keeps working if those modules stop exposing it.
import os

# Root folder of the Imagenette dataset, relative to the notebook's working
# directory.
data_path = '../../../../datasets/imagenette2/'

# JSON file holding the normalisation statistics, stored next to the data.
norms_path = os.path.join(data_path, 'norms.json')

In [18]:
# Fetch the normalisation statistics with a single get_norms call (this
# replaces the earlier separate get_means / get_stds calls).
# NOTE(review): with train_loader=None this presumably reads the values stored
# at `norms_path` instead of recomputing them — confirm in data_preprocessing.
norms = get_norms(
    path=norms_path,
    train_loader=None,
)

In [19]:
# Pull out the two entries passed to transforms.Normalize further down.
means = norms['means']
stds = norms['stds']


Profiling your PyTorch module:
https://pytorch.org/tutorials/beginner/profiler.html

Notes on overfitting, quoted from https://discuss.pytorch.org/t/how-to-prevent-overfitting/1902 (the figures below are from that thread, not from this notebook):

> Right now, with my augmented dataset, at epoch 8, I am getting a testset Top1 accuracy of 45% but a trainset Top1 accuracy of 69%.

> You should strongly consider data augmentation in some meaningful way. If you’re attempting to do classification then think about what augmentations might add useful information and help distinguish classes in your dataset. In one of my cases, introducing background variation increased recognition rate by over 50%. Basically, with small datasets there is too much overfitting so you want the network to learn real-world distinctions vs. irrelevant artifacts like backgrounds / shadows etc.

In [20]:
# Training pipeline: random crop resized to 224x224, then the project's
# Augmentation step on the cropped image, then tensor conversion and
# per-channel normalisation with the dataset statistics.
train_steps = [
    transforms.RandomResizedCrop((224, 224)),
    Augmentation(),
    transforms.ToTensor(),
    transforms.Normalize(mean=means, std=stds),
]
transformations = transforms.Compose(train_steps)

# Validation pipeline: no random steps, only a fixed resize to 224x224
# followed by the same tensor conversion and normalisation.
val_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=means, std=stds),
]
transformations_val = transforms.Compose(val_steps)

In [21]:
# Training split, using the augmenting transform pipeline.
# NOTE(review): `half` and `show` are ImageNetSubset options whose meaning is
# not visible in this notebook — see data_preprocessing.
train_dataset = ImageNetSubset(
    path=data_path,
    train=True,
    transform=transformations,
    half=False,
    show=False,
)

# Validation split, using the deterministic transform pipeline.
val_dataset = ImageNetSubset(
    path=data_path,
    train=False,
    transform=transformations_val,
    half=False,
    show=False,
)

In [22]:
# Number of epochs handed to train() for each run.
epochs = 10

# Settings shared by both loaders.
loader_options = {"batch_size": 64, "num_workers": 4}

# Only the training data is shuffled; validation keeps dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

In [23]:
# SGD with momentum and a small L2 penalty over all model parameters.
optimizer = optim.SGD(
    model.parameters(),
    lr=0.01,
    momentum=0.9,
    weight_decay=1e-4,
)

In [24]:
# Negative log-likelihood averaged over the batch ('mean' is the default
# reduction, spelled out here).
# NOTE(review): NLLLoss expects log-probabilities as input, so the model's
# head presumably ends in a log-softmax — confirm in Models/yolov8cls_path.py.
loss_fn = nn.NLLLoss(reduction='mean')

In [25]:
# Exponential decay: each scheduler step multiplies the learning rate by 0.9
# (the training log shows 0.0100 -> 0.0090 after the first epoch).
scheduler = lr_scheduler.ExponentialLR(
    optimizer,
    gamma=0.9,
)

In [26]:
# First training run: `epochs` epochs from the freshly built model.
# train() logs per-epoch train/val accuracy and loss and each learning-rate
# step; its two return values are bound here as `history` and
# `gradient_stats`.
# NOTE(review): `outputs_path` is presumably where train() writes its
# checkpoint and logs — confirm in train.py.
history, gradient_stats = train(
    epochs,
    train_loader,
    val_loader,
    model,
    optimizer,
    loss_fn,
    scheduler,
    outputs_path='../../log/YOLOv8cls-version-6/training/',
)

2025-04-06 11:54:12.429210 Epoch 1 
2025-04-06 11:54:38.788744 Batch 15 
2025-04-06 11:54:41.716343 Batch 30 
2025-04-06 11:54:44.280535 Batch 45 
2025-04-06 11:54:46.990268 Batch 60 
2025-04-06 11:54:50.194543 Batch 75 
2025-04-06 11:54:54.048923 Batch 90 
2025-04-06 11:54:57.789160 Batch 105 
2025-04-06 11:55:01.573120 Batch 120 
2025-04-06 11:55:04.941133 Batch 135 
[Train] Accuracy: 24.237%, Loss per batch: 2.0781
2025-04-06 11:55:24.841268 Batch 15 
2025-04-06 11:55:26.882468 Batch 30 
2025-04-06 11:55:29.020529 Batch 45 
2025-04-06 11:55:31.040684 Batch 60 
[Val] Accuracy: 31.4395%, loss per batch: 2.0123
Epoch 1: SGD lr 0.0100 -> 0.0090
2025-04-06 11:55:32.263981 Epoch 2 
2025-04-06 11:55:48.229531 Batch 15 
2025-04-06 11:55:51.000905 Batch 30 
2025-04-06 11:55:53.473526 Batch 45 
2025-04-06 11:55:56.081634 Batch 60 
2025-04-06 11:55:59.303414 Batch 75 
2025-04-06 11:56:02.413055 Batch 90 
2025-04-06 11:56:05.267740 Batch 105 
2025-04-06 11:56:07.460546 Batch 120 
2025-04-06 11:

In [27]:
# Second run with resume=True, pointing at the same outputs_path as the first
# run. The log below continues at epoch 11 with the already-decayed learning
# rate, and the warning shows train() loading `state.pt` from that folder.
# Note that `history` and `gradient_stats` are rebound to this call's results.
history, gradient_stats = train(
    epochs,
    train_loader,
    val_loader,
    model,
    optimizer,
    loss_fn,
    scheduler,
    outputs_path='../../log/YOLOv8cls-version-6/training/',
    resume=True,
)

  state = torch.load(os.path.join(outputs_path, f"state.pt"))


2025-04-06 12:05:23.500251 Epoch 11 
2025-04-06 12:05:39.380942 Batch 15 
2025-04-06 12:05:42.010033 Batch 30 
2025-04-06 12:05:44.898517 Batch 45 
2025-04-06 12:05:48.133070 Batch 60 
2025-04-06 12:05:51.570162 Batch 75 
2025-04-06 12:05:54.445288 Batch 90 
2025-04-06 12:05:57.002434 Batch 105 
2025-04-06 12:05:59.823867 Batch 120 
2025-04-06 12:06:02.782670 Batch 135 
[Train] Accuracy: 63.2802%, Loss per batch: 1.1053
2025-04-06 12:06:22.531566 Batch 15 
2025-04-06 12:06:24.516194 Batch 30 
2025-04-06 12:06:26.724958 Batch 45 
2025-04-06 12:06:28.471543 Batch 60 
[Val] Accuracy: 64.6624%, loss per batch: 1.0502
Epoch 11: SGD lr 0.0035 -> 0.0031
2025-04-06 12:06:29.568195 Epoch 12 
2025-04-06 12:06:45.653207 Batch 15 
2025-04-06 12:06:48.683536 Batch 30 
2025-04-06 12:06:51.500089 Batch 45 
2025-04-06 12:06:54.339846 Batch 60 
2025-04-06 12:06:57.212749 Batch 75 
2025-04-06 12:06:59.932843 Batch 90 
2025-04-06 12:07:02.732857 Batch 105 
2025-04-06 12:07:05.291865 Batch 120 
2025-04-06